# Create and explore manually annotated testset of temporal expressions

In [3]:
# Load JSON besluithistories

import json
from copy import deepcopy

with open('besluithistorie.json') as f:
  besluitjson = json.load(f)

with open('besluitenhistorie.json') as f:
  besluitenjson = json.load(f)


histories = {}


def addHistories(items):
    for item in items:
        #print(item['tuple'][0]['attributes']['http://schema.org/identifier'])

        #if this is a document (as opposed to an Instrument, which is not a besluithistorie)
        if item['tuple'][0]['class'][0] == "http://utrecht.nl/data/Document":
            #temp history
            history = { "id": item['tuple'][0]['attributes']['http://schema.org/identifier'],
                       "name": item['tuple'][0]['attributes']['http://schema.org/name'],
                       "filesize": item['tuple'][0]['attributes']['http://schema.org/fileSize'],
                       "url": item['tuple'][0]['attributes']['http://schema.org/url']
                       }
            #print(history)

            if history['id'] in histories:
                print('already exists, ignored: ' + history['name'])
        #        histories[history['id']].append(deepcopy(history))
            else:
                histories[history['id']] = deepcopy(history)
        else:
            print('Item was not a document, but a ' + str(item['tuple'][0]['class']))


        #print()
        
addHistories(besluitjson['items'])
addHistories(besluitenjson['items'])

print(str(len(histories)) + ' histories found')

already exists, ignored: Besluitenhistorie Lombokplein.pdf
already exists, ignored: Bijlage 9 Besluitenhistorie Luchtkwaliteit.pdf
already exists, ignored: Bijlage De Besluitenhistorie Smakkelaarsveld
already exists, ignored: Bijlage Besluitenhistorie Hoge Woerd
Item was not a document, but a ['http://utrecht.nl/data/Brief', 'http://utrecht.nl/data/Instrument']
211 histories found


In [4]:
#remove duplicates
histories_clean = {}

def is_duplicate(hist1, hist2):
    if(hist1['name'] == hist2['name'] and hist1['filesize'] == hist2['filesize']):
        return True
    #elif hist1['name'] == hist2['name']:
        #print('almost')
        #print(hist1)
        #print(hist2)
        #print()
    else:
        return False

for old_hist in histories:
    is_dupe = False
    for new_hist in histories_clean:
        if is_duplicate(histories[old_hist], histories_clean[new_hist]):
            is_dupe = True
            break
    if not is_dupe:
        histories_clean[old_hist] = deepcopy(histories[old_hist])

print('There are ' + str(len(histories_clean)) + ' histories after removing exact duplicates, and up to 23 more near duplicates (same name).')



#we don't need the dupes
histories = histories_clean


#At this point there were 3 besluithistories with a filesize of 1 kb. These are corrupted files, and I deleted them
# Remove those from the dict:
histories.pop('6e95de96-42a2-4573-bc5d-344d80224fc0', None)
histories.pop('b0356fef-b081-4dda-9b98-6fcb8a4e6679', None)
histories.pop('665e32b9-7299-4f3c-a1e2-9c1765eb4803', None)

There are 175 histories after removing exact duplicates, and up to 23 more near duplicates (same name).


{'id': '665e32b9-7299-4f3c-a1e2-9c1765eb4803',
 'name': 'Bijlage 2 Besluitenhistorie Parkeergarage Jaarbeursplein',
 'filesize': 188,
 'url': 'https://api1.ibabs.eu/publicdownload.aspx?site=Utrecht&id=665e32b9-7299-4f3c-a1e2-9c1765eb4803'}

In [5]:
#download besluithistories
#import requests
#for history in histories:
#    r = requests.get(histories[history]['url'], allow_redirects=True)
#    open('./besluitdocs/' + histories[history]['id'] + '.pdf', 'wb').write(r.content)

In [6]:
# use tool recommended by daria for text content
#!pip install pdfminer

# use second tool for extracting urls
#!pip install pypdf2

In [45]:
# Now extract text from besluithistories

#test example from stackoverflow
#https://stackoverflow.com/questions/26494211/extracting-text-from-a-pdf-file-using-pdfminer-in-python

import io

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage


def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)



    fp.close()
    device.close()
    text = retstr.getvalue()
    retstr.close()
    return text

for h in histories:
    print(h)
    histories[h]['content'] = convert_pdf_to_txt('./besluitdocs/' + str(h) + '.pdf')
    

009fd1d9-dc31-4086-8132-18c568b0db61
03675b7f-3a66-4d12-878a-5caca2df82e9
03efecf7-9303-44c4-afd9-135a191c796d
0454d040-0cc2-46a9-9edd-d1a8b652a59a
04d84b2f-0a9f-4586-b783-280fe0167706
06a12533-91b2-490f-aa93-04add0fd7c34
09ab9fe9-e513-4bfa-98c0-c317540856c2
0ac3d7d8-c7b7-4a6a-806e-0470af7bf8bf
0cb87edf-2e48-433e-b473-f0282fc0edf6
0cda393f-f184-4fc5-91e3-7b92a0138841
0eb158a3-1a1a-4bb0-b52b-c1675a13c7f4
109fd4bd-47aa-4690-9254-bfa267ec7a8c
1100db10-7c28-45ed-bd31-c5b7db975b4b
117bfc54-d153-4d86-a230-4aa797c4cb56
148d9580-9951-4d74-af47-568561db68c9
14d03874-83b8-44aa-9e57-eda078f8613e
14facc8d-9b15-4466-bbe1-bf4717021d6d
1611e400-0d05-4561-89dc-bc0ecb3ab28b
175d3f8e-05f3-4d02-9d75-6ae30da279e1
19197683-24f0-4dcb-bd9d-52d09e2f45e6
1bf35fe9-9cc5-4af5-8fc4-6fc287db2bdc
1c0381ab-4939-4a52-98fd-558198f916ae
1d270fbb-4454-48c2-b22c-d0141efe8e01
1d7b4020-da37-4f28-ab2b-6e0ca301f27f
1d9d4a81-eb60-4306-95f8-a35b99d68f09
1e3838ba-3a09-4ccc-95f7-b9ad3ed9e951
1ed3c698-36c8-4457-8055-1893e289a805
2

In [46]:

# Now extract urls from each fulltext
# Because I'm too lazy to figure it out with pdfminer, I'm using another tool
# stackoverflow source https://stackoverflow.com/questions/27744210/extract-hyperlinks-from-pdf-in-python

import PyPDF2

def getURLs(path):
    ibabs_urls = []
    
    PDFFile = open(path,'rb')

    PDF = PyPDF2.PdfFileReader(PDFFile)
    pages = PDF.getNumPages()
    key = '/Annots'
    uri = '/URI'
    ank = '/A'

    for page in range(pages):
        print("Current Page: {}".format(page))
        
        pageSliced = PDF.getPage(page)
        pageObject = pageSliced.getObject()
        if key in pageObject.keys():
            ann = pageObject[key]
            for a in ann:
                u = a.getObject()
                #if uri in u[ank].keys():
                if ank in u and uri in u[ank].keys():
                    #SO it is an URL, now filter for ibabs URLs
                    newurl = u[ank][uri]
                    if 'ibabs' in newurl and 'trecht' in newurl:
                        ibabs_urls.append(u[ank][uri])
                        print(u[ank][uri])
    return ibabs_urls
                    
for h in histories:
    print(h)
    histories[h]['url'] = getURLs('./besluitdocs/' + str(h) + '.pdf')


009fd1d9-dc31-4086-8132-18c568b0db61
Current Page: 0
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/447b89ba-f685-4c4d-8485-d27d1c7cf899https:/ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/447b89ba-f685-4c4d-8485-d27d1c7cf899
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/d1d3c755-14b0-4429-9f54-10f37506c1fe
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/2b70f4a5-2e0b-46f5-8c0a-ac53554fe740
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/f7ede071-b249-48fc-bf9f-796ba3fba65a
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/28d38e1e-e95e-47cf-8a7d-9e55199e8c9a
Current Page: 1
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/00678d55-effe-487a-9516-1f199d8b5a66
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/00678d55-effe-487a-9516-1f199d8b5a66
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/13956fc1-39c4-4aca-8adf-daa843686031
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/ace4f4cb-e0f4-4cbf-b049-386d88128ecc
https://ris2.ibabs.eu/Reports/ViewListEntry

Current Page: 0
Current Page: 1
Current Page: 2
Current Page: 3
Current Page: 4
Current Page: 5
87a2daed-7f68-444e-9af1-72ff12a8d819
Current Page: 0
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/dd5e828e-dbf4-48ac-b6f6-a1f5ffc490c3
https://ris2.ibabs.eu/Agenda/Details/Utrecht/6b124a5f-0849-46a1-b40a-b9f2401586c5
885f0a2b-e728-4df4-9a86-8a4fe146707d
Current Page: 0
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/81ea98f4-383e-4c58-aab8-7e8b2272fe8e
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/1bfd51bb-ccd4-43de-892f-556c3b60ee4d
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/c8bc6470-11db-410d-86b6-526e985cf954
https://ris2.ibabs.eu/Agenda/Details/Utrecht/945c26d2-6bcd-4bb4-b919-472b3820cf33
https://ris2.ibabs.eu/Agenda/Details/Utrecht/3e528f2f-d0be-4a5a-af53-82308cde077a
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/4c550fd9-151a-47c7-b989-af948a71c8c0
https://ris2.ibabs.eu/Agenda/Details/Utrecht/f6128d20-ae27-4955-974a-226a16a9f2f7
https://ris2.ibabs.eu/Age

Current Page: 0
http://ibabsonline.eu/LijstDetails.aspx?site=Utrecht&ListId=41b35454-6329-4f11-b803-d7e5a6141085&ReportId=547701b1-04af-4e12-b84a-4cca1b29d943&EntryId=7d2d3112-c0a6-4b1b-97bf-88d011270c81&searchtext=hoge%20woerd
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/6384fe2e-d329-4966-acd1-cf70481c19d9
https://ris2.ibabs.eu/Agenda/Details/Utrecht/5ab09d6e-f4e3-40ea-a00a-4419544cde76
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/1f9c3885-5dc6-4c83-9e03-b522f127a4e6
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/695eede7-dd35-4d17-a190-9edc55811a3c
fad9f787-c967-4ffd-9628-0a88de532f0b
Current Page: 0
Current Page: 1
Current Page: 2
Current Page: 3
Current Page: 4
Current Page: 5
Current Page: 6
Current Page: 7
Current Page: 8
Current Page: 9
Current Page: 10
Current Page: 11
Current Page: 12
Current Page: 13
fb620849-d218-483d-9425-56bd6acc863f
Current Page: 0
http://ibabsonline.eu/LijstDetails.aspx?site=Utrecht&ListId=fcf15b6c-0a82-4e7a-b405-f38656cf60f7&ReportId

In [47]:
# Used e.g. when we're looking for the attached agenda item of a besluithistorie
import pickle

#Load the knowledge base
path = "C:/Users/tmsch/Desktop/ADS_Thesis/Thomas/"
with open(path + "meetingsInformation.txt", "rb") as fp:   
    meetingsInformation = pickle.load(fp)
with open(path + "itemsInformation.txt", "rb") as fp:   
    itemsInformation = pickle.load(fp)
with open(path + "entriesInformation.txt", "rb") as fp:   
    entriesInformation = pickle.load(fp)

with open(path + "documentsInformation.txt", "rb") as fp:   
    documentsInformation = pickle.load(fp)    
with open(path + "listdocumentsInformation.txt", "rb") as fp:   
    listdocumentsInformation = pickle.load(fp)
with open(path + "meetdocumentsInformation.txt", "rb") as fp:   
    meetdocumentsInformation = pickle.load(fp)

In [88]:
#Let's sort through the urls we found
count = 0
listcount = 0
agendacount = 0
doccount = 0
listdoccount = 0

unknowns = 0

listmissing = 0
agendamissing = 0
docmissing = 0
probcount = 0

for h in histories:
    histories[h]['urls_list'] = []
    histories[h]['urls_meeting'] = []
    histories[h]['urls_docs'] = []
    histories[h]['urls_listdocs'] = []
    histories[h]['urls_other'] = []
    
    count += len(histories[h]['url'])
    
    if isinstance(histories[h]['url'], str):
        histories[h]['url'] = [histories[h]['url']]
        
    for url in histories[h]['url']:
        #list entries always have a nice URL format
        if 'Reports/ViewListEntry/Utrecht' in url:
            histories[h]['urls_list'].append(url.split("/")[-1])
            listcount += 1
            
            #Test if this file is in our data
            if url.split("/")[-1] not in entriesInformation:
                listmissing += 1
            
        #meeting ids are also formatted nicely  
        elif 'Agenda/Details/Utrecht/' in url:
            histories[h]['urls_meeting'].append(url.split("/")[-1])
            agendacount += 1
            
            #Test if this file is in our data
            if url.split("/")[-1] not in meetingsInformation:
                agendamissing += 1
                
        elif 'publicdownload.aspx?site=' in url:
            
            #Test if this file is in our data
            if url.split("=")[-1] not in listdocumentsInformation:
                    if url.split("=")[-1] not in documentsInformation:
                        docmissing += 1
                        
                        print('PROBLEM?')
                        print(url.split("=")[-1])
                        probcount += 1
                    else:
                        doccount += 1
                        histories[h]['urls_listdocs'].append(url.split("=")[-1])

            else:
                listdoccount += 1
                histories[h]['urls_listdocs'].append(url.split("=")[-1])

        elif 'LijstDetails.aspx?site=' in url:
            histories[h]['urls_list'].append(url.split("=")[-1])
            listcount += 1
            
            #Test if this file is in our data
            if url.split("=")[-1] not in entriesInformation:
                listmissing += 1
                
        #special case we identified
        elif '4b9de60a-ad2a-461b-903a-a37cc44f7617' in url:
            histories[h]['urls_meeting'].append('4b9de60a-ad2a-461b-903a-a37cc44f7617')
            agendacount += 1

        else:
            unknowns += 1
            print('not sure, ignoring for now')
            print(url)
            histories[h]['urls_other'].append(url)

#        print(histories[h]['url'])
    
print(str(listcount) + ' lists')
print(str(agendacount) + ' meetings')
print(str(doccount) + ' documents')
print(str(listdoccount) + ' listdocuments')
print()
print(str(unknowns) + ' unknowns')

print(str(probcount) + ' problems')
print(str(listcount+agendacount+doccount+listdoccount) + ' combined')

print(str(count) + ' total ibabs urls found')

print()
print('Of those meetings and list entries we are missing ids: ' + str(listmissing) + ' lists and ' + str(agendamissing) + ' meetings')

not sure, ignoring for now
http://ibabsonline.eu/Agenda.aspx?site=utrecht&agendaid=356&FoundIDs=A1707,Dc3ce4d3f-cd2d-43fc-b6bc-a0cd4696bb3f,D66ad8de5-b414-4657-b02d-6d87951e8af7
not sure, ignoring for now
http://ibabsonline.eu/Agenda.aspx?site=utrecht&agendaid=319&FoundIDs=&year=2014
not sure, ignoring for now
http://ibabsonline.eu/Agenda.aspx?site=utrecht&agendaid=324&FoundIDs=&year=2014
not sure, ignoring for now
http://ibabsonline.eu/Agenda.aspx?site=utrecht&agendaid=290&FoundIDs=A1707,Dc3ce4d3f-cd2d-43fc-b6bc-a0cd4696bb3f
not sure, ignoring for now
http://ibabsonline.eu/Agenda.aspx?site=utrecht&agendaid=570&FoundIDs=&year=2015
394 lists
128 meetings
5 documents
11 listdocuments

5 unknowns
0 problems
538 combined
543 total ibabs urls found

Of those meetings and list entries we are missing ids: 66 lists and 7 meetings


In [86]:
print(meetingsInformation['319'])

#prep for writing to file
print_hist = {}
for h in histories:
    print_hist[h] = {'id': histories[h]['id'],
                    'name': histories[h]['name'],
                    'filesize': histories[h]['filesize'],
                    'list_ids': histories[h]['urls_list'],
                    'meeting_ids': histories[h]['urls_meeting'],
                    'unparsed_urls': histories[h]['urls_other'],
                    'attached_to': []
                    }
print(print_hist)

KeyError: '319'

In [50]:
# 10 decision histories

manual_annots = {}
#How this is structured
#manual_annots['doc id besluithistorie']
#  date_list = list of entry id's that were referenced by date
#  date_meeting = list of meeting id's that  were referenced by date
#  title_list = list of entry id's that were referenced by title
#  title_meeting = list of meeting id's that were referenced by title

manual_annots['009fd1d9-dc31-4086-8132-18c568b0db61'] = {
    'date_list': ['447b89ba-f685-4c4d-8485-d27d1c7cf899',
'38595779-13ed-447d-87c9-7b63f7cecd47',
'6322c4d9-4d56-42af-b369-c8332794667d',
'2b70f4a5-2e0b-46f5-8c0a-ac53554fe740',
'9b8c1cf2-f763-432e-b6a1-d778245861fe',
'28d38e1e-e95e-47cf-8a7d-9e55199e8c9a',
'd82ffa13-90d8-4294-b134-bd6f79143cff',
'00678d55-effe-487a-9516-1f199d8b5a66',
'e5aa2282-d2c0-4fd3-a1be-c5e69266bd30',
'ace4f4cb-e0f4-4cbf-b049-386d88128ecc',
'71bf9d2b-9eee-4fb4-be28-ea843d013b76',
'96e9f216-f723-4a77-b845-12953c54165a',
'd82ffa13-90d8-4294-b134-bd6f79143cff'],
    'date_meeting':['37e5d134-0c6c-4795-abac-9f6f468ff483',
'772f6c47-c89b-4b21-9e74-57742dd26464',
'6df38124-4025-411d-92c0-7935fed2e7e2',
'6df38124-4025-411d-92c0-7935fed2e7e2',
'229adb78-0392-4d8e-8ff3-09180d747b5d',
'229adb78-0392-4d8e-8ff3-09180d747b5d',
'ba044cb3-80b0-4c7b-a77b-aa6306c3b6b2'],
    'title_list':['447b89ba-f685-4c4d-8485-d27d1c7cf899',
'447b89ba-f685-4c4d-8485-d27d1c7cf899',
'6322c4d9-4d56-42af-b369-c8332794667d',
'2b70f4a5-2e0b-46f5-8c0a-ac53554fe740',
'9b8c1cf2-f763-432e-b6a1-d778245861fe',
'28d38e1e-e95e-47cf-8a7d-9e55199e8c9a',
'd82ffa13-90d8-4294-b134-bd6f79143cff',
'00678d55-effe-487a-9516-1f199d8b5a66',
'00678d55-effe-487a-9516-1f199d8b5a66',
'e5aa2282-d2c0-4fd3-a1be-c5e69266bd30',
'e5aa2282-d2c0-4fd3-a1be-c5e69266bd30',
'ace4f4cb-e0f4-4cbf-b049-386d88128ecc',
'd82ffa13-90d8-4294-b134-bd6f79143cff',
'ace4f4cb-e0f4-4cbf-b049-386d88128ecc',
'71bf9d2b-9eee-4fb4-be28-ea843d013b76',
'71bf9d2b-9eee-4fb4-be28-ea843d013b76',
'71bf9d2b-9eee-4fb4-be28-ea843d013b76',
'96e9f216-f723-4a77-b845-12953c54165a',
'd82ffa13-90d8-4294-b134-bd6f79143cff',
'd82ffa13-90d8-4294-b134-bd6f79143cff'],
    'title_meeting':['6df38124-4025-411d-92c0-7935fed2e7e2',
'229adb78-0392-4d8e-8ff3-09180d747b5d',
'ba044cb3-80b0-4c7b-a77b-aa6306c3b6b2'],
    'urls':16,
}
manual_annots['03efecf7-9303-44c4-afd9-135a191c796d'] = {
    'date_list': [],
    'date_meeting':['b548d7ec-bfff-49cc-8017-caeccd3cbbb8'],
    'title_list':[],
    'title_meeting':['b548d7ec-bfff-49cc-8017-caeccd3cbbb8'],
    'urls':2,
}
manual_annots['0454d040-0cc2-46a9-9edd-d1a8b652a59a'] = {
    'date_list': ['9b5177d4-f3cd-43d7-81fe-e49a7788aa3a',
'12235406-ebe7-4041-a837-57f4e9ba8827',
'dfdbcc33-1aab-4b3b-ab4f-739ccfcedea7'],
    'date_meeting':[],
    'title_list':['9b5177d4-f3cd-43d7-81fe-e49a7788aa3a',
'12235406-ebe7-4041-a837-57f4e9ba8827',
'dfdbcc33-1aab-4b3b-ab4f-739ccfcedea7'],
    'title_meeting':[],
    'urls':3,
}
manual_annots['06a12533-91b2-490f-aa93-04add0fd7c34'] = {
    'date_list': [],
    'date_meeting':['f3d08622-7a45-475b-afc9-68e96ab49b22'],
    'title_list':[],
    'title_meeting':['f3d08622-7a45-475b-afc9-68e96ab49b22'],
    'urls':1,
}
manual_annots['09ab9fe9-e513-4bfa-98c0-c317540856c2'] = {
    'date_list': [],
    'date_meeting':['a792bc6b-1fdb-4c70-a13b-88f2ee4fcb9f'],
    'title_list':[],
    'title_meeting':['a792bc6b-1fdb-4c70-a13b-88f2ee4fcb9f'],
    'urls':1,
}
manual_annots['0ac3d7d8-c7b7-4a6a-806e-0470af7bf8bf'] = {
    'date_list': ['89186d4b-8e79-445e-897d-993e6156e8f7',
'c44c7ef6-5bbb-4455-8149-3bcfd13936ed'],
    'date_meeting':['fefbd3b3-b9d8-44a4-8d24-0dd25032e56c'],
    'title_list':['89186d4b-8e79-445e-897d-993e6156e8f7'],
    'title_meeting':['fefbd3b3-b9d8-44a4-8d24-0dd25032e56c'],
    'urls':2,
}
manual_annots['19197683-24f0-4dcb-bd9d-52d09e2f45e6'] = {
    'date_list': ['9f64b943-1ea3-45be-9cdd-5451c0e734b9',
'031c089b-c9f6-423c-a353-5f8994b78afe',
'f6d888fa-b118-4d7e-a01e-34f2cf94e0e9',
'0b2bd741-f8a8-44b5-a820-e9f6c7554522'],
    'date_meeting':['2f481393-776f-4a98-b65c-7dab102929af'],
    'title_list':['9f64b943-1ea3-45be-9cdd-5451c0e734b9',
'031c089b-c9f6-423c-a353-5f8994b78afe',
'f6d888fa-b118-4d7e-a01e-34f2cf94e0e9',
'0b2bd741-f8a8-44b5-a820-e9f6c7554522'],
    'title_meeting':['2f481393-776f-4a98-b65c-7dab102929af'],
    'urls':5,
}
manual_annots['1bf35fe9-9cc5-4af5-8fc4-6fc287db2bdc'] = {
    'date_list': ['3a2a86e4-c76e-4166-a17e-1a63066bf26f',
'39fc8a83-9fa8-4d27-9d91-b892840d3358'],
    'date_meeting':['fe93132e-f5ce-436f-8aee-623c02548a70',
'faa222bc-c31b-4b92-98a3-1c85be8c3c83'],
    'title_list':[],
    'title_meeting':['fe93132e-f5ce-436f-8aee-623c02548a70',
'faa222bc-c31b-4b92-98a3-1c85be8c3c83'],
    'urls':4,
}
manual_annots['1c0381ab-4939-4a52-98fd-558198f916ae'] = {
    'date_list': ['644f64f7-1cb3-4045-9919-91bc7aa7d330',
'9a3e5c6d-4ff0-4bf9-a0aa-15f1634b8981',
'd5fc7a26-d5b3-4f8d-8e2c-7684932f3479'],
    'date_meeting':['cb9690c8-9940-4b26-86ca-270e94850b1f',
'd998a211-0c87-4381-8bbe-2f652ea4de3a',
'e01a664c-c484-4507-b634-3891fef46b22'],
    'title_list':['644f64f7-1cb3-4045-9919-91bc7aa7d330',
'9a3e5c6d-4ff0-4bf9-a0aa-15f1634b8981',
'd5fc7a26-d5b3-4f8d-8e2c-7684932f3479'],
    'title_meeting':['cb9690c8-9940-4b26-86ca-270e94850b1f',
'd998a211-0c87-4381-8bbe-2f652ea4de3a',
'e01a664c-c484-4507-b634-3891fef46b22'],
    'urls':6,
}
manual_annots['1d270fbb-4454-48c2-b22c-d0141efe8e01'] = {
    'date_list': [],
    'date_meeting':['ea94b4ad-ee17-447d-aa25-81fd1aea8bdf',
'fefbd3b3-b9d8-44a4-8d24-0dd25032e56c'],
    'title_list':[],
    'title_meeting':['ea94b4ad-ee17-447d-aa25-81fd1aea8bdf',
'fefbd3b3-b9d8-44a4-8d24-0dd25032e56c'],
    'urls':5,
}

for h in print_hist:
    if h in manual_annots:
        print_hist[h]['date_list'] = manual_annots[h]['date_list']
        print_hist[h]['date_meeting'] = manual_annots[h]['date_meeting']
        print_hist[h]['title_meeting'] = manual_annots[h]['title_list']
        print_hist[h]['title_list'] = manual_annots[h]['title_meeting']
        
    else:
        print_hist[h]['date_list'] = []
        print_hist[h]['date_meeting'] = []
        print_hist[h]['title_meeting'] = []
        print_hist[h]['title_list'] = []
        
#test
print_hist['009fd1d9-dc31-4086-8132-18c568b0db61']
print_hist

{'009fd1d9-dc31-4086-8132-18c568b0db61': {'id': '009fd1d9-dc31-4086-8132-18c568b0db61',
  'name': 'Besluitenhistorie participatie 25 juni 2019.pdf',
  'filesize': 72508,
  'list_ids': ['447b89ba-f685-4c4d-8485-d27d1c7cf899',
   'd1d3c755-14b0-4429-9f54-10f37506c1fe',
   '2b70f4a5-2e0b-46f5-8c0a-ac53554fe740',
   'f7ede071-b249-48fc-bf9f-796ba3fba65a',
   '28d38e1e-e95e-47cf-8a7d-9e55199e8c9a',
   '00678d55-effe-487a-9516-1f199d8b5a66',
   '00678d55-effe-487a-9516-1f199d8b5a66',
   '13956fc1-39c4-4aca-8adf-daa843686031',
   'ace4f4cb-e0f4-4cbf-b049-386d88128ecc',
   '71bf9d2b-9eee-4fb4-be28-ea843d013b76',
   '96e9f216-f723-4a77-b845-12953c54165a',
   '1e848d79-764d-467d-887a-c7418141cea8'],
  'meeting_ids': ['6df38124-4025-411d-92c0-7935fed2e7e2',
   '229adb78-0392-4d8e-8ff3-09180d747b5d'],
  'unparsed_urls': [],
  'attached_to': [],
  'date_list': ['447b89ba-f685-4c4d-8485-d27d1c7cf899',
   '38595779-13ed-447d-87c9-7b63f7cecd47',
   '6322c4d9-4d56-42af-b369-c8332794667d',
   '2b70f4a5-

# Test manual annotations

In [59]:
#PAPER:  manually noted URLs is all URLs, all found by extractor. Number after that is the number that contains ibabs and utrecht in URL
print(you'])
print()

for m in manual_annots:
    print(m)
    print(manual_annots[m]['urls'])
    print(len(histories[m]['url']))
    #print((histories[m]))
    print(histories[m]['urls_other'])
    print()
    
    


{'document id': '2e313e7a-dbc5-4547-8a9e-2e9de89eed24', 'origin id': '89186d4b-8e79-445e-897d-993e6156e8f7', 'origin type': 'entry', 'displayname': 'Raadsbrief Voortgang initiatiefvoorstel gelijke behandeling payrollers en motie 60 over ‘oneigenlijke’ werkervaringsplaatsen.docx', 'filename': 'Raadsbrief Voortgang initiatiefvoorstel gelijke behandeling payrollers en motie 60 over ‘oneigenlijke’ werkervaringsplaatsen.pdf', 'filesize': '221152', 'publicdownloadurl': 'https://api1.ibabs.eu/publicdownload.aspx?site=Utrecht&id=2e313e7a-dbc5-4547-8a9e-2e9de89eed24'}

009fd1d9-dc31-4086-8132-18c568b0db61
16
14
[]

03efecf7-9303-44c4-afd9-135a191c796d
2
1
[]

0454d040-0cc2-46a9-9edd-d1a8b652a59a
3
3
[]

06a12533-91b2-490f-aa93-04add0fd7c34
1
1
[]

09ab9fe9-e513-4bfa-98c0-c317540856c2
1
1
[]

0ac3d7d8-c7b7-4a6a-806e-0470af7bf8bf
2
3
['https://online.ibabs.eu/ibabsapi/publicdownload.aspx?site=Utrecht&id=2e313e7a-dbc5-4547-8a9e-2e9de89eed24', 'https://online.ibabs.eu/ibabsapi/publicdownload.aspx?s

In [52]:
TODO
* Manual annots
    * Structured raadsbrief?
    * Unstructured raadsbrief?


* What tagger do we use
    * Did we find all dates

* What is the temporal coverage etc

#let's compare three taggers?
#https://github.com/HeidelTime/heideltime  https://dbs.ifi.uni-heidelberg.de/resources/temporal-tagging/
#https://nlp.stanford.edu/software/sutime.shtml
#spacy




SyntaxError: invalid syntax (Temp/ipykernel_19528/214965076.py, line 2)