# Create and explore manually annotated testset of temporal expressions

In [1]:
# Load JSON besluithistories

import json
from copy import deepcopy

with open('besluithistorie.json') as f:
  besluitjson = json.load(f)

with open('besluitenhistorie.json') as f:
  besluitenjson = json.load(f)


histories = {}


def addHistories(items):
    for item in items:
        #print(item['tuple'][0]['attributes']['http://schema.org/identifier'])

        #if this is a document (as opposed to an Instrument, which is not a besluithistorie)
        if item['tuple'][0]['class'][0] == "http://utrecht.nl/data/Document":
            #temp history
            history = { "id": item['tuple'][0]['attributes']['http://schema.org/identifier'],
                       "name": item['tuple'][0]['attributes']['http://schema.org/name'],
                       "filesize": item['tuple'][0]['attributes']['http://schema.org/fileSize'],
                       "url": item['tuple'][0]['attributes']['http://schema.org/url']
                       }
            #print(history)

            if history['id'] in histories:
                print('already exists, ignored: ' + history['name'])
        #        histories[history['id']].append(deepcopy(history))
            else:
                histories[history['id']] = deepcopy(history)
        else:
            print('Item was not a document, but a ' + str(item['tuple'][0]['class']))


        #print()
        
addHistories(besluitjson['items'])
addHistories(besluitenjson['items'])

print(str(len(histories)) + ' histories found')

already exists, ignored: Besluitenhistorie Lombokplein.pdf
already exists, ignored: Bijlage 9 Besluitenhistorie Luchtkwaliteit.pdf
already exists, ignored: Bijlage De Besluitenhistorie Smakkelaarsveld
already exists, ignored: Bijlage Besluitenhistorie Hoge Woerd
Item was not a document, but a ['http://utrecht.nl/data/Brief', 'http://utrecht.nl/data/Instrument']
211 histories found


In [2]:
#remove duplicates
histories_clean = {}

def is_duplicate(hist1, hist2):
    if(hist1['name'] == hist2['name'] and hist1['filesize'] == hist2['filesize']):
        return True
    #elif hist1['name'] == hist2['name']:
        #print('almost')
        #print(hist1)
        #print(hist2)
        #print()
    else:
        return False

for old_hist in histories:
    is_dupe = False
    for new_hist in histories_clean:
        if is_duplicate(histories[old_hist], histories_clean[new_hist]):
            is_dupe = True
            break
    if not is_dupe:
        histories_clean[old_hist] = deepcopy(histories[old_hist])

print('There are ' + str(len(histories_clean)) + ' histories after removing exact duplicates, and up to 23 more near duplicates (same name).')



#we don't need the dupes
histories = histories_clean


#At this point there were 3 besluithistories with a filesize of 1 kb. These are corrupted files, and I deleted them
# Remove those from the dict:
histories.pop('6e95de96-42a2-4573-bc5d-344d80224fc0', None)
histories.pop('b0356fef-b081-4dda-9b98-6fcb8a4e6679', None)
histories.pop('665e32b9-7299-4f3c-a1e2-9c1765eb4803', None)

There are 175 histories after removing exact duplicates, and up to 23 more near duplicates (same name).


{'id': '665e32b9-7299-4f3c-a1e2-9c1765eb4803',
 'name': 'Bijlage 2 Besluitenhistorie Parkeergarage Jaarbeursplein',
 'filesize': 188,
 'url': 'https://api1.ibabs.eu/publicdownload.aspx?site=Utrecht&id=665e32b9-7299-4f3c-a1e2-9c1765eb4803'}

In [3]:
#download besluithistories
#import requests
#for history in histories:
#    r = requests.get(histories[history]['url'], allow_redirects=True)
#    open('./besluitdocs/' + histories[history]['id'] + '.pdf', 'wb').write(r.content)

In [4]:
# use tool recommended by daria for text content
#!pip install pdfminer

# use second tool for extracting urls
#!pip install pypdf2

In [5]:
# Now extract text from besluithistories

#test example from stackoverflow
#https://stackoverflow.com/questions/26494211/extracting-text-from-a-pdf-file-using-pdfminer-in-python

import io

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage


def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)



    fp.close()
    device.close()
    text = retstr.getvalue()
    retstr.close()
    return text

for h in histories:
    print(h)
    histories[h]['content'] = convert_pdf_to_txt('./besluitdocs/' + str(h) + '.pdf')
    

009fd1d9-dc31-4086-8132-18c568b0db61
03675b7f-3a66-4d12-878a-5caca2df82e9
03efecf7-9303-44c4-afd9-135a191c796d
0454d040-0cc2-46a9-9edd-d1a8b652a59a
04d84b2f-0a9f-4586-b783-280fe0167706
06a12533-91b2-490f-aa93-04add0fd7c34
09ab9fe9-e513-4bfa-98c0-c317540856c2
0ac3d7d8-c7b7-4a6a-806e-0470af7bf8bf
0cb87edf-2e48-433e-b473-f0282fc0edf6
0cda393f-f184-4fc5-91e3-7b92a0138841
0eb158a3-1a1a-4bb0-b52b-c1675a13c7f4
109fd4bd-47aa-4690-9254-bfa267ec7a8c
1100db10-7c28-45ed-bd31-c5b7db975b4b
117bfc54-d153-4d86-a230-4aa797c4cb56
148d9580-9951-4d74-af47-568561db68c9
14d03874-83b8-44aa-9e57-eda078f8613e
14facc8d-9b15-4466-bbe1-bf4717021d6d
1611e400-0d05-4561-89dc-bc0ecb3ab28b
175d3f8e-05f3-4d02-9d75-6ae30da279e1
19197683-24f0-4dcb-bd9d-52d09e2f45e6
1bf35fe9-9cc5-4af5-8fc4-6fc287db2bdc
1c0381ab-4939-4a52-98fd-558198f916ae
1d270fbb-4454-48c2-b22c-d0141efe8e01
1d7b4020-da37-4f28-ab2b-6e0ca301f27f
1d9d4a81-eb60-4306-95f8-a35b99d68f09
1e3838ba-3a09-4ccc-95f7-b9ad3ed9e951
1ed3c698-36c8-4457-8055-1893e289a805
2

In [6]:

# Now extract urls from each fulltext
# Because I'm too lazy to figure it out with pdfminer, I'm using another tool
# stackoverflow source https://stackoverflow.com/questions/27744210/extract-hyperlinks-from-pdf-in-python

import PyPDF2

def getURLs(path):
    ibabs_urls = []
    
    PDFFile = open(path,'rb')

    PDF = PyPDF2.PdfFileReader(PDFFile)
    pages = PDF.getNumPages()
    key = '/Annots'
    uri = '/URI'
    ank = '/A'

    for page in range(pages):
        print("Current Page: {}".format(page))
        
        pageSliced = PDF.getPage(page)
        pageObject = pageSliced.getObject()
        if key in pageObject.keys():
            ann = pageObject[key]
            for a in ann:
                u = a.getObject()
                #if uri in u[ank].keys():
                if ank in u and uri in u[ank].keys():
                    #SO it is an URL, now filter for ibabs URLs
                    newurl = u[ank][uri]
                    if 'ibabs' in newurl and 'trecht' in newurl:
                        ibabs_urls.append(u[ank][uri])
                        print(u[ank][uri])
    return ibabs_urls
                    
for h in histories:
    print(h)
    histories[h]['url'] = getURLs('./besluitdocs/' + str(h) + '.pdf')


009fd1d9-dc31-4086-8132-18c568b0db61
Current Page: 0
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/447b89ba-f685-4c4d-8485-d27d1c7cf899https:/ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/447b89ba-f685-4c4d-8485-d27d1c7cf899
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/d1d3c755-14b0-4429-9f54-10f37506c1fe
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/2b70f4a5-2e0b-46f5-8c0a-ac53554fe740
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/f7ede071-b249-48fc-bf9f-796ba3fba65a
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/28d38e1e-e95e-47cf-8a7d-9e55199e8c9a
Current Page: 1
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/00678d55-effe-487a-9516-1f199d8b5a66
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/00678d55-effe-487a-9516-1f199d8b5a66
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/13956fc1-39c4-4aca-8adf-daa843686031
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/ace4f4cb-e0f4-4cbf-b049-386d88128ecc
https://ris2.ibabs.eu/Reports/ViewListEntry



http://ibabsonline.eu/Agenda.aspx?site=utrecht&agendaid=4b9de60a-ad2a-461b-903a-a37cc44f7617&FoundIDs=A960_8056,D1c0eaa3a-2fb2-423d-8648-ed9b88a4d50b,D6b0405de-7e00-4194-b6f1-bae6049b3cb7,D8457fb37-1e42-4229-ae11-aa7bd651dacd,Deb467caa-2651-4ba7-ad06-c735d84b016b
http://ibabsonline.eu/LijstDetails.aspx?site=Utrecht&ListId=41b35454-6329-4f11-b803-d7e5a6141085&ReportId=547701b1-04af-4e12-b84a-4cca1b29d943&EntryId=caa6f515-5c6d-40c3-a05f-b5efcc92689a&searchtext=hardebollenstraat
http://ibabsonline.eu/LijstDetails.aspx?site=Utrecht&ListId=1a44fa8b-87b1-4076-bcbc-2e4da459ad2b&ReportId=6d335650-40af-4a17-abf6-b33ca18b6a19&EntryId=f3783409-6505-4c18-8222-8485409b3d95&searchtext=
http://ibabsonline.eu/LijstDetails.aspx?site=Utrecht&ListId=41b35454-6329-4f11-b803-d7e5a6141085&ReportId=547701b1-04af-4e12-b84a-4cca1b29d943&EntryId=b5023f73-de6e-4c19-b595-81868550aa9b&searchtext=
http://ibabsonline.eu/LijstDetails.aspx?site=Utrecht&ListId=41b35454-6329-4f11-b803-d7e5a6141085&ReportId=547701b1-04af

Current Page: 4
Current Page: 5
e509c7cb-998d-4f71-858a-c131e4d6ce79
Current Page: 0
Current Page: 1
Current Page: 2
Current Page: 3
Current Page: 4
Current Page: 5
e638d5f8-f0e8-44b0-9168-bde292150c73
Current Page: 0
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/8a5a9465-ab09-40b6-a284-6916a7091794
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/698380c7-09d7-44c0-bdb5-eea3d8b87521
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/6fadde87-2092-45b5-bade-15069c76cb52
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/8e1519fb-1adc-4741-ae64-a83b5775f343
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/b1c064f9-ce79-4ddb-81c2-923b88977ca6
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/8e681350-080c-4856-9b78-694fc1f660bb
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/25090a3d-c0d7-4d73-982c-24f46dca6ce1
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/e21a591a-ef03-425c-9a1d-50eb6dcc4a84
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/551c219b-f429-4cac

In [7]:
# Used e.g. when we're looking for the attached agenda item of a besluithistorie
import pickle

#Load the knowledge base
path = "C:/Users/tmsch/Desktop/ADS_Thesis/Thomas/"
with open(path + "meetingsInformation.txt", "rb") as fp:   
    meetingsInformation = pickle.load(fp)
with open(path + "itemsInformation.txt", "rb") as fp:   
    itemsInformation = pickle.load(fp)
with open(path + "entriesInformation.txt", "rb") as fp:   
    entriesInformation = pickle.load(fp)

with open(path + "documentsInformation.txt", "rb") as fp:   
    documentsInformation = pickle.load(fp)    
with open(path + "listdocumentsInformation.txt", "rb") as fp:   
    listdocumentsInformation = pickle.load(fp)
with open(path + "meetdocumentsInformation.txt", "rb") as fp:   
    meetdocumentsInformation = pickle.load(fp)

In [8]:
#Let's sort through the urls we found
count = 0
listcount = 0
agendacount = 0
doccount = 0
listdoccount = 0

unknowns = 0

listmissing = 0
agendamissing = 0
docmissing = 0
probcount = 0

for h in histories:
    histories[h]['urls_list'] = []
    histories[h]['urls_meeting'] = []
    histories[h]['urls_docs'] = []
    histories[h]['urls_listdocs'] = []
    histories[h]['urls_other'] = []
    
    count += len(histories[h]['url'])
    
    if isinstance(histories[h]['url'], str):
        histories[h]['url'] = [histories[h]['url']]
        
    for url in histories[h]['url']:
        #list entries always have a nice URL format
        if 'Reports/ViewListEntry/Utrecht' in url:
            histories[h]['urls_list'].append(url.split("/")[-1])
            listcount += 1
            
            #Test if this file is in our data
            if url.split("/")[-1] not in entriesInformation:
                listmissing += 1
            
        #meeting ids are also formatted nicely  
        elif 'Agenda/Details/Utrecht/' in url:
            histories[h]['urls_meeting'].append(url.split("/")[-1])
            agendacount += 1
            
            #Test if this file is in our data
            if url.split("/")[-1] not in meetingsInformation:
                agendamissing += 1
                
        elif 'publicdownload.aspx?site=' in url:
            
            #Test if this file is in our data
            if url.split("=")[-1] not in listdocumentsInformation:
                    if url.split("=")[-1] not in documentsInformation:
                        docmissing += 1
                        
                        print('PROBLEM?')
                        print(url.split("=")[-1])
                        probcount += 1
                    else:
                        doccount += 1
                        histories[h]['urls_listdocs'].append(url.split("=")[-1])

            else:
                listdoccount += 1
                histories[h]['urls_listdocs'].append(url.split("=")[-1])

        elif 'LijstDetails.aspx?site=' in url:
            histories[h]['urls_list'].append(url.split("=")[-1])
            listcount += 1
            
            #Test if this file is in our data
            if url.split("=")[-1] not in entriesInformation:
                listmissing += 1
                
        #special case we identified
        elif '4b9de60a-ad2a-461b-903a-a37cc44f7617' in url:
            histories[h]['urls_meeting'].append('4b9de60a-ad2a-461b-903a-a37cc44f7617')
            agendacount += 1

        else:
            unknowns += 1
            print('not sure, ignoring for now')
            print(url)
            histories[h]['urls_other'].append(url)

#        print(histories[h]['url'])
    
print(str(listcount) + ' lists')
print(str(agendacount) + ' meetings')
print(str(doccount) + ' documents')
print(str(listdoccount) + ' listdocuments')
print()
print(str(unknowns) + ' unknowns')

print(str(probcount) + ' problems')
print(str(listcount+agendacount+doccount+listdoccount) + ' combined')

print(str(count) + ' total ibabs urls found')

print()
print('Of those meetings and list entries we are missing ids: ' + str(listmissing) + ' lists and ' + str(agendamissing) + ' meetings')

not sure, ignoring for now
http://ibabsonline.eu/Agenda.aspx?site=utrecht&agendaid=356&FoundIDs=A1707,Dc3ce4d3f-cd2d-43fc-b6bc-a0cd4696bb3f,D66ad8de5-b414-4657-b02d-6d87951e8af7
not sure, ignoring for now
http://ibabsonline.eu/Agenda.aspx?site=utrecht&agendaid=319&FoundIDs=&year=2014
not sure, ignoring for now
http://ibabsonline.eu/Agenda.aspx?site=utrecht&agendaid=324&FoundIDs=&year=2014
not sure, ignoring for now
http://ibabsonline.eu/Agenda.aspx?site=utrecht&agendaid=290&FoundIDs=A1707,Dc3ce4d3f-cd2d-43fc-b6bc-a0cd4696bb3f
not sure, ignoring for now
http://ibabsonline.eu/Agenda.aspx?site=utrecht&agendaid=570&FoundIDs=&year=2015
394 lists
128 meetings
5 documents
11 listdocuments

5 unknowns
0 problems
538 combined
543 total ibabs urls found

Of those meetings and list entries we are missing ids: 66 lists and 7 meetings


In [9]:
#prep for writing to file
print_hist = {}
for h in histories:
    print_hist[h] = {'id': histories[h]['id'],
                    'name': histories[h]['name'],
                    'filesize': histories[h]['filesize'],
                    'list_ids': histories[h]['urls_list'],
                    'meeting_ids': histories[h]['urls_meeting'],
                    'unparsed_urls': histories[h]['urls_other'],
                    'attached_to': []
                    }
print(print_hist)

{'009fd1d9-dc31-4086-8132-18c568b0db61': {'id': '009fd1d9-dc31-4086-8132-18c568b0db61', 'name': 'Besluitenhistorie participatie 25 juni 2019.pdf', 'filesize': 72508, 'list_ids': ['447b89ba-f685-4c4d-8485-d27d1c7cf899', 'd1d3c755-14b0-4429-9f54-10f37506c1fe', '2b70f4a5-2e0b-46f5-8c0a-ac53554fe740', 'f7ede071-b249-48fc-bf9f-796ba3fba65a', '28d38e1e-e95e-47cf-8a7d-9e55199e8c9a', '00678d55-effe-487a-9516-1f199d8b5a66', '00678d55-effe-487a-9516-1f199d8b5a66', '13956fc1-39c4-4aca-8adf-daa843686031', 'ace4f4cb-e0f4-4cbf-b049-386d88128ecc', '71bf9d2b-9eee-4fb4-be28-ea843d013b76', '96e9f216-f723-4a77-b845-12953c54165a', '1e848d79-764d-467d-887a-c7418141cea8'], 'meeting_ids': ['6df38124-4025-411d-92c0-7935fed2e7e2', '229adb78-0392-4d8e-8ff3-09180d747b5d'], 'unparsed_urls': [], 'attached_to': []}, '03675b7f-3a66-4d12-878a-5caca2df82e9': {'id': '03675b7f-3a66-4d12-878a-5caca2df82e9', 'name': 'Bijlage 1 Besluitenhistorie Stadskantoor', 'filesize': 135780, 'list_ids': [], 'meeting_ids': [], 'unparse

In [10]:
# 10 decision histories

manual_annots = {}
#How this is structured
#manual_annots['doc id besluithistorie']
#  date_list = list of entry id's that were referenced by date
#  date_meeting = list of meeting id's that  were referenced by date
#  title_list = list of entry id's that were referenced by title
#  title_meeting = list of meeting id's that were referenced by title



manual_annots['009fd1d9-dc31-4086-8132-18c568b0db61'] = {
    'date_list': ['447b89ba-f685-4c4d-8485-d27d1c7cf899',
'38595779-13ed-447d-87c9-7b63f7cecd47',
'6322c4d9-4d56-42af-b369-c8332794667d',
'2b70f4a5-2e0b-46f5-8c0a-ac53554fe740',
'9b8c1cf2-f763-432e-b6a1-d778245861fe',
'28d38e1e-e95e-47cf-8a7d-9e55199e8c9a',
'd82ffa13-90d8-4294-b134-bd6f79143cff',
'00678d55-effe-487a-9516-1f199d8b5a66',
'e5aa2282-d2c0-4fd3-a1be-c5e69266bd30',
'ace4f4cb-e0f4-4cbf-b049-386d88128ecc',
'71bf9d2b-9eee-4fb4-be28-ea843d013b76',
'96e9f216-f723-4a77-b845-12953c54165a',
'd82ffa13-90d8-4294-b134-bd6f79143cff'],
    'date_meeting':['37e5d134-0c6c-4795-abac-9f6f468ff483',
'772f6c47-c89b-4b21-9e74-57742dd26464',
'6df38124-4025-411d-92c0-7935fed2e7e2',
'6df38124-4025-411d-92c0-7935fed2e7e2',
'229adb78-0392-4d8e-8ff3-09180d747b5d',
'229adb78-0392-4d8e-8ff3-09180d747b5d',
'ba044cb3-80b0-4c7b-a77b-aa6306c3b6b2'],
    'title_list':['447b89ba-f685-4c4d-8485-d27d1c7cf899',
'447b89ba-f685-4c4d-8485-d27d1c7cf899',
'6322c4d9-4d56-42af-b369-c8332794667d',
'2b70f4a5-2e0b-46f5-8c0a-ac53554fe740',
'9b8c1cf2-f763-432e-b6a1-d778245861fe',
'28d38e1e-e95e-47cf-8a7d-9e55199e8c9a',
'd82ffa13-90d8-4294-b134-bd6f79143cff',
'00678d55-effe-487a-9516-1f199d8b5a66',
'00678d55-effe-487a-9516-1f199d8b5a66',
'e5aa2282-d2c0-4fd3-a1be-c5e69266bd30',
'e5aa2282-d2c0-4fd3-a1be-c5e69266bd30',
'ace4f4cb-e0f4-4cbf-b049-386d88128ecc',
'd82ffa13-90d8-4294-b134-bd6f79143cff',
'ace4f4cb-e0f4-4cbf-b049-386d88128ecc',
'71bf9d2b-9eee-4fb4-be28-ea843d013b76',
'71bf9d2b-9eee-4fb4-be28-ea843d013b76',
'71bf9d2b-9eee-4fb4-be28-ea843d013b76',
'96e9f216-f723-4a77-b845-12953c54165a',
'd82ffa13-90d8-4294-b134-bd6f79143cff',
'd82ffa13-90d8-4294-b134-bd6f79143cff'],
    'title_meeting':['6df38124-4025-411d-92c0-7935fed2e7e2',
'229adb78-0392-4d8e-8ff3-09180d747b5d',
'ba044cb3-80b0-4c7b-a77b-aa6306c3b6b2'],
    'urls':16,
    'other_dates':0,
    'future_dates':0,
    'titles':3
}
manual_annots['03efecf7-9303-44c4-afd9-135a191c796d'] = {
    'date_list': [],
    'date_meeting':['b548d7ec-bfff-49cc-8017-caeccd3cbbb8'],
    'title_list':[],
    'title_meeting':['b548d7ec-bfff-49cc-8017-caeccd3cbbb8'],
    'urls':2,
    'other_dates':0,
    'future_dates':0,
    'titles':0
}
manual_annots['0454d040-0cc2-46a9-9edd-d1a8b652a59a'] = {
    'date_list': ['9b5177d4-f3cd-43d7-81fe-e49a7788aa3a',
'12235406-ebe7-4041-a837-57f4e9ba8827',
'dfdbcc33-1aab-4b3b-ab4f-739ccfcedea7'],
    'date_meeting':[],
    'title_list':['9b5177d4-f3cd-43d7-81fe-e49a7788aa3a',
'12235406-ebe7-4041-a837-57f4e9ba8827',
'dfdbcc33-1aab-4b3b-ab4f-739ccfcedea7'],
    'title_meeting':[],
    'urls':3,
    'other_dates':0,
    'future_dates':0,
    'titles':0
}
manual_annots['06a12533-91b2-490f-aa93-04add0fd7c34'] = {
    'date_list': [],
    'date_meeting':['f3d08622-7a45-475b-afc9-68e96ab49b22'],
    'title_list':[],
    'title_meeting':['f3d08622-7a45-475b-afc9-68e96ab49b22'],
    'urls':1,
    'other_dates':0,
    'future_dates':0,
    'titles':0
}
manual_annots['09ab9fe9-e513-4bfa-98c0-c317540856c2'] = {
    'date_list': [],
    'date_meeting':['a792bc6b-1fdb-4c70-a13b-88f2ee4fcb9f'],
    'title_list':[],
    'title_meeting':['a792bc6b-1fdb-4c70-a13b-88f2ee4fcb9f'],
    'urls':1,
    'other_dates':0,
    'future_dates':0,
    'titles':0
}
manual_annots['0ac3d7d8-c7b7-4a6a-806e-0470af7bf8bf'] = {
    'date_list': ['89186d4b-8e79-445e-897d-993e6156e8f7',
'c44c7ef6-5bbb-4455-8149-3bcfd13936ed'],
    'date_meeting':['fefbd3b3-b9d8-44a4-8d24-0dd25032e56c'],
    'title_list':['89186d4b-8e79-445e-897d-993e6156e8f7'],
    'title_meeting':['fefbd3b3-b9d8-44a4-8d24-0dd25032e56c'],
    'urls':2,
    'other_dates':0,
    'future_dates':0,
    'titles':0
}
manual_annots['19197683-24f0-4dcb-bd9d-52d09e2f45e6'] = {
    'date_list': ['9f64b943-1ea3-45be-9cdd-5451c0e734b9',
'031c089b-c9f6-423c-a353-5f8994b78afe',
'f6d888fa-b118-4d7e-a01e-34f2cf94e0e9',
'0b2bd741-f8a8-44b5-a820-e9f6c7554522'],
    'date_meeting':['2f481393-776f-4a98-b65c-7dab102929af'],
    'title_list':['9f64b943-1ea3-45be-9cdd-5451c0e734b9',
'031c089b-c9f6-423c-a353-5f8994b78afe',
'f6d888fa-b118-4d7e-a01e-34f2cf94e0e9',
'0b2bd741-f8a8-44b5-a820-e9f6c7554522'],
    'title_meeting':['2f481393-776f-4a98-b65c-7dab102929af'],
    'urls':5,
    'other_dates':0,
    'future_dates':0,
    'titles':0
}
manual_annots['1bf35fe9-9cc5-4af5-8fc4-6fc287db2bdc'] = {
    'date_list': ['3a2a86e4-c76e-4166-a17e-1a63066bf26f',
'39fc8a83-9fa8-4d27-9d91-b892840d3358'],
    'date_meeting':['fe93132e-f5ce-436f-8aee-623c02548a70',
'faa222bc-c31b-4b92-98a3-1c85be8c3c83'],
    'title_list':[],
    'title_meeting':['fe93132e-f5ce-436f-8aee-623c02548a70',
'faa222bc-c31b-4b92-98a3-1c85be8c3c83'],
    'urls':4,
    'other_dates':0,
    'future_dates':0,
    'titles':0
}
manual_annots['1c0381ab-4939-4a52-98fd-558198f916ae'] = {
    'date_list': ['644f64f7-1cb3-4045-9919-91bc7aa7d330',
'9a3e5c6d-4ff0-4bf9-a0aa-15f1634b8981',
'd5fc7a26-d5b3-4f8d-8e2c-7684932f3479'],
    'date_meeting':['cb9690c8-9940-4b26-86ca-270e94850b1f',
'd998a211-0c87-4381-8bbe-2f652ea4de3a',
'e01a664c-c484-4507-b634-3891fef46b22'],
    'title_list':['644f64f7-1cb3-4045-9919-91bc7aa7d330',
'9a3e5c6d-4ff0-4bf9-a0aa-15f1634b8981',
'd5fc7a26-d5b3-4f8d-8e2c-7684932f3479'],
    'title_meeting':['cb9690c8-9940-4b26-86ca-270e94850b1f',
'd998a211-0c87-4381-8bbe-2f652ea4de3a',
'e01a664c-c484-4507-b634-3891fef46b22'],
    'urls':6,
    'other_dates':0,
    'future_dates':0,
    'titles':3
}
manual_annots['1d270fbb-4454-48c2-b22c-d0141efe8e01'] = {
    'date_list': [],
    'date_meeting':['ea94b4ad-ee17-447d-aa25-81fd1aea8bdf',
'fefbd3b3-b9d8-44a4-8d24-0dd25032e56c'],
    'title_list':[],
    'title_meeting':['ea94b4ad-ee17-447d-aa25-81fd1aea8bdf',
'fefbd3b3-b9d8-44a4-8d24-0dd25032e56c'],
    'urls':5,
    'other_dates':0,
    'future_dates':0,
    'titles':0
}

for h in print_hist:
    if h in manual_annots:
        print_hist[h]['date_list'] = manual_annots[h]['date_list']
        print_hist[h]['date_meeting'] = manual_annots[h]['date_meeting']
        print_hist[h]['title_meeting'] = manual_annots[h]['title_list']
        print_hist[h]['title_list'] = manual_annots[h]['title_meeting']
        
    else:
        print_hist[h]['date_list'] = []
        print_hist[h]['date_meeting'] = []
        print_hist[h]['title_meeting'] = []
        print_hist[h]['title_list'] = []
        
#test
print_hist['009fd1d9-dc31-4086-8132-18c568b0db61']
print_hist

{'009fd1d9-dc31-4086-8132-18c568b0db61': {'id': '009fd1d9-dc31-4086-8132-18c568b0db61',
  'name': 'Besluitenhistorie participatie 25 juni 2019.pdf',
  'filesize': 72508,
  'list_ids': ['447b89ba-f685-4c4d-8485-d27d1c7cf899',
   'd1d3c755-14b0-4429-9f54-10f37506c1fe',
   '2b70f4a5-2e0b-46f5-8c0a-ac53554fe740',
   'f7ede071-b249-48fc-bf9f-796ba3fba65a',
   '28d38e1e-e95e-47cf-8a7d-9e55199e8c9a',
   '00678d55-effe-487a-9516-1f199d8b5a66',
   '00678d55-effe-487a-9516-1f199d8b5a66',
   '13956fc1-39c4-4aca-8adf-daa843686031',
   'ace4f4cb-e0f4-4cbf-b049-386d88128ecc',
   '71bf9d2b-9eee-4fb4-be28-ea843d013b76',
   '96e9f216-f723-4a77-b845-12953c54165a',
   '1e848d79-764d-467d-887a-c7418141cea8'],
  'meeting_ids': ['6df38124-4025-411d-92c0-7935fed2e7e2',
   '229adb78-0392-4d8e-8ff3-09180d747b5d'],
  'unparsed_urls': [],
  'attached_to': [],
  'date_list': ['447b89ba-f685-4c4d-8485-d27d1c7cf899',
   '38595779-13ed-447d-87c9-7b63f7cecd47',
   '6322c4d9-4d56-42af-b369-c8332794667d',
   '2b70f4a5-

# Test manual annotations

In [11]:
print(entriesInformation['1e01d6e2-50b5-4184-9c31-477869fe791b'])

manual_memos = {}

manual_memos['85934e45-f4d5-4a82-b0ea-8a0ec793eb3d'] = {
    'date_list':[],
    'date_meeting':[],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':1,
    'urls':0,
    'titles':0
}

manual_memos['176a77b4-323e-4c0c-ada1-4d848f35d6ab'] = {
    'date_list':[],
    'date_meeting':['308e048f-17be-48e9-996d-869cce130b60'],
    'title_list':[],
    'title_meeting':['308e048f-17be-48e9-996d-869cce130b60'],
    'other_dates':1,
    'future_dates':0,
    'urls':1,
    'titles':1
}

manual_memos['722c0818-6533-4e16-98db-86a142ef41f5'] = {
    'date_list':[],
    'date_meeting':['1b788f56-d02e-4901-b8cb-51355ac10709'],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':0,
    'urls':1,
    'titles':0
}


manual_memos['5ccf47af-0cba-4e6a-b02f-8a79c3af7012'] = {
    'date_list':[],
    'date_meeting':['9242bf68-2597-4206-9a7f-c8bb77682ab6'],
    'title_list':[],
    'title_meeting':['9242bf68-2597-4206-9a7f-c8bb77682ab6'],  #raadsvoorstel
    'other_dates':0,
    'future_dates':1,
    'urls':1,
    'titles':0
}

manual_memos['391d005b-8350-47f6-8bfa-313cc852dea6'] = {
    'date_list':['290ac9de-dff0-425e-b95f-d6f72dee386d'],
    'date_meeting':[],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':1,
    'urls':1,
    'titles':0
}

manual_memos['ef97a283-2eb1-485d-aa20-9c4a5687587e'] = {
    'date_list':['1d0aa951-0d7f-4d90-888a-8946976cf955'],
    'date_meeting':['9e2a36f6-1327-4702-ba7f-2cb31642db3f'],
    'title_list':['1d0aa951-0d7f-4d90-888a-8946976cf955'],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':0,
    'urls':2,
    'titles':1
}

manual_memos['d5a871fb-8284-49e5-9087-8aec710b2c0d'] = {
    'date_list':['da702218-c910-46bc-bf1b-c7665066c515'],
    'date_meeting':[''],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':0,
    'urls':1,
    'titles':0
}

manual_memos['02ba1f30-6bba-4fdf-9c97-db215fc8280b'] = {
    'date_list':[],
    'date_meeting':['aa8b7367-6100-47fd-9c3f-785c242d68ae'],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':0,
    'urls':0,
    'titles':0
}

manual_memos['85934e45-f4d5-4a82-b0ea-8a0ec793eb3d'] = {
    'date_list':[],
    'date_meeting':[],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':3,
    'urls':0,
    'titles':0
}

manual_memos['546b2a33-1671-4246-a320-520f25e10fd2'] = {
    'date_list':[''],
    'date_meeting':[],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':1,
    'urls':0,
    'titles':0
}





{'entry id': '1e01d6e2-50b5-4184-9c31-477869fe791b', 'doctype': "Memo's", 'ID': '804', 'Onderwerp': 'Memo Uitstel beantwoording SV 2021, 56 Duurzaam fietspad in duigen', 'Datum invoer': 'Mar 12 2021 12:00AM', 'Portefeuillehouder': 'wethouder Eerenberg', 'Beleidsveld': 'Stationsgebied', 'Opmerking': None, 'Documents': ['aa84a55a-aca0-4a44-bb93-4ea8cf9c9e0d']}


In [12]:
#print(entriesInformation['1e01d6e2-50b5-4184-9c31-477869fe791b'])

manual_brieven = {}

manual_brieven['6ffbfcd6-47c6-45d4-9572-52582985712d'] = {
    'date_list':[],
    'date_meeting':['d6095083-7656-4f0b-b0ed-1c35db387db4'],
    'title_list':['9adb1e1f-0f3d-4f46-bcd6-0f61962e3137', 'bb6ef059-5a8b-415b-9bae-b5394c28d2c0'],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':0,
    'urls':3,
    'titles':3
}

manual_brieven['1328dfaf-5593-42b3-9a9b-5a0f5d156c3e'] = {
    'date_list':[],
    'date_meeting':[],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':0,
    'urls':2,
    'titles':0
}

manual_brieven['551ce314-c09f-480e-a06e-94e17df558ae'] = {
    'date_list':['ca67cee7-a65f-493c-88ab-22985e14d5a0'],
    'date_meeting':[''],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':2,
    'urls':0,
    'titles':1
}

manual_brieven['07476354-b558-4b4b-93c2-17ff67dc859e'] = {
    'date_list':[],
    'date_meeting':['27ca45d9-183a-4ec6-a7d7-e397358811b7'],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':0,
    'urls':0,
    'titles':0
}
                    
                    
manual_brieven['8a603db9-757f-4f7f-b083-86f72470a226'] = {
    'date_list':['3dcecafb-9ae1-4b05-a2c2-d1e477209067'],
    'date_meeting':[],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':0,
    'urls':0,
    'titles':0
}
                    
                    
manual_brieven['3ef920e4-b4f8-4f99-a31f-59ea51079a3c'] = {
    'date_list':[],
    'date_meeting':['08e5a1a3-954c-42f5-b469-f2ad6fd231da'],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':0,
    'urls':0,
    'titles':0
}                    

manual_brieven['0b6db332-f1fc-4ca7-a85e-3420c17c1e81'] = {
    'date_list':['90347a91-98e2-49d1-90f8-a991dbe49f81'],
    'date_meeting':['e23e61e9-14a9-47bb-a1bc-7b8f78983038', '43331f54-e9b9-4f9f-9255-ce52129f4e73'],
    'title_list':['90347a91-98e2-49d1-90f8-a991dbe49f81'],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':0,
    'urls':0,
    'titles':1
}                    

                    
                    
manual_brieven['c052528d-1c05-4fbb-9bc3-f4240f6f1724'] = {
    'date_list':[],
    'date_meeting':[],
    'title_list':[],
    'title_meeting':[],
    'other_dates':2,
    'future_dates':2,
    'urls':0,
    'titles':0
}    
                    
manual_brieven['56456183-3799-482d-94e0-40a784d6df39'] = {
    'date_list':['555bfb23-efb7-48fc-b7a4-f624d8815c02'],
    'date_meeting':[],
    'title_list':[],
    'title_meeting':[],
    'other_dates':0,
    'future_dates':1,
    'urls':0,
    'titles':0
}
                    
                    
manual_brieven['d98692b5-6eca-4291-8c5d-d437cc27ac6e'] = {
    'date_list':[],
    'date_meeting':['5ad6000d-6eb7-41cc-b849-abad712ab8ff', '4edbf69b-5e32-4d63-9045-9967016ccec6', '5ad6000d-6eb7-41cc-b849-abad712ab8ff'],
    'title_list':[],    #initiatief
    'title_meeting':['372ce9b7-5283-4546-bfd6-f4f3d5ea82cf'],
    'other_dates':0,
    'future_dates':0,
    'urls':0,
    'titles':1
}

In [13]:
#PAPER:  manually noted URLs is all URLs, all found by extractor. Number after that is the number that contains ibabs and utrecht in URL
#print(you'])
#print()

for m in manual_annots:
    print(m)
    print(manual_annots[m]['urls'])
    print(len(histories[m]['url']))
    #print((histories[m]))
    print(histories[m]['urls_other'])
    print()
    
    


009fd1d9-dc31-4086-8132-18c568b0db61
16
14
[]

03efecf7-9303-44c4-afd9-135a191c796d
2
1
[]

0454d040-0cc2-46a9-9edd-d1a8b652a59a
3
3
[]

06a12533-91b2-490f-aa93-04add0fd7c34
1
1
[]

09ab9fe9-e513-4bfa-98c0-c317540856c2
1
1
[]

0ac3d7d8-c7b7-4a6a-806e-0470af7bf8bf
2
3
[]

19197683-24f0-4dcb-bd9d-52d09e2f45e6
5
5
[]

1bf35fe9-9cc5-4af5-8fc4-6fc287db2bdc
4
4
[]

1c0381ab-4939-4a52-98fd-558198f916ae
6
6
[]

1d270fbb-4454-48c2-b22c-d0141efe8e01
5
1
[]



In [16]:
#we should also add the url ids sorted by type..

def urlTypes(manual_set, h):
    manual_set[h]['urls_list'] = histories[h]['urls_list']
    manual_set[h]['urls_meeting'] = histories[h]['urls_meeting']
    manual_set[h]['urls_docs'] = histories[h]['urls_docs']
    manual_set[h]['urls_listdocs'] = histories[h]['urls_listdocs']
    manual_set[h]['urls_other'] = histories[h]['urls_other']
    
for h in manual_brieven:
    manual_brieven[h]['url'] = getURLs('./brieven/' + str(h) + '.pdf')
    #urlTypes(manual_brieven, h)
for h in manual_memos:
    manual_memos[h]['url'] = getURLs('./memos/' + str(h) + '.pdf')
    #urlTypes(manual_memos, h)
    
for h in manual_annots:
    manual_annots[h]['url'] = getURLs('./besluitdocs/' + str(h) + '.pdf')
    urlTypes(manual_annots, h)


Current Page: 0
https://ris2.ibabs.eu/Agenda/Details/Utrecht/d6095083-7656-4f0b-b0ed-1c35db387db4
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/9adb1e1f-0f3d-4f46-bcd6-0f61962e3137
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/bb6ef059-5a8b-415b-9bae-b5394c28d2c0
Current Page: 1
Current Page: 2
Current Page: 0
https://ris2.ibabs.eu/Agenda/Details/Utrecht/bcc776a5-dd26-4bfd-a866-a43d5eab6f4d
https://ris2.ibabs.eu/Agenda/Details/Utrecht/faa222bc-c31b-4b92-98a3-1c85be8c3c83
Current Page: 1
Current Page: 0
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/ca67cee7-a65f-493c-88ab-22985e14d5a0
Current Page: 1
Current Page: 0
https://ris2.ibabs.eu/Agenda/Details/Utrecht/27ca45d9-183a-4ec6-a7d7-e397358811b7
Current Page: 1
Current Page: 0
Current Page: 1
Current Page: 2
Current Page: 0
Current Page: 1
Current Page: 2
Current Page: 0
https://ris2.ibabs.eu/Reports/ViewListEntry/Utrecht/90347a91-98e2-49d1-90f8-a991dbe49f81
https://ris2.ibabs.eu/Agenda/Details/Utrecht/e23e61e9-14a9-4

In [17]:
#store manually annotated set
testset = {'memos': manual_memos,
          'brieven': manual_brieven,
          'histories': manual_annots}
print(len(testset['histories']))

import json
with open('28-3-testset.json', 'w') as outfile:
    json.dump(testset, outfile)

10


In [52]:
* What tagger do we use
    * Did we find all dates
    
* Inhoudelijk
    * Get year from document uplaod!  if not in date or in sentence
    * Vind de eerste datum die /niet/ de datum van het document zelf is!
    * dataset gekozen door van een willekeurige pagina een set van 10 items te pakken
        * alsnog enorme temporal coverage!
    * our sample did not include antwoorden to schriftleijke vragen
    
    raadsvojorstellen moet je ook anders modelleren (is niet een entry of een meeting)

    * memos verwijzen meestal naar meetings, soms naar raadsvoorstellen en af en toe naar brieven
    
    * meerdere vergaderingen op dezelfde dag
    
    * reference by title is te disambigueren voor entries maybe - maar niet ovor agendapunten '372ce9b7-5283-4546-bfd6-f4f3d5ea82cf'
    
* What is the temporal coverage etc
    * references to the future are not to meetings

#let's compare three taggers?
#https://github.com/HeidelTime/heideltime  https://dbs.ifi.uni-heidelberg.de/resources/temporal-tagging/
#https://nlp.stanford.edu/software/sutime.shtml
#spacy




SyntaxError: invalid syntax (Temp/ipykernel_19528/214965076.py, line 2)

In [None]:
#Let's test title-based extraction!
#Approach 1: Make list of unambiguous document titles, see if full title in document
#Approach 2: Extract quoted items from documents `Onderwerp brief' and see if it is in one of the title lists

In [None]:
Vragen
* Hoe extract je titles tussen verschillende aanhalingstekens