In [1]:
import sys
!{sys.executable} -m pip install edtf

You should consider upgrading via the '/Users/fkraeutli/anaconda/bin/python -m pip install --upgrade pip' command.


In [2]:
import csv
import json
import random
import re
from collections import OrderedDict
from edtf import parse_edtf

In [3]:
# Path of the source export. It is parsed with json.load further down, so the
# file is expected to contain JSON despite its "-tsv.txt" suffix.
inputFile = "../input/sari_abzug-utf-8_23_04-tsv.txt"
# Directory that receives the generated "dateVariants-<field>.csv" files.
# NOTE(review): nothing visible in this notebook creates it — presumably it must already exist.
output = "./output"

In [4]:
# Load the export as JSON; later cells read the records from rawData['rows'].
# The encoding is pinned to UTF-8 (as the file name advertises) so that decoding
# does not depend on the platform's default locale.
with open(inputFile, 'r', encoding='utf-8') as f:
    rawData = json.load(f)

In [5]:
# Record keys whose values are scanned for date strings.
# NOTE(review): the "NNN$x" form looks like MARC field/subfield tags — confirm against the export schema.
datefields = ['100$d', '260$c', '260$g', '264$c', '533$d', '600$d', '611$d', '700$d']

In [6]:
# Regex fragments used to reduce concrete date strings to generic patterns.
# Each fragment is a plain alternation; flags such as re.IGNORECASE are chosen
# by the code that applies it. Several month names are shared between languages
# (e.g. "April", "Mai"/"mai", "August"), so which language a month is attributed
# to depends on the order in which the fragments are applied.
monthsGermanFull = r'(Januar)|(Februar)|(März)|(April)|(Mai)|(Juni)|(Juli)|(August)|(September)|(Oktober)|(November)|(Dezember)'
monthsGermanAbr = r'(Jan)|(Feb)|(Mär)|(Apr)|(Mai)|(Jun)|(Jul)|(Aug)|(Sept)|(Okt)|(Nov)|(Dez)'
# "août" is accepted with and without the circumflex.
monthsFrenchFull = r'(janvier)|(février)|(mars)|(avril)|(mai)|(juin)|(juillet)|(ao[uû]t)|(septembre)|(octobre)|(novembre)|(décembre)'
monthsFrenchAbr = r'(janv)|(févr)|(mars)|(avril)|(avr)|(mai)|(juin)|(juil)|(ao[uû]t)|(ao[uû])|(sept)|(oct)|(nov)|(déc)'
monthsEnglishFull = r'(January)|(February)|(March)|(April)|(May)|(June)|(July)|(August)|(September)|(October)|(November)|(December)'
monthsEnglishAbr = r'(Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec)'
# Roman numerals I-XII. Regex alternation takes the first alternative that
# matches, so every numeral is listed before any numeral that is a prefix of it
# (VIII before VII before VI before V, ...); otherwise "VII" would be consumed
# as "VI" followed by "I".
numeralRomans = r'VIII|VII|VI|V|XII|XI|X|IX|IV|III|II|I'
# ASCII letters only. The range "A-z" would also cover the punctuation between
# "Z" and "a" in ASCII ("[", "\", "]", "^", "_", "`"), including the "_" digit placeholder.
remaining = r'[A-Za-z]'
# Markers used in the data for unknown digits.
unknowns = r'XX|xx'

In [7]:
def _generalizeDate(dateString):
    """Reduce a concrete date string to its generic pattern.

    Digits become ``_``; month names, unknown-digit markers and Roman numerals
    are replaced by placeholder symbols, so that dates written in the same
    notation collapse onto the same pattern regardless of their values.

    Returns the pattern with surrounding whitespace stripped.
    """
    # Order matters: full month names are replaced before abbreviations (so
    # "Januar" is not consumed as "Jan"), and unknown markers such as "XX" are
    # replaced before Roman numerals (which would otherwise read them as "X").
    # A further step stripping the remaining letters (`remaining`) exists in
    # the notebook but is currently disabled.
    substitutions = (
        (r'\d', '_', 0),
        (monthsGermanFull, '🌕🇩🇪', re.IGNORECASE),
        (monthsFrenchFull, '🌕🇫🇷', re.IGNORECASE),
        (monthsEnglishFull, '🌕🇬🇧', re.IGNORECASE),
        (monthsGermanAbr, '🌙🇩🇪', re.IGNORECASE),
        (monthsFrenchAbr, '🌙🇫🇷', re.IGNORECASE),
        (monthsEnglishAbr, '🌙🇬🇧', re.IGNORECASE),
        (unknowns, '❓', 0),
        (numeralRomans, '🏛', 0),
    )
    generic = dateString
    for pattern, placeholder, flags in substitutions:
        generic = re.sub(pattern, placeholder, generic, flags=flags)
    return generic.strip()


def _collectDateVariants(rows, datefield):
    """Group the values of one date field by their generic pattern.

    rows: iterable of record dicts; every record must have a 'UUID' key.
    datefield: key of the field to inspect. Its value may hold several dates
        separated by '|'; missing, None and empty values are skipped.

    Returns an OrderedDict mapping each pattern to
    {'count': int, 'examples': [{'uuid': ..., 'value': ...}, ...]},
    ordered by descending count.
    """
    variants = {}
    for row in rows:
        uuid = row['UUID']
        if datefield not in row or row[datefield] is None:
            continue
        for rawDate in row[datefield].split('|'):
            if not rawDate:
                continue
            bucket = variants.setdefault(_generalizeDate(rawDate), {'count': 0, 'examples': []})
            bucket['count'] += 1
            bucket['examples'].append({'uuid': uuid, 'value': rawDate})
    return OrderedDict(sorted(variants.items(), key=lambda kv: kv[1]['count'], reverse=True))


# Pattern statistics for every configured date field, keyed by field name.
dateVariantsPerField = {}
for datefield in datefields:
    dateVariantsPerField[datefield] = _collectDateVariants(rawData['rows'], datefield)

# Interpret Values

In [8]:
# Date field whose generic patterns are tried against the interpretation rules below.
field = '260$c'

In [9]:
# Qualifiers that mark a date as uncertain, written as a regex alternation.
# Raw strings are used throughout so that "\." / "\s" / "\[" reach the regex
# engine without relying on Python passing unknown string escapes through.
uncertaintyQualifiers = r'ca\.|ca|circa|um|vermutlich um'

# Rules matched against the *generic* form of a date, i.e. the pattern in which
# every digit has been replaced by "_".
r = {
    'singleYearWithQualifier': r'^\[?(?:' + uncertaintyQualifiers + r'|A°|A°\.|Anno|a\[nn\]o|gezeichnet nach der Natur)?\s?(____)\??\]?$',
    'yearRangeWithQualifier': r'(?:ca\.)?\s?(?:zwischen)?\s?(____\??)\s?(?:-|und|ud)\s?(____\??)'
}

# A qualifier only counts when it is not embedded in a longer word: the
# lookarounds reject a letter directly before or after it, so "Chicago" or
# "Museum" do not trigger on their "ca" / "um" substrings, while "ca.1850"
# and "[ca. 1850]" still do.
_uncertaintyRegex = re.compile(r'(?<![^\W\d_])(?:' + uncertaintyQualifiers + r')(?![^\W\d_])')
# A four-digit year, optionally followed by "?".
_yearRegex = re.compile(r'\d{4}\??')


def _isUncertain(dateString):
    """Return True when dateString contains one of the uncertainty qualifiers."""
    return _uncertaintyRegex.search(dateString) is not None


def singleYearWithQualifier(dateString):
    """Extract a single year from dateString.

    Returns the first four-digit year, with "?" appended when the string
    carries an uncertainty qualifier and the year is not already marked.
    Returns None when the string contains no four-digit year.
    """
    found = _yearRegex.search(dateString)
    if found is None:
        return None
    year = found.group(0)
    if _isUncertain(dateString) and '?' not in year:
        return year + "?"
    return year


def yearRangeWithQualifier(dateString):
    """Extract a year range from dateString.

    Returns all four-digit years found, joined by "/". When the string carries
    an uncertainty qualifier, every year not already ending in "?" gets one.
    """
    years = _yearRegex.findall(dateString)
    if _isUncertain(dateString):
        years = [year if '?' in year else year + '?' for year in years]
    return "/".join(years)


def interpret(dateString, pattern):
    """Interpret dateString according to the first rule its pattern matches.

    dateString: the concrete date value.
    pattern: the generic pattern derived from that value (digits as "_").

    Returns the interpreted string, or None when no rule applies.
    """
    # Rules are tried in this order; the single-year rule is anchored, so it is
    # checked before the unanchored range rule.
    interpreters = (
        ('singleYearWithQualifier', singleYearWithQualifier),
        ('yearRangeWithQualifier', yearRangeWithQualifier),
    )
    for ruleName, convert in interpreters:
        if re.search(r[ruleName], pattern):
            return convert(dateString)
    return None

In [10]:
# Try the year-range rule against every generic pattern of the selected field,
# record which patterns it covers, and print one sample interpretation per
# covered pattern.
matches = {
    'matched': [],
    'notMatched': []
}
# A dedicated, seeded generator keeps the printed samples identical across
# "Restart & Run All" without touching the global random state.
sampler = random.Random(0)
for pattern, variant in dateVariantsPerField[field].items():
    if re.search(r['yearRangeWithQualifier'], pattern):
        matches['matched'].append(pattern)
        example = sampler.choice(variant['examples'])['value']
        print(example, " --> ", yearRangeWithQualifier(example))
    else:
        matches['notMatched'].append(pattern)

# print("Matches")
# print("\n".join(matches['matched']))
# print("====\nNot matched")
# print("\n".join(matches['notMatched']))

[zwischen 1760 und 1806]  -->  1760/1806
[zwischen 1770 und 1790?]  -->  1770/1790?
[ca. zwischen 1860 und 1910]  -->  1860?/1910?
[1900 und 1904]  -->  1900/1904
[zwischen 1730? und 1750?]  -->  1730?/1750?
1852-1853  -->  1852/1853
[ca. 1770-1780]  -->  1770?/1780?
[zwischen und 1670 und 1712]  -->  1670/1712
[zwischen 1770 - 1790?]  -->  1770/1790?
zwischen 1891 und 1896  -->  1891/1896
[ca. 1890 und 1904]  -->  1890?/1904?
186? [i.e. zwischen 1860 und 1869]  -->  1860/1869
[zwischen 1675 ud 1713?]  -->  1675/1713?


## Output

In [11]:
# Write one CSV per date field: every generic pattern with its number of
# appearances, its first recorded example and that example's interpretation
# (None, written as an empty cell, when no rule applies).
# The loop variable is deliberately not called `field`, so the field selected
# for the interpretation cells is not overwritten by running this cell.
for outputField in datefields:
    variants = dateVariantsPerField[outputField]
    # newline='' hands line-ending control to the csv module (otherwise extra
    # blank rows can appear on Windows); UTF-8 is required because the patterns
    # contain emoji placeholders that many locale encodings cannot represent.
    with open(output + '/dateVariants-' + outputField + '.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(('Format', 'Appearances', 'Example', 'Interpretation'))
        for pattern, variant in variants.items():
            example = variant['examples'][0]['value']
            csvwriter.writerow((pattern, variant['count'], example, interpret(example, pattern)))
