In [1]:
import sys
!{sys.executable} -m pip install edtf

You should consider upgrading via the '/Users/fkraeutli/anaconda/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import csv
import json
import random
import re
from collections import OrderedDict
from edtf import parse_edtf

In [3]:
inputFile = "../input/sari_abzug-utf-8_23_04-tsv.txt"
output = "./output"

In [4]:
with open(inputFile, 'r') as f:
    rawData = json.load(f)

In [5]:
datefields = ['100$d', '260$c', '260$g', '264$c', '533$d', '600$d', '611$d', '700$d']

In [47]:
monthsGermanFull = r'(Januar)|(Februar)|(M√§rz)|(April)|(Mai)|(Juni)|(Juli)|(August)|(September)|(Oktober)|(November)|(Dezember)'
monthsGermanAbr = r'(Jan)|(Feb)|(M√§r)|(Apr)|(Mai)|(Jun)|(Jul)|(Aug)|(Sept)|(Okt)|(Nov)|(Dez)'
monthsFrenchFull = r'(janvier)|(f√©vrier)|(mars)|(avril)|(mai)|(juin)|(juillet)|(aout)|(septembre)|(octobre)|(novembre)|(d√©cembre)'
monthsFrenchAbr = r'(janv)|(f√©vr)|(mars)|(avril)|(avr)|(mai)|(juin)|(juil)|(aout)|(aou)|(sept)|(oct)|(nov)|(d√©c)'
monthsEnglishFull = r'(January)|(February)|(March)|(April)|(May)|(June)|(July)|(August)|(September)|(October)|(November)|(December)'
monthsEnglishAbr = r'(Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec)'
monthsRoman = r'(I)|(II)|(III)|(IV)|(V)|(VI)|(VII)|(VIII)|(IX)|(X)|(XI)|(XII)'
numeralRomans = r'(VIII)|(III)|(IV)|(VI)|(VII)|(V)|(IX)|(X)|II|I'
remaining = r'[A-z]'
unknowns = r'XX|xx'

In [7]:
dateVariantsPerField = {}
for datefield in datefields:
    dateVariants = {}
    for row in rawData['rows']:
        uuid = row['UUID']
        if datefield in row and row[datefield] != None:
            date = row[datefield]
            for d in date.split('|'):
                if d:
                    genericDate = re.sub(r'\d','_', d)
                    genericDate = re.sub(monthsGermanFull, 'üåïüá©üá™', genericDate, flags=re.IGNORECASE)
                    genericDate = re.sub(monthsFrenchFull, 'üåïüá´üá∑', genericDate, flags=re.IGNORECASE)
                    genericDate = re.sub(monthsEnglishFull, 'üåïüá¨üáß', genericDate, flags=re.IGNORECASE)
                    genericDate = re.sub(monthsGermanAbr, 'üåôüá©üá™', genericDate, flags=re.IGNORECASE)
                    genericDate = re.sub(monthsFrenchAbr, 'üåôüá´üá∑', genericDate, flags=re.IGNORECASE)
                    genericDate = re.sub(monthsEnglishAbr, 'üåôüá¨üáß', genericDate, flags=re.IGNORECASE)
                    genericDate = re.sub(unknowns, '‚ùì', genericDate)
                    genericDate = re.sub(numeralRomans, 'üèõ', genericDate)
                    #genericDate = re.sub(remaining, '', genericDate)
                    genericDate = genericDate.strip()
                    entry = { 'uuid': uuid, 'value': d}
                    if genericDate in dateVariants:
                        dateVariants[genericDate]['count'] += 1
                        dateVariants[genericDate]['examples'].append(entry)
                    else:
                        dateVariants[genericDate] = {
                            'count': 1,
                            'examples': [entry]
                        }
    dateVariantsOrdered = OrderedDict(sorted(dateVariants.items(), key=lambda kv: kv[1]['count'], reverse=True))
    dateVariantsPerField[datefield] = dateVariantsOrdered

# Interpret Values

In [121]:
uncertaintyQualifiers = 'ca\.|ca|circa|um|vermutlich um'
r = {
    'afterYearWithQualifier': r'^\[?(' + uncertaintyQualifiers + ')?(?:nach|nicht vor)\s?(_{4})\??\]?$',
    'beforeYearWithQualifier': r'^\[?(' + uncertaintyQualifiers + ')?(?:vor|nicht nach)\s?(_{4})\??\]?$',
    'singleDate': r'(?:i\.e\.|den|le)?\s?(_{1,2}\._{1,2}\._{2,4})',
    'singleYearWithQualifier': r'^\[?(?:' + uncertaintyQualifiers + '|A¬∞|A¬∞\.|Anno|a\[nn\]o|gezeichnet nach der Natur)?\s?(____)\??\]?$',
    'yearRangeWithQualifier': r'(?:ca\.)?\s?(?:zwischen)?\s?(____\??)\s?(?:-|und|ud)\s?(____\??)'
}
    
def guessMonth(monthString):
    monthsGermanFull = ["Januar","Februar","M√§rz","April","Mai","Juni","Juli","August","September","Oktober","November","Dezember"]
    monthsGermanAbr = ["Jan","Feb","M√§r","Apr","Mai","Jun","Jul","Aug","Sept","Okt","Nov","Dez"]
    monthsFrenchFull = ["janvier","f√©vrier","mars","avril","mai","juin","juillet","aout","septembre","octobre","novembre","d√©cembre"]
    monthsFrenchAbr = ["janv","f√©vr","mars","avril","avr","mai","juin","juil","aout","aou","sept","oct","nov","d√©c"]
    monthsEnglishFull = ["January","February","March","April","May","June","July","August","September","October","November","December"]
    monthsEnglishAbr = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]
    monthsRoman = ["I","II","III","IV","V","VI","VII","VIII","IX","X","XI","XII"]
    testOrder = [monthsGermanFull, monthsFrenchFull, monthsEnglishFull, monthsGermanAbr, monthsFrenchAbr, monthsEnglishAbr, monthsRoman]
    for test in testOrder:
        for i, month in enumerate(test):
            if month.lower() == monthString.lower():
                return i + 1
    return 0
            
def afterYearWithQualifier(dateString):
    year = re.search(r'(\d{4}\??)', dateString).group(1)
    uncertain = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    year = "/" + year
    if uncertain and not '?' in year:
        return year + "?"
    else:
        return year

def beforeYearWithQualifier(dateString):
    year = re.search(r'(\d{4}\??)', dateString).group(1)
    uncertain = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    year = "/" + year
    if uncertain and not '?' in year:
        return year + "?"
    else:
        return year

def singleYearWithQualifier(dateString):
    year = re.search(r'(\d{4}\??)', dateString).group(1)
    uncertain = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    if uncertain and not '?' in year:
        return year + "?"
    else:
        return year

def yearRangeWithQualifier(dateString):
    years = re.findall(r'(\d{4}\??)', dateString)
    uncertain = re.search(r'(ca)', dateString)
    if uncertain:
        for i, year in enumerate(years):
            if not '?' in year:
                years[i] += '?'
    return "/".join(years)

def singleDate(dateString):
    date = re.search(r'\d{1,2}\.\d{1,2}\.\d{2,4}', dateString)
    return date.group(0)

def fullDateWithMonthInLangOrRoman(dateString):
    allMonths = '|'.join([monthsGermanFull, monthsFrenchFull, monthsEnglishFull, monthsGermanAbr, monthsFrenchAbr, monthsEnglishAbr, monthsRoman])
    datePattern = r'(\d{1,2})(?:\.|\s)*(?:' + allMonths + ')(?:\.|\s)*(?:\d{2,4}|\[\d{1,3}\]\d{1,3})'
    yearPattern = r'(\d{2,4}|\[\d{1,3}\]\d{1,3})\]?\.?$'
    try:
        date = re.search(datePattern, dateString, flags=re.IGNORECASE).group(1)
    except:
        date = "xx"
        
    try:
        monthWords = re.search(allMonths, dateString, flags=re.IGNORECASE).group(0)
        month = str(guessMonth(monthWords))
    except:
        month = "xx"

    try:
        yearExtract = re.search(yearPattern, dateString).group(1)
        year = re.sub(r'\[|\]','', yearExtract)
    except:
        year = "xxxx"
    
    return '.'.join([date, month, year])


def interpret(dateString, pattern):
    testOrder = ['singleDate', 'singleYearWithQualifier', 'beforeYearWithQualifier', 'afterYearWithQualifier', 'yearRangeWithQualifier']
    possibles = globals().copy()
    possibles.update(locals())
    for test in testOrder:
        m = re.search(r[test], pattern)
        if m:    
            f = possibles.get(test)
            if not f:
                raise NotImplementedError("Function %s not implemented" % test)
            return f(dateString)
    
    return None

In [122]:
field = '260$c'
matches = {
    'matched': [],
    'notMatched': []
}
print(len(dateVariantsPerField[field]))
for pattern in dateVariantsPerField[field].keys():
    example = random.choice(dateVariantsPerField[field][pattern]['examples'])['value']
    if not interpret(example, pattern):    
        m = re.search(r'_{1,2}(\.|\s)*((üåï|üåô)(üá©üá™|üá´üá∑|üá¨üáß)|üèõ)(\.|\s)*(_{2,4}|\[_{1,3}\]_{1,3})', pattern)
        if m:
            matches['matched'].append(pattern)
            print(example, " --> ", fullDateWithMonthInLangOrRoman(example))
        else:
            matches['notMatched'].append(pattern)

# print("Matches")
# print("\n".join(matches['matched']))
# print("====\nNot matched")
# print("\n".join(matches['notMatched']))

540
12 Juli 1844  -->  12.7.1844
5 April 1862  -->  5.4.1862
28 Aug. 1859  -->  28.8.1859
1 Aug 1902  -->  1.8.1902
8 Aug. 1902  -->  8.8.1902
17 III 1896  -->  17.1.1896
28 Aug 1859  -->  28.8.1859
16 IX. 1904  -->  16.1.1904
27 Aug [18]85  -->  27.8.1885
26 V [18]89  -->  26.5.1889
11. Mai 1867  -->  11.5.1867
2 IV. 1901  -->  2.1.1901
21 August [18]59  -->  21.8.1859
6 Oct [18]80  -->  6.12.1880
10/11 Aug. 1900  -->  11.8.1900
5. April 1861  -->  5.4.1861
27 Oct 1861  -->  27.12.1861
d[en] 21 Sep [18]74  -->  21.9.1874
31 May 1827  -->  31.5.1827
7 IX 1904  -->  7.1.1904
2 Sept [18]72  -->  2.9.1872
14 Oct. 1897  -->  14.12.1897
1 Oct 1889  -->  1.12.1889
1 Oct. 1906  -->  1.12.1906
5/7 Aug. 1911  -->  7.8.1911
d[en] 9 Sep [18]74  -->  9.9.1874
le 14 July 1844  -->  14.7.1844
le 6. october 1786  -->  6.10.1786
le 31 mars 1788  -->  31.3.1788
9 VIII [18]74  -->  9.5.1874
[nach 8. November 1792]  -->  8.11.1792
[29. Juli 1794]  -->  29.7.1794
5/6 IV. 1903  -->  6.1.1903
7/8 Oct 1897  

## Output

In [123]:
for field in datefields:
    with open(output + '/dateVariants-' + field + '.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(('Format', 'Appearances', 'Example', 'Interpretation'))
        for k, d in enumerate(dateVariantsPerField[field]):
            example = dateVariantsPerField[field][d]['examples'][0]['value']
            csvwriter.writerow((d, dateVariantsPerField[field][d]['count'], example, interpret(example, d)))


In [124]:

allMonths = '|'.join([monthsGermanFull, monthsFrenchFull, monthsEnglishFull, monthsGermanAbr, monthsFrenchAbr, monthsEnglishAbr, monthsRoman])
datePattern = r'(\d{1,2})(?:\.|\s)*(?:' + allMonths + ')(?:\.|\s)*(?:\d{2,4}|\[\d{1,3}\]\d{1,3})'
yearPattern = r'(\d{2,4}|\[\d{1,3}\]\d{1,3})\]?$'

In [125]:
print(datePattern)

(\d{1,2})(?:\.|\s)*(?:(Januar)|(Februar)|(M√§rz)|(April)|(Mai)|(Juni)|(Juli)|(August)|(September)|(Oktober)|(November)|(Dezember)|(janvier)|(f√©vrier)|(mars)|(avril)|(mai)|(juin)|(juillet)|(aout)|(septembre)|(octobre)|(novembre)|(d√©cembre)|(January)|(February)|(March)|(April)|(May)|(June)|(July)|(August)|(September)|(October)|(November)|(December)|(Jan)|(Feb)|(M√§r)|(Apr)|(Mai)|(Jun)|(Jul)|(Aug)|(Sept)|(Okt)|(Nov)|(Dez)|(janv)|(f√©vr)|(mars)|(avril)|(avr)|(mai)|(juin)|(juil)|(aout)|(aou)|(sept)|(oct)|(nov)|(d√©c)|(Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec)|(I)|(II)|(III)|(IV)|(V)|(VI)|(VII)|(VIII)|(IX)|(X)|(XI)|(XII))(?:\.|\s)*(?:\d{2,4}|\[\d{1,3}\]\d{1,3})
