In [None]:
# Install the `edtf` package with the same interpreter that runs this kernel
# (sys.executable), so the `from edtf import parse_edtf` import below resolves.
# NOTE(review): no version is pinned here — consider pinning for reproducibility.
import sys
!{sys.executable} -m pip install edtf

In [None]:
import csv
import json
import random
import re
from collections import OrderedDict
from edtf import parse_edtf

In [None]:
# Path of the source export; it is parsed with json.load further down.
# NOTE(review): the file name ends in "-tsv.txt" although it is read as JSON — confirm.
inputFile = "../input/sari_abzug-utf-8_23_04-tsv.txt"
# Directory that receives the generated "dateVariants-<field>.csv" files.
output = "./output"

In [None]:
# Read the export as JSON. The encoding is stated explicitly so the result
# does not depend on the platform default (the file name declares UTF-8).
with open(inputFile, 'r', encoding='utf-8') as f:
    rawData = json.load(f)

In [None]:
# Row keys whose values are scanned for date strings.
# NOTE(review): these look like MARC field$subfield codes — confirm against the export.
datefields = ['100$d', '260$c', '260$g', '264$c', '533$d', '600$d', '611$d', '700$d']

In [None]:
# Regex fragments used to reduce concrete date strings to generic "shapes".
# Each month pattern is an alternation of capture groups, one per spelling.

# Square brackets are stripped before any other processing.
artefactsToRemoveBefore = r'\[|\]'
monthsGermanFull = r'(Januar)|(Februar)|(März)|(April)|(Mai)|(Juni)|(Juli)|(August)|(September)|(Oktober)|(November)|(Dezember)'
monthsGermanAbr = r'(Jan)|(Feb)|(Mär)|(Apr)|(Mai)|(Jun)|(Jul)|(Aug)|(Sept)|(Okt)|(Nov)|(Dez)'
# "février" was previously corrupted to "fégenericDatevrier" (a search/replace
# artefact), so the full French name for February was never recognised.
monthsFrenchFull = r'(janvier)|(février)|(mars)|(avril)|(mai)|(juin)|(juillet)|(aout)|(septembre)|(octobre)|(novembre)|(décembre)'
monthsFrenchAbr = r'(janv)|(févr)|(mars)|(avril)|(avr)|(mai)|(juin)|(juil)|(aout)|(aou)|(sept)|(oct)|(nov)|(déc)'
monthsEnglishFull = r'(January)|(February)|(March)|(April)|(May)|(June)|(July)|(August)|(September)|(October)|(November)|(December)'
monthsEnglishAbr = r'(Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec)'
# Roman numerals I-XII count as months only when delimited by whitespace or a
# dot on both sides; the delimiters are part of the match.
monthsRoman = r'((?:\s|\.)I(?:\s|\.))|((?:\s|\.)II(?:\s|\.))|((?:\s|\.)III(?:\s|\.))|((?:\s|\.)IV(?:\s|\.))|((?:\s|\.)V(?:\s|\.))|((?:\s|\.)VI(?:\s|\.))|((?:\s|\.)VII(?:\s|\.))|((?:\s|\.)VIII(?:\s|\.))|((?:\s|\.)IX(?:\s|\.))|((?:\s|\.)X(?:\s|\.))|((?:\s|\.)XI(?:\s|\.))|((?:\s|\.)XII(?:\s|\.))'
# Placeholder spelling for unknown digits.
unknowns = r'XX|xx'

In [None]:
# Ordered rewrite rules (regex, replacement, flags) that reduce a concrete date
# string to its generic shape. Order matters: brackets are removed and digits
# masked first, full month names are replaced before their abbreviations, and
# Roman-numeral months are handled last.
genericRewriteRules = [
    (artefactsToRemoveBefore, '', 0),
    (r'\d', '_', 0),
    (monthsGermanFull, '🌕🇩🇪', re.IGNORECASE),
    (monthsFrenchFull, '🌕🇫🇷', re.IGNORECASE),
    (monthsEnglishFull, '🌕🇬🇧', re.IGNORECASE),
    (monthsGermanAbr, '🌙🇩🇪', re.IGNORECASE),
    (monthsFrenchAbr, '🌙🇫🇷', re.IGNORECASE),
    (monthsEnglishAbr, '🌙🇬🇧', re.IGNORECASE),
    (unknowns, '❓', 0),
    (monthsRoman, ' 🏛 ', 0),
]

# For every date field: generic shape -> {'count': n, 'examples': [{'uuid', 'value'}, ...]},
# ordered by descending count.
dateVariantsPerField = {}
for datefield in datefields:
    dateVariants = {}
    for row in rawData['rows']:
        uuid = row['UUID']
        fieldValue = row.get(datefield)
        if fieldValue is None:
            continue
        # A field may hold several '|'-separated values; empty parts are skipped.
        for d in fieldValue.split('|'):
            if not d:
                continue
            genericDate = d
            for rulePattern, replacement, ruleFlags in genericRewriteRules:
                genericDate = re.sub(rulePattern, replacement, genericDate, flags=ruleFlags)
            genericDate = genericDate.strip()
            variant = dateVariants.setdefault(genericDate, {'count': 0, 'examples': []})
            variant['count'] += 1
            variant['examples'].append({'uuid': uuid, 'value': d})
    # sorted() is stable, so shapes with equal counts keep first-seen order.
    dateVariantsOrdered = OrderedDict(sorted(dateVariants.items(), key=lambda kv: kv[1]['count'], reverse=True))
    dateVariantsPerField[datefield] = dateVariantsOrdered

# Interpret Values

In [None]:
# Alternation of words that mark a date as approximate.
uncertaintyQualifiers = r'ca\.|ca|circa|um|vermutlich um'

# Regexes matched against the GENERIC shape of a date (digits already masked
# as "_", month names replaced by emoji tokens). Keys double as the names of
# the interpreter functions that interpret() dispatches to.
# Every fragment is a raw string so that "\.", "\s" and "\?" are not treated
# as (invalid) string escapes.
r = {
    # Optional whitespace is accepted between the qualifier and "nach"/"vor"
    # ("ca. nach ____"); without it the qualifier group could never match.
    'afterYearWithQualifier': r'^(' + uncertaintyQualifiers + r')?\s?(?:nach|nicht vor)\s?(_{4})\??$',
    'beforeYearWithQualifier': r'^(' + uncertaintyQualifiers + r')?\s?(?:vor|nicht nach)\s?(_{4})\??$',
    'fullDateWithMonthInLangOrRoman': r'_{1,2}(\.|\s)*((🌕|🌙)(🇩🇪|🇫🇷|🇬🇧)|🏛)(\.|\s)*(_{2,4})',
    'monthAndYearWithMonthInLangOrRoman': r'((🌕|🌙)(🇩🇪|🇫🇷|🇬🇧)|🏛)(\.|\s)*(_{2,4})',
    'singleDate': r'(?:i\.e\.|den|le)?\s?(_{1,2}\._{1,2}\._{2,4})',
    'singleYearWithQualifier': r'^(?:' + uncertaintyQualifiers + r'|A°|A°\.|Anno|anno|gezeichnet nach der Natur)?\s?(____)\??$',
    'yearRangeWithQualifier': r'(?:ca\.)?\s?(?:zwischen)?\s?(____\??)\s?(?:-|und|ud)\s?(____\??)',
    'yearWithPlaceHolderAndQualifier': r'(__--|___-)'
}

def cleanDateString(dateString):
    """Return ``dateString`` with every ``[`` and ``]`` removed.

    The pattern is a raw string so that ``\\[`` and ``\\]`` reach the regex
    engine unchanged instead of being parsed as invalid string escapes.
    """
    return re.sub(r'\[|\]', '', dateString)

def guessMonth(monthString):
    """Return the month number (1-12) named by ``monthString``, or 0 if unknown.

    Dots and whitespace are removed from ``monthString`` and the comparison is
    case-insensitive. German, French and English full names and abbreviations
    are recognised, as well as Roman numerals I-XII.

    The French abbreviation table carries explicit month numbers because it
    lists two spellings for April and August; deriving the number from the
    list position mis-numbered every entry after "avril" (e.g. "oct" -> 12).
    """
    def numbered(names):
        # Pair each of exactly twelve names with its 1-based month number.
        return [(name, position + 1) for position, name in enumerate(names)]

    testOrder = [
        numbered(["Januar", "Februar", "März", "April", "Mai", "Juni", "Juli", "August", "September", "Oktober", "November", "Dezember"]),
        numbered(["janvier", "février", "mars", "avril", "mai", "juin", "juillet", "aout", "septembre", "octobre", "novembre", "décembre"]),
        numbered(["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]),
        numbered(["Jan", "Feb", "Mär", "Apr", "Mai", "Jun", "Jul", "Aug", "Sept", "Okt", "Nov", "Dez"]),
        [("janv", 1), ("févr", 2), ("mars", 3), ("avril", 4), ("avr", 4), ("mai", 5), ("juin", 6),
         ("juil", 7), ("aout", 8), ("aou", 8), ("sept", 9), ("oct", 10), ("nov", 11), ("déc", 12)],
        numbered(["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]),
        numbered(["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI", "XII"]),
    ]
    needle = re.sub(r'\.|\s', '', monthString).lower()
    for table in testOrder:
        for name, number in table:
            if name.lower() == needle:
                return number
    return 0
            
def afterYearWithQualifier(dateString):
    """Interpret an "after <year>" expression as an interval starting at that year.

    Returns ``"YYYY/"`` (start known, end left empty). A ``?`` is appended to
    the year when the text contains one of ``uncertaintyQualifiers`` and the
    year is not already marked with ``?``.

    The previous body was identical to ``beforeYearWithQualifier`` and returned
    ``"/YYYY"``, i.e. the "before" form; the slash now follows the year.

    Raises AttributeError if ``dateString`` contains no four-digit year.
    """
    year = re.search(r'(\d{4}\??)', dateString).group(1)
    uncertain = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    if uncertain and '?' not in year:
        year += "?"
    return year + "/"

def beforeYearWithQualifier(dateString):
    """Interpret a "before <year>" expression as an interval ending at that year.

    Returns ``"/YYYY"``; a ``?`` is appended when the text contains one of
    ``uncertaintyQualifiers`` and the year does not already carry one.

    Raises AttributeError if ``dateString`` contains no four-digit year.
    """
    interval = "/" + re.search(r'(\d{4}\??)', dateString).group(1)
    hasQualifier = re.search(r'(' + uncertaintyQualifiers + ')', dateString) is not None
    needsMark = hasQualifier and '?' not in interval
    return interval + "?" if needsMark else interval

def singleYearWithQualifier(dateString):
    """Extract a single four-digit year, optionally marked as uncertain.

    The first four-digit year (with a directly following ``?`` if present) is
    returned. When the text contains one of ``uncertaintyQualifiers`` and the
    year has no ``?`` yet, one is appended.

    Raises AttributeError if ``dateString`` contains no four-digit year.
    """
    foundYear = re.search(r'(\d{4}\??)', dateString).group(1)
    hasQualifier = re.search(r'(' + uncertaintyQualifiers + ')', dateString) is not None
    if '?' in foundYear or not hasQualifier:
        return foundYear
    return foundYear + "?"

def yearRangeWithQualifier(dateString):
    """Join all four-digit years found in ``dateString`` with ``/``.

    Each year keeps a directly following ``?``. If the text contains "ca",
    every year that is not yet marked gets a ``?`` appended.
    """
    found = re.findall(r'(\d{4}\??)', dateString)
    if re.search(r'(ca)', dateString):
        found = [item if '?' in item else item + '?' for item in found]
    return "/".join(found)

def singleDate(dateString):
    """Return the first ``d.m.y`` numeric date found in ``dateString``.

    Day and month have one or two digits, the year two to four.
    Raises AttributeError if no such date is present.
    """
    return re.search(r'\d{1,2}\.\d{1,2}\.\d{2,4}', dateString).group(0)

def fullDateWithMonthInLangOrRoman(dateString):
    """Normalise a "day, month name, year" date to ``day.month.year``.

    The month may be a German/French/English name or abbreviation, or a Roman
    numeral; it is converted to its number with ``guessMonth``. Parts that
    cannot be found are replaced by placeholders: ``xx`` for day and month,
    ``xxxx`` for the year.

    Missing matches are handled explicitly instead of with bare ``except:``,
    so unrelated errors (for example an undefined name) are no longer
    swallowed. A trailing "." after a year at the end of the string is
    dropped, and a month that ``guessMonth`` cannot resolve yields ``xx``
    rather than ``0``.
    """
    allMonths = '|'.join([monthsGermanFull, monthsFrenchFull, monthsEnglishFull, monthsGermanAbr, monthsFrenchAbr, monthsEnglishAbr, monthsRoman])
    # Group 1 is the day; the month alternation is wrapped in a non-capturing group.
    datePattern = r'(\d{1,2})(?:\.|\s)*(?:' + allMonths + r')(?:\.|\s)*(?:\d{2,4})'
    # Either 2-4 digits at the very end (optionally followed by "."), or any 4 digits.
    yearPattern = r'((\d{2,4})\.?$|\d{4})'

    dayMatch = re.search(datePattern, dateString, flags=re.IGNORECASE)
    date = dayMatch.group(1) if dayMatch else "xx"

    monthMatch = re.search(allMonths, dateString, flags=re.IGNORECASE)
    monthNumber = guessMonth(monthMatch.group(0)) if monthMatch else 0
    month = str(monthNumber) if monthNumber else "xx"

    yearMatch = re.search(yearPattern, dateString)
    if yearMatch:
        # Group 2 holds the digits without the optional trailing dot.
        year = yearMatch.group(2) or yearMatch.group(1)
    else:
        year = "xxxx"

    return '.'.join([date, month, year])

def monthAndYearWithMonthInLangOrRoman(dateString):
    """Normalise a "month name, year" date to ``month.year``.

    The month may be a German/French/English name or abbreviation, or a Roman
    numeral; it is converted to its number with ``guessMonth``. A month that
    cannot be found or resolved becomes ``xx``; a missing year becomes ``xxxx``.

    Missing matches are handled explicitly instead of with bare ``except:``,
    so unrelated errors are no longer swallowed. A trailing "." after a year
    at the end of the string is dropped.
    """
    allMonths = '|'.join([monthsGermanFull, monthsFrenchFull, monthsEnglishFull, monthsGermanAbr, monthsFrenchAbr, monthsEnglishAbr, monthsRoman])
    # Either 2-4 digits at the very end (optionally followed by "."), or any 4 digits.
    yearPattern = r'((\d{2,4})\.?$|\d{4})'

    monthMatch = re.search(allMonths, dateString, flags=re.IGNORECASE)
    monthNumber = guessMonth(monthMatch.group(0)) if monthMatch else 0
    month = str(monthNumber) if monthNumber else "xx"

    yearMatch = re.search(yearPattern, dateString)
    if yearMatch:
        # Group 2 holds the digits without the optional trailing dot.
        year = yearMatch.group(2) or yearMatch.group(1)
    else:
        year = "xxxx"

    return '.'.join([month, year])

def yearWithPlaceHolderAndQualifier(dateString):
    """Expand a year with dash placeholders into a ``start/end`` year range.

    ``19--`` becomes ``1900/1999`` and ``192-`` becomes ``1920/1929``. When
    the text contains "ca" or "?", a ``?`` is appended to both years.
    Returns None if neither placeholder form is present.

    The digits must not be preceded by another digit. Without that guard a
    complete year followed by a dash ("1900-") was read as the three-digit
    form "900-" and produced "9000/9009".
    """
    uncertain = re.search(r'(ca|\?)', dateString)
    quantifier = '?' if uncertain else ''

    centuryMatch = re.search(r'(?<!\d)(\d{2})--', dateString)
    if centuryMatch:
        century = centuryMatch.group(1)
        return "%s00%s/%s99%s" % (century, quantifier, century, quantifier)

    decadeMatch = re.search(r'(?<!\d)(\d{3})-', dateString)
    if decadeMatch:
        decade = decadeMatch.group(1)
        return "%s0%s/%s9%s" % (decade, quantifier, decade, quantifier)

    return None

def interpret(dateString, pattern):
    """Interpret ``dateString`` using the first rule that matches ``pattern``.

    ``pattern`` is the generic shape of the date. The rules in ``r`` are tried
    in a fixed order; for the first one whose regex is found in ``pattern``,
    the module-level function of the same name is called with the
    bracket-stripped ``dateString`` and its result returned.

    Returns None when no rule matches.
    Raises NotImplementedError when a rule matches but no function with that
    name is defined.
    """
    cleaned = cleanDateString(dateString)
    ruleNames = (
        'singleDate',
        'fullDateWithMonthInLangOrRoman',
        'monthAndYearWithMonthInLangOrRoman',
        'singleYearWithQualifier',
        'beforeYearWithQualifier',
        'afterYearWithQualifier',
        'yearRangeWithQualifier',
        'yearWithPlaceHolderAndQualifier',
    )
    for ruleName in ruleNames:
        if re.search(r[ruleName], pattern) is None:
            continue
        # Handlers are looked up by name among the module-level definitions.
        handler = globals().get(ruleName)
        if not handler:
            raise NotImplementedError("Function %s not implemented" % ruleName)
        return handler(cleaned)

    return None

In [None]:
# Exploration of one field: for every generic shape, interpret one sampled
# example. Shapes whose example could NOT be interpreted are split by whether
# the shape contains a dash placeholder ('matched') or not ('notMatched').
field = '260$c'
matches = {
    'matched': [],
    'notMatched': []
}
# Dedicated, seeded generator: the sampled examples (and therefore the two
# lists) are identical on every Restart & Run All, and the global `random`
# state is left untouched.
exampleRng = random.Random(42)
variantsForField = dateVariantsPerField[field]
print(len(variantsForField))
for pattern, variant in variantsForField.items():
    example = exampleRng.choice(variant['examples'])['value']
    if interpret(example, pattern):
        continue
    if re.search(r'(__--|___-)', pattern):
        matches['matched'].append(pattern)
    else:
        matches['notMatched'].append(pattern)

# print("Matches")
# print("\n".join(matches['matched']))
# print("====\nNot matched")
# print("\n".join(matches['notMatched']))

## Output

In [None]:
# Write one CSV per date field: generic shape, number of appearances, the first
# recorded example and its interpretation.
for field in datefields:
    # newline='' is required by the csv module (otherwise extra blank rows can
    # appear on Windows); UTF-8 is set explicitly because the shapes contain
    # emoji that many platform default encodings cannot represent.
    with open(output + '/dateVariants-' + field + '.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(('Format', 'Appearances', 'Example', 'Interpretation'))
        for d, variant in dateVariantsPerField[field].items():
            example = variant['examples'][0]['value']
            csvwriter.writerow((d, variant['count'], example, interpret(example, d)))