In [None]:
import sys
!{sys.executable} -m pip install edtf

In [None]:
import csv
import json
import os
import random
import re
from collections import OrderedDict

from edtf import parse_edtf

In [None]:
# Input export to analyse. Despite the "tsv.txt" suffix it is parsed as JSON by the loading cell.
inputFile = "../input/sari_abzug-utf-8_23_04-tsv.txt"
# Directory that receives one "dateVariants-<field>.csv" file per date field.
output = "./output"

In [None]:
# Read the export with an explicit encoding: the file name advertises UTF-8, and relying on
# the platform default would mis-decode umlauts/accents on systems whose locale is not UTF-8.
with open(inputFile, 'r', encoding='utf-8') as f:
    rawData = json.load(f)

In [None]:
# Row keys whose values are scanned for date strings
# (presumably MARC field$subfield codes — confirm against the export).
datefields = ['100$d', '260$c', '260$g', '264$c', '533$d', '600$d', '611$d', '700$d']

In [None]:
# Square brackets are stripped from a value before it is abstracted into a template.
artefactsToRemoveBefore = r'\[|\]'
# "XX"/"xx" placeholders for unknown digits; they are replaced by ❓ in the template.
unknowns = r'XX|xx'

In [None]:
# Alternation of words/symbols that mark a date as uncertain. It is spliced into the
# template regexes below and reused by the interpreter functions.
# All fragments are raw strings: the previous plain literals contained invalid escape
# sequences (\., \s, \?), which newer Python versions warn about. The values are unchanged.
uncertaintyQualifiers = r'ca\.|ca|circa|um|vermutlich um|\?'

# Regexes that are matched against the *template* of a date value, not the value itself.
# In a template every digit is "_", a month word is 🌕, an unknown "XX" is ❓, a century
# term is ¢ and the word for "half" is ½. The keys double as the names of the interpreter
# functions that handle the corresponding template.
r = {
    'afterYearWithQualifier': r'^(' + uncertaintyQualifiers + r')?((?:nach|nicht vor)\s?(_{4})|_{4}-|_{4}-❓{1,2})\??$',
    'beforeYearWithQualifier': r'^(' + uncertaintyQualifiers + r')?((?:vor|nicht nach)\s?(_{4})|-_{4}|❓{1,2}-_{4})\??$',
    'century': r'_{1,2}(\s|\.)*¢',
    'centuryRange': r'_{1,2}(\s|\.)*¢?(\/|-)_{1,2}(\s|\.)*¢',
    'fullDateWithMonthInLangOrRoman': r'_{1,2}(\.|\s)*(🌕)(\.|\s)*(_{2,4})',
    'midCentury': r'_\.\s?½\s?_{1,2}(\.|\s)*¢',
    'monthAndYearWithMonthInLangOrRoman': r'🌕(\.|\s)*(_{2,4})',
    'singleDate': r'(?:i\.e\.|den|le)?\s?(_{1,2}\._{1,2}\._{2,4})',
    'singleYearWithQualifier': r'^(?:' + uncertaintyQualifiers + r'|A°|Ao|Ao\.|A°\.|Anno|anno|gezeichnet nach der Natur|i\.e\.)?\s?(____)\??$',
    'singleYearRelaxed': r'_{4}',
    'yearRangeWithQualifier': r'(?:ca\.)?\s?(?:zwischen)?\s?(_{4}\??)\s?(?:-|und|ud|/)\s?(_{2,4}\??)',
    'yearWithPlaceHolderAndQualifier': r'(([^_]|^)__--|([^_]|^)___-)'
}

# Month spellings per language, keyed by the month number as a string.
# The variations are joined, in dict/list order, into regex alternations elsewhere in this
# notebook, so within a language longer spellings are listed before their own prefixes
# (e.g. "Januar" before "Jan"). The same reason explains the unusual key order under
# 'roman': longer numerals come before the numerals they start with (VIII, VII, ... V, I).
# NOTE(review): "Aost" appears twice under French month 8, and "Xbr" is listed both under
# French month 12 and Roman month 10 — confirm whether the Roman entry is intended.
monthTerms = {
    'de': {
        '1': ["Januar", "Jan.", "Jan"],
        '2': ["Februar", "Febr", "Feb.", "Feb"],
        '3': ["März", "Maerz", "Merz", "Mrz", "Mrz."],
        '4': ["April", "Apr.", "Apr", "Ap."],
        '5': ["Mai"],
        '6': ["Juni", "Jun.", "Jun"],
        '7': ["Juli", "Jul.", "Jul"],
        '8': ["August", "Augst", "Aug.", "Aug"],
        '9': ["September", "Sept.", "Sept", "Sep"],
        '10': ["Oktober", "Okt.", "Okt"],
        '11': ["November", "Nov.", "Nov"],
        '12': ["Dezember", "Dez.", "Dez"],
    },
    'en': {
        '1': ["January", "Jan"],
        '2': ["February", "Feb"],
        '3': ["March", "Mar"],
        '4': ["April", "Apr"],
        '5': ["May"],
        '6': ["June", "Juny", "Jun"],
        '7': ["July", "Jul"],
        '8': ["August", "Aug", "Aust"],
        '9': ["September", "Sep", "Sept"],
        '10': ["October", "Oct"],
        '11': ["November", "Nov"],
        '12': ["December", "Dec"],
    },
    'fr': {
        '1': ["Janvier", "Janv", "Jan"],
        '2': ["Février", "Févr", "Fév"],
        '3': ["Mars", "Mar"],
        '4': ["Avril", "Avr"],
        '5': ["Mai"],
        '6': ["Juin"],
        '7': ["Juillet", "Juil"],
        '8': ["Août", "Aout", "Aoust", "Aost", "Aost", "Aou"],
        '9': ["Septembre", "Septbr", "Sept", "Sep", "7bre", "7br"],
        '10': ["Octobre", "octobr.", "Octob", "Oct", "8bre", "8br"],
        '11': ["Novembre", "Novbr", "Nov", "9bre", "9br"],
        '12': ["Décembre", "Decbr", "Dec", "Xbre", "Xbr"],
    },
    'roman': {
        '8': ["VIII"],
        '7': ["VII"],
        '12': ["XII"],
        '3': ["III"],
        '11': ["XI"],
        '9': ["IX"],
        '2': ["II"],
        '4': ["IV"],
        '6': ["VI"],
        '10': ["X", "Xbr"],
        '5': ["V"],
        '1': ["I"],
    }
}

# Century designations (German only) that are replaced by ¢ when a value is abstracted
# into a template.
# NOTE(review): the terms are joined into a regex without escaping, so each "." in them
# matches any character, not just a literal dot.
centuryTerms = {
    'de': ["Jahrhundert", "Jht.", "Jh."]
}

# Words for "half" (German only) that are replaced by ½ in the template.
midTerms = {
    'de': ["Hälfte"]
}

# Flat lists of every known spelling, in language order, then month order, then the order
# in which the variations are listed.
allMonthTerms = [
    term
    for languageMonths in monthTerms.values()
    for variations in languageMonths.values()
    for term in variations
]
allCenturyTerms = [term for terms in centuryTerms.values() for term in terms]
allMidTerms = [term for terms in midTerms.values() for term in terms]

def cleanDateString(dateString):
    """Return ``dateString`` with every square bracket removed.

    The pattern is a raw string; the previous plain literal relied on the invalid
    escape sequences ``\\[`` and ``\\]``, which newer Python versions warn about.
    """
    return re.sub(r'\[|\]', '', dateString)

def guessMonth(monthString):
    """Map a month word or Roman numeral to its month number.

    Languages are tried in the order German, English, French, Roman. The comparison
    ignores case, dots and whitespace on BOTH sides: previously only the input was
    stripped of dots, so spellings that are listed only with a dot ("Ap.", "octobr.")
    could never be recognised.

    Returns the month number as a string key of ``monthTerms`` (e.g. ``'5'``), or the
    integer ``0`` when the word is unknown.
    """
    testOrder = ['de', 'en', 'fr', 'roman']

    def normalise(term):
        # Same normalisation for the input and for the listed variations.
        return re.sub(r'\.|\s', '', term).lower()

    wanted = normalise(monthString)
    for lang in testOrder:
        for monthNumber, variations in monthTerms[lang].items():
            if any(normalise(variation) == wanted for variation in variations):
                return monthNumber
    return 0
            
def afterYearWithQualifier(dateString):
    """Interpret "after <year>" style values as an interval that starts at the year.

    Returns ``"<year>/"``; the year carries a trailing ``?`` when the value contains an
    uncertainty qualifier or the year itself was written with a question mark. Returns
    None when no four-digit year is present.

    The ``?`` is attached to the year before the slash is appended. Previously the marker
    ended up after the slash ("1900/?"), which no longer qualifies the year and is
    inconsistent with how the "before <year>" case renders it ("/1900?").
    """
    yearSearch = re.search(r'(\d{4}\??)', dateString)
    if not yearSearch:
        return None
    year = yearSearch.group(1)
    uncertain = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    if uncertain and '?' not in year:
        year += "?"
    return year + "/"

def beforeYearWithQualifier(dateString):
    """Interpret "before <year>" style values as an interval that ends at the year.

    Returns ``"/<year>"``, with a trailing ``?`` when the value contains an uncertainty
    qualifier or the year was written with a question mark. Returns None when the value
    holds no four-digit year.
    """
    found = re.search(r'(\d{4}\??)', dateString)
    if found is None:
        return None
    interval = "/" + found.group(1)
    hasQualifier = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    alreadyMarked = '?' in interval
    if hasQualifier and not alreadyMarked:
        interval += "?"
    return interval

def century(dateString):
    """Interpret an ordinal century ("19. Jh.") as a two-digit century prefix.

    The first one- or two-digit number in the value is taken as the ordinal century and
    reduced by one (the 19th century is written "18"). The result is zero-padded to two
    digits so that single-digit centuries keep the "YY" form (the 9th century is "08").
    A trailing ``?`` is added when the value contains an uncertainty qualifier.

    Returns None when no number is present or the ordinal is smaller than 1.
    """
    centurySearch = re.search(r'(\d{1,2})', dateString)
    if not centurySearch:
        return None
    ordinal = int(centurySearch.group(1))
    if ordinal < 1:
        # There is no "0th" century; previously this produced "-1".
        return None
    centuryEDTF = str(ordinal - 1).zfill(2)
    uncertain = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    if uncertain:
        return centuryEDTF + "?"
    return centuryEDTF
    
def midCentury(dateString):
    """Interpret "<n>. Hälfte <c>. Jh." as the first or second half of a century.

    Returns ``"CC00/CC50"`` for the first half and ``"CC50/CC99"`` otherwise, where CC is
    the ordinal century minus one, zero-padded to two digits. Both years get a trailing
    ``?`` when the value contains an uncertainty qualifier. Returns None when the value
    does not have the expected shape or the ordinal century is smaller than 1.

    The years are built from the converted century prefix. Previously the ordinal itself
    was used, so "1. Hälfte 19. Jh." came out as 1900/1950 instead of 1800/1850.
    """
    centurySearch = re.search(r'(\d)\.\s?[A-zäöü]*\s?(\d{1,2})', dateString)
    if not centurySearch:
        return None
    half = centurySearch.group(1)
    ordinal = int(centurySearch.group(2))
    if ordinal < 1:
        return None
    centuryEDTF = str(ordinal - 1).zfill(2)
    uncertain = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    qualifier = '?' if uncertain else ''
    if half == "1":
        firstYear, lastYear = "00", "50"
    else:
        firstYear, lastYear = "50", "99"
    return centuryEDTF + firstYear + qualifier + "/" + centuryEDTF + lastYear + qualifier

def centuryRange(dateString):
    """Interpret a span of ordinal centuries ("18./19. Jh.") as "CC/CC".

    The two centuries are taken from the first place where a one- or two-digit number is
    followed (after optional dots, spaces and an optional word such as "Jh.") by "/" or
    "-" and a second one- or two-digit number. Each ordinal is reduced by one and
    zero-padded to two digits. Both parts get a trailing ``?`` when the value contains an
    uncertainty qualifier.

    Previously only two-digit numbers were considered, so spans involving a single-digit
    century ("9./10. Jh.") were not interpreted at all.

    Returns None when no such pair is found or an ordinal is smaller than 1.
    """
    centurySearch = re.search(
        r'(?<!\d)(\d{1,2})[\s.]*(?:[^\W\d_]+\.?)?\s*[/-]\s*(\d{1,2})(?!\d)',
        dateString
    )
    if not centurySearch:
        return None
    ordinalFrom = int(centurySearch.group(1))
    ordinalTo = int(centurySearch.group(2))
    if ordinalFrom < 1 or ordinalTo < 1:
        return None
    centuryFromEDTF = str(ordinalFrom - 1).zfill(2)
    centuryToEDTF = str(ordinalTo - 1).zfill(2)
    uncertain = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    if uncertain:
        return centuryFromEDTF + "?/" + centuryToEDTF + "?"
    return centuryFromEDTF + "/" + centuryToEDTF
    
    
def singleYearWithQualifier(dateString):
    """Extract the first four-digit year from the value.

    The year keeps a question mark written directly after it; otherwise a ``?`` is added
    when the value contains an uncertainty qualifier. Returns None when the value holds
    no four-digit year.
    """
    found = re.search(r'(\d{4}\??)', dateString)
    if found is None:
        return None
    result = found.group(1)
    hasQualifier = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    alreadyMarked = '?' in result
    if hasQualifier and not alreadyMarked:
        result += "?"
    return result

def singleYearRelaxed(dateString):
    """Last-resort interpretation: delegate to the single-year interpreter."""
    interpretation = singleYearWithQualifier(dateString)
    return interpretation

def yearRangeWithQualifier(dateString):
    """Interpret a span of years ("1850-1860", "ca. 1914-18") as "<from>/<to>".

    Every run of two to four digits is taken as a year, optionally followed by ``?``.
    A year written with fewer than four digits is completed with the leading digits of
    the preceding four-digit year, so "1914-18" becomes "1914/1918" instead of the
    invalid "1914/18". When the value contains "ca", every year that is not already
    marked gets a trailing ``?``.

    Returns None when the value contains no year (previously an empty string).
    """
    years = re.findall(r'(\d{2,4}\??)', dateString)
    if not years:
        return None
    uncertain = re.search(r'(ca)', dateString)
    interpreted = []
    lastFullYear = None
    for year in years:
        digits = year.rstrip('?')
        marker = year[len(digits):]
        if len(digits) < 4 and lastFullYear:
            # Abbreviated year: borrow the missing leading digits from the last full year.
            digits = lastFullYear[:4 - len(digits)] + digits
        if len(digits) == 4:
            lastFullYear = digits
        if uncertain and not marker:
            marker = '?'
        interpreted.append(digits + marker)
    return "/".join(interpreted)

def singleDate(dateString):
    """Return the first numeric "day.month.year" date in the value, or None.

    The date is returned exactly as written (one- or two-digit day and month, two- to
    four-digit year, separated by dots).
    """
    found = re.search(r'\d{1,2}\.\d{1,2}\.\d{2,4}', dateString)
    return found.group(0) if found else None

def fullDateWithMonthInLangOrRoman(dateString):
    """Interpret "<day> <month word or Roman numeral> <year>" as "day.month.year".

    Day, month and year are taken from ONE match, so the three parts always belong
    together. Previously the month and the year were searched independently anywhere in
    the value, and the month terms were joined into the regex unescaped: the "." in
    terms such as "Nov." matched any character, so "Novbr" was captured as "Novb" and
    ended up as month 0.

    Matching details:
    - month terms are escaped, de-duplicated and tried longest first;
    - trailing dots of the terms are dropped because dots and spaces after the month
      are consumed by the separator anyway;
    - letters directly after a recognised term are tolerated (e.g. an inflection ending);
    - the day must not be the tail of a longer number and the year must not be the head
      of one.

    Returns None when the value does not contain such a date or the month word cannot
    be resolved to a month number.
    """
    monthWordsKnown = sorted(
        {term.rstrip('.') for term in allMonthTerms if term.rstrip('.')},
        key=lambda term: (-len(term), term)
    )
    monthAlternatives = '|'.join(re.escape(term) for term in monthWordsKnown)
    datePattern = (
        r'(?<!\d)(\d{1,2})[.\s]*(' + monthAlternatives + r')[^\W\d_]*[.\s]*(\d{2,4})(?!\d)'
    )

    found = re.search(datePattern, dateString, flags=re.IGNORECASE)
    if not found:
        return None
    day, monthWords, year = found.groups()

    month = guessMonth(monthWords)
    if not month:
        # guessMonth signals an unknown word with 0; do not emit a month "0".
        return None

    return '.'.join([day, str(month), year])

def monthAndYearWithMonthInLangOrRoman(dateString):
    """Interpret "<month word or Roman numeral> <year>" as "month.year".

    Month and year are taken from ONE match in which the year directly follows the
    month (separated only by dots/spaces). Previously both were searched independently,
    and the month terms were joined into the regex unescaped: the "." in terms such as
    "Sept." matched any character, so "Septbr" was captured as "Septb" and ended up as
    month 0. The year also kept a trailing dot when it stood at the end of the value.

    Matching details:
    - month terms are escaped, de-duplicated and tried longest first;
    - a month term must not start in the middle of a word, which keeps the
      case-insensitive single-letter Roman numerals from matching inside other words;
    - letters directly after a recognised term are tolerated (e.g. an inflection ending).

    Returns None when the value does not contain such a date or the month word cannot
    be resolved to a month number.
    """
    monthWordsKnown = sorted(
        {term.rstrip('.') for term in allMonthTerms if term.rstrip('.')},
        key=lambda term: (-len(term), term)
    )
    monthAlternatives = '|'.join(re.escape(term) for term in monthWordsKnown)
    monthYearPattern = (
        r'(?<![^\W\d_])(' + monthAlternatives + r')[^\W\d_]*[.\s]*(\d{2,4})(?!\d)'
    )

    found = re.search(monthYearPattern, dateString, flags=re.IGNORECASE)
    if not found:
        return None
    monthWords, year = found.groups()

    month = guessMonth(monthWords)
    if not month:
        # guessMonth signals an unknown word with 0; do not emit a month "0".
        return None

    return '.'.join([str(month), year])

def yearWithPlaceHolderAndQualifier(dateString):
    """Interpret years with dashes for unknown digits ("18--", "185-") as an interval.

    "18--" becomes "1800/1899" and "185-" becomes "1850/1859". Both years get a trailing
    ``?`` when the value contains "ca" or a question mark. Returns None when neither
    form is present.

    The known digits must not be preceded by another digit, mirroring the template regex
    that selects this interpreter. Without that guard "185--" was read as the two digits
    "85" followed by "--" and came out as "8500/8599".
    """
    uncertain = re.search(r'(ca|\?)', dateString)
    qualifier = '?' if uncertain else ''

    centuryMatch = re.search(r'(?<!\d)(\d{2})--', dateString)
    if centuryMatch:
        prefix = centuryMatch.group(1)
        return "%s00%s/%s99%s" % (prefix, qualifier, prefix, qualifier)

    decadeMatch = re.search(r'(?<!\d)(\d{3})-', dateString)
    if decadeMatch:
        prefix = decadeMatch.group(1)
        return "%s0%s/%s9%s" % (prefix, qualifier, prefix, qualifier)

    return None

def interpret(dateString, pattern):
    """Interpret a raw date value according to its template.

    ``pattern`` is the template of ``dateString``. The template regexes in ``r`` are
    tried in a fixed order; the first one found in ``pattern`` selects the module-level
    function of the same name, which is called with the bracket-free value. Its result
    is returned as is, even if it is None — later tests are not tried.

    Returns None when no template regex matches.
    Raises NotImplementedError when a matching test has no function of that name.
    """
    cleaned = cleanDateString(dateString)
    testOrder = ['singleDate', 'fullDateWithMonthInLangOrRoman', 'monthAndYearWithMonthInLangOrRoman', 'singleYearWithQualifier', 'beforeYearWithQualifier', 'afterYearWithQualifier', 'yearRangeWithQualifier', 'yearWithPlaceHolderAndQualifier', 'centuryRange', 'midCentury', 'century', 'singleYearRelaxed']
    for name in testOrder:
        if not re.search(r[name], pattern):
            continue
        # Interpreter functions are resolved by name among the module globals.
        handler = globals().get(name)
        if not handler:
            raise NotImplementedError("Function %s not implemented" % name)
        return handler(cleaned)

    return None

In [None]:
# Regexes used to abstract a raw value into a template. They do not depend on the value,
# so they are built once here instead of once per value inside the nested loops.
# NOTE(review): the terms are joined unescaped (as before), so a "." inside a term matches
# any character; escaping would change which templates are produced and is left as is.
_monthPatterns = [
    re.compile(
        r'(' + ')|('.join([month for variations in monthTerms[lang].values() for month in variations]) + ')',
        flags=re.IGNORECASE
    )
    for lang in ['de', 'en', 'fr']
]
# Remainders of a month word that are left over when a shorter spelling matched first.
_monthSuffixPattern = re.compile(r'🌕r|🌕re|🌕s|🌕br|🌕st|🌕obr|🌕ob|🌕t', flags=re.IGNORECASE)
# Roman numerals are matched case-sensitively, unlike the month words.
_romanMonthPattern = re.compile(
    r'(' + ')|('.join([month for variations in monthTerms['roman'].values() for month in variations]) + ')'
)
_centuryPattern = re.compile(r'(' + ')|('.join(allCenturyTerms) + ')')
_midTermPattern = re.compile(r'(' + ')|('.join(allMidTerms) + ')')


def _toGenericPattern(value):
    """Abstract one raw date value into its template (digits become "_", months 🌕, ...)."""
    genericDate = re.sub(artefactsToRemoveBefore, '', value)
    for monthPattern in _monthPatterns:
        genericDate = monthPattern.sub('🌕', genericDate)
    genericDate = _monthSuffixPattern.sub('🌕', genericDate)
    genericDate = re.sub(unknowns, '❓', genericDate)
    genericDate = _romanMonthPattern.sub('🌕', genericDate)
    genericDate = _centuryPattern.sub('¢', genericDate)
    genericDate = _midTermPattern.sub('½', genericDate)
    genericDate = re.sub(r'\d', '_', genericDate)
    return genericDate.strip()


# For every date field: template -> {'count': occurrences, 'examples': [{'uuid', 'value'}]},
# ordered by descending count.
dateVariantsPerField = {}
for datefield in datefields:
    dateVariants = {}
    for row in rawData['rows']:
        uuid = row['UUID']
        date = row.get(datefield)
        if date is None:
            continue
        # A field may hold several values separated by "|"; empty parts are skipped.
        for d in date.split('|'):
            if not d:
                continue
            genericDate = _toGenericPattern(d)
            variant = dateVariants.setdefault(genericDate, {'count': 0, 'examples': []})
            variant['count'] += 1
            variant['examples'].append({'uuid': uuid, 'value': d})
    dateVariantsOrdered = OrderedDict(sorted(dateVariants.items(), key=lambda kv: kv[1]['count'], reverse=True))
    dateVariantsPerField[datefield] = dateVariantsOrdered

# Interpret Values

In [None]:
# Spot-check the interpreter on one date field: for every template, interpret one example
# value and record whether an interpretation was produced.
# Previously templates that were interpreted successfully were not recorded at all, and
# 'matched' only received placeholder templates for which interpretation had failed.
field = '260$c'
matches = {
    'matched': [],
    'notMatched': []
}
# Fixed seed so that re-running the notebook picks the same examples.
exampleChooser = random.Random(0)
print(len(dateVariantsPerField[field]))
for pattern, details in dateVariantsPerField[field].items():
    example = exampleChooser.choice(details['examples'])['value']
    if interpret(example, pattern):
        matches['matched'].append(pattern)
    else:
        matches['notMatched'].append(pattern)

# print("Matches")
# print("\n".join(matches['matched']))
# print("====\nNot matched")
# print("\n".join(matches['notMatched']))

## Output

In [None]:
# Write one CSV per date field: template, number of appearances, first example value and
# its interpretation.
# The output directory is created when missing so a fresh checkout can run the notebook.
os.makedirs(output, exist_ok=True)
for field in datefields:
    # newline='' is what the csv module expects from the file object; the explicit
    # encoding is needed because the templates contain non-ASCII symbols (🌕, ❓, ¢, ½).
    with open(output + '/dateVariants-' + field + '.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(('Format', 'Appearances', 'Example', 'Interpretation'))
        for variant, details in dateVariantsPerField[field].items():
            example = details['examples'][0]['value']
            csvwriter.writerow((variant, details['count'], example, interpret(example, variant)))
            