In [1]:
import sys
!{sys.executable} -m pip install edtf

You should consider upgrading via the '/Users/fkraeutli/anaconda/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import csv
import json
import random
import re
from collections import OrderedDict
from edtf import parse_edtf

In [3]:
inputFile = "../input/sari_abzug-utf-8_23_04-tsv.txt"
output = "./output"

In [4]:
with open(inputFile, 'r') as f:
    rawData = json.load(f)

In [5]:
datefields = ['100$d', '260$c', '260$g', '264$c', '533$d', '600$d', '611$d', '700$d']

In [6]:
artefactsToRemoveBefore = r'\[|\]'
monthsGermanFull = r'(Januar)|(Februar)|(März)|(April)|(Mai)|(Juni)|(Juli)|(August)|(September)|(Oktober)|(November)|(Dezember)'
monthsGermanAbr = r'(Jan)|(Feb)|(Mär)|(Apr)|(Mai)|(Jun)|(Jul)|(Aug)|(Sept)|(Okt)|(Nov)|(Dez)'
monthsFrenchFull = r'(janvier)|(fégenericDatevrier)|(mars)|(avril)|(mai)|(juin)|(juillet)|(aout)|(septembre)|(octobre)|(novembre)|(décembre)'
monthsFrenchAbr = r'(janv)|(févr)|(mars)|(avril)|(avr)|(mai)|(juin)|(juil)|(aout)|(aou)|(sept)|(oct)|(nov)|(déc)'
monthsEnglishFull = r'(January)|(February)|(March)|(April)|(May)|(June)|(July)|(August)|(September)|(October)|(November)|(December)'
monthsEnglishAbr = r'(Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec)'
monthsRoman = r'((?:\s|\.)I(?:\s|\.))|((?:\s|\.)II(?:\s|\.))|((?:\s|\.)III(?:\s|\.))|((?:\s|\.)IV(?:\s|\.))|((?:\s|\.)V(?:\s|\.))|((?:\s|\.)VI(?:\s|\.))|((?:\s|\.)VII(?:\s|\.))|((?:\s|\.)VIII(?:\s|\.))|((?:\s|\.)IX(?:\s|\.))|((?:\s|\.)X(?:\s|\.))|((?:\s|\.)XI(?:\s|\.))|((?:\s|\.)XII(?:\s|\.))'
unknowns = r'XX|xx'

In [7]:
uncertaintyQualifiers = 'ca\.|ca|circa|um|vermutlich um'
r = {
    'afterYearWithQualifier': r'^(' + uncertaintyQualifiers + ')?(?:nach|nicht vor)\s?(_{4})\??$',
    'beforeYearWithQualifier': r'^(' + uncertaintyQualifiers + ')?(?:vor|nicht nach)\s?(_{4})\??$',
    'fullDateWithMonthInLangOrRoman': '_{1,2}(\.|\s)*(🌕)(\.|\s)*(_{2,4})',
    'monthAndYearWithMonthInLangOrRoman': '🌕(\.|\s)*(_{2,4})',
    'singleDate': r'(?:i\.e\.|den|le)?\s?(_{1,2}\._{1,2}\._{2,4})',
    'singleYearWithQualifier': r'^(?:' + uncertaintyQualifiers + '|A°|A°\.|Anno|anno|gezeichnet nach der Natur)?\s?(____)\??$',
    'yearRangeWithQualifier': r'(?:ca\.)?\s?(?:zwischen)?\s?(____\??)\s?(?:-|und|ud)\s?(____\??)',
    'yearWithPlaceHolderAndQualifier': r'(__--|___-)'
}

months = {
    'de': {
        '1': ["Januar", "Jan"],
        '2': ["Februar", "Feb"],
        '3': ["März", "Maerz", "Mrz"],
        '4': ["April", "Apr"],
        '5': ["Mai"],
        '6': ["Juni", "Jun"],
        '7': ["Juli", "Jul"],
        '8': ["August", "Aug", "Augst"],
        '9': ["September", "Sep", "Sept"],
        '10': ["Oktober", "Okt"],
        '11': ["November", "Nov"],
        '12': ["Dezember", "Dez"],
    },
    'en': {
        '1': ["January", "Jan"],
        '2': ["February", "Feb"],
        '3': ["March", "Mar"],
        '4': ["April", "Apr"],
        '5': ["May"],
        '6': ["June", "Jun"],
        '7': ["July", "Jul"],
        '8': ["August", "Aug"],
        '9': ["September", "Sep", "Sept"],
        '10': ["October", "Oct"],
        '11': ["November", "Nov"],
        '12': ["December", "Dec"],
    },
    'fr': {
        '1': ["Janvier", "Jan", "Janv"],
        '2': ["Février", "Févr"],
        '3': ["Mars", "Mar"],
        '4': ["Avril", "Avr"],
        '5': ["Mai"],
        '6': ["Juin"],
        '7': ["Juillet", "Juil"],
        '8': ["Aout", "Aou"],
        '9': ["Septembre", "Sep", "Sept"],
        '10': ["Octobre", "Oct"],
        '11': ["Novembre", "Nov"],
        '12': ["Décembre", "Dec"],
    },
    'roman': {
        '1': ["I"],
        '2': ["II"],
        '3': ["III"],
        '4': ["IV"],
        '5': ["V"],
        '6': ["VI"],
        '7': ["VII"],
        '8': ["VIII"],
        '9': ["IX"],
        '10': ["X"],
        '11': ["XI"],
        '12': ["XII"],
    }
}

def cleanDateString(dateString):
    s = re.sub('\[|\]', '', dateString)
    return s

def guessMonth(monthString):
    testOrder = ['de', 'fr', 'en', 'roman']
    monthString = re.sub(r'\.|\s', '', monthString)
    for lang in testOrder:
        for i in months[lang].keys():
            for monthVariation in months[lang][i]:
                if monthVariation.lower() == monthString.lower():
                    return i
    return 0
            
def afterYearWithQualifier(dateString):
    year = re.search(r'(\d{4}\??)', dateString).group(1)
    uncertain = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    year = "/" + year
    if uncertain and not '?' in year:
        return year + "?"
    else:
        return year

def beforeYearWithQualifier(dateString):
    year = re.search(r'(\d{4}\??)', dateString).group(1)
    uncertain = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    year = "/" + year
    if uncertain and not '?' in year:
        return year + "?"
    else:
        return year

def singleYearWithQualifier(dateString):
    year = re.search(r'(\d{4}\??)', dateString).group(1)
    uncertain = re.search(r'(' + uncertaintyQualifiers + ')', dateString)
    if uncertain and not '?' in year:
        return year + "?"
    else:
        return year

def yearRangeWithQualifier(dateString):
    years = re.findall(r'(\d{4}\??)', dateString)
    uncertain = re.search(r'(ca)', dateString)
    if uncertain:
        for i, year in enumerate(years):
            if not '?' in year:
                years[i] += '?'
    return "/".join(years)

def singleDate(dateString):
    date = re.search(r'\d{1,2}\.\d{1,2}\.\d{2,4}', dateString)
    return date.group(0)

def fullDateWithMonthInLangOrRoman(dateString):
    allMonths = '|'.join([monthsGermanFull, monthsFrenchFull, monthsEnglishFull, monthsGermanAbr, monthsFrenchAbr, monthsEnglishAbr, monthsRoman])
    datePattern = r'(\d{1,2})(?:\.|\s)*(?:' + allMonths + ')(?:\.|\s)*(?:\d{2,4})'
    yearPattern = r'((\d{2,4})\.?$|\d{4})'
    try:
        date = re.search(datePattern, dateString, flags=re.IGNORECASE).group(1)
    except:
        date = "xx"
        
    try:
        monthWords = re.search(allMonths, dateString, flags=re.IGNORECASE).group(0)
        month = str(guessMonth(monthWords))
    except:
        month = "xx"

    try:
        year = re.search(yearPattern, dateString).group(1)
    except:
        year = "xxxx"
    
    return '.'.join([date, month, year])

def monthAndYearWithMonthInLangOrRoman(dateString):
    allMonths = '|'.join([monthsGermanFull, monthsFrenchFull, monthsEnglishFull, monthsGermanAbr, monthsFrenchAbr, monthsEnglishAbr, monthsRoman])
    yearPattern = r'((\d{2,4})\.?$|\d{4})'
        
    try:
        monthWords = re.search(allMonths, dateString, flags=re.IGNORECASE).group(0)
        month = str(guessMonth(monthWords))
    except:
        month = "xx"

    try:
        year = re.search(yearPattern, dateString).group(1)
    except:
        year = "xxxx"
    
    return '.'.join([month, year])

def yearWithPlaceHolderAndQualifier(dateString):
    uncertain = re.search(r'(ca|\?)', dateString)
    quantifier = '?' if uncertain else ''
    m = re.search(r'(\d{2})--', dateString)
    if m:
        century = m.group(1)
        return "%s00%s/%s99%s" % (century, quantifier, century, quantifier)
    m = re.search(r'(\d{3})-', dateString)
    if m:
        century = m.group(1)
        return "%s0%s/%s9%s" % (century, quantifier, century, quantifier)

def interpret(dateString, pattern):
    ds = cleanDateString(dateString)
    testOrder = ['singleDate', 'fullDateWithMonthInLangOrRoman', 'monthAndYearWithMonthInLangOrRoman', 'singleYearWithQualifier', 'beforeYearWithQualifier', 'afterYearWithQualifier', 'yearRangeWithQualifier', 'yearWithPlaceHolderAndQualifier']
    possibles = globals().copy()
    possibles.update(locals())
    for test in testOrder:
        m = re.search(r[test], pattern)
        if m:    
            f = possibles.get(test)
            if not f:
                raise NotImplementedError("Function %s not implemented" % test)
            return f(ds)
    
    return None

In [8]:
dateVariantsPerField = {}
for datefield in datefields:
    dateVariants = {}
    for row in rawData['rows']:
        uuid = row['UUID']
        if datefield in row and row[datefield] != None:
            date = row[datefield]
            for d in date.split('|'):
                if d:
                    genericDate = re.sub(artefactsToRemoveBefore, '', d)
                    genericDate = re.sub(r'\d','_', genericDate)
                    langOrder = ['de', 'fr', 'en']
                    for lang in langOrder:
                        r = r'(' + ')|('.join([month for variations in [months[lang][d] for d in months[lang]] for month in variations]) + ')'                            
                        genericDate = re.sub(r, '🌕', genericDate, flags=re.IGNORECASE)
                    genericDate = re.sub(unknowns, '❓', genericDate)
                    monthsRoman = r'(' + ')|('.join([month for variations in [months['roman'][d] for d in months['roman']] for month in variations]) + ')'                            
                    genericDate = re.sub(monthsRoman, ' 🏛 ', genericDate)
                    genericDate = genericDate.strip()
                    entry = { 'uuid': uuid, 'value': d}
                    if genericDate in dateVariants:
                        dateVariants[genericDate]['count'] += 1
                        dateVariants[genericDate]['examples'].append(entry)
                    else:
                        dateVariants[genericDate] = {
                            'count': 1,
                            'examples': [entry]
                        }
    dateVariantsOrdered = OrderedDict(sorted(dateVariants.items(), key=lambda kv: kv[1]['count'], reverse=True))
    dateVariantsPerField[datefield] = dateVariantsOrdered

In [9]:
r'(' + ')|('.join([month for variations in [months['de'][d] for d in months['de']] for month in variations]) + ')'

'(Januar)|(Jan)|(Februar)|(Feb)|(März)|(Maerz)|(Mrz)|(April)|(Apr)|(Mai)|(Juni)|(Jun)|(Juli)|(Jul)|(August)|(Aug)|(Augst)|(September)|(Sep)|(Sept)|(Oktober)|(Okt)|(November)|(Nov)|(Dezember)|(Dez)'

# Interpret Values

In [10]:
field = '260$c'
matches = {
    'matched': [],
    'notMatched': []
}
print(len(dateVariantsPerField[field]))
for pattern in dateVariantsPerField[field].keys():
    example = random.choice(dateVariantsPerField[field][pattern]['examples'])['value']
    if not interpret(example, pattern):    
        m = re.search(r'(__--|___-)', pattern)
        if m:
            matches['matched'].append(pattern)
#             print(example, " --> ", yearWithPlaceHolderAndQualifier(example))
        else:
            matches['notMatched'].append(pattern)

# print("Matches")
# print("\n".join(matches['matched']))
# print("====\nNot matched")
# print("\n".join(matches['notMatched']))

484


TypeError: string indices must be integers

## Output

In [None]:
for field in datefields:
    with open(output + '/dateVariants-' + field + '.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(('Format', 'Appearances', 'Example', 'Interpretation'))
        for k, d in enumerate(dateVariantsPerField[field]):
            example = dateVariantsPerField[field][d]['examples'][0]['value']
            csvwriter.writerow((d, dateVariantsPerField[field][d]['count'], example, interpret(example, d)))