In [1]:
import csv
import json
import re
from collections import OrderedDict

In [2]:
# Path (relative to the notebook's working directory) of the export to analyse.
# NOTE(review): the name ends in "-tsv.txt", yet the next cell parses the file
# with json.load — confirm the file really contains JSON.
inputFile = "../input/sari_abzug-utf-8_23_04-tsv.txt"

In [3]:
# Parse the whole input file as one JSON document. Later cells read the
# records from rawData['rows'].
# The encoding is passed explicitly so the non-ASCII characters in the data
# are decoded the same way on every platform instead of depending on the
# locale default (the file name indicates a UTF-8 export).
with open(inputFile, 'r', encoding='utf-8') as f:
    rawData = json.load(f)

In [4]:
# Keys of each row that are scanned for date strings by the cells below.
# NOTE(review): these look like MARC "tag$subfield" codes — confirm against
# the export's documentation.
datefields = ['100$d', '260$c', '260$g', '264$c', '533$d', '600$d', '611$d', '700$d']

In [5]:
# Regex alternations used to recognise month names and Roman numerals inside
# free-text date strings. The month patterns are applied with re.IGNORECASE by
# the cell that uses them; numeralRomans is applied case-sensitively.
#
# Python's regex alternation is leftmost-first, not longest-match: within one
# pattern a longer alternative must therefore be listed BEFORE any shorter
# alternative that is a prefix of it, otherwise the short one wins and the
# remainder of the token is left behind.
monthsGermanFull = r'(Januar)|(Februar)|(März)|(April)|(Mai)|(Juni)|(Juli)|(August)|(September)|(Oktober)|(November)|(Dezember)'
monthsGermanAbr = r'(Jan)|(Feb)|(Mär)|(Apr)|(Mai)|(Jun)|(Jul)|(Aug)|(Sept)|(Okt)|(Nov)|(Dez)'
# "août" is accepted both with and without the circumflex.
monthsFrenchFull = r'(janvier)|(février)|(mars)|(avril)|(mai)|(juin)|(juillet)|(aout)|(août)|(septembre)|(octobre)|(novembre)|(décembre)'
monthsFrenchAbr = r'(janv)|(févr)|(mars)|(avril)|(avr)|(mai)|(juin)|(juil)|(aout)|(août)|(aou)|(aoû)|(sept)|(oct)|(nov)|(déc)'
monthsEnglishFull = r'(January)|(February)|(March)|(April)|(May)|(June)|(July)|(August)|(September)|(October)|(November)|(December)'
monthsEnglishAbr = r'(Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec)'
# Roman numerals I-X, ordered longest-first so that e.g. "IV", "VII", "VIII"
# and "IX" are each consumed as ONE numeral rather than being split into two
# matches ("I"+"V", "VI"+"I", ...).
numeralRomans = r'(VIII)|(VII)|(III)|(IV)|(VI)|(IX)|(II)|(V)|(X)|(I)'

In [6]:
# Ordered substitution pipeline that turns a concrete date string into a
# generic "shape". Order matters:
#   * digits are masked first;
#   * full month names run before abbreviations, so "Januar" is not partially
#     consumed by "Jan";
#   * German runs before French before English, so spellings shared between
#     languages (e.g. "April", "August") are attributed to the earlier one;
#   * Roman numerals run last and are matched case-sensitively.
dateSubstitutions = (
    (r'\d', 'x', 0),
    (monthsGermanFull, 'MMMM(de)', re.IGNORECASE),
    (monthsFrenchFull, 'MMMM(fr)', re.IGNORECASE),
    (monthsEnglishFull, 'MMMM(en)', re.IGNORECASE),
    (monthsGermanAbr, 'MM(de)', re.IGNORECASE),
    (monthsFrenchAbr, 'MM(fr)', re.IGNORECASE),
    (monthsEnglishAbr, 'MM(en)', re.IGNORECASE),
    (numeralRomans, 'R', 0),
)


def _generalizeDate(rawDate):
    """Return the generic pattern of ``rawDate``.

    Every substitution in ``dateSubstitutions`` is applied in order and the
    result is stripped of surrounding whitespace.
    """
    genericDate = rawDate
    for pattern, placeholder, flags in dateSubstitutions:
        genericDate = re.sub(pattern, placeholder, genericDate, flags=flags)
    return genericDate.strip()


# For every date field, count how often each generic pattern occurs and keep
# the original strings that produced it.
# Result: {field: OrderedDict{pattern: {'count': int, 'examples': [str, ...]}}}
# with patterns ordered by descending count.
dateVariantsPerField = {}
for datefield in datefields:
    dateVariants = {}
    for row in rawData['rows']:
        # Skip rows where the field is absent or explicitly null.
        if datefield not in row or row[datefield] is None:
            continue
        date = row[datefield]
        # A single field may hold several values separated by '|'.
        for d in date.split('|'):
            genericDate = _generalizeDate(d)
            variant = dateVariants.setdefault(genericDate, {'count': 0, 'examples': []})
            variant['count'] += 1
            variant['examples'].append(d)
    # sorted() is stable, so patterns with equal counts keep first-seen order.
    dateVariantsOrdered = OrderedDict(sorted(dateVariants.items(), key=lambda kv: kv[1]['count'], reverse=True))
    dateVariantsPerField[datefield] = dateVariantsOrdered

In [8]:
# Write one CSV per date field into the current working directory, listing
# each generic pattern, how often it occurred and the first example seen.
for field in datefields:
    # newline='' is required by the csv module so that it controls the line
    # endings itself (otherwise blank rows appear on Windows); the encoding is
    # explicit because example strings may contain non-ASCII characters.
    with open('dateVariants-' + field + '.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(('Format', 'Appearances', 'Example'))
        # Iteration follows the stored order of the per-field mapping.
        for variant, stats in dateVariantsPerField[field].items():
            csvwriter.writerow((variant, stats['count'], stats['examples'][0]))
