In [1]:
import csv
import json
import os
import re
from collections import OrderedDict

In [2]:
# Paths used by the notebook, both relative to the working directory:
# the export that is loaded for analysis, and the directory that receives
# the generated CSV reports.
inputFile = '../input/sari_abzug-utf-8_23_04-tsv.txt'
output = './output'

In [3]:
# Parse the input export as JSON. The encoding is stated explicitly so that
# non-ASCII characters (e.g. "März", "février") are decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open(inputFile, 'r', encoding='utf-8') as f:
    rawData = json.load(f)

In [4]:
# Record fields whose values are profiled for date formats.
# NOTE(review): the keys look like "tag$subfield" codes — confirm against the export schema.
datefields = [
    '100$d',
    '260$c',
    '260$g',
    '264$c',
    '533$d',
    '600$d',
    '611$d',
    '700$d',
]

In [5]:
# Regular-expression fragments used to turn a concrete date string into a
# generic format signature. The month patterns are applied case-insensitively
# by the code that uses them.
monthsGermanFull = r'(Januar)|(Februar)|(März)|(April)|(Mai)|(Juni)|(Juli)|(August)|(September)|(Oktober)|(November)|(Dezember)'
monthsGermanAbr = r'(Jan)|(Feb)|(Mär)|(Apr)|(Mai)|(Jun)|(Jul)|(Aug)|(Sept)|(Okt)|(Nov)|(Dez)'
monthsFrenchFull = r'(janvier)|(février)|(mars)|(avril)|(mai)|(juin)|(juillet)|(aout)|(septembre)|(octobre)|(novembre)|(décembre)'
monthsFrenchAbr = r'(janv)|(févr)|(mars)|(avril)|(avr)|(mai)|(juin)|(juil)|(aout)|(aou)|(sept)|(oct)|(nov)|(déc)'
monthsEnglishFull = r'(January)|(February)|(March)|(April)|(May)|(June)|(July)|(August)|(September)|(October)|(November)|(December)'
monthsEnglishAbr = r'(Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec)'
# Roman numerals I–XII. Regex alternation takes the first alternative that
# matches, so longer numerals must come before their prefixes; otherwise
# "VII" is consumed as "VI" + "I" and yields two tokens instead of one.
numeralRomans = r'XII|XI|IX|X|VIII|VII|VI|IV|V|III|II|I'
# ASCII letters only. The range A-z would also cover the characters between
# "Z" and "a" ([ \ ] ^ _ `), including the "_" used as the digit placeholder.
remaining = r'[A-Za-z]'
# Placeholder for unknown digits.
unknowns = r'XX|xx'

In [6]:
def _generalizeDate(value):
    """Reduce a raw date string to its generic format signature.

    Every digit becomes ``_``; month names, ``XX``/``xx`` placeholders and
    Roman numerals are replaced by emoji tokens, and surrounding whitespace
    is stripped. Values that share a layout therefore map to the same string.
    """
    generic = re.sub(r'\d', '_', value)
    # Full month names are substituted before the abbreviations, because an
    # abbreviation would otherwise match inside the corresponding full name.
    monthTokens = (
        (monthsGermanFull, '🌕🇩🇪'),
        (monthsFrenchFull, '🌕🇫🇷'),
        (monthsEnglishFull, '🌕🇬🇧'),
        (monthsGermanAbr, '🌙🇩🇪'),
        (monthsFrenchAbr, '🌙🇫🇷'),
        (monthsEnglishAbr, '🌙🇬🇧'),
    )
    for pattern, token in monthTokens:
        generic = re.sub(pattern, token, generic, flags=re.IGNORECASE)
    # "XX" placeholders are handled before Roman numerals so that they are
    # not counted as two occurrences of the numeral X.
    generic = re.sub(unknowns, '❓', generic)
    generic = re.sub(numeralRomans, '🏛', generic)
    # Removing leftover letters (pattern `remaining`) is deliberately not applied.
    return generic.strip()


# For every date field, group all values by their format signature and keep,
# per signature, the number of occurrences plus every example (record UUID
# and original value). Signatures are ordered by descending count.
dateVariantsPerField = {}
for datefield in datefields:
    dateVariants = {}
    for row in rawData['rows']:
        uuid = row['UUID']
        if datefield not in row or row[datefield] is None:
            continue
        # A field may hold several values separated by "|"; empty parts are skipped.
        for d in row[datefield].split('|'):
            if not d:
                continue
            variant = dateVariants.setdefault(_generalizeDate(d), {'count': 0, 'examples': []})
            variant['count'] += 1
            variant['examples'].append({'uuid': uuid, 'value': d})
    dateVariantsOrdered = OrderedDict(
        sorted(dateVariants.items(), key=lambda kv: kv[1]['count'], reverse=True)
    )
    dateVariantsPerField[datefield] = dateVariantsOrdered

In [7]:
# Write one CSV report per date field. Each row holds a format signature,
# the number of times it occurs, and the first example recorded for it.
os.makedirs(output, exist_ok=True)  # the output directory may not exist on a fresh run
for field in datefields:
    reportPath = output + '/dateVariants-' + field + '.csv'
    # newline='' leaves line-ending handling to the csv module (prevents blank
    # rows on Windows); UTF-8 is needed because the signatures contain emoji.
    with open(reportPath, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(('Format', 'Appearances', 'Example'))
        for pattern, info in dateVariantsPerField[field].items():
            csvwriter.writerow((pattern, info['count'], info['examples'][0]))


In [8]:
# Display the complete per-field result: every format signature with its
# count and the full list of collected examples.
# NOTE(review): this dumps every example and produces a very large cell
# output; consider showing only the counts or a slice before sharing.
dateVariantsPerField

{'100$d': OrderedDict([('____-____',
               {'count': 5456,
                'examples': ['1828-1870',
                 '1774-1850',
                 '1774-1850',
                 '1796-1851',
                 '1796-1851',
                 '1796-1851',
                 '1798-1868',
                 '1819-1890',
                 '1819-1890',
                 '1774-1850',
                 '1797-1885',
                 '1796-1851',
                 '1781-1834',
                 '1828-1870',
                 '1802-1870',
                 '1813-1886',
                 '1813-1886',
                 '1655-1712',
                 '1655-1712',
                 '1775-1854',
                 '1767-1824',
                 '1746-1812',
                 '1752-1827',
                 '1763-1830',
                 '1677-1736',
                 '1746-1805',
                 '1871-1946',
                 '1871-1946',
                 '1871-1946',
                 '1871-1946',
                 '18