In [1]:
import csv
import os
import re

from lxml import etree

In [2]:
# Source XML export that is split into one file per record further down.
inputFile = '../data/source/nb-allRecords.xml'
# Directory that receives the per-record XML files.
outputDir = '../data/xml/nb'
# Export window: at most `limit` records, starting at index `offset` of the record list.
limit = 10
offset = 0

In [3]:
# CSV files whose rows are merged into matching <Descriptor> elements.
# A row matches when its 'Thesaurus' column and its `curatedKey` column equal the
# descriptor's <Thesaurus> and <IdName> text.
curatedDataFiles = [
    "../data/source/nb-curation-personen.csv",
    "../data/source/nb-curation-koerperschaften.csv",
    "../data/source/nb-curation-geografika.csv"
]
# CSV column compared against a descriptor's <IdName> text.
curatedKey = "Raw"
# CSV columns copied into a matching descriptor as new child elements (tag = column name).
curatedFieldsToAdd = ["GND-Nummer", "GND-Kennung", "WD"]

In [4]:
# Parse the whole source XML. Despite its name, `root` holds the value returned by
# etree.parse() (the tree object); the root element itself is fetched separately below.
root = etree.parse(inputFile)

In [5]:
# Root element of the parsed document; reused further down as the wrapper element
# around each exported record.
collection = root.getroot()

In [6]:
# <Record> elements that are direct children of the root; each one is exported to its own file.
records = root.findall("Record")

In [7]:
# Load every curated CSV into one flat list of row dicts (keys come from each file's header row).
# NOTE(review): no encoding is passed, so the platform default applies — confirm the CSVs are
# stored in that encoding and carry no BOM (a BOM would end up inside the first header name).
curatedData = []
for curatedDataFile in curatedDataFiles:
    # newline='' is what the csv module expects from its input file, so that line
    # breaks inside quoted fields reach the parser untranslated.
    with open(curatedDataFile, 'r', newline='') as f:
        curatedData.extend(csv.DictReader(f))

In [8]:
# Every <Descriptor> element anywhere in the document (the leading // searches all depths).
descriptors = root.xpath("//Descriptor")

In [9]:
# Index the curated rows by (Thesaurus, <curatedKey>) once instead of scanning the whole
# list for every descriptor. setdefault keeps the first row seen for a key, so duplicated
# keys resolve to the same row a first-match scan would pick.
curatedIndex = {}
for row in curatedData:
    # Rows from a CSV without the expected columns can never match. Skip them explicitly:
    # with a blanket except, a single such row silently disabled the lookup for every descriptor.
    if 'Thesaurus' not in row or curatedKey not in row:
        continue
    curatedIndex.setdefault((row['Thesaurus'], row[curatedKey]), row)

# Copy the configured curated fields into each descriptor that has a matching row.
for descriptor in descriptors:
    thesaurusEl = descriptor.find("Thesaurus")
    idNameEl = descriptor.find("IdName")
    if thesaurusEl is None or idNameEl is None:
        # Without both child elements there is nothing to match against.
        continue
    thesaurus = thesaurusEl.text
    key = idNameEl.text

    dataToAdd = curatedIndex.get((thesaurus, key))
    if dataToAdd is None:
        # No curated row for this descriptor; leave it unchanged.
        continue

    for field in curatedFieldsToAdd:
        if field in dataToAdd:
            # The CSV column name doubles as the tag of the new child element.
            el = etree.SubElement(descriptor, field)
            el.text = dataToAdd[field]

In [10]:
def getDateForDateElement(date):
    """Expand a year-only date element into a full ISO-style date string.

    Parameters:
        date: element-like object exposing `.text` and `.tag`.

    Returns:
        "YYYY-01-01" when the tag is 'FromDate' and "YYYY-12-31" for any other tag,
        provided the whitespace-stripped text is exactly '+' followed by four digits.
        False when the text is empty or has any other shape.
    """
    if not date.text:
        return False

    # The whole value must be "+YYYY". A prefix-only match would also accept longer
    # values such as "+19001231" and turn them into a malformed "19001231-01-01".
    match = re.fullmatch(r'\+(\d{4})', date.text.strip())
    if not match:
        return False

    year = match.group(1)
    if date.tag == 'FromDate':
        return "%s-01-01" % year
    return "%s-12-31" % year

In [11]:
# Give every <FromDate>/<ToDate> element that getDateForDateElement() can expand a
# "fullDate" attribute; date elements it cannot expand are left untouched.
dates = root.xpath("//FromDate|//ToDate")
for date in dates:
    fullDate = getDateForDateElement(date)
    if not fullDate:
        continue
    date.set("fullDate", fullDate)

In [12]:
# Write each selected record to its own file, wrapped in the emptied root element.
# NOTE: this consumes the parsed tree. clear() empties the root in place, so once the
# loop has run the tree holds only the last exported record; re-parse the input before
# re-running earlier cells.

# Create the target directory up front; otherwise the first write fails when it is missing.
os.makedirs(outputDir, exist_ok=True)

for record in records[offset:limit+offset]:
    # clear() drops the root's children, attributes and text, leaving a bare wrapper.
    collection.clear()
    recordId = record.get("Id")

    collection.append(record)
    outputFile = "%s/nb-record-%s.xml" % (outputDir, recordId)
    with open(outputFile, 'wb') as f:
        f.write(etree.tostring(collection, xml_declaration=True, pretty_print=True, encoding="UTF-8"))