In [1]:
import re
import copy
import csv
from lxml import etree

In [2]:
# Source XML that is parsed into the tree, and the directory that receives
# one "nb-record-<Id>.xml" file per rendered record.
inputFile = '../data/source/nb-records.xml'
outputDir = '../data/xml/nb'
# Only the slice records[offset:limit+offset] is written out by the final
# cell (plus any explicitly listed additional record Ids).
limit = 10
offset = 0

In [3]:
# CSV files whose rows are matched against <Descriptor> elements. A row
# matches when its 'Thesaurus' column equals the descriptor's <Thesaurus>
# text and its `curatedKey` column equals the descriptor's <IdName> text.
curatedDataFiles = [
    "../data/source/nb-curation-personen.csv",
    "../data/source/nb-curation-koerperschaften.csv",
    "../data/source/nb-curation-geografika.csv"
]
# Name of the CSV column compared with the descriptor's <IdName> text.
curatedKey = "Raw"
# CSV columns copied onto a matching descriptor as child elements of the
# same name.
curatedFieldsToAdd = ["GND-Nummer", "GND-Kennung", "WD"]

In [4]:
# CSV read into `curatedNames`; the matching helpers below read its
# 'Raw', 'normalised name', 'normalised role' and 'gnd role' columns.
curatedNamesFile = "../data/source/nb-curation-names.csv"

In [5]:
# Parse the source XML. Despite its name, `root` is the parsed tree object
# (it is asked for its root element via getroot() in the next cell).
root = etree.parse(inputFile)

In [6]:
# The document's root element; it is reused as the wrapper element when the
# individual record files are written at the end of the notebook.
collection = root.getroot()

In [7]:
# All <Record> elements matched by the relative path "Record" (i.e. the
# records directly below the document root).
records = root.findall("Record")

In [8]:
# Load every curated CSV into one flat list of row dicts (column -> value).
# newline='' is what the csv module asks for when opening files, so that
# quoted fields containing line breaks are parsed correctly.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm the CSVs' encoding and pass it explicitly if it differs.
curatedData = []
for curatedDataFile in curatedDataFiles:
    with open(curatedDataFile, 'r', newline='') as f:
        curatedData.extend(csv.DictReader(f))

In [9]:
# Every <Descriptor> element anywhere in the document.
descriptors = root.xpath("//Descriptor")

In [10]:
# Enrich each <Descriptor> with the curated fields of its matching CSV row.
#
# A lookup keyed by (Thesaurus, curatedKey value) is built once, instead of
# scanning every curated row for every descriptor. setdefault keeps the
# first row for a key, which is the row the original "[...][0]" scan picked.
curatedIndex = {}
for row in curatedData:
    rowKey = (row.get('Thesaurus'), row.get(curatedKey))
    if None in rowKey:
        # Row lacks one of the two matching columns; it can never match.
        continue
    curatedIndex.setdefault(rowKey, row)

for descriptor in descriptors:
    thesaurusElement = descriptor.find("Thesaurus")
    keyElement = descriptor.find("IdName")
    if thesaurusElement is None or keyElement is None:
        # Descriptor without the elements needed for matching: skip it
        # rather than failing on `.text` of None.
        continue
    thesaurus = thesaurusElement.text
    key = keyElement.text

    dataToAdd = curatedIndex.get((thesaurus, key))
    if dataToAdd is None:
        # No curated row for this descriptor.
        continue

    for field in curatedFieldsToAdd:
        if field not in dataToAdd:
            continue
        if descriptor.find(field) is not None:
            # Already present (e.g. the cell was run before); do not add a
            # duplicate child, so re-running the cell is harmless.
            continue
        el = etree.SubElement(descriptor, field)
        el.text = dataToAdd[field]

In [11]:
# Load the curated names CSV into a list of row dicts (column -> value).
# newline='' is what the csv module asks for when opening files, so that
# quoted fields containing line breaks are parsed correctly.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm the CSV's encoding and pass it explicitly if it differs.
curatedNames = []
with open(curatedNamesFile, 'r', newline='') as f:
    curatedNames.extend(csv.DictReader(f))

In [12]:
# ElementId attribute values of the <DataElement>s whose values are matched
# against the curated names.
elementIdsWithCuratedNames = ['10817', '10927']
# One XPath union expression (relative to a record) selecting all of those
# data elements at once.
dataElementXPath = '|'.join(
    "DetailData/DataElement[@ElementId='%s']" % elementId
    for elementId in elementIdsWithCuratedNames
)

def matchNameWithCuratedNames(name, curatedNames):
    """Look up the normalised form of a raw name.

    Args:
        name: Raw name text; may be None (e.g. an empty text element).
        curatedNames: Iterable of row dicts with the keys 'Raw' and
            'normalised name'.

    Returns:
        The 'normalised name' of the first row whose 'Raw' value contains
        `name` as a substring, or False when there is no such row. A
        "Not found" line is printed in the latter case.
    """
    # A None name cannot be substring-tested (`None in "..."` raises
    # TypeError), so it is reported as not found instead.
    if name is not None:
        for curatedName in curatedNames:
            rawName = curatedName['Raw']
            # A None 'Raw' value (short CSV row) can never contain the name.
            if rawName is not None and name in rawName:
                return curatedName['normalised name']
    print("Not found ", name)
    return False

def matchRoleWithCuratedNames(name, curatedNames):
    """Derive the role(s) mentioned in a raw name.

    Args:
        name: Raw name text; may be None (e.g. an empty text element).
        curatedNames: Iterable of row dicts with the keys
            'normalised role' ("/"-separated labels) and 'gnd role'
            (";"-separated values).

    Returns:
        For the first row whose non-empty 'normalised role' occurs as a
        substring of `name`: a list of {"label": ..., "gnd": ...} dicts
        pairing labels and gnd values by position (surplus entries on the
        longer side are dropped). False when no row matches.
    """
    # A None name cannot be searched (`"..." in None` raises TypeError).
    if name is None:
        return False
    for curatedName in curatedNames:
        normalisedRole = curatedName['normalised role']
        if normalisedRole and normalisedRole in name:
            roles = normalisedRole.split("/")
            gndRoles = curatedName['gnd role'].split(";")
            # zip stops at the shorter list, like the original min() loop.
            return [{"label": label, "gnd": gnd} for label, gnd in zip(roles, gndRoles)]
    return False

def cleanName(name):
    """Return `name` with every character other than A-Z / a-z removed.

    Note that this also drops non-ASCII letters (e.g. umlauts), digits,
    whitespace and punctuation.
    """
    nonLetterRuns = re.compile(r'[^A-Za-z]+')
    return nonLetterRuns.sub('', name)

# For every name value of the selected data elements, attach a copy of each
# matching 'Personen' descriptor of the same record and one <Role> element
# per role recognised in the name.
for record in records:

    recordDescriptors = record.xpath("Descriptors/Descriptor[Thesaurus/text()='Personen']")

    for recordElement in record.xpath(dataElementXPath):
        for value in recordElement.xpath("ElementValue"):
            name = value.findtext("TextValue")
            if not name:
                # Missing or empty <TextValue>: nothing to match against.
                continue

            matchedName = matchNameWithCuratedNames(name, curatedNames)
            matchedRoles = matchRoleWithCuratedNames(name, curatedNames)

            # matchedName is False when no curated name was found; cleaning
            # it would raise. An empty cleaned name is skipped as well,
            # because "" is a substring of every IdName and would attach
            # every person descriptor of the record.
            cleanedMatch = cleanName(matchedName) if matchedName else ''
            if cleanedMatch:
                for descriptor in recordDescriptors:
                    idName = descriptor.findtext("IdName") or ''
                    if cleanedMatch in cleanName(idName):
                        # Copy, so the descriptor also stays in <Descriptors>.
                        value.append(copy.deepcopy(descriptor))

            for role in matchedRoles or []:
                roleElement = etree.SubElement(value, "Role")
                roleElement.set("gnd", role['gnd'])
                roleElement.text = role['label']

In [13]:
# elementIdsToAddDescriptors = ['10817', '10927']
# xpath = '|'.join(["//DataElement[@ElementId='%s']" % d for d in elementIdsToAddDescriptors])
# elementsToAddDescriptors = root.xpath(xpath)

In [14]:
# descriptorIndex = {}
# descriptorKeys = []
# for descriptor in descriptors:
#     idName = descriptor.find("IdName").text
#     if idName not in descriptorIndex:
#         descriptorIndex[idName] = descriptor
#         descriptorKeys.append(idName)

In [15]:
# unmatchedNames = []
# for element in elementsToAddDescriptors:
#     values = element.xpath("ElementValue")
#     for value in values:
#         name = value.find("TextValue").text
        
#         try:
#             matchedCuratedName = [d for d in curatedNames if d['Name'] in name][0]
#         except:
#             print("Could not find %s in curated names" % name)
            
#         try:
#             matchedDescriptorKey = [d for d in descriptorKeys if matchedCuratedName['Name'] in d][0]
#         except:
#             unmatchedNames.append(matchedCuratedName['Name'])

# unmatchedNames = list(set(unmatchedNames))
# unmatchedNames.sort()

In [16]:
def getDateForDateElement(date):
    """Expand a "+YYYY" date element to a full ISO-style date string.

    Args:
        date: Object with `.text` (the raw date text, possibly None or
            empty) and `.tag` (the element name).

    Returns:
        "YYYY-01-01" when the tag is 'FromDate', "YYYY-12-31" for any other
        tag, or False when the text is empty or does not start with a plus
        sign followed by four digits.
    """
    if not date.text:
        return False

    # Capture exactly the four year digits. re.match only anchors at the
    # start, so using the whole remainder of the text as the year would
    # leak any trailing characters into the result.
    matchCeYear = re.match(r'\+(\d{4})', date.text)
    if not matchCeYear:
        return False

    year = matchCeYear.group(1)
    if date.tag == 'FromDate':
        return "%s-01-01" % year
    return "%s-12-31" % year

In [17]:
# Store the expanded date of every <FromDate>/<ToDate> element in a
# "fullDate" attribute; elements whose text cannot be expanded are left
# unchanged.
dates = root.xpath("//FromDate|//ToDate")
for date in dates:
    fullDate = getDateForDateElement(date)
    if not fullDate:
        continue
    date.set("fullDate", fullDate)

In [18]:
# Record Ids that are always rendered, in addition to the offset/limit slice.
additionalIdsToRender = ['476941']

# The slice comes first, followed by the explicitly requested records. A
# record that is in both is simply written twice to the same file.
recordsToRender = records[offset:limit+offset]
recordsToRender += [r for r in records if r.get('Id') in additionalIdsToRender]

for record in recordsToRender:
    # Reset the shared root element so that it wraps exactly this one record.
    # This modifies the parsed tree in place.
    collection.clear()
    collection.append(record)

    recordId = record.get("Id")
    outputFile = "%s/nb-record-%s.xml" % (outputDir, recordId)
    with open(outputFile, 'wb') as f:
        serialised = etree.tostring(collection, xml_declaration=True, pretty_print=True, encoding="UTF-8")
        f.write(serialised)