In [1]:
import re
import copy
import csv
from lxml import etree

In [2]:
# Source XML files; every Record element found in them is merged into one collection
inputFiles = ['../data/source/nb-records.xml', '../data/source/nb-parentrecords.xml']
# Directory that receives one XML file per exported record
outputDir = '../data/xml/nb'
# Number of records to export and index of the first one (records[offset:limit+offset])
limit = 10
offset = 10000

In [3]:
# CSV files with curated data that is merged into the Descriptor elements
curatedDataFiles = [
    "../data/source/nb-curation-personen.csv",
    "../data/source/nb-curation-koerperschaften.csv",
    "../data/source/nb-curation-geografika.csv"
]
# CSV column whose value is compared with a Descriptor's IdName
curatedKey = "Raw"
# CSV columns that are copied into a matching Descriptor as child elements
curatedFieldsToAdd = ["GND-Nummer", "GND-Kennung", "WD"]

In [4]:
# CSV file with curated names; its rows are used to normalise names and roles
curatedNamesFile = "../data/source/nb-curation-names.csv"

In [5]:
# Build one <Collection> root and move every Record found in the input files into it
root = etree.XML("<Collection/>")
for inputFile in inputFiles:
    parsedInput = etree.parse(inputFile)
    root.extend(parsedInput.findall("//Record"))

In [6]:
# Direct Record children of the merged collection root
records = root.findall("Record")

In [7]:
# Gather the distinct ParentId values of all records. A record is filtered out
# further down only if it has no image AND its Id never occurs among these values.
parentIDs = list({record.get('ParentId') for record in records})

In [8]:
# Collect the Ids of "orphans": records that contain no image element
# (DataElement 11040) and that no other record names as its ParentId.
orphans = []

# Set copy of the parent IDs so each membership test is O(1) instead of a scan
# over the whole list; parentIDs itself is left untouched.
parentIdLookup = set(parentIDs)

for record in records:
    recordID = record.get('Id')
    image = record.find('.//DataElement[@ElementId="11040"]')
    # If record contains no image and is not a parent of another record, mark as orphan
    if image is None and recordID not in parentIdLookup:
        orphans.append(recordID)

In [9]:
# Drop the orphaned records. The orphan Ids are put into a set first so the
# filter is linear in the number of records rather than records x orphans.
orphanIdLookup = set(orphans)
records = [d for d in records if d.get('Id') not in orphanIdLookup]

In [10]:
# Read all curated CSV files into one flat list of row dictionaries
curatedData = []
for curatedDataFile in curatedDataFiles:
    with open(curatedDataFile, 'r') as csvFile:
        curatedData.extend(csv.DictReader(csvFile))

In [11]:
# Every Descriptor element anywhere below the collection root
descriptors = root.xpath("//Descriptor")

In [12]:
# Add the curated fields (curatedFieldsToAdd) as child elements to every
# Descriptor whose (Thesaurus, IdName) pair has a matching curated row.

# Index the curated rows once by (Thesaurus, <curatedKey>) so each descriptor is
# resolved with a dictionary lookup instead of a scan over all rows.
curatedIndex = {}
for curatedRow in curatedData:
    # Rows from a file without these columns can never match; skip them rather
    # than letting a KeyError disable the matching for every descriptor.
    if 'Thesaurus' not in curatedRow or curatedKey not in curatedRow:
        continue
    # setdefault keeps the first row per key: the first match wins
    curatedIndex.setdefault((curatedRow['Thesaurus'], curatedRow[curatedKey]), curatedRow)

for descriptor in descriptors:
    thesaurus = descriptor.find("Thesaurus").text
    key = descriptor.find("IdName").text
    dataToAdd = curatedIndex.get((thesaurus, key))
    if dataToAdd is None:
        # No curated data for this descriptor
        continue
    for field in curatedFieldsToAdd:
        if field in dataToAdd:
            el = etree.SubElement(descriptor, field)
            el.text = dataToAdd[field]

In [13]:
# Read the curated names CSV into a list of row dictionaries
curatedNames = []
with open(curatedNamesFile, 'r') as namesFile:
    curatedNames.extend(csv.DictReader(namesFile))

In [14]:

def matchNameWithCuratedNames(name, curatedNames):
    """Look up the normalised form of a raw name.

    Args:
        name: Raw name text; may be None or empty.
        curatedNames: Iterable of row dicts with the keys 'Raw' and
            'normalised name'.

    Returns:
        The 'normalised name' of the first row whose 'Raw' value contains
        ``name`` as a substring, or False if there is none. A missing or blank
        ``name`` returns False without printing.
    """
    # An empty string is a substring of everything and would "match" the first
    # row; None would raise a TypeError in the substring test below.
    if not name or not name.strip():
        return False
    for curatedName in curatedNames:
        raw = curatedName['Raw']
        # raw is None when the CSV row is shorter than the header
        if raw and name in raw:
            return curatedName['normalised name']
    print("Not found ", name)
    return False

def matchRoleWithCuratedNames(name, curatedNames):
    """Extract the roles mentioned in a raw name string.

    Args:
        name: Raw name text; may be None or empty.
        curatedNames: Iterable of row dicts with the keys 'normalised role'
            ("/"-separated labels) and 'gnd role' (";"-separated values).

    Returns:
        For the first row whose non-empty 'normalised role' occurs in ``name``:
        a list of ``{"label": ..., "gnd": ...}`` dicts, pairing labels and gnd
        values by position up to the shorter of the two lists. False if no row
        matches or ``name`` is missing.
    """
    # None would raise a TypeError in the substring test below
    if not name:
        return False
    for curatedName in curatedNames:
        normalisedRole = curatedName['normalised role']
        if normalisedRole and normalisedRole in name:
            labels = normalisedRole.split("/")
            # 'gnd role' is None when the CSV row is shorter than the header
            gndValues = (curatedName['gnd role'] or "").split(";")
            # zip stops at the shorter list
            return [{"label": label, "gnd": gnd} for label, gnd in zip(labels, gndValues)]
    return False

def cleanName(name):
    """Return ``name`` reduced to its ASCII letters (A-Z, a-z); everything else is removed."""
    nonLetters = re.compile(r'[^A-Za-z]+')
    return nonLetters.sub('', name)

# ElementIds of the DataElements whose values are names covered by the curated names
elementIdsWithCuratedNames = ['10817', '10927']
# XPath union that selects those DataElements relative to a Record
dataElementXPath = '|'.join(
    "DetailData/DataElement[@ElementId='%s']" % elementId
    for elementId in elementIdsWithCuratedNames
)

# Process DataElements that have several values in one ElementValue by splitting the TextValue and adding extra ElementValues
#
# For example in ID 476941 the Element 10927 contains a TextValue that refers to two artists:
#
#            <DataElement ElementName="KünstlerIn" ElementId="10927" ElementType="Memo (max. 4000 Z.)" ElementTypeId="7">
#              <ElementValue Sequence="1">
#                <TextValue>Aberli, Johann Ludwig [MalerIn/ZeichnerIn];
#Zingg, Adrian [StecherIn]</TextValue>
#               </ElementValue>
#            </DataElement>
#            
# This should become:
#
#            <DataElement ElementName="KünstlerIn" ElementId="10927" ElementType="Memo (max. 4000 Z.)" ElementTypeId="7">
#              <ElementValue Sequence="1-0">
#                <TextValue>Aberli, Johann Ludwig [MalerIn/ZeichnerIn]</TextValue>
#              </ElementValue>
#              <ElementValue Sequence="1-1">
#                <TextValue>Zingg, Adrian [StecherIn]</TextValue>
#              </ElementValue>
#            </DataElement>

# Split every ElementValue whose TextValue holds several names into one
# ElementValue per name. The new ElementValues get the Sequence "<old>-<index>"
# and are appended to the end of the DataElement.

# Separator between two names inside one TextValue
valueSeparator = ";\n"

for record in records:
    for dataElement in record.xpath(dataElementXPath):
        # findall() returns a list, so removing and adding children below does
        # not disturb this iteration
        for elementValue in dataElement.findall('./ElementValue'):
            text = elementValue.findtext("./TextValue")
            # Leave values alone that are missing, empty or hold a single name.
            # Testing for the full separator (not just ";") avoids rewriting a
            # value that contains a semicolon but cannot actually be split.
            if not text or valueSeparator not in text:
                continue
            # Extract data
            values = text.split(valueSeparator)
            sequence = elementValue.get("Sequence")
            # Remove ElementValue
            dataElement.remove(elementValue)
            # Create new ElementValue elements for each value
            for i, value in enumerate(values):
                newElementValue = etree.SubElement(dataElement, "ElementValue")
                newElementValue.set("Sequence", "%s-%d" % (sequence, i))
                newTextValue = etree.SubElement(newElementValue, "TextValue")
                newTextValue.text = value




In [15]:
# For every name value of a record: copy the matching 'Personen' descriptor(s)
# of that record into the ElementValue and add one <Role> element per role
# found in the name.
# NOTE: the cell appends elements, so running it twice on the same tree adds
# every Descriptor copy and Role a second time.
for record in records:

    # Extract Elements containing names
    recordElements = record.xpath(dataElementXPath)
    if not recordElements:
        continue
    recordDescriptors = record.xpath("Descriptors/Descriptor[Thesaurus/text()='Personen']")

    for recordElement in recordElements:
        # Extract ElementValues (there can be several)
        for value in recordElement.xpath("ElementValue"):
            name = value.findtext("TextValue")
            # Skip values without a TextValue or with empty text; there is nothing to match
            if not name:
                continue

            matchedName = matchNameWithCuratedNames(name, curatedNames)
            if matchedName:
                # Cleaned once per value instead of once per descriptor
                cleanedMatch = cleanName(matchedName)
                # An empty cleaned name is a substring of every IdName and
                # would attach all descriptors, so it is ignored
                if cleanedMatch:
                    # If a match is found, copy the descriptor directly into the Element
                    for descriptor in recordDescriptors:
                        idName = descriptor.findtext("IdName")
                        if idName and cleanedMatch in cleanName(idName):
                            value.append(copy.deepcopy(descriptor))

            matchedRoles = matchRoleWithCuratedNames(name, curatedNames)
            if matchedRoles:
                for role in matchedRoles:
                    roleElement = etree.SubElement(value, "Role")
                    roleElement.set("gnd", role['gnd'])
                    roleElement.text = role['label']

Not found  Mllener, Johann Karl


In [16]:
# elementIdsToAddDescriptors = ['10817', '10927']
# xpath = '|'.join(["//DataElement[@ElementId='%s']" % d for d in elementIdsToAddDescriptors])
# elementsToAddDescriptors = root.xpath(xpath)

In [17]:
# descriptorIndex = {}
# descriptorKeys = []
# for descriptor in descriptors:
#     idName = descriptor.find("IdName").text
#     if idName not in descriptorIndex:
#         descriptorIndex[idName] = descriptor
#         descriptorKeys.append(idName)

In [18]:
# unmatchedNames = []
# for element in elementsToAddDescriptors:
#     values = element.xpath("ElementValue")
#     for value in values:
#         name = value.find("TextValue").text
        
#         try:
#             matchedCuratedName = [d for d in curatedNames if d['Name'] in name][0]
#         except:
#             print("Could not find %s in curated names" % name)
            
#         try:
#             matchedDescriptorKey = [d for d in descriptorKeys if matchedCuratedName['Name'] in d][0]
#         except:
#             unmatchedNames.append(matchedCuratedName['Name'])

# unmatchedNames = list(set(unmatchedNames))
# unmatchedNames.sort()

In [19]:
def getDateForDateElement(date):
    """Expand a year-only date element into a full ISO date string.

    Args:
        date: Element-like object with ``text`` and ``tag`` attributes.

    Returns:
        "YYYY-01-01" if the tag is 'FromDate', otherwise "YYYY-12-31", when the
        text (ignoring surrounding whitespace) is exactly "+" followed by four
        digits. False for missing text or any other format.
    """
    if not date.text:
        return False

    # fullmatch instead of match: a longer value that merely starts with
    # "+dddd" must not be turned into a malformed date such as "17800101-01-01"
    ceYear = re.fullmatch(r'\+(\d{4})', date.text.strip())
    if ceYear is None:
        return False

    year = ceYear.group(1)
    if date.tag == 'FromDate':
        return "%s-01-01" % year
    return "%s-12-31" % year

In [20]:
# Give every FromDate/ToDate element that can be expanded a "fullDate" attribute
dates = root.xpath("//FromDate|//ToDate")
for dateElement in dates:
    expandedDate = getDateForDateElement(dateElement)
    if not expandedDate:
        continue
    dateElement.set("fullDate", expandedDate)

In [21]:
# Write each selected record into its own XML file, wrapped in a <Collection> element.

# Ids of records that are exported in addition to the offset/limit window
additionalIdsToRender = ['476941']
recordsToRender = records[offset:limit+offset] + [r for r in records if r.get('Id') in additionalIdsToRender]

for record in recordsToRender:
    # A fresh wrapper per file instead of clearing the shared root, so the
    # merged tree is not wiped and earlier cells can still be re-run
    collection = etree.Element("Collection")
    recordId = record.get("Id")
    parentId = record.get("ParentId")
    record.set("RecordIdentifier", "nb-" + recordId)
    # Records without a ParentId get no ParentRecordIdentifier
    # ("nb-" + None would raise a TypeError)
    if parentId is not None:
        record.set("ParentRecordIdentifier", "nb-" + parentId)
    # append() moves the record out of its current parent into the wrapper
    collection.append(record)
    outputFile = "%s/nb-record-%s.xml" % (outputDir, recordId)
    with open(outputFile, 'wb') as f:
        f.write(etree.tostring(collection, xml_declaration=True, pretty_print=True, encoding="UTF-8"))