In [None]:
from lxml import etree
import json
import os
import requests
import urllib

In [2]:
# Paths and naming used by the cells below.
# JSON file that is opened and parsed into rawData; its 'rows' list supplies the records.
inputFile = "../input/sari_abzug-utf-8_23_04-tsv.json"
# Directory searched for cached manifest files, named <URL-quoted manifest>.json.
# It is joined by plain string concatenation, so the trailing slash is required.
manifestDirectory = "../manifests/"
# Directory the per-record XML files are written to (also concatenated, keep the slash).
outputDirectory = "../input/"
# File-name prefix: each output file is outputDirectory + outputPrefix + UUID + ".xml".
outputPrefix = "sari-"

In [3]:
def convertRowToXml(row, keys):
    """Convert one input row into a ``<record>`` XML element.

    The record always starts with a ``<uuid>`` child holding ``row['UUID']``.
    Every key in ``keys`` whose value in ``row`` is present and not ``None``
    is then emitted in the order given:

    * a key without ``$`` becomes ``<datafield tag=KEY>`` with the value as text;
    * a key of the form ``TAG$CODE`` becomes ``<subfield code=CODE>`` inside the
      datafield for ``TAG``. Once a datafield has subfields its own text is
      cleared, so the unseparated content is not duplicated.

    If a subfield's parent datafield has not been emitted (its value was
    missing or ``None``, or the subfield key comes first), a fresh
    ``<datafield tag=TAG>`` is created for it rather than attaching the
    subfield to whatever datafield happened to be emitted last.

    Args:
        row: Mapping of column name to value; must contain ``'UUID'``.
        keys: Iterable of column names to emit. It is expected to be sorted so
            that each ``TAG`` key directly precedes its ``TAG$CODE`` keys.

    Returns:
        The populated ``record`` element.

    Raises:
        KeyError: If ``row`` has no ``'UUID'`` entry.
    """
    record = etree.Element("record")
    etree.SubElement(record, "uuid").text = row['UUID']

    datafield = None   # datafield element currently being filled
    currentTag = None  # tag of that datafield, used to match subfields to it
    for key in keys:
        value = row.get(key)
        if value is None:
            continue

        if '$' in key:
            # Split on the separator instead of assuming a 3-character tag.
            tag, _, code = key.partition('$')
            if datafield is None or currentTag != tag:
                # Parent field was skipped or never seen: open one for this tag.
                datafield = etree.SubElement(record, "datafield", tag=tag)
                currentTag = tag
            etree.SubElement(datafield, "subfield", code=code).text = str(value)
            # Remove non-separated field content
            datafield.text = None
        else:
            datafield = etree.SubElement(record, "datafield", tag=key)
            datafield.text = str(value)
            currentTag = key
    return record

In [8]:
def getImagesFromCachedManifest(manifest, directory=None):
    """Read image descriptors from a locally cached manifest file.

    The cache file is looked up as ``<directory>/<URL-quoted manifest>.json``,
    where every reserved character of ``manifest`` (including ``/``) is
    percent-encoded. For each canvas in ``sequences[0].canvases`` the first
    image's ``resource.service['@id']`` is collected together with the canvas
    ``width`` and ``height``. The layout looks like an IIIF Presentation 2.x
    manifest, but that is not verified here.

    Args:
        manifest: Manifest identifier (URL) the cache file is named after.
        directory: Directory holding the cached files. Defaults to the
            notebook-level ``manifestDirectory``.

    Returns:
        A list of ``{'image', 'width', 'height'}`` dicts, one per canvas.
        An empty list is returned (and a message printed) when the manifest
        has not been cached, so callers can iterate the result unconditionally.

    Raises:
        KeyError, IndexError: If the cached JSON lacks the expected structure.
    """
    # Imported locally: a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule has been loaded.
    from urllib.parse import quote

    if directory is None:
        directory = manifestDirectory
    manifestFilePath = os.path.join(directory, quote(manifest, safe='') + '.json')

    if not os.path.isfile(manifestFilePath):
        # Report the manifest itself; the function has no access to the row
        # (the previous message read a global `row` that may not exist).
        print("Manifest for %s has not been cached" % manifest)
        return []

    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(manifestFilePath, 'r', encoding='utf-8') as f:
        content = json.load(f)

    return [{
        'image': c['images'][0]['resource']['service']['@id'],
        'width': c['width'],
        'height': c['height']
    } for c in content['sequences'][0]['canvases']]

In [9]:
def imageListToXml(images):
    """Build an ``<images>`` element from a list of image descriptors.

    Each descriptor yields one ``<image>`` child containing, in this order,
    ``<height>``, ``<width>`` and ``<url type="iiif">``.

    Args:
        images: Iterable of dicts with ``'height'``, ``'width'`` and
            ``'image'`` keys. ``None`` is treated as "no images", because the
            manifest lookup can come back empty-handed for uncached manifests.

    Returns:
        The ``images`` element; it has no children when there are no images.
    """
    imagesNode = etree.Element("images")
    # Tolerate None so a missing manifest produces an empty <images/> element
    # instead of a TypeError.
    for image in images or []:
        imageNode = etree.SubElement(imagesNode, "image")
        etree.SubElement(imageNode, "height").text = str(image['height'])
        etree.SubElement(imageNode, "width").text = str(image['width'])
        etree.SubElement(imageNode, "url", type="iiif").text = image['image']
    return imagesNode

In [4]:
# Load the whole input export into memory. Decode explicitly as UTF-8 (the
# standard JSON encoding) rather than relying on the platform's locale default.
with open(inputFile, 'r', encoding='utf-8') as f:
    rawData = json.load(f)

In [5]:
# Column names to export: the union over all rows, so a field that is absent
# from the first row is not silently dropped (and an empty 'rows' list gives an
# empty key list instead of an IndexError). Sorting places each "TAG" key
# directly before its "TAG$code" keys, because '$' orders before digits and letters.
keys = sorted(set().union(*(r.keys() for r in rawData['rows'])))

In [10]:
# Output individual files: one XML document per row, named after the row's UUID.
# NOTE(review): the [:10] slice restricts the run to the first ten rows —
# presumably a sample limit; confirm before a full export.
for row in rawData['rows'][:10]:
    records = etree.Element("records")
    record = convertRowToXml(row, keys)

    # The lookup may come back with nothing for a manifest that has not been
    # cached; fall back to an empty list so the record is still written (with
    # an empty <images/> element) instead of the export aborting.
    images = getImagesFromCachedManifest(row['manifest']) or []
    record.append(imageListToXml(images))

    records.append(record)

    # os.path.join works whether or not outputDirectory ends with a separator.
    outputFile = os.path.join(outputDirectory, outputPrefix + row['UUID'] + ".xml")
    with open(outputFile, 'wb') as f:
        f.write(etree.tostring(records, pretty_print=True))