In [1]:
from csv import DictWriter
from lxml import etree
from os import listdir
from os.path import isfile, join
from tqdm import tqdm

In [2]:
# Folders that are scanned for multimedia / object XML files, and the path
# of the CSV file that the last cell writes.
inputFolderMultimedia = "../data/source/multimedia/"
inputFolderObjects = "../data/source/object/"
outputFile = "output/correspondences.csv"

List all multimedia item files (XML) in the input folder

In [3]:
# Collect the names of all XML files in the multimedia folder.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the processing order (and any debugging subset taken with a slice)
# the same on every run.
multimediaFiles = sorted(
    f for f in listdir(inputFolderMultimedia)
    if f.endswith('.xml') and isfile(join(inputFolderMultimedia, f))
)

List all object item files (XML) in the input folder

In [4]:
# Collect the names of all XML files in the object folder.
# os.listdir returns entries in arbitrary order; sorting keeps the processing
# order stable, which in turn keeps the order of the object ids, uuids and
# inventory numbers collected per multimedia item stable between runs.
objectFiles = sorted(
    f for f in listdir(inputFolderObjects)
    if f.endswith('.xml') and isfile(join(inputFolderObjects, f))
)

Initialise dictionary to store data

In [5]:
# Maps a multimedia item uuid to a dict of the fields collected for it;
# starts out empty and is filled by the cells below.
multimediaItems = dict()

Set limit of number of files to process (only used for debugging)

In [6]:
# Maximum number of files to process per folder. None means "no limit":
# slicing with [:None] yields the whole list, so a folder is never silently
# truncated at an arbitrary cap. Set to a small integer for a quick
# debugging run.
limit = None

Read multimedia item files and retrieve uuid, id and filename

In [7]:
# XML namespace (in "{uri}tag" notation) of the elements looked up below
moduleNs = '{http://www.zetcom.com/ria/ws/module}'
# Path to the value of the "MulOriginalFileTxt" data field
filenamePath = './/' + moduleNs + 'dataField[@name="MulOriginalFileTxt"]/' + moduleNs + 'value'

for file in tqdm(multimediaFiles[:limit]):
    tree = etree.parse(join(inputFolderMultimedia, file))
    # Look the module item up once; it carries both the uuid and the module ID
    moduleItem = tree.find('.//' + moduleNs + 'moduleItem')
    uuid = moduleItem.get('uuid')
    # Fetch the entry for this uuid, creating it if it does not exist yet
    item = multimediaItems.setdefault(uuid, {})
    # Retrieve filename. An element without text (text is None) is skipped so
    # that a 'filename' key is only present when there is an actual value;
    # the object cell further down only fills in filenames that are absent.
    filenameElement = tree.find(filenamePath)
    if filenameElement is not None and filenameElement.text is not None:
        item['filename'] = filenameElement.text
    # Retrieve module ID
    item['moduleId'] = moduleItem.get('id')

100%|██████████| 276698/276698 [01:06<00:00, 4157.97it/s]


Read object item files and retrieve uuid and id of linked objects as well as multimedia item data that might be missing

In [8]:
# XML namespace (in "{uri}tag" notation) of the elements looked up below
moduleNs = '{http://www.zetcom.com/ria/ws/module}'
# Path to the value of the "NumberVrt" virtual field inside "ObjObjectNumberGrp"
invNrPath = ('.//' + moduleNs + 'repeatableGroup[@name="ObjObjectNumberGrp"]/'
             + moduleNs + 'repeatableGroupItem/'
             + moduleNs + 'virtualField[@name="NumberVrt"]/'
             + moduleNs + 'value')
# Path to the multimedia reference items of an object
multimediaRefPath = ('.//' + moduleNs + 'moduleReference[@name="ObjMultimediaRef"]/'
                     + moduleNs + 'moduleReferenceItem')

for file in tqdm(objectFiles[:limit]):
    tree = etree.parse(join(inputFolderObjects, file))
    # Look the module item up once; it carries both the object uuid and id
    moduleItem = tree.find('.//' + moduleNs + 'moduleItem')
    # Retrieve object uuid
    objectUuid = moduleItem.get('uuid')
    # Retrieve object id
    objectId = moduleItem.get('id')
    # Retrieve inventory number; fall back to an empty string when the
    # element is missing or has no text. Explicit checks are used instead of
    # a bare except so that unrelated errors (and Ctrl-C) are not swallowed.
    invNrElement = tree.find(invNrPath)
    if invNrElement is not None and invNrElement.text is not None:
        objectInvNr = invNrElement.text.strip()
    else:
        objectInvNr = ''
    # Retrieve linked multimedia items
    for moduleReferenceItem in tree.findall(multimediaRefPath):
        # Get uuid of multimedia item
        uuid = moduleReferenceItem.get('uuid')
        # Get id of multimedia item
        multimediaModuleId = moduleReferenceItem.get('moduleItemId')
        # Fetch the entry for this uuid, creating it if it does not exist yet
        item = multimediaItems.setdefault(uuid, {})
        # Add object id, uuid and inv. nr. (the three lists stay aligned by position)
        item.setdefault('objectIds', []).append(objectId)
        item.setdefault('objectUuids', []).append(objectUuid)
        item.setdefault('objectInvNrs', []).append(objectInvNr)
        # If the module ID is not present (no multimedia file was read for
        # this uuid), take it from the reference item
        if 'moduleId' not in item and multimediaModuleId is not None:
            item['moduleId'] = multimediaModuleId
        # If filename is not present, retrieve from formatted value
        if 'filename' not in item:
            filenameElement = moduleReferenceItem.find('./' + moduleNs + 'formattedValue')
            # Guard against an empty element, whose text is None
            if filenameElement is not None and filenameElement.text is not None:
                # The second ', '-separated part is used as the filename
                parts = filenameElement.text.split(', ')
                if len(parts) > 1:
                    item['filename'] = parts[1]

100%|██████████| 89630/89630 [01:00<00:00, 1474.44it/s]


Sort uuids for output

In [9]:
# Iterating a dict yields its keys, so this is the list of all collected
# uuids in ascending order.
uuids = sorted(multimediaItems)

Write output to csv file

In [10]:
# Write one CSV row per multimedia item, in the order given by `uuids`.
# newline='' is required by the csv module (otherwise an extra '\r' is
# emitted on platforms that translate line endings), and the encoding is
# fixed to UTF-8 so the output does not depend on the platform default.
with open(outputFile, 'w', newline='', encoding='utf-8') as f:
    writer = DictWriter(f, fieldnames=('uuid', 'filename', 'multimediaId', 'objectUuids', 'objectIds', 'objectInvNrs'))
    writer.writeheader()
    for uuid in uuids:
        item = multimediaItems[uuid]
        # Fields that were never collected for an item are written as empty
        # strings; list-valued fields are joined with ';'
        writer.writerow({
            'uuid': uuid,
            'filename': item.get('filename', ''),
            'multimediaId': item.get('moduleId', ''),
            'objectUuids': ';'.join(item.get('objectUuids', [])),
            'objectIds': ';'.join(item.get('objectIds', [])),
            'objectInvNrs': ';'.join(item.get('objectInvNrs', []))
        })