In [1]:
from csv import DictWriter
from lxml import etree
from os import listdir
from os.path import isfile, join
from tqdm import tqdm

In [2]:
# Folder that is scanned for multimedia item XML files
inputFolderMultimedia = '../data/source/multimedia/'
# Folder that is scanned for object item XML files
inputFolderObjects = '../data/source/object/'
# CSV file the multimedia/object correspondences are written to
outputFile = 'output/correspondences.csv'

List all multimedia item files

In [3]:
# os.listdir returns entries in arbitrary order; sort the names so that the
# processing order (and any debugging slice taken with `limit`) is the same
# on every run. The cheap suffix test runs before the filesystem check.
multimediaFiles = sorted(
    f for f in listdir(inputFolderMultimedia)
    if f.endswith('.xml') and isfile(join(inputFolderMultimedia, f))
)

List all object item files

In [4]:
# os.listdir returns entries in arbitrary order; sort the names so that the
# order in which objects are appended to each multimedia item (and therefore
# the order of the ';'-joined values in the output) is reproducible.
objectFiles = sorted(
    f for f in listdir(inputFolderObjects)
    if f.endswith('.xml') and isfile(join(inputFolderObjects, f))
)

Initialise dictionary to store data

In [5]:
# Maps a multimedia item uuid to a dict of collected values; the cells below
# fill in 'filename', 'moduleId', 'objectIds', 'objectUuids' and 'objectInvNrs'
multimediaItems = {}

Set limit of number of files to process (only used for debugging)

In [6]:
# Upper bound used as `files[:limit]` by the processing loops.
# None means "no limit" (a slice ending in None takes every element), so the
# full run can never be truncated silently; set an integer only for debugging.
limit = None

Read multimedia item files and retrieve uuid, id and filename

In [7]:
# XML namespace (Clark notation) shared by all elements looked up below
moduleNs = '{http://www.zetcom.com/ria/ws/module}'
for file in tqdm(multimediaFiles[:limit]):
    tree = etree.parse(join(inputFolderMultimedia, file))
    # Look the module item up once; it carries both the uuid and the module ID
    moduleItem = tree.find('.//' + moduleNs + 'moduleItem')
    if moduleItem is None:
        # Name the offending file instead of failing with an opaque AttributeError
        raise ValueError('No moduleItem element found in ' + file)
    uuid = moduleItem.get('uuid')
    item = multimediaItems.setdefault(uuid, {})
    # Retrieve filename. An absent or empty value is not stored, so the key
    # stays missing and the filename can still be taken from the object files.
    filenameElement = tree.find('.//' + moduleNs + 'dataField[@name="MulOriginalFileTxt"]/' + moduleNs + 'value')
    if filenameElement is not None and filenameElement.text:
        item['filename'] = filenameElement.text
    # Retrieve module ID
    item['moduleId'] = moduleItem.get('id')

100%|██████████| 188545/188545 [00:40<00:00, 4615.51it/s]


Read object item files and retrieve the uuid, id and inventory number of the objects linked to each multimedia item

In [8]:
# XML namespace (Clark notation) shared by all elements looked up below
moduleNs = '{http://www.zetcom.com/ria/ws/module}'
# Path to the inventory number value (find() returns the first match)
invNrPath = ('.//' + moduleNs + 'repeatableGroup[@name="ObjObjectNumberGrp"]/'
             + moduleNs + 'repeatableGroupItem/'
             + moduleNs + 'virtualField[@name="NumberVrt"]/'
             + moduleNs + 'value')
# Path to the references from an object to its multimedia items
multimediaRefPath = ('.//' + moduleNs + 'moduleReference[@name="ObjMultimediaRef"]/'
                     + moduleNs + 'moduleReferenceItem')

for file in tqdm(objectFiles[:limit]):
    tree = etree.parse(join(inputFolderObjects, file))
    # Look the module item up once; it carries both the object uuid and id
    moduleItem = tree.find('.//' + moduleNs + 'moduleItem')
    if moduleItem is None:
        # Name the offending file instead of failing with an opaque AttributeError
        raise ValueError('No moduleItem element found in ' + file)
    objectUuid = moduleItem.get('uuid')
    objectId = moduleItem.get('id')
    # Retrieve inventory number; fall back to '' when the element or its text
    # is missing (explicit checks instead of a bare except that would also
    # swallow unrelated errors such as KeyboardInterrupt)
    invNrElement = tree.find(invNrPath)
    if invNrElement is not None and invNrElement.text:
        objectInvNr = invNrElement.text.strip()
    else:
        objectInvNr = ''
    # Retrieve linked multimedia items
    for moduleReferenceItem in tree.findall(multimediaRefPath):
        # Get uuid of multimedia item
        uuid = moduleReferenceItem.get('uuid')
        # Update entry only if it exists already
        if uuid not in multimediaItems:
            continue
        item = multimediaItems[uuid]
        # Record the object on the multimedia item; the three lists stay aligned
        item.setdefault('objectIds', []).append(objectId)
        item.setdefault('objectUuids', []).append(objectUuid)
        item.setdefault('objectInvNrs', []).append(objectInvNr)
        # If filename is not present, retrieve it from the formatted value,
        # taking the part after the first ', ' separator
        if 'filename' not in item:
            formattedValue = moduleReferenceItem.find('./' + moduleNs + 'formattedValue')
            # An empty formattedValue has text None; skip it instead of crashing
            if formattedValue is not None and formattedValue.text:
                parts = formattedValue.text.split(', ')
                if len(parts) > 1:
                    item['filename'] = parts[1]

100%|██████████| 89738/89738 [00:58<00:00, 1531.12it/s]


Sort uuids for output

In [9]:
# Iterating a dict yields its keys, so this is the ordered list of uuids
uuids = sorted(multimediaItems)

Write output to csv file

In [10]:
# newline='' lets the csv writer control line endings itself (otherwise rows
# end in '\r\r\n' on platforms that translate '\n'); the explicit encoding keeps
# non-ASCII filenames independent of the platform's default locale.
with open(outputFile, 'w', newline='', encoding='utf-8') as f:
    writer = DictWriter(f, fieldnames=('uuid', 'filename', 'multimediaId', 'objectUuids', 'objectIds', 'objectInvNrs'))
    writer.writeheader()
    for uuid in uuids:
        item = multimediaItems[uuid]
        # Values that were never collected are written as empty fields;
        # multiple linked objects are joined with ';'
        writer.writerow({
            'uuid': uuid,
            'filename': item.get('filename', ''),
            'multimediaId': item.get('moduleId', ''),
            'objectUuids': ';'.join(item.get('objectUuids', [])),
            'objectIds': ';'.join(item.get('objectIds', [])),
            'objectInvNrs': ';'.join(item.get('objectInvNrs', []))
        })

# Read and analyse

In [11]:
import pandas as pd



In [12]:
# All columns hold identifiers or filenames, so read them as text: this keeps
# values such as inventory numbers exactly as written and avoids mixed-type
# columns from chunked type inference. Empty fields are still read as NaN.
df = pd.read_csv(outputFile, dtype=str)

In [13]:
# Show the loaded table (pandas abbreviates long frames in the rendered output)
df

Unnamed: 0,uuid,filename,multimediaId,objectUuids,objectIds,objectInvNrs
0,0000171c-c721-4c4d-ac3a-52f915ce8565,6506.1.jpg,56723,7291,5523,6506
1,00001a4c-4d10-4860-8b95-4e4a4c08afc1,lit_15383.0.2.jpg,187815,,,
2,000069aa-bdfb-4630-b533-59a42894a548,lit_11318.0.jpg,144585,,,
3,0000a8a7-073d-4bde-9fd7-1867d52249c1,31471.0.1.jpg,158894,bd1587e2-b5a0-4974-8ca5-086155fd0e73,69800,31471
4,000125ea-11ba-413d-af3b-1539cf2de123,10883.02.1.jpg,35212,18644ada-4350-44ae-818f-617465b9c1e5,21170,10883.2
...,...,...,...,...,...,...
188540,fffe75ab-62ca-4e9f-ae92-e2b76ace0801,GB-0078.0.jpg,133154,11142,12523,GB-0078
188541,fffebe6f-384b-4e16-a2dd-1ace7c262f3d,18833.0.3.jpg,76647,2db0c2ee-bed6-47a3-87a7-88b4d29201ac,45010,18833
188542,fffecc3f-4844-466e-b47a-77820c1c0b1a,lit_8558.0.jpg,129863,,,
188543,ffff013e-8fda-46d2-b9d8-c663e8b9dfb3,10311.0.jpg,32861,597c8add-75f7-4083-b7f9-674f0a75d6ff,19845,10311


In [14]:
# Keep only the rows whose filename field is missing (NaN) and display them
empty_filename_rows = df.loc[df['filename'].isnull()]
empty_filename_rows

Unnamed: 0,uuid,filename,multimediaId,objectUuids,objectIds,objectInvNrs


In [15]:
# Write the rows without a filename to a separate CSV file;
# index=False omits the DataFrame index column
empty_filename_rows.to_csv('output/missingFilenames.csv', index=False)