In [1]:
from csv import DictWriter
from lxml import etree
from os import listdir
from os.path import isfile, join
from tqdm import tqdm

In [2]:
# Directory scanned for the multimedia XML files
inputFolderMultimedia = "../data/source/multimedia/"
# Directory scanned for the object XML files
inputFolderObjects = "../data/source/object/"
# CSV file the multimedia/object correspondences are written to
outputFile = "output/correspondences.csv"

In [3]:
# Names of all regular '.xml' files in the multimedia folder.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the processing order (and any `[:limit]` subset) reproducible.
# The cheap suffix test runs before isfile() to avoid needless stat calls.
multimediaFiles = sorted(
    f for f in listdir(inputFolderMultimedia)
    if f.endswith('.xml') and isfile(join(inputFolderMultimedia, f))
)

In [4]:
# Names of all regular '.xml' files in the object folder.
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order reproducible, which in turn fixes the order in which object ids and
# uuids are appended per multimedia item.
# The cheap suffix test runs before isfile() to avoid needless stat calls.
objectFiles = sorted(
    f for f in listdir(inputFolderObjects)
    if f.endswith('.xml') and isfile(join(inputFolderObjects, f))
)

In [5]:
# Maps a multimedia item's uuid to a dict of collected details. The loops
# below fill in the keys 'filename', 'moduleId', 'objectIds' and 'objectUuids'.
multimediaItems = dict()

In [6]:
# Maximum number of files to process per folder; used as a slice bound
# (`files[:limit]`). None means "no limit" -- set a small integer for a
# quick trial run.
limit = None

In [7]:
# Namespace prefix (Clark notation) used by every element queried below
ZETCOM_NS = '{http://www.zetcom.com/ria/ws/module}'

# Read every multimedia XML file and record its filename and module id
# under the item's uuid in multimediaItems.
for file in tqdm(multimediaFiles[:limit]):
    path = join(inputFolderMultimedia, file)
    tree = etree.parse(path)
    # Look the moduleItem up once; both the uuid and the id are read from it
    moduleItem = tree.find('.//' + ZETCOM_NS + 'moduleItem')
    if moduleItem is None:
        # Fail with the offending file name instead of an anonymous
        # AttributeError on None
        raise ValueError('No moduleItem element found in ' + path)
    uuid = moduleItem.get('uuid')
    # Create the entry if it does not exist yet
    item = multimediaItems.setdefault(uuid, {})
    # Retrieve filename
    filenameElement = tree.find(
        './/' + ZETCOM_NS + 'dataField[@name="MulOriginalFileTxt"]/' + ZETCOM_NS + 'value'
    )
    if filenameElement is not None:
        item['filename'] = filenameElement.text
    # Retrieve module ID
    item['moduleId'] = moduleItem.get('id')

100%|██████████| 299984/299984 [01:08<00:00, 4398.16it/s]


In [8]:
# Namespace prefix (Clark notation) used by every element queried below
ZETCOM_NS = '{http://www.zetcom.com/ria/ws/module}'

# Read every object XML file and attach the object's id and uuid to each
# multimedia item it references through "ObjMultimediaRef".
for file in tqdm(objectFiles[:limit]):
    path = join(inputFolderObjects, file)
    tree = etree.parse(path)
    # Look the moduleItem up once; both the uuid and the id are read from it
    moduleItem = tree.find('.//' + ZETCOM_NS + 'moduleItem')
    if moduleItem is None:
        # Fail with the offending file name instead of an anonymous
        # AttributeError on None
        raise ValueError('No moduleItem element found in ' + path)
    # Retrieve object uuid
    objectUuid = moduleItem.get('uuid')
    # Retrieve object id
    objectId = moduleItem.get('id')
    # Retrieve the multimedia items referenced by this object
    moduleReferenceItems = tree.findall(
        './/' + ZETCOM_NS + 'moduleReference[@name="ObjMultimediaRef"]/' + ZETCOM_NS + 'moduleReferenceItem'
    )
    for moduleReferenceItem in moduleReferenceItems:
        # Get uuid of multimedia item
        uuid = moduleReferenceItem.get('uuid')
        # NOTE(review): the reference item also carries a 'moduleItemId'
        # attribute, which the previous version read ("id of multimedia item")
        # but never used. It could back-fill 'moduleId' for items that have no
        # multimedia file of their own -- confirm before adding.
        # Create entry if it does not exist yet
        item = multimediaItems.setdefault(uuid, {})
        # Add object id and object uuid
        item.setdefault('objectIds', []).append(objectId)
        item.setdefault('objectUuids', []).append(objectUuid)
        # If filename is not present, retrieve from formatted value
        if 'filename' not in item:
            filenameElement = moduleReferenceItem.find('./' + ZETCOM_NS + 'formattedValue')
            # .text is None for an empty element, so guard before splitting
            if filenameElement is not None and filenameElement.text:
                # The second ', '-separated part is taken as the filename
                parts = filenameElement.text.split(', ')
                if len(parts) > 1:
                    item['filename'] = parts[1]

100%|██████████| 88487/88487 [00:56<00:00, 1558.78it/s]


In [9]:
# All multimedia uuids in ascending order; this fixes the row order of the CSV
uuids = sorted(multimediaItems)

In [11]:
# Write one CSV row per multimedia item, in uuid order.
# newline='' hands line-ending control to the csv module (without it, rows
# are followed by a blank line on Windows). An explicit UTF-8 encoding keeps
# non-ASCII filenames writable regardless of the platform's default encoding.
with open(outputFile, 'w', newline='', encoding='utf-8') as f:
    writer = DictWriter(f, fieldnames=('uuid', 'filename', 'multimediaId', 'objectUuids', 'objectIds'))
    writer.writeheader()
    for uuid in uuids:
        item = multimediaItems[uuid]
        # Missing details become empty cells; multiple objects are joined
        # with ';'
        row = {
            'uuid': uuid,
            'filename': item.get('filename', ''),
            'multimediaId': item.get('moduleId', ''),
            'objectUuids': ';'.join(item.get('objectUuids', [])),
            'objectIds': ';'.join(item.get('objectIds', [])),
        }
        writer.writerow(row)