In [1]:
import sys

In [2]:
!{sys.executable} -m pip install itables

You should consider upgrading via the '/Users/fkraeutli/anaconda/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
from itables import show
from lxml import etree
import pandas as pd

<IPython.core.display.Javascript object>

In [4]:
inputFile = '/Users/fkraeutli/Sites/bso-curation-data/data/nb/source/WMC_Records_20201201.xml'
outputFile = './output/nb-analysis.html'
output = ''

In [5]:
root = etree.parse(inputFile)

In [6]:
records = root.findall("/Record")

Look at number of records

In [7]:
print("Found %d records" % len(records))

Found 8384 records


Parse XML file

In [8]:
def addDataElements(elements, newElements):
    for element in newElements:
        id = element.get("ElementId")
        if not id in elements:
            elements[id] = {
                "id": id,
                "name": element.get("ElementName"),
                "type": element.get("ElementType"),
                "typeId": element.get("ElementTypeId"),
                "recordCount": 1
            }
        else:
            elements[id]["recordCount"] += 1
    return elements

dataElements = {}
descriptors = {}
for record in records:
    recordDataElements = record.findall("DetailData/DataElement")
    dataElements = addDataElements(dataElements, recordDataElements)
            
    recordDescriptors = record.findall("Descriptors/Descriptor")
    for descriptor in recordDescriptors:
        id = descriptor.find("Thesaurus").text
        if not id in descriptors:
            descriptors[id] = {
                "id": id,
                "recordCount": 0,
                "hasDetailData": False,
                "dataElements": {}
            }
        else:
            descriptors[id]["recordCount"] += 1
        
        if descriptor.find("DetailData"): 
            descriptors[id]["hasDetailData"] = True
            descriptors[id]["dataElements"] = addDataElements(descriptors[id]["dataElements"], descriptor.findall("DetailData/DataElement"))
            
            



## Data elements

In [9]:
data = [[d['id'], d['name'], d['type'], d['recordCount'], d['recordCount']/len(records)] for d in dataElements.values()]
dataElementsAnalysis = pd.DataFrame(data, columns=["Id", "Name", "Type", "Count", "Coverage"])
show(dataElementsAnalysis)

Id,Name,Type,Count,Coverage


In [10]:
output += "<h2>Data Elements</h2>"
output += dataElementsAnalysis.to_html()

## Descriptors

In [11]:
data = [[d['id'], d['hasDetailData'], d['recordCount'], d['recordCount']/len(records)] for d in descriptors.values()]
descriptorsAnalysis = pd.DataFrame(data, columns=["Id", "Details", "Count", "Coverage"])
show(descriptorsAnalysis)

Id,Details,Count,Coverage


In [12]:
output += "<h2>Descriptors</h2>"
output += descriptorsAnalysis.to_html()

### Data Elements within Descriptors

In [13]:
output += "<h3>Data Elements within Descriptors</h3>"

In [14]:
for descriptor in [descriptors[d] for d in descriptors.keys() if descriptors[d]['hasDetailData'] is True]:
    print(descriptor['id'])
    data = [[d['id'], d['name'], d['type'], d['recordCount']] for d in descriptor['dataElements'].values()]
    dataElementsAnalysis = pd.DataFrame(data, columns=["Id", "Name", "Type", "Count"])
    show(dataElementsAnalysis)
    
    output += "<h4>%s</h4>" % descriptor['id']
    output += dataElementsAnalysis.to_html()

Personen


Id,Name,Type,Count


Geografika


Id,Name,Type,Count


Körperschaften


Id,Name,Type,Count


## Tags

In [15]:
tags = list(set([t.tag for t in root.findall('//*')]))
tags.sort()

List all tags

In [16]:
print("\n".join(tags))

AdministrativeData
CreatedOn
DataElement
DateRange
Description
Descriptor
Descriptors
DetailData
ElementValue
FileValue
FromApproxIndicator
FromDate
IdName
IntValue
LastEditedOn
Name
Record
Reference
ReferenceName
References
SeeAlso
TextRepresentation
TextValue
Thesaurus
ToApproxIndicator
ToDate


In [17]:
output += "<h2>Tags</h2>"
output += "<ul><li>" + "</li><li>".join(tags) + "</li></ul>"

In [18]:
with open(outputFile, 'w') as f:
    f.write(output)

## Gugelmann

In [19]:
gugelmannRecords = [record for record in records if record.xpath('DetailData/DataElement[@ElementId="2"]/ElementValue/TextValue[contains(text(),"GS-GUGE")]')]

In [20]:
urls = []
for record in gugelmannRecords:
    url = record.xpath('DetailData/DataElement[@ElementId="11040"]/ElementValue/TextValue')[0].text
    urls.append(url)

In [21]:
urls[:100]

['https://commons.wikimedia.org/wiki/File:CH-NB_-_Collection_Gugelmann_-_GS-GUGE-ABERLI-R-1.tif',
 'https://commons.wikimedia.org/wiki/File:CH-NB_-_Collection_Gugelmann_-_GS-GUGE-ABERLI-R-2.tif',
 'https://commons.wikimedia.org/wiki/File:CH-NB_-_Collection_Gugelmann_-_GS-GUGE-ABERLI-R-3.tif',
 'https://commons.wikimedia.org/wiki/File:CH-NB_-_Collection_Gugelmann_-_GS-GUGE-ABERLI-R-4.tif',
 'https://commons.wikimedia.org/wiki/File:CH-NB_-_Collection_Gugelmann_-_GS-GUGE-ABERLI-R-5.tif',
 'https://commons.wikimedia.org/wiki/File:CH-NB_-_Collection_Gugelmann_-_GS-GUGE-ABERLI-R-6.tif',
 'https://commons.wikimedia.org/wiki/File:CH-NB_-_Collection_Gugelmann_-_GS-GUGE-BIEDERMANN-R-1.tif',
 'https://commons.wikimedia.org/wiki/File:CH-NB_-_Collection_Gugelmann_-_GS-GUGE-BIEDERMANN-R-2.tif',
 'https://commons.wikimedia.org/wiki/File:CH-NB_-_Collection_Gugelmann_-_GS-GUGE-BIEDERMANN-R-3.tif',
 'https://commons.wikimedia.org/wiki/File:CH-NB_-_Collection_Gugelmann_-_GS-GUGE-BIEDERMANN-R-4.tif',
 'ht

## Date Descriptions

In [25]:
productionDates = root.findall('//DetailData/DataElement[@ElementId="7"]/ElementValue')                       

Look at tags used to define production dates

In [43]:
set([date.getchildren()[0].tag for date in productionDates])

{'DateRange'}

Look at date operators

In [63]:
operators = set([date.getchildren()[0].get('DateOperator') for date in productionDates])
print("\n".join(operators))

exact
before
To
after
startingWith
fromTo
between


In [60]:
for operator in operators:
    example = root.find('//DateRange[@DateOperator="%s"]' % operator)
    print(operator)
    print(etree.tostring(example, pretty_print=True, encoding="UTF-8").decode("utf-8") )

exact
<DateRange xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" DateOperator="exact">      
      <FromDate>+1901</FromDate>      
      <FromApproxIndicator>false</FromApproxIndicator>      
      <ToDate/>      
      <ToApproxIndicator>false</ToApproxIndicator>      
      <TextRepresentation>1901</TextRepresentation>      
     </DateRange>     
    

before
<DateRange xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" DateOperator="before">      
      <FromDate>+1812</FromDate>      
      <FromApproxIndicator>false</FromApproxIndicator>      
      <ToDate/>      
      <ToApproxIndicator>false</ToApproxIndicator>      
      <TextRepresentation>-1812</TextRepresentation>      
     </DateRange>     
    

To
<DateRange xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" DateOperator="To">      
      <FromDate>+1774</FromDate>      
      <FromApproxIndicator>true</FromApproxIndicator>      
      <ToDate/>      
      <ToApproxIndicator>false</ToApproxIndicator>   

In [53]:
etree.tostring(example)

b'<DateRange xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" DateOperator="between">      \n      <FromDate>+1898</FromDate>      \n      <FromApproxIndicator>false</FromApproxIndicator>      \n      <ToDate>+1907</ToDate>      \n      <ToApproxIndicator>false</ToApproxIndicator>      \n      <TextRepresentation>1898-1907</TextRepresentation>      \n     </DateRange>     \n    '

In [54]:
help(etree.tostring)

Help on cython_function_or_method in module lxml.etree:

tostring(element_or_tree, *, encoding=None, method='xml', xml_declaration=None, pretty_print=False, with_tail=True, standalone=None, doctype=None, exclusive=False, with_comments=True, inclusive_ns_prefixes=None)
    tostring(element_or_tree, encoding=None, method="xml",
                 xml_declaration=None, pretty_print=False, with_tail=True,
                 standalone=None, doctype=None,
                 exclusive=False, with_comments=True, inclusive_ns_prefixes=None)
    
    Serialize an element to an encoded string representation of its XML
    tree.
    
    Defaults to ASCII encoding without XML declaration.  This
    behaviour can be configured with the keyword arguments 'encoding'
    (string) and 'xml_declaration' (bool).  Note that changing the
    encoding to a non UTF-8 compatible encoding will enable a
    declaration by default.
    
    You can also serialise to a Unicode string without declaration by
    passing