In [1]:
# Libraries to extract metadata descriptions from XML data
import xml.etree.ElementTree as ET
import urllib.request

# Libraries for data analysis
import pandas as pd
import numpy as np
import csv

# Custom variables
import config

**Step 1:** Read the CSV file with XML path data for the Newcastle University Special Collections archival catalog.

In [2]:
df_xml = pd.read_csv(config.xml_paths)
row_count = df_xml.shape[0]
col_count = df_xml.shape[1]
print("rows:",row_count, "| columns:",col_count)
df_xml.head(3)

rows: 11 | columns: 3


Unnamed: 0,collection_name,xml_path,xml_format
0,Bloodaxe Books Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
1,Bell (Gertrude) Archive,https://gertrudebell.ncl.ac.uk/export/,custom
2,Hill (Selima) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD


In [3]:
ead_xml_paths = [df_xml.iloc[i,1] for i in range(0,row_count) if df_xml.iloc[i,2] == "EAD"]
other_xml_paths = [df_xml.iloc[i,1] for i in range(0,row_count) if df_xml.iloc[i,2] != "EAD"] 
assert(ead_xml_paths[0] == df_xml.iloc[0,1]), "The first EAD XML list value should be the first XML path in the DataFrame."
assert(other_xml_paths[0] == df_xml.iloc[1,1]), "There should only be one non-EAD XML path, that of the Bell Archive."
print(ead_xml_paths[0])
print(other_xml_paths[0])

https://specialcollections.ncl.ac.uk/downloads/exports/ead/2000f6325dfc4fc3201fc45ed01c7a5d.ead.xml
https://gertrudebell.ncl.ac.uk/export/


In [4]:
xml_path = ead_xml_paths[0]

Parse the XML data:

In [5]:
content = urllib.request.urlopen(xml_path)
xmlTree = ET.parse(content)

Extract the descriptive text in the metadata fields listed below from the XML data.

For each collection, extract the:
* EADID: `<eadid>`

At all levels of the archival hierarchy, from the collection overall (i.e., `<archdesc>`) down to the item level (e.g., `<c level="...">` for `"subfonds"`, `"subseries"`, `"file"`, `"item"`, etc.), extract the following tags' text:
* Title: `<unittitle>`
* Identifier: `<unitid>`
* Date (of material?): `<unitdate>`
* Language of material: `<langmaterial>`
* Biographical / Historical: `<bioghist>`
* Scope and Contents: `<scopecontent>`
* Processing Information: `<processinfo>`

Then, store the extracted data in a DataFrame.

In [13]:
root = xmlTree.getroot()
print(root[0][0].tag, root[0][0].text, root[0][0].attrib["identifier"])
print(root[1].tag, root[1][0].tag)

eadid BXB gb186-bxb
archdesc did


In [80]:
level_tags = [
    "archdesc", "c"
]
metadata_field_tags = [
    "unittitle", "unitid", "unitdate", "langmaterial", 
    "bioghist", "scopecontent", "processinfo"
]
metadata_desc_tags = ["bioghist", "scopecontent", "processinfo"]

In [85]:
# eadids, levels = [], []
titles, ids, dates, langmaterials = [], [], [], []
bioghists, scopecontents, processinfos = [], [], []
d = {
    # "eadid": eadids, "level": levels, 
    "unittitle": titles, "unitid": ids, 
    "unitdate": dates, "langmaterial": langmaterials, "bioghist": bioghists, 
    "scopecontent": scopecontents, "processinfo": processinfos
}
tracker = {
    "unittitle": False, "unitid": False, 
    "unitdate": False, "langmaterial": False, "bioghist": False, 
    "scopecontent": False, "processinfo": False
}
for child in xmlTree.iter():
    tag = child.tag
    if (tag == "archdesc") or (tag == "c"):
        level = tag.attrib["level"]
        # ... stopped here - trying to make sure lists are all same length!
    if tag in metadata_field_tags:
        if tag in metadata_desc_tags:
            all_text = ""
            for text_block in child.itertext():
                if len(text_block) > 0:
                    all_text = all_text + text_block + "\n"
            d[tag] += [all_text.strip()]
        else:
            d[tag] += [child.text]
        tracker[tag] = True
    else:
        continue

In [86]:
print(len(d['unittitle']))

5771


In [87]:
print(d["unittitle"][0:10])

['Bloodaxe Books Archive', ' Editorial ', 'Published Poetry and Translations by Author', 'Robert Adamson', 'Reading the River: Selected Poems', 'Jacket cover proof', 'Information sheet', "The Kingfisher's Soul", 'Jacket cover proof', 'Fleur Adcock']


In [88]:
print(d["bioghist"][0])

Bloodaxe Books is one of Britain's leading poetry publishers, named after Erik Bloodaxe, the last Viking king of independent Northumbria. Based at Hexham, Northumberland, its finance and administration is handled by sister company Pandon Press from Bala in North Wales. The company is internationally renowned for its quality in literature and excellence in book design. Their authors and books have won virtually every major literary award given to poetry, from the T.S. Eliot Prize and Pulitzer to the Nobel Prize. Bloodaxe Books has also broken new ground by opening up contemporary poetry to many thousands of new readers through publishing books such as the Staying Alive trilogy.
Bloodaxe Books was founded by Neil Astley in Newcastle upon Tyne in 1978. Initially working from his own flat, Neil used the Tyneside Free Press Workshop for typesetting whilst also working as the sole sales rep for Bloodaxe. As a poetry reader, Neil understood the lack of diversity in poetry publishing during th

In [89]:
metadata_df = pd.DataFrame.from_dict(d)
metadata_df.head()

ValueError: All arrays must be of the same length

In [None]:
# print("On tag", tag)
    if tag == "eadid":
        eadid = child.text
    # for grandchild in child.iter():
    elif tag == level_tags[0]:
        level = "fonds"
    elif tag == level_tags[1]:
        level = child.attrib["level"]
        # for greatgrandchild in child.iter():
            # if greatgrandchild.tag in metadata_field_tags:
                # d[greatgrandchild.tag] += [greatgrandchild.text]
                    # d[tag] += [child.text]
                    # d["eadid"] += [eadid]
                    # d["level"] += [level]
                        #     break
                        # else:
                        #     continue
    el