In [110]:
# Libraries to extract metadata descriptions from XML data
import xml.etree.ElementTree as ET
import urllib.request

# Libraries for data analysis
import pandas as pd
import numpy as np
import csv

# Custom variables
import config

# For writing data and creating directories
from pathlib import Path

**Step 1:** Read the CSV file with XML path data for the Newcastle University Special Collections archival catalog.

In [2]:
df_xml = pd.read_csv(config.xml_paths)
row_count = df_xml.shape[0]
col_count = df_xml.shape[1]
print("rows:",row_count, "| columns:",col_count)
df_xml.head(3)

rows: 11 | columns: 3


Unnamed: 0,collection_name,xml_path,xml_format
0,Bloodaxe Books Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
1,Bell (Gertrude) Archive,https://gertrudebell.ncl.ac.uk/export/,custom
2,Hill (Selima) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD


In [3]:
ead_xml_paths = [df_xml.iloc[i,1] for i in range(0,row_count) if df_xml.iloc[i,2] == "EAD"]
other_xml_paths = [df_xml.iloc[i,1] for i in range(0,row_count) if df_xml.iloc[i,2] != "EAD"] 
assert(ead_xml_paths[0] == df_xml.iloc[0,1]), "The first EAD XML list value should be the first XML path in the DataFrame."
assert(other_xml_paths[0] == df_xml.iloc[1,1]), "There should only be one non-EAD XML path, that of the Bell Archive."
print(ead_xml_paths[0])
print(other_xml_paths[0])

https://specialcollections.ncl.ac.uk/downloads/exports/ead/2000f6325dfc4fc3201fc45ed01c7a5d.ead.xml
https://gertrudebell.ncl.ac.uk/export/


In [4]:
xml_path = ead_xml_paths[0]

Parse the XML data:

In [5]:
content = urllib.request.urlopen(xml_path)
xmlTree = ET.parse(content)

Extract the descriptive text in the metadata fields listed below from the XML data.

For each collection, extract the:
* EADID: `<eadid>`

At all levels of the archival hierarchy, from the collection overall (i.e., `<archdesc>`) down to the item level (e.g., `<c level="...">` for `"subfonds"`, `"subseries"`, `"file"`, `"item"`, etc.), extract the following tags' text:
* Title: `<unittitle>`
* Identifier: `<unitid>`
* Date (of material?): `<unitdate>`
* Language of material: `<langmaterial>`
* Biographical / Historical: `<bioghist>`
* Scope and Contents: `<scopecontent>`
* Processing Information: `<processinfo>`

Then, store the extracted data in a DataFrame.

In [6]:
root = xmlTree.getroot()
print(root[0][0].tag, root[0][0].text, root[0][0].attrib["identifier"])
print(root[1].tag, root[1][0].tag)

eadid BXB gb186-bxb
archdesc did


In [None]:
# d = {
#     # "eadid": eadids, "level": levels, 
#     "unittitle": titles, "unitid": unitids, 
#     # "unitdate": dates, "langmaterial": langmaterials, 
#     "bioghist": bioghists, "scopecontent": scopecontents, "processinfo": processinfos
# }

In [134]:
metadata_field_tags = [
    "unittitle", "unitid", "unitdate", "bioghist", "scopecontent", 
    "processinfo", "langmaterial", "controlaccess"
]
d_descs = {
    'unittitle': "", 'unitid': "", 'unitdate': "", 'bioghist': "", 'scopecontent': "", 
    'processinfo': "", 'langmaterial': "", 'controlaccess': ""
}
list_metadata = []
for child in xmlTree.iter():
    tag = child.tag
    # Each time loop hits <archdesc> or <c> tags, record the metadata descriptions gathered
    # thus far and start a new d_descs dictionary
    if tag == "c":
        list_metadata += [d_descs]
        d_descs = {
            'unittitle': "", 'unitid': "", 'unitdate': "", 'bioghist': "", 'scopecontent': "", 
            'processinfo': "", 'langmaterial': "", 'controlaccess': ""
        }
    elif tag in metadata_field_tags:
        all_text = ""
        # Gather all the text under metadata field tags, including those enclosed in
        # additional tags (for example: <p> or <note> tags)
        for text_block in child.itertext():
            if len(text_block) > 0:
                text_block = text_block.strip()
                # Include a paragraph break because sequences of text strings enclosed
                # in <p> or <note> tags will otherwise have no separation between them.
                all_text = all_text + text_block + "\n"
        if (tag == 'langmaterial') or (tag == 'controlaccess'):
            all_text = all_text.strip()
            if "\xa01953-\xa0" in all_text:
                    all_text = all_text.replace("\xa01953-\xa0","-")
            text_list = all_text.split("\n\n")
            d_descs[tag] = text_list
        else:
            d_descs[tag] = all_text.strip()    # Remove leading and trailing whitespace
    else:
        continue

In [135]:
print(len(list_metadata))
# print(list_metadata[0])
# print(list_metadata[100]["unitid"]+":", list_metadata[100]["scopecontent"])

5770


In [141]:
# Create list of first unitid value (which is the eadid) to set as another column - so 
# list of eadids should have length equal to length of list_metadata

In [138]:
metadata_df = pd.DataFrame.from_records(list_metadata)
metadata_df.head()

Unnamed: 0,unittitle,unitid,unitdate,bioghist,scopecontent,processinfo,langmaterial,controlaccess
0,Bloodaxe Books Archive,BXB,1978 - [ongoing],Bloodaxe Books is one of Britain's leading poe...,Consists of letters and proofs relating to pub...,"This catalogue was created by Rebecca Bradley,...","[Bengali, Czech, Danish, English, Finnish, Fre...","[Bloodaxe Books, 1978 -, Astley, Neil,-, poet ..."
1,Editorial,BXB/1,1978 - [ongoing],,Consists of letters and proofs relating to poe...,,,
2,Published Poetry and Translations by Author,BXB/1/1,1978 - [ongoing],,Consists of letters and proofs relating to pub...,,,
3,Robert Adamson,BXB/1/1/ADA,2004 - 2009,Robert Adamson was born in Sydney in 1943 and ...,Consists of letters and proofs relating to the...,,,"[Adamson, Robert, 1943 -, Poet]"
4,Reading the River: Selected Poems,BXB/1/1/ADA/1,2004,Robert Adamson was born in Sydney in 1943 and ...,Consists of letters and manuscripts relating t...,,,


In [139]:
metadata_df.tail()

Unnamed: 0,unittitle,unitid,unitdate,bioghist,scopecontent,processinfo,langmaterial,controlaccess
5765,Readings,BXB/5/1/2,c. 1977,,"Consists of reading pamphlets, typescripts of ...",,,
5766,Thoth Anthology,BXB/5/1/3,1968 - 1978,,Consists of correspondence regarding Morden To...,,,
5767,Correspondence,BXB/5/1/4,c. 1978,,Consists of correspondence between Neil Astley...,,,
5768,Readings,BXB/5/2,c. 1980,,Consists of material relating to readings base...,,,
5769,Correspondence,BXB/5/2/1,c. 1983,,Consists of correspondence regarding readings ...,,,


In [140]:
extracted_dir = "data/extracted/"
Path(extracted_dir).mkdir(parents=True, exist_ok=True)
metadata_df.to_csv(extracted_dir + "BloodaxeBooksArchive.csv")