In [1]:
# Libraries to extract metadata descriptions from XML data
import xml.etree.ElementTree as ET
import urllib.request

# Libraries for data analysis
import pandas as pd
import numpy as np
import csv

# Custom variables
import config
import utils

# For writing data and creating directories
from pathlib import Path

**Step 1:** Read the CSV file with XML path data for the Newcastle University Special Collections archival catalog.

In [2]:
# Load the CSV located at config.xml_paths (one row per archival collection).
df_xml = pd.read_csv(config.xml_paths)

# DataFrame.shape is a (rows, columns) tuple, so unpack both counts at once.
row_count, col_count = df_xml.shape
print("rows:", row_count, "| columns:", col_count)

# Preview the first three collections.
df_xml.head(n=3)

rows: 11 | columns: 3


Unnamed: 0,collection_name,xml_path,xml_format
0,Bloodaxe Books Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
1,Bell (Gertrude) Archive,https://gertrudebell.ncl.ac.uk/export/,custom
2,Hill (Selima) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD


In [3]:
# Split the XML paths by export format. Columns are selected by name rather
# than by position so the cell keeps working if the CSV's column order changes,
# and a boolean mask replaces the row-by-row iloc lookups.
is_ead = df_xml["xml_format"] == "EAD"
ead_xml_paths = df_xml.loc[is_ead, "xml_path"].tolist()
other_xml_paths = df_xml.loc[~is_ead, "xml_path"].tolist()

# Sanity checks against the known layout of the input file: the first row is
# an EAD export and the second row is the single non-EAD (custom) export.
assert ead_xml_paths[0] == df_xml["xml_path"].iloc[0], "The first EAD XML list value should be the first XML path in the DataFrame."
# The message below promises exactly one non-EAD path, so check the length too
# instead of only comparing the first element.
assert len(other_xml_paths) == 1 and other_xml_paths[0] == df_xml["xml_path"].iloc[1], "There should only be one non-EAD XML path, that of the Bell Archive."
print(ead_xml_paths[0])
print(other_xml_paths[0])

https://specialcollections.ncl.ac.uk/downloads/exports/ead/2000f6325dfc4fc3201fc45ed01c7a5d.ead.xml
https://gertrudebell.ncl.ac.uk/export/


Parse the XML data as an ElementTree tree. 

Then, at all levels of the archival hierarchy, from the fonds (i.e., `<archdesc>`) down to the item level (e.g., `<c level="...">` for `"subfonds"`, `"subseries"`, `"file"`, `"item"`, etc.), extract the following tags' text:
* Title: `<unittitle>`
* Identifier: `<unitid>`
* Date of the described material: `<unitdate>`
* Language of material: `<langmaterial>`
* Biographical / Historical: `<bioghist>`
* Scope and Contents: `<scopecontent>`
* Processing Information: `<processinfo>`
* Controlled Access Headings: `<controlaccess>`

The Encoded Archival Description Identifier, or EADID (e.g., `<eadid>BXB</eadid>`), will also be the first Unit ID of a fonds (e.g., `<unitid>BXB</unitid>`).

*Note: "Fonds" is the archival term for collection.  It is the top-most level of a single archive's (or archival collection's) hierarchy.*

In [4]:
# Run the metadata extraction for every EAD-formatted XML path, collecting one
# result per path in the same order as ead_xml_paths.
# NOTE(review): each result is later passed to pd.DataFrame.from_records, so it
# is presumably a list of per-unit records - confirm in utils.extractMetadata.
all_fonds = []
for xml_path in ead_xml_paths:
    fonds_list = utils.extractMetadata(xml_path)
    all_fonds.append(fonds_list)

In [5]:
# Every EAD XML path must have produced exactly one extraction result.
# Comparing against len(ead_xml_paths) (instead of row_count - 1) avoids
# hard-coding the assumption that exactly one collection is non-EAD.
assert len(all_fonds) == len(ead_xml_paths), "Expected one extracted fonds per EAD XML path."

Store the extracted data in a DataFrame.

In [8]:
# Build one DataFrame per fonds, then concatenate them in a single call.
# Calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration (quadratic work) and starts from an empty, dtype-less DataFrame.
fonds_frames = [pd.DataFrame.from_records(fonds_list) for fonds_list in all_fonds]

# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame, which is what the loop-based version produced in that case.
# The per-fonds row index is intentionally kept (no ignore_index) so that the
# resulting table, and the CSV written from it, stay the same as before.
metadata_df = pd.concat(fonds_frames) if fonds_frames else pd.DataFrame()
print(metadata_df.shape)

(16682, 8)


In [9]:
# Preview the first five rows of the combined metadata.
metadata_df.head(n=5)

Unnamed: 0,unittitle,unitid,unitdate,bioghist,scopecontent,processinfo,langmaterial,controlaccess
0,Bloodaxe Books Archive,BXB,1978 - [ongoing],Bloodaxe Books is one of Britain's leading poe...,Consists of letters and proofs relating to pub...,"This catalogue was created by Rebecca Bradley,...","[Bengali, Czech, Danish, English, Finnish, Fre...","[Bloodaxe Books, 1978 -, Astley, Neil, - , poe..."
1,Editorial,BXB/1,1978 - [ongoing],,Consists of letters and proofs relating to poe...,,,
2,Published Poetry and Translations by Author,BXB/1/1,1978 - [ongoing],,Consists of letters and proofs relating to pub...,,,
3,Robert Adamson,BXB/1/1/ADA,2004 - 2009,Robert Adamson was born in Sydney in 1943 and ...,Consists of letters and proofs relating to the...,,,"[Adamson, Robert, 1943 -, Poet]"
4,Reading the River: Selected Poems,BXB/1/1/ADA/1,2004,Robert Adamson was born in Sydney in 1943 and ...,Consists of letters and manuscripts relating t...,,,


In [10]:
# Preview the last five rows of the combined metadata.
metadata_df.tail(n=5)

Unnamed: 0,unittitle,unitid,unitdate,bioghist,scopecontent,processinfo,langmaterial,controlaccess
1067,Portrait Photograph of Richard Owen,SW/11/55,c. 1860s,,Black and white Portrait Photograph of Richard...,,,"[Owen, Sir Richard, 1804-1892, Knight, compara..."
1068,Portrait photograph,SW/11/56,1860s - 1900s,,Black and white portrait photograph,,,
1069,Portrait photograph,SW/11/57,c. 1860s - 1900s,,Black and white portrait photograph,,,
1070,Portrait Photograph of David Lloyd George,SW/11/58,c. 1860s - 1900s,,Black and white portrait photograph,,,"[George, David Lloyd, 1863-1945, 1st Earl Lloy..."
1071,Portrait print,SW/11/59,c. 1860s - 1900s,,Portrait print,,,


In [11]:
# Write the combined metadata to data/extracted/, creating the directory
# (and any missing parents) first; an existing directory is not an error.
extracted_dir = "data/extracted/"
extracted_path = Path(extracted_dir)
extracted_path.mkdir(parents=True, exist_ok=True)
metadata_df.to_csv(extracted_path / "newcastle_archival_metadata_sample.csv")