# Data Extraction

In [1]:
# Libraries to extract metadata descriptions from XML data
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import ParseError
from lxml import etree
import urllib.request
import time

# Libraries for data analysis
import pandas as pd
import re

# Custom variables
import config
import ext_utils

# For writing data and creating directories
from pathlib import Path

**Step 1:** Read the CSV file with XML path data for the Newcastle University Special Collections archival catalog.

In [2]:
# Load the table of XML export locations and report its dimensions.
df_xml = pd.read_csv(config.xml_paths)
row_count, col_count = df_xml.shape
print(f"rows: {row_count} | columns: {col_count}")
# Display the table itself as the cell's output.
df_xml

rows: 12 | columns: 3


Unnamed: 0,collection_name,xml_path,xml_format
0,Bloodaxe Books Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
1,Bell (Gertrude) Archive,https://gertrudebell.ncl.ac.uk/export/,custom
2,Bell (Gertrude) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
3,Hill (Selima) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
4,O'Brien (Sean) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
5,Plowden (Lady Bridget) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
6,Sharp (Thomas) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
7,Loebl (Herbert) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
8,Tyneside Campaign for Homosexual Equality Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
9,Trevelyan (Charles Philips) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD


In [3]:
# Split the XML paths by format. Columns are addressed by position:
# position 1 is compared/collected as the XML path, position 2 as the XML format.
ead_xml_paths = [
    path
    for path, xml_format in zip(df_xml.iloc[:, 1], df_xml.iloc[:, 2])
    if xml_format == "EAD"
]
other_xml_paths = [
    path
    for path, xml_format in zip(df_xml.iloc[:, 1], df_xml.iloc[:, 2])
    if xml_format != "EAD"
]
# Sanity checks on the ordering of the two lists.
assert ead_xml_paths[0] == df_xml.iloc[0, 1], "The first EAD XML list value should be the first XML path in the DataFrame."
assert other_xml_paths[0] == df_xml.iloc[1, 1], "There should only be one non-EAD XML path, that of the Bell Archive."

Parse the XML data as an ElementTree tree. 

Then, at all levels of the archival hierarchy, from the fonds (i.e., `<archdesc>`) down to the item level (e.g., `<c level="...">` for `"subfonds"`, `"subseries"`, `"file"`, `"item"`, etc.), extract the following tags' text:
* Title: `<unittitle>`
* Identifier: `<unitid>`
* Date of the described material: `<unitdate>`
* Language of material: `<langmaterial>`
* Biographical / Historical: `<bioghist>`
* Scope and Contents: `<scopecontent>`
* Processing Information: `<processinfo>`

The Encoded Archival Description Identifier, or EADID (e.g., `<eadid>BXB</eadid>`), will also be the first Unit ID of a fonds (e.g., `<unitid>BXB</unitid>`).

*Note: "Fonds" is the archival term for collection.  It is the top-most level of a single archive's (or archival collection's) hierarchy.*

In [6]:
# Collect one extraction result per EAD XML path, in the same order as ead_xml_paths.
all_fonds = []
for xml_path in ead_xml_paths:
    print("Extracting archival metadata descriptions from", xml_path)
    # NOTE(review): extractMetadata presumably returns a list of records for one fonds - confirm in ext_utils
    fonds_list = ext_utils.extractMetadata(xml_path)
    all_fonds.append(fonds_list)
print("Done!")

Done!


In [6]:
# One entry is appended to all_fonds per EAD path, so this checks that every
# row of the path table except one (the single non-EAD export) was extracted.
assert len(all_fonds) == row_count - 1

Extract metadata descriptions from the custom (non-EAD) XML data:

In [16]:
# Harvest the custom (non-EAD) XML export page by page, following the
# resumption token found on each page until no further token is returned.
resumption_token_prefix = "/export?resumptionToken="
# Fields extracted from every item; this order also fixes the key order of each record.
metadata_field_tags = [
    "node_id", "title", "description", "creator", "creation-date",
    "recipient", "language", "extent-and-medium", "country-and-region"
]
errored_tokens, error_messages = [], []
list_metadata = []

for xml_path in other_xml_paths:
    # Reset the paging state per export so that a second path is not silently skipped
    more_data = True
    resumption_token = None
    next_xml_path = xml_path
    while more_data:
        print("Current XML path:", next_xml_path)

        try:
            # Use recover to handle malformed XML data
            parser = etree.XMLParser(recover=True)
            # Create an ElementTree tree and get the tree's root, closing the
            # HTTP response as soon as the page has been parsed
            with urllib.request.urlopen(next_xml_path) as content:
                xmlTree = etree.parse(content, parser)
            root = xmlTree.getroot()

            # Extract metadata descriptions from the above-specified fields in the tree
            d_descs = dict.fromkeys(metadata_field_tags, "")
            for child in xmlTree.iter():
                tag = child.tag
                if tag in metadata_field_tags:
                    d_descs[tag] = ext_utils.getAllText(child, tag)
                # Upon reaching a new item tag, add the previous item's extracted descriptions
                # to the metadata list and start a fresh descriptions dictionary
                elif (tag == "item") and (d_descs["node_id"] != ""):
                    list_metadata.append(d_descs)
                    d_descs = dict.fromkeys(metadata_field_tags, "")

            # Add the final item's descriptions, unless the page yielded nothing at all
            # (otherwise a completely blank record would end up in the output)
            if any(value != "" for value in d_descs.values()):
                list_metadata.append(d_descs)

            # If there's a non-empty resumption token, use it to continue extracting data,
            # otherwise there is no more data to extract
            another_token = root.find("resumptionToken")
            if another_token is not None and another_token.text:
                resumption_token = another_token.text
            else:
                more_data = False

        # lxml raises its own ParseError family (e.g. XMLSyntaxError), which is unrelated
        # to the standard library's ParseError, so both must be caught here
        except (ParseError, etree.ParseError) as error_message:

            # Without a token (failure on the very first page) there is nothing to skip past
            if resumption_token is None:
                raise

            # Record the error
            errored_token = int(re.search("[0-9]+", resumption_token)[0])
            errored_tokens.append(errored_token)
            error_messages.append(error_message)
            print(error_message)

            # Try the next node ID for a resumption token
            next_token = errored_token + 1
            resumption_token = resumption_token_prefix + str(next_token)

        if more_data:
            next_xml_path = xml_path.replace("/export/", resumption_token)

print(len(list_metadata))

Current XML path: https://gertrudebell.ncl.ac.uk/export/
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=69768
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=69963
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=70163
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=70363
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=70563
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=70763
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=71358
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=71557
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=91133
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=91333
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=91533
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=91733
Current

In [17]:
# Summarise any parse errors recorded while harvesting the custom XML export.
if errored_tokens:
    error_df = pd.DataFrame({"resumption_token": errored_tokens, "error_message": error_messages})
    print(error_df.shape)
else:
    print("No parse errors!")

No parse errors!


In [19]:
# Spot-check the last record appended to the extracted metadata list.
print(list_metadata[-1])

{'node_id': '117617', 'title': 'Photograph of an unidentified location, Turkey / Israel, taken 1902', 'description': '', 'creator': 'Bell, Gertrude Margaret Lowthian', 'creation-date': '1902-01-01 - 1902-12-31', 'recipient': '', 'language': '', 'extent-and-medium': '', 'country-and-region': 'Unclear'}


Store the extracted EAD data in a DataFrame.

In [9]:
# Build one DataFrame per fonds and concatenate them in a single call;
# concatenating inside the loop re-copies the accumulated frame on every pass.
fonds_frames = [pd.DataFrame.from_records(fonds_list) for fonds_list in all_fonds]
# pd.concat raises on an empty list, so fall back to an empty frame in that case
metadata_df = pd.concat(fonds_frames) if fonds_frames else pd.DataFrame()
print(metadata_df.shape)

(16683, 8)


Add a column for the Encoded Archival Description Identifiers (EADIDs).

In [10]:
# Derive each row's EADID from the leading run of word characters in its unit ID
# (i.e., everything before the first "/"), upper-cased.
# A raw string avoids the invalid "\w" escape warning; the original's optional
# lookahead group never consumed anything, so the plain pattern matches the same text.
unitids = list(metadata_df.unitid)
eadids = [re.match(r"\w+", unitid)[0].upper() for unitid in unitids]
assert len(unitids) == len(eadids)
# Drop any existing column first so that re-running this cell does not raise
if "eadid" in metadata_df.columns:
    metadata_df = metadata_df.drop(columns="eadid")
metadata_df.insert(0, "eadid", eadids)
# sort_values returns a new DataFrame, so the result must be assigned to take effect
metadata_df = metadata_df.sort_values(["eadid", "unitid"])
metadata_df.head()

  eadids = [re.match("\w+((?=/)?)", unitid)[0].upper() for unitid in unitids]


Unnamed: 0,eadid,unittitle,unitid,unitdate,bioghist,scopecontent,processinfo,langmaterial,controlaccess
0,BXB,Bloodaxe Books Archive,BXB,1978 - [ongoing],Bloodaxe Books is one of Britain's leading poe...,Consists of letters and proofs relating to pub...,"This catalogue was created by Rebecca Bradley,...","[Bengali, Czech, Danish, English, Finnish, Fre...","[Bloodaxe Books, 1978 -, Astley, Neil, - , poe..."
1,BXB,Editorial,BXB/1,1978 - [ongoing],,Consists of letters and proofs relating to poe...,,,
2,BXB,Published Poetry and Translations by Author,BXB/1/1,1978 - [ongoing],,Consists of letters and proofs relating to pub...,,,
3,BXB,Robert Adamson,BXB/1/1/ADA,2004 - 2009,Robert Adamson was born in Sydney in 1943 and ...,Consists of letters and proofs relating to the...,,,"[Adamson, Robert, 1943 -, Poet]"
4,BXB,Reading the River: Selected Poems,BXB/1/1/ADA/1,2004,Robert Adamson was born in Sydney in 1943 and ...,Consists of letters and manuscripts relating t...,,,


In [11]:
# Write the EAD metadata table to the extraction directory, creating the directory first if needed.
extracted_dir = "data/extracted/"
Path(extracted_dir).mkdir(exist_ok=True, parents=True)
metadata_df.to_csv(Path(extracted_dir) / "newcastle_archival_metadata_sample.csv")

Store the extracted non-EAD data in a DataFrame.

In [20]:
# Turn the list of extracted records into a table, one row per record.
nonead_metadata_df = pd.DataFrame.from_records(list_metadata)
# The non-EAD formatted data is all from the Gertrude Bell Archive, which has the EADID "GB"
nonead_metadata_df.insert(0, "eadid", ["GB"] * len(nonead_metadata_df))
print(nonead_metadata_df.shape)
nonead_metadata_df.head()

(11990, 10)


Unnamed: 0,eadid,node_id,title,description,creator,creation-date,recipient,language,extent-and-medium,country-and-region
0,GB,69571,"Letter from Gertrude Bell to her stepmother, D...",,"Bell, Gertrude Margaret Lowthian",1874-09-25,"Bell, Florence",English,"1 letter, paper",Redcar
1,GB,69572,"Letter from Gertrude Bell to her stepmother, D...",,"Bell, Gertrude Margaret Lowthian",1877-04-12,"Bell, Florence",English,"1 letter plus envelope, paper",
2,GB,69573,"Letter from Gertrude Bell to her stepmother, D...",,"Bell, Gertrude Margaret Lowthian",1877-01-01 - 1877-12-31,"Bell, Florence",English,"1 letter, paper",Redcar
3,GB,69574,"Letter from Gertrude Bell to her stepmother, D...",,"Bell, Gertrude Margaret Lowthian",1877-04-16,"Bell, Florence",English,"1 letter plus envelope, paper",Redcar
4,GB,69575,"Letter from Gertrude Bell to her stepmother, D...",,"Bell, Gertrude Margaret Lowthian",1877-04-26,"Bell, Florence",English,"1 letter plus envelope, paper",Redcar


In [22]:
# Confirm that the record with node ID "69571" was captured.
node_ids = nonead_metadata_df["node_id"].tolist()
assert "69571" in node_ids

In [24]:
# nonead_metadata_df.to_csv(extracted_dir + "bell_archival_metadata_nonead.csv")

extracted_dir = "data/extracted/"
# Create the output directory if needed so this cell does not rely on an earlier cell having done so
Path(extracted_dir).mkdir(parents=True, exist_ok=True)
nonead_metadata_df.to_csv(extracted_dir + "complete_bell_archival_metadata_nonead.csv")