# Data Extraction

In [1]:
# Libraries to extract metadata descriptions from XML data
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import ParseError
from lxml import etree
import urllib.request
import time

# Libraries for data analysis
import pandas as pd
import numpy as np
import re

# Custom variables
import config
import ext_utils

# For writing data and creating directories
from pathlib import Path

## For Entire Catalog

In [2]:
# OAI ListRecords request (oai_dc metadata) for the catalog's top-level records
nusc_url = (
    "https://specialcollections.ncl.ac.uk/;oai"
    "?verb=ListRecords"
    "&metadataPrefix=oai_dc"
    "&set=oai:virtual:top-level-records"
)

In [3]:
# Namespace-qualified name of the identifier element; it appears in the
# browser as: <dc:identifier>
dc_namespace = "http://purl.org/dc/elements/1.1/"
id_tag = "{" + dc_namespace + "}identifier"

# Pull the text of every identifier element from the catalog listing
df = ext_utils.getTextFromTag(True, nusc_url, id_tag)
print(df.shape)
df.head()

1 resumption tokens
(148, 2)


Unnamed: 0,eadid,url
0,WBC,https://specialcollections.ncl.ac.uk/wbc
1,LAY,https://specialcollections.ncl.ac.uk/lay
2,SWAN,https://specialcollections.ncl.ac.uk/swan
3,G,https://specialcollections.ncl.ac.uk/g
4,RBD,https://specialcollections.ncl.ac.uk/rbd


In [4]:
# Distinct EADIDs found in the catalog listing, in sorted order
eadids = np.sort(df["eadid"].unique())
print(eadids)

['17th C. Coll' '18th C. Coll' '19th C. Coll' '20th C. Coll' '20th C. Pam'
 '21st C. Coll.' 'AC Poetry Coll.' 'AN' 'AUWH' 'AW' 'Alderson' 'Anderson'
 'B' 'BAI' 'BBHB' 'BM' 'BP' 'BR' 'BXB' 'Bell-White' "Benefactor's Library"
 'Blavatnik Honresfield' 'Bloodaxe' 'Booktrust' 'Bradshaw'
 'Bradshaw-Berwick' 'Broadsides' 'Burman Alnwick' 'Burnett' 'CET' 'CG'
 'CHE' 'CPT' 'CVS' 'Chapbooks' 'Chorley' 'Clarke' 'Clarke Med.'
 'Clarke Misc.' 'Cowen Tracts' 'Crawhall' 'DAG' 'DB' 'ECG' 'EWL'
 'Eagle Press' 'Ent. Coll.' 'FLP' 'FP' 'FW' 'Flambard Press' 'Fletcher'
 'Friends' 'G' 'GB' 'GC' 'GEX' 'GG' 'GOT' 'Grey Tracts' 'HD' 'HFB' 'HH'
 'HL' 'HOL' 'Haverfield' 'Heslop' 'Hev' 'ILL' 'IRO' 'IRONP Coll.' 'Inc.'
 'Indian Tracts' 'JC' 'JCII' 'JG' 'JJ' 'JTB' 'JWD' 'Joan Butler Coll.' 'K'
 'Kipling/Pollard' 'LAP' 'LAY' 'LD' 'LE' 'LV' 'Layard' 'M' 'MA' 'MAK'
 'MAP' 'MAPS' 'MC' 'MISC.MSS' 'MM' 'MS' 'MSA' 'MT' 'MTSC'
 'Maurice Bell Coll.' 'Meade' 'Med. Coll.' 'Moore' 'NCN' 'NHG' 'NI' 'NRI'
 'NS' 'NUA' 'Napoleon' 

In [5]:
# Scrape each top-level record's landing page for links to its EAD XML export.
# The pattern is a raw string so that regex escapes such as \S and \. reach the
# `re` module unchanged instead of being read as (invalid) string escapes.
urls = list(df["url"])
xml_path_regex = r"https?://\S*downloads/exports/ead\S*\.xml"
ead_xml_paths = []
for url in urls:
    # The context manager closes the HTTP response even if reading or decoding fails
    with urllib.request.urlopen(url) as response:
        content = response.read().decode("utf-8")
    ead_xml_paths.extend(re.findall(xml_path_regex, content))
print(len(ead_xml_paths))

  xml_path_regex = "https?:\/\/\S*downloads\/exports\/ead\S*\.xml"


144


## For Select Archival Collections

**Step 1:** Read the CSV file with XML path data for the Newcastle University Special Collections archival catalog.

In [32]:
# Load the table of XML export locations named by the project configuration
df_xml = pd.read_csv(config.xml_paths)
row_count, col_count = df_xml.shape
print("rows:", row_count, "| columns:", col_count)
df_xml

rows: 12 | columns: 3


Unnamed: 0,collection_name,xml_path,xml_format
0,Bloodaxe Books Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
1,Bell (Gertrude) Archive,https://gertrudebell.ncl.ac.uk/export/,custom
2,Bell (Gertrude) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
3,Hill (Selima) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
4,O'Brien (Sean) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
5,Plowden (Lady Bridget) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
6,Sharp (Thomas) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
7,Loebl (Herbert) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
8,Tyneside Campaign for Homosexual Equality Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD
9,Trevelyan (Charles Philips) Archive,https://specialcollections.ncl.ac.uk/downloads...,EAD


In [34]:
# Show one XML path in full (the table display above truncates long URLs)
print(df_xml["xml_path"].iloc[5])

https://specialcollections.ncl.ac.uk/downloads/exports/ead/1e48c4420b7073bc11916c6c1de226bb.ead.xml


In [3]:
# Split the XML paths (second column) by their declared format (third column)
all_xml_paths = df_xml.iloc[:, 1]
xml_formats = df_xml.iloc[:, 2]
ead_xml_paths = all_xml_paths[xml_formats == "EAD"].tolist()
other_xml_paths = all_xml_paths[xml_formats != "EAD"].tolist()

# Sanity-check the split against the first two rows of the table
assert ead_xml_paths[0] == df_xml.iloc[0, 1], (
    "The first EAD XML list value should be the first XML path in the DataFrame."
)
assert other_xml_paths[0] == df_xml.iloc[1, 1], (
    "There should only be one non-EAD XML path, that of the Bell Archive."
)

## Extract Descriptive Metadata

### For EAD XML

Then, at all levels of the archival hierarchy, from the fonds (i.e., `<archdesc>`) down to the item level (e.g., `<c level="...">` for `"subfonds"`, `"subseries"`, `"file"`, `"item"`, etc.), extract the text between XML tags with the specified names, which are the descriptions in the metadata fields with those names.  

By default, the function we'll use, `extractMetadata()`, will look for text between the following tags:
* EADID: `<eadid>`
* Title: `<unittitle>`
* Identifier: `<unitid>`
* Date (of material?): `<unitdate>`
* Language of material: `<langmaterial>`
* Biographical / Historical: `<bioghist>`
* Scope and Contents: `<scopecontent>`
* Processing Information: `<processinfo>`
Optionally, you can specify your own list of tag (a.k.a. metadata field) names.

Generally, the Encoded Archival Description Identifier, or EADID (e.g., `<eadid>BXB</eadid>`), will also be the first Unit ID of a fonds (e.g., `<unitid>BXB</unitid>`).

*Note 1: "Fonds" is the archival term for collection.  It is the top-most level of a single archive's (or archival collection's) hierarchy.*

*Note 2: The code for the `extractMetadata()` function can be found in the ext_utils.py file.*

To extract the metadata descriptions in the default list of tags from all our EAD XML data, run the following code cell:

In [6]:
# Collect one extraction result per EAD export, in the order of the path list
all_fonds = []
for xml_path in ead_xml_paths:
    print("Extracting archival metadata descriptions from", xml_path)
    all_fonds.append(ext_utils.extractMetadata(xml_path))
print("Done!")

Extracting archival metadata descriptions from https://specialcollections.ncl.ac.uk/downloads/exports/ead/457c7caf38b56ec93afbf6ad22877478.ead.xml
Extracting archival metadata descriptions from https://specialcollections.ncl.ac.uk/downloads/exports/ead/d5a934d2db319845f83ff215eb10c452.ead.xml
Extracting archival metadata descriptions from https://specialcollections.ncl.ac.uk/downloads/exports/ead/9f094b9ffa0f7dd89f0f2d3eeaf7c139.ead.xml
Extracting archival metadata descriptions from https://specialcollections.ncl.ac.uk/downloads/exports/ead/1c3ba574dd5d0b83446d360ccb8f41b8.ead.xml
Extracting archival metadata descriptions from https://specialcollections.ncl.ac.uk/downloads/exports/ead/50901a6542ade98ecedba98b403134bf.ead.xml
Extracting archival metadata descriptions from https://specialcollections.ncl.ac.uk/downloads/exports/ead/348de297d684d1af942e57d08e3c2a89.ead.xml
Extracting archival metadata descriptions from https://specialcollections.ncl.ac.uk/downloads/exports/ead/02d4ad74e410

In [7]:
# Every EAD XML path should have produced exactly one entry in all_fonds
fonds_count, path_count = len(all_fonds), len(ead_xml_paths)
assert fonds_count == path_count, "The total number of fonds from which metadata descriptions were extracted should equal the number of EAD XML paths"

### For non-EAD XML
Extract metadata descriptions from the custom (non-EAD) XML data:

In [None]:
# Define the tags to extract text from
# Note: every item has a unique 'node_id'

# Fields holding each item's metadata descriptions
metadata_tags = [
    "node_id", "title", "description", "creator", "creation-date",
    "recipient", "language", "extent-and-medium", "country-and-region"
]

# Fields holding each item's digitized text.  This list must be defined (not
# commented out) because the extraction loop in the next code cell selects it
# with `tags = transcription_tags`.
transcription_tags = ["node_id", "type", "iiif-manifest", "transcription"]


In [7]:
# Page through each custom (non-EAD) XML export and collect, for every <item>,
# a dictionary mapping each requested tag to the text found in that tag.
#   text_list       -- one dictionary per extracted item
#   errored_tokens  -- numeric part of each resumption token whose page failed to parse
#   error_messages  -- the parse error raised for each of those pages
resumption_token_prefix = "/export?resumptionToken="
errored_tokens, error_messages = [], []
text_list = []

# Fields to extract from each item (`transcription_tags` must already be defined)
tags = transcription_tags

for xml_path in other_xml_paths:
    next_xml_path = xml_path
    resumption_token = None  # unknown until the first page has been parsed
    more_data = True  # reset per export so that every path is paged, not only the first
    while more_data:
        print("Current XML path:", next_xml_path)

        try:
            # The context manager closes the HTTP response once the page is parsed
            with urllib.request.urlopen(next_xml_path) as content:
                parser = etree.XMLParser(recover=True)#, encoding="utf-8")  # Use recover to handle malformed XML data
                xmlTree = etree.parse(content, parser)
            root = xmlTree.getroot()

            # Extract metadata descriptions from the requested fields in the tree
            d_descs = dict.fromkeys(tags, "")
            for child in xmlTree.iter():
                tag = child.tag
                if tag in tags:
                    d_descs[tag] = ext_utils.getAllText(child, tag)
                # Upon reaching a new item tag, add the previous item's extracted descriptions
                # to the metadata list and reset the descriptions dictionary
                elif (tag == "item") and (d_descs["node_id"] != ""):
                    text_list.append(d_descs)
                    d_descs = dict.fromkeys(tags, "")

            # Add the final item's descriptions, unless nothing was extracted from
            # this page (which would otherwise append an all-blank record)
            if any(d_descs.values()):
                text_list.append(d_descs)

            # If there's a resumption token, use it to continue extracting data,
            # otherwise there is no more data to extract.  A token element without
            # text cannot be turned into a URL, so it also ends the paging.
            another_token = root.find("resumptionToken")
            if another_token is None or not another_token.text:
                more_data = False
            else:
                resumption_token = another_token.text

        # lxml's parser raises its own XMLSyntaxError, which is a different class
        # from xml.etree's ParseError, so both are caught here
        except (ParseError, etree.XMLSyntaxError) as error_message:

            # Without a previous token there is no node ID to step forward from
            if resumption_token is None:
                raise

            # Record the error
            errored_token = int(re.search("[0-9]+", resumption_token)[0])
            errored_tokens.append(errored_token)
            error_messages.append(error_message)
            print(error_message)

            # Try the next node ID for a resumption token
            next_token = errored_token + 1
            resumption_token = resumption_token_prefix + str(next_token)

        if more_data:
            next_xml_path = xml_path.replace("/export/", resumption_token)

print(len(text_list))

Current XML path: https://gertrudebell.ncl.ac.uk/export/
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=69768
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=69963
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=70163
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=70363
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=70563
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=70763
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=71358
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=71557
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=91133
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=91333
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=91533
Current XML path: https://gertrudebell.ncl.ac.uk/export?resumptionToken=91733
Current

In [8]:
# Spot-check the second extracted record
text_list[1]

{'node_id': '69572',
 'type': 'letters',
 'iiif-manifest': 'https://cdm21051.contentdm.oclc.org/iiif/info/p21051coll46/6626/manifest.json',
 'transcription': "Dear dear Mamy The Johnson's are here because of the new baby I was dragging Kootle in the cart all this morning because she had no goloshes on. We picked some flowers and put them in water this afternoon. We get on very well with nurse. I put Walter and the Becoo to bed last night. I mean to say that I folded up their clothes and took them off Yates gave them their bath Kootle did not slepe [sic] here and before they went to slepe I gave them each a piece of sugar and a biscuit. It is so fine today. Maurice and I send love to everybody. From your loving child Gertrude"}

In [9]:
# Report whether any page failed to parse; tabulate the failures if there were some
if not errored_tokens:
    print("No parse errors!")
else:
    error_df = pd.DataFrame({"resumption_token": errored_tokens, "error_message": error_messages})
    print(error_df.shape)

No parse errors!


## Transformation

In [8]:
# Directory that receives the extracted files; create it (and any parents) if absent
extracted_dir = "data/extracted/"
extracted_root = Path(extracted_dir)
extracted_root.mkdir(exist_ok=True, parents=True)

### Transcriptions

Export the extracted data into a CSV file and the extracted transcriptions only into Plain Text files, where each file's name is the node ID of the transcribed item.

In [None]:
# Tabulate the extracted records: one row per item dictionary in text_list
transcription_df = pd.DataFrame(text_list)
print(transcription_df.shape)
transcription_df.head()

(11990, 4)


Unnamed: 0,node_id,type,iiif-manifest,transcription
0,69571,letters,https://cdm21051.contentdm.oclc.org/iiif/info/...,"<p>[25 September 1974] Sept 25 1874 Red Barns,..."
1,69572,letters,https://cdm21051.contentdm.oclc.org/iiif/info/...,Dear dear Mamy The Johnson's are here because ...
2,69573,letters,https://cdm21051.contentdm.oclc.org/iiif/info/...,Dear dear Mamy I am so sorry I did not write t...
3,69574,letters,https://cdm21051.contentdm.oclc.org/iiif/info/...,Dear dear Mamy I am very very sorry you cannot...
4,69575,letters,https://cdm21051.contentdm.oclc.org/iiif/info/...,Dear Mamy I am so very very very very very ver...


In [12]:
# Save the tabulated transcription data alongside the other extracted files
transcription_csv_path = extracted_dir + "bell_transcription_data.csv"
transcription_df.to_csv(transcription_csv_path)

In [15]:
# Directory for the per-item transcription text files; create it if absent
bell_transcriptions_dir = "data/extracted/GB_transcriptions/"
bell_transcriptions_root = Path(bell_transcriptions_dir)
bell_transcriptions_root.mkdir(exist_ok=True, parents=True)

In [16]:

# Write each item's transcription to its own UTF-8 text file, named after the
# item's node ID.  The `with` block closes each file on exit, so no explicit
# close() is needed; the former trailing close() was redundant and would raise
# a NameError if no file had been opened.
for item in text_list:
    node_id = item["node_id"]
    file_name = "GB_{}.txt".format(node_id)
    with open(bell_transcriptions_dir + file_name, "w", encoding="utf-8") as transcription_file:
        transcription_file.write(item["transcription"])
print("Wrote", len(text_list), "files to", bell_transcriptions_dir)

Wrote 11990 files to data/extracted/GB_transcriptions/


Check what the contents of one of the files looks like...

In [17]:
file_path = bell_transcriptions_dir + "GB_69571.txt"
# Read with the same encoding the files were written in (UTF-8) rather than the
# platform default; the `with` block closes the file, so no close() call is needed
with open(file_path, encoding="utf-8") as f:
    full = f.read()
print(full[:100])  # print the first 100 characters of the file

<p>[25 September 1974] Sept 25 1874 Red Barns, Coatham, Redcar. My dear Florence Mopsa has been very


Looks good!

### Metadata

Store the extracted data in a DataFrame.

#### Metadata in EAD

In [13]:
# Build one DataFrame per fonds, then concatenate them in a single call rather
# than growing metadata_df inside the loop (which re-copies all rows each pass).
# The comprehension also avoids rebinding the global `df` used by earlier cells.
fonds_frames = [pd.DataFrame.from_records(fonds_list) for fonds_list in all_fonds]
# pd.concat raises on an empty list, so fall back to an empty DataFrame
metadata_df = pd.concat(fonds_frames) if fonds_frames else pd.DataFrame()
print(metadata_df.shape)

(37943, 9)


In [14]:
# Preview the first rows of the combined EAD metadata
metadata_df.head()

Unnamed: 0,eadid,unittitle,unitid,unitdate,bioghist,scopecontent,processinfo,langmaterial,controlaccess
0,WBC,Blenkinsopp Coulson (William) Archive,WBC,1469 - 1975,"William Lisle Blenkinsopp Coulson (1841-1911),...",This collection comprises a small amount of pa...,,[English],"[Blenkinsopp Coulson, William Lisle, 1841-1911..."
1,,Correspondence of William Blenkinsopp Coulson,WBC/1,1873 - 1975,,Comprises two letters concerning William Blenk...,,,
2,,Letter from Lord Dufferin to William B. Coulson,WBC/1/1,8 May 1873,,Letter from Lord Dufferin to William B. Coulso...,,,
3,,Letter from Major General A. Macdonell to Will...,WBC/1/2,Apr 1877,,Letter from Major General A. Macdonell to Will...,,,
4,,Published Material relating to William Blenkin...,WBC/2,1872 - 1914,,This section chiefly comprises a selection of ...,,,


Forward-fill the values in the `eadid` column so all rows have an EADID.

In [15]:
# Turn blank EADIDs into missing values, then fill each one from the row above
eadid_filled = metadata_df["eadid"].replace({"": np.nan}).ffill()
metadata_df["eadid"] = eadid_filled
metadata_df.head()

Unnamed: 0,eadid,unittitle,unitid,unitdate,bioghist,scopecontent,processinfo,langmaterial,controlaccess
0,WBC,Blenkinsopp Coulson (William) Archive,WBC,1469 - 1975,"William Lisle Blenkinsopp Coulson (1841-1911),...",This collection comprises a small amount of pa...,,[English],"[Blenkinsopp Coulson, William Lisle, 1841-1911..."
1,WBC,Correspondence of William Blenkinsopp Coulson,WBC/1,1873 - 1975,,Comprises two letters concerning William Blenk...,,,
2,WBC,Letter from Lord Dufferin to William B. Coulson,WBC/1/1,8 May 1873,,Letter from Lord Dufferin to William B. Coulso...,,,
3,WBC,Letter from Major General A. Macdonell to Will...,WBC/1/2,Apr 1877,,Letter from Major General A. Macdonell to Will...,,,
4,WBC,Published Material relating to William Blenkin...,WBC/2,1872 - 1914,,This section chiefly comprises a selection of ...,,,


In [16]:
# Preview the last rows to confirm the forward-fill reached the end of the table
metadata_df.tail()

Unnamed: 0,eadid,unittitle,unitid,unitdate,bioghist,scopecontent,processinfo,langmaterial,controlaccess
9273,TF,"Design Review File, February - March 2013.",TF-07579,February - March 2013,,"Design Review File, February - March 2013. A c...",,[English],
9274,TF,"Thematic Workshop File, May - July 2013.",TF-07578,May - July 2013,,,,[English],
9275,TF,"Working Drafts File, December 2013 - February ...",TF-07583,December 2013 - February 2014,,Working drafts of the Farrell Review.,,[English],
9276,TF,"Notes File, 28 December 2013",TF-04852,28-Dec-13,,Notes prepared by Sir Terry Farrell regarding ...,,[English],
9277,TF,"The Daily Telegraph, 31 March 2014",TF-07492,31-Mar-14,,"'Homes fit for people', p19 mentions Farrell r...",,[English],


In [17]:
# File name for the combined EAD metadata
# (alternative name kept for reference: "newcastle_archival_metadata_sample.csv")
f = "nusc_ead_all_fonds.csv"
metadata_csv_path = extracted_dir + f
metadata_df.to_csv(metadata_csv_path)

#### Metadata not in EAD

In [None]:
# NOTE(review): text_list holds whichever tags the extraction cell was last run
# with; confirm it was extracted with the metadata tags before running this cell.
nonead_metadata_df = pd.DataFrame.from_records(text_list)
# The non-EAD formatted data is all from the Gertrude Bell Archive, which has the EADID "GB"
gb_eadids = ["GB"] * len(nonead_metadata_df)
nonead_metadata_df.insert(0, "eadid", gb_eadids)
print(nonead_metadata_df.shape)
nonead_metadata_df.head()

(11990, 10)


Unnamed: 0,eadid,node_id,title,description,creator,creation-date,recipient,language,extent-and-medium,country-and-region
0,GB,69571,"Letter from Gertrude Bell to her stepmother, D...",,"Bell, Gertrude Margaret Lowthian",1874-09-25,"Bell, Florence",English,"1 letter, paper",Redcar
1,GB,69572,"Letter from Gertrude Bell to her stepmother, D...",,"Bell, Gertrude Margaret Lowthian",1877-04-12,"Bell, Florence",English,"1 letter plus envelope, paper",
2,GB,69573,"Letter from Gertrude Bell to her stepmother, D...",,"Bell, Gertrude Margaret Lowthian",1877-01-01 - 1877-12-31,"Bell, Florence",English,"1 letter, paper",Redcar
3,GB,69574,"Letter from Gertrude Bell to her stepmother, D...",,"Bell, Gertrude Margaret Lowthian",1877-04-16,"Bell, Florence",English,"1 letter plus envelope, paper",Redcar
4,GB,69575,"Letter from Gertrude Bell to her stepmother, D...",,"Bell, Gertrude Margaret Lowthian",1877-04-26,"Bell, Florence",English,"1 letter plus envelope, paper",Redcar


In [None]:
# Sanity check: a known node ID should be present among the extracted items
node_ids = nonead_metadata_df["node_id"].tolist()
assert "69571" in node_ids

In [None]:
# Save the non-EAD Bell metadata
# (alternative file name kept for reference: "bell_archival_metadata_nonead.csv")
nonead_csv_path = extracted_dir + "complete_bell_archival_metadata_nonead.csv"
nonead_metadata_df.to_csv(nonead_csv_path)