# Extracting Latest Catalogue Data for Classification
Harvesting, transforming, and exporting metadata descriptions for classifying gendered and gender biased language.

This project is focused on the English language and archival institutions in the United Kingdom.

* Creator: Lucy Havens
* Date: April 2023 (harvesting latest catalogue data for automated annotation with classifiers)
* Project: PhD research at the School of Informatics, University of Edinburgh
* Data Source: Heritage Collections' [online archival catalog](https://archives.collections.ed.ac.uk/)

***
**Table of Contents**

  [I. Harvesting](#harvesting)

  [II. Transforming](#transforming)

  [III. Preparing](#preparing)
  
  ***

<a id="harvesting"></a>
## I. Harvesting
Obtain metadata from the Heritage Collections' online archival catalog using the Open Archives Initiative - Protocol for Metadata Harvesting (OAI-PMH).  Heritage Collections provides its metadata in Encoded Archival Description (EAD) format as XML data.  Harvest metadata descriptions from the following metadata fields in the Archives online catalog:
  * Scope and Contents
  * Biographical Historical
  * Processing Information
  * Title
  * Language of Material
  * Geography Name
  * Unit ID
  * Encoded Archival Description Identifier

In [11]:
# Import libraries for harvesting
import xml.dom.minidom
import urllib.request
import urllib
import xml.etree.ElementTree as ET
from lxml import etree
import config
from pathlib import Path

In [12]:
archiveMetadataUrl = "https://aspaceoai.collections.ed.ac.uk/?verb=ListRecords&metadataPrefix=oai_ead"  #Outdated URL: "http://lac-archives-live.is.ed.ac.uk:8082/?verb=ListRecords&metadataPrefix=oai_ead"

def getRootFromUrl(url):
    content = urllib.request.urlopen(url)

    #tree = ET.parse(content)
    parser = etree.XMLParser(recover=True)  # Use recover to try to fix broken XML
    tree = etree.parse(content, parser)
    
    root = tree.getroot()
    return root

root = getRootFromUrl(archiveMetadataUrl)
print(root)

<Element {http://www.openarchives.org/OAI/2.0/}OAI-PMH at 0x7f713f545840>


In [13]:
# Input: part of or the entirety of a tag name below which you want to get text 
# Output: a list of text between tags contained within the inputted tagName, 
#         with one list element per tagName instance
def getTextBeneathTag(root, tagName):
    text_list = []
    for child in root.iter():
        tag = child.tag
        if tagName in tag:
            text_elem = ""
            for subchild_text in child.itertext():
                text_elem = text_elem + subchild_text
            text_list.append(text_elem)
    return text_list

In [14]:
# Input: binary value, url for harvesting metadata, starting prefix for the end of the url, and lists of metadata fields to gather
# Output: lists of strings of the gathered metadata fields' descriptions, with one string per fonds, series, subseries, file, and item in the catalog
def getDescriptiveMetadata(more, archiveMetadataUrlShort, startingPrefix, eadid, ut, ui, ud, gn, lm, sc, bh, pi):    
   
    archiveMetadataUrlWithPrefix = archiveMetadataUrlShort + startingPrefix
    root = getRootFromUrl(archiveMetadataUrlWithPrefix)
    eadid.append(getTextBeneathTag(root, "eadid"))
    ut.append(getTextBeneathTag(root, "unittitle"))
    ui.append(getTextBeneathTag(root, "unitid"))
    ud.append(getTextBeneathTag(root, "unitdate"))
    gn.append(getTextBeneathTag(root, "geogname"))
    lm.append(getTextBeneathTag(root, "langmaterial"))
    sc.append(getTextBeneathTag(root, "scopecontent"))
    bh.append(getTextBeneathTag(root, "bioghist"))
    pi.append(getTextBeneathTag(root, "processinfo"))
    resumptionToken = getTextBeneathTag(root, "resumptionToken")
    
    if len(resumptionToken) == 0:
        more = False
    i = 1
    
    while more:
        archiveMetadataUrlWithToken = archiveMetadataUrlShort + "resumptionToken=" + resumptionToken[0]
        root = getRootFromUrl(archiveMetadataUrlWithToken)
        eadid.append(getTextBeneathTag(root, "eadid"))
        ut.append(getTextBeneathTag(root, "unittitle"))
        ui.append(getTextBeneathTag(root, "unitid"))
        ud.append(getTextBeneathTag(root, "unitdate"))
        gn.append(getTextBeneathTag(root, "geogname"))
        lm.append(getTextBeneathTag(root, "langmaterial"))
        sc.append(getTextBeneathTag(root, "scopecontent"))
        bh.append(getTextBeneathTag(root, "bioghist"))
        pi.append(getTextBeneathTag(root, "processinfo"))
        resumptionToken = getTextBeneathTag(root, "resumptionToken")
        if len(resumptionToken) == 0:
            more = False
        i += 1
    
    print(str(i) + " resumption tokens")
    return eadid, ut, ui, ud, gn, lm, sc, bh, pi

In [15]:
url = "https://aspaceoai.collections.ed.ac.uk/?verb=ListRecords&"   #Outdated URL: "http://lac-archives-live.is.ed.ac.uk:8082/?verb=ListRecords&"
startPrefix = "metadataPrefix=oai_ead"
eadid = [] # List of fonds-level identifiers
ut = [] # List of fonds, series, subseries, file, and item titles
ui = [] # List of fonds, series, subseries, file, and item identifiers
ud = [] # List of fonds, series, subseries, file, and item dates
gn = [] # List of fonds, series, subseries, file, and item associated geographic locations 
lm = [] # List of fonds, series, subseries, file, and item material languages
sc = [] # List of fonds, series, subseries, file, and item "Scope and Contents" descriptions
bh = [] # List of fonds, series, subseries, file, and item "Biographical / Historical" descriptions
pi = []  # List of fonds, series, subseries, file, and item "Processing Information" descriptions

eadid, ut, ui, ud, gn, lm, sc, bh, pi = getDescriptiveMetadata(True, url, startPrefix, eadid, ut, ui, ud, gn, lm, sc, bh, pi)  # initial number of resumption tokens: 1081

663 resumption tokens


In [16]:
assert len(eadid) == len(ut)
assert len(ut) == len(ui)
assert len(ui) == len(ud)
assert len(gn) == len(ui)
assert len(lm) == len(sc)
assert len(eadid) == len(sc)
assert len(bh) == len(pi)
assert len(pi) == len(eadid)

The sublists however have different lengths:

In [17]:
i = 0
print(len(eadid[i]))  # 1
print(len(ut[i]))     # 124
print(len(ui[i]))     # 124
print(len(ud[i]))     # 124
print(len(gn[i]))     # 116
print(len(lm[i]))     # 125
print(len(sc[i]))     # 119
print(len(bh[i]))     # 2
print(len(pi[i]))     # 1

1
124
124
124
116
0
119
2
1


<a id="transforming"></a>
## II. Transforming
Create a table (pandas DataFrame) of the metadata without multi-sentence descriptions and plain text files of the descriptive metadata.

In [18]:
import pandas as pd
import re
import string
import csv

#### Additional Metadata Fields

Create a CSV file of the EADIDs and unit titles, identifiers, dates, geographies, and languages:

In [19]:
print(eadid[:10])

[['Coll-1064'], ['Coll-31'], ['Coll-51'], ['Coll-204'], ['Coll-206'], ['Coll 205'], ['Coll-1443'], ['Coll-1444'], ['Coll-1391'], ['Coll-1371']]


In [20]:
flatten = []
for sublist in eadid:
    for item in sublist:
        flatten += [item]
assert type(flatten[0]) == str

In [21]:
assert len(eadid) == len(flatten)

In [22]:
df_meta = pd.DataFrame.from_dict({"eadid":flatten,"unit_title":ut, "unit_identifier":ui, "unit_date":ud, "geography":gn, "language":lm})
df_meta.head()

Unnamed: 0,eadid,unit_title,unit_identifier,unit_date,geography,language
0,Coll-1064,"[Papers of Professor Walter Ledermann, 1 (37),...","[Coll-1064, Coll-1064/1, Coll-1064/2, Coll-106...","[1937-1954, 2 Feb 1937, 10 Feb 1937, 16 Feb 19...","[Edinburgh -- Scotland, Edinburgh -- Scotland,...",[]
1,Coll-31,[Drawings from the Office of Sir Rowand Anders...,"[Coll-31, Coll-31/1, Coll-31/1/1, Coll-31/1/1/...","[1814-1924, 1874-1905, 1874-1879, 1874-1875, 1...",[],[]
2,Coll-51,[Papers of Sir Roderick Impey Murchison and hi...,"[Coll-51, Coll-51/1, Coll-51/2, Coll-51/2/1, C...","[1771-1935, 1723-1935, 1770-1938, 1770-1938, 1...","[Calcutta (India), Europe, Scotland, Tarradale...",[]
3,Coll-204,"[Lecture Notes of John Robison, Introductions,...","[Coll-204, Coll-204/1, Coll-204/2, Coll-204/3,...","[c1779-c1801, c1779-c1801, c1804, c1802, c1780...","[Edinburgh -- Scotland, Glasgow Lanarkshire Sc...",[]
4,Coll-206,[Records of the Wernerian Natural History Soci...,"[Coll-206, Coll-206/1, Coll-206/1/1, Coll-206/...","[1808-1858, 12 January 1808-16 April 1858, 12 ...","[Edinburgh -- Scotland, Freiburg im Breisgau (...",[]


In [23]:
ids = list(df_meta["eadid"])
ids.sort()
print(ids[:10])  # 6 of these are empty strings!

['', '', '', '', '', '', 'BAI', 'Coll 205', 'Coll-100', 'Coll-1000']


Give EADIDs that are empty strings a name:

In [24]:
new_eadids = []
no_ids = 0
for ui in flatten:
    if ui == "":
        new_eadids += ["no_id"+str(no_ids)]
        no_ids += 1
    else:
        new_eadids += [ui]
print(len(new_eadids))
print(new_eadids[:10])

663
['Coll-1064', 'Coll-31', 'Coll-51', 'Coll-204', 'Coll-206', 'Coll 205', 'Coll-1443', 'Coll-1444', 'Coll-1391', 'Coll-1371']


In [25]:
assert len(new_eadids) == len(flatten)

In [26]:
df_meta["eadid"] = new_eadids

Save the data:

In [27]:
print(df_meta.shape)
df_meta.to_csv(config.latest_cat_path+"CRC_units-grouped-by-fonds_April2023.csv")

(663, 6)


#### Main Descriptive Metadata Fields

Remove the metadata field names from the description strings:

In [28]:
def removeFieldName(descs, field_name):
    new_descs = []
    for desc_list in descs:
        new_list = []
        for d in desc_list:
            # Remove the metadata field name from the start of the description
            new_d = d.replace(field_name, "")
            # Remove any leading and trailing whitespace from the description
            new_d = new_d.strip()
            new_list += [new_d]
        new_descs += [new_list]
    assert len(new_descs) == len(descs)
    return new_descs

new_ut = removeFieldName(ut, "Title")
new_sc = removeFieldName(sc, "Scope and Contents")
new_bh = removeFieldName(bh, "Biographical / Historical")
new_pi = removeFieldName(pi, "Processing Information")

Create a CSV file of the descriptions associated with their EADID:

In [29]:
n_rows = len(new_eadids)
df_ut = pd.DataFrame({"eadid":new_eadids, "description":new_ut, "field":["Title"]*n_rows})
df_sc = pd.DataFrame({"eadid":new_eadids, "description":new_sc, "field":["Scope and Contents"]*n_rows})
df_bh = pd.DataFrame({"eadid":new_eadids, "description":new_bh, "field":["Biographical / Historical"]*n_rows})
df_pi = pd.DataFrame({"eadid":new_eadids, "description":new_pi, "field":["Processing Information"]*n_rows})
df_desc = pd.concat([df_ut, df_sc, df_bh, df_pi], axis=0)
df_desc.head() # df_desc.tail()

Unnamed: 0,eadid,description,field
0,Coll-1064,"[Papers of Professor Walter Ledermann, 1 (37),...",Title
1,Coll-31,[Drawings from the Office of Sir Rowand Anders...,Title
2,Coll-51,[Papers of Sir Roderick Impey Murchison and hi...,Title
3,Coll-204,"[Lecture Notes of John Robison, Introductions,...",Title
4,Coll-206,[Records of the Wernerian Natural History Soci...,Title


In [30]:
df_desc_exploded = df_desc.explode("description")
df_desc_exploded = df_desc_exploded.sort_values(by="eadid")
# df_desc_exploded.head()

In [31]:
df_desc_exploded = df_desc_exploded.loc[~df_desc_exploded.description.isna()]
df_desc_exploded = df_desc_exploded.drop_duplicates()
print(df_desc_exploded.shape)

(199608, 3)


In [32]:
df_desc_exploded = df_desc_exploded.reset_index()
df_desc_exploded = df_desc_exploded.drop(columns=["index"])
df_desc_exploded = df_desc_exploded.reset_index()
df_desc_exploded = df_desc_exploded.rename(columns={"index":"description_id"})
print(df_desc_exploded.shape)
df_desc_exploded.head()  #tail()

(199608, 4)


Unnamed: 0,description_id,eadid,description,field
0,0,BAI,Review by John Baillie of the fifth chapter of...,Scope and Contents
1,1,BAI,Letters of condolence received primarily by Fl...,Scope and Contents
2,2,BAI,Cutting describing the opening of the Baillie ...,Scope and Contents
3,3,BAI,A selection of memorabilia gathered together b...,Scope and Contents
4,4,BAI,Correspondence and related items relating to t...,Scope and Contents


In [34]:
df_desc_exploded.tail(2)

Unnamed: 0,description_id,eadid,description,field
199606,199606,no_id5,Box 8: Various,Biographical / Historical
199607,199607,no_id5,1.\tFolded Chest X-ray Adolf Heller2.\tPartial...,Title


Save the DataFrame as a CSV file:

In [33]:
df_desc_exploded.to_csv(config.latest_cat_path+"descriptions_April2023.csv")

<a id="preparing"></a>
## III. Preparing
Prepare the files for classification, splitting descriptions into sentences and words.

In [1]:
import string
import re
import csv
import config
import pandas as pd

# Libraries for Natural Language Processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [2]:
df_desc_exploded = pd.read_csv(config.latest_cat_path+"descriptions_April2023.csv", index_col=0)
# df_desc_exploded.head()

In [3]:
# assert df_desc_exploded.loc[df_desc_exploded.description.isna()].shape[0] == 0  # 8?
df_desc_exploded = df_desc_exploded.loc[~df_desc_exploded.description.isna()]

In [4]:
descs = list(df_desc_exploded.description)

In [9]:
# # sents = dict()
# words_by_sent = dict()
# for i,desc in enumerate(descs):
#     sentences = sent_tokenize(desc)
#     sents[i] = sentences
#     sentence_words = []
#     for s in sentences:
#         words = word_tokenize(s)
#         sentence_words += [words]
#     words_by_sent[i] = sentence_words
# print(sents[199607])              # Looks good
desc_ids = list(df_desc_exploded.description_id)
print(words_by_sent[(desc_ids[-1])][1])   # Looks good

KeyError: 199607

In [11]:
print(i)

199599


199607


In [13]:
df_sents = pd.DataFrame({"description_id":sents.keys(), "sentence":sents.values()})
df_sents = df_sents.explode("sentence")
# df_sents1 = df_sents1.reset_index().drop(columns=["index"]).reset_index()
# df_sents1 = df_sents1.rename(columns={"index":"sentence_id"})
# df_sents1.head()

MemoryError: Unable to allocate 297. GiB for an array with shape (39840160000,) and data type object