# Creating Data for Final Annotation
Harvesting, transforming, and exporting metadata descriptions for annotation of gendered language in [brat](brat.nlplab.org/).

* Creator: Lucy Havens
* Date: February 18, 2021
* Project: PhD research at the School of Informatics, University of Edinburgh
* Data Source: Centre for Research Collections' (CRC) [online archival catalog](https://archives.collections.ed.ac.uk/)

## I. Harvesting
Obtain metadata from the CRC's online archival catalog using the Open Archives Initiative - Protocol for Metadata Harvesting (OAI-PMH).  The CRC provides its metadata in Encoded Archival Description (EAD) format as XML data.

In [21]:
# Import libraries for harvesting
import xml.dom.minidom
import urllib.request
import urllib
import xml.etree.ElementTree as ET
from lxml import etree

In [22]:
archiveMetadataUrl = "http://lac-archives-live.is.ed.ac.uk:8082/?verb=ListRecords&metadataPrefix=oai_ead"

def getRootFromUrl(url):
    content = urllib.request.urlopen(url)

    #tree = ET.parse(content)
    parser = etree.XMLParser(recover=True)  # Use recover to try to fix broken XML
    tree = etree.parse(content, parser)
    
    root = tree.getroot()
    return root

root = getRootFromUrl(archiveMetadataUrl)
print(root)

<Element {http://www.openarchives.org/OAI/2.0/}OAI-PMH at 0x7fda159e2440>


In [23]:
# Input: part of or the entirety of a tag name below which you want to get text 
# Output: a list of text between tags contained within the inputted tagName, 
#         with one list element per tagName instance
def getTextBeneathTag(root, tagName, header):
    text_list = []
    for child in root.iter():
        tag = child.tag
        if tagName in tag:
            text_elem = ""
            for subchild_text in child.itertext():
                if header:
                    if header not in subchild_text:
                        text_elem = text_elem + subchild_text
                else:
                    text_elem = text_elem + subchild_text
            text_list.append(text_elem)
    return text_list

In [24]:
# Input: binary value, url for harvesting metadata, starting prefix for the end of the url, and lists of metadata fields to gather
# Output: lists of strings of the gathered metadata fields' descriptions, with one string per fonds, series, and item in the catalog
def getDescriptiveMetadata(more, archiveMetadataUrlShort, startingPrefix, ut, ui, ud, gn, lm, sc, bh, pi):    
   
    archiveMetadataUrlWithPrefix = archiveMetadataUrlShort + startingPrefix
    root = getRootFromUrl(archiveMetadataUrlWithPrefix)
    ut.append(getTextBeneathTag(root, "unittitle", "Unit Title"))
    ui.append(getTextBeneathTag(root, "unitid", "Unit Identifier"))
    ud.append(getTextBeneathTag(root, "unitdate", "Unit Date"))
    gn.append(getTextBeneathTag(root, "geogname", "Geography Name"))
    lm.append(getTextBeneathTag(root, "langmaterial", "Language of Materials"))
    sc.append(getTextBeneathTag(root, "scopecontent", "Scope and Contents"))
    bh.append(getTextBeneathTag(root, "bioghist", "Biographical / Historical"))
    pi.append(getTextBeneathTag(root, "processinfo", "Processing Information"))
    resumptionToken = getTextBeneathTag(root, "resumptionToken", "")
    
    if len(resumptionToken) == 0:
        more = False
    i = 1
    
    while more:
        archiveMetadataUrlWithToken = archiveMetadataUrlShort + "resumptionToken=" + resumptionToken[0]
        root = getRootFromUrl(archiveMetadataUrlWithToken)
        ut.append(getTextBeneathTag(root, "unittitle", "Unit Title"))
        ui.append(getTextBeneathTag(root, "unitid", "Unit Identifier"))
        ud.append(getTextBeneathTag(root, "unitdate", "Unit Date"))
        gn.append(getTextBeneathTag(root, "geogname", "Geography Name"))
        lm.append(getTextBeneathTag(root, "langmaterial", "Language of Materials"))
        sc.append(getTextBeneathTag(root, "scopecontent", "Scope and Contents"))
        bh.append(getTextBeneathTag(root, "bioghist", "Biographical / Historical"))
        pi.append(getTextBeneathTag(root, "processinfo", "Processing Information"))
        resumptionToken = getTextBeneathTag(root, "resumptionToken", "")
        if len(resumptionToken) == 0:
            more = False
        i += 1
    
    print(str(i) + " resumption tokens")
    return ut, ui, ud, gn, lm, sc, bh, pi

In [25]:
url = "http://lac-archives-live.is.ed.ac.uk:8082/?verb=ListRecords&"
startPrefix = "metadataPrefix=oai_ead"
ut = [] # List of fonds, series, and item titles
ui = [] # List of fonds, series, and item identifiers
ud = [] # List of fonds, series, and item dates
gn = [] # List of fonds, series, and item associated geographic locations 
lm = [] # List of fonds, series, and item material languages
sc = [] # List of fonds, series, and item "Scope and Contents" descriptions
bh = [] # List of fonds, series, and item "Biographical / Historical" descriptions
pi = []  # List of fonds, series, and item "Processing Information" descriptions

ut, ui, ud, gn, lm, sc, bh, pi = getDescriptiveMetadata(True, url, startPrefix, ut, ui, ud, gn, lm, sc, bh, pi)

1081 resumption tokens


In [26]:
print(len(ut))
print(len(ui))
print(len(ud))
print(len(gn))
print(len(lm))
print(len(sc))
print(len(bh))
print(len(pi))

1081
1081
1081
1081
1081
1081
1081
1081


In [27]:
i = 0
print(len(ut[i]))
print(len(ui[i]))
print(len(ud[i]))
print(len(gn[i]))
print(len(lm[i]))
print(len(sc[i]))
print(len(bh[i]))
print(len(pi[i]))

124
124
124
116
125
119
2
1


# II. Transforming
Create a table (pandas DataFrame) of the metadata without multi-sentence descriptions and plain text files of the descriptive metadata.

In [28]:
import pandas as pd
import re
import string
import csv

In [29]:
df = pd.DataFrame.from_dict({"unit_title":ut, "unit_identifier":ui, "unit_date":ud, "geography":gn, "language":lm})
df.head()

Unnamed: 0,unit_title,unit_identifier,unit_date,geography,language
0,"[Papers of Professor Walter Ledermann, 1 (37),...","[Coll-1064, Coll-1064/1, Coll-1064/2, Coll-106...","[1937-1954, 2 Feb 1937, 10 Feb 1937, 16 Feb 19...","[Edinburgh (Scotland), Edinburgh (Scotland), E...","[\n English\n , English, English, Engl..."
1,[Drawings from the Office of Sir Rowand Anders...,"[Coll-31, Coll-31/1, Coll-31/1/1, Coll-31/1/1/...","[1814-1924, 1874-1905, 1874-1879, 1874-1875, 1...",[],"[\n English\n , English, English, Engl..."
2,[Papers of Sir Roderick Impey Murchison and hi...,"[Coll-51, Coll-51/1, Coll-51/2, Coll-51/2/1, C...","[1771-1935, 1723-1935, 1770-1938, 1770-1938, 1...","[Calcutta (India), Europe, Scotland, Tarradale...","[\n English\n , English, English, Engl..."
3,"[Lecture Notes of John Robison, Introductions,...","[Coll-204, Coll-204/1, Coll-204/2, Coll-204/3,...","[c1779-c1801, c1779-c1801, c1804, c1802, c1780...","[Edinburgh (Scotland), Glasgow Lanarkshire Sco...","[\n English\n , English., English Lati..."
4,[Records of the Wernerian Natural History Soci...,"[Coll-206, Coll-206/1, Coll-206/1/1, Coll-206/...","[1808-1858, 12 January 1808-16 April 1858, 12 ...","[Edinburgh (Scotland), Freiburg im Breisgau (G...","[\n English\n , English, English, Engl..."


In [30]:
df.to_csv("CRC_units-grouped-by-fonds.csv")

In [31]:
indeces = []
for ui_list in ui:
    indeces += [ui_list[0]]
print(len(indeces))
print(indeces[:10])

1081
['Coll-1064', 'Coll-31', 'Coll-51', 'Coll-204', 'Coll-206', 'Coll-205', 'Coll-1443', 'Coll-1444', 'Coll-1391', 'Coll-1371']


In [32]:
# def flattenTwoDimensionalList(two_d_list):
#     flattened = []
#     for listoflists in two_d_list:
#         for unit in listoflists:
#             flattened += [unit]
#     return flattened

In [33]:
# titles = flattenTwoDimensionalList(ut)
# # print(titles[0:30])
# identifiers = flattenTwoDimensionalList(ui)
# dates = flattenTwoDimensionalList(ud)
# geogs = flattenTwoDimensionalList(gn)
# lang = flattenTwoDimensionalList(lm)
# scopecont = flattenTwoDimensionalList(sc)
# bioghist = flattenTwoDimensionalList(bh)
# procinfo = flattenTwoDimensionalList(pi)

In [34]:
# print(len(titles))
# print(len(identifiers))
# print(len(dates))
# print(len(geogs))
# print(len(lang))
# print(len(scopecont))
# print(len(bioghist))
# print(len(procinfo))

In [35]:
def writeListsToFilesPerFonds(indeces, titles, scopeconts, bioghists, procinfo):
    maxI = len(indeces)
    i = 0
    while i < maxI:
        filename = (indeces[i]).strip()
        filename = filename.replace(" ", "_")
        filename = filename.replace("/", "_")
        filepath = "descriptions_by_fonds/"+filename+".txt"
        with open(filepath, 'w') as f:
            f.write("Identifier: ")
            f.write(filename + "\n")
            for t in titles[i]:
                t = t.strip()
                f.write("\nTitle:\n")
                f.write(t + "\n")
            for s in scopeconts[i]:
                s = s.strip()
                f.write("\nScope and Contents:\n")
                f.write(s + "\n")
            for b in bioghists[i]:
                b = b.strip()
                f.write("\nBiographical / Historical:\n")
                f.write(b + "\n")
            for p in procinfo[i]:
                p = p.strip()
                f.write("\nProcessing Information:\n")
                f.write(p + "\n")
        f.close()
        i += 1
    return str(maxI) + " files written"

In [36]:
writeListsToFilesPerFonds(indeces, ut, sc, bh, pi)

'1081 files written'

## III. Preparing
Prepare the files for annotation, ensuring ease in reading and splitting any excessively long files.

In [53]:
import string
import re
import csv

# Libraries for Natural Language Processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.text import Text
# nltk.download('punkt')
from nltk.probability import FreqDist
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.corpus import PlaintextCorpusReader
# nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag

In [56]:
directory = 'descriptions_by_fonds/'
files = PlaintextCorpusReader(directory, '.+')
tokens = files.words()

In [57]:
print(tokens[:20])

['Identifier', ':', 'AA4', 'Title', ':', 'Papers', 'of', 'Rev', 'Prof', 'John', 'McIntyre', '(', '1916', '-', '2005', ')', 'Scope', 'and', 'Contents', ':']


In [58]:
token_totals = []
filenames = files.fileids()
for f in filenames:
        token_totals += [len(files.words(f))]
file_lengths = dict(zip(filenames,token_totals))

In [59]:
token_totals.sort
print(set(token_totals))

{8, 9, 14, 2063, 15, 4115, 19, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 10290, 52, 54, 55, 56, 24632, 57, 59, 58, 61, 53, 63, 64, 65, 66, 67, 68, 60, 70, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81, 82, 83, 85, 86, 89, 91, 95, 8287, 97, 99, 100, 103, 104, 105, 106, 107, 108, 109, 110, 114, 116, 118, 119, 120, 121, 122, 124, 125, 126, 127, 128, 129, 131, 2180, 133, 134, 136, 137, 138, 139, 140, 20621, 142, 143, 144, 145, 147, 148, 149, 150, 151, 153, 154, 156, 157, 158, 159, 161, 163, 164, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 2223, 184, 186, 187, 185, 190, 192, 196, 198, 199, 200, 202, 203, 204, 205, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 222, 223, 224, 225, 226, 227, 230, 4327, 231, 233, 234, 235, 236, 237, 45289, 239, 240, 241, 242, 243, 244, 245, 247, 248, 249, 250, 251, 252, 253, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 26

In [60]:
file_lengths["Coll-1250.txt"]

826

In [61]:
too_long = []
for key,value in file_lengths.items():
    if value > 1000:
        too_long += [key]

In [62]:
print(too_long)
print(len(too_long))

['BAI.txt', 'Coll-1022.txt', 'Coll-1036.txt', 'Coll-1052.txt', 'Coll-1057.txt', 'Coll-1059.txt', 'Coll-1060.txt', 'Coll-1061.txt', 'Coll-1062.txt', 'Coll-1064.txt', 'Coll-1066.txt', 'Coll-1142.txt', 'Coll-1146.txt', 'Coll-1156.txt', 'Coll-1162.txt', 'Coll-1167.txt', 'Coll-1242.txt', 'Coll-1243.txt', 'Coll-1247.txt', 'Coll-1255.txt', 'Coll-1257.txt', 'Coll-1260.txt', 'Coll-1266.txt', 'Coll-1294.txt', 'Coll-13.txt', 'Coll-1310.txt', 'Coll-1320.txt', 'Coll-1329.txt', 'Coll-1357.txt', 'Coll-1362.txt', 'Coll-1363.txt', 'Coll-1364.txt', 'Coll-1373.txt', 'Coll-1383.txt', 'Coll-1385.txt', 'Coll-14.txt', 'Coll-1434.txt', 'Coll-1443.txt', 'Coll-146.txt', 'Coll-1461.txt', 'Coll-1489.txt', 'Coll-1490.txt', 'Coll-1492.txt', 'Coll-1496.txt', 'Coll-1497.txt', 'Coll-1499.txt', 'Coll-1527.txt', 'Coll-1528.txt', 'Coll-1541.txt', 'Coll-1549.txt', 'Coll-1557.txt', 'Coll-1574.txt', 'Coll-1577.txt', 'Coll-1580.txt', 'Coll-1583.txt', 'Coll-1586.txt', 'Coll-1593.txt', 'Coll-16.txt', 'Coll-1613.txt', 'Coll-162

That's a lot of files to break up manually, so let's use Python to divide these large files into smaller files with a maximum of 100 lines each.

In [73]:
# Code in this cell from:
# https://stackoverflow.com/questions/16289859/splitting-large-text-file-into-smaller-text-files-by-line-numbers-using-python
def splitLargeFile(f, max_lines, old_dir, new_dir):
    short = None
    file_path = old_dir+f
    with open(file_path) as long:
        for line_no, line in enumerate(long):
            if line_no % max_lines == 0:
                if short:
                    short.close()
                f = f.replace(".txt","_")
                short_name = str(f)+'{}.txt'.format(line_no + max_lines)
                new_path = new_dir+short_name
                short = open(new_path, "w")
            short.write(line)
        if short:
            short.close()

In [74]:
for f in filenames:
    splitLargeFile(f, 100, "descriptions_by_fonds/", "for_brat/")