# Parsing Archive Metadata as XML from URL
Reference: https://python101.pythonlibrary.org/chapter23_xml.html

In [1]:
import xml.dom.minidom
import urllib.request
import urllib
import xml.etree.ElementTree as ET
from lxml import etree

Obtain the root of the XML data given a URL:

In [2]:
# Initial OAI-PMH request: verb=ListRecords with the oai_ead metadata prefix
archiveMetadataUrl = "http://lac-archives-live.is.ed.ac.uk:8082/?verb=ListRecords&metadataPrefix=oai_ead"

def getRootFromUrl(url):
    """Download the XML document at ``url`` and return its root element.

    Input:  url -- address of the XML document to fetch
    Output: the root element of the parsed document

    lxml is used rather than xml.etree.ElementTree so that the parser can be
    put into recover mode.
    """
    # Use recover to try to fix broken XML instead of failing on it
    parser = etree.XMLParser(recover=True)

    # The whole document is parsed inside the `with` block, so the HTTP
    # response is closed as soon as parsing finishes (it was left open before,
    # which adds up when this is called once per page of results).
    with urllib.request.urlopen(url) as content:
        tree = etree.parse(content, parser)

    return tree.getroot()

# Fetch the first page of records and print its root element as a sanity check
root = getRootFromUrl(archiveMetadataUrl)
print(root)

<Element {http://www.openarchives.org/OAI/2.0/}OAI-PMH at 0x10e80f5c8>


Extract descriptive metadata, creating three separate lists for text under the three headings "Scope and Contents," "Biographical / Historical," and "Processing Information."

In [3]:
def getTextBeneathTag(root, tagName, header):
    """Collect the text found beneath every element whose tag contains ``tagName``.

    Input:
        root    -- element to search; the element itself and all of its
                   descendants are examined
        tagName -- part of or the entirety of a tag name below which you want
                   to get text (matched as a substring of the element's tag)
        header  -- heading text to leave out: any piece of text containing it
                   is dropped; pass "" to keep all text
    Output:
        a list of text between tags contained within the inputted tagName,
        with one list element per tagName instance
    """
    text_list = []
    for child in root.iter():
        tag = child.tag
        # Comment and processing-instruction nodes have a non-string tag, so
        # the substring test below would raise TypeError on them; skip them.
        if not isinstance(tag, str) or tagName not in tag:
            continue
        pieces = [
            piece
            for piece in child.itertext()
            if not header or header not in piece
        ]
        # collapse newlines and other whitespace runs into single spaces
        text_list.append(" ".join("".join(pieces).split()))
    return text_list

# sc = getTextBeneathTag(root, "scopecontent", "Scope and Contents")
# bh = getTextBeneathTag(root, "bioghist", "Biographical / Historical")
# pi = getTextBeneathTag(root, "processinfo", "Processing Information")

In [4]:
# print(len(sc))
# print(sc)  # manual check verified that last text of this list matches last descriptive text of the XML

In [5]:
# print(len(bh))
# print(bh)  # manual check verified that last text of this list matches last descriptive text of the XML

In [6]:
# print(len(pi))
# print(pi)  # manual check verified that last text of this list matches last descriptive text of the XML

In [7]:
# resumptionToken = getTextBeneathTag(root, "resumptionToken", "")
# print(resumptionToken)

To continue accessing data, obtain the resumption token at the end of each page and append it to the base URL as the `resumptionToken` argument, in place of the other arguments given in the initial request (in our case, the `metadataPrefix` argument is dropped):

In [9]:
# archiveMetadataUrlShort = "http://lac-archives-live.is.ed.ac.uk:8082/?verb=ListRecords&"
# archiveMetadataUrlWithToken = archiveMetadataUrlShort + "resumptionToken=" + resumptionToken[0]
# print(archiveMetadataUrlWithToken)

In [16]:
def getDescriptiveMetadata(more, archiveMetadataUrlShort, startingPrefix, sc, bh, pi):
    """Page through the archive's records, collecting descriptive text per page.

    Input:
        more                    -- if False, only the first page is fetched;
                                   if True, resumption tokens are followed
                                   until a page provides none
        archiveMetadataUrlShort -- request URL up to and including the trailing
                                   "&", to which the arguments are appended
        startingPrefix          -- argument string for the first request
        sc, bh, pi              -- lists that receive, for every page fetched,
                                   one list of "scopecontent", "bioghist" and
                                   "processinfo" texts respectively
    Output:
        the same sc, bh and pi lists (they are extended in place)

    Prints the number of pages fetched.
    """
    pageUrl = archiveMetadataUrlShort + startingPrefix
    pagesFetched = 0

    while True:
        root = getRootFromUrl(pageUrl)
        sc.append(getTextBeneathTag(root, "scopecontent", "Scope and Contents"))
        bh.append(getTextBeneathTag(root, "bioghist", "Biographical / Historical"))
        pi.append(getTextBeneathTag(root, "processinfo", "Processing Information"))
        pagesFetched += 1

        resumptionToken = getTextBeneathTag(root, "resumptionToken", "")
        # Stop when there is no token element, or when the token is empty:
        # an empty resumptionToken element marks the final page, and following
        # it would issue a pointless request with a blank token.
        if not more or not resumptionToken or not resumptionToken[0]:
            break

        # NOTE(review): the token is appended as-is; if tokens can contain
        # characters that are special in URLs they may need percent-encoding.
        pageUrl = archiveMetadataUrlShort + "resumptionToken=" + resumptionToken[0]

    # The figure printed is the number of pages fetched
    print(str(pagesFetched) + " resumption tokens")
    return sc, bh, pi

In [17]:
# Request URL split into the shared base and the arguments for the first page
url = "http://lac-archives-live.is.ed.ac.uk:8082/?verb=ListRecords&"
startPrefix = "metadataPrefix=oai_ead"

# One accumulator per heading; each receives one list of texts per page
sc, bh, pi = [], [], []

sc, bh, pi = getDescriptiveMetadata(
    more=True,
    archiveMetadataUrlShort=url,
    startingPrefix=startPrefix,
    sc=sc,
    bh=bh,
    pi=pi,
)

1232 resumption tokens


In [18]:
# Each list holds one entry per page fetched, so the three lengths should match
for pagedTexts in (sc, bh, pi):
    print(len(pagedTexts))

1232
1232
1232


In [19]:
# Peek at the processing-information texts from the first ten pages
print(pi[:10])

[['Catalogued by Emma Anthony, Project Archivist, Nov 2013, utilising a preliminary handlist created by Meghan Cote, Project Archivist, 2006.'], ["Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007. Entered in catalogue in pen.", "Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007.", "Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007.", "Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007.", "Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007.", "Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007.", "Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007.", "Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007. Also given Red No.266", "Archivist's NoteEAD conver

In [21]:
# Scope-and-contents texts for pages 1230 and 1231 (the last two when 1232 pages were fetched)
print(sc[1230:1232])

[['See External Documents (below) for details.'], ["Content Description This archive comprises the personal and working papers of Kenneth Murray, including: awards and honours; biographical materials including photographs and films; material relating to Kenneth's education and early career, including school certificates; personal and professional correspondence; papers relating to Kenneth's's work at the University of Edinburgh, including departmental papers, teaching materials and staff and student files; papers relating to Biogen, including a large number of legal files; papers relating to the Darwin Trust of Edinburgh, chiefly financial information; research notes and data, including laboratory notebooks and material created by technicians, collaborators and students; papers relating to grants and funding bodies; files relating to Kenneth's membership of and involvement with various committees, organisations and societies; material relating to events, conferences, workshops and symp

In [24]:
# Spot-check the biographical / historical texts for pages 190 through 194
print(bh[190:195])

[['The Rev. Alexander Dewar served in the missionary community of the Eastern Cape, South Africa, and with the Free Church of Scotland at Livingstonia, Malawi.'], ["George Walter Prothero was born on 14 October 1848 in Wiltshire. He was educated at Eton and studied at King's College, Cambridge and at the University of Bonn. He became an Assistant Master at Eton and then Lecturer at the Universities of Nottingham and Leicester. In 1876 he became a Lecturer in History and Tutor at King's College and in 1894 he was appointed Professor of History at Edinburgh University. Between 1901 and 1905 he was President of the Royal Historical Society. Prothero lectured in Cambridge, in Boston, at Johns Hopkins, and in Oxford. In 1916 he was Governor of Holloway College, and between 1918 and 1919 he was the Director of the Historical Section at the Foreign Office. After the end of the First World War, Prothero was a member of the British Peace Delegation, 1919. His publications includeLife and times 