# Analyzing Archival Metadata Descriptions 
From the University of Edinburgh's Archives online catalog, [ArchivesSpace](https://archives.collections.ed.ac.uk/).

#### Archives' Overview
TBD

#### Sources
The University of Edinburgh.  2016.  *Directory of Collections.*  Third Millenium Publishing, pp. 186-7.

### 1. OAI EAD Harvesting
**Download descriptive metadata from the Archives' online catalog using OAI EAD harvesting.**

*Note: OAI stands for Open Archives Initiative.  [Click here](https://www.openarchives.org/pmh/) to read about the OAI's Protocol for Metadata Harvesting (OAI-PMH).*

Download the libraries needed for analysis:

In [2]:
# Libraries for parsing XML from URL
import xml.dom.minidom
import urllib.request
import urllib
import xml.etree.ElementTree as ET
from lxml import etree

# Other useful libraries
import numpy as np
import re

Obtain the root of the XML data given a URL:

In [3]:
archiveMetadataUrl = "http://lac-archives-live.is.ed.ac.uk:8082/?verb=ListRecords&metadataPrefix=oai_ead"

def getRootFromUrl(url):
    content = urllib.request.urlopen(url)

    #tree = ET.parse(content)
    parser = etree.XMLParser(recover=True)  # Use recover to try to fix broken XML
    tree = etree.parse(content, parser)
    
    root = tree.getroot()
    return root

root = getRootFromUrl(archiveMetadataUrl)
print(root)

<Element {http://www.openarchives.org/OAI/2.0/}OAI-PMH at 0x11922c048>


Write functions to extract text from specified XML tags and to write that text to a file:

In [4]:
# Input: an XML root, a tag name, and part of or the entirety of a tag name below 
#        which you want to get text 
# Output: a list of text between tags contained within the inputted tagName, 
#         with one list element per tagName instance
def getTextBeneathTag(root, tagName, header):
    text_list = []
    for child in root.iter():
        tag = child.tag
        if tagName in tag:
            text_elem = ""
            for subchild_text in child.itertext():
                if header:
                    if header not in subchild_text:
                        text_elem = text_elem + subchild_text
                else:
                    text_elem = text_elem + subchild_text
            # replace each newline with a single space
            text_elem = " ".join(text_elem.split())
            if text_elem not in text_list:
                text_list.append(text_elem)
    return text_list

In [5]:
# Input: the name (or file path) of the file you wish to output
#        and the name of the list (an array of strings) to write to that file
# Output: the file written where inputted and the file's wordcount
def writeFileFromList(fileName, listName):
    with open(fileName, 'w') as f:
        total_words = 0
        for text in listName:
            total_words += len(re.findall("(\w+)(-*)(\w*)", text))
            f.write(text)
    f.close()
    return total_words

#### 1.1 Extract Descriptions by Metadata Field
Create three separate lists for text under the three headings "Scope and Contents," "Biographical / Historical," and "Processing Information:"

To continue accessing data, obtain the resumption token at the end of each page and add it to the end of the url, replacing any prefixes provided in the initial url (in our case, exclude the metadata prefix):

In [4]:
# def getDescriptiveMetadata(more, archiveMetadataUrlShort, startingPrefix, sc, bh, pi):    
   
#     archiveMetadataUrlWithPrefix = archiveMetadataUrlShort + startingPrefix
#     root = getRootFromUrl(archiveMetadataUrlWithPrefix)
#     sc = sc + getTextBeneathTag(root, "scopecontent", "Scope and Contents")
#     bh = bh + getTextBeneathTag(root, "bioghist", "Biographical / Historical")
#     pi = pi + getTextBeneathTag(root, "processinfo", "Processing Information")
#     resumptionToken = getTextBeneathTag(root, "resumptionToken", "")
    
#     if len(resumptionToken) == 0:
#         more = False
#     i = 1
    
#     while more:
#         archiveMetadataUrlWithToken = archiveMetadataUrlShort + "resumptionToken=" + resumptionToken[0]
#         root = getRootFromUrl(archiveMetadataUrlWithToken)
#         sc = sc + getTextBeneathTag(root, "scopecontent", "Scope and Contents")
#         bh = bh + getTextBeneathTag(root, "bioghist", "Biographical / Historical")
#         pi = pi + getTextBeneathTag(root, "processinfo", "Processing Information")
#         resumptionToken = getTextBeneathTag(root, "resumptionToken", "")
#         if len(resumptionToken) == 0:
#             more = False
#         i += 1
    
#     print(str(i) + " resumption tokens")
#     return sc, bh, pi

In [5]:
# url = "http://lac-archives-live.is.ed.ac.uk:8082/?verb=ListRecords&"
# startPrefix = "metadataPrefix=oai_ead"
# sc = []
# bh = []
# pi = []
# sc, bh, pi = getDescriptiveMetadata(True, url, startPrefix, sc, bh, pi)

Write unique data to files for analysis with natural language processing:

In [6]:
# # Remove duplicate values from the lists
# sc_unique = np.unique(sc)
# bh_unique = np.unique(bh)
# pi_unique = np.unique(pi)
# # print(bh[1222:1232])
# print(len(sc_unique), len(bh_unique), len(pi_unique))

In [8]:
# Write sets to files (one for each set and one combined)
# sc_words = writeFileFromList("UoE_ArchivesMetadata_ScopeAndContents.txt", sc_unique)
# bh_words = writeFileFromList("UoE_ArchivesMetadata_BiographicalHistorical.txt", bh_unique)
# pi_words = writeFileFromList("UoE_ArchivesMetadata_ProcessInfo.txt", pi_unique)
# print(sc_words, bh_words, pi_words)

#### 1.2 Extract Descriptive Metadata by Fonds
Create a dictionary of fonds-level dictionaries, where the fonds' unitids are keys and the fonds-level dictionaries are values, and the fonds-level dictionaries have three key-value pairs for the metadata fields "Unit ID," "Scope and Contents," "Biographical / Historical," and "Processing Information:"

*Note:* fonds *is the archival term for* collection

In [6]:
# Get the text of the scopecontent, bioghist, and processinfo XML elements
# (the descriptions written for the metadata fields named Scope & Contents,
#  Biographical / Historical, and Processing Information, as well as an ID)

def getDescriptiveMetadataByFonds(more, archiveMetadataUrlShort, startingPrefix, descriptions):
    descs_by_id = {}
    
    archiveMetadataUrlWithPrefix = archiveMetadataUrlShort + startingPrefix
    root = getRootFromUrl(archiveMetadataUrlWithPrefix)
    
    # Find the start of the fonds
    for child in root.iter():
        tag = child.tag
        if "eadid" in tag:
            fonds_id = (getTextBeneathTag(child, "eadid", None))[0]
        if "archdesc" in tag:
            unitid = getTextBeneathTag(child, "unitid", None)
            sc = getTextBeneathTag(child, "scopecontent", "Scope and Contents")
            bh = getTextBeneathTag(child, "bioghist", "Biographical / Historical")
            pi = getTextBeneathTag(child, "processinfo", "Processing Information")
            # Store descriptive metadata in a dictionary as sets
            # (Sets are immutable and do not contain duplicates)
            descs = {"unitids": set(unitid),"scopecontent" : set(sc), "bioghist" : set(bh), "processinfo" : set(pi)}
            # Add the fonds' descriptive metadata to the list of all fonds' descriptive metadata
            descs_by_id[fonds_id] = descs
    
    # Determine if there's more fonds' metadata
    resumptionToken = getTextBeneathTag(root, "resumptionToken", "")
    if len(resumptionToken) == 0:
        more = False
    i = 1
    
    # If there is more fonds' metadata, extract text from the same metadata fields as for the first fonds
    while more:
        archiveMetadataUrlWithToken = archiveMetadataUrlShort + "resumptionToken=" + resumptionToken[0]
        root = getRootFromUrl(archiveMetadataUrlWithToken)

        # Find the next fonds
        for child in root.iter():
            tag = child.tag
            if "eadid" in tag:
                fonds_id = (getTextBeneathTag(child, "eadid", None))[0]
            if "archdesc" in tag:
                unitid = getTextBeneathTag(child, "unitid", None)
                sc = getTextBeneathTag(child, "scopecontent", "Scope and Contents")
                bh = getTextBeneathTag(child, "bioghist", "Biographical / Historical")
                pi = getTextBeneathTag(child, "processinfo", "Processing Information")
                # Store descriptive metadata in a dictionary
                # (Sets are immutable and do not contain duplicates)
                descs = {"unitids": set(unitid),"scopecontent" : set(sc), "bioghist" : set(bh), "processinfo" : set(pi)}
                # Add the fonds' descriptive metadata to the list of all fonds' descriptive metadata,
                # using the first unitid (which should be the fonds-level ID) as a key
                descs_by_id[fonds_id] = descs
        
        # Determine if there's more fonds' metadata
        resumptionToken = getTextBeneathTag(root, "resumptionToken", "")
        if len(resumptionToken) == 0:
            more = False
        i += 1
    
    print(str(i) + " resumption tokens")
    return descs_by_id

In [7]:
url = "http://lac-archives-live.is.ed.ac.uk:8082/?verb=ListRecords&"
startPrefix = "metadataPrefix=oai_ead"
descriptions = []

desc_by_fonds = getDescriptiveMetadataByFonds(True, url, startPrefix, descriptions)
print(len(desc_by_fonds))

1231 resumption tokens
987


In [8]:
fonds_ids = list(desc_by_fonds.keys())
print(fonds_ids[500], ": ", desc_by_fonds[fonds_ids[500]])

Coll-1544 :  {'unitids': {'Coll-1544'}, 'scopecontent': {"The letter is 1.5 pages folio, countersigned by 'J. Holcroft', and with address on the verso to Falkland, 'and to our Chancellor there now being, and to the Deputie chiefe Governor or Governors, Chauncellors or keeper of the greate Seale [...] Given under our signett at the Pallace of Westmr. the fifth day of december in the twentieth yeare of or. Raigne of England, Fraunce and Ireland, and of Scotland the six and fiftieth'."}, 'bioghist': {"The letter signed 'James R' to Henry Cary, 1st Viscount Falkland, as Lord Deputy of Ireland, is countersigned by J. Holcroft. It concerns the succession of Sir Richard Bolton by his son Edward, as Solicitor General in Ireland, 'to hold, exercise and enjoy the same during our pleasure [...]'. It is dated 5 December 1622. Henry Cary, an English landowner and politician, son of Sir Edward Cary and grandson of Sir Henry Knevet, master of the jewel office to Queen Elizabeth and King James, was bo

Looking good!  We have all four metadata fields expected: `unitid`, `scopecontent`, `bioghist`, and `processinfo`.

### 2. Analysis of Extracted Metadata Descriptions

**Determine how many fonds have a description in the "Processing Information" field.**

In [24]:
process_info = []
no_pi_count = 0
for key,value in desc_by_fonds.items():
    pi = list(value['processinfo'])
    if len(pi) == 0:
        no_pi_count += 1
    else:
        for desc in pi:
            process_info += [desc]
print("Fonds WITHOUT Processing Information:", no_pi_count, "("+str((no_pi_count/len(desc_by_fonds))*100)+"%)")
print("Fonds WITH Processing Information:", len(desc_by_fonds)-no_pi_count, "("+str(((len(desc_by_fonds)-no_pi_count)/len(desc_by_fonds))*100)+"%)")
print("Total Processing Information Descriptions:", len(process_info))

Fonds WITHOUT Processing Information: 556 (56.33232016210739%)
Fonds WITH Processing Information: 431 (43.66767983789261%)
Total Processing Information Descriptions: 1562


Almost half of fonds (approximately 43.7%) have a description in the Processing Information field, which is where a date documenting when the description was written may be found.

Extract any dates (as a four-digit year) in the Processing Information descriptions to find the earliest and latest ones:

In [10]:
# pi = process_info[0]
# print(re.findall("\d{4}", pi))
# print(pi)

date_list = []
for pi in process_info:
    dates = re.findall("\d{4}", pi)
    for d in dates:
        date_list += [d]

unique_dates = (list(set(date_list)))
unique_dates.sort()
# print(len(set(date_list)))
print(unique_dates)
# date_list.sort()
# print(date_list)

['0094', '0221', '1009', '1077', '1078', '1159', '1228', '1290', '1292', '1295', '1296', '1300', '1303', '1315', '1317', '1318', '1362', '1370', '1380', '1385', '1386', '1400', '1452', '1466', '1504', '1521', '1522', '1526', '1534', '1549', '1560', '1561', '1574', '1599', '1656', '1659', '1660', '1775', '1776', '1824', '1831', '1851', '1856', '1865', '1868', '1872', '1877', '1878', '1883', '1890', '1892', '1896', '1937', '1938', '1939', '1956', '1968', '1985', '1995', '1998', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2034', '2035', '2046', '2156', '2275', '2379', '3161', '3166', '3175', '3176']


Manually review the dates to determine which are actual dates of documentation (clearly some are not based on the list above):

In [11]:
for pi in process_info:
    dates = re.findall("\d{4}", pi)
    for d in dates:
        if int(d) < 1900:
            print(pi)
        break

before 1890
Archivist's NoteFrom the date and other evidence it is clear that this John Rolland is the translator of the Tales of the Seven Sages, which was executed by ' John Rolland in Dalkeith' about 1560. He also wrote The Court of Venus. Margaret Forrest 10th January 2013
Archivist's NoteNo date, but about 25th May 1228, when it was confirmed by King Alexander the Second. See Liber Ecclesie de Scon. pp. 44, 45.
Archivist's NoteThis writ is much torn. Held in small open top box with Laing Charter GB 237 Coll-1/5/1659 Margaret Forrest 29th November 2011
Archivist's NoteNo date, before 1303, see No. 19. Hugh Gobyon was sheriff from 1292 to 1295. Possibly a chirograph.
Archivist's NoteNo date, c1318.
Archivist's NoteNote on back of charter: "Charter of Hugo de Gourlaye of 12 acres of land in Linton about 1290"
Archivist's NoteNot dated, c1385. Sketch of a man's face inside the first letter of the first word, "Omnibus". Margaret Forrest 22nd April 2010
Archivist's NoteNo date, but c. 1

In [12]:
for pi in process_info:
    dates = re.findall("\d{4}", pi)
    for d in dates:
        if int(d) >= 1900 and int(d) < 2000:
            print(pi)
        break

Archivist's Note Note for reader: Published in 1956 and referencing 19th century popular culture, this annotated proof copy of "Their first ten years; Victorian Childhood" (Gen.2156/1/6) contains outdated language and ideas concerning race, religion, gender and class. LMcL Aug 2020
Original handlist created in May 1998 by Kate McDonald, Researcher at The Centre of African Studies; then keyed into ArchivsSpace by Stephen Willis in August-September 2020.
Catalogue created by Andrew Grout in June 1998; keyed into ArchivesSpace by Jack Green in June 2020.
Archivist's NoteThis charter is contained in an envelope with additional paper notes: 1. Transcription of charter in Anglo-Saxon; 2. Translation of charter; 3. Exhibition marker/note; 4. Photocopy of guides to the charter from unidentified sources. Printed from this original in Birch's Cartularium Saxonicum (or possibly Calendarium Saxonarium), vol ii. p 79 See A.J. Robertson, Anglo-Saxon Charters (C.U.P. 1956) no VIII and notesAlso Journ

In [14]:
for pi in process_info:
    dates = re.findall("\d{4}", pi)
    for d in dates:
        if int(d) >= 2000:
            print(pi)
        break

Catalogued by Emma Anthony, Project Archivist, Nov 2013, utilising a preliminary handlist created by Meghan Cote, Project Archivist, 2006.
Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007. 2 Packets
Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007. Difficulty reading title! Added to list in pen.
Older handlist keyed into ArchivesSpace by Stephen Willis in July 2019.
Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007. 3 Packages. Also given Red No.110.
Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007. 2 Packages. Note reads: All the engineering drawings & R.A.'s presentation set apparently in Blyth & Blyth coll., S.R.O. (uncatalogued) Info from reader 2/95
Archivist's NoteEAD converted from Microsoft Excel spreadsheet by Grant Buttars, 30 May 2007. Entered in catalogue in pen.
Archivist's NoteEAD converted from Micr

In [17]:
# Valid years of documention (years a description was written for the Archives' catalogue)
doc_yrs = [1896, 1995, 1998, 2018, 2011, 2010, 2016, 2015, 2014, 2012, 2013, 2006, 2007, 2019, 2017, 2004, 2009, 2002, 2000, 2001, 2020, 2003, 2005]
doc_yrs.sort()
print("Years a description was written for the Archives' catalogue:", doc_yrs)

Years a description was written for the Archives' catalogue: [1896, 1995, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]


So the **earliest** known documentation year is **1896** and the **latest** known documentation year is **2020** (the current year at the time of coding).

In [25]:
desc_with_valid_yr = 0
for pi in process_info:
    dates = re.findall("\d{4}", pi)
    for d in dates:
        if int(d) in doc_yrs:
            desc_with_valid_yr += 1

print("Estimate of total Processing Information fields with a documentation date:", desc_with_valid_yr, "out of", len(process_info), "(", (desc_with_valid_yr/len(process_info))*100, "%)")

Estimate of total Processing Information fields with a documentation date: 1374 out of 1562 ( 87.9641485275288 %)


Among fonds with a Processing Information description, the majority of the descriptions contain a date of documentation!

### 3. Annotation Preparation

**Divide the descriptions into 5 groups to to designate subsets of the metadata for annotation and classification algorithm development: 1 dev set, 3 training sets, and 1 blind test set.**

In [26]:
iMax = int(len(desc_by_fonds)/5)
first = {}
second = {}
third = {}
fourth = {}
fifth = {}
dictionaries = [first, second, third, fourth, fifth]
i = 0
j = 0
for key, value in desc_by_fonds.items():
    d = dictionaries[j]
    if i < iMax:
        d[key] = value
        i += 1
    else:
        i = 0
        j += 1

In [27]:
print(len(first), len(second), len(third), len(fourth), len(fifth))

197 197 197 197 195


Write the extracted descriptions to files for annotation and algorithm training, designating the first as the blind test set, the second as the dev set, and the last three as training sets:

In [28]:
# Input: the name (or file path) of the file you wish to output
#        and the name of the list (an array of strings) to write to that file
# Output: the file written where inputted and the file's wordcount
def writeFileFromDict(fileName, dictName):
    with open(fileName, 'w') as f:
        for key,value in dictName.items():
            heading = "Fonds ID:"+str(key)+"\n"
            f.write(heading)
            for k,v in value.items():
                field = str(k)+"\n"
                f.write(field)
                desc = str(v)+"\n"
                f.write(desc)
    f.close()
    return "File written"

In [29]:
writeFileFromDict('UoEArchivesMetadata_ID-SC-BH-PI_blindtestset.txt', first)
writeFileFromDict('UoEArchivesMetadata_ID-SC-BH-PI_devset.txt', second)
writeFileFromDict('UoEArchivesMetadata_ID-SC-BH-PI_trainingset1.txt', third)
writeFileFromDict('UoEArchivesMetadata_ID-SC-BH-PI_trainingset2.txt', fourth)
writeFileFromDict('UoEArchivesMetadata_ID-SC-BH-PI_trainingset3.txt', fifth)

'File written'

In [30]:
# writeFileFromDict('UoEArchivesMetadataDescs.txt', desc_by_fonds)
# writeFileFromDict('UoEArchivesMetadata_BHPI.txt', desc_by_fonds)



# # Write sets to files (one for each set and one combined)
# i = 0
# word_counts = []
# for fonds in desc_by_fonds:
#     fonds_file = "UoE_ArchivesMetadataDescs_Fonds" + str(i) + ".txt"  # SHOULD EXTRACT UNITID TAG'S TEXT FOR FILE NAMES!
#     fonds_lists = ["Biographical / Historical:\n"] + list(fonds["bioghist"]) + ["\nScope and Contents:\n"] + list(fonds["scopecontent"]) + ["\nProcessing Information:\n"] + list(fonds["processinfo"])
#     words = writeFileFromList(fonds_file, fonds_lists)
#     if words:
#         word_counts.append(int(words))
#     i += 1

In [41]:
# words = []
# for word in word_counts:
#     words += [int(word)]
# # print(words)
# print(min(words))
# # print(max(words))
# print(np.sum(words)/len(words))
# print(np.std(words))

### 2. Natural Language Processing (NLP)
* Tool: NLTK from https://www.nltk.org/
* Reference: The NLTK Book at http://www.nltk.org/book/ 

#### 2.1 Sentence Segmentation
**Create a file of sentences for annotation.**

Download the libraries needed for analysis:

In [4]:
# Libraries for data analysis and visualization
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
from scipy.stats import mode

# Libraries for Natural Language Processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.text import Text
# nltk.download('punkt')
from nltk.probability import FreqDist
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.corpus import PlaintextCorpusReader
from collections import Counter
from wordcloud import WordCloud
from nltk.draw.dispersion import dispersion_plot
from nltk.collocations import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
# nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag

# To avoid SSL error when downloading NLTK packages...
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download()

# Other useful libraries
import string
import csv
import re

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


Segment the descriptive metadata into sentences:

In [15]:
# sc_sentences = []
# for s in sc_unique:
#     sentences = sent_tokenize(s)
#     sc_sentences += sentences
# assert len(sc_sentences) > len(sc_unique)  # passed
# sc_sentences[0:10]  # passed

# Input: A Python list of strings or Numpy array of strings
# Output: A list of sentences
def segmentSentences(listOfStrings):
    listOfSentences = []
    for s in listOfStrings:
        sentences = sent_tokenize(s)
        listOfSentences += sentences
    return listOfSentences

In [5]:
# sc_sentences = segmentSentences(sc_unique)
# bh_sentences = segmentSentences(bh_unique)
# pi_sentences = segmentSentences(pi_unique)

In [None]:
bh_sentences[100:105]

In [None]:
all_sentences = sc_sentences + bh_sentences + pi_sentences
assert len(all_sentences) == len(sc_sentences) + len(bh_sentences) + len(pi_sentences)

In [None]:
# total_words = writeFileFromList("UoE_ArchivesMetadata_AllSentences.txt", all_sentences)
# print(total_words)

In [None]:
print(len(bh_sentences))
print(len(sc_sentences))
print(len(pi_sentences))

In [None]:
print(len(sc_sentences)+len(bh_sentences))
print(len(sc_sentences)+len(bh_sentences)+len(pi_sentences))

In [None]:
file = open('UoE_ArchivesMetadata_AllSentences.txt', 'r')
sample_text = file.read()
sample_tokens = word_tokenize(sample_text)
print(sample_tokens[:10])
print(len(sample_tokens))

In [None]:
file = open('UoE_ArchivesMetadata_ScopeAndContents.txt', 'r')
sample_text1 = file.read()
sample_tokens1 = word_tokenize(sample_text1)
print(sample_tokens1[:10])
print(len(sample_tokens1))

In [None]:
file = open('UoE_ArchivesMetadata_BiographicalHistorical.txt', 'r')
sample_text2 = file.read()
sample_tokens2 = word_tokenize(sample_text2)
print(sample_tokens2[:10])
print(len(sample_tokens2))

In [None]:
file = open('UoE_ArchivesMetadata_ProcessInfo.txt', 'r')
sample_text3 = file.read()
sample_tokens3 = word_tokenize(sample_text3)
print(sample_tokens3[:10])
print(len(sample_tokens3))

In [None]:
word_count = 0
for t in sample_tokens:
    if t.isalpha():
        word_count += 1
print("Total words:", word_count)

word_count1 = 0
for t in sample_tokens1:
    if t.isalpha():
        word_count1 += 1
print("BH words:", word_count1)

word_count2 = 0
for t in sample_tokens2:
    if t.isalpha():
        word_count2 += 1
print("SC words:", word_count2)

word_count3 = 0
for t in sample_tokens3:
    if t.isalpha():
        word_count3 += 1
print("PI words:", word_count3)

#### 2.2 Sentence Segmentation by Fonds

In [11]:
# corpus_folder = 'UoE_Archives_DescsByFonds/'
# wordlists = PlaintextCorpusReader(corpus_folder, '.*', encoding='utf8')
# corpus_tokens = wordlists.words()
# print(corpus_tokens[:10])

In [12]:
file = open('UoE_Archives_DescsByFonds/UoE_ArchivesMetadataDescs_Fonds0.txt','r')  # replace the path with the one on your computer
file_raw = file.read()
file_tokens = word_tokenize(file_raw)
lower_file_tokens = [word.lower() for word in file_tokens]
lower_file_tokens[0:10]


['biographical',
 '/',
 'historical',
 ':',
 'ledermann',
 'leaving',
 'for',
 'st',
 'andrews',
 'meant']

# NEXT: distinguish between cataloguers' words and quotes from items being catalogued