In [None]:
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_non_alphanum
import nltk
import pandas as pd

In [4]:
#Note: for this to work, the NoMoreSilence_ProjectData.tsv file needs to be in the
#same directory (folder) as this notebook file, and that should also be the directory
#you started the jupyter notebook from.
#sep='\t' tells pandas that the columns in the file are separated by tabs.
df = pd.read_csv('NoMoreSilence_ProjectData.tsv', sep='\t')

In [5]:
#show the names of the columns in the dataframe, so we know what data we're working with
df.columns

Index(['Collection Title', 'Title', 'Local Identifier ', 'Type', 'Date ',
       'Date Type', 'Publication/Origination Info', 'Creator 1 Name',
       'Creator 1 NameType', 'Creator 1 Source', 'Creator 2 Name',
       'Creator 2 NameType', 'Creator 2 Source', 'Creator 3 Name',
       'Creator 3 NameType', 'Creator 3 Source', 'Creator 4 Name',
       'Creator 4 NameType', 'Creator 4 Source', 'Format/Physical Description',
       'Language ', 'Language Code', 'Copyright Status', 'Copyright Statement',
       'Source', 'Subject (Name) 1 Name', 'Subject (Name) 1 Name Type',
       'Subject (Name) 1 Source', 'Subject (Name) 2 Name',
       'Subject (Name) 2 Name Type', 'Subject (Name) 2 Source',
       'Subject (Name) 3 Name', 'Subject (Name) 3 Name Type',
       'Subject (Name) 3 Source', 'Subject (Topic) 1 Heading',
       'Subject (Topic) 1 Heading Type', 'Subject (Topic) 1 Source',
       'Subject (Topic) 2 Heading', 'Subject (Topic) 2 Heading Type',
       'Subject (Topic) 2 Source', '

In [6]:
#Peek at the first ten values of the 'Source' column, to see whether it holds something we can
#use to categorize by collection, like the call number. This looks pretty good, let's use it.
df['Source'].head(10)

0    AIDS Legal Referral Panel Records, 2000-46, Bo...
1    AIDS Legal Referral Panel Records, 2000-46, Bo...
2    AIDS Legal Referral Panel Records, 2000-46, Bo...
3    AIDS Legal Referral Panel Records, 2000-46, Bo...
4    AIDS Legal Referral Panel Records, 2000-46, Bo...
5    AIDS Legal Referral Panel Records, 2000-46, Bo...
6    AIDS Legal Referral Panel Records, 2000-46, Bo...
7    AIDS Legal Referral Panel Records, 2000-46, Bo...
8    AIDS Legal Referral Panel Records, 2000-46, Bo...
9    AIDS Legal Referral Panel Records, 2000-46, Bo...
Name: Source, dtype: object

In [7]:
#Store the whole 'Source' column in a variable, which we'll use to experiment with pulling out
#just the call number for each collection.
#The for loop goes through each source and uses the .split method to break it into a list, with
#one list item per element of the source. The elements are separated by a comma followed by a
#space, which is why we pass ', ' to .split.
#Splitting the raw values threw an error, because one of the entries was a float and not a string.
#We have to choose to either turn it into a string or to ignore it. Here it is turned into a
#string with str(), but it may actually be better to ignore it (with an if/else statement).
sources = df['Source']
for entry in sources:
    print(str(entry).split(', '))

['AIDS Legal Referral Panel Records', '2000-46', 'Box 3', 'Folder 2']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 4', 'Folder 4']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 7', 'Folder 2']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 1', 'Folder 10']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 3', 'Folder 6']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 5', 'Folder 4']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 4', 'Folder 8']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 1', 'Folder 1']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 3', 'Folder 13']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 6', 'Folder 1']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 3', 'Folder 10']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 1', 'Folder 6']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 7', 'Folder 1']
['AIDS Legal Referral Panel Records', '2000-46', 'Box 2', 'Folder 7']
['AIDS Legal Refe

In [8]:
#Note that, from the output above, it is the Mobilization Against AIDS records that contain the spaces
#in s_list[1] that were giving us problems.
#Additionally, in People v Owens Bathhouse we find completely empty entries, which had to be treated as blank.
#Also, Barbara Cameron has a totally blank 2nd field in 1 entry.

In [9]:
#Building on the above to get the call number more completely. Note that this time we skip any
#row that has no data for the source; that is what the "type(source) is not str" check does.
collection_list = []
for source in sources:
    if type(source) is not str:
        continue
    fields = source.split(', ')
    if len(fields) < 2:
        #there is nothing after the first field
        collection_list.append('no data')
    elif fields[1] != '':
        collection_list.append(fields[1])
    elif len(fields) > 2:
        #the second field is blank, so take the third field instead
        collection_list.append(fields[2])
    else:
        collection_list.append('no data')

#Make a set from the list to pull out all the unique values, so we can see the extent of the
#values we're getting.
#We can see that there are some duplicates due to trailing spaces, so we'll need to fix that.
collection_set = set(collection_list)
print(sorted(collection_set))

['2000-46', '2003-09', '2005-13', 'GLC 63', 'MSS 2000-31', 'MSS 2000-31 ', 'MSS 2001-04', 'MSS 2009-04', 'MSS 95-03', 'MSS 95-04', 'MSS 98-47', 'MSS 98-48', 'SFH 31', 'SFH 71', 'no data']


In [10]:
#We're almost there, but we want the code to remove stray spaces around the call number and to
#replace the spaces inside it with dashes, for cleaner data. The below does that.
#Rows whose source is not a string get 'blank'; rows where no call number is found get 'no-data'.
collection_list = []
for source in sources:
    call_no = 'blank'
    if isinstance(source, str):
        fields = source.split(', ')
        #The call number is normally the second field; when that one is blank it is the third.
        #Calling .split() with no arguments splits on any run of whitespace and ignores leading
        #and trailing whitespace, so joining the pieces with '-' cleans up every stray space
        #(not just a single trailing one).
        candidates = ['-'.join(field.split()) for field in fields[1:3]]
        #keep the first candidate that is not empty, or 'no-data' if there is none
        call_no = next((candidate for candidate in candidates if candidate), 'no-data')
    collection_list.append(call_no)

collection_set = set(collection_list)
print(sorted(collection_set))
        

['2000-46', '2003-09', '2005-13', 'GLC-63', 'MSS-2000-31', 'MSS-2001-04', 'MSS-2009-04', 'MSS-95-03', 'MSS-95-04', 'MSS-98-47', 'MSS-98-48', 'SFH-31', 'SFH-71', 'blank', 'no-data']


In [11]:
#Now we need to take the code above and turn it into a function that will run on the "Source" field
#for every row in the dataframe. We need to define its inputs a little differently, and do the function
#definition.
def get_call_no(row):
    """Return the cleaned-up call number found in row['Source'].

    row -- anything that can be indexed with 'Source' (for example a dataframe row).

    The source is split on ', '. The call number is taken from the second field, or
    from the third field when the second one is blank. Whitespace around the call
    number is removed and whitespace inside it is replaced with a single dash.

    Returns 'blank' when row['Source'] is not a string, and 'no-data' when neither
    the second nor the third field contains anything.
    """
    source = row['Source']
    if not isinstance(source, str):
        return 'blank'
    fields = source.split(', ')
    # .split() with no arguments splits on any run of whitespace and ignores leading and
    # trailing whitespace, so this handles any number of stray spaces, not just one trailing one.
    candidates = ['-'.join(field.split()) for field in fields[1:3]]
    # first non-empty candidate wins: second field, otherwise third field
    return next((candidate for candidate in candidates if candidate), 'no-data')

In [12]:
#Use the function we just defined on every row of the dataframe (axis=1 means "row by row") to
#pull out the call number, and store the results in a new column called 'call_no'. The new column
#is created simply by assigning to df['call_no'].
df['call_no'] = df.apply(get_call_no, axis=1)

In [13]:
#if we simply display the dataframe now, we can see that our new 'call_no' column is there
#(it is the last column on the right):
df

Unnamed: 0,Collection Title,Title,Local Identifier,Type,Date,Date Type,Publication/Origination Info,Creator 1 Name,Creator 1 NameType,Creator 1 Source,...,Subject (Topic) 4 Heading Type,Subject (Topic) 4 Source,Subject (Topic) 5 Heading,Subject (Topic) 5 Heading Type,Subject (Topic) 5 Source,Subject (Topic) 6 Heading,Subject (Topic) 6 Heading Type,Subject (Topic) 6 Source,Ocr text,call_no
0,"AIDS Legal Referral Panel Records, 2000-46, Bo...","""Prop 64: The AIDS Initiative in California""",glbths_200046_003_002,text,1986,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,naf,...,topic,lcsh,,,,,,,PROPOSITION 64 The AIDSInitiativein California...,2000-46
1,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Management,glbths_200046_004_004,text,circa 1992,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,naf,...,topic,lcsh,,,,,,,MAKING YOUR WILL California State Aids Legal S...,2000-46
2,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Board Meetings,glbths_200046_009_005,text,1995-1996,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,naf,...,topic,lcsh,,,,,,,"January 11, 1997 Community Liaison Committee c...",2000-46
3,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Correspondence,glbths_200046_001_0010,text,1985-1987,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,naf,...,topic,lcsh,,,,,,,^ GREAT REPUBLIC IIMSURAIMCE COMPANY i 470 SOU...,2000-46
4,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Letters of support,glbths_200046_003_006,text,1993,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,naf,...,topic,lcsh,,,,,,,"SANFRANCISCOAIDSFOUNDATION P.O.BOX 426182,SANF...",2000-46
5,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Fundraising,glbths_200046_007_005,text,undated,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,naf,...,topic,lcsh,,,,,,,"WOMEN'S 1^ AIDS February 26, 1996 A NETWORK An...",2000-46
6,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Operations Procedures Manuals,glbths_200046_004_008,text,1993-2000,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,naf,...,topic,lcsh,,,,,,,r AIDS LEGAL REFERRAL PANEL Draft Operations M...,2000-46
7,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Audit,glbths_200046_001_001,text,1988-1992,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,naf,...,topic,lcsh,,,,,,,AIDS LEGAL REFERRAL PANEL OF THE SAN FRANCISCO...,2000-46
8,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Guardianship for Parents with HIV,glbths_200046_003_013,text,1993,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,,...,topic,lcsh,,,,,,,GUARDIANSHIPFORPARENTSWITHHIV ANADVOCATE'SGUID...,2000-46
9,"AIDS Legal Referral Panel Records, 2000-46, Bo...",Historical Documents,glbths_200046_008_006,text,undated,created,Digital resource published by the Regents of t...,AIDS Legal Referral Panel,corpname,naf,...,topic,lcsh,,,,,,,HISTORY AND ACCOMPLISHMENTS OF THE WOMEN'S AID...,2000-46


In [14]:
#using the .unique() method of the 'call_no' column, we can check the same thing we did above using set() --
#that there are no repeat values.
call_nums = df['call_no'].unique()
print(call_nums)

['2000-46' '2003-09' '2005-13' 'MSS-2000-31' 'blank' 'MSS-2001-04'
 'MSS-2009-04' 'MSS-95-03' 'MSS-95-04' 'no-data' 'MSS-98-47' 'MSS-98-48'
 'GLC-63' 'SFH-31' 'SFH-71']


In [15]:
#and now we can filter the dataframe by collection, simply by creating a variable that marks (True/False) which
#rows match a certain call number, and passing this as a selection to df:
act_up = df['call_no'] == 'MSS-98-47'
df[act_up]

Unnamed: 0,Collection Title,Title,Local Identifier,Type,Date,Date Type,Publication/Origination Info,Creator 1 Name,Creator 1 NameType,Creator 1 Source,...,Subject (Topic) 4 Heading Type,Subject (Topic) 4 Source,Subject (Topic) 5 Heading,Subject (Topic) 5 Heading Type,Subject (Topic) 5 Source,Subject (Topic) 6 Heading,Subject (Topic) 6 Heading Type,Subject (Topic) 6 Source,Ocr text,call_no
639,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",6th International AIDS Conference - Various Or...,ucsf_mss98-47_001_0016,text,1988-1993,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,The Quilt AnInternational AIDSMemorial CONTACT...,MSS-98-47
640,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",Newspaper Clippings,ucsf_mss98-47_001_0018,text,"1989-1990, undated",created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,Hdirin^cfnAid Cuts Disrupted in Phil^elphia (^...,MSS-98-47
641,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",ACT-UP Golden Gate - Treatment Issues Committee,ucsf_mss98-47_001_003,text,1991-1992,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,"To: AIDS Activists, Service Organizations, Com...",MSS-98-47
642,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",ATAC² - 2nd AIDS Treatment Activist Conference,ucsf_mss98-47_001_008,text,1990,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,CONFERENCESUMMARY A SummaryoftheSecondAIDSTrea...,MSS-98-47
643,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",6th International AIDS Conference - 1990 ACT-U...,ucsf_mss98-47_001_005,text,1989,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,EMBARGO FOR RELEASE: AIDSCoalition ToUnleash P...,MSS-98-47
644,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...","ACT-UP Washington, D.C.",ucsf_mss98-47_001_0014,text,1988-1989,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,i-» NGUF lesbian GAY AIDSCoalitionToUnleashPow...,MSS-98-47
645,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",AIDS Action News (Toronto),ucsf_mss98-47_001_0015,text,"1988-1992, undated",created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,"Inside: If you'vefiadPCP, youmayneed HelpWante...",MSS-98-47
646,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",ATAC - AIDS Treatment Activist Conference,ucsf_mss98-47_001_0010,text,November 1990,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,"Sunday, 11 November 1990 - Quality Hotel Capit...",MSS-98-47
647,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",6th International AIDS Conference - San Franci...,ucsf_mss98-47_001_006,text,1990,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,Background Th* AIDS Coalition to Unlaash Power...,MSS-98-47
649,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",Treatment Issues ACT-UP Golden Gate and other ...,ucsf_mss98-47_001_0011,text,September 1991,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,CMngSTMJkS cancelled: SantaissickwithHIV-infec...,MSS-98-47


In [16]:
#or, we can write the same selection directly inside the brackets of df, without the extra variable:
df[df['call_no'] == 'MSS-98-47']

Unnamed: 0,Collection Title,Title,Local Identifier,Type,Date,Date Type,Publication/Origination Info,Creator 1 Name,Creator 1 NameType,Creator 1 Source,...,Subject (Topic) 4 Heading Type,Subject (Topic) 4 Source,Subject (Topic) 5 Heading,Subject (Topic) 5 Heading Type,Subject (Topic) 5 Source,Subject (Topic) 6 Heading,Subject (Topic) 6 Heading Type,Subject (Topic) 6 Source,Ocr text,call_no
639,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",6th International AIDS Conference - Various Or...,ucsf_mss98-47_001_0016,text,1988-1993,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,The Quilt AnInternational AIDSMemorial CONTACT...,MSS-98-47
640,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",Newspaper Clippings,ucsf_mss98-47_001_0018,text,"1989-1990, undated",created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,Hdirin^cfnAid Cuts Disrupted in Phil^elphia (^...,MSS-98-47
641,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",ACT-UP Golden Gate - Treatment Issues Committee,ucsf_mss98-47_001_003,text,1991-1992,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,"To: AIDS Activists, Service Organizations, Com...",MSS-98-47
642,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",ATAC² - 2nd AIDS Treatment Activist Conference,ucsf_mss98-47_001_008,text,1990,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,CONFERENCESUMMARY A SummaryoftheSecondAIDSTrea...,MSS-98-47
643,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",6th International AIDS Conference - 1990 ACT-U...,ucsf_mss98-47_001_005,text,1989,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,EMBARGO FOR RELEASE: AIDSCoalition ToUnleash P...,MSS-98-47
644,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...","ACT-UP Washington, D.C.",ucsf_mss98-47_001_0014,text,1988-1989,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,i-» NGUF lesbian GAY AIDSCoalitionToUnleashPow...,MSS-98-47
645,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",AIDS Action News (Toronto),ucsf_mss98-47_001_0015,text,"1988-1992, undated",created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,"Inside: If you'vefiadPCP, youmayneed HelpWante...",MSS-98-47
646,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",ATAC - AIDS Treatment Activist Conference,ucsf_mss98-47_001_0010,text,November 1990,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,"Sunday, 11 November 1990 - Quality Hotel Capit...",MSS-98-47
647,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",6th International AIDS Conference - San Franci...,ucsf_mss98-47_001_006,text,1990,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,Background Th* AIDS Coalition to Unlaash Power...,MSS-98-47
649,"ACT-UP Golden Gate Records, MSS 98-47, Box 1 F...",Treatment Issues ACT-UP Golden Gate and other ...,ucsf_mss98-47_001_0011,text,September 1991,created,Digital resource published by the Regents of t...,ACT UP Golden Gate (Organization),corpname,naf,...,,,,,,,,,CMngSTMJkS cancelled: SantaissickwithHIV-infec...,MSS-98-47


In [17]:
#Now, let's choose some collections to compare to each other with NLP, and prepare them to be analyzed.
#We'll make a new tokenized version of each collection and store it in a variable,
#and then we'll make a 'model' of each collection using GenSim and store that in another variable.
#Let's compare these 3 collections. We'll also record them here as a dictionary (collection name -> call number)
#in the hope that we might be able to re-use it later.
#NOTE(review): the 'Source' values for call number 2000-46 read "AIDS Legal Referral Panel Records", while the
#label below says "Network" -- confirm which name is intended before relying on these keys.
collections_comparison = {'Aids Legal Referral Network records':'2000-46', 'Womens AIDS Network records':'MSS-95-04',
                          'ACT-UP Golden Gate records':'MSS-98-47'}

In [20]:
#To prepare the text from each collection, we have to preprocess it into a form that's easier for a computer to use.
#For every document we strip out the non-alphanumeric characters, lowercase the text, remove the
#stopwords (the, a, and, etc.) and finally tokenize it, which basically turns it into a list of words.
#There is probably a better way to do this, but here the tokenized version of each collection is stored
#in its own variable. Let's do call number 2000-46 first.

#You may get an error from the code below if you have not installed the "punkt" package from nltk.
#See the next cell for how to do that if needed.

#Note also that we select only the 'Ocr text' column of the dataframe, which is important so that
#we don't try to tokenize all the metadata too. Entries that are not strings are skipped.
tokenized_2000_46 = [
    nltk.word_tokenize(remove_stopwords(strip_non_alphanum(text).lower()))
    for text in df.loc[df['call_no'] == '2000-46', 'Ocr text']
    if isinstance(text, str)
]

In [19]:
#run this cell if you need to download the 'punkt' tokenizer package from nltk.
#(If the tokenizing cell gave you an error about 'punkt', run this cell and then run that cell again.)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cmacquarie\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [22]:
#So what is a tokenized version of a text anyway?
#Run the code below and take a look (the output is long).
tokenized_2000_46

[['proposition',
  '64',
  'aidsinitiativein',
  'california',
  'preparedby',
  'senate',
  'officeofresearch',
  'elisabeth',
  'kersten',
  'director',
  'september',
  '1986',
  '207',
  's',
  'pgnbr',
  '1',
  'proposition',
  '64',
  'aids',
  'initiative',
  'incalifornia',
  'prepared',
  'senate',
  'officeof',
  'research',
  'kathryn',
  'duke',
  'j',
  'd',
  'm',
  'p',
  'h',
  'september',
  '1986',
  'pgnbr',
  '2',
  'thtngs',
  'public',
  'thinks',
  'long',
  'commonly',
  'attains',
  'think',
  'right',
  'samuel',
  'johnson',
  'lives',
  'poets',
  '1778',
  'kje',
  'thinking',
  'ofillness',
  'aspolitical',
  'recognize',
  'thepolitical',
  'dimension',
  'health',
  'care',
  'research',
  'example',
  'fact',
  'prevention',
  'lead',
  'poisoning',
  'curing',
  'sickle',
  'cell',
  'anemia',
  'glam',
  'orous',
  'financed',
  'heart',
  'transplants',
  'isstill',
  'difficult',
  'conceive',
  'diseaseitself',
  'political',
  'construct',
  'denn

In [23]:
#You can see that it's pretty much just a list (and a messy one at that).
#In fact, we can check the type and see that it is indeed a list. More precisely it is a list of
#lists: one inner list of words for each document that was tokenized.
type(tokenized_2000_46)

list

In [24]:
#here we are creating tokenized versions of the next two collections we will compare.
#Both collections go through exactly the same steps, so the steps live in one small function
#instead of being written out once per collection.

def tokenize_collection(call_no):
    """Return the tokenized OCR text of every document in one collection.

    call_no -- the value of the 'call_no' column that identifies the collection.

    Returns a list with one entry per document. Each entry is the list of tokens
    produced by nltk.word_tokenize after strip_non_alphanum, lowercasing and
    remove_stopwords have been applied to the document's 'Ocr text'.
    Documents whose 'Ocr text' is not a string are skipped.
    """
    tokenized = []
    for text in df[df['call_no'] == call_no]['Ocr text']:
        if isinstance(text, str):
            text = strip_non_alphanum(text)
            text = text.lower()
            text = remove_stopwords(text)
            tokenized.append(nltk.word_tokenize(text))
    return tokenized

tokenized_MSS_95_04 = tokenize_collection('MSS-95-04')
tokenized_MSS_98_47 = tokenize_collection('MSS-98-47')

In [25]:
#Next we have to create the model, which will lay out the math that will allow the computer to attempt to
#make meaning from the words.

#This is complex and difficult to explain (to think critically about it is part of the workshop!) but
#basically the computer will construct a statistical representation of the body of text so that it
#can use those numbers to decide meaning. This is also called a vector representation.

#To do this we will use the Word2Vec method contained in GenSim. A not too horrible explanation of
#some of the methods used in Word2Vec can be found here:
#https://towardsdatascience.com/word-to-vectors-natural-language-processing-b253dd0b0817

#Below is the code that creates the model. We will do this once for each collection, storing
#the results (the 'model') in a variable.
#NOTE(review): this model is trained with batch_words=1000 and workers=1, while the other two models in
#this notebook use batch_words=10000 and workers=4 -- confirm that the difference is intended.
#NOTE(review): 'size' and 'iter' are the parameter names used by gensim 3.x; gensim 4 renamed them to
#'vector_size' and 'epochs', so this call needs updating if gensim is upgraded.

model_2000_46 = gensim.models.Word2Vec(tokenized_2000_46, size=100, window=5, 
                                      min_count=1, sg=1, alpha=0.025, iter=5, 
                                      batch_words=1000, workers=1, max_vocab_size=10000)

  "C extension not loaded, training will be slow. "


In [None]:
#Train the Word2Vec model for the collection with call number MSS-95-04, from its tokenized text.
#NOTE(review): batch_words and workers here (10000, 4) differ from the values used for model_2000_46
#(1000, 1) -- confirm that the difference is intended.
model_MSS_95_04 = gensim.models.Word2Vec(tokenized_MSS_95_04, size=100, window=5, min_count=1, 
                               sg=1, alpha=0.025, iter=5, batch_words=10000, workers=4,
                              max_vocab_size=10000)

  "C extension not loaded, training will be slow. "


In [None]:
#Train the Word2Vec model for the collection with call number MSS-98-47, from its tokenized text.
#NOTE(review): batch_words and workers here (10000, 4) differ from the values used for model_2000_46
#(1000, 1) -- confirm that the difference is intended.
model_MSS_98_47 = gensim.models.Word2Vec(tokenized_MSS_98_47, size=100, window=5, min_count=1, 
                               sg=1, alpha=0.025, iter=5, batch_words=10000, workers=4,
                              max_vocab_size=10000)