# Set-up and Data Ingestion

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%pylab inline
from string import ascii_letters
import sys
import re
import time


Populating the interactive namespace from numpy and matplotlib


In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk import pos_tag

np.random.seed(2018)

import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/kjl5t/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kjl5t/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/kjl5t/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
raw_df=pd.read_csv('../../data/prd/RND Topic Modelling/abstracts_federal_reporter_combined.csv',engine='python')


# Remove duplicate and null abstracts

In [4]:
##############
#Remove nulls and duplicates
#Currently removes only duplicates based on ABSTRACTS and only in the same YEAR
#The rationale here is that we may do year-by-year modelling and don't want to exclude projects
#But if we do all-in-one modelling (e.g. across all years), we will want to reconsider
#Also will want to do additional duplicate check once abstracts are cleaned
###############

df=raw_df.loc[pd.notnull(raw_df['ABSTRACT'])]
df.drop_duplicates(subset=['ABSTRACT','FY'],inplace=True) #Drop projects with identical abstracts and year. Different year
                                                          # could indicate additional funding sent to this project.
print('Length '+str(len(df)))

####################
#Check for additional duplicates
#Note that the project id isnt necessarily identical for each transaction on same grant--e.g. one number could be added, 
#so this isnt that strict and why checking astract is needed
#####################
print('Project ID duplicates')
vc=df['PROJECT_ID'].value_counts()
print(vc[vc>1])

# no output means no duplicate IDs 

Length 550074
Project ID duplicates
Series([], Name: PROJECT_ID, dtype: int64)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


# Create working abstracts to manipulate

In [5]:
#Define a new series which is an abstract that keeps the raw text, but can be continuously manipulated.
wa='working_abstract'
df[wa]=df['ABSTRACT'].apply(str.strip)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
#Ensure there are no empty abstracts
print(pd.isnull(df[wa]).value_counts())

False    550074
Name: working_abstract, dtype: int64


# Remove "junk" from the beginning and end of abstracts.  

"Junk" does not contribute any meaning to the abstract for purposes of topic modeling.

In [7]:
################
#Function for removing any text we don't like at start, end, or anywhere within a string
################

def remove_phrase(x, phrase,loc='Start'):
    """
    returns x with phrase removed. location can be "Start" of string, "End" of string, or 
    "Anywhere_All"--anywhere will remove all instances and Anywhere_First will remove the first instance
    """
    assert loc in ['Start','End','Anywhere_All','Anywhere_First']
    if loc=='End':
        if x.endswith(phrase):
            return x[:-1*len(phrase)].strip()
        else:
            return x
    elif loc=='Start':
        if x.startswith(phrase):
            return x[len(phrase):].strip()
        else:
            return x
    elif loc=='Anywhere_All':
        return x.replace(phrase,'')
    elif loc=='Anywhere_First':
        return x.replace(phrase,'',1)
    else:
        return 'Error'
    
#Testing phrases
'''
x='phrase to remove is at the start and phrase to remove is also in middle and phrase to remove is in the middle again and ends with phrase to remove'
y='nothing in common but a phrase to remov nothing in common'
print(remove_phrase(x,'phrase to remove',loc='Start'))
print(remove_phrase(x,'phrase to remove',loc='End'))
print(remove_phrase(x,'phrase to remove',loc='Anywhere_All'))
print(remove_phrase(x,'phrase to remove',loc='Anywhere_First'))
print(remove_phrase(y,'phrase to remove',loc='Start'))
print(remove_phrase(y,'phrase to remove',loc='End'))
print(remove_phrase(y,'phrase to remove',loc='Anywhere_All'))
print(remove_phrase(y,'phrase to remove',loc='Anywhere_First'))
'''

"\nx='phrase to remove is at the start and phrase to remove is also in middle and phrase to remove is in the middle again and ends with phrase to remove'\ny='nothing in common but a phrase to remov nothing in common'\nprint(remove_phrase(x,'phrase to remove',loc='Start'))\nprint(remove_phrase(x,'phrase to remove',loc='End'))\nprint(remove_phrase(x,'phrase to remove',loc='Anywhere_All'))\nprint(remove_phrase(x,'phrase to remove',loc='Anywhere_First'))\nprint(remove_phrase(y,'phrase to remove',loc='Start'))\nprint(remove_phrase(y,'phrase to remove',loc='End'))\nprint(remove_phrase(y,'phrase to remove',loc='Anywhere_All'))\nprint(remove_phrase(y,'phrase to remove',loc='Anywhere_First'))\n"

In [8]:
#2.1--phrases noticed through investigation of starting characters, as well as those identified in R Analysis

start_phrases=['****TECHNICAL ABSTRACT****','****Technical Abstract****',
               '****Non Technical Abstract****','*** Non- Technical Abstract ***','**Non-Technical Abstract**',
          '*****NON-TECHNICAL ABSTRACT*****','***** NON-TECHNICAL ABSTRACT *****'
          '****NONTECHNICAL ABSTRACT****','****Non-Technical Abstract****','*Non-technical Abstract*',
               '*****NON-TECHNICAL ABSTRACT*****','****NON-TECHNICAL ABSTRACT****',
               '***NON-TECHNICAL ABSTRACT***','****Nontechnical abstract****'
               'DESCRIPTION (provided by applicant):','DESCRIPTION (provided by applicant)',
               'Project Summary/Abstract','PROJECT SUMMARY/ABSTRACT','ABSTRACT',
               'PROJECT SUMMARY','Project Summary','/ASBTRACT','/ Proposal','/ SUMMARY','/ DESCRIPTION','/PROJECT SUMMARY',
               '/ PROJECT SUMMARY','/Abstract:','/ABSTRACT:','/ABSTRACT','/ ABSTRACT:',
               '/ ABSTRACT','/Abstract','/ Abstract','/Description','/SUMMARY','/PROJECT SUMMARY',
              '/ RESEARCH SUMMARY','/PROJECT SUMMARY','/abstract','/Proposal Abstract',
               '/DESCRIPTION','/PROJECT DESCRIPTION','/PROJECT SUMMARY','/NARRATIVE','/RESEARCH ABSTRACT','/ PROJECT DESCRIPTION',
              'EXCEED THE SPACE PROVIDED',
              'one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. It should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.It should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. This Abstract must not include any proprietary/confidential information.* Please click the add attachment button to complete this entry.']
df[wa]=df[wa].apply(str.lstrip,args=['?-_^. :,!;¿|]#%>&'])
df.drop(df[df[wa].apply(len)==0].index[0],axis=0,inplace=True)
#Remove found phrases
for phrase in start_phrases:
    print(phrase)
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'Start']).apply(str.lstrip,args=[' :'])
    
#df.drop(df[df[wa].apply(len)==0].index[0],axis=0,inplace=True)    # this seems to only drop the first null...ask sam.  This is a bug.  Fix below

df.drop(df[df[wa].apply(len)==0].index,axis=0,inplace=True)
df['Start Char']=df[wa].apply(lambda x: x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


****TECHNICAL ABSTRACT****


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


****Technical Abstract****
****Non Technical Abstract****
*** Non- Technical Abstract ***
**Non-Technical Abstract**
*****NON-TECHNICAL ABSTRACT*****
***** NON-TECHNICAL ABSTRACT *********NONTECHNICAL ABSTRACT****
****Non-Technical Abstract****
*Non-technical Abstract*
*****NON-TECHNICAL ABSTRACT*****
****NON-TECHNICAL ABSTRACT****
***NON-TECHNICAL ABSTRACT***
****Nontechnical abstract****DESCRIPTION (provided by applicant):
DESCRIPTION (provided by applicant)
Project Summary/Abstract
PROJECT SUMMARY/ABSTRACT
ABSTRACT
PROJECT SUMMARY
Project Summary
/ASBTRACT
/ Proposal
/ SUMMARY
/ DESCRIPTION
/PROJECT SUMMARY
/ PROJECT SUMMARY
/Abstract:
/ABSTRACT:
/ABSTRACT
/ ABSTRACT:
/ ABSTRACT
/Abstract
/ Abstract
/Description
/SUMMARY
/PROJECT SUMMARY
/ RESEARCH SUMMARY
/PROJECT SUMMARY
/abstract
/Proposal Abstract
/DESCRIPTION
/PROJECT DESCRIPTION
/PROJECT SUMMARY
/NARRATIVE
/RESEARCH ABSTRACT
/ PROJECT DESCRIPTION
EXCEED THE SPACE PROVIDED
one page and must contain a summary of the proposed act

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
#1.2
#Ending phrases noticed through manual investigation of last character
print('End phrases to remove: ')
for phrase in ['(End of Abstract)',"End of Abstract", '(Abstract end)' "(END OF ABSTRACT)", '(End of abstract.)','(Abstract End)','(End 0f Abstract)','(End of Abstract.)','(End of Absract)',
               '(Abstract below)','(End of Reviewers\' Comment)','(End Abstract)','(End of abstract)','(End of abstract)',
               'PERFORMANCE SITE ========================================Section End===========================================',
                'KEY PERSONNEL ========================================Section End===========================================',
               '[summary truncated at 7800 characters]', 
               'This award reflects NSF\'s statutory mission and has been deemed worthy of support through evaluation using the Foundation\'s intellectual merit and broader impacts review criteria.',
               'Project Description Page 6', 'Page 1 of 1', 'Project Summary/Abstract Page 6',
               'Project Description Page 7', 'Project Summary/Abstract Page 7', 'Pag 1 o 1', 
               'Page 2 Number pages consecutively at the bottom throughout Form Page 2',
               'This award reflects NSF\'s statutory mission and has been deemed worthy of support through evaluation using the Foundation\'s intellectual merit and broader impacts review criteria.']:      
    print(phrase)
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'End'])

df.drop(df[df[wa].apply(len)==0].index,axis=0,inplace=True)
df['LAST_CHAR']=df[wa].apply(lambda x: x[-1])

End phrases to remove: 
(End of Abstract)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


End of Abstract
(Abstract end)(END OF ABSTRACT)
(End of abstract.)
(Abstract End)
(End 0f Abstract)
(End of Abstract.)
(End of Absract)
(Abstract below)
(End of Reviewers' Comment)
(End Abstract)
(End of abstract)
(End of abstract)
[summary truncated at 7800 characters]
This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria.
Project Description Page 6
Page 1 of 1
Project Summary/Abstract Page 6
Project Description Page 7
Project Summary/Abstract Page 7
Pag 1 o 1
Page 2 Number pages consecutively at the bottom throughout Form Page 2
This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Tokenize & simple_preprocess
- removes numbers, punctuation, and accents
- lowercases
- only includes words with 3 <= length <= 25

In [10]:
import random 

#idx = random.sample(range(len(text)), 25000)
#test_text = text.iloc[idx]

test_text = df[wa][df['DEPARTMENT']=='NSF']
len(test_text)

116507

In [11]:
############
#Tokenize
#Converts document into a list of lowercase tokens
#ignores those too long or short, can remove accents, uses "tokenize"
#############

#Tokenize, lowercase, remove accents (deacc=True), remove too short of words (min_length)
t1 = time.time()
tokened_text=[gensim.utils.simple_preprocess(x, deacc=True,min_len=3, max_len=25) for x in test_text]
t2 = time.time()
print("time ", t2-t1)

#Our example text throughout cleaning process
print(tokened_text[0])

time  86.76537704467773
['this', 'project', 'explore', 'game', 'based', 'metaphor', 'enhanced', 'game', 'design', 'game', 'method', 'for', 'applying', 'cognitive', 'science', 'metaphor', 'theory', 'toward', 'the', 'design', 'computer', 'mediated', 'learning', 'environments', 'the', 'process', 'uses', 'structure', 'mapping', 'theory', 'design', 'videogame', 'worlds', 'aligned', 'with', 'science', 'concepts', 'this', 'rigorous', 'specification', 'procedure', 'for', 'mapping', 'relational', 'structure', 'from', 'targeted', 'concept', 'game', 'world', 'game', 'design', 'translates', 'targeted', 'concept', 'into', 'game', 'system', 'game', 'play', 'and', 'game', 'goal', 'because', 'the', 'relational', 'structure', 'the', 'game', 'world', 'designed', 'analog', 'the', 'targeted', 'conceptual', 'domain', 'players', 'begin', 'construct', 'mental', 'models', 'the', 'targeted', 'concept', 'during', 'interactive', 'gameplay', 'this', 'makes', 'learning', 'concrete', 'and', 'embodied', 'gameplay', 

# Removal of Stop Words
- general English list
- words relevant to the corpus that do not aid in Topic Modeling 

In [12]:
stopWords = set(nltk.corpus.stopwords.words('english'))

In [13]:
# format stop words the same way we formatted our corpus, ie. without apostrophes.  Testing print statements are commented out.

#print(stopWords)

stop_wds = stopWords.copy()
for word in stopWords:
    #print(word)
    if "\'" in word:
        stop_wds.discard(word)
        stop_wds.add(word.replace("\'",""))
    
#print(stop_wds) # stop_wds is new list of stop words to use

In [14]:
# more stop words that do not add meaning to topics

additional_stopwords=['another','well','funding','addition','require','grant', 'thus','measure', 'protocol','project',
                      'specifically', 'federal','institution', 'similar','found','find','investigator','including',
                      'funded', 'via','within', 'thus', 'particular', 'furthermore','study','studie','include','also',
                      'includes','however','whether','due', 'investigators','may','studies','overall', 'subproject','whether','could',
                      'many','finally', 'award', 'several', 'specific', 'aim', 'additional', 'therefore', 'either', 'various','address', 
                      'description', 'applicant', 'aims', 'proposal', 'within', 'among', 'would', 'form'] 
                      #'abstract', 'page']?
    
    
#    'grant_funded', 'in_addition', 'proposed_research'  What to do with bigrams in stop words.  Talk to Sam

sw = stop_wds.union(additional_stopwords)

In [15]:
# remove stopwords

def remove_stopwords(texts,sw):
    return [[word for word in doc if word not in sw] for doc in texts]
    

In [16]:
nostop_test_data = remove_stopwords(tokened_text,sw)

In [17]:
print(nostop_test_data[0])

['explore', 'game', 'based', 'metaphor', 'enhanced', 'game', 'design', 'game', 'method', 'applying', 'cognitive', 'science', 'metaphor', 'theory', 'toward', 'design', 'computer', 'mediated', 'learning', 'environments', 'process', 'uses', 'structure', 'mapping', 'theory', 'design', 'videogame', 'worlds', 'aligned', 'science', 'concepts', 'rigorous', 'specification', 'procedure', 'mapping', 'relational', 'structure', 'targeted', 'concept', 'game', 'world', 'game', 'design', 'translates', 'targeted', 'concept', 'game', 'system', 'game', 'play', 'game', 'goal', 'relational', 'structure', 'game', 'world', 'designed', 'analog', 'targeted', 'conceptual', 'domain', 'players', 'begin', 'construct', 'mental', 'models', 'targeted', 'concept', 'interactive', 'gameplay', 'makes', 'learning', 'concrete', 'embodied', 'gameplay', 'experiences', 'designed', 'guide', 'learner', 'discover', 'relational', 'structure', 'targeted', 'concept', 'gameplay', 'readiness', 'activity', 'preparing', 'learner', 'sub

# Find bigrams and trigrams

In [18]:
#Calculate bi and tri grams
from gensim.models import Phrases

# Build the bigram and trigram models
bigram = gensim.models.Phrases(nostop_test_data, min_count=5, threshold=100) # higher threshold fewer phrases.
test_data_b = bigram[nostop_test_data]
trigram = gensim.models.Phrases(test_data_b, threshold=100)  

nostop_bt_text = trigram[test_data_b]

# See trigram example
print(nostop_bt_text[0])

['explore', 'game', 'based', 'metaphor', 'enhanced', 'game', 'design', 'game', 'method', 'applying', 'cognitive', 'science', 'metaphor', 'theory', 'toward', 'design', 'computer', 'mediated', 'learning', 'environments', 'process', 'uses', 'structure', 'mapping', 'theory', 'design', 'videogame', 'worlds', 'aligned', 'science', 'concepts', 'rigorous', 'specification', 'procedure', 'mapping', 'relational', 'structure', 'targeted', 'concept', 'game', 'world', 'game', 'design', 'translates', 'targeted', 'concept', 'game', 'system', 'game', 'play', 'game', 'goal', 'relational', 'structure', 'game', 'world', 'designed', 'analog', 'targeted', 'conceptual', 'domain', 'players', 'begin', 'construct', 'mental', 'models', 'targeted', 'concept', 'interactive', 'gameplay', 'makes', 'learning', 'concrete', 'embodied', 'gameplay', 'experiences', 'designed', 'guide', 'learner', 'discover', 'relational', 'structure', 'targeted', 'concept', 'gameplay', 'readiness', 'activity', 'preparing', 'learner', 'sub

In [19]:
bigram['grant', 'funded']

['grant', 'funded']

# Lemmatize

In [20]:

##########
#Lemmatize
#Create function to lemmatize by part of speech
#Retain only adjectives, nouns, verbs, and adverbs
##########
from nltk.stem.wordnet import WordNetLemmatizer

#Create lemmatizer
lemmatizer=WordNetLemmatizer()

#The POS tags created using pos_tag method are abbreviations like 'FW' or 'JJ'.
"""['FW',#Foreign word
 'JJ','JJR','JJS', #Adjectives
 'NN','NNS','NNP','NNPS','POS',#Nouns, POS is a possessive noun
 'RB','RBR','RBS','RP', #Adverbs
 'VB','VBD','VBG','VBN','VBP','VBZ','MD']]#Verbs, MD is modal, 'could','will'"""
#We are ignoring all other categories: numbers, prepositions,pronouns etc.


def apply_lemmatizer(x):
    """apply lemmatizer with correct pos parameter for each word. Takes a tuple of length 2 e.g. ("avocado",'NN')"""
    y=x[1] #x is a tuple of length 2
    if y in ['JJ','JJR','JJS']:
        return lemmatizer.lemmatize(x[0],pos='a')
    elif y in ['NN','NNS','NNP','NNPS','POS']:
        return lemmatizer.lemmatize(x[0],pos='n')
    elif y in ['RB','RBR','RBS','RP']: #This doesn't actually work
        return lemmatizer.lemmatize(x[0],pos='r')
    elif y in ['VB','VBD','VBG','VBN','VBP','VBZ','MD']:
        return lemmatizer.lemmatize(x[0],pos='v')
    elif y=='FW':
        return x[0]
    elif '_' in x[0]: #Bi and trigrams
        return x[0]
        
#Traverse each document, tag its part of speech
#and return tokens that are lemmatized and either a noun, verb, adverb, adj, or foreign work
#This works REALLY slowly
docs=[] #Final lemmatized list of documents, i.e. list of lists of tokens
t1 = time.time()
for x in range(len(nostop_bt_text)):
    doc=nostop_bt_text[x]
    tagged_sentence=pos_tag(doc) #output is a list of tuples: [('game', 'NN'), ('explore','VB')]
    tokens_kept=[] #Tokens we are retaining
    for token_tuple in tagged_sentence:
        tokens_kept.append(apply_lemmatizer(token_tuple))
    #Removes tokens that aren't nouns, verbs, adverbs, adjectives or foreign words
    docs.append([x for x in tokens_kept if x is not None]) # could we just do this in apply_lemmatizer function?  Ask SAm.
    
t2 = time.time()    
    
#Our example text throughout cleaning process
print(docs[0])
print("time ", t2-t1)

['explore', 'game', 'base', 'metaphor', 'enhanced', 'game', 'design', 'game', 'method', 'apply', 'cognitive', 'science', 'metaphor', 'theory', 'design', 'computer', 'mediate', 'learning', 'environment', 'process', 'us', 'structure', 'map', 'theory', 'design', 'videogame', 'world', 'align', 'science', 'concept', 'rigorous', 'specification', 'procedure', 'map', 'relational', 'structure', 'target', 'concept', 'game', 'world', 'game', 'design', 'translate', 'target', 'concept', 'game', 'system', 'game', 'play', 'game', 'goal', 'relational', 'structure', 'game', 'world', 'design', 'analog', 'target', 'conceptual', 'domain', 'player', 'begin', 'construct', 'mental', 'model', 'target', 'concept', 'interactive', 'gameplay', 'make', 'learn', 'concrete', 'embody', 'gameplay', 'experience', 'design', 'guide', 'learner', 'relational', 'structure', 'target', 'concept', 'gameplay', 'readiness', 'activity', 'prepare', 'learner', 'subsequent', 'instruction', 'primary', 'objective', 'cyber_enabled', 't

# Create dictionary and corpus to feed to LDA model

In [21]:
#######################
#Dictionary creation
###################

# Create Dictionary
id2word = gensim.corpora.Dictionary(docs)
print(id2word)

keep_only_most_common=int(len(docs)/2) #LDA works best with less features than documents
#Filter words to only those found in at least a set number of documents (min_appearances)
id2word.filter_extremes(no_below=3, no_above=0.4, keep_n=keep_only_most_common)
print(id2word)

#Our dataset:
print(len(docs))
#Bigrams or trigrams
print(len([x for x in id2word.token2id.keys() if '_' in x]))

####################
# Term Document Frequency--Our corpus
####################
#Creates a count for each unique word appearing in the document, where the word_id is substituted for the word
corpus = [id2word.doc2bow(doc) for doc in docs]

#To see what the word is: 
#Index 0 is the document, Index 1 is the (word, count) tuple, Index 2 is the location of word ID in tuple
#Final 0 is never changed.
id2word[corpus[0][0][0]] 


Dictionary(210133 unique tokens: ['achieve', 'achievement', 'activity', 'align', 'analog']...)
Dictionary(58253 unique tokens: ['achieve', 'achievement', 'activity', 'align', 'analog']...)
116507
23659


'achieve'

# Train LDA model

In [22]:
########################
#Run model
########################

#This takes a long time - make sure to use workers = number of cores - 1
t1 = time.time()
lda_model = gensim.models.LdaMulticore(corpus, num_topics=10, id2word=id2word, passes=10, workers=7)
t2 = time.time()
print("time ", t2-t1)

time  139.75012636184692


In [23]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.020*"cell" + 0.012*"protein" + 0.007*"biology" + 0.007*"gene" + 0.007*"plant" + 0.007*"function" + 0.007*"biological" + 0.006*"molecular" + 0.005*"brain" + 0.005*"development"
Topic: 1 
Words: 0.010*"instrument" + 0.010*"data" + 0.010*"physic" + 0.009*"university" + 0.008*"support" + 0.007*"program" + 0.007*"image" + 0.006*"facility" + 0.005*"particle" + 0.005*"scientific"
Topic: 2 
Words: 0.019*"theory" + 0.016*"problem" + 0.016*"model" + 0.011*"method" + 0.009*"mathematical" + 0.007*"structure" + 0.007*"application" + 0.007*"field" + 0.007*"analysis" + 0.006*"quantum"
Topic: 3 
Words: 0.021*"data" + 0.012*"network" + 0.011*"design" + 0.009*"model" + 0.007*"application" + 0.007*"software" + 0.006*"base" + 0.006*"information" + 0.006*"tool" + 0.005*"user"
Topic: 4 
Words: 0.013*"model" + 0.008*"climate" + 0.007*"ocean" + 0.007*"data" + 0.007*"change" + 0.007*"process" + 0.006*"earth" + 0.005*"surface" + 0.005*"flow" + 0.005*"scale"
Topic: 5 
Words: 0.017*"material" +

# Create a visualization of the model results using pyLDAvis

In [24]:
import pyLDAvis
import pyLDAvis.gensim as ldavis

In [25]:
pyLDAvis.enable_notebook()

vis = ldavis.prepare(lda_model, corpus, id2word)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [26]:
pyLDAvis.display(vis)

# Extra

In [22]:
df[wa][df.index[85]]

"The Binding Principles, perhaps the most well studied principles in human language, state where in a sentence nominals (e.g., pronouns, common and proper nouns, etc.) may occur, and what they may/may not refer to. Many prominent researchers argue that these principles are part of our innate linguistic endowment because (i) they occur in the vast majority of languages for which we have data, and (ii) children in every language studied to date exhibit them at the very earliest testable ages. This argument for innateness is threatened somewhat by the existence of a handful of languages, e.g., Thai, in which the Binding Principles appear not to hold. If these principles are innate, exceptional languages like Thai should not exist.So why don't Thai speakers obey the Binding Principles? Thai children hold the answer to this puzzle: if Thai children, unlike Thai adults, exhibit knowledge of the Binding Principles at young ages, this will show that the principles are indeed part of our innate

In [47]:
docs = df[wa][df['LAST_CHAR'] == '7']
print((df[wa][df['LAST_CHAR'] == '7']).index)

for d in docs:
    print(d[-100:])

Int64Index([  2092,   3426,  19034,  21230,  59567,  65718,  68023,  71888,
             72340,  74398,  78863,  79951,  81310,  82095,  86570,  86697,
             86711, 119532, 120315, 127869, 129676, 129802, 131347, 132781,
            133844, 135499, 135998, 136003, 139060, 139160, 143584, 144645,
            157973, 165423, 169447, 180384, 184253, 185084, 185092, 185098,
            185691, 186274, 191274, 199161, 203579, 224518, 227375, 228513,
            234759, 248680, 255542, 276938, 282493, 296911, 301320, 322930,
            382249, 396939, 419241, 437579, 440221, 452087, 458490, 461743,
            462559, 480348, 488423, 491838, 492152, 498100, 501368, 510759,
            510777, 512436, 512786, 513131, 515244, 516575, 522046, 529835,
            537868, 538062, 538863],
           dtype='int64')
observe a strong surface brightness discontinuity, most likely a cold front, in the outskirts of M87
ntation goes beyond the original projcct and its first supplement in two way

In [51]:
df[wa][86697]

"one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. It should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.It should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. This Abstract must not include any proprietary/confidential information.* Please click the add attachment button to complete this entry.3145-AbstractPBC.pdfTracking Number: GRANT00426567AttachmentsProjectAbstractAddAttachmentFile Name Mime Type3145-AbstractPBC.pdf application/pdfTracking Number: GRANT00426567 Pacific Basin Consortium ConferenceTopic NCEH 500.5 - International Environmental Health Abstract th The Pacific Basin Consortium for Environment and Health will hold its 13 InternationalConference, `Environmental Exposures in the Era of Climate Change', October 12-16, 2009 inPerth, Australia. Obje

In [52]:
print(tokened_text[86696])

['one', 'page', 'and', 'must', 'contain', 'summary', 'the', 'proposed', 'activity', 'suitable', 'for', 'dissemination', 'thepublic', 'should', 'self', 'contained', 'description', 'the', 'project', 'and', 'should', 'contain', 'statement', 'objectives', 'and', 'methods', 'employed', 'should', 'informative', 'other', 'persons', 'working', 'the', 'same', 'related', 'fields', 'and', 'insofar', 'possible', 'understandable', 'technically', 'liter', 'ate', 'lay', 'reader', 'this', 'abstract', 'must', 'not', 'include', 'any', 'proprietary', 'confidential', 'information', 'please', 'click', 'the', 'add', 'attachment', 'button', 'complete', 'this', 'entry', 'abstractpbc', 'pdftracking', 'number', 'grant', 'name', 'mime', 'type', 'abstractpbc', 'pdf', 'application', 'pdftracking', 'number', 'grant', 'pacific', 'basin', 'consortium', 'conferencetopic', 'nceh', 'international', 'environmental', 'health', 'abstract', 'the', 'pacific', 'basin', 'consortium', 'for', 'environment', 'and', 'health', 'wil

In [51]:
st_str = 'one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. It should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.It should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. This Abstract must not include any proprietary/confidential information.* Please click the add attachment button to complete this entry.'

count = 0
for abstract in df[wa]:
    if abstract.startswith(st_str):
        count = count + 1
        
print(count)



25


In [38]:
gensim.utils.simple_preprocess("The dog was 3 named Bond007.", deacc=True,min_len=3, max_len=25)

['the', 'dog', 'was', 'named', 'bond']

### Trying to find more stop words

In [44]:
all_tokens = list()

for tokens in nostop_test_data:
    all_tokens.extend(tokens)


In [45]:
# potential stop words - check list by hand 

from collections import Counter

c = Counter(all_tokens)

In [46]:
c.most_common(100)

[('research', 306901),
 ('project', 240559),
 ('students', 162546),
 ('new', 125131),
 ('data', 116395),
 ('science', 91710),
 ('high', 85110),
 ('also', 81521),
 ('systems', 76009),
 ('program', 72214),
 ('development', 72168),
 ('understanding', 71678),
 ('provide', 69660),
 ('based', 67747),
 ('study', 67266),
 ('work', 63135),
 ('university', 62758),
 ('use', 61694),
 ('materials', 61667),
 ('engineering', 57672),
 ('well', 56294),
 ('develop', 56023),
 ('using', 55433),
 ('used', 55174),
 ('graduate', 54321),
 ('system', 52730),
 ('design', 51427),
 ('undergraduate', 50620),
 ('education', 50025),
 ('methods', 49685),
 ('two', 49232),
 ('models', 48913),
 ('time', 47571),
 ('energy', 47545),
 ('one', 47170),
 ('including', 47078),
 ('impact', 46749),
 ('information', 46580),
 ('applications', 46284),
 ('model', 46279),
 ('important', 46232),
 ('technology', 45772),
 ('proposed', 45600),
 ('community', 44642),
 ('results', 44480),
 ('field', 44466),
 ('analysis', 44438),
 ('support

In [47]:
c['technical']

20965

In [48]:
# n least common elements

c.most_common()[:-100-1:-1] 

[('nianet', 1),
 ('righted', 1),
 ('ausreptheory', 1),
 ('midreptheory', 1),
 ('dataresearch', 1),
 ('subsequentworkshops', 1),
 ('bdv', 1),
 ('pegylating', 1),
 ('qingzhu', 1),
 ('calaboose', 1),
 ('bather', 1),
 ('utsap', 1),
 ('txhsic', 1),
 ('gameworld', 1),
 ('biostimulants', 1),
 ('uguin', 1),
 ('azschool', 1),
 ('mathphys', 1),
 ('anoklase', 1),
 ('lnmo', 1),
 ('wholesaling', 1),
 ('fowa', 1),
 ('rrdv', 1),
 ('sulfinate', 1),
 ('reductionthe', 1),
 ('nonnutritive', 1),
 ('djs', 1),
 ('rainbelts', 1),
 ('sgoc', 1),
 ('esss', 1),
 ('algortihm', 1),
 ('convolves', 1),
 ('ectosymbiont', 1),
 ('hypermastigote', 1),
 ('hypermastigotes', 1),
 ('nwconference', 1),
 ('pausader', 1),
 ('achievementsthis', 1),
 ('akaigawa', 1),
 ('antiguo', 1),
 ('caballeros', 1),
 ('nata', 1),
 ('glazes', 1),
 ('tica', 1),
 ('cnico', 1),
 ('polit', 1),
 ('unmtemps', 1),
 ('bartrami', 1),
 ('strymon', 1),
 ('hairstreak', 1),
 ('bartram', 1),
 ('skarn', 1),
 ('feud', 1),
 ('gameshow', 1),
 ('triploblastic',

In [66]:
## Parallel Version -- DOESN'T WORK

##########
#Lemmatize
#Create function to lemmatize by part of speech
#Retain only adjectives, nouns, verbs, and adverbs
##########
from nltk.stem.wordnet import WordNetLemmatizer

#Create lemmatizer
lemmatizer=WordNetLemmatizer()

#The POS tags created using pos_tag method are abbreviations like 'FW' or 'JJ'.
"""['FW',#Foreign word
 'JJ','JJR','JJS', #Adjectives
 'NN','NNS','NNP','NNPS','POS',#Nouns, POS is a possessive noun
 'RB','RBR','RBS','RP', #Adverbs
 'VB','VBD','VBG','VBN','VBP','VBZ','MD']]#Verbs, MD is modal, 'could','will'"""
#We are ignoring all other categories: numbers, prepositions,pronouns etc.


def apply_lemmatizer(x):
    """apply lemmatizer with correct pos parameter for each word. Takes a tuple of length 2 e.g. ("avocado",'NN')"""
    y=x[1] #x is a tuple of length 2
    if y in ['JJ','JJR','JJS']:
        return lemmatizer.lemmatize(x[0],pos='a')
    elif y in ['NN','NNS','NNP','NNPS','POS']:
        return lemmatizer.lemmatize(x[0],pos='n')
    elif y in ['RB','RBR','RBS','RP']: #This doesn't actually work
        return lemmatizer.lemmatize(x[0],pos='r')
    elif y in ['VB','VBD','VBG','VBN','VBP','VBZ','MD']:
        return lemmatizer.lemmatize(x[0],pos='v')
    elif y=='FW':
        return x[0]
    elif '_' in x[0]: #Bi and trigrams
        return x[0]
        
#Traverse each document, tag its part of speech
#and return tokens that are lemmatized and either a noun, verb, adverb, adj, or foreign work
#This works REALLY slowly
docs=[] #Final lemmatized list of documents, i.e. list of lists of tokens


# parallel version

import multiprocessing 
import os 
import ctypes 
  
def doc_lemma(text, docs, idx):
    
    doc=text[x]
    tagged_sentence=pos_tag(doc) #output is a list of tuples: [('game', 'NN'), ('explore','VB')]
    tokens_kept=[] #Tokens we are retaining
    for token_tuple in tagged_sentence:
        tokens_kept.append(apply_lemmatizer(token_tuple))
    #Removes tokens that aren't nouns, verbs, adverbs, adjectives or foreign words
    docs.append([x for x in tokens_kept if x is not None]) #Removes tokens that aren't nouns, verbs, adverbs, adjectives or foreign words
  
    return

# creating Array of string data type with space for 4 integers 
docs = multiprocessing.Array('POINTER(c_char)',2) ## Shared memory allocation will need to be done separately. Maybe using Manager class?
    
# Instead of for-loop - creating new processes 
p1 = multiprocessing.Process(target=char_count, args=(lim_data, result, 0))
p2 = multiprocessing.Process(target=char_count, args=(lim_data, result, 1))
  
# starting processes 
p1.start() 
p2.start()
  
# wait until process is finished 
p1.join() 
p2.join() 

print(docs)
    
    
#Our example text throughout cleaning process
print(docs[0])
#print("time ", t2-t1)

TypeError: this type has no size