# Set-up and Data Ingestion

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%pylab inline
from string import ascii_letters
import sys
import re
import time


Populating the interactive namespace from numpy and matplotlib


In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk import pos_tag

np.random.seed(2018)

import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/kjl5t/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kjl5t/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/kjl5t/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
raw_df=pd.read_csv('../../data/prd/RND Topic Modelling/abstracts_federal_reporter_combined.csv',engine='python')


# Remove duplicate and null abstracts

In [4]:
##############
#Remove nulls and duplicates
#Currently removes only duplicates based on ABSTRACTS and only in the same YEAR
#The rationale here is that we may do year-by-year modelling and don't want to exclude projects
#But if we do all-in-one modelling (e.g. across all years), we will want to reconsider
#Also will want to do additional duplicate check once abstracts are cleaned
###############

df=raw_df.loc[pd.notnull(raw_df['ABSTRACT'])]
df.drop_duplicates(subset=['ABSTRACT','FY'],inplace=True) #Drop projects with identical abstracts and year. Different year
                                                          # could indicate additional funding sent to this project.
print('Length '+str(len(df)))

####################
#Check for additional duplicates
#Note that the project id isnt necessarily identical for each transaction on same grant--e.g. one number could be added, 
#so this isnt that strict and why checking astract is needed
#####################
print('Project ID duplicates')
vc=df['PROJECT_ID'].value_counts()
print(vc[vc>1])

# no output means no duplicate IDs 

Length 550074
Project ID duplicates
Series([], Name: PROJECT_ID, dtype: int64)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


# Create working abstracts to manipulate

In [5]:
#Define a new series which is an abstract that keeps the raw text, but can be continuously manipulated.
wa='working_abstract'
df[wa]=df['ABSTRACT'].apply(str.strip)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
#Ensure there are no empty abstracts
print(pd.isnull(df[wa]).value_counts())

False    550074
Name: working_abstract, dtype: int64


# Remove "junk" from the beginning and end of abstracts.  

"Junk" does not contribute any meaning to the abstract for purposes of topic modeling.

In [7]:
################
#Function for removing any text we don't like at start, end, or anywhere within a string
################

def remove_phrase(x, phrase,loc='Start'):
    """
    returns x with phrase removed. location can be "Start" of string, "End" of string, or 
    "Anywhere_All"--anywhere will remove all instances and Anywhere_First will remove the first instance
    """
    assert loc in ['Start','End','Anywhere_All','Anywhere_First']
    if loc=='End':
        if x.endswith(phrase):
            return x[:-1*len(phrase)].strip()
        else:
            return x
    elif loc=='Start':
        if x.startswith(phrase):
            return x[len(phrase):].strip()
        else:
            return x
    elif loc=='Anywhere_All':
        return x.replace(phrase,'')
    elif loc=='Anywhere_First':
        return x.replace(phrase,'',1)
    else:
        return 'Error'
    
#Testing phrases
'''
x='phrase to remove is at the start and phrase to remove is also in middle and phrase to remove is in the middle again and ends with phrase to remove'
y='nothing in common but a phrase to remov nothing in common'
print(remove_phrase(x,'phrase to remove',loc='Start'))
print(remove_phrase(x,'phrase to remove',loc='End'))
print(remove_phrase(x,'phrase to remove',loc='Anywhere_All'))
print(remove_phrase(x,'phrase to remove',loc='Anywhere_First'))
print(remove_phrase(y,'phrase to remove',loc='Start'))
print(remove_phrase(y,'phrase to remove',loc='End'))
print(remove_phrase(y,'phrase to remove',loc='Anywhere_All'))
print(remove_phrase(y,'phrase to remove',loc='Anywhere_First'))
'''

"\nx='phrase to remove is at the start and phrase to remove is also in middle and phrase to remove is in the middle again and ends with phrase to remove'\ny='nothing in common but a phrase to remov nothing in common'\nprint(remove_phrase(x,'phrase to remove',loc='Start'))\nprint(remove_phrase(x,'phrase to remove',loc='End'))\nprint(remove_phrase(x,'phrase to remove',loc='Anywhere_All'))\nprint(remove_phrase(x,'phrase to remove',loc='Anywhere_First'))\nprint(remove_phrase(y,'phrase to remove',loc='Start'))\nprint(remove_phrase(y,'phrase to remove',loc='End'))\nprint(remove_phrase(y,'phrase to remove',loc='Anywhere_All'))\nprint(remove_phrase(y,'phrase to remove',loc='Anywhere_First'))\n"

In [8]:
#2.1--phrases noticed through investigation of starting characters, as well as those identified in R Analysis

start_phrases=['****TECHNICAL ABSTRACT****','****Technical Abstract****',
               '****Non Technical Abstract****','*** Non- Technical Abstract ***','**Non-Technical Abstract**',
          '*****NON-TECHNICAL ABSTRACT*****','***** NON-TECHNICAL ABSTRACT *****'
          '****NONTECHNICAL ABSTRACT****','****Non-Technical Abstract****','*Non-technical Abstract*',
               '*****NON-TECHNICAL ABSTRACT*****','****NON-TECHNICAL ABSTRACT****',
               '***NON-TECHNICAL ABSTRACT***','****Nontechnical abstract****'
               'DESCRIPTION (provided by applicant):','DESCRIPTION (provided by applicant)',
               'Project Summary/Abstract','PROJECT SUMMARY/ABSTRACT','ABSTRACT',
               'PROJECT SUMMARY','Project Summary','/ASBTRACT','/ Proposal','/ SUMMARY','/ DESCRIPTION','/PROJECT SUMMARY',
               '/ PROJECT SUMMARY','/Abstract:','/ABSTRACT:','/ABSTRACT','/ ABSTRACT:',
               '/ ABSTRACT','/Abstract','/ Abstract','/Description','/SUMMARY','/PROJECT SUMMARY',
              '/ RESEARCH SUMMARY','/PROJECT SUMMARY','/abstract','/Proposal Abstract',
               '/DESCRIPTION','/PROJECT DESCRIPTION','/PROJECT SUMMARY','/NARRATIVE','/RESEARCH ABSTRACT','/ PROJECT DESCRIPTION',
              'EXCEED THE SPACE PROVIDED',
              'one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. It should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.It should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. This Abstract must not include any proprietary/confidential information.* Please click the add attachment button to complete this entry.']
df[wa]=df[wa].apply(str.lstrip,args=['?-_^. :,!;¿|]#%>&'])
df.drop(df[df[wa].apply(len)==0].index[0],axis=0,inplace=True)
#Remove found phrases
for phrase in start_phrases:
    print(phrase)
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'Start']).apply(str.lstrip,args=[' :'])
    
#df.drop(df[df[wa].apply(len)==0].index[0],axis=0,inplace=True)    # this seems to only drop the first null...ask sam.  This is a bug.  Fix below

df.drop(df[df[wa].apply(len)==0].index,axis=0,inplace=True)
df['Start Char']=df[wa].apply(lambda x: x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


****TECHNICAL ABSTRACT****


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


****Technical Abstract****
****Non Technical Abstract****
*** Non- Technical Abstract ***
**Non-Technical Abstract**
*****NON-TECHNICAL ABSTRACT*****
***** NON-TECHNICAL ABSTRACT *********NONTECHNICAL ABSTRACT****
****Non-Technical Abstract****
*Non-technical Abstract*
*****NON-TECHNICAL ABSTRACT*****
****NON-TECHNICAL ABSTRACT****
***NON-TECHNICAL ABSTRACT***
****Nontechnical abstract****DESCRIPTION (provided by applicant):
DESCRIPTION (provided by applicant)
Project Summary/Abstract
PROJECT SUMMARY/ABSTRACT
ABSTRACT
PROJECT SUMMARY
Project Summary
/ASBTRACT
/ Proposal
/ SUMMARY
/ DESCRIPTION
/PROJECT SUMMARY
/ PROJECT SUMMARY
/Abstract:
/ABSTRACT:
/ABSTRACT
/ ABSTRACT:
/ ABSTRACT
/Abstract
/ Abstract
/Description
/SUMMARY
/PROJECT SUMMARY
/ RESEARCH SUMMARY
/PROJECT SUMMARY
/abstract
/Proposal Abstract
/DESCRIPTION
/PROJECT DESCRIPTION
/PROJECT SUMMARY
/NARRATIVE
/RESEARCH ABSTRACT
/ PROJECT DESCRIPTION
EXCEED THE SPACE PROVIDED
one page and must contain a summary of the proposed act

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
#1.2
#Ending phrases noticed through manual investigation of last character
print('End phrases to remove: ')
for phrase in ['(End of Abstract)',"End of Abstract", '(Abstract end)' "(END OF ABSTRACT)", '(End of abstract.)','(Abstract End)','(End 0f Abstract)','(End of Abstract.)','(End of Absract)',
               '(Abstract below)','(End of Reviewers\' Comment)','(End Abstract)','(End of abstract)','(End of abstract)',
               'PERFORMANCE SITE ========================================Section End===========================================',
                'KEY PERSONNEL ========================================Section End===========================================',
               '[summary truncated at 7800 characters]', 
               'This award reflects NSF\'s statutory mission and has been deemed worthy of support through evaluation using the Foundation\'s intellectual merit and broader impacts review criteria.',
               'Project Description Page 6', 'Page 1 of 1', 'Project Summary/Abstract Page 6',
               'Project Description Page 7', 'Project Summary/Abstract Page 7', 'Pag 1 o 1', 
               'Page 2 Number pages consecutively at the bottom throughout Form Page 2',
               'This award reflects NSF\'s statutory mission and has been deemed worthy of support through evaluation using the Foundation\'s intellectual merit and broader impacts review criteria.']:      
    print(phrase)
    df[wa]=df[wa].apply(remove_phrase,args=[phrase,'End'])

df.drop(df[df[wa].apply(len)==0].index,axis=0,inplace=True)
df['LAST_CHAR']=df[wa].apply(lambda x: x[-1])

End phrases to remove: 
(End of Abstract)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


End of Abstract
(Abstract end)(END OF ABSTRACT)
(End of abstract.)
(Abstract End)
(End 0f Abstract)
(End of Abstract.)
(End of Absract)
(Abstract below)
(End of Reviewers' Comment)
(End Abstract)
(End of abstract)
(End of abstract)
[summary truncated at 7800 characters]
This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria.
Project Description Page 6
Page 1 of 1
Project Summary/Abstract Page 6
Project Description Page 7
Project Summary/Abstract Page 7
Pag 1 o 1
Page 2 Number pages consecutively at the bottom throughout Form Page 2
This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Tokenize & simple_preprocess
- removes numbers, punctuation, and accents
- lowercases
- only includes words with 3 <= length <= 25

In [10]:
import random 

#idx = random.sample(range(len(df[wa])), 50000)
#test_text = df[wa].iloc[idx]

test_text = df[wa][df['DEPARTMENT']=='NASA']
#test_text = df[wa]

len(test_text)


16006

In [11]:
############
#Tokenize
#Converts document into a list of lowercase tokens
#ignores those too long or short, can remove accents, uses "tokenize"
#############

#Tokenize, lowercase, remove accents (deacc=True), remove too short of words (min_length)
t1 = time.time()
tokened_text=[gensim.utils.simple_preprocess(x, deacc=True,min_len=3, max_len=25) for x in test_text]
t2 = time.time()
print("time ", t2-t1)

#Our example text throughout cleaning process
print(tokened_text[0])

time  1.5348949432373047
['mapping', 'characterization', 'and', 'analysis', 'channel', 'valley', 'features', 'titan', 'elongate', 'sinuous', 'features', 'interpreted', 'fluvial', 'origin', 'have', 'been', 'identified', 'titan', 'data', 'from', 'all', 'three', 'cassini', 'imaging', 'instruments', 'the', 'cassini', 'titan', 'radar', 'mapp']


# Removal of Stop Words
- general English list
- words relevant to the corpus that do not aid in Topic Modeling 

In [12]:
stopWords = set(nltk.corpus.stopwords.words('english'))

In [13]:
# format stop words the same way we formatted our corpus, ie. without apostrophes.  Testing print statements are commented out.

#print(stopWords)

stop_wds = stopWords.copy()
for word in stopWords:
    #print(word)
    if "\'" in word:
        stop_wds.discard(word)
        stop_wds.add(word.replace("\'",""))
    
#print(stop_wds) # stop_wds is new list of stop words to use

In [14]:
# more stop words that do not add meaning to topics

additional_stopwords=['another','well','funding','addition','require','grant', 'thus','measure', 'protocol','project',
                      'specifically', 'federal','institution', 'similar','found','find','investigator','including',
                      'funded', 'via','within', 'thus', 'particular', 'furthermore','study','studie','include','also',
                      'includes','however','whether','due', 'investigators','may','studies','overall', 'subproject','whether','could',
                      'many','finally', 'award', 'several', 'specific', 'aim', 'additional', 'therefore', 'either', 'various','address', 
                      'description', 'applicant', 'aims', 'proposal', 'within', 'among', 'would', 'form'] 
                      #'abstract', 'page']?
    
    
#    'grant_funded', 'in_addition', 'proposed_research'  What to do with bigrams in stop words.  Talk to Sam

sw = stop_wds.union(additional_stopwords)

In [15]:
# remove stopwords

def remove_stopwords(texts,sw):
    return [[word for word in doc if word not in sw] for doc in texts]
    

In [16]:
nostop_test_data = remove_stopwords(tokened_text,sw)

In [17]:
print(nostop_test_data[0])

['mapping', 'characterization', 'analysis', 'channel', 'valley', 'features', 'titan', 'elongate', 'sinuous', 'features', 'interpreted', 'fluvial', 'origin', 'identified', 'titan', 'data', 'three', 'cassini', 'imaging', 'instruments', 'cassini', 'titan', 'radar', 'mapp']


# Find bigrams and trigrams

In [18]:
#Calculate bi and tri grams
from gensim.models import Phrases

# Build the bigram and trigram models
bigram = gensim.models.Phrases(nostop_test_data, min_count=5, threshold=100) # higher threshold fewer phrases.
test_data_b = bigram[nostop_test_data]
trigram = gensim.models.Phrases(test_data_b, threshold=100)  

nostop_bt_text = trigram[test_data_b]

# See trigram example
print(nostop_bt_text[0])

['mapping', 'characterization', 'analysis', 'channel', 'valley', 'features', 'titan', 'elongate', 'sinuous', 'features', 'interpreted', 'fluvial', 'origin', 'identified', 'titan', 'data', 'three', 'cassini', 'imaging', 'instruments', 'cassini', 'titan', 'radar', 'mapp']


In [None]:
bigram['grant', 'funded']

# Lemmatize

In [19]:

##########
#Lemmatize
#Create function to lemmatize by part of speech
#Retain only adjectives, nouns, verbs, and adverbs
##########
from nltk.stem.wordnet import WordNetLemmatizer

#Create lemmatizer
lemmatizer=WordNetLemmatizer()

#The POS tags created using pos_tag method are abbreviations like 'FW' or 'JJ'.
"""['FW',#Foreign word
 'JJ','JJR','JJS', #Adjectives
 'NN','NNS','NNP','NNPS','POS',#Nouns, POS is a possessive noun
 'RB','RBR','RBS','RP', #Adverbs
 'VB','VBD','VBG','VBN','VBP','VBZ','MD']]#Verbs, MD is modal, 'could','will'"""
#We are ignoring all other categories: numbers, prepositions,pronouns etc.


def apply_lemmatizer(x):
    """apply lemmatizer with correct pos parameter for each word. Takes a tuple of length 2 e.g. ("avocado",'NN')"""
    y=x[1] #x is a tuple of length 2
    if y in ['JJ','JJR','JJS']:
        return lemmatizer.lemmatize(x[0],pos='a')
    elif y in ['NN','NNS','NNP','NNPS','POS']:
        return lemmatizer.lemmatize(x[0],pos='n')
    elif y in ['RB','RBR','RBS','RP']: #This doesn't actually work
        return lemmatizer.lemmatize(x[0],pos='r')
    elif y in ['VB','VBD','VBG','VBN','VBP','VBZ','MD']:
        return lemmatizer.lemmatize(x[0],pos='v')
    elif y=='FW':
        return x[0]
    elif '_' in x[0]: #Bi and trigrams
        return x[0]
        
#Traverse each document, tag its part of speech
#and return tokens that are lemmatized and either a noun, verb, adverb, adj, or foreign work
#This works REALLY slowly
docs=[] #Final lemmatized list of documents, i.e. list of lists of tokens
t1 = time.time()
for x in range(len(nostop_bt_text)):
    doc=nostop_bt_text[x]
    tagged_sentence=pos_tag(doc) #output is a list of tuples: [('game', 'NN'), ('explore','VB')]
    tokens_kept=[] #Tokens we are retaining
    for token_tuple in tagged_sentence:
        tokens_kept.append(apply_lemmatizer(token_tuple))
    #Removes tokens that aren't nouns, verbs, adverbs, adjectives or foreign words
    docs.append([x for x in tokens_kept if x is not None]) # could we just do this in apply_lemmatizer function?  Ask SAm.
    
t2 = time.time()    
    
#Our example text throughout cleaning process
print(docs[0])
print("time ", t2-t1)

['map', 'characterization', 'analysis', 'channel', 'valley', 'feature', 'titan', 'elongate', 'sinuous', 'feature', 'interpret', 'fluvial', 'origin', 'identify', 'titan', 'data', 'cassini', 'image', 'instrument', 'cassini', 'titan', 'radar', 'mapp']
time  26.346226453781128


# Create dictionary and corpus to feed to LDA model

In [32]:
#######################
#Dictionary creation
###################

# Create Dictionary
id2word = gensim.corpora.Dictionary(docs)
print(id2word)

#keep_only_most_common=int(len(docs)/2) #LDA works best with less features than documents
keep_only_most_common=1000
print(keep_only_most_common)

#Filter words to only those found in at least a set number of documents (min_appearances)
id2word.filter_extremes(no_below=3, no_above=0.4, keep_n=keep_only_most_common)
print(id2word)

#Our dataset:
print(len(docs))
#Bigrams or trigrams
print(len([x for x in id2word.token2id.keys() if '_' in x]))

####################
# Term Document Frequency--Our corpus
####################
#Creates a count for each unique word appearing in the document, where the word_id is substituted for the word
corpus = [id2word.doc2bow(doc) for doc in docs]

#To see what the word is: 
#Index 0 is the document, Index 1 is the (word, count) tuple, Index 2 is the location of word ID in tuple
#Final 0 is never changed.
id2word[corpus[0][0][0]] 


Dictionary(26743 unique tokens: ['analysis', 'cassini', 'channel', 'characterization', 'data']...)
1000
Dictionary(1000 unique tokens: ['analysis', 'cassini', 'channel', 'characterization', 'data']...)
16006
30


'analysis'

In [42]:
# save data for later

import pickle

#pickle.dump([corpus, id2word, docs], open('rand_data.sav','wb'))
#pickle.dump([corpus, id2word, docs], open('nasa_data.sav','wb'))
pickle.dump([corpus, id2word, docs], open('all_data.sav','wb'))

# Train LDA model

In [33]:
########################
#Run model
########################

#This takes a long time - make sure to use workers = number of cores - 1
t1 = time.time()
lda_model = gensim.models.LdaMulticore(corpus, num_topics=10, id2word=id2word, passes=5, workers=15)
t2 = time.time()
print("time ", t2-t1)

time  7.434653997421265


In [44]:
# save model for later

import pickle

#pickle.dump([lda_model, corpus, id2word, docs], open('rand_model.sav','wb'))
pickle.dump([lda_model, corpus, id2word, docs], open('nasa_model.sav','wb'))

In [3]:
# load previous model

import pickle

model = pickle.load(open('rand_model.sav', 'rb'))
#model = pickle.load(open('nasa_model.sav', 'rb'))
lda_model = model[0]
corpus = model[1]
id2word = model[2]
docs = model[3]

In [5]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.032*"cancer" + 0.013*"gene" + 0.012*"cell" + 0.009*"resistance" + 0.009*"identify" + 0.009*"tumor" + 0.007*"target" + 0.006*"mechanism" + 0.006*"lung" + 0.005*"mutation"
Topic: 1 
Words: 0.011*"behavior" + 0.010*"learn" + 0.010*"social" + 0.009*"behavioral" + 0.008*"language" + 0.008*"cognitive" + 0.007*"individual" + 0.007*"treatment" + 0.005*"alcohol" + 0.005*"brain"
Topic: 2 
Words: 0.015*"neuron" + 0.012*"brain" + 0.008*"model" + 0.008*"neural" + 0.007*"function" + 0.007*"activity" + 0.006*"cell" + 0.006*"mechanism" + 0.006*"human" + 0.006*"system"
Topic: 3 
Words: 0.011*"model" + 0.006*"new" + 0.006*"data" + 0.005*"method" + 0.005*"system" + 0.005*"high" + 0.005*"change" + 0.005*"result" + 0.005*"process" + 0.004*"work"
Topic: 4 
Words: 0.011*"clinical" + 0.009*"patient" + 0.009*"treatment" + 0.007*"radiation" + 0.006*"therapy" + 0.006*"disease" + 0.006*"image" + 0.005*"model" + 0.005*"improve" + 0.005*"propose"
Topic: 5 
Words: 0.032*"program" + 0.025*"student"

# Create a visualization of the model results using pyLDAvis

In [27]:
import pyLDAvis
import pyLDAvis.gensim as ldavis

In [1]:
pyLDAvis.enable_notebook()

vis = ldavis.prepare(lda_model, corpus, id2word)


NameError: name 'pyLDAvis' is not defined

In [35]:
pyLDAvis.display(vis)

In [36]:
# from Sam's code:
# corpus = corpus, dictionary = id2word, texts = docs

lda_tc = gensim.models.coherencemodel.CoherenceModel(model=lda_model, texts=docs, dictionary=id2word, coherence="u_mass")
topic_coherence = lda_tc.get_coherence()

print('\nCoherence Score: ', topic_coherence)


Coherence Score:  -2.515527299149995


In [37]:
lda_tc = gensim.models.coherencemodel.CoherenceModel(model=lda_model, texts=docs, dictionary=id2word, coherence="c_v")
topic_coherence = lda_tc.get_coherence()

print('\nCoherence Score: ', topic_coherence)


Coherence Score:  0.40479054963242034


# Extra

In [None]:
df[wa][df.index[85]]

In [None]:
docs = df[wa][df['LAST_CHAR'] == '7']
print((df[wa][df['LAST_CHAR'] == '7']).index)

for d in docs:
    print(d[-100:])

In [None]:
df[wa][86697]

In [None]:
print(tokened_text[86696])

In [None]:
st_str = 'one page and must contain a summary of the proposed activity suitable for dissemination to thepublic. It should be a self-contained description of the project and should contain a statement of objectives and methods to be employed.It should be informative to other persons working in the same or related fields and insofar as possible understandable to a technically liter-ate lay reader. This Abstract must not include any proprietary/confidential information.* Please click the add attachment button to complete this entry.'

count = 0
for abstract in df[wa]:
    if abstract.startswith(st_str):
        count = count + 1
        
print(count)



In [17]:
tokens = gensim.utils.simple_preprocess("The 3rd dog was named Bond007.", deacc=True,min_len=3, max_len=25)
print(tokens)

for token in tokens:
    if token in stopWords:
        tokens.remove(token)
     
print(tokens)

['the', 'dog', 'was', 'named', 'bond']
['dog', 'named', 'bond']


### Trying to find more stop words

In [12]:
all_tokens = list()

for tokens in tokened_text:#nostop_test_data:
    all_tokens.extend(tokens)


In [13]:
# potential stop words - check list by hand 

from collections import Counter

c = Counter(all_tokens)

In [14]:
c.most_common(100)

[('the', 180631),
 ('and', 152670),
 ('will', 55892),
 ('that', 45904),
 ('for', 43490),
 ('with', 39142),
 ('cancer', 37984),
 ('this', 28088),
 ('are', 21681),
 ('cells', 21152),
 ('breast', 16019),
 ('have', 15595),
 ('from', 14872),
 ('tumor', 14750),
 ('these', 14324),
 ('cell', 14050),
 ('our', 13644),
 ('prostate', 13229),
 ('specific', 12833),
 ('study', 12296),
 ('research', 12255),
 ('treatment', 11980),
 ('patients', 11843),
 ('aim', 11584),
 ('can', 10526),
 ('clinical', 10416),
 ('which', 10027),
 ('has', 9642),
 ('studies', 8531),
 ('using', 8461),
 ('determine', 8241),
 ('development', 7791),
 ('disease', 7754),
 ('novel', 7721),
 ('also', 7385),
 ('hypothesis', 7016),
 ('new', 6904),
 ('expression', 6824),
 ('impact', 6764),
 ('may', 6741),
 ('human', 6620),
 ('been', 6546),
 ('their', 6501),
 ('therapy', 6476),
 ('use', 6347),
 ('therapeutic', 6327),
 ('model', 6259),
 ('not', 6203),
 ('aims', 6154),
 ('identify', 6080),
 ('based', 5889),
 ('design', 5525),
 ('data', 5

In [15]:
c['non']

3540

In [16]:
# n least common elements

c.most_common()[:-100-1:-1] 

[('subspecialists', 1),
 ('biointegration', 1),
 ('benefitresearch', 1),
 ('scheimpflug', 1),
 ('haze', 1),
 ('designphase', 1),
 ('hypothesismsc', 1),
 ('backgroundnon', 1),
 ('uvea', 1),
 ('orbit', 1),
 ('cpsi', 1),
 ('gravest', 1),
 ('carboxamide', 1),
 ('oxopiperidine', 1),
 ('figuratively', 1),
 ('corneoscleral', 1),
 ('namsa', 1),
 ('bricksimple', 1),
 ('bressler', 1),
 ('suitcase', 1),
 ('apprise', 1),
 ('village', 1),
 ('oled', 1),
 ('diode', 1),
 ('stereoscopic', 1),
 ('stereo', 1),
 ('wheeled', 1),
 ('vitrectomies', 1),
 ('ageing', 1),
 ('iofbs', 1),
 ('globes', 1),
 ('gelling', 1),
 ('reattach', 1),
 ('disulfides', 1),
 ('copoly', 1),
 ('tamponades', 1),
 ('loyola', 1),
 ('moreso', 1),
 ('jdarhmga', 1),
 ('jdar', 1),
 ('mpzcre', 1),
 ('neurocristogenesis', 1),
 ('neurocristopathy', 1),
 ('pyscharray', 1),
 ('lupo', 1),
 ('northrup', 1),
 ('proteostatic', 1),
 ('cryptochrome', 1),
 ('ndds', 1),
 ('cripsr', 1),
 ('cystinosin', 1),
 ('ess', 1),
 ('gewin', 1),
 ('majka', 1),
 ('

In [21]:
for token_abstract in tokened_text:

    for i in range(len(token_abstract)):
        if token_abstract[i] == 'non':
            print(token_abstract[i+1])
                        
    

penetrating
invasive
tbi
ptsd
hematopoietic
preferred
penetrating
military
headache
military
tbi
invasive
blast
invasive
commercial
tbi
hormonal
hormonal
hormonal
hormonal
cell
pws
small
specific
coding
metastatic
metastatic
metastatic
diseased
pregnant
malignant
silencing
silencing
genotoxic
genotoxic
clonal
rtk
transformed
adherent
specific
tumorigenic
mutant
invasive
invasive
invasive
penetrating
cancerous
luciferase
ideal
ibc
ibc
invasive
ionizing
ionizing
obligate
metastatic
coding
protein
coding
coding
genomic
nuclear
genomic
nuclear
genomic
genomic
nuclear
genomic
mutant
metastatic
metastatic
metastatic
cancerous
tumor
tumor
homologous
tumor
competitive
cell
motile
transformed
tumorigenic
transformed
invasive
invasive
invasive
increased
specific
responding
target
invasive
invasive
malignant
toxic
emt
specific
transformed
transformed
immunosuppressive
specific
invasively
neuronal
toxic
invasive
metastatic
breast
coding
tumorigenic
metastatic
invasively
target
target
canonical
inv

In [None]:
## Parallel Version -- DOESN'T WORK

##########
#Lemmatize
#Create function to lemmatize by part of speech
#Retain only adjectives, nouns, verbs, and adverbs
##########
from nltk.stem.wordnet import WordNetLemmatizer

#Create lemmatizer
lemmatizer=WordNetLemmatizer()

#The POS tags created using pos_tag method are abbreviations like 'FW' or 'JJ'.
"""['FW',#Foreign word
 'JJ','JJR','JJS', #Adjectives
 'NN','NNS','NNP','NNPS','POS',#Nouns, POS is a possessive noun
 'RB','RBR','RBS','RP', #Adverbs
 'VB','VBD','VBG','VBN','VBP','VBZ','MD']]#Verbs, MD is modal, 'could','will'"""
#We are ignoring all other categories: numbers, prepositions,pronouns etc.


def apply_lemmatizer(x):
    """apply lemmatizer with correct pos parameter for each word. Takes a tuple of length 2 e.g. ("avocado",'NN')"""
    y=x[1] #x is a tuple of length 2
    if y in ['JJ','JJR','JJS']:
        return lemmatizer.lemmatize(x[0],pos='a')
    elif y in ['NN','NNS','NNP','NNPS','POS']:
        return lemmatizer.lemmatize(x[0],pos='n')
    elif y in ['RB','RBR','RBS','RP']: #This doesn't actually work
        return lemmatizer.lemmatize(x[0],pos='r')
    elif y in ['VB','VBD','VBG','VBN','VBP','VBZ','MD']:
        return lemmatizer.lemmatize(x[0],pos='v')
    elif y=='FW':
        return x[0]
    elif '_' in x[0]: #Bi and trigrams
        return x[0]
        
#Traverse each document, tag its part of speech
#and return tokens that are lemmatized and either a noun, verb, adverb, adj, or foreign work
#This works REALLY slowly
docs=[] #Final lemmatized list of documents, i.e. list of lists of tokens


# parallel version

import multiprocessing 
import os 
import ctypes 
  
def doc_lemma(text, docs, idx):
    
    doc=text[x]
    tagged_sentence=pos_tag(doc) #output is a list of tuples: [('game', 'NN'), ('explore','VB')]
    tokens_kept=[] #Tokens we are retaining
    for token_tuple in tagged_sentence:
        tokens_kept.append(apply_lemmatizer(token_tuple))
    #Removes tokens that aren't nouns, verbs, adverbs, adjectives or foreign words
    docs.append([x for x in tokens_kept if x is not None]) #Removes tokens that aren't nouns, verbs, adverbs, adjectives or foreign words
  
    return

# creating Array of string data type with space for 4 integers 
docs = multiprocessing.Array('POINTER(c_char)',2) ## Shared memory allocation will need to be done separately. Maybe using Manager class?
    
# Instead of for-loop - creating new processes 
p1 = multiprocessing.Process(target=char_count, args=(lim_data, result, 0))
p2 = multiprocessing.Process(target=char_count, args=(lim_data, result, 1))
  
# starting processes 
p1.start() 
p2.start()
  
# wait until process is finished 
p1.join() 
p2.join() 

print(docs)
    
    
#Our example text throughout cleaning process
print(docs[0])
#print("time ", t2-t1)