# Text Preprocessing for use with Topic Models

The text has already been cleaned. This notebook preprocesses it: tokenize, remove stop words, add bigrams and trigrams, and lemmatize.

In [4]:
import pandas as pd
import pickle
import os
import nltk

import TextCleaning
import LDAvariables

In [3]:
import spacy

In [5]:
os.chdir('/home/sc2pg/src/prnd/publicrd/data/prd/RND Topic Modelling')

In [6]:
# load saved df.  df['working_abstract'] contains clean text.
df = pd.read_pickle("./clean_dataset.pkl")

In [7]:
wa='working_abstract'

In [8]:
# Move the existing index into a regular column and label it 'original index'
# (presumably so rows can still be traced back to the source dataset).
df.reset_index(inplace = True)
df.rename(columns={'index':'original index'}, inplace=True)

In [9]:
#So this should go in the first line of the preprocessing script. These are two functions that remove junk words but should be run on the lowercase, tokenized text. Since we're not necessarily removing numbers, I included an extra parameter in remove_first_x_tokens called max_tokens_to_skip. That says search for this series in tokens within max_tokens_to_skip from the first token.
##############
#One could do this by running custom regular expressions on the original abstract, to account for variations in capitalization.
#Or you can run it on the abstracts after the lowercase function but before any other type of preprocessing, including splitting into tokens and removing too-short words, BECAUSE:
#1. If you run this after removing too-short words (e.g. "of") you would never match items like 'national institutes of health'.
#2. If you tokenize first, you have to either check for sequences of tokens (which is very bad, given the number of unique sequences for names),
#or remove any subtokens, which is bad for a phrase like "National institutes of health", since "health" is a token we want to retain IN GENERAL.
#But name removal needs to be done on tokens, not on the entire string: imagine replacing all instances of Dr. "Brown", which would turn 'brownian motion' or 'brown fat' into 'ian motion' or ' fat'.
#So the best way to run this is to apply your own lowercase, remove custom stop words, and then continue regular preprocessing with tokenizing, lemmatizing, etc.
#############################################
def remove_institution(record):
    """Return the row's lowercase abstract with its organization name removed.

    `record` is one row of the dataframe. Every exact occurrence of the
    lowercased 'ORGANIZATION_NAME' is deleted from 'working_abstract'; the
    abstract is returned unchanged when the organization name is missing.
    """
    institution = record['ORGANIZATION_NAME']
    # Guard clause: nothing to strip when the organization is null/NaN.
    if pd.isnull(institution):
        return record['working_abstract']
    abstract = record['working_abstract']
    needle = institution.lower()
    return abstract.replace(needle, '')
    
def _name_words(raw_names):
    """Split a PI name field into lowercase name words, dropping initials."""
    words = []
    # Several people may be listed, separated by semicolons.
    for name in raw_names.split(';'):
        # Periods only mark initials, so they are deleted. Commas and hyphens
        # separate name parts, so they become spaces: 'last,first' and
        # hyphenated surnames must end up as separate words, not one fused word.
        cleaned = name.replace('.', '').replace(',', ' ').replace('-', ' ')
        # Single characters are initials and would match ordinary tokens.
        words.extend(part.lower() for part in cleaned.split() if len(part) > 1)
    return words


def remove_custom_words(record):
    """Remove the PI names of one row from that row's token list.

    The words of 'CONTACT_PI_PROJECT_LEADER' and 'OTHER_PIS' (initials
    excluded) act as stop words for this abstract only. Matching is done on
    whole tokens, so removing the name "brown" leaves "brownian" untouched.

    Args:
        record: one dataframe row providing 'CONTACT_PI_PROJECT_LEADER',
            'OTHER_PIS' (either may be null) and 'tokened_abstracts', the
            list of lowercase tokens for the abstract.

    Returns:
        A new list holding the tokens that are not PI name words.
    """
    names_to_remove = set()  # set: O(1) membership test per abstract token
    for field in ('CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS'):
        value = record[field]
        if pd.notnull(value):
            names_to_remove.update(_name_words(value))
    return [token for token in record['tokened_abstracts'] if token not in names_to_remove]

def remove_first_x_tokens(tokened_abstract, bad_start_phrases, max_tokens_to_skip=3):
    """Strip boilerplate heading phrases from the front of a tokenized abstract.

    For each phrase in turn, the phrase is looked for starting at each of the
    first `max_tokens_to_skip` positions. On the first match, the phrase AND
    every token in front of it are dropped. Order matters: each phrase is
    matched against whatever the previous phrases left at the front.

    Args:
        tokened_abstract: list of lowercase tokens.
        bad_start_phrases: list of phrases, each itself a list of lowercase
            tokens, e.g. [['overall', 'project', 'summary'],
            ['technical', 'abstract']].
        max_tokens_to_skip: number of leading positions at which a phrase may
            start. Some abstracts begin with section markers such as
            8., 8.a or 8.1.1., so a phrase need not be the very first token.

    Returns:
        The token list with matched leading phrases removed.

    Raises:
        TypeError: if `tokened_abstract` or any phrase is a plain string
            rather than a list of tokens.
    """
    # Explicit checks: a string here would silently never match anything.
    if isinstance(tokened_abstract, str):
        raise TypeError("tokened_abstract must be a list of tokens, not a string")
    if any(isinstance(phrase, str) for phrase in bad_start_phrases):
        raise TypeError("each bad start phrase must be a list of tokens, not a string")

    for token_sequence in bad_start_phrases:
        phrase_len = len(token_sequence)
        for idx in range(max_tokens_to_skip):
            if tokened_abstract[idx:idx + phrase_len] == token_sequence:
                # Drop the phrase together with the tokens that precede it.
                tokened_abstract = tokened_abstract[idx + phrase_len:]
                break
    return tokened_abstract

# Boilerplate heading phrases to strip from the start of each tokenized abstract.
# Each phrase is a list of lowercase tokens. remove_first_x_tokens applies them
# in list order, and every removal changes what the next phrase sees at the
# front, so the ordering here matters.
# NOTE(review): ['abstract'] appears twice -- confirm the repeat is intentional.
start_phrases_to_remove=[['section'],['abstract'],['contact','pd','pi'],['technical'],['nontechnical'],['non','technical'],
                         ['project','summary','abstract'], ['overall','project','summary'],['project','abstract'],
                        ['project','narrative'],['abstract'],['summary'],['description','provided','by','the','applicant'],
                         ['description','provided','by','applicant'],['description','provided','by','candidate'],
                         ['provided','by','investigator'], ['provided','by','the','investigator'],['description']]

In [10]:
# Lowercase first: remove_institution lowercases the organization name and
# matches it with a case-sensitive str.replace, so the abstract must already
# be lowercase for the name to be found.
df[wa]=df[wa].apply(str.lower)
df[wa]=df.apply(remove_institution,axis=1) # the replace inside is case sensitive

In [11]:
tokened_abstracts = TextCleaning.tokenize(df['working_abstract'])
df['tokened_abstracts'] = tokened_abstracts


Time to tokenize abstracts 445.0988566875458 seconds


In [12]:
# Row by row: drop PI-name tokens, then strip the boilerplate heading phrases
# in start_phrases_to_remove from the front of each resulting token list.
no_pis=df.apply(remove_custom_words,axis=1).apply(remove_first_x_tokens,args=[start_phrases_to_remove])

In [13]:
def create_stopwords():
    """Build the stop-word set used to filter the corpus.

    Combines NLTK's English stop words, with apostrophes stripped so they
    match how the corpus was formatted, and a hand-picked list of
    corpus-specific words that do not add meaning to topics.

    Returns:
        A set of stop-word strings.
    """
    english_words = set(nltk.corpus.stopwords.words('english'))

    # Format stop words the same way as the corpus, i.e. without apostrophes.
    # Words that have no apostrophe pass through replace() unchanged.
    without_apostrophes = {word.replace("'", "") for word in english_words}

    # More stop words that do not add meaning to topics.
    corpus_specific = [
        'another', 'well', 'addition', 'thus',
        'specifically', 'similar', 'including',
        'via', 'within', 'thus', 'particular', 'furthermore', 'include', 'also',
        'includes', 'however', 'whether', 'due', 'may', 'overall', 'whether', 'could',
        'many', 'finally', 'several', 'specific', 'additional', 'therefore', 'either', 'various',
        'within', 'among', 'would',
    ]

    return without_apostrophes | set(corpus_specific)
    

# Build the stop-word set and pass it, with the name/heading-stripped token
# lists, to TextCleaning.remove_stopwords (presumably returns the lists with
# those words filtered out -- confirm against TextCleaning).
stopWords = create_stopwords()
tokened_docs_nostop = TextCleaning.remove_stopwords(no_pis, stopWords)

df['tokened_docs_nostop'] = tokened_docs_nostop

In [14]:
import gensim
# First pass: learn a phrase model on the stop-word-filtered docs and apply it
# to every doc; detected pairs come back joined with '_' (e.g. around_world in
# the output below).
bigram = gensim.models.Phrases(df['tokened_docs_nostop'], min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_docs=df['tokened_docs_nostop'].apply(lambda x: bigram[x])
# Second pass over the already-joined docs, presumably so a bigram can merge
# with a neighbouring token into a trigram. min_count is not passed here, so
# the library default applies.
trigram = gensim.models.Phrases(bigram_docs, threshold=100)  
tri_docs =bigram_docs.apply(lambda x: trigram[x])
df['tns_bi_tri_docs'] = tri_docs



In [15]:
df['tns_bi_tri_docs'][1:10]#[1:10]

1    [institution, science, museum, steve, project,...
2    [programs, small, group, conversations, citize...
3    [partnership, american, chemical, society, acs...
4    [amphibian, populations, around_world, experie...
5    [center, molecular, interfacing, cmi, enable, ...
6    [dru, integrated, optimization, evacuation_she...
7    [foc, international, collaborative, project, w...
8    [goal, project, reconstruct, low, frequency, b...
9    [mapping, characterization, analysis, channel,...
Name: tns_bi_tri_docs, dtype: object

In [16]:
lemma_docs = TextCleaning.lemmatize(df['tns_bi_tri_docs'].iloc[:2000])


Time to lemmatize: 19.618050813674927 seconds


In [30]:
lemma_docs = TextCleaning.lemmatize(df['tns_bi_tri_docs'])
df['lemma_abstracts'] = lemma_docs

KeyboardInterrupt: 

In [None]:
lemma_docs

In [9]:
# save processed text

df.to_pickle("./processed_dataset.pkl")

In [29]:
# Spot-check lemmatization: print the cleaned text and the lemmas for up to ten
# abstracts whose raw text contains 'accrete'.
mini=df.loc[df['ABSTRACT'].apply(lambda x: 'accrete' in x),[wa,'lemma_abstracts']]
# head(10) simply stops early when fewer than ten rows match; indexing fixed
# positions 0..9 with iloc would raise IndexError in that case.
for _, row in mini.head(10).iterrows():
    print(row[wa])
    print(row['lemma_abstracts'])

the earth is unique among known planets in having liquid water on its surface, a crucial reason for the existence of life.  explaining how this came to be, and predicting conditions friendly to life elsewhere requires understanding the protoplanetary disk from which earth and the other planets formed, a complex, interacting system of rocks, dust, gas, plasma, and magnetic fields in orbit around the young sun.  many models by different scientific communities have addressed separate aspects of this problem, but none has integrated all the different physical, chemical, and mineralogical processes into a single, general, continuously improvable, three-dimensional, computational model.our imminent entry into the petascale computing era makes such a model practical for the first time.  this project will be the first serious attempt to develop a complete, multi-physics model based on a global simulation of a protoplanetary disk.  we will include three major physical processes.  first is the t

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [72]:
many_sets=[set(doc) for doc in lemma_docs]

In [76]:
# Corpus vocabulary: the union of every document's token set.
# A single union call grows one set; rebuilding it with `big_set | i` on each
# iteration copies the accumulated set every time, which is quadratic in the
# corpus size.
big_set = set().union(*many_sets)

KeyboardInterrupt: 

In [2]:
# load processed text

df = pd.read_pickle("./processed_dataset.pkl")

In [3]:
df.to_csv('FRAbstractsProcessed.csv')

In [4]:
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,FIRST_CHAR,LAST_CHAR,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,...,ORGANIZATION_NAME,CFDA_CODE,FY_TOTAL_COST,working_abstract,nchar,Start Char,tokened_abstracts,tokened_docs_nostop,tns_bi_tri_docs,lemma_abstracts
0,0,89996,"This is a project to explore Game-based, Metap...",2008,"This is a project to explore Game-based, Metap...",.,NSF,NSF,,814512,...,WHEELING JESUIT UNIVERSITY,47.076,1999467.0,"This is a project to explore Game-based, Metap...",2057,T,"[this, project, explore, game, based, metaphor...","[explore, game, based, metaphor, enhanced, gam...","[explore, game, based, metaphor, enhanced, gam...","[explore, game, base, metaphor, enhanced, game..."
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Institution: Franklin Institute Science Museum...,.,NSF,NSF,,741659,...,FRANKLIN INSTITUTE,47.076,1799699.0,"Institution: Science Museum PI: Snyder, Steve...",2053,I,"[institution, science, museum, snyder, steve, ...","[science, museum, snyder, steve, drl, summary,...","[science, museum, snyder, steve, drl, summary,...","[science, museum, snyder, steve, drl, summary,..."
2,2,89998,Through programs (including small group conver...,2008,Through programs (including small group conver...,.,NSF,NSF,,813522,...,SCIENCE MUSEUM OF MINNESOTA,47.076,1505858.0,Through programs (including small group conver...,1154,T,"[through, programs, including, small, group, c...","[programs, small, group, conversations, citize...","[programs, small, group, conversations, citize...","[program, small, group, conversation, citizen,..."
3,3,89999,In partnership with the American Chemical Soci...,2008,In partnership with the American Chemical Soci...,.,NSF,NSF,,838627,...,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,47.049,51000.0,In partnership with the American Chemical Soci...,875,I,"[partnership, with, the, american, chemical, s...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, ac,..."
4,4,90000,Amphibian populations around the world are exp...,2008,Amphibian populations around the world are exp...,.,NSF,NSF,,815315,...,CORNELL UNIVERSITY ITHACA,47.074,370996.0,Amphibian populations around the world are exp...,1322,A,"[amphibian, populations, around, the, world, a...","[amphibian, populations, around, world, experi...","[amphibian, populations, around_world, experie...","[amphibian, population, around_world, experien..."


In [6]:
# Save only what is needed for LDA - docs, corpus, and dictionary. When loading the entire dataframe, I have run 
# out of memory to run the model

# from Sam's code:
#    corpus = corpus, dictionary = id2word, texts = docs

docs = df['lemma_abstracts']
id2word, corpus = LDAvariables.createLDAvars(docs)
# Context manager so the file is flushed and closed even if dump() raises.
with open('lda_data.sav','wb') as lda_file:
    pickle.dump([corpus, id2word, docs], lda_file)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [56]:
os.chdir('/home/sc2pg/.local/lib/python3.7/site-packages/spacy/data/')
os.listdir()

['en', '__pycache__', '__init__.py']

In [58]:
!pip install spacy && python -m spacy download en



Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/home/sc2pg/.local/lib/python3.7/site-packages/en_core_web_sm -->
/home/sc2pg/.local/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
spacy