# NLP Preprocessing of our dataset

The text has already been cleaned, tokenized, and lemmatized.  This script will do a little bit more cleaning and preprocess the text - remove stop words, add bigrams and trigrams, and remove frequent words.

In [1]:
import pandas as pd
import pickle
import numpy as np
import time
import matplotlib.pyplot as plt

import stanza
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
#import gensim
from gensim.models.phrases import Phrases, Phraser
import re

pd.set_option('display.max_columns', 50)



In [2]:
# read in dataset
# The pickle holds the already tokenized/lemmatized abstracts (lemma_abstract column).
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted location.

df = pd.read_pickle("../../../data/prd/Tech-Report/FR_lemmatized_21SEPT14.pkl")

# drop start_char, nchar, last_char
# (these columns are not used by any later step in this notebook)
df = df.drop(columns = ['Start_Char', 'nchar', 'LAST_CHAR'])

In [3]:
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,working_abstract,lemma_abstract
0,0,89996,"This is a project to explore Game-based, Metap...",2008,Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,,1,1,"This is a project to explore Game-based, Metap...","[project, explore, game, base, metaphor, enhan..."
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,,1,1,Institution: Franklin Institute Science Museum...,"[institution, Franklin, Institute, science, Mu..."
2,2,89998,Through programs (including small group conver...,2008,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,,1,1,Through programs (including small group conver...,"[program, include, small, group, conversation,..."
3,3,89999,In partnership with the American Chemical Soci...,2008,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,,1,1,In partnership with the American Chemical Soci...,"[partnership, American, Chemical, Society, ACS..."
4,5,90001,The Center for Molecular Interfacing (CMI) wil...,2008,Address; Architecture; Carbon Nanotubes; Catal...,CCI PHASE I: CENTER FOR MOLECULAR INTERFACING,NSF,NSF,,847926,10/1/2008,9/30/2011,"ABRUNA, HECTOR D",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.049,2008,1519821.0,,1,1,The Center for Molecular Interfacing (CMI) wil...,"[Center, Molecular, Interfacing, CMI, enable, ..."


In [4]:
len(df)

696128

#### lower-case lemmas

In [5]:
def lower_case(abstract):
    """Return a new list holding the lower-cased form of every token.

    abstract: a list of string tokens (one tokenized abstract).
    The input list is left unmodified.
    """
    lowered = []
    for token in abstract:
        lowered.append(token.lower())
    return lowered
               
df = df.assign(lower_lemmas = df["lemma_abstract"].apply(lower_case))

In [6]:
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,working_abstract,lemma_abstract,lower_lemmas
0,0,89996,"This is a project to explore Game-based, Metap...",2008,Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,,1,1,"This is a project to explore Game-based, Metap...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan..."
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,,1,1,Institution: Franklin Institute Science Museum...,"[institution, Franklin, Institute, science, Mu...","[institution, franklin, institute, science, mu..."
2,2,89998,Through programs (including small group conver...,2008,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,,1,1,Through programs (including small group conver...,"[program, include, small, group, conversation,...","[program, include, small, group, conversation,..."
3,3,89999,In partnership with the American Chemical Soci...,2008,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,,1,1,In partnership with the American Chemical Soci...,"[partnership, American, Chemical, Society, ACS...","[partnership, american, chemical, society, acs..."
4,5,90001,The Center for Molecular Interfacing (CMI) wil...,2008,Address; Architecture; Carbon Nanotubes; Catal...,CCI PHASE I: CENTER FOR MOLECULAR INTERFACING,NSF,NSF,,847926,10/1/2008,9/30/2011,"ABRUNA, HECTOR D",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.049,2008,1519821.0,,1,1,The Center for Molecular Interfacing (CMI) wil...,"[Center, Molecular, Interfacing, CMI, enable, ...","[center, molecular, interfacing, cmi, enable, ..."


#### Clean-up lemmatized tokens at the start of abstracts

##### Helper function for remaining code

In [7]:
def drop_empties(df, col):
    """Drop every row whose value in `col` has length zero.

    df:  DataFrame to clean. It is modified IN PLACE and also returned, so
         both `df = drop_empties(df, c)` and a bare call have the same effect.
    col: name of a column whose values support len() (e.g. token lists).

    Prints the index labels of the rows being dropped, followed by the
    number of rows removed.
    """
    rows_before = len(df)

    # Boolean mask of rows whose entry in `col` is empty.
    is_empty = df[col].apply(len) == 0
    empty_labels = df[is_empty].index
    print(empty_labels)

    df.drop(empty_labels, axis=0, inplace=True)

    print(f"dropped {rows_before - len(df)}")

    return df

In [8]:
def remove_first_x_tokens(tokened_abstract,bad_start_phrases,max_tokens_to_skip=3):
    
    """Strip boilerplate heading phrases from the front of a tokenized abstract.

    For each phrase in bad_start_phrases (processed in list order), the phrase
    is searched for at start offsets 0 .. max_tokens_to_skip-1. On the first
    match, the phrase AND every token before it are removed, and processing
    moves on to the next phrase using the shortened abstract.

    Be careful calling this, as order matters! Every phrase is matched against
    the current front of the abstract, which changes whenever an earlier
    phrase has been removed.

    tokened_abstract:   list of tokens. Matching is exact, so tokens and
                        phrases must already share the same case (lowered).
    bad_start_phrases:  list of phrases, each phrase itself a list of tokens,
                        not just a string. E.g. 'overall project summary' and
                        'technical abstract' are given as
                        [['overall','project','summary'],['technical','abstract']]
    max_tokens_to_skip: number of start offsets to try for each phrase.

    Returns the (possibly shortened) token list; the input list is not mutated.
    Raises AssertionError if tokened_abstract or any phrase is not a list.
    """
    
    # NOTE: `assert [cond for ...]` would test the truthiness of the list object
    # itself (always True when non-empty), so each element is checked explicitly.
    assert isinstance(tokened_abstract, list), "tokened_abstract must be a list of tokens"
    
    # Make sure each phrase is a list of tokens, not just a string
    assert all(isinstance(phrase, list) for phrase in bad_start_phrases), \
        "each phrase in bad_start_phrases must be a list of tokens"

    for token_sequence in bad_start_phrases:
        
        phrase_len = len(token_sequence)
        
        # Look for a match within max_tokens_to_skip tokens from the start. The reasoning here is
        # some abstracts start with numbers indicating sections.  EG 8., 8.a, 8.1.1.--from EDA of
        # first tokens
        
        for idx in range(max_tokens_to_skip):
            if tokened_abstract[idx:idx+phrase_len]==token_sequence:
                # Drop the matched phrase together with any tokens in front of it.
                tokened_abstract=tokened_abstract[idx+phrase_len:]
                break
                
    return tokened_abstract

In [9]:
# Boilerplate heading phrases to strip from the front of each abstract.
# Each phrase is a list of lower-cased tokens. remove_first_x_tokens tries the
# phrases in this order and a match changes the front of the abstract, so
# reordering the list can change the results.
start_phrases_to_remove=[['section'],['abstract'],['contact','pd','pi'],['nontechnical'],['non','technical'], 
                         ['non-technical'],['technical'],
                         ['project','summary','abstract'], ['overall','project','summary'],['project','abstract'],
                        ['project','narrative'], ['summary'], ['description','provide','applicant'],
                         ['description','provide','candidate'], ['provide','investigator'],['description']]

In [10]:
#Remove starting phrases (and any tokens preceding them, within the first max_tokens_to_skip positions)
# like 'description', 'provided', 'by', 'applicant'
# The result goes in a new clean_lemmas column; abstracts left with no tokens are then dropped.

df = df.assign(clean_lemmas = df["lower_lemmas"].apply(remove_first_x_tokens,args=[start_phrases_to_remove]))
df = drop_empties(df, "clean_lemmas")

Int64Index([], dtype='int64')
dropped 0


#### Remove PI names - skip for now

In [8]:
'''
def remove_pis(record,col_to_clean):
    
    # Removes the PI name(s) from an abstract
    
    pi_names = []
            
    #Adds all words in the main pis names, excluding initials (hence why the commas and periods must be replaced)
    if pd.notnull(record['CONTACT_PI_PROJECT_LEADER']):
        pi_names.extend([x.lower().strip() for x in record['CONTACT_PI_PROJECT_LEADER'].
                         replace(',','').replace('.','').replace('-',' ').replace('(','').replace(')','').
                         split() if len(x)>1])
        
    #For each additional pi, which are split by semicolons, and format is last,first; Sometimes a middle initial
    if pd.notnull(record['OTHER_PIS']):
        for i in record['OTHER_PIS'].split(';'):
            i=i.strip() #Remove whitespace
            i=i.replace('.','')#Periods for initials
            i=i.replace(',','')#Commas between last, first
            i=i.replace('-',' ')#Remove hyphen in hypenated names to make separate words once tokens.
            i=i.replace('(','')
            i=i.replace(')','')
            pi_names.extend([x.lower().strip() for x in i.split() if len(x)>1])
            
    pi_names = set(pi_names)  # sets are faster than lists for the line below
                
    return [x.lower() for x in record[col_to_clean] if x.lower() not in pi_names]
    
'''    

In [9]:
'''
temp = df.apply(lambda x: remove_pis(x, "clean_lemmas"),axis=1)    
df = df.assign(clean_lemmas = temp) 

df = drop_empties(df, "clean_lemmas")
'''

Int64Index([115144], dtype='int64')
dropped 1


### Remove generic stop words

In [11]:
# Note - we are now using the spaCy stopwords list instead of nltk.  It is more comprehensive.

def create_stopwords():
    """Build the set of stop words to remove from the abstracts.

    Combines spaCy's English STOP_WORDS with a few extra filler words that
    sneak through and do not add meaning to topics.

    Returns a new set; STOP_WORDS itself is not modified.
    """

    # more stop words that do not add meaning to topics
    extra_stopwords = {
        'addition', 'specifically', 'similar', 'including', 'particular',
        'furthermore', 'include', 'includes', 'overall', 'finally', 'specific',
        'additional',
    }

    return STOP_WORDS.union(extra_stopwords)

In [12]:
def remove_stopwords(doc, stop_words):
    """Return the tokens of `doc`, in order, that are not in `stop_words`.

    doc:        iterable of tokens (one tokenized abstract).
    stop_words: container supporting `in` (a set gives the fastest lookups).
    """
    kept = []
    for token in doc:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [13]:
#Remove stopwords
# Filters clean_lemmas into a new stopwds_removed column, then drops any
# abstract that has no tokens left after filtering.

stop_words = create_stopwords()

df = df.assign(stopwds_removed = df["clean_lemmas"].apply(remove_stopwords,args=[stop_words])) 
df = drop_empties(df, "stopwds_removed")


Int64Index([247492], dtype='int64')
dropped 1


### Find bigrams and trigrams

In [14]:
def add_n_grams(docs, min_count=5, threshold=100):
    """Add bigrams and trigrams to tokenized documents using gensim Phrases.

    Two phrase models are trained: one on `docs` to find bigrams, and one on
    the bigram-transformed docs so that a bigram can combine with a
    neighbouring token (or another bigram). Merged tokens show up in the
    output as e.g. 'franklin_institute'.

    docs:      collection of token lists (tokenized, lemmatized, stop words
               removed). It is iterated more than once, so it must be
               re-iterable (a list or pandas Series, not a one-shot generator).
    min_count: passed to both Phrases models as `min_count`.
    threshold: passed to both Phrases models as `threshold`;
               higher threshold --> fewer phrases.

    Returns the result of applying the trigram model to the bigram-transformed
    docs, one entry per input document.
    NOTE(review): gensim appears to evaluate this transformation lazily --
    confirm before relying on the time at which the work happens.
    """

    # Build the bigram model
    bi_phrases = Phrases(docs, min_count=min_count, threshold=threshold)
    bigram = Phraser(bi_phrases)  # should provide a speed-up
    bi_docs = bigram[docs]
    
    print("bigrams complete")
    
    # Build the trigram model on top of the bigram-transformed documents
    tri_phrases = Phrases(bi_docs, min_count=min_count, threshold=threshold)
    trigram = Phraser(tri_phrases)
    tri_docs = trigram[bi_docs]

    return tri_docs

In [15]:
# add in bigrams and trigrams -- time ~15 minutes
# Input is the stop-word-filtered token lists; the output goes in a new n_grams_added column.

df = df.assign(n_grams_added = add_n_grams(df["stopwds_removed"]))

bigrams complete


  return array(a, dtype, copy=False, order=order)


In [16]:
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,working_abstract,lemma_abstract,lower_lemmas,clean_lemmas,stopwds_removed,n_grams_added
0,0,89996,"This is a project to explore Game-based, Metap...",2008,Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,,1,1,"This is a project to explore Game-based, Metap...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan..."
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,,1,1,Institution: Franklin Institute Science Museum...,"[institution, Franklin, Institute, science, Mu...","[institution, franklin, institute, science, mu...","[institution, franklin, institute, science, mu...","[institution, franklin, institute, science, mu...","[institution, franklin_institute, science, mus..."
2,2,89998,Through programs (including small group conver...,2008,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,,1,1,Through programs (including small group conver...,"[program, include, small, group, conversation,...","[program, include, small, group, conversation,...","[program, include, small, group, conversation,...","[program, small, group, conversation, citizen,...","[program, small, group, conversation, citizen,..."
3,3,89999,In partnership with the American Chemical Soci...,2008,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,,1,1,In partnership with the American Chemical Soci...,"[partnership, American, Chemical, Society, ACS...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs..."
4,5,90001,The Center for Molecular Interfacing (CMI) wil...,2008,Address; Architecture; Carbon Nanotubes; Catal...,CCI PHASE I: CENTER FOR MOLECULAR INTERFACING,NSF,NSF,,847926,10/1/2008,9/30/2011,"ABRUNA, HECTOR D",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.049,2008,1519821.0,,1,1,The Center for Molecular Interfacing (CMI) wil...,"[Center, Molecular, Interfacing, CMI, enable, ...","[center, molecular, interfacing, cmi, enable, ...","[center, molecular, interfacing, cmi, enable, ...","[center, molecular, interfacing, cmi, enable, ...","[center, molecular, interfacing, cmi, enable, ..."


In [17]:
len(df)

696127

In [18]:
# drop lower_lemma column
df = df.drop(columns = ['lower_lemmas'])

In [19]:
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,working_abstract,lemma_abstract,clean_lemmas,stopwds_removed,n_grams_added
0,0,89996,"This is a project to explore Game-based, Metap...",2008,Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,,1,1,"This is a project to explore Game-based, Metap...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan..."
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,,1,1,Institution: Franklin Institute Science Museum...,"[institution, Franklin, Institute, science, Mu...","[institution, franklin, institute, science, mu...","[institution, franklin, institute, science, mu...","[institution, franklin_institute, science, mus..."
2,2,89998,Through programs (including small group conver...,2008,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,,1,1,Through programs (including small group conver...,"[program, include, small, group, conversation,...","[program, include, small, group, conversation,...","[program, small, group, conversation, citizen,...","[program, small, group, conversation, citizen,..."
3,3,89999,In partnership with the American Chemical Soci...,2008,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,,1,1,In partnership with the American Chemical Soci...,"[partnership, American, Chemical, Society, ACS...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs..."
4,5,90001,The Center for Molecular Interfacing (CMI) wil...,2008,Address; Architecture; Carbon Nanotubes; Catal...,CCI PHASE I: CENTER FOR MOLECULAR INTERFACING,NSF,NSF,,847926,10/1/2008,9/30/2011,"ABRUNA, HECTOR D",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.049,2008,1519821.0,,1,1,The Center for Molecular Interfacing (CMI) wil...,"[Center, Molecular, Interfacing, CMI, enable, ...","[center, molecular, interfacing, cmi, enable, ...","[center, molecular, interfacing, cmi, enable, ...","[center, molecular, interfacing, cmi, enable, ..."


### Final Token Clean-Up  

1. strips hyphens, replaces internal hyphens with _, removes non-alphanumeric tokens, strips leading _ if produced by alphanumeric; then removes those updated tokens that: are numeric but not length 4, or are less than length 2.

In [42]:
# NOTE -- we previously removed a list of colleges/universities; however we no longer do this step because 
#  1. we remove the top n most frequent words instead
#  2. the colleges/universities list is ~1500 tokens which need to be gone through by hand each time we add
#     data to our dataset.  This is too long to be feasible.
#  3. it shouldn't affect the topic models

'''
# create list of college tokens

c_t = []

for abstract in df['n_grams_added']:
    for token in abstract:
        if 'university' in token or 'college' in token or 'universities' in token:
            c_t.append(token)
            
c_t = set(c_t)
'''

In [45]:
'''

#Any specific university word is removed--schools within college/university, college names, etc. that cannot apply to multiple schools
#This list was generated from all tokens that contained the string 'college','university',or 'universities'. The commented out parts of the list are terms
#That could be considered "generic" ie apply to more than one school
college_tokens = { 
#'college',
'aga_khan_university',
'ahmadu_bello_university',
'alabama_aamp_university',
'albert_einstein_college',
'alcorn_state_university',
'american_college_obstetricians',
'american_college_surgeons',
'americancollege',
#'amongcollege',
#'anduniversity',
'anne_molloy_trinity_college',
#'atuniversity',
'auburn_university',
'auburn_university_alabama',
'auburn_university_au',
'auburn_university_auburn',
'auburn_university_montgomery',
'auburn_university_tuskegee_university',
'augustana_college',
#'auniversity',
'babes_bolyai_university',
'barnard_college',
'baruch_college',
'bates_college',
'baylor_college',
'baylor_college_dentistry',
'baylor_college_medicine',
'baylor_college_ofmedicine',
'baylor_collegeof',
'baylorcollege_medicine',
'baylorcollege_medicine_bcm',
'ben_gurion_university',
'benedict_college',
'benedict_college_historically_black',
'berea_college',
'binghamton_university',
#'black_colleges',
'board_trinity_college',
'bostonuniversity',
'bowdoin_college',
'brownuniversity',
'bryn_mawr_college',
'bucknell_university',
#'cape_universities',
'cardiff_university',
'carleton_college',
'carnegie_mellon_university',
'carver_college',
'carver_college_medicine',
'case_western_reserveuniversity',
'case_westernreserve_university',
'catholic_university',
'cerritos_college',
'charles_drew_university',
'chulalongkorn_university',
'chulalongkorn_university_bangkok_thailand',
'claflin_university',
'claremont_colleges',
'clark_atlanta_university',
'colby_college',
'colby_sawyer_college',
#'college',
#'college',
#'college_american_pathologists',
#'college_arts',
#'college_arts_sciences',
'college_brockport',
'college_dentistry_nyucd',
'college_dentistry_ufcd',
#'college_goer',
#'college_graduates',
#'college_letters',
#'college_letters_arts_sciences',
#'college_letters_sciences',
'college_lewiston',
#'college_liberal_arts',
'college_london',
'college_medicine_aecom',
'college_medicine_uccom',
'college_menominee_nation',
#'college_optometry',
#'college_osteopathic_medicine',
'college_park_umcp',
'college_park_umd',
#'college_physicians_surgeons',
#'college_rheumatology_acr',
'college_south_hadley',
#'college_sports_medicine',
'college_st_scholastica',
'college_staten_island',
#'college_students_basics',
#'college_veterinary_medicine',
#'college_veterinary_pathologists',
#'college_veterinarymedicine',
'college_wcmc',
'college_wcmc_rockefeller_university',
'college_william_mary',
'college_wisconsin_mcw',
'college_wooster',
#'collegeand',
#'collegeof',
#'collegeof_medicine',
#'colleges',
#'colleges_arts_sciences',
#'colleges_chicago',
#'colleges_dentistry',
#'colleges_dentistry_medicine',
#'colleges_optometry',
'colleges_rcc_umb', #Iffy--not sure what this is
#'colleges_schools',
#'colleges_universities',
#'colleges_universities_hacu',
#'collegesand',
#'collegestudent',
'columbia_university',
'columbiauniversity',
'comanche_nation_college',
#'communitycollege',
'creighton_university',
'cross_university',
'cross__university',
'cuny_hunter_college',
'del_mar_college',
'depaul_university',
'dine_college',
'din_college',
'diné_college',
'diplomate_american_college',
'doane_college',
'doron_levy_university_maryland',
'dukeuniversity',
'eckerd_college',
'emoryuniversity',
'famu_fsu_college',
'fort_lewis_college',
'franklin_marshall_college',
'fudan_university',
'fudan_university_shanghai',
'fudan_university_shanghai_china',
'gallaudet_university',
'george_mason_university',
'george_washington_university',
'georgetown_howard_universities',
'georgetown_university',
'georgia_regents_university',
'gettysburg_college',
#'grant_universities_aplu',
'gu_howard_university',
'hackensack_university',
'hampton_university',
'hanyang_university',
'hartnell_college',
'harvarduniversity',
'harvey_mudd_college',
#'historically_black_college',
#'historically_black_colleges',
#'historically_black_colleges_universities',
'hokkaido_university',
'hold_bates_college',
'hold_colby_sawyer_college',
'hold_stonehill_college_easton',
'honors_college',
'houston_baylor_college',
'hunter_college',
'imperial_college',
'imperial_college_london',
'imperial_college_london_uk',
#'incollege',
'indiana_university',
'indianauniversity',
#'inspect_certified_college',
#'inter_college',
#'inter_university',
#'inter_university_consortium_political',
#'interuniversity',
#'interuniversity_consortium_political',
#'intra_university',
'james_cook_university',
'james_madison_university',
'jeffersonuniversity',
'john_jay_college',
'johns_hopkinsuniversity',
'kennesaw_state_university',
'king_college_london',
'kwame_nkrumah_university',
'kyoto_university',
'kyushu_university',
'langston_university',
'lehman_college',
'lehman_college_city',
'lehman_college_cuny',
'lemoyne_owen_college',
'lewis_clark_college',
#'liberal_art_college',
'louisiana_universities_marine',
'loyola_marymount_university',
'loyola_university',
'loyola_university_chicago',
'macalester_college',
'makerere_university',
'makerere_university_kampala_uganda',
'makerere_university_uganda',
'makerereuniversity',
'marquette_university',
'marquette_university_milwaukee',
'mbarara_university',
'mcgill_university',
'mcmaster_university',
'medgar_evers_college',
'medical_colleges_aamc',
'medicalcollege',
'medicaluniversity_south_carolina',
'medicine_yeshiva_university',
'meharrymedical_college',
'mellon_university',
'mellonuniversity',
'mexico_highlands_university',
'miami_dade_college',
'middlebury_college',
'millsaps_college',
'monash_university',
'monash_university_australia',
#'montana_tribal_college',
#'montana_tribal_colleges',
'montclair_state_university',
'morehouse_college',
'morehouse_college_spelman_college',
'mount_holyoke_college',
'msm_tuskegee_university',
'mt_marty_college',
'muhimbili_university',
#'multi_university',
#'muniversity',
'nakoda_college',
'nanyang_technological_university',
'nazarene_university',
'nazareth_college',
#'non_college',
#'non_university',
'northern_arizona_university',
'northern_kentucky_university',
'northshore_university',
'northshore_university_healthsystem',
'northwest_nazarene_university',
'northwestern_university',
'norwich_university',
#'ofuniversity',
'oglala_lakota_college',
'ohio_stateuniversity',
'old_dominion_university',
'olin_college',
#'otheruniversity',
#'participatinguniversity',
'pasadena_city_college',
'peking_university',
'peking_university_beijing_china',
'pennsylvania_college_optometry',
#'phduniversity',
#'polytechnic_university',
#'post__college',
'prairie_view_university',
#'pre_college',
#'pre_university',
#'pre__college',
#'precollege',
'queens_college',
'regents_university',
'researchuniversity',
'rockefeller_university',
'rockefeller_university_memorial_sloan',
'rockefeller_university_ru',
'rockefeller_university_weill_cornell',
'rockefelleruniversity',
'royal_college_surgeons',
'rutgers_university',
'rutgersuniversity',
'saddleback_college',
'saginaw_chippewa_tribal_college',
'saint_michael_college',
'salish_kootenai_college',
'salve_regina_university',
'sawyer_college',
#'scienceuniversity', #This is likely ohsu, as bellow, but for parsimony, this is kept
'scienceuniversity_ohsu',
'serc_carleton_college',
'shams_university',
'shams_university_cairo_egypt',
'shanghai_jiaotong_university',
'simon_fraser_university',
'sinte_gleska_university',
'sisseton_wahpeton_college',
'sitting_bull_college',
'skc_tribal_college',
'sokoine_university',
'south_africa_university_witwatersrand',
'southern_illinois_university_carbondale',
'southern_illinois_university_edwardsville',
'southern_methodist_university',
'spelman_college',
'st_edward_university',
'st_mary_college',
'st_olaf_college',
'st_philip_college',
'stanforduniversity',
'state_university_dominguez', #Specific university
#'stateuniversity', #This could be any state
'stellenbosch_university',
'stellenbosch_university_south_africa',
'stonehill_college',
'stonehill_college_easton_massachusetts',
'stony_brook_university',
'swarthmore_college',
'tarrant_county_college',
'tel_aviv_university',
'templeuniversity',
'texas_a_university',
'texas_southmost_college',
'texas_university_kingsville',
#'thecollege',
#'theuniversity',
'theuniversity_california_san',
'theuniversity_colorado',
'theuniversity_maryland',
'theuniversity_michigan',
'theuniversity_minnesota',
'theuniversity_north_carolina',
'theuniversity_pennsylvania',
'theuniversity_pittsburgh',
'tougaloo_college',
#'touniversity',
#'triangle_universities_nuclear', #this is a government research center
#'tribal_college',
'tribal_college_haskell_indian', #specific university
#'tribal_colleges',
#'tribal_colleges_universities',
#'tribal_colleges_universities_tcus',
'trinity_college',
'trinity_college_arts_sciences',
'trinity_college_dublin',
'tsinghua_university',
'tsinghua_university_beijing',
'tsinghua_university_beijing_china',
'tsinghua_university_china',
#'tsinghua_university_prof_roberto',
'tulaneuniversity',
'tuskegee_universities',
'tuskegee_university',
'tuskegee_university_hbcu',
'uams_colleges',
'ucsf_makerere_university',
'umbc_university_maryland',
'uniformed_services_university',
'united_negro_college',
#'universities',
#'universities_aau', #this is an association of universities, not a university
#'universities_hbcu',
'universities_kansas_ku',
#'universitiesand',
#'universitiesin',
#'university',
#'university',
'university_alabama_birmingham',
'university_alabama_huntsville',
'university_alabama_tuscaloosa',
'university_alabama_ua',
'university_alaska_anchorage',
'university_alaska_fairbanks',
'university_albany_suny',
'university_arizona_ua',
'university_arkansas_fayetteville',
'university_arkansas_pine',
'university_arkansas_ua',
'university_buffalo_suny',
'university_buffalo_ub',
'university_california_berkeley',
'university_california_davis',
'university_california_irvine',
'university_california_los',
'university_california_merced',
'university_california_riverside',
'university_california_san',
'university_california_sanfrancisco',
'university_california_santa',
'university_cincinnati_cincinnati',
'university_college_dublin',
'university_college_london',
'university_colorado_anschutz',
'university_colorado_boulder',
'university_colorado_denver',
'university_connecticut_uconn',
'university_feinberg_school',
'university_florida_gainesville',
'university_florida_uf',
'university_fullerton_csuf',
'university_georgia_athens',
'university_georgia_uga',
'university_hawaii_hilo',
'university_hawaii_manoa',
'university_hawaii_uh',
'university_hospitals_cleveland',
'university_houston_downtown',
'university_houston_uh',
'university_illinois_chicago',
'university_illinois_urbana',
'university_indianapolis_iupui',
'university_kansas_ku',
'university_kansas_lawrence',
'university_kingsville',
'university_langone_medical',
'university_louisiana_lafayette',
'university_louisiana_monroe',
'university_maryland',
'university_maryland_baltimore',
'university_maryland_baltimore_county',
'university_maryland_baltimore_umb',
'university_maryland_eastern_shore',
'university_maryland_greenebaum',
'university_maryland_marlene_stewart', 
'university_maryland_umd',
'university_massachusetts_amherst',
'university_massachusetts_dartmouth',
'university_massachusetts_lowell',
'university_massachusetts_umass',
'university_miami_miller',
'university_miami_um',
'university_michigan_ann',
'university_michigan_dearborn',
'university_michigan_um',
'university_minnesota',
'university_minnesota_duluth',
'university_minnesota_masonic',
'university_minnesota_minneapolis',
'university_minnesota_twin',
'university_minnesota_umn',
'university_missouri__columbia',
'university_missouri_columbia',
'university_missouri_kansas',
'university_missouri_mu',
'university_missouri_rolla',
'university_missouri_st',
'university_nebraska_lincoln',
'university_nebraska_omaha',
'university_nevada_las',
'university_nevada_reno',
'university_northcarolina_chapel',
'university_northridge_csun',
'university_ofalabama',
'university_ofcalifornia',
'university_ofcolorado',
'university_ofmichigan',
'university_ofminnesota',
'university_ofpennsylvania',
'university_ofrochester',
'university_oftexas',
'university_ofwashington',
'university_ofwashington_uw',
'university_ofwisconsin',
'university_ofwisconsin_madison',
'university_oklahoma_norman',
'university_oklahoma_ou',
'university_pennsylvania_upenn',
'university_pittsburgh_pitt',
'university_singapore_nus',
'university_singapore_singapore',
'university_tennessee_chattanooga',
'university_tennessee_knoxville',
'university_tennessee_memphis',
'university_texas_arlington',
'university_texas_austin',
'university_texas_brownsville',
'university_texas_dallas',
'university_texas_el',
'university_texas_pan',
'university_texas_rio',
'university_texas_southwestern',
'university_texas_tyler',
'university_toronto_toronto',
'university_venda',
'university_vermont_burlington',
'university_vermont_uvm',
'university_virginia_charlottesville',
'university_virginia_uva',
'university_washington_seattle',
'university_washington_uw',
'university_waterloo',
'university_west_indies',
'university_wisconsin_carbone',
'university_wisconsin_eau',
'university_wisconsin_madison',
'university_wisconsin_milwaukee',
'university_wisconsin_oshkosh',
'university_wisconsin_platteville',
'university_wisconsin_stout',
'university_witwatersrand',
'university_witwatersrand_south_africa',
'university_witwatersrand_wits',
#'universityabstract',
#'universityand',
#'universitycareer',
#'universityco',
#'universityhospitals',
#'universityin',
#'universityintellectual',
#'universitymedical',
#'universityof',
'universityof_california_san',
'universityof_chicago',
'universityof_colorado',
'universityof_kentucky',
'universityof_michigan',
'universityof_minnesota',
'universityof_pennsylvania',
'universityof_pittsburgh',
'universityof_washington',
#'universityproposal',
#'universityresources',
#'universitys',
#'universityschool_medicine',
#'universitytitle',
'urmc_college_arts',
'vanderbiltuniversity',
'virginia_commonwealth_university',
'wake_forest_university',
'washingtonuniversity',
'wayne_stateuniversity',
'weinberg_college_arts',
'wellesley_college',
'wesley_college',
'western_ontario_mcmaster_universities',
'western_ontario_mcmasters_universities',
'westminster_college',
#'withuniversity',
'xiamen_university',
'xiamen_university_china',
'yaleuniversity',
'yeshiva_university',
'yonsei_university',
'yonsei_university_seoul_south',
'yorkuniversity'
}
'''

### 

In [20]:
def clean_up_tokens(doc):
    """Clean every token of one document and return only the tokens worth keeping.

    Each token in ``doc`` is handed to ``keep_token``, which returns a
    ``(keep, cleaned_token)`` pair. The cleaned form of every token flagged
    as keep is collected, in the original order, into a new list.
    """
    verdicts = (keep_token(raw_token) for raw_token in doc)
    return [cleaned for keep, cleaned in verdicts if keep]

In [21]:
def keep_token(token):
    """Normalise a single token and decide whether it should be retained.

    Normalisation, applied in this order:
      1. trim hyphens and spaces from both ends;
      2. turn every remaining hyphen into an underscore;
      3. if the result is not purely alphanumeric, delete every regex
         non-word character (underscores are word characters and survive);
      4. trim underscores from both ends.

    Retention rule for the normalised token:
      * a numeric token is kept only when it is exactly 4 characters long
        (treated as a year);
      * any other token is kept when it has at least 2 characters, which
        lets mixed tokens such as ``h1n1`` through.

    The university/college-name filter is currently disabled (see the
    commented-out lines in the body).

    Returns a ``(keep, token)`` tuple: the keep flag and the normalised token.
    """
    normalised = token.strip('- ').replace('-', '_')

    if not normalised.isalnum():
        normalised = re.sub(r'\W+', '', normalised)

    normalised = normalised.strip('_')

    # Names of universities (filter disabled)
    #if 'university' in token or 'college' in token or 'universities' in token:
    #    return (token not in college_tokens, token)

    if normalised.isnumeric():
        # purely numeric: only 4-character tokens (years) survive
        keep = len(normalised) == 4
    else:
        # everything else needs at least two characters
        keep = len(normalised) >= 2
    return (keep, normalised)

In [22]:
# Run the final token clean-up over every document's n-gram token list and store the
# result in a new `final_tokens` column.
df = df.assign(
    final_tokens=lambda frame: frame['n_grams_added'].apply(clean_up_tokens)
)

In [23]:
# drop_empties is defined outside this excerpt. Presumably it removes rows whose
# `final_tokens` list is empty; judging by the cell output it prints the index of the
# dropped rows and how many were dropped.
df = drop_empties(df, "final_tokens")

Int64Index([ 47160,  47164,  49167, 172378, 172383, 176341, 176342, 176343,
            176344, 176345, 176346, 177076, 178071, 178082, 249227, 249228,
            249229, 249234, 249235, 249236, 249237, 249238, 249239, 249240,
            249241, 249242, 249245, 249246, 249247, 249248, 249251, 250023,
            250408, 250409],
           dtype='int64')
dropped 34


In [24]:
# Report the number of rows currently in the frame, then preview the first rows.
print(df.shape[0])
df.head(n=5)

696093


Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,working_abstract,lemma_abstract,clean_lemmas,stopwds_removed,n_grams_added,final_tokens
0,0,89996,"This is a project to explore Game-based, Metap...",2008,Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,,1,1,"This is a project to explore Game-based, Metap...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan..."
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,,1,1,Institution: Franklin Institute Science Museum...,"[institution, Franklin, Institute, science, Mu...","[institution, franklin, institute, science, mu...","[institution, franklin, institute, science, mu...","[institution, franklin_institute, science, mus...","[institution, franklin_institute, science, mus..."
2,2,89998,Through programs (including small group conver...,2008,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,,1,1,Through programs (including small group conver...,"[program, include, small, group, conversation,...","[program, include, small, group, conversation,...","[program, small, group, conversation, citizen,...","[program, small, group, conversation, citizen,...","[program, small, group, conversation, citizen,..."
3,3,89999,In partnership with the American Chemical Soci...,2008,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,,1,1,In partnership with the American Chemical Soci...,"[partnership, American, Chemical, Society, ACS...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs..."
4,5,90001,The Center for Molecular Interfacing (CMI) wil...,2008,Address; Architecture; Carbon Nanotubes; Catal...,CCI PHASE I: CENTER FOR MOLECULAR INTERFACING,NSF,NSF,,847926,10/1/2008,9/30/2011,"ABRUNA, HECTOR D",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.049,2008,1519821.0,,1,1,The Center for Molecular Interfacing (CMI) wil...,"[Center, Molecular, Interfacing, CMI, enable, ...","[center, molecular, interfacing, cmi, enable, ...","[center, molecular, interfacing, cmi, enable, ...","[center, molecular, interfacing, cmi, enable, ...","[center, molecular, interfacing, cmi, enable, ..."


In [25]:
# save processed text
# NOTE(review): the save is commented out, so running this notebook does not refresh the
# pickle. The next section reads this same path, so uncomment the line below to
# regenerate the file from the frame built above.

#df.to_pickle("../../../data/prd/Tech-Report/FR_processed_21SEPT14.pkl")

**Not doing this part here: the frequent words can instead be passed as a list to the `stop_words` parameter of sklearn's `TfidfVectorizer`, which will filter them out.**

### START HERE to explore removing the top n most frequent words from the corpus

In [2]:
# Reload the processed frame from disk so this section can start from the saved file.
# NOTE(review): this is the path the commented-out df.to_pickle call in the previous
# section would write; while that save stays commented out, this reads whatever version
# of the file already exists on disk.
df = pd.read_pickle("../../../data/prd/Tech-Report/FR_processed_21SEPT14.pkl")

In [3]:
# Preview the first rows to confirm the reloaded frame carries the columns up to final_tokens.
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,working_abstract,lemma_abstract,clean_lemmas,stopwds_removed,n_grams_added,final_tokens
0,0,89996,"This is a project to explore Game-based, Metap...",2008,Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,,1,1,"This is a project to explore Game-based, Metap...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan..."
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,,1,1,Institution: Franklin Institute Science Museum...,"[institution, Franklin, Institute, science, Mu...","[institution, franklin, institute, science, mu...","[institution, franklin, institute, science, mu...","[institution, franklin_institute, science, mus...","[institution, franklin_institute, science, mus..."
2,2,89998,Through programs (including small group conver...,2008,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,,1,1,Through programs (including small group conver...,"[program, include, small, group, conversation,...","[program, include, small, group, conversation,...","[program, small, group, conversation, citizen,...","[program, small, group, conversation, citizen,...","[program, small, group, conversation, citizen,..."
3,3,89999,In partnership with the American Chemical Soci...,2008,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,,1,1,In partnership with the American Chemical Soci...,"[partnership, American, Chemical, Society, ACS...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs..."
4,5,90001,The Center for Molecular Interfacing (CMI) wil...,2008,Address; Architecture; Carbon Nanotubes; Catal...,CCI PHASE I: CENTER FOR MOLECULAR INTERFACING,NSF,NSF,,847926,10/1/2008,9/30/2011,"ABRUNA, HECTOR D",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.049,2008,1519821.0,,1,1,The Center for Molecular Interfacing (CMI) wil...,"[Center, Molecular, Interfacing, CMI, enable, ...","[center, molecular, interfacing, cmi, enable, ...","[center, molecular, interfacing, cmi, enable, ...","[center, molecular, interfacing, cmi, enable, ...","[center, molecular, interfacing, cmi, enable, ..."


In [6]:
# Corpus-specific stop list: the most frequent words still remaining after the earlier
# preprocessing that do not contribute to topic meaning. Built as a set so that
# membership checks are cheap.
freq_words = set(
    """
    research use study project aim provide develop data
    new model disease development program system mechanism
    propose identify function base determine result increase
    support role effect high target treatment activity core
    response analysis goal approach lead novel signal understanding
    work control change design associate year factor expression
    method improve process risk training important center pathway
    focus impact application group community time potential
    investigator structure follow induce most proposal
    show information type cause hypothesis intervention establish
    understand reduce state examine term outcome address investigate
    regulate measure complex long strategy critical enhance
    evaluate resource require different area characterize allow trial
    major field large current involve tool conduct number contribute
    assess phase behavior early objective form technique affect know
    significant define key find component demonstrate knowledge experience
    effective problem sample suggest quality experiment primary compare
    service promote produce effort ability condition help researcher
    relevance material perform continue multiple plan rate
    """.split()
)

In [7]:
# Filter the frequent words out of each document's final tokens into a new
# `final_frqwds_removed` column, then pass the frame through drop_empties for that
# column. remove_stopwords and drop_empties are defined outside this section and must
# already exist in the kernel.
df = (
    df.assign(
        final_frqwds_removed=lambda frame: frame["final_tokens"].apply(
            remove_stopwords, args=(freq_words,)
        )
    )
    .pipe(drop_empties, "final_frqwds_removed")
)

Int64Index([93742, 114178, 465376, 465816, 466832], dtype='int64')
dropped 5


In [9]:
# Report the row count after removing frequent words, then preview the frame.
# NOTE(review): the saved output of this cell (690814 rows, with Start_Char / nchar /
# LAST_CHAR columns) does not follow from the outputs shown earlier in the notebook
# (696093 rows, 5 dropped) -- it looks like it came from a different run; re-run to refresh.
print(df.shape[0])
df.head(n=5)

690814


Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,...,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,working_abstract,Start_Char,nchar,LAST_CHAR,lemma_abstract,clean_lemmas,stopwds_removed,n_grams_added,final_tokens,final_frqwds_removed
0,17608,152242,The multiprotein complex y-secretase proteolyt...,2008,Active Sites; Affect; Alzheimer's Disease; Amy...,STRUCTURE OF SIGNAL PEPTIDE PEPTIDASE,HHS,NIH,,5F32AG027647-03,12/1/2005,1/1/2008,"LIEBERMAN, RAQUEL L",,7.0,...,2008,3483.0,,1,1,The multiprotein complex y-secretase proteolyt...,T,1402,g,"[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y_secretase, proteolyt...","[multiprotein, y_secretase, proteolytically_cl..."
1,111864,190316,DESCRIPTION (provided by applicant): The Kis...,2008,Affect; Animal Model; Axon; Behavior; Behavior...,ROLE OF KISS1 NEURONS IN THE SEASONAL AND CIRC...,HHS,NIH,,5K99HD056157-02,9/1/2007,1/1/2009,"KAUFFMAN, ALEXANDER S",,7.0,...,2008,39175.0,,1,1,The Kissl gene encodes peptides called kisspep...,T,2553,y,"[Kissl, gene, encode, peptide, call, kisspepti...","[kissl, gene, encode, peptide, call, kisspepti...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin..."
2,22052,154213,DESCRIPTION (provided by applicant): The objec...,2008,Agreement; Antibodies; base; Binding; Biochemi...,CARBONIC ANHYDRASE AS A MODEL TO UNDERSTAND DI...,HHS,NIH,,5F32GM076971-02,1/2/2007,1/1/2009,"MACK, ERIC T",,5.0,...,2008,49646.0,,1,1,The objective of this research is to understan...,T,1414,e,"[objective, research, be, understand, biophysi...","[objective, research, be, understand, biophysi...","[objective, research, understand, biophysical,...","[objective, research, understand, biophysical,...","[objective, research, understand, biophysical,...","[biophysical, basis, thermodynamics_kinetic, m..."
3,35004,159362,Obesity is the cause of many adverse pregnancy...,2008,African; Analysis of Variance; Asians; Birth; ...,OBESITY ON VAGAL TONE AND HBA1C DURING PREGNANCY,HHS,NIH,,5F31NR009611-03,4/1/2006,1/1/2009,"HELMREICH, REBECCA J",,9.0,...,2008,20406.0,,1,1,Obesity is the cause of many adverse pregnancy...,O,1545,d,"[obesity, cause, many, adverse, pregnancyoutco...","[obesity, cause, many, adverse, pregnancyoutco...","[obesity, cause, adverse, pregnancyoutcome, re...","[obesity, cause, adverse_pregnancyoutcome, res...","[obesity, cause, adverse_pregnancyoutcome, res...","[obesity, adverse_pregnancyoutcome, great, hea..."
4,371628,594482,Local potato advisory groups have expressed in...,2010,cost; Health; interest; Manure; Parasitic nema...,PLANT-PARASITIC NEMATODE MANAGEMENT AS A COMPO...,USDA,NIFA,,0219605,10/1/2009,1/1/2010,"CHARLTON, B.",,,...,2010,,,1,1,Local potato advisory groups have expressed in...,L,271,s,"[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, express, interest, m..."


In [10]:
# save processed text
# NOTE(review): left commented out, so the frame with `final_frqwds_removed` is not
# written by this notebook as-is. This path climbs two directories ("../../") while the
# other data paths in this notebook climb three ("../../../") -- confirm before enabling.

#df.to_pickle("../../data/final/final_dataset_7-20.pkl")