# Text Preprocessing for use with Topic Models

The text has already been cleaned. This notebook preprocesses it: tokenize, remove stop words, add bigrams and trigrams, and lemmatize.

In [1]:
import pandas as pd
import pickle

import TextCleaning
import LDAvariables

In [2]:
# Load the saved DataFrame; df['working_abstract'] contains the clean text.
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# that comes from a trusted source.

df = pd.read_pickle("./clean_dataset.pkl")

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)


Unnamed: 0,PROJECT_ID,ABSTRACT,FY,FIRST_CHAR,LAST_CHAR,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_TITLE,PROJECT_TERMS,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,ORGANIZATION_NAME,CFDA_CODE,FY_TOTAL_COST,working_abstract,nchar,Start Char
0,89996,"This is a project to explore Game-based, Metap...",2008,"This is a project to explore Game-based, Metap...",.,NSF,NSF,,814512,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,Achievement; analog; base; Cognitive Science; ...,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",WHEELING JESUIT UNIVERSITY,47.076,1999467.0,"This is a project to explore Game-based, Metap...",2057,T
1,89997,Institution: Franklin Institute Science Museum...,2008,Institution: Franklin Institute Science Museum...,.,NSF,NSF,,741659,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,Active Learning; Child; Computer software; des...,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",FRANKLIN INSTITUTE,47.076,1799699.0,"Institution: Science Museum PI: Snyder, Steve...",2053,I
2,89998,Through programs (including small group conver...,2008,Through programs (including small group conver...,.,NSF,NSF,,813522,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,Address; Age; Birth; Brain; Caregivers; Child;...,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",SCIENCE MUSEUM OF MINNESOTA,47.076,1505858.0,Through programs (including small group conver...,1154,T
3,89999,In partnership with the American Chemical Soci...,2008,In partnership with the American Chemical Soci...,.,NSF,NSF,,838627,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,Advanced Development; American; Chemicals; Che...,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,47.049,51000.0,In partnership with the American Chemical Soci...,875,I
4,90000,Amphibian populations around the world are exp...,2008,Amphibian populations around the world are exp...,.,NSF,NSF,,815315,COLLABORATIVE RESEARCH: EVOLUTION OF AMPHIBIAN...,Amphibia; Central America; Communicable Diseas...,"ZAMUDIO, KELLY R",,CORNELL UNIVERSITY ITHACA,47.074,370996.0,Amphibian populations around the world are exp...,1322,A


In [4]:
# Keep the pre-reset row labels in an 'original index' column so processed rows
# can be traced back to the source data.
# The guard makes the cell safe to re-run: without it, a second run would insert
# another 'index' column and rename it, leaving two 'original index' columns.
# Chaining (instead of inplace=True) rebinds `df` to the new frame in one step.
if 'original index' not in df.columns:
    df = df.reset_index().rename(columns={'index': 'original index'})

In [5]:
# Tokenize the cleaned abstracts and store the result as a new column.
# Slow step: the recorded run below took roughly 7 minutes.
# `tokened_abstracts` is kept as a module-level name alongside the column.
tokened_abstracts = df['tokened_abstracts'] = TextCleaning.tokenize(
    df['working_abstract']
)

Time to tokenize abstracts 422.47400093078613 seconds


In [6]:
# Build the stop-word collection, then strip those words from every tokenized
# abstract; the filtered tokens are stored in their own column so the
# unfiltered tokens remain available.
stopWords = TextCleaning.create_stopwords()
tokened_docs_nostop = df['tokened_docs_nostop'] = TextCleaning.remove_stopwords(
    df['tokened_abstracts'],
    stopWords,
)

In [7]:
# Add bigrams and trigrams to the stop-word-free tokens
# (e.g. "around", "world" -> "around_world" in the preview further down).
tns_bi_tri_docs = df['tns_bi_tri_docs'] = TextCleaning.add_bi_tri_grams(
    df['tokened_docs_nostop']
)

In [8]:
# Lemmatize the n-gram-augmented tokens; this is the column used for LDA later.
# Slowest step in the notebook: the recorded run below took about 4900 seconds
# (~80 minutes), which is why the result is pickled in the next cell.
lemma_docs = df['lemma_abstracts'] = TextCleaning.lemmatize(
    df['tns_bi_tri_docs']
)

Time to lemmatize: 4900.83739900589 seconds


In [9]:
# Persist the fully processed frame (all intermediate token columns included)
# so the expensive steps above do not have to be repeated.

df.to_pickle(path="./processed_dataset.pkl")

In [2]:
# Load the processed text from ./processed_dataset.pkl.
# NOTE(review): the execution counts restart at this cell, so it looks like the
# resume point after a kernel restart (re-run the import cell first) - confirm.
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# that comes from a trusted source.

df = pd.read_pickle("./processed_dataset.pkl")

In [3]:
# Export a CSV copy of the processed frame.
# NOTE: list-valued token columns are written as their string representation,
# so the pickle remains the round-trippable copy of the data.
df.to_csv(path_or_buf='FRAbstractsProcessed.csv')

In [4]:
# Preview the first five rows, including the new token / lemma columns.
df.head(n=5)

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,FIRST_CHAR,LAST_CHAR,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,...,ORGANIZATION_NAME,CFDA_CODE,FY_TOTAL_COST,working_abstract,nchar,Start Char,tokened_abstracts,tokened_docs_nostop,tns_bi_tri_docs,lemma_abstracts
0,0,89996,"This is a project to explore Game-based, Metap...",2008,"This is a project to explore Game-based, Metap...",.,NSF,NSF,,814512,...,WHEELING JESUIT UNIVERSITY,47.076,1999467.0,"This is a project to explore Game-based, Metap...",2057,T,"[this, project, explore, game, based, metaphor...","[explore, game, based, metaphor, enhanced, gam...","[explore, game, based, metaphor, enhanced, gam...","[explore, game, base, metaphor, enhanced, game..."
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Institution: Franklin Institute Science Museum...,.,NSF,NSF,,741659,...,FRANKLIN INSTITUTE,47.076,1799699.0,"Institution: Science Museum PI: Snyder, Steve...",2053,I,"[institution, science, museum, snyder, steve, ...","[science, museum, snyder, steve, drl, summary,...","[science, museum, snyder, steve, drl, summary,...","[science, museum, snyder, steve, drl, summary,..."
2,2,89998,Through programs (including small group conver...,2008,Through programs (including small group conver...,.,NSF,NSF,,813522,...,SCIENCE MUSEUM OF MINNESOTA,47.076,1505858.0,Through programs (including small group conver...,1154,T,"[through, programs, including, small, group, c...","[programs, small, group, conversations, citize...","[programs, small, group, conversations, citize...","[program, small, group, conversation, citizen,..."
3,3,89999,In partnership with the American Chemical Soci...,2008,In partnership with the American Chemical Soci...,.,NSF,NSF,,838627,...,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,47.049,51000.0,In partnership with the American Chemical Soci...,875,I,"[partnership, with, the, american, chemical, s...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, ac,..."
4,4,90000,Amphibian populations around the world are exp...,2008,Amphibian populations around the world are exp...,.,NSF,NSF,,815315,...,CORNELL UNIVERSITY ITHACA,47.074,370996.0,Amphibian populations around the world are exp...,1322,A,"[amphibian, populations, around, the, world, a...","[amphibian, populations, around, world, experi...","[amphibian, populations, around_world, experie...","[amphibian, population, around_world, experien..."


In [6]:
# Save only what is needed for LDA - docs, corpus, and dictionary.
# Loading the entire dataframe has previously left too little memory to run
# the model, so the LDA inputs are stored in a separate, smaller file.

# from Sam's code:
#    corpus = corpus, dictionary = id2word, texts = docs

docs = df['lemma_abstracts']
id2word, corpus = LDAvariables.createLDAvars(docs)

# Use a context manager so the file is flushed and closed even if pickling
# fails; a bare open(...) passed straight to pickle.dump leaks the handle.
# The saved list order is [corpus, id2word, docs].
with open('lda_data.sav', 'wb') as lda_file:
    pickle.dump([corpus, id2word, docs], lda_file)