# Topic Modeling with new dataset

In [7]:
import pandas as pd
pd.set_option('display.max_columns', 50)

import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Data Ingestion

In [2]:
# Load the corpus: one row per project with its metadata and a `final_tokens`
# column holding the tokenized abstract (see the preview in the next cell).
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling —
# only point this at files from a trusted source.
# Earlier input file, kept for reference:
#df = pd.read_pickle("~/git/dspg21RnD/data/dspg21RnD/smaller-final-dataset.pkl")

df = pd.read_pickle("../../data/prd/Tech-Report/FR_meta_and_final_tokens_21SEPT14.pkl")


In [3]:
# Preview the first rows to check the available columns, including `final_tokens`.
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,final_tokens
0,0,89996,"This is a project to explore Game-based, Metap...",2008,Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,,1,1,"[project, explore, game, base, metaphor, enhan..."
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,,1,1,"[institution, franklin_institute, science, mus..."
2,2,89998,Through programs (including small group conver...,2008,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,,1,1,"[program, small, group, conversation, citizen,..."
3,3,89999,In partnership with the American Chemical Soci...,2008,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,,1,1,"[partnership, american, chemical, society, acs..."
4,5,90001,The Center for Molecular Interfacing (CMI) wil...,2008,Address; Architecture; Carbon Nanotubes; Catal...,CCI PHASE I: CENTER FOR MOLECULAR INTERFACING,NSF,NSF,,847926,10/1/2008,9/30/2011,"ABRUNA, HECTOR D",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.049,2008,1519821.0,,1,1,"[center, molecular, interfacing, cmi, enable, ..."


In [4]:
# The scikit-learn vectorizers used below (for both NMF and LDA) take one
# string per document, so each token list is joined back into a single
# space-separated string.

docs = df["final_tokens"]
text = [" ".join(tokens) for tokens in docs]

### Functions needed for all models

In [5]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(topic_term_mat, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic.

    Parameters
    ----------
    topic_term_mat : iterable of 1-D numeric arrays
        One row per topic and one column per vocabulary term (for example the
        ``components_`` attribute of a fitted NMF or LDA model). Each row must
        provide ``argsort`` (e.g. a NumPy array).
    vectorizer : fitted vectorizer
        Object exposing ``get_feature_names_out()`` or, on older scikit-learn
        releases, ``get_feature_names()``. Entry ``i`` of that vocabulary
        labels column ``i`` of ``topic_term_mat``.
    top_n : int, default 10
        Number of terms printed per topic.

    Returns
    -------
    None
        Output goes to stdout: for each topic a blank line, ``Topic <idx>:``,
        then one ``(term, weight)`` tuple per line, heaviest term first.
    """
    # Fetch the vocabulary once. Calling the getter for every printed term
    # rebuilds the complete feature list each time, which is very slow for a
    # large vocabulary. ``get_feature_names`` was removed in scikit-learn 1.2,
    # so prefer the replacement and fall back only when it is missing.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    for idx, topic in enumerate(topic_term_mat):  # idx = row index, topic = that row
        print("\nTopic %d:" % (idx))
        # argsort is ascending; the reversed slice walks the top_n largest
        # weights from heaviest to lightest.
        for i in topic.argsort()[:-top_n - 1:-1]:
            # Plain str/float keep the printed tuples looking like
            # ('cell', 59.78...) whatever the NumPy scalar repr is.
            print((str(vocabulary[i]), float(topic[i])))
        

## NMF
- Optimal Model: full dataset, 75 topics, random_state = 14

In [7]:
# TF-IDF features for the full dataset.

# Extra stop words on top of the document-frequency filters below. 'use' is
# not listed — presumably it is common enough to be removed by max_df already
# (TODO confirm).
stop_wds = ["research", "study", "project"]

tfidf_vectorizer = TfidfVectorizer(
    stop_words=stop_wds,
    lowercase=False,  # keep the tokens exactly as they were joined into `text`
    min_df=20,
    max_df=0.6,
)
tf_idf = tfidf_vectorizer.fit_transform(text)

In [8]:
# Size of the TF-IDF matrix produced by fit_transform above.
tf_idf.shape

(696093, 95376)

In [11]:
# Fit a 75-topic NMF model on the TF-IDF matrix and report how long the fit took.
# NOTE(review): the markdown note for this section cites random_state = 14 for
# the optimal model, but this cell seeds with 1 — confirm which is intended.

num_topics = 75

t1 = time.time()
nmf_model = NMF(random_state=1, n_components=num_topics)
doc_topic = nmf_model.fit_transform(tf_idf)
t2 = time.time()

# One row per topic; this is the matrix handed to print_topics.
topic_term = nmf_model.components_

print(f"  Model time: {t2-t1}")

  Model time: 1986.0394315719604


In [12]:
# List the 10 highest-weighted terms of each topic in `topic_term`, labelled via the TF-IDF vocabulary.
print_topics(topic_term, tfidf_vectorizer, 10)


Topic 0:
('cell', 59.789452507648384)
('differentiation', 3.276195037801478)
('type', 3.0234170469321255)
('culture', 2.6742028442772967)
('antigen', 2.515638106101276)
('cd4', 2.3437747810911387)
('line', 2.278705519887532)
('cellular', 2.1817083926761676)
('progenitor', 2.137520354388688)
('express', 2.076683414386944)

Topic 1:
('program', 13.766680805909196)
('member', 1.2354296534063316)
('funding', 1.1408704408468995)
('support', 1.0879213175784226)
('grant', 1.0088794960055625)
('year', 0.9317326640574126)
('fund', 0.9010319905726726)
('department', 0.7398151558253725)
('faculty', 0.6222015229027393)
('continue', 0.5779051712638001)

Topic 2:
('core', 9.84903057231326)
('investigator', 0.9155480190980654)
('provide', 0.71556547911199)
('expertise', 0.34564854916707777)
('analysis', 0.3423627241772108)
('cfar', 0.31798734081671365)
('biostatistic', 0.3027810612352848)
('ppg', 0.2956782908938277)
('facilitate', 0.2815960272310699)
('personnel', 0.2815041262521616)

Topic 3:
('int

## LDA

In [8]:
# Build the document-term count matrix used as LDA input.

# Extra stop words on top of the document-frequency filters below. 'use' is
# not listed — presumably it is common enough to be removed by max_df already
# (TODO confirm).
stop_wds = ["research", "study", "project"]

vectorizer = CountVectorizer(
    stop_words=stop_wds,
    lowercase=False,  # keep the tokens exactly as they were joined into `text`
    min_df=20,
    max_df=0.6,
)
doc_term_matrix = vectorizer.fit_transform(text)

In [10]:
# Fit a 10-topic LDA model on the count matrix; t1/t2 bracket the model
# construction and fit so the elapsed time can be inspected afterwards.
num_topics = 10

t1 = time.time()
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    doc_topic_prior=1 / num_topics,
    topic_word_prior=0.1,
    n_jobs=9,
    random_state=1,
)
doc_topic = lda_model.fit_transform(doc_term_matrix)
t2 = time.time()

# One row per topic; this is the matrix handed to print_topics.
topic_term = lda_model.components_

In [13]:
# Elapsed time between the t1 and t2 timestamps, converted from seconds to minutes.
(t2-t1)/60

22.924428717295328

In [11]:
# List the 10 highest-weighted terms of each topic in `topic_term`, labelled via the count-vectorizer vocabulary.
print_topics(topic_term, vectorizer, 10)


Topic 0:
('material', 83093.94421464401)
('high', 64258.458237446815)
('system', 62804.756402463674)
('new', 62621.337054310345)
('student', 61423.4326681996)
('energy', 55196.469240937884)
('develop', 54034.720406293265)
('property', 51680.72835018138)
('model', 49497.28922061439)
('process', 46809.050248819774)

Topic 1:
('protein', 171609.6349069895)
('core', 136300.79334732133)
('provide', 100464.06531974183)
('analysis', 89023.36569173423)
('dna', 84674.48015208326)
('cell', 80360.22791124396)
('structure', 77528.01199526126)
('new', 57102.98948176858)
('molecular', 57085.08125222632)
('sample', 52268.7181619973)

Topic 2:
('brain', 172227.36184743303)
('effect', 119078.96638705643)
('aim', 94880.35720995454)
('function', 80876.99349202229)
('increase', 79757.32308512276)
('disorder', 77648.3015975909)
('change', 74370.74005888059)
('mechanism', 71406.52054168496)
('disease', 69410.49243926504)
('exposure', 68070.05960537527)

Topic 3:
('cancer', 377854.46969019255)
('tumor', 205