In [1]:
import pandas as pd
import numpy
import pickle
import time
import joblib
import gensim
import matplotlib.pyplot as plt

from itertools import islice
from scipy.linalg import block_diag
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger

# Remove warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# function to create a new dictionary and corpus
def createLDAvars(docs):

    # Create the variables needed for LDA from df[final_frqwds_removed]: dictionary (id2word), corpus
    
    # Create Dictionary
    id2word = gensim.corpora.Dictionary(docs)

    #Filter words to only those found in at least a set number of documents (min_appearances)
    id2word.filter_extremes(no_below=20, no_above=0.6)
    
    # filter out stop words - "use" already filtered out by previous line
    id2word.filter_tokens(bad_ids=[id2word.token2id['research'], id2word.token2id['aim'], id2word.token2id['project']])

    # Create Corpus (Term Document Frequency)

    #Creates a count for each unique word appearing in the document, where the word_id is substituted for the word
    # corpus not need for c_v coherence
    #corpus = [id2word.doc2bow(doc) for doc in docs]

    return id2word #, corpus


# function to pre-process the data: compute tfidf
def preprocess(df, stopwords):
    # Append all the final tokens
    text = []
    docs = df['list_final_tokens']
    
    for abstract in docs:
        text.append(' '.join(abstract))
        
    # Create the term-document matrix
    tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=0, lowercase=False, stop_words=stop_wds)
    tf_idf = tfidf_vectorizer.fit_transform(text)
            
    return (tf_idf, tfidf_vectorizer)


In [3]:
# Load the dataset.
df = pd.read_pickle("/project/biocomplexity/sdad/projects_data/ncses/prd/Paper/FR_meta_and_final_tokens_23DEC21.pkl")
df.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,...,BUDGET_END_DATE,CFDA_CODE,FY,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,FY_TOTAL_COST_SUM,NUM_RECORDS,final_tokens
0,89996,"This is a project to explore Game-based, Metap...",Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,...,,47.076,2008,1999467.0,,1,1,1999467.0,1,project explore game base metaphor enhanced ga...
1,89997,Institution: Franklin Institute Science Museum...,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,...,,47.076,2008,1799699.0,,1,1,1799699.0,1,institution franklin institute science museum ...
2,89998,Through programs (including small group conver...,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,...,,47.076,2008,1505858.0,,1,1,1505858.0,1,program include small group conversation citiz...
3,89999,In partnership with the American Chemical Soci...,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,...,,47.049,2008,51000.0,,1,1,51000.0,1,partnership american chemical society acs nati...
4,90001,The Center for Molecular Interfacing (CMI) wil...,Address; Architecture; Carbon Nanotubes; Catal...,CCI PHASE I: CENTER FOR MOLECULAR INTERFACING,NSF,NSF,,847926,10/1/2008,9/30/2011,...,,47.049,2008,1519821.0,,1,1,1519821.0,1,center molecular interfacing cmi enable integr...


In [5]:
# Create a list of tokens 
df["list_final_tokens"] = df["final_tokens"].str.split(' ').tolist()
year = df['FY'].unique()
year

array(['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020'], dtype=object)

In [5]:
# Split the dataset by fiscal year
for fy in year:
    df_subset = df[df['FY']==fy]
    
    # save the pickle file
    pickle.dump(df_subset, open('/project/biocomplexity/sdad/projects_data/ncses/prd/Dynamic_Topics_Modelling/nmf_fullabstract/Data/FR_'+str(fy)+'.pkl','wb'))

In [8]:
# print a subset of the data (at a given fiscal year): checking
fy = 2009
fw = open('/project/biocomplexity/sdad/projects_data/ncses/prd/Dynamic_Topics_Modelling/nmf_fullabstract/Data/FR_'+str(fy)+'.pkl', 'rb')
dfw = pickle.load(fw)
fw.close()
dfw

Unnamed: 0,PROJECT_ID,ABSTRACT,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,...,CFDA_CODE,FY,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,FY_TOTAL_COST_SUM,NUM_RECORDS,final_tokens,list_final_tokens
95725,103915,EDUCATION IN ACTION NASA Exchange City Learnin...,base; Cities; Learning; Mission; next generati...,EDUCATION IN ACTION NASA EXCHANGE CITY LEARNIN...,NASA,NASA,,NNX09AR64G,10/1/2009,9/30/2010,...,43.001,2009,180776.0,,1,1,180776.0,1,education action nasa exchange city learning l...,"[education, action, nasa, exchange, city, lear..."
95726,103916,Educational Advancement Alliance Inc Math Scie...,Development; Future; programs; Science; Techno...,EDUCATIONAL ADVANCEMENT ALLIANCE INC MATH SCIE...,NASA,NASA,,NNX09AQ21G,8/1/2009,7/31/2010,...,43.001,2009,2750000.0,,1,1,2750000.0,1,educational advancement alliance inc math scie...,"[educational, advancement, alliance, inc, math..."
95727,103917,"CUBRC, Inc FY09 Earmark Entitled, to continue...",Development; Educational process of instructin...,"CUBRC, INC FY09 EARMARK ENTITLED, ''TO CONTINU...",NASA,NASA,,NNX09AT31G,10/1/2010,9/30/2011,...,43.AAA,2009,250000.0,,1,1,250000.0,1,cubrc inc fy09_earmark_entitle continue develo...,"[cubrc, inc, fy09_earmark_entitle, continue, d..."
95728,103918,University Corporation for Atmospheric Researc...,Joints; Life; programs; Request for Proposals;...,UNIVERSITY CORPORATION FOR ATMOSPHERIC RESEARC...,NASA,NASA,,NNX09AW48A,10/1/2009,9/30/2011,...,43.001,2009,1605942.0,,1,1,1605942.0,1,university corporation atmospheric research co...,"[university, corporation, atmospheric, researc..."
95729,103919,Proposal Number: 0850898PI: John Doyle ...,Area; base; Computer Architectures; design; De...,PLANNING FUTURE RESEARCH IN NETWORK SCIENCE AN...,NSF,NSF,,0962520,6/1/2009,2/28/2011,...,47.070,2009,50400.0,,1,1,50400.0,1,number 0850898pi john doyle institution univer...,"[number, 0850898pi, john, doyle, institution, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199249,1181809,"A fully Cancer Center-managed facility, the Fl...",7-aminoactinomycin D; Accounting; annexin A5; ...,FLOW CYTOMETRY RESOURCE,HHS,NIH,NCI,3P30CA013696-36S3 (9014),,,...,,2009,,2049.0,1,1,0.0,2,fully cancer center manage facility flow_cytom...,"[fully, cancer, center, manage, facility, flow..."
199250,1181734,The Cancer Center has an extensive approach to...,Leadership; oncology; programs; Research,PROGRAM LEADERSHIP,HHS,NIH,NCI,3P30CA006973-47S5 (0002),,9/29/2012,...,,2009,,233821.0,1,1,0.0,2,cancer center extensive approach program plann...,"[cancer, center, extensive, approach, program,..."
199251,1181593,DESCRIPTION OF SHARED RESOURCEThe purpose of t...,Algorithms; base; Biology; Cancer Center; Comp...,CANCER FUNCTIONAL IMAGING,HHS,NIH,NCI,3P30CA006973-47S5 (9036),,9/29/2012,...,,2009,,93766.0,1,1,0.0,2,shared_resourcethe purpose cancer functional i...,"[shared_resourcethe, purpose, cancer, function..."
199252,1181484,DESCRIPTION OF SHARED RESOURCESince the incept...,Animals; anticancer research; Area; Argon; Art...,CELL IMAGING,HHS,NIH,NCI,3P30CA006973-47S5 (9006),,9/29/2012,...,,2009,,60852.0,1,1,0.0,2,shared_resourcesince inception cell imaging co...,"[shared_resourcesince, inception, cell, imagin..."


In [10]:
# Create the term-document matrix tfidf for each pkl file
stop_wds = ['research', 'aim', 'project']  # use will be eliminated by max_df

for fy in year:
    # Load the sample for a given year
    fw = open('/project/biocomplexity/sdad/projects_data/ncses/prd/Dynamic_Topics_Modelling/nmf_fullabstract/Data/FR_'+str(fy)+'.pkl', 'rb')
    dfw = pickle.load(fw)
    fw.close()
            
    # Pre-processing the pkl file
    (tf_idf, tfidf_vectorizer) = preprocess(dfw, stop_wds)
    docs = dfw['list_final_tokens']
    
    # Create the dictionary
    dictionary = createLDAvars(docs)
    
    # Save the term-document matrix
    joblib.dump((tf_idf,tfidf_vectorizer,docs,dictionary), '/project/biocomplexity/sdad/projects_data/ncses/prd/Dynamic_Topics_Modelling/nmf_fullabstract/Term_docs_'+str(fy)+'.pkl' )
    

In [6]:
# Build the full dictionary and docs (use during the 2nd stage)
docs = df['list_final_tokens']
dictionary = createLDAvars(docs)
path = '/project/biocomplexity/sdad/projects_data/ncses/prd/Dynamic_Topics_Modelling/nmf_fullabstract/'
joblib.dump((docs,dictionary), path+'dico_docs.pkl' )
    

NameError: name 'path' is not defined

In [7]:
path = '/project/biocomplexity/sdad/projects_data/ncses/prd/Dynamic_Topics_Modelling/nmf_fullabstract/'
joblib.dump((docs,dictionary), path+'dico_docs.pkl' )

['/project/biocomplexity/sdad/projects_data/ncses/prd/Dynamic_Topics_Modelling/nmf_fullabstract/dico_docs.pkl']