# Model Runs

Sam - let me know if you have any questions!  

This script loads data, prepares it for LDA in Scikit-Learn, and then runs LDA multiple times. It also provides examples of how to pull the topic-term distribution and perplexity from each model. 

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel

In [2]:
# import NSF data
data_path = '/project/biocomplexity/sdad/projects_data/ncses/prd/RND Topic Modelling/agency_data.sav'

# import entire dataset
#data_path = '/project/biocomplexity/sdad/projects_data/ncses/prd/RND Topic Modelling/lda_data.sav'

# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# point data_path at files produced by this project.
# The with-block closes the file even if unpickling fails.
with open(data_path, 'rb') as f:
    [corpus, id2word, docs] = pickle.load(f)

# corpus - word frequency in docs
# id2word - dictionary
# docs - lemmatized abstracts


In [3]:
# input needed here is one string per document (not a list of strings),
# so the tokens of each document are joined with single spaces

text = [" ".join(doc) for doc in docs]

In [4]:
# Build the document-term matrix that LDA in Scikit-Learn takes as input.
# The vocabulary is capped at half the number of documents, and the tokens
# are left in whatever case they already have (lowercase=False).

vectorizer = CountVectorizer(
    max_df=0.4,
    min_df=3,
    lowercase=False,
    max_features=len(docs) // 2,
)
doc_term_matrix = vectorizer.fit_transform(text)

In [5]:
# Function to compute multiple models 

#function copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

def model_runs(doc_term_matrix, num_runs, num_topics, n_jobs=39):
    """
    Fit several LDA models that differ only in their random seed.

    Parameters:
    ----------
    doc_term_matrix : document-term matrix each model is fitted on
    num_runs : number of models to create; run i uses random_state=i
    num_topics: number of topics to use in each model
    n_jobs : number of parallel jobs passed to LatentDirichletAllocation
             (defaults to 39, the value this notebook originally hard-coded)

    Returns:
    -------
    model_list : List of fitted LDA topic models, in run order.
                 Nothing is written to disk by this function.
    """

    model_list = []

    for i in range(num_runs):

        lda_model = LatentDirichletAllocation(
            n_components=num_topics,
            doc_topic_prior=1 / num_topics,
            topic_word_prior=0.1,
            n_jobs=n_jobs,
            random_state=i,
        )
        # fit() is enough: the document-topic matrix that fit_transform()
        # would additionally compute was never used.
        lda_model.fit(doc_term_matrix)
        model_list.append(lda_model)

        print(f"Iteration {i} complete.")

    return model_list

In [6]:
# Example call: fit ten 60-topic models on the document-term matrix

models = model_runs(
    doc_term_matrix,
    num_runs=10,
    num_topics=60,
)

Iteration 0 complete.
Iteration 1 complete.
Iteration 2 complete.
Iteration 3 complete.
Iteration 4 complete.
Iteration 5 complete.
Iteration 6 complete.
Iteration 7 complete.
Iteration 8 complete.
Iteration 9 complete.


In [None]:
# Example: read the value scikit-learn stores in bound_ on model 0
# (used here as that model's perplexity)

first_model = models[0]
perplexity = first_model.bound_
print(perplexity)

In [None]:
# Example: pull the term-topic matrix (components_) out of model 0

first_model = models[0]
TT = first_model.components_
print(TT)

In [None]:
# Function to print out topics with terms - no built in for Scikit-Learn!

# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    """
    Print the top_n highest-weighted terms of every topic in a fitted model.

    Parameters:
    ----------
    model : fitted model exposing components_ (one row of term weights per topic)
    vectorizer : fitted vectorizer whose feature names label the columns of
                 components_
    top_n : number of terms to print per topic, highest weight first

    Returns:
    -------
    None. Each topic is printed as a "Topic <idx>:" header followed by one
    (term, weight) tuple per line.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Fetch the vocabulary once instead of once per printed term, and
    # normalise to plain str so the printed tuples look the same either way.
    feature_names = [str(name) for name in get_names()]

    for idx, topic in enumerate(model.components_):  # idx = topic index, topic = row of term weights
        print("\nTopic %d:" % (idx))
        # argsort is ascending, so walk the last top_n indices backwards
        for i in topic.argsort()[:-top_n - 1:-1]:
            print((feature_names[i], topic[i]))
        

In [None]:
# Example: show the 20 highest-weighted words of every topic in model 0

print_topics(
    models[0],
    vectorizer,
    top_n=20,
)

In [7]:
# save list of models
# (kept wrapped in an outer list so the file layout is unchanged; unpack with
#  `[models] = pickle.load(f)`)

# the with-block flushes and closes the file once the dump has finished
with open('all_data_models.sav', 'wb') as model_file:
    pickle.dump([models], model_file)

In [8]:
# number of top words kept per topic, and number of topics per model,
# used when building the output tables below
n_words, n_topics = 30, 60

In [9]:
def topics_to_dataframe(model, vectorizer, top_n=10):
    """
    Collect the top_n highest-weighted terms of every topic in a fitted model.

    Despite the name, this returns nested lists (ready to be passed to
    pd.DataFrame), not a DataFrame.

    Parameters:
    ----------
    model : fitted model exposing components_ (one row of term weights per topic)
    vectorizer : fitted vectorizer whose feature names label the columns of
                 components_
    top_n : number of terms to keep per topic, highest weight first

    Returns:
    -------
    all_topics : list with one entry per topic; each entry is a list of
                 (term, weight) tuples in descending weight order
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Fetch the vocabulary once instead of once per collected term, and
    # normalise to plain str so the tuples hold the same type either way.
    feature_names = [str(name) for name in get_names()]

    all_topics = []
    for topic in model.components_:  # topic = row of term weights
        # argsort is ascending, so walk the last top_n indices backwards
        top_indices = topic.argsort()[:-top_n - 1:-1]
        all_topics.append([(feature_names[i], topic[i]) for i in top_indices])
    return all_topics

In [10]:
#Compare your outputs across all runs (10 in the example above)

# Each model contributes an (n_words x n_topics) table of (term, weight)
# tuples; the tables are stacked with a ('Model i', 'Word j') row index.
# Iterating over `models` itself keeps this cell working when the number of
# runs is changed.
model_df_list=[]
for idx, model in enumerate(models):
    output=topics_to_dataframe(model,vectorizer,top_n=n_words)
    model_df=pd.DataFrame(output,index=[['Topic'+str(x) for x in range(n_topics)]])
    model_df=model_df.T.set_index(pd.MultiIndex.from_tuples([('Model '+str(idx),'Word '+str(x)) for x in range(n_words)]))
    model_df_list.append(model_df)
master_df=pd.concat(model_df_list)

# CHANGE FILE NAME in this line
#master_df.to_csv('Term-Topic Ouputs for Entire Dataset 10 Model 60 Topic Per Model Run.csv')

In [None]:
#Chosen Model Output
# NOTE(review): chosen_model is not assigned anywhere in the visible notebook.
# The 'Model 2' label below suggests models[2] - confirm and assign it before
# running this cell.
output = topics_to_dataframe(chosen_model, vectorizer, top_n=n_words)

# one row per topic, then transpose so each topic becomes a column
topic_labels = ['Topic' + str(x) for x in range(n_topics)]
model_df = pd.DataFrame(output, index=[topic_labels])

# label the word rows as ('Model 2', 'Word j')
word_index = pd.MultiIndex.from_tuples(
    [('Model 2', 'Word ' + str(x)) for x in range(n_words)]
)
model_df = model_df.T.set_index(word_index)

# CHANGE FILE NAME in this line
#model_df.to_csv('OutputOfChosenAllData60Model.csv')

In [None]:
# Attach the document-topic weights of the chosen model to the project metadata.
# NOTE(review): chosen_model is not assigned anywhere in the visible notebook -
# set it (e.g. to one entry of `models`) before running this cell.
output=chosen_model.transform(doc_term_matrix)
# one column per topic; sized from the matrix itself rather than a fixed 60
doc_topic_matrix=pd.DataFrame(output,columns=['Topic_'+str(x) for x in range(output.shape[1])])
admin=pd.read_csv('/project/biocomplexity/sdad/projects_data/ncses/prd/RND Topic Modelling/FRAbstractsProcessed.csv')
#nsf_admin=admin.loc[admin['AGENCY']=='NSF']

#Make sure your rows are identical
# printed explicitly: a bare expression in the middle of a cell shows nothing
rows_match=(admin['lemma_abstracts'].reset_index()['lemma_abstracts']==docs).value_counts()
print(rows_match)

# Build the combined table first, then write it out in a separate step.
final_df=pd.concat([admin[['PROJECT_ID', 'ABSTRACT', 'FY',
        'DEPARTMENT', 'AGENCY', 'IC_CENTER',
       'PROJECT_NUMBER', 'PROJECT_TITLE', 'PROJECT_TERMS',
       'CONTACT_PI_PROJECT_LEADER', 'OTHER_PIS', 'ORGANIZATION_NAME',
       'CFDA_CODE', 'FY_TOTAL_COST', 'working_abstract',
       'tokened_abstracts', 'tokened_docs_nostop', 'tns_bi_tri_docs',
       'lemma_abstracts']].reset_index(),doc_topic_matrix],axis=1)

# CHANGE FILE NAME in this line
#final_df.to_csv('FinalAllDataTopicDataFrame.csv')