## Package up topic model results to make figures and tables

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel

In [2]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(topic_term_mat, vectorizer, top_n=10):
    for idx, topic in enumerate(topic_term_mat):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)

In [3]:
def list_topics(topic_term_mat, vectorizer, top_n=10):

    #input. top_n: how many words to list per topic.  If -1, then list all words.
       
    topic_words = []
    
    for idx, topic in enumerate(topic_term_mat):  # loop through each row of H.  idx = row index.  topic = actual row
            
        if top_n == -1:   
            topic_words.append([vectorizer.get_feature_names()[i] for i in topic.argsort()[::-1]])
        else:
            topic_words.append([vectorizer.get_feature_names()[i] for i in topic.argsort()[:-top_n - 1:-1]])
        
    return topic_words

In [4]:
f = open('../model_selection/coherence_vars.sav', 'rb')
[id2word, docs] = pickle.load(f)
f.close()

In [None]:
# only run these cells to save the topics, except always run middle cell

In [5]:
text = []

for abstract in docs:
    text.append(" ".join(abstract))

In [6]:
# create document-term matrix

stop_wds = ['research', 'study', 'project']  # use will be eliminated by max_df

tfidf_vectorizer = TfidfVectorizer(max_df=0.6, min_df=20, lowercase=False, stop_words=stop_wds)
doc_term_matrix = tfidf_vectorizer.fit_transform(text)

In [9]:
num_topics = 200

with open("/project/biocomplexity/sdad/projects_data/ncses/prd/Tech-Report/nmf_full_200.pkl", "rb") as f:
    res = pickle.load(f)
    
doc_topic = res[0]
topic_term = res[1]

In [8]:
# create list of topics
topics = list_topics(topic_term, tfidf_vectorizer, top_n=10)

In [9]:
# save topics

with open("nmf_full_200_topics.pkl","wb") as f:
    pickle.dump(topics, f)

In [5]:
f = open('nmf_full_50_topics.pkl', 'rb')
topics = pickle.load(f)
f.close()

In [6]:
# calculate topic coherence

t1 = time.time()
cm = CoherenceModel(topics=topics, dictionary=id2word, texts=docs, coherence='c_v', processes=30)  
print(cm.get_coherence())
t2 = time.time()
print(f"  Coherence time: {t2-t1}")

# 50:   0.7164024793963208, time 146
# 100:  0.7140256923571832, time 258
# 150:  0.7089132569879217, time 457
# 200:  0.7014362320832395, time 579

0.7164024793963208
  Coherence time: 145.60491561889648


In [7]:
nmf_output = pd.DataFrame(cm.get_coherence_per_topic(with_std=True))
nmf_output.insert(0, 'topic_words', topics)
nmf_output.columns = ['topic_words', 'coherence_mean', 'coherence_stdev']

In [8]:
nmf_output.head()

Unnamed: 0,topic_words,coherence_mean,coherence_stdev
0,"[new, development, develop, discovery, approac...",0.461889,0.065752
1,"[program, member, department, theme, evaluatio...",0.645525,0.122956
2,"[core, provide, personnel, ppg, ensure, assist...",0.676987,0.105412
3,"[treatment, therapy, effective, efficacy, outc...",0.587592,0.132792
4,"[cancer, nci, pancreatic, ovarian, member, col...",0.678605,0.14211


In [10]:
doc_topic_df = pd.DataFrame(data=doc_topic.copy())
nmf_output["avg_weight_in_corpus"] = doc_topic_df.mean(axis=0)
nmf_output["med_weight_in_corpus"] = doc_topic_df.median(axis=0)

In [11]:
# create a column for the number of documents that contain a topic
doc_topic_bool = pd.DataFrame(data=doc_topic.copy())
doc_topic_bool[doc_topic_bool > 0] = 1 

In [12]:
nmf_output["num_docs_containing_topic"] = doc_topic_bool.sum(axis=0)
nmf_output["percent_docs_containing_topic"] = 100*(nmf_output["num_docs_containing_topic"]/doc_topic.shape[0])

In [13]:
# find the dominant topic per document
max_topic = doc_topic_df.idxmax(axis=1)

In [14]:
doc_topic_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.003327,0.0,0.001449,0.0,0.0,0.0,0.001549,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000538,0.016298,0.0,0.0
1,0.012865,0.0,3.7e-05,0.0,0.0,0.0,0.0,0.0,8e-05,0.0,...,0.0,0.0,0.0,0.000462,0.004259,0.0,0.0,0.0,0.0,0.007231
2,0.003901,0.001176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00867,0.0,0.0,0.0,0.0,0.000164
3,0.001322,0.001154,0.0,0.000191,0.0,0.000145,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000455,0.0,0.0,0.0,0.0,0.0,0.0
4,0.001637,0.001618,3.7e-05,0.0,0.0,0.0,0.0,0.0,0.000112,5e-06,...,0.0,0.0,0.0,0.001055,0.000692,0.002164,0.0,0.000279,0.001866,0.001445


In [15]:
max_topic

0         197
1           0
2          15
3         156
4         145
         ... 
696088    169
696089    176
696090    176
696091    184
696092     11
Length: 696093, dtype: int64

In [16]:
nmf_output["num_times_max_topic"] = max_topic.value_counts()
nmf_output["percent_times_max_topic"] = 100*(nmf_output["num_times_max_topic"]/doc_topic.shape[0])

In [17]:
nmf_output.head()

Unnamed: 0,topic_words,coherence_mean,coherence_stdev,avg_weight_in_corpus,med_weight_in_corpus,num_docs_containing_topic,percent_docs_containing_topic,num_times_max_topic,percent_times_max_topic
0,"[new, development, develop, discovery, approac...",0.461889,0.065752,0.003973,0.000998,447505.0,64.288105,76880.0,11.044501
1,"[program, member, department, theme, evaluatio...",0.645525,0.122956,0.000505,0.0,312199.0,44.850185,2734.0,0.392764
2,"[core, provide, personnel, ppg, ensure, assist...",0.676987,0.105412,0.001229,0.0,266806.0,38.329074,28288.0,4.063825
3,"[treatment, therapy, effective, efficacy, outc...",0.587592,0.132792,0.000321,0.0,325644.0,46.78168,824.0,0.118375
4,"[cancer, nci, pancreatic, ovarian, member, col...",0.678605,0.14211,0.000854,0.0,223328.0,32.083069,14491.0,2.081762


In [18]:
# save to file

nmf_output.to_csv('../CSVs/nmf_200_results.csv', index=False)

### Scratch work

In [20]:
# check if old results equal these results

old_results = pd.read_csv("nmf_df.csv")   
# this file is in emerging_topics/emerging_topics/topic_model_tuning/results

In [21]:
nmf_output.equals(old_results)

False

In [27]:
nmf_output[nmf_output['coherence_mean'] != old_results['coherence_mean']]

Unnamed: 0,topic_words,coherence_mean,coherence_stdev,avg_weight_in_corpus,med_weight_in_corpus,num_docs_containing_topic,percent_docs_containing_topic,num_times_max_topic,percent_times_max_topic
6,"[administrative, scientific, meeting, coordina...",0.812359,0.063807,0.001109,0.0,282238.0,40.85586,14409,2.0858
9,"[protein, membrane, bind, interaction, structu...",0.634499,0.08634,0.001781,0.0,313880.0,45.436253,30280,4.383235
30,"[breast, cancer, er, metastasis, estrogen, her...",0.837503,0.06218,0.000592,0.0,183161.0,26.513794,7475,1.082057
38,"[dr, director, career, mentor, award, independ...",0.781606,0.092007,0.000797,0.0,221631.0,32.082587,8628,1.248961
45,"[stem, hsc, hematopoietic, progenitor, college...",0.456995,0.204592,0.000794,0.0,222035.0,32.141068,10000,1.447568
57,"[animal, human, testing, contract, product, sm...",0.400554,0.080745,0.001066,0.0,295836.0,42.824262,9467,1.370412


In [29]:
nmf_output.iloc[6,:]

topic_words                      [administrative, scientific, meeting, coordina...
coherence_mean                                                            0.812359
coherence_stdev                                                          0.0638069
avg_weight_in_corpus                                                    0.00110878
med_weight_in_corpus                                                             0
num_docs_containing_topic                                                   282238
percent_docs_containing_topic                                              40.8559
num_times_max_topic                                                          14409
percent_times_max_topic                                                     2.0858
Name: 6, dtype: object

In [30]:
old_results.iloc[6,:]

topic_words                      ['administrative', 'scientific', 'meeting', 'c...
coherence_mean                                                            0.812359
coherence_stdev                                                          0.0638069
avg_weight_in_corpus                                                    0.00110878
med_weight_in_corpus                                                             0
num_docs_containing_topic                                                   282238
percent_docs_containing_topic                                              40.8559
num_times_max_topic                                                          14409
percent_times_max_topic                                                     2.0858
Name: 6, dtype: object

In [31]:
# getting false bc I was comparing to just calculated dataframe, not saved csv file
# must have been something about rounding

new_results = pd.read_csv("full_nmf_results.csv")

In [32]:
new_results.equals(old_results)

True