In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel



In [2]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(topic_term_mat, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted (word, weight) pairs of every topic.

    Parameters
    ----------
    topic_term_mat : iterable of 1-D arrays
        One row per topic; each row must support ``argsort()`` and integer
        indexing (e.g. the rows of a 2-D numpy array).
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        legacy ``get_feature_names()``; entry ``i`` is the word for column ``i``.
    top_n : int, default 10
        Number of words printed per topic.

    Returns
    -------
    None. Output goes to stdout.
    """
    # Fetch the vocabulary once: the vectorizer rebuilds it on every call, so
    # calling it per printed word is needlessly expensive.  The legacy accessor
    # was removed in scikit-learn 1.2, hence the preference for the new one.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(topic_term_mat):
        print("\nTopic %d:" % (idx))
        # argsort is ascending; the reversed slice walks the top_n largest weights.
        for i in topic.argsort()[:-top_n - 1:-1]:
            # str() keeps the printed form a plain string even when the
            # vocabulary comes back as a numpy string array.
            print((str(feature_names[i]), topic[i]))

In [3]:
def list_topics(topic_term_mat, vectorizer, top_n=10):
    """Return, for every topic, its words ordered from highest to lowest weight.

    Parameters
    ----------
    topic_term_mat : iterable of 1-D arrays
        One row per topic; each row must support ``argsort()``.
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        legacy ``get_feature_names()``; entry ``i`` is the word for column ``i``.
    top_n : int, default 10
        How many words to list per topic.  If -1, list all words.

    Returns
    -------
    list of list of str
        One inner list per topic, in the order the topics were given.
    """
    # Fetch the vocabulary once instead of once per word: the vectorizer
    # rebuilds it on every call.  The legacy accessor was removed in
    # scikit-learn 1.2, so the new one is preferred when available.
    if hasattr(vectorizer, "get_feature_names_out"):
        # Convert numpy strings to plain str so the result matches what the
        # legacy list-returning accessor produced.
        feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
    else:
        feature_names = vectorizer.get_feature_names()

    topic_words = []

    for topic in topic_term_mat:
        if top_n == -1:
            # Every word, largest weight first.
            ranked = topic.argsort()[::-1]
        else:
            # Only the top_n largest weights, largest first.
            ranked = topic.argsort()[:-top_n - 1:-1]
        topic_words.append([feature_names[i] for i in ranked])

    return topic_words

In [4]:
# Load the inputs needed for the coherence calculation: the file holds a
# three-element sequence unpacked as corpus, id2word and docs.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# on files produced by this project.
with open('full_coherence_vars.sav', 'rb') as f:
    [corpus, id2word, docs] = pickle.load(f)

In [5]:
# Re-assemble each tokenised abstract into one space-separated string.
text = [" ".join(tokens) for tokens in docs]

In [6]:
# Build the TF-IDF document-term matrix from the joined abstracts.
# The vocabulary is capped at half the number of documents.
vocab_cap = int(len(docs)/2)
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.6,
    min_df=20,
    lowercase=False,
    max_features=vocab_cap,
)
tf_idf = tfidf_vectorizer.fit_transform(text)

In [7]:
num_topics = 75

# Load the saved model: the pickle holds the pair (doc_topic, topic_term).
# The context manager guarantees the file handle is closed; the bare
# pickle.load(open(...)) form never closed it.
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# on files produced by this project.
with open('../../../data/prd/Paper-ET/final_model_DT_TT.sav', 'rb') as model_file:
    doc_topic, topic_term = pickle.load(model_file)

In [8]:
# Topic coherence (c_v) of the loaded model, timed end to end.

# Top-10 words of every topic, as required by CoherenceModel.
topics = list_topics(topic_term, tfidf_vectorizer, top_n=10)

t1 = time.time()
cm = CoherenceModel(
    topics=topics,
    corpus=corpus,
    dictionary=id2word,
    texts=docs,
    coherence='c_v',
    processes=8,
)  # window_size=500 was tried and left disabled
overall_coherence = cm.get_coherence()
print(overall_coherence)
t2 = time.time()
elapsed = t2-t1
print(f"  Coherence time: {elapsed}")

0.745157970957045
  Coherence time: 317.56836891174316


In [None]:
#topics_5 = list_topics(topic_term, tfidf_vectorizer, top_n=5)

In [9]:
# One row per topic: its top words plus the two values returned per topic by
# get_coherence_per_topic(with_std=True), stored as coherence mean and stdev.
per_topic_coherence = cm.get_coherence_per_topic(with_std=True)
nmf_output = pd.DataFrame(per_topic_coherence)
nmf_output.columns = ['coherence_mean', 'coherence_stdev']
nmf_output.insert(0, 'topic_words', topics)

In [10]:
# Preview the first five topics.
nmf_output.head(5)

Unnamed: 0,topic_words,coherence_mean,coherence_stdev
0,"[cell, differentiation, cellular, antigen, cul...",0.615038,0.108656
1,"[student, undergraduate, graduate, college, fa...",0.881729,0.051303
2,"[cancer, nci, pancreatic, member, ovarian, pre...",0.662256,0.151528
3,"[hiv, aids, infect, prevention, cfar, antiretr...",0.770968,0.148324
4,"[infection, infect, immune, viral, hpv, hcv, p...",0.713201,0.154076


In [11]:
# Wrap a copy of the document-topic matrix in a DataFrame, then record each
# topic's mean and median weight taken down the rows (axis=0).
doc_topic_df = pd.DataFrame(doc_topic.copy())
topic_weight_mean = doc_topic_df.mean(axis=0)
topic_weight_median = doc_topic_df.median(axis=0)
nmf_output["avg_weight_in_corpus"] = topic_weight_mean
nmf_output["med_weight_in_corpus"] = topic_weight_median

In [12]:
# Indicator version of the document-topic matrix: every strictly positive
# weight becomes 1, everything else is left as it was.  Used to count the
# documents that contain each topic.
doc_topic_bool = pd.DataFrame(doc_topic.copy())
doc_topic_bool = doc_topic_bool.mask(doc_topic_bool > 0, 1)

In [13]:
# Per-topic document count (column sums of the indicator frame) and the same
# figure as a percentage of the rows in doc_topic.
docs_per_topic = doc_topic_bool.sum(axis=0)
total_docs = doc_topic.shape[0]
nmf_output["num_docs_containing_topic"] = docs_per_topic
nmf_output["percent_docs_containing_topic"] = 100*(nmf_output["num_docs_containing_topic"]/total_docs)

In [14]:
# Dominant topic of each document: the column label holding the row maximum.
max_topic = doc_topic_df.idxmax(axis="columns")

In [16]:
# Preview the document-topic weights of the first five documents.
doc_topic_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
0,0.0,0.0,0.000171,0.0,0.0,0.001167,0.0,0.0,0.0,0.001698,...,0.0,0.000771,0.0,0.0,3.7e-05,0.0,0.000669,0.0,0.000767,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000646,0.001446,0.0,0.014818,0.000425,...,0.0,0.0,0.001405,0.0,0.0,0.0,0.007767,0.0,0.0,0.000727
2,0.0,0.0,9.5e-05,0.0,0.0,0.0,0.000391,0.002431,0.0,0.003938,...,0.0,0.0,3.9e-05,0.0,0.000689,0.000346,0.01871,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000172,0.0,0.0,0.0,0.0,...,0.0,0.0,0.02959,0.043594,0.0,0.000389,0.0,0.0,0.0,0.0
4,2.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0016,3.4e-05,...,0.0,0.00042,0.0,0.0,0.00033,0.0,0.000169,0.0,0.0,0.000423


In [15]:
# Display the dominant-topic label computed for every document.
max_topic

0         16
1          8
2         71
3         68
4         19
          ..
690809    71
690810    16
690811    61
690812    27
690813    46
Length: 690814, dtype: int64

In [17]:
# Count how often each topic is the dominant topic of a document.
# value_counts() only lists topics that occur at least once, so the counts are
# reindexed against the topic rows with fill_value=0: a topic that is never
# dominant is reported as 0 instead of NaN, and the column stays integer.
dominant_counts = max_topic.value_counts().reindex(nmf_output.index, fill_value=0)
nmf_output["num_times_max_topic"] = dominant_counts
nmf_output["percent_times_max_topic"] = 100*(nmf_output["num_times_max_topic"]/doc_topic.shape[0])

In [18]:
# Preview the first five topics with all summary columns attached.
nmf_output.head(5)

Unnamed: 0,topic_words,coherence_mean,coherence_stdev,avg_weight_in_corpus,med_weight_in_corpus,num_docs_containing_topic,percent_docs_containing_topic,num_times_max_topic,percent_times_max_topic
0,"[cell, differentiation, cellular, antigen, cul...",0.615038,0.108656,0.000502,1.968816e-07,346034.0,50.090762,767,0.111028
1,"[student, undergraduate, graduate, college, fa...",0.881729,0.051303,0.001211,0.0,217644.0,31.505441,17352,2.511819
2,"[cancer, nci, pancreatic, member, ovarian, pre...",0.662256,0.151528,0.0018,0.0,251421.0,36.394891,24706,3.576361
3,"[hiv, aids, infect, prevention, cfar, antiretr...",0.770968,0.148324,0.000582,0.0,216121.0,31.284977,9563,1.384309
4,"[infection, infect, immune, viral, hpv, hcv, p...",0.713201,0.154076,0.001173,0.0,236715.0,34.266098,15936,2.306844


In [19]:
# Save the per-topic summary table to CSV, without the row index.
# (An earlier variant pickled doc_topic, topic_term and nmf_output together;
# it is kept below, disabled.)
#pickle.dump([doc_topic, topic_term, nmf_output], open('nmf_tuning/full/nmf_100.sav','wb'))

nmf_output.to_csv('full_nmf_results.csv', index=False)

In [20]:
# Check whether the previously saved results equal the results computed above.

old_results = pd.read_csv("nmf_df.csv")   
# nmf_df.csv lives in emerging_topics/emerging_topics/topic_model_tuning/results

In [21]:
# Strict comparison of the in-memory table against the table read from CSV.
nmf_output.equals(old_results)

False

In [27]:
# Rows whose coherence_mean is not exactly equal between the two tables.
coherence_differs = nmf_output['coherence_mean'] != old_results['coherence_mean']
nmf_output[coherence_differs]

Unnamed: 0,topic_words,coherence_mean,coherence_stdev,avg_weight_in_corpus,med_weight_in_corpus,num_docs_containing_topic,percent_docs_containing_topic,num_times_max_topic,percent_times_max_topic
6,"[administrative, scientific, meeting, coordina...",0.812359,0.063807,0.001109,0.0,282238.0,40.85586,14409,2.0858
9,"[protein, membrane, bind, interaction, structu...",0.634499,0.08634,0.001781,0.0,313880.0,45.436253,30280,4.383235
30,"[breast, cancer, er, metastasis, estrogen, her...",0.837503,0.06218,0.000592,0.0,183161.0,26.513794,7475,1.082057
38,"[dr, director, career, mentor, award, independ...",0.781606,0.092007,0.000797,0.0,221631.0,32.082587,8628,1.248961
45,"[stem, hsc, hematopoietic, progenitor, college...",0.456995,0.204592,0.000794,0.0,222035.0,32.141068,10000,1.447568
57,"[animal, human, testing, contract, product, sm...",0.400554,0.080745,0.001066,0.0,295836.0,42.824262,9467,1.370412


In [29]:
# Inspect one of the mismatching rows (position 6) in the freshly computed table.
nmf_output.iloc[6]

topic_words                      [administrative, scientific, meeting, coordina...
coherence_mean                                                            0.812359
coherence_stdev                                                          0.0638069
avg_weight_in_corpus                                                    0.00110878
med_weight_in_corpus                                                             0
num_docs_containing_topic                                                   282238
percent_docs_containing_topic                                              40.8559
num_times_max_topic                                                          14409
percent_times_max_topic                                                     2.0858
Name: 6, dtype: object

In [30]:
# Same row (position 6) in the table loaded from the old CSV, for comparison.
old_results.iloc[6]

topic_words                      ['administrative', 'scientific', 'meeting', 'c...
coherence_mean                                                            0.812359
coherence_stdev                                                          0.0638069
avg_weight_in_corpus                                                    0.00110878
med_weight_in_corpus                                                             0
num_docs_containing_topic                                                   282238
percent_docs_containing_topic                                              40.8559
num_times_max_topic                                                          14409
percent_times_max_topic                                                     2.0858
Name: 6, dtype: object

In [31]:
# The earlier equals() check returned False because it compared the in-memory
# DataFrame with a DataFrame read back from CSV.  The row-6 displays above show
# topic_words as a list in memory but as a string after the CSV round trip, and
# six coherence_mean values are not exactly equal even though they print the
# same -- presumably float rounding when written to text.
# Reading the freshly saved CSV back gives a like-for-like comparison.

new_results = pd.read_csv("full_nmf_results.csv")

In [32]:
# Like-for-like check: both tables were loaded from CSV files.
new_results.equals(old_results)

True