# Evaluation of Topic Models

- Topic Coherence
- Perplexity
- Others?

**Goal of this notebook**:  explore measures to compare runs from multiple algorithms.  Currently perplexity is only available for LDA in Scikit-Learn.  

First models are for NSF data.

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel

In [2]:
# import NSF data
f = open('/project/biocomplexity/sdad/projects_data/ncses/prd/RND Topic Modelling/agency_data.sav', 'rb')

# import entire dataset
#f = open('/project/biocomplexity/sdad/projects_data/ncses/prd/RND Topic Modelling/lda_data.sav', 'rb')

[corpus, id2word, docs] = pickle.load(f)
f.close()

# corpus - word frequency in docs
# id2word - dictionary
# docs - lemmatized abstracts


In [3]:
len(docs)

116475

In [3]:
# input needed here is one string per document (not a list of strings)

text = []
i=0
for doc in docs:
    text.append(" ".join(doc))

## Topic Printing and Listing Functions - needed for all three models

In [4]:
# Function to print out topics with terms - no built in for Scikit-Learn!

# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)

In [5]:
# Function to format topics as a "list of list of strings".
# Needed for topic coherence function in Gensim

# function slightly modified from https://nlpforhackers.io/topic-modeling/

def list_topics(model, vectorizer, top_n=10):

    #input. top_n: how many words to list per topic.  If -1, then list all words.
       
    topic_words = []
    
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        #print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        if top_n == -1:   
            topic_words.append([vectorizer.get_feature_names()[i] for i in topic.argsort()[::-1]])
        else:
            topic_words.append([vectorizer.get_feature_names()[i] for i in topic.argsort()[:-top_n - 1:-1]])
        
    return topic_words

## LDA

In [6]:
# form document-term matrix -- input for LDA in Scikit-Learn

vectorizer = CountVectorizer(max_df=0.4, min_df=3, lowercase=False, max_features=int(len(docs)/2))
doc_term_matrix = vectorizer.fit_transform(text)

In [9]:
num_topics = 50
t1 = time.time()
lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=39, random_state=2) #learning_method='online')
#DT = lda_model.fit_transform(doc_term_matrix)
lda_model.fit_transform(doc_term_matrix)
t2 = time.time()
print('Time = ', t2-t1, 'seconds' )

#TT = lda_model.components_

Time =  96.52559804916382 seconds


In [10]:
print_topics(lda_model, vectorizer, 10)


Topic 0:
('wave', 8430.75655937394)
('host', 4022.9838203421195)
('medium', 1225.4724975815918)
('parasite', 1160.7810339103241)
('acoustic', 732.8117014885498)
('frequency', 638.5521107146782)
('propagation', 630.2416987345889)
('polar', 552.4681662472997)
('mode', 490.5551026465545)
('interaction', 488.9103840558102)

Topic 1:
('technology', 9409.670045098295)
('product', 6248.1461874526985)
('market', 6001.5339181669515)
('patient', 5175.268519689446)
('commercial', 4874.121965670093)
('cost', 4411.816779502856)
('innovation', 4350.580745646132)
('potential', 4225.295212364635)
('industry', 3924.8584842104765)
('health', 3658.4018390088063)

Topic 2:
('earthquake', 4406.556025566838)
('fault', 3795.3624557365915)
('seismic', 2986.8754880038773)
('model', 2604.9741107216623)
('data', 2512.7426955603455)
('deformation', 2386.103607915076)
('process', 1952.9000622191859)
('rock', 1924.6525161103375)
('structure', 1886.9445225289883)
('earth', 1754.6417240622773)

Topic 3:
('specie', 1

In [11]:
topics = list_topics(lda_model, vectorizer, top_n=10)

In [14]:
f = open('topics.sav','wb')
pickle.dump(topics, f)
f.close()

In [12]:
topics[0]

['wave',
 'host',
 'medium',
 'parasite',
 'acoustic',
 'frequency',
 'propagation',
 'polar',
 'mode',
 'interaction']

In [87]:
(TT>0).sum(axis=1)

array([271716, 271716, 271716, 271716, 271716, 271716, 271716, 271716,
       271716, 271716])

In [77]:
TT.shape

(10, 271716)

In [13]:
cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, coherence='u_mass')
coherence = cm.get_coherence() 
print(coherence)

-1.748989243938206


In [None]:
import gc

gc.collect()

In [14]:
# for entire dataset: processes must be a small number!

cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, 
                    texts=docs, coherence='c_v', processes=10) #window_size=500 )

In [15]:
coherence = cm.get_coherence() 
print(coherence)

0.5984957295711106


In [20]:
lda_tc = cm.get_coherence_per_topic(with_std=True)

In [22]:
lda_tc

[(0.4210330519119633, 0.22525414910034453),
 (0.6518845877835282, 0.11608243299443773),
 (0.615046411548617, 0.25821236545561294),
 (0.7009327407028131, 0.15244618089086018),
 (0.5556662072639944, 0.13300863776508642),
 (0.5548356155440098, 0.15557539273899895),
 (0.7389899688134622, 0.11571763893348984),
 (0.78185255646231, 0.10963627650031547),
 (0.46518196778544885, 0.14488363388407768),
 (0.6567268044778274, 0.13276915084437688),
 (0.44464320960933207, 0.09368781924681799),
 (0.5802284867512265, 0.13089178977068083),
 (0.4973552412798278, 0.10122116330181088),
 (0.5133676335693017, 0.13448984259990615),
 (0.705336727326605, 0.19027041265632386),
 (0.5527482636949104, 0.1892332277081186),
 (0.6604118943434462, 0.17350454974606186),
 (0.544991238571743, 0.15984803183772858),
 (0.6002753292151164, 0.19709621769879598),
 (0.7562641352493371, 0.09334286513241277),
 (0.5927843250994732, 0.11312353578636161),
 (0.6123940679982219, 0.18359431350514208),
 (0.5704319536778433, 0.115019885267

In [24]:
lda_df = pd.DataFrame(data=lda_tc, columns=['mean', 'std'])

In [26]:
lda_df.head(50)

Unnamed: 0,mean,std
0,0.421033,0.225254
1,0.651885,0.116082
2,0.615046,0.258212
3,0.700933,0.152446
4,0.555666,0.133009
5,0.554836,0.155575
6,0.73899,0.115718
7,0.781853,0.109636
8,0.465182,0.144884
9,0.656727,0.132769


In [12]:
topics[3]

['system',
 'model',
 'data',
 'high',
 'method',
 'technology',
 'material',
 'process',
 'base',
 'design']

In [23]:
type(lda_tc)

list

In [66]:
topics[7]

['gene',
 'genetic',
 'evolution',
 'genome',
 'specie',
 'population',
 'sequence',
 'evolutionary',
 'animal',
 'biology']

## NMF

In [27]:
# Form TF-IDF matrix.  Will use with NMF and LSA

vectorizer = TfidfVectorizer(max_df=0.4, min_df=3, lowercase=False, max_features=int(len(docs)/2))
tf_idf = vectorizer.fit_transform(text)

In [28]:
nmf_model = NMF(n_components=50, random_state=1)
#W = nmf_model.fit_transform(tf_idf)
nmf_model.fit_transform(tf_idf)
#H = nmf_model.components_

array([[2.31148618e-04, 0.00000000e+00, 3.10981702e-03, ...,
        0.00000000e+00, 0.00000000e+00, 3.58627479e-02],
       [0.00000000e+00, 2.83410807e-03, 1.10264315e-04, ...,
        0.00000000e+00, 0.00000000e+00, 9.70277624e-03],
       [0.00000000e+00, 8.00771596e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [5.12180734e-04, 2.17622514e-05, 0.00000000e+00, ...,
        9.20378500e-03, 0.00000000e+00, 0.00000000e+00],
       [5.12180734e-04, 2.17622514e-05, 0.00000000e+00, ...,
        9.20378500e-03, 0.00000000e+00, 0.00000000e+00],
       [4.28283464e-04, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 2.12999754e-03, 1.44793436e-02]])

In [29]:
print_topics(nmf_model, vectorizer, 10)


Topic 0:
('model', 16.05446368893237)
('method', 2.7201995131636907)
('simulation', 2.6793624568239665)
('computational', 2.241460761178497)
('statistical', 1.9764766067627637)
('prediction', 1.6331623018904071)
('uncertainty', 1.5296241710535188)
('dynamic', 1.5277426879227634)
('framework', 1.457642420611006)
('stochastic', 1.417125038993424)

Topic 1:
('program', 3.4693045296520206)
('mentor', 1.3673756134047952)
('reu', 1.3238883710060116)
('faculty', 1.2685912130822892)
('summer', 1.1665853959067034)
('experience', 1.1530347234791287)
('undergraduate', 1.093823597746579)
('graduate', 1.0137791604653785)
('career', 0.990856020890776)
('participant', 0.8791164884224822)

Topic 2:
('structure', 2.5607007524981777)
('property', 2.4935321666421486)
('temperature', 2.080221253155489)
('phase', 1.4993035423499799)
('thermal', 1.1759348452755456)
('mechanical', 1.091639738502639)
('crystal', 0.9910397597867902)
('metal', 0.9605923834038561)
('structural', 0.9394360001128006)
('experiment

In [30]:
topics = list_topics(nmf_model, vectorizer, top_n=10)

In [11]:
topics[1]

['program',
 'mentor',
 'reu',
 'faculty',
 'summer',
 'experience',
 'undergraduate',
 'graduate',
 'career',
 'participant']

In [24]:
sum(H[10,:] > 0)

11177

In [17]:
H.shape

(50, 58237)

In [19]:
H[0,:]

array([0., 0., 0., ..., 0., 0., 0.])

In [31]:
cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, coherence='u_mass')
coherence = cm.get_coherence() 
print(coherence)

-2.0738713033333287


In [32]:
cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, texts=docs, coherence='c_v', processes=10)
coherence = cm.get_coherence() 
nmf_tc = cm.get_coherence_per_topic(with_std=True)
print(coherence)

0.6840579935749517


In [33]:
nmf_tc

[(0.607004267224564, 0.07738405301527852),
 (0.7700023359720949, 0.062456232399410276),
 (0.6401823154637711, 0.07140525222302344),
 (0.6009820399337429, 0.10560702278480959),
 (0.6317835320933538, 0.1863920353145234),
 (0.722321186783914, 0.12927114293423372),
 (0.6106368266304704, 0.2425044020182997),
 (0.5547658247736265, 0.11206163552060996),
 (0.6562971310057006, 0.17317638355309162),
 (0.5751121197686737, 0.18752716002932146),
 (0.7282892096098073, 0.1479758822328572),
 (0.8322658698565062, 0.042173961823926354),
 (0.6149540253382421, 0.1229365095272426),
 (0.7230415831625151, 0.132239011080671),
 (0.660530421813334, 0.10699703573381461),
 (0.6418431848467365, 0.12055317826741992),
 (0.6784809880022932, 0.2003810071500224),
 (0.630482967552058, 0.175371500764266),
 (0.6642997978146651, 0.0941135866230718),
 (0.7103078142240419, 0.1612024798209975),
 (0.8795580804979114, 0.09475609293150544),
 (0.5547021781154505, 0.12011015572820084),
 (0.6276678915551994, 0.13918163106609732),
 

In [29]:
topics[7]

['data',
 'analysis',
 'big',
 'set',
 'statistical',
 'collection',
 'information',
 'collect',
 'large',
 'analytics']

In [34]:
nmf_df = pd.DataFrame(data=nmf_tc, columns=['mean', 'std'])
nmf_df.head(50)

Unnamed: 0,mean,std
0,0.607004,0.077384
1,0.770002,0.062456
2,0.640182,0.071405
3,0.600982,0.105607
4,0.631784,0.186392
5,0.722321,0.129271
6,0.610637,0.242504
7,0.554766,0.112062
8,0.656297,0.173176
9,0.575112,0.187527


## LSA

In [35]:
lsa_model = TruncatedSVD(n_components=50, random_state=1)
#USigma = lsa_model.fit_transform(tf_idf)
lsa_model.fit_transform(tf_idf)
#Vtrans = lsa_model.components_

array([[ 0.13862687,  0.02058711,  0.0526887 , ...,  0.02702883,
         0.07541054, -0.10325192],
       [ 0.15860988,  0.1029298 ,  0.00306609, ...,  0.00204563,
        -0.04625063, -0.00603495],
       [ 0.14190456,  0.08151303,  0.02565089, ..., -0.05267847,
        -0.07953083,  0.04538653],
       ...,
       [ 0.11031198, -0.02789218,  0.02773367, ...,  0.00169037,
         0.03053921,  0.02707969],
       [ 0.11031198, -0.02789218,  0.02773367, ...,  0.00169037,
         0.03053921,  0.02707969],
       [ 0.15194123, -0.08520254,  0.02189632, ..., -0.01272856,
        -0.0488692 , -0.00941058]])

In [36]:
print_topics(lsa_model, vectorizer, 10)


Topic 0:
('data', 0.145652629487576)
('program', 0.13499599121685202)
('model', 0.13265989158282399)
('material', 0.12252444509732707)
('design', 0.10289680546741051)
('engineering', 0.10207318111029892)
('network', 0.09571692644078954)
('stem', 0.09471066747288234)
('development', 0.09271040764895379)
('support', 0.09114858339312853)

Topic 1:
('stem', 0.36007632761073555)
('program', 0.20673670924338566)
('faculty', 0.15494476628843465)
('teacher', 0.15454036648528788)
('learn', 0.13706952134295672)
('conference', 0.13524328738270316)
('college', 0.13103633642321802)
('engineering', 0.1259113453483039)
('education', 0.12580533116903828)
('mathematics', 0.11234645368344676)

Topic 2:
('data', 0.3503536785878687)
('network', 0.22507763334146202)
('social', 0.12369534833342784)
('model', 0.11338905264877172)
('problem', 0.09904563505062365)
('analysis', 0.08719365134312564)
('information', 0.07950772945045155)
('software', 0.07282762328339822)
('user', 0.07164126323703136)
('human', 0.

In [37]:
topics = list_topics(lsa_model, vectorizer, top_n=10)

In [37]:
topics[20]

['protein',
 'language',
 'material',
 'water',
 'software',
 'membrane',
 'plant',
 'climate',
 'compute',
 'stem']

In [46]:
sum(Vtrans[1,:] > 0)

15995

In [39]:
Vtrans.shape

(50, 58237)

In [45]:
Vtrans[1,:]

array([ 1.25013311e-03,  8.47914240e-04,  4.63844143e-04, ...,
       -6.54274889e-05, -7.83274930e-05,  2.60961059e-05])

In [38]:
cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, coherence='u_mass')
coherence = cm.get_coherence() 
print(coherence)

-2.558805018287795


In [41]:
cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, texts=docs, coherence='c_v', processes=10)
coherence = cm.get_coherence() 
lsa_tc = cm.get_coherence_per_topic(with_std=True)
print(coherence)

0.34528831567226953


In [42]:
lsa_tc

[(0.3931229227560275, 0.15388175889292238),
 (0.6890757792132984, 0.08742046198525513),
 (0.4496437658525756, 0.08183901899242811),
 (0.5826556686378178, 0.22588898101864252),
 (0.48686328361607867, 0.0857188443953836),
 (0.4917682960846722, 0.2603754379804126),
 (0.6979759111622111, 0.20837837722417502),
 (0.5936312272088102, 0.3269263165077529),
 (0.3017989154804355, 0.10372033235835307),
 (0.32559006122189454, 0.23628902575640648),
 (0.2691032433136168, 0.09933707383990029),
 (0.2521931748113457, 0.20506949360048965),
 (0.3496271930414351, 0.1554230026881287),
 (0.5620607831801789, 0.2732838506520157),
 (0.45497437342450614, 0.1857942229834313),
 (0.3079824578885796, 0.176463714455978),
 (0.3176325118553537, 0.20028984162413563),
 (0.2905622032240114, 0.13171071642436105),
 (0.28786068727180797, 0.21563485151307088),
 (0.33193576638574573, 0.3730329137432797),
 (0.2140451826598396, 0.1993238720048897),
 (0.23750328011663804, 0.12468991207926079),
 (0.2574932882164469, 0.129685780180

In [51]:
topics[6]

['cell',
 'protein',
 'network',
 'gene',
 'biology',
 'plant',
 'cellular',
 'signal',
 'molecular',
 'dna']

In [43]:
lsa_df = pd.DataFrame(data=lsa_tc, columns=['mean', 'std'])
lsa_df.head(50)

Unnamed: 0,mean,std
0,0.393123,0.153882
1,0.689076,0.08742
2,0.449644,0.081839
3,0.582656,0.225889
4,0.486863,0.085719
5,0.491768,0.260375
6,0.697976,0.208378
7,0.593631,0.326926
8,0.301799,0.10372
9,0.32559,0.236289


# Results to CSV

In [45]:
# code from https://xlsxwriter.readthedocs.io/example_pandas_multiple.html

# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('coherence.xlsx', engine='xlsxwriter')

# Write each dataframe to a different worksheet.
lda_df.to_excel(writer, sheet_name='LDA')
nmf_df.to_excel(writer, sheet_name='NMF')
lsa_df.to_excel(writer, sheet_name='LSA')

# Close the Pandas Excel writer and output the Excel file.
writer.save()