# Tune Number of Topics for each model - LDA, NMF, LSA

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel

### Data Ingestion

In [2]:
# import NSF data
#f = open('../../data/prd/RND Topic Modelling/nsf_stanford_lemma.sav', 'rb')

# import entire dataset
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# files that come from a trusted source.
# The `with` block closes the file even if unpickling raises.
with open('../../data/prd/RND Topic Modelling/lda_data_stanford_lemma.sav', 'rb') as f:
    corpus, id2word, docs = pickle.load(f)

# corpus - word frequency in docs
# id2word - dictionary
# docs - lemmatized abstracts

In [3]:
# input needed for LDA, NMF and LSA (all from Scikit-Learn) is one string per document (not a list of strings)

# collapse every token list into a single space-separated string
text = [" ".join(tokens) for tokens in docs]

### Functions needed for all models

In [4]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    """
    Print the highest-weighted words of every topic of a fitted model.

    Parameters:
    ----------
    model : fitted decomposition model exposing components_ (one row per topic)
    vectorizer : fitted vectorizer whose feature names label the columns of components_
    top_n : how many words to print per topic

    Output:
    -------
    For each topic a "Topic <idx>:" header followed by one (word, weight) tuple per line,
    ordered from largest to smallest weight.
    """
    # Fetch the vocabulary ONCE.  Calling the vectorizer inside the loops rebuilt the
    # whole feature list for every printed word.  get_feature_names_out() is preferred
    # when available because get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(vectorizer, "get_feature_names_out"):
        raw_names = vectorizer.get_feature_names_out()
    else:
        raw_names = vectorizer.get_feature_names()
    # plain str keeps the printed tuples identical regardless of which accessor was used
    feature_names = [str(name) for name in raw_names]

    for idx, topic in enumerate(model.components_):  # idx = row index, topic = row of weights
        print("\nTopic %d:" % (idx))
        # argsort is ascending; the reversed slice yields the top_n largest weights first
        for i in topic.argsort()[:-top_n - 1:-1]:
            print((feature_names[i], topic[i]))
        

In [5]:
# Function to format topics as a "list of list of strings".
# Needed for topic coherence function in Gensim

# function modified from https://nlpforhackers.io/topic-modeling/

def list_topics(model, vectorizer, top_n=10):
    """
    Return the top words of every topic as a list of lists of strings
    (the format expected by the `topics` argument of Gensim's CoherenceModel).

    Parameters:
    ----------
    model : fitted decomposition model exposing components_ (one row per topic)
    vectorizer : fitted vectorizer whose feature names label the columns of components_
    top_n : how many words to list per topic.  If -1, then list all words.

    Returns:
    -------
    topic_words : one list per topic, words ordered from largest to smallest weight
    """
    # Fetch the vocabulary ONCE.  Calling the vectorizer inside the comprehension rebuilt
    # the whole feature list for every selected word.  get_feature_names_out() is
    # preferred when available because get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(vectorizer, "get_feature_names_out"):
        raw_names = vectorizer.get_feature_names_out()
    else:
        raw_names = vectorizer.get_feature_names()
    # plain str so the returned tokens have the same type whichever accessor was used
    feature_names = [str(name) for name in raw_names]

    topic_words = []

    for topic in model.components_:  # topic = one row of word weights
        ranked = topic.argsort()  # ascending order of weight
        if top_n == -1:
            selected = ranked[::-1]  # every word, heaviest first
        else:
            selected = ranked[:-top_n - 1:-1]  # the top_n heaviest words
        topic_words.append([feature_names[i] for i in selected])

    return topic_words

### LDA

In [6]:
# create document-term matrix

# Raw term counts for LDA.  Case is left untouched, terms must occur in at least
# 3 documents and in at most 40% of them, and the vocabulary is capped at half
# the number of documents.
vectorizer = CountVectorizer(
    lowercase=False,
    min_df=3,
    max_df=0.4,
    max_features=int(len(docs)/2),
)
doc_term_matrix = vectorizer.fit_transform(text)

In [7]:
# function adapted from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

def lda_metrics(doc_term_matrix, n_topics, vectorizer, corpus, id2word, docs, rand_start):
    """
    Compute perplexity and c_v topic coherence for various number of topics

    Parameters:
    ----------
    doc_term_matrix : document-term matrix every LDA model is fitted on
    n_topics : list of number of topics; one model is fitted per entry
    vectorizer : fitted vectorizer, handed to list_topics to turn columns into words
    corpus, id2word, docs : forwarded to Gensim's CoherenceModel as
                            corpus, dictionary and texts respectively
    rand_start : random_state of the first model; each following model uses the next integer

    Returns:
    -------
    perplexity_values : bound_ attribute of each fitted LDA model (scikit-learn documents it
                        as the final perplexity score on the training set)
    coherence_values : c_v topic coherence values corresponding to the LDA model with respective number of topics
    """

    perplexity_values = []
    coherence_values = []

    # seeds run rand_start, rand_start+1, ... so every model gets its own random_state
    for seed, num_topics in enumerate(n_topics, start=rand_start):

        # create model
        t1 = time.time()
        lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics,
                                              topic_word_prior=0.1, n_jobs=39, random_state = seed)
        # fit() is sufficient: the document-topic matrix that fit_transform() additionally
        # computes was never used here
        lda_model.fit(doc_term_matrix)
        t2 = time.time()
        print(f"  Model time: {t2-t1}")

        # compute perplexity
        perplexity_values.append(lda_model.bound_)

        # create list of topics (top 10 words per topic)
        topics = list_topics(lda_model, vectorizer, top_n=10)

        # calculate coherence
        t1 = time.time()
        cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, texts=docs,
                            coherence='c_v', processes=10) #window_size=500 )
        coherence_values.append(cm.get_coherence())
        t2 = time.time()
        print(f"  Coherence time: {t2-t1}")

        # output completion message
        print('Number of topics =', num_topics, "complete.")

    return perplexity_values, coherence_values

In [14]:
# code copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

# --- experiment settings ---------------------------------------------------
n_topics = [16] #range(4,101,4)
num_runs = 1
batch = 9   # offset added to the run index when labelling result columns

# one result column per run, labelled with its global iteration number
col_names = [f"iteration {i+batch}" for i in range(num_runs)]
lda_p = pd.DataFrame(index=n_topics, columns=col_names)
lda_c = pd.DataFrame(index=n_topics, columns=col_names)

for i, col_name in enumerate(col_names):

    print(f"Iteration {i}")

    # run models
    # NOTE(review): the fixed seed 228 replaces the commented-out formula.  It equals
    # (0 + 9) * 25 + 3, i.e. presumably the seed the 16-topic model received in
    # iteration 9 of the full range(4,101,4) sweep -- TODO confirm.
    p, c = lda_metrics(
        doc_term_matrix=doc_term_matrix,
        n_topics=n_topics,
        vectorizer=vectorizer,
        corpus=corpus,
        id2word=id2word,
        docs=docs,
        rand_start=228,  #(i+batch)*len(n_topics))
    )

    # save results
    lda_p[col_name] = p
    lda_c[col_name] = c
       

Iteration 0
  Model time: 333.9065086841583
  Coherence time: 144.03825736045837
Number of topics = 16 complete.


In [15]:
lda_p

Unnamed: 0,iteration 9
16,2752.512653


In [16]:
lda_c

Unnamed: 0,iteration 9
16,0.574763


In [None]:
# save results 

#lda_p.to_pickle("./nsf_lda_p.pkl")
#lda_c.to_pickle("./nsf_lda_c.pkl")

# persist the perplexity and coherence tables next to the notebook
for frame, target in ((lda_p, "./lda_p.pkl"), (lda_c, "./lda_c.pkl")):
    frame.to_pickle(target)

### NMF

In [None]:
# used for NMF and LSA

# TF-IDF weighted document-term matrix.  Case is left untouched, terms must occur
# in at least 3 documents and in at most 40% of them, and the vocabulary is capped
# at half the number of documents.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=False,
    min_df=3,
    max_df=0.4,
    max_features=int(len(docs)/2),
)
tf_idf = tfidf_vectorizer.fit_transform(text)

In [None]:
# function adapted from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

def nmf_metrics(doc_term_matrix, n_topics, vectorizer, corpus, id2word, docs, rand_start):
    """
    Fit one NMF model per requested topic count and score each with c_v coherence.

    Parameters:
    ----------
    doc_term_matrix : matrix the NMF models are fitted on
    n_topics : list of number of topics
    vectorizer : fitted vectorizer, handed to list_topics to turn columns into words
    corpus, id2word, docs : forwarded to Gensim's CoherenceModel as
                            corpus, dictionary and texts respectively
    rand_start : random_state of the first model; each following model uses the next integer

    Returns:
    -------
    coherence_values : c_v topic coherence, one value per entry of n_topics
    """
    coherence_values = []

    for offset, num_topics in enumerate(n_topics):

        # fit the factorisation (timed); the seed advances by one per model
        t1 = time.time()
        nmf_model = NMF(n_components=num_topics, random_state=rand_start + offset)
        nmf_model.fit_transform(doc_term_matrix)
        t2 = time.time()
        print(f"  Model time: {t2-t1}")

        # top 10 words of every topic
        top_words = list_topics(nmf_model, vectorizer, top_n=10)

        # score the topics (timed)
        t1 = time.time()
        scorer = CoherenceModel(
            topics=top_words,
            corpus=corpus,
            dictionary=id2word,
            texts=docs,
            coherence='c_v',
            processes=10,
        )  #window_size=500 )
        coherence_values.append(scorer.get_coherence())
        t2 = time.time()
        print(f"  Coherence time: {t2-t1}")

        # progress message
        print('Number of topics =', num_topics, "complete.")

    return coherence_values

In [17]:
# code copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

# --- experiment settings ---------------------------------------------------
n_topics = [16] #range(4,101,4)
num_runs = 1
batch = 2   # offset added to the run index when labelling result columns

# one result column per run, labelled with its global iteration number
col_names = [f"iteration {i+batch}" for i in range(num_runs)]
nmf_c = pd.DataFrame(index=n_topics, columns=col_names)

for i, col_name in enumerate(col_names):

    print(f"Iteration {i}")

    # run models; each run starts from its own block of len(n_topics) seeds
    c = nmf_metrics(
        doc_term_matrix=tf_idf,
        n_topics=n_topics,
        vectorizer=tfidf_vectorizer,
        corpus=corpus,
        id2word=id2word,
        docs=docs,
        rand_start=(i+batch)*len(n_topics),
    )

    # save results
    nmf_c[col_name] = c
       

Iteration 0


NameError: name 'nmf_metrics' is not defined

In [None]:
# save results 

#nmf_c.to_pickle("./nsf_nmf_c67.pkl")

# write the NMF coherence table to a pickle file next to the notebook
nmf_c.to_pickle("./nmf_c.pkl")

### LSA

We use the same tf_idf created for NMF

In [None]:
# function adapted from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

def lsa_metrics(doc_term_matrix, n_topics, vectorizer, corpus, id2word, docs, rand_start):
    """
    Fit one LSA (TruncatedSVD) model per requested topic count and score each with c_v coherence.

    Parameters:
    ----------
    doc_term_matrix : matrix the TruncatedSVD models are fitted on
    n_topics : list of number of topics
    vectorizer : fitted vectorizer, handed to list_topics to turn columns into words
    corpus, id2word, docs : forwarded to Gensim's CoherenceModel as
                            corpus, dictionary and texts respectively
    rand_start : random_state of the first model; each following model uses the next integer

    Returns:
    -------
    coherence_values : c_v topic coherence, one value per entry of n_topics
    """
    coherence_values = []
    seed = rand_start

    for num_topics in n_topics:

        # fit the decomposition (timed)
        t1 = time.time()
        lsa_model = TruncatedSVD(n_components=num_topics, random_state=seed)
        lsa_model.fit_transform(doc_term_matrix)
        t2 = time.time()
        print(f"  Model time: {t2-t1}")

        # top 10 words of every topic
        top_words = list_topics(lsa_model, vectorizer, top_n=10)

        # score the topics (timed)
        t1 = time.time()
        scorer = CoherenceModel(
            topics=top_words,
            corpus=corpus,
            dictionary=id2word,
            texts=docs,
            coherence='c_v',
            processes=10,
        )  #window_size=500 )
        coherence_values.append(scorer.get_coherence())
        t2 = time.time()
        print(f"  Coherence time: {t2-t1}")

        # next model gets the next seed; then report progress
        seed += 1
        print('Number of topics =', num_topics, "complete.")

    return coherence_values

In [None]:
# code copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

# --- experiment settings ---------------------------------------------------
n_topics = range(4,101,4)
num_runs = 3
batch = 4   # offset added to the run index when labelling result columns

# one result column per run, labelled with its global iteration number
col_names = [f"iteration {i+batch}" for i in range(num_runs)]
lsa_c = pd.DataFrame(index=n_topics, columns=col_names)

for i, col_name in enumerate(col_names):

    print(f"Iteration {i}")

    # run models; each run starts from its own block of len(n_topics) seeds
    c = lsa_metrics(
        doc_term_matrix=tf_idf,
        n_topics=n_topics,
        vectorizer=tfidf_vectorizer,
        corpus=corpus,
        id2word=id2word,
        docs=docs,
        rand_start=(i+batch)*len(n_topics),
    )

    # save results
    lsa_c[col_name] = c
       

In [None]:
# save results 

#lsa_c.to_pickle("./nsf_lsa_c.pkl")

# write the LSA coherence table to a pickle file next to the notebook
# (the "456" suffix presumably refers to iterations 4-6, i.e. batch = 4 with num_runs = 3 -- TODO confirm)
lsa_c.to_pickle("./lsa_c456.pkl")

### Plot Results

In [None]:
# The plotting code below is disabled: it sits inside a bare string literal, so nothing is drawn.
# NOTE(review): both titles say "NSF data" while the data-ingestion cell currently loads the
# entire dataset -- confirm the titles before re-enabling.
'''

# plot results

plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
plt.plot(n_topics, lda_c)
plt.title("NSF data")
plt.xlabel("Num Topics")
plt.ylabel("c_v Topic Coherence")
#plt.legend(("coherence"), loc='best')

plt.subplot(1,2,2)
plt.plot(n_topics, lda_p)
plt.title("NSF data")
plt.xlabel("Num Topics")
plt.ylabel("Perplexity")
#plt.legend(("perplexity"), loc='best')

plt.subplots_adjust(wspace=0.3)
#plt.tight_layout()
plt.show()
'''

### Combine results

In [8]:
# per-batch LDA coherence tables
coherence_files = (
    "./lda_data_ALL/all_lda_c0.pkl",
    "./lda_data_ALL/all_lda_c1-3.pkl",
    "./lda_data_ALL/all_lda_c4-6.pkl",
    "./lda_data_ALL/all_lda_c7-9.pkl",
)
c0, c1, c2, c3 = map(pd.read_pickle, coherence_files)

# per-batch LDA perplexity tables
perplexity_files = (
    "./lda_data_ALL/all_lda_p0.pkl",
    "./lda_data_ALL/all_lda_p1-3.pkl",
    "./lda_data_ALL/all_lda_p4-6.pkl",
    "./lda_data_ALL/all_lda_p7-9.pkl",
)
p0, p1, p2, p3 = map(pd.read_pickle, perplexity_files)


In [None]:
# rename columns if necessary

# NOTE(review): df1, df23, df45, df67 and df89 are not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel (it carries no execution count).
# It relabels "iteration 0"/"iteration 1" columns to their global iteration numbers;
# presumably a leftover from an earlier batching scheme -- confirm before running or remove.
df1 = df1.rename(columns={"iteration 0": "iteration 1"})
df23 = df23.rename(columns={"iteration 0": "iteration 2", "iteration 1": "iteration 3"})
df45 = df45.rename(columns={"iteration 0": "iteration 4", "iteration 1": "iteration 5"})
df67 = df67.rename(columns={"iteration 0": "iteration 6", "iteration 1": "iteration 7"})
df89 = df89.rename(columns={"iteration 0": "iteration 8", "iteration 1": "iteration 9"})

In [9]:
# place the per-batch coherence tables side by side (axis=1 concatenates along columns)
lda_coherence = pd.concat([c0, c1, c2, c3], axis=1)

In [10]:
lda_coherence

Unnamed: 0,iteration 0,iteration 1,iteration 2,iteration 3,iteration 4,iteration 5,iteration 6,iteration 7,iteration 8,iteration 9
4,0.525502,0.516735,0.484837,0.521476,0.471487,0.490747,0.515226,0.516282,0.525464,0.517342
8,0.572111,0.526925,0.558833,0.522835,0.550526,0.559538,0.572227,0.5601,0.555319,0.576492
12,0.584261,0.578889,0.574052,0.573472,0.565584,0.525675,0.564939,0.585593,0.55457,0.567584
16,0.589525,0.580833,0.589836,0.586698,0.567065,0.581224,0.562805,0.592358,0.564412,0.574763
20,0.567955,0.576958,0.565704,0.576695,0.583724,0.584184,0.590186,0.580329,0.592885,0.561338
24,0.605736,0.590307,0.585401,0.56952,0.599309,0.589748,0.577234,0.592363,0.583009,0.571619
28,0.583631,0.619945,0.601776,0.58461,0.581604,0.589834,0.610876,0.58578,0.583893,0.591035
32,0.597807,0.597357,0.607031,0.572299,0.597684,0.609442,0.596639,0.591546,0.591964,0.600725
36,0.588713,0.599887,0.619905,0.612457,0.58515,0.607189,0.593386,0.58321,0.591024,0.603566
40,0.583883,0.595965,0.588807,0.611395,0.604895,0.587521,0.609423,0.60001,0.595055,0.596664


In [13]:
# write the combined coherence table to a pickle file next to the notebook
lda_coherence.to_pickle("./lda_all_coherence.pkl")

In [11]:
# place the per-batch perplexity tables side by side (axis=1 concatenates along columns)
lda_perplexity = pd.concat([p0, p1, p2, p3], axis=1)

In [12]:
lda_perplexity

Unnamed: 0,iteration 0,iteration 1,iteration 2,iteration 3,iteration 4,iteration 5,iteration 6,iteration 7,iteration 8,iteration 9
4,3411.850617,3470.474742,3417.27604,3498.457741,3458.96747,3440.116389,3456.834542,3456.981969,3411.192599,3452.274672
8,3086.98589,3095.768809,3066.296019,3125.895783,3115.637292,3039.088288,3074.477114,3105.269439,3030.043877,3021.558313
12,2863.172934,2822.892377,2844.60364,2884.786139,2894.029123,2881.27611,2884.880764,2868.998633,2885.188989,2888.700197
16,2761.281565,2789.988357,2700.742524,2776.55334,2743.972965,2761.983738,2764.94626,2751.663137,2761.217982,2752.512653
20,2698.868888,2686.914208,2696.47579,2636.333342,2641.749492,2682.264809,2634.079236,2684.279605,2655.40277,2724.368298
24,2615.713814,2598.526983,2619.954561,2604.806371,2577.051923,2590.714647,2589.597093,2604.317709,2598.437246,2614.512329
28,2573.387007,2556.035496,2534.41152,2560.242781,2565.8251,2574.442458,2548.842372,2572.175109,2574.05528,2577.835906
32,2508.584065,2546.29923,2504.199384,2539.052117,2541.041433,2493.438208,2543.361929,2508.029562,2511.296978,2530.016826
36,2501.122269,2476.239002,2463.482486,2494.876162,2499.015256,2485.582394,2480.352125,2500.854008,2485.841674,2484.224644
40,2465.073743,2467.997133,2476.197199,2484.433925,2460.838535,2458.959215,2485.054891,2470.834881,2458.305425,2468.237953


In [15]:
# write the combined perplexity table to a pickle file next to the notebook
lda_perplexity.to_pickle("./lda_all_perplexity.pkl")