# Tune Number of Topics for each model - LDA, NMF, LSA

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel

unable to import 'smart_open.gcs', disabling that module


### Data Ingestion

In [2]:
# import NSF data
import os
os.chdir('/home/sc2pg/src/prnd/publicrd/data/prd/RND Topic Modelling') 
f = open('nsf_stanford_lemma.sav', 'rb')

# import entire dataset
#f = open('/project/biocomplexity/sdad/projects_data/ncses/prd/RND Topic Modelling/lda_data_stanford_lemma.sav', 'rb')

[corpus, id2word, docs] = pickle.load(f)
f.close()

# corpus - word frequency in docs
# id2word - dictionary
# docs - lemmatized abstracts

In [3]:
# input needed for LDA, NMF and LSA (all from Scikit-Learn) is one string per document (not a list of strings)

text = []
i=0
for doc in docs:
    text.append(" ".join(doc))

### Functions needed for all models

In [4]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
        

In [5]:
# Function to format topics as a "list of list of strings".
# Needed for topic coherence function in Gensim

# function modified from https://nlpforhackers.io/topic-modeling/

def list_topics(model, vectorizer, top_n=10):

    #input. top_n: how many words to list per topic.  If -1, then list all words.
       
    topic_words = []
    
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        #print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        if top_n == -1:   
            topic_words.append([vectorizer.get_feature_names()[i] for i in topic.argsort()[::-1]])
        else:
            topic_words.append([vectorizer.get_feature_names()[i] for i in topic.argsort()[:-top_n - 1:-1]])
        
    return topic_words

### LDA

In [8]:
# create document-term matrix

vectorizer = CountVectorizer(max_df=0.4, min_df=3, lowercase=False, max_features=int(len(docs)/2))
doc_term_matrix = vectorizer.fit_transform(text)

In [None]:
# function adapted from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

def lda_metrics(doc_term_matrix, n_topics, vectorizer, corpus, id2word, docs, rand_start):
    """
    Compute perplexity and c_v topic coherence for various number of topics

    Parameters:
    ----------
    doc_term_matrix
    n_topics : list of number of topics

    Returns:
    -------
    coherence_values : c_v topic coherence values corresponding to the LDA model with respective number of topics
    """
    
    perplexity_values = []
    coherence_values = []
    
    i = rand_start
    for num_topics in n_topics:
        
        # create model
        lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                              topic_word_prior=0.1, n_jobs=9, random_state = i*num_topics)
        lda_model.fit_transform(doc_term_matrix)
        
        # compute perplexity
        perplexity_values.append(lda_model.bound_)
        
        # create list of topics
        topics = list_topics(lda_model, vectorizer, top_n=10)
        
        # calculate coherence
        cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, texts=docs, 
                            coherence='c_v', processes=10) #window_size=500 ) 
        coherence_values.append(cm.get_coherence())
        
        # output completion message
        i = i+1
        print('Number of topics =', num_topics, "complete.")

    return perplexity_values, coherence_values

In [3]:
# function adapted from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
topic_VOIs=[52]
scaling_factor=len(range(4,101,4))
import os
os.chdir('/home/sc2pg/src/prnd/publicrd/data/prd/RND Topic Modelling') 

# function slightly modified from https://nlpforhackers.io/topic-modeling/

def generate_run_dataframe(lda_model, vectorizer, top_n=10, model_number=-1):
    norm=lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]
    new_series=[['Word {}'.format(i) for i in range(top_n)]]
    labels=['Word']
    labels.extend(['Topic {}'.format(y) for y in range(len(norm))])
    for idx, topic in enumerate(norm):  # loop through each row of H.  idx = row index.  topic = actual row
        #print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], '{0:.4f}'.format(topic[i]))  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        new_series.append(print_list)
    if model_number!=-1:
        new_series.append([model_number]*top_n)
        labels.append('Model Run')
    return pd.DataFrame(new_series,index=labels).T 

def model_run_generation_comparison(doc_term_matrix,n_runs, topic_numbers,top_n_words, vectorizer, corpus, id2word, docs):
    for num_topics in topic_numbers:
        model_outputs=[]
        print('Num Topic: {}'.format(num_topics))
        for i in range(n_runs):
            print(i)
            # create model
            lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics,
                                                  topic_word_prior=0.1, n_jobs=39, random_state = i*scaling_factor)
            lda_model.fit_transform(doc_term_matrix)
            model_outputs.append(generate_run_dataframe(lda_model,vectorizer, top_n=top_n_words,model_number=i))

        pd.concat(model_outputs,ignore_index=True).to_csv('10RunsModelNSF{}topics.csv'.format(num_topics))
        
topic_VOIs=[52]
scaling_fator=len(range(4,101,4))
model_run_generation_comparison(doc_term_matrix,n_runs=10,topic_numbers=topic_VOIs,top_n_words=30,vectorizer=vectorizer,
                               corpus=corpus, id2word=id2word,docs=docs)

NameError: name 'doc_term_matrix' is not defined

In [None]:
topic_VOIs=[40,50,64]
scaling_fator=len(range(4,101,4))
model_run_generation_comparison(doc_term_matrix,n_runs=10,topic_numbers=topic_VOIs,top_n_words=30,vectorizer=vectorizer,
                               corpus=corpus, id2word=id2word,docs=docs)

Num Topic: 40
0
1
2
3
4
5
6
7
8
9
Num Topic: 50
0
1
2


In [31]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics_normal(model, vectorizer, top_n=10):
    norm=lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)

In [32]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def generate_run_dataframe(lda_model, vectorizer, top_n=10, model_number=-1):
    norm=lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]
    new_series=[['Word {}'.format(i) for i in range(top_n)]]
    labels=['Word']
    labels.extend(['Topic {}'.format(y) for y in range(len(norm))])
    for idx, topic in enumerate(norm):  # loop through each row of H.  idx = row index.  topic = actual row
        #print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], '{0:.4f}'.format(topic[i]))  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        new_series.append(print_list)
    if model_number!=-1:
        new_series.append([model_number]*top_n)
        labels.append('Model Run')
    return pd.DataFrame(new_series,index=labels).T

In [None]:
# code copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

n_topics = [64]
num_runs = 1

col_names = [f"iteration {i}" for i in range(num_runs)]
lda_p = pd.DataFrame(index = n_topics, columns = col_names)
lda_c = pd.DataFrame(index = n_topics, columns = col_names)

for i in range(num_runs):
    
    print(f"Iteration {i}")
    
    # run models
    [p, c] = lda_metrics(doc_term_matrix=doc_term_matrix, n_topics=n_topics, vectorizer=vectorizer, 
                         corpus=corpus, id2word=id2word, docs=docs, rand_start = i*len(n_topics))
    
    # save results
    lda_p[f"iteration {i}"] = p
    lda_c[f"iteration {i}"] = c
       

In [None]:
# code copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

n_topics = range(4,101,4) 
num_runs = 10

col_names = [f"iteration {i}" for i in range(num_runs)]
lda_p = pd.DataFrame(index = n_topics, columns = col_names)
lda_c = pd.DataFrame(index = n_topics, columns = col_names)
lda_p=pd.read_pickle('nsf_lda_p_intermittent.pkl')
lda_c=pd.read_pickle('nsf_lda_c_intermittent.pkl')
for i in range(1,num_runs):
    
    print(f"Iteration {i}")
    
    # run models
    [p, c] = lda_metrics(doc_term_matrix=doc_term_matrix, n_topics=n_topics, vectorizer=vectorizer, 
                         corpus=corpus, id2word=id2word, docs=docs, rand_start = i*len(n_topics))
    
    # save results
    lda_p[f"iteration {i}"] = p
    lda_c[f"iteration {i}"] = c
       # save results 

    lda_p.to_pickle("./nsf_lda_p_intermittent.pkl")
    lda_c.to_pickle("./nsf_lda_c_intermittent.pkl")

Iteration 1
Number of topics = 4 complete.
Number of topics = 8 complete.
Number of topics = 12 complete.
Number of topics = 16 complete.
Number of topics = 20 complete.
Number of topics = 24 complete.
Number of topics = 28 complete.
Number of topics = 32 complete.
Number of topics = 36 complete.
Number of topics = 40 complete.
Number of topics = 44 complete.


In [None]:
# save results 

lda_p.to_pickle("./nsf_lda_p.pkl")
lda_c.to_pickle("./nsf_lda_c.pkl")

In [110]:
len(range(4,101,4))

25

### NMF

In [6]:
# used for NMF and LSA

tfidf_vectorizer = TfidfVectorizer(max_df=0.4, min_df=3, lowercase=False, max_features=int(len(docs)/2))
tf_idf = tfidf_vectorizer.fit_transform(text)

In [7]:
# function adapted from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

def nmf_metrics(doc_term_matrix, n_topics, vectorizer, corpus, id2word, docs, rand_start):
    """
    Compute c_v topic coherence for various number of topics

    Parameters:
    ----------
    tf_idf
    n_topics : list of number of topics

    Returns:
    -------
    coherence_values : c_v topic coherence values corresponding to the NMF model with respective number of topics
    """
    
    coherence_values = []
    
    i = rand_start
    for num_topics in n_topics:

        # create model
        nmf_model = NMF(n_components=num_topics, random_state = i)
        nmf_model.fit_transform(doc_term_matrix)
        
        # create list of topics
        topics = list_topics(nmf_model, vectorizer, top_n=10)
        
        # calculate coherence
        cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, texts=docs, 
                            coherence='c_v', processes=20) #window_size=500 ) 
        coherence_values.append(cm.get_coherence())
        
        # output completion message
        i = i+1
        print('Number of topics =', num_topics, "complete.")

    return coherence_values

In [None]:
# code copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

n_topics = range(4,101,4) 
num_runs = 10

col_names = [f"iteration {i}" for i in range(num_runs)]
nmf_c = pd.DataFrame(index = n_topics, columns = col_names)
nmf_c=pd.read_pickle('nsf_nmf_c_intermittent.pkl')
for i in range(3,num_runs):
    
    print(f"Iteration {i}")
    
    # run models
    c = nmf_metrics(doc_term_matrix=tf_idf, n_topics=n_topics, vectorizer=tfidf_vectorizer, 
                         corpus=corpus, id2word=id2word, docs=docs, rand_start = i*len(n_topics))
    
    # save results
    nmf_c[f"iteration {i}"] = c
    nmf_c.to_pickle("./nsf_nmf_c_intermittent.pkl")   

In [None]:
# save results 

nmf_c.to_pickle("./nsf_nmf_c.pkl")

### LSA

We use the same tf_idf created for NMF

In [None]:
# function adapted from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

def lsa_metrics(doc_term_matrix, n_topics, vectorizer, corpus, id2word, docs, rand_start):
    """
    Compute c_v topic coherence for various number of topics

    Parameters:
    ----------
    tf_idf
    n_topics : list of number of topics

    Returns:
    -------
    coherence_values : c_v topic coherence values corresponding to the LSA model with respective number of topics
    """
    
    coherence_values = []
    
    i = rand_start
    for num_topics in n_topics:

        # create model
        lsa_model = TruncatedSVD(n_components=num_topics, random_state = i)
        lsa_model.fit_transform(doc_term_matrix)
        
        # create list of topics
        topics = list_topics(lsa_model, vectorizer, top_n=10)
        
        # calculate coherence
        cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, texts=docs, 
                            coherence='c_v', processes=20) #window_size=500 ) 
        coherence_values.append(cm.get_coherence())
        
        # output completion message
        i = i+1
        print('Number of topics =', num_topics, "complete.")

    return coherence_values

In [None]:
# code copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

n_topics = range(4,101,4) 
num_runs = 10

col_names = [f"iteration {i}" for i in range(num_runs)]
lsa_c = pd.DataFrame(index = n_topics, columns = col_names)

for i in range(num_runs):
    
    print(f"Iteration {i}")
    
    # run models
    c = lsa_metrics(doc_term_matrix=tf_idf, n_topics=n_topics, vectorizer=tfidf_vectorizer, 
                         corpus=corpus, id2word=id2word, docs=docs, rand_start = i*len(n_topics))
    
    # save results
    lsa_c[f"iteration {i}"] = c
       

In [None]:
# save results 

lsa_c.to_pickle("./nsf_lsa_c.pkl")

### Plot Results

In [None]:
# code copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

n_topics = range(4,101,4)
num_runs = 1

col_names = [f"iteration {i}" for i in range(num_runs)]
lda_p = pd.DataFrame(index = n_topics, columns = col_names)
lda_c = pd.DataFrame(index = n_topics, columns = col_names)

for i in range(num_runs):
    
    print(f"Iteration {i}")
    
    # run models
    [p, c] = lda_metrics(doc_term_matrix=doc_term_matrix, n_topics=52, vectorizer=vectorizer, 
                         corpus=corpus, id2word=id2word, docs=docs, rand_start = i*len(n_topics))
    
    # save results
    lda_p[f"iteration {i}"] = p
    lda_c[f"iteration {i}"] = c
       

In [19]:
filehandler = open('LDA52model.pkl',"wb")
pickle.dump(lda_model,filehandler)
filehandler.close()

In [20]:
pd.read_pickle('LDA52model.pkl')

LatentDirichletAllocation(batch_size=128, doc_topic_prior=0.019230769230769232,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=52, n_jobs=39,
                          perp_tol=0.1, random_state=75, topic_word_prior=0.1,
                          total_samples=1000000.0, verbose=0)

In [8]:
# function adapted from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

def nmf_metrics(doc_term_matrix, n_topics, vectorizer, corpus, id2word, docs, rand_start):
    """
    Compute c_v topic coherence for various number of topics

    Parameters:
    ----------
    tf_idf
    n_topics : list of number of topics

    Returns:
    -------
    coherence_values : c_v topic coherence values corresponding to the NMF model with respective number of topics
    """
    
    coherence_values = []
    
    i = rand_start
    for num_topics in n_topics:

        # create model
        nmf_model = NMF(n_components=num_topics, random_state = i)
        return nmf_model.fit_transform(doc_term_matrix)

        

In [10]:
# code copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

n_topics = range(4,101,4) 
num_runs = 10

for i in range(4,5):
    
    print(f"Iteration {i}")
    
    # run models
    c = nmf_metrics(doc_term_matrix=tf_idf, n_topics=[44], vectorizer=tfidf_vectorizer, 
                         corpus=corpus, id2word=id2word, docs=docs, rand_start = i*len(n_topics))
    
    filehandle=('NMFChosenModelpkl.pkl','wb')
    pickle.dump(c,filehandle)
    filehandle.close()

Iteration 4


TypeError: file must have a 'write' attribute