# Tune Number of Topics for each model - LDA, NMF

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel

import feather as ft

### Data Ingestion

In [7]:
#df = pd.read_pickle("../../data/final/final_dataset_7-20.pkl")

In [2]:
# data needed for coherence calculation

# import entire dataset
# NOTE: pickle.load can execute arbitrary code from the file; only open
# artifacts that were produced by a trusted run of this project.
# The context manager closes the file even if unpickling raises.
with open('coherence_vars20.sav', 'rb') as f:
    corpus, id2word, docs = pickle.load(f)

# corpus - word frequency in docs
# id2word - dictionary
# docs - df["final_frqwds_removed"]

In [3]:
# input needed for LDA, NMF (all from Scikit-Learn) is one string per document
# (not a list of strings), so join each document's tokens with single spaces
text = [" ".join(abstract) for abstract in docs]

### Functions needed for all models

In [4]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    """
    Print the top_n highest-weighted words of every topic in a fitted model.

    Parameters:
    ----------
    model : fitted model exposing components_ (one row of word weights per topic)
    vectorizer : fitted vectorizer whose feature names label the columns of components_
    top_n : number of (word, weight) pairs printed per topic
    """
    # Fetch the vocabulary once instead of rebuilding it for every printed word.
    # get_feature_names_out replaces get_feature_names, which newer scikit-learn
    # releases no longer provide; fall back for older versions.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):  # idx = row index, topic = row of weights
        print("\nTopic %d:" % (idx))
        # argsort is ascending, so the reversed tail slice yields the top_n
        # largest weights in descending order
        for i in topic.argsort()[:-top_n - 1:-1]:
            print((feature_names[i], topic[i]))
        

In [5]:
# Function to format topics as a "list of list of strings".
# Needed for topic coherence function in Gensim

# function modified from https://nlpforhackers.io/topic-modeling/

def list_topics(model, vectorizer, top_n=10):
    """
    Return the top words of every topic as a list of lists of strings.

    Parameters:
    ----------
    model : fitted model exposing components_ (one row of word weights per topic)
    vectorizer : fitted vectorizer whose feature names label the columns of components_
    top_n : how many words to list per topic.  If -1, then list all words.

    Returns:
    -------
    topic_words : one list per topic, words ordered from highest to lowest weight
    """
    # Fetch the vocabulary once instead of rebuilding it for every listed word.
    # get_feature_names_out replaces get_feature_names, which newer scikit-learn
    # releases no longer provide; fall back for older versions.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    topic_words = []

    for topic in model.components_:  # each row holds one topic's word weights
        if top_n == -1:
            ranked = topic.argsort()[::-1]              # every word, heaviest first
        else:
            ranked = topic.argsort()[:-top_n - 1:-1]    # top_n heaviest words
        topic_words.append([feature_names[i] for i in ranked])

    return topic_words

## LDA

In [6]:
# build the count-based document-term matrix used by the LDA cells;
# the vocabulary is capped at half the number of documents
vectorizer = CountVectorizer(
    max_df=0.6,
    min_df=20,
    lowercase=False,
    max_features=len(docs) // 2,
)
doc_term_matrix = vectorizer.fit_transform(text)

In [7]:
# create model

num_topics = 150

# perf_counter is a monotonic clock intended for measuring elapsed intervals
t1 = time.perf_counter()
lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=39, random_state = 257)
# Only the fitted model is used afterwards; fit() leaves it in the same state
# as fit_transform() without the extra transform pass whose result was discarded.
lda_model.fit(doc_term_matrix)
t2 = time.perf_counter()
print(f"  Model time: {t2-t1}")

  Model time: 918.9209473133087


In [8]:
# score the fitted LDA model with gensim's c_v topic coherence

# top 10 words per topic, in the list-of-lists form CoherenceModel takes
topics = list_topics(lda_model, vectorizer, top_n=10)

t1 = time.time()
cm = CoherenceModel(
    topics=topics,
    corpus=corpus,
    dictionary=id2word,
    texts=docs,
    coherence='c_v',
    processes=10,  # window_size=500 was tried here and left disabled
)
print(cm.get_coherence())
t2 = time.time()
print(f"  Coherence time: {t2-t1}")

0.6503025804114317
  Coherence time: 512.1495792865753


In [9]:
# show the 10 heaviest words of each LDA topic
print_topics(lda_model, vectorizer, top_n=10)


Topic 0:
('association', 28044.269447833365)
('cohort', 27846.03772960392)
('case', 25072.68178782223)
('population', 19162.900330439512)
('genetic', 14299.45668886145)
('controls', 9961.990512228133)
('collect', 7128.875819180357)
('wide', 4986.461345094191)
('african_americans', 4609.112945288659)
('individual', 4491.615494874667)

Topic 1:
('specimen', 13531.38033171047)
('laboratory', 13336.74000031109)
('assay', 9773.27385525988)
('expertise', 7298.816155390257)
('analytical', 5612.8398207326845)
('collection', 5526.333051213283)
('facility', 4703.83265918545)
('biological', 4345.487753480468)
('storage', 4179.950933663459)
('molecular', 3967.329083470286)

Topic 2:
('oxidative_stress', 16746.74702310819)
('ros', 14271.12068739558)
('macrophage', 8381.20098136607)
('redox', 7180.926566937739)
('antioxidant', 6883.768788444947)
('cell', 6275.212660444819)
('oxidative', 5299.845911489784)
('damage', 5195.565610288813)
('reactive_oxygen_species', 5057.931931752499)
('production', 49

**The code below tunes the model by number of topics: it fits one model per candidate value, and we choose the model with the highest topic coherence as the best.**  

There is another metric available for LDA - perplexity.  We track this as well and look for the lowest perplexity.

In [7]:
# function adapted from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

def lda_metrics(doc_term_matrix, n_topics, vectorizer, corpus, id2word, docs, rand_start):
    """
    Compute perplexity and c_v topic coherence for various number of topics

    Parameters:
    ----------
    doc_term_matrix : document-term matrix each LDA model is fit on
    n_topics : list of number of topics
    vectorizer : fitted vectorizer used to map term indices of the model to words
    corpus, id2word, docs : passed to gensim's CoherenceModel as corpus, dictionary and texts
    rand_start : integer random_state of the first model; each following model uses the previous seed + 1

    Returns:
    -------
    perplexity_values : bound_ of each fitted LDA model (scikit-learn documents this as the
                        final perplexity score on the training set)
    coherence_values : c_v topic coherence values corresponding to the LDA model with respective number of topics
    """

    perplexity_values = []
    coherence_values = []

    # enumerate supplies the per-model seed: rand_start, rand_start + 1, ...
    for seed, num_topics in enumerate(n_topics, start=rand_start):

        # create model
        t1 = time.time()
        lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                              topic_word_prior=0.1, n_jobs=39, random_state = seed)
        # only the fitted model is needed, so skip the transform pass that
        # fit_transform would run and whose output was thrown away
        lda_model.fit(doc_term_matrix)
        t2 = time.time()
        print(f"  Model time: {t2-t1}")

        # record perplexity
        perplexity_values.append(lda_model.bound_)

        # create list of topics
        topics = list_topics(lda_model, vectorizer, top_n=10)

        # calculate coherence
        t1 = time.time()
        cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, texts=docs, 
                            coherence='c_v', processes=10) #window_size=500 ) 
        coherence_values.append(cm.get_coherence())
        t2 = time.time()
        print(f"  Coherence time: {t2-t1}")

        # output completion message
        print('Number of topics =', num_topics, "complete.")

    return perplexity_values, coherence_values

In [None]:
# code copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

# candidate numbers of topics and how many times the sweep is repeated
n_topics = range(105, 151, 5)
num_runs = 1

# offset added to the run index when labelling result columns
batch = 0

# result tables: one row per candidate number of topics, one column per run
col_names = [f"iteration {i+batch}" for i in range(num_runs)]
lda_p = pd.DataFrame(index=n_topics, columns=col_names)
lda_c = pd.DataFrame(index=n_topics, columns=col_names)

for i in range(num_runs):

    print(f"Iteration {i}")

    # fit and score one LDA model per candidate number of topics
    # NOTE(review): rand_start is a fixed literal, so every run reuses the same
    # seeds when num_runs > 1; earlier variants: (i+batch)*len(n_topics), 228
    p, c = lda_metrics(
        doc_term_matrix=doc_term_matrix,
        n_topics=n_topics,
        vectorizer=vectorizer,
        corpus=corpus,
        id2word=id2word,
        docs=docs,
        rand_start=248,
    )

    # store this run's perplexity and coherence values in its column
    lda_p[f"iteration {i+batch}"] = p
    lda_c[f"iteration {i+batch}"] = c
       

Iteration 0
  Model time: 669.8582649230957
  Coherence time: 369.8085501194
Number of topics = 105 complete.
  Model time: 702.052579164505
  Coherence time: 406.5375988483429
Number of topics = 110 complete.
  Model time: 715.221480846405
  Coherence time: 391.1433913707733
Number of topics = 115 complete.
  Model time: 729.5471692085266
  Coherence time: 434.751793384552
Number of topics = 120 complete.
  Model time: 811.961040019989
  Coherence time: 481.04502058029175
Number of topics = 125 complete.
  Model time: 798.8298094272614
  Coherence time: 454.10427379608154
Number of topics = 130 complete.
  Model time: 867.9877436161041
  Coherence time: 484.00293493270874
Number of topics = 135 complete.
  Model time: 857.8530185222626
  Coherence time: 483.355672121048
Number of topics = 140 complete.


In [None]:
# save results 

#lda_p.to_pickle("./lda_p2.pkl")
#lda_c.to_pickle("./lda_c2.pkl")

In [None]:
# display the perplexity table (rows: number of topics, columns: iterations)
lda_p

In [None]:
# display the coherence table (rows: number of topics, columns: iterations)
lda_c

## NMF

In [6]:
# TF-IDF weighted document-term matrix (min_df=20); used for NMF and LSA
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.6,
    min_df=20,
    lowercase=False,
    max_features=len(docs) // 2,
)
tf_idf = tfidf_vectorizer.fit_transform(text)

In [13]:
# same TF-IDF setup with min_df lowered to 10, kept for comparison with tfidf_vectorizer
tfidf_vectorizer2 = TfidfVectorizer(
    max_df=0.6,
    min_df=10,
    lowercase=False,
    max_features=len(docs) // 2,
)
tf_idf2 = tfidf_vectorizer2.fit_transform(text)

In [None]:
# using min_df = 10

# length(stop_words_) - max_df
# 1099020 - 70%, 60%, 50%, 40%, 30%; 
# 1099032 - 20%; 
# 1099098 - 10%

# num_words - max_df
# 178802 - 70%, 60%, 50%, 40%, 30%; 
# 178790 - 20%; 
# 178724 - 10%

In [None]:
# using max_df = 60%

# length(stop_words_) - min_df
# 932415 - 3, 5
# 1022336 - 7
# 1079503 - 9
# 1099020 - 10
# 1158143 - 15
# 1184195 - 20
# 1277822 - 50
# 1277744 - 10%
# 1276033 - 1%

# num_words - min_df
# 345,407 - 3, 5
# 255486 - 7
# 198319 - 9
# 178802 - 10
# 119679 - 15
# 93627 - 20
# 45363 - 50
# 78 - 10%
# 1789 - 1%

In [63]:
# sanity check: length(stop_words_) + num_words recorded above for min_df = 9
1079503+198319

1277822

In [14]:
# terms each vectorizer excluded: s for min_df=20, t for min_df=10
s = tfidf_vectorizer.stop_words_
t = tfidf_vectorizer2.stop_words_

In [15]:
# number of excluded terms for min_df=20 (s) and min_df=10 (t)
print(len(s))
print(len(t))

1184195
1099020


In [17]:
# shapes of the two TF-IDF matrices (min_df=20, then min_df=10)
print(tf_idf.shape)
print(tf_idf2.shape)

(690814, 93627)
(690814, 178802)


In [18]:
# terms excluded with min_df=20 but not with min_df=10
diff = s-t

In [19]:
# display the terms that only the min_df=20 vectorizer excluded
diff

{'exhale_air',
 '2128',
 'plate_subduct',
 'poorquality',
 'frail_nursing_home',
 'renaltubule',
 'abct',
 'researchtechnology',
 'malawi_sub__saharan',
 'ipr_deliberatelyfostered',
 'elk1',
 'ecofriendly',
 'ider',
 'mkp3',
 'crf_immunoreactivity',
 'oneflorida',
 'plunger',
 'console_upgrade',
 'kopr',
 'withlung',
 'radiobiologist',
 'alkaloidal',
 'vod',
 'ization',
 'b_snip2_biotype',
 'anit',
 'benzyl_isothiocyanate_bitc',
 'addictivedisorders',
 'arcus',
 'cori',
 'stojilkovic',
 'non_duplicative',
 'rovers',
 'yeat',
 'rockman_et_al',
 'crustal_assimilation',
 'nr3',
 'pharmacokinetic_pk_andpharmacodynamic',
 'baseddata',
 'wethen',
 'cpe_delta_n',
 'synapticdegeneration',
 'hdxms',
 'image_characteristic_core_edge',
 'polysubstance_abuse',
 'worker_hwwt',
 'hmg_coa_synthase',
 'non_hospital',
 'apertureless',
 'african_american_msmw',
 'contraband',
 'ipp_dmapp',
 'isimmediately',
 'andmorphology',
 'bankedsample',
 'poly_glycerol',
 'massachussetts',
 'positivepatient',
 'met

In [88]:
# inspect a sample of the excluded terms: last 100 entries of s as a list
# (s is displayed as a set above, so which 100 come last is presumably arbitrary)
l = list(s)
l[-100:]

['zikvoutbreak',
 'cstcsynchrony',
 'environmentmaximy',
 'error_processing',
 'outrightdeception',
 'seleniumand',
 'capsuleproduction',
 'syn_gcase',
 'uw2020',
 'itgav',
 'narrativeseveral',
 'dpysl2_crmp2',
 'ranbindingprotein',
 'bigwig',
 'assemblyfactor',
 'theaims',
 'smallab',
 'formalizingmethodology',
 'cone_mediate',
 'howtoxoplasma',
 'creutzfelt',
 'provenmore',
 'whichcombination',
 'aflj',
 'eet_hete',
 'psvchoactive',
 'extrahypothalamic_crf',
 'o146',
 'targetedit',
 'pcytokine',
 'shimon',
 'symposium_ecoas',
 'humanmilk_oligosaccharide',
 'antigenpresentingcells',
 'national_registers',
 'beh_fpp',
 'hopesupport',
 'significantlyamong',
 'examineptsd',
 'uridylyltransferase',
 'porcie',
 'humira',
 'agnb2',
 'ibmt',
 'anti_influenzamedications',
 'impairedbystanders',
 'facilitatephenotyping',
 'huberman',
 'extremeasthma',
 'understoodthrough',
 'non_lesional_skin',
 'crenarchaeota',
 'overallcobre',
 'tubercle_bacillus',
 '1_associate',
 'bechtoldt',
 'httpwwwnibi

In [7]:
# fit a single NMF model on the TF-IDF matrix and report how long it took

num_topics = 50

t1 = time.time()
nmf_model = NMF(n_components=num_topics, random_state=0)
# the returned document-topic matrix is not used; only the fitted model is kept
nmf_model.fit(tf_idf)
t2 = time.time()
print(f"  Model time: {t2-t1}")

  Model time: 1278.035801410675


In [8]:
# score the fitted NMF model with gensim's c_v topic coherence

# top 10 words per topic, in the list-of-lists form CoherenceModel takes
topics = list_topics(nmf_model, tfidf_vectorizer, top_n=10)

t1 = time.time()
cm = CoherenceModel(
    topics=topics,
    corpus=corpus,
    dictionary=id2word,
    texts=docs,
    coherence='c_v',
    processes=10,  # window_size=500 was tried here and left disabled
)
print(cm.get_coherence())
t2 = time.time()
print(f"  Coherence time: {t2-t1}")

0.7337015803513163
  Coherence time: 209.27277660369873


In [9]:
# show the 10 heaviest words of each NMF topic
print_topics(nmf_model, tfidf_vectorizer, top_n=10)


Topic 0:
('cell', 36.8096416686502)
('differentiation', 2.489230952414879)
('cellular', 1.860174124632391)
('antigen', 1.6957668131124564)
('culture', 1.6345134047686989)
('proliferation', 1.5252667022119657)
('line', 1.4483677463517175)
('cd4', 1.3897971268922436)
('human', 1.3698697061760938)
('epithelial', 1.3682470266589721)

Topic 1:
('student', 8.92491407986324)
('undergraduate', 2.0822161395106877)
('graduate', 1.9843159512875346)
('college', 1.2469174917587136)
('school', 1.062336839348691)
('faculty', 1.0505360328275617)
('course', 0.9927196349871822)
('summer', 0.9791053181548326)
('reu', 0.8681640290496767)
('engineering', 0.8455125717752031)

Topic 2:
('cancer', 8.64863194112041)
('nci', 0.5738280864074098)
('member', 0.5499753036025888)
('ovarian', 0.5048247274735793)
('pancreatic', 0.46913690423009885)
('prevention', 0.45161510259218873)
('colorectal', 0.3757554841934516)
('colon', 0.30813521226097457)
('oncology', 0.2786176951852047)
('funding', 0.25957956661884435)

To

**The code below tunes the model by number of topics: it fits one model per candidate value, and we choose the model with the highest topic coherence as the best.**

In [None]:
# function adapted from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

def nmf_metrics(doc_term_matrix, n_topics, vectorizer, corpus, id2word, docs, rand_start):
    """
    Compute c_v topic coherence for various number of topics

    Parameters:
    ----------
    doc_term_matrix : matrix each NMF model is fit on (the caller in this notebook passes the tf-idf matrix)
    n_topics : list of number of topics
    vectorizer : fitted vectorizer used to map term indices of the model to words
    corpus, id2word, docs : passed to gensim's CoherenceModel as corpus, dictionary and texts
    rand_start : integer random_state of the first model; each following model uses the previous seed + 1

    Returns:
    -------
    coherence_values : c_v topic coherence values corresponding to the NMF model with respective number of topics
    """

    coherence_values = []

    # enumerate supplies the per-model seed: rand_start, rand_start + 1, ...
    for seed, num_topics in enumerate(n_topics, start=rand_start):

        # create model
        t1 = time.time()
        nmf_model = NMF(n_components=num_topics, random_state = seed)
        # the document-topic matrix is not used here, only the fitted model
        nmf_model.fit(doc_term_matrix)
        t2 = time.time()
        print(f"  Model time: {t2-t1}")

        # create list of topics
        topics = list_topics(nmf_model, vectorizer, top_n=10)

        # calculate coherence
        t1 = time.time()
        cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, texts=docs, 
                            coherence='c_v', processes=10) #window_size=500 ) 
        coherence_values.append(cm.get_coherence())
        t2 = time.time()
        print(f"  Coherence time: {t2-t1}")

        # output completion message
        print('Number of topics =', num_topics, "complete.")

    return coherence_values

In [None]:
# code copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

# candidate numbers of topics (full sweep used previously: range(4,101,4))
n_topics = [16]
num_runs = 1

# offset added to the run index when labelling result columns and deriving seeds
batch = 0

# result table: one row per candidate number of topics, one column per run
col_names = [f"iteration {i+batch}" for i in range(num_runs)]
nmf_c = pd.DataFrame(index=n_topics, columns=col_names)

for i in range(num_runs):

    print(f"Iteration {i}")

    # fit and score one NMF model per candidate number of topics;
    # each run starts from its own block of len(n_topics) consecutive seeds
    c = nmf_metrics(
        doc_term_matrix=tf_idf,
        n_topics=n_topics,
        vectorizer=tfidf_vectorizer,
        corpus=corpus,
        id2word=id2word,
        docs=docs,
        rand_start=(i+batch)*len(n_topics),
    )

    # store this run's coherence values in its column
    nmf_c[f"iteration {i+batch}"] = c
       

In [None]:
# save results 

#nmf_c.to_pickle("./nmf_c.pkl")