# Guided LDA on NSF Data

Code references:  
1. https://github.com/ShahrzadH/Insight_Project_SHV/blob/master/notebook/Guided%20LDA_6topics-4-grams.ipynb
2. https://github.com/vi3k6i5/GuidedLDA

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

import guidedlda

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel

import pyLDAvis
import pyLDAvis.sklearn

In [2]:
# import NSF data
f = open('/project/biocomplexity/sdad/projects_data/ncses/prd/RND Topic Modelling/agency_data.sav', 'rb')

# import entire dataset
#f = open('/project/biocomplexity/sdad/projects_data/ncses/prd/RND Topic Modelling/lda_data.sav', 'rb')

[corpus, id2word, docs] = pickle.load(f)
f.close()

# corpus - word frequency in docs
# id2word - dictionary
# docs - lemmatized abstracts


In [3]:
# create document-term matrix using Scikit-Learn 

# input needed is one string per document (not a list of strings)

text = []
i=0
for doc in docs:
    text.append(" ".join(doc))


vectorizer = CountVectorizer(max_df=0.4, min_df=3, lowercase=False, max_features=int(len(docs)/2))
doc_term_matrix = vectorizer.fit_transform(text)    

In [4]:
print(doc_term_matrix.shape)
print(doc_term_matrix.sum())  # number of tokens in corpus

(116475, 58237)
19922704


In [6]:
# Normal LDA without seeding

lda_model = guidedlda.GuidedLDA(n_topics=5, n_iter=100, random_state=1, refresh=20)
lda_model.fit(doc_term_matrix)

INFO:guidedlda:n_documents: 116475
INFO:guidedlda:vocab_size: 58237
INFO:guidedlda:n_words: 19922704
INFO:guidedlda:n_topics: 5
INFO:guidedlda:n_iter: 100
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:guidedlda:<0> log likelihood: -198476508
INFO:guidedlda:<20> log likelihood: -169768732
INFO:guidedlda:<40> log likelihood: -165474868
INFO:guidedlda:<60> log likelihood: -164004994
INFO:guidedlda:<80> log likelihood: -163424244
INFO:guidedlda:<99> log likelihood: -163131701


<guidedlda.guidedlda.GuidedLDA at 0x7f859c21fd90>

In [4]:
# Function to print out topics with terms - Guided LDA follow Scikit-Learn conventions

# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of term-topic matrix.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)

In [9]:
print_topics(lda_model, vectorizer, 10)


Topic 0:
('data', 0.008129984385145448)
('model', 0.006880674480795533)
('change', 0.00548626495942648)
('community', 0.00466498791620738)
('process', 0.004465943828521035)
('social', 0.00421823962647774)
('understand', 0.004215597448322611)
('water', 0.0039971773874986575)
('impact', 0.0037087396055637987)
('work', 0.0036653638475171064)

Topic 1:
('cell', 0.009015437916728356)
('plant', 0.007089304998311383)
('specie', 0.006946759578522088)
('protein', 0.005474326528960433)
('gene', 0.005427669450248361)
('human', 0.004859740181787627)
('understand', 0.004676651369462325)
('model', 0.004440791792248887)
('biology', 0.004262207801316474)
('change', 0.004140255850751818)

Topic 2:
('data', 0.011176347167708163)
('model', 0.010922299507924794)
('problem', 0.007652968934436753)
('theory', 0.006612906575547772)
('application', 0.006513477577667039)
('network', 0.006396747058128646)
('method', 0.006071303245682021)
('design', 0.005549848523316331)
('analysis', 0.004604397017036329)
('larg

In [5]:
# Create anchors for topics 

seed_topic_list = [['pandemic', 'disease', 'virus', 'infection', 'viral', 'immune', 'pathogen']
                   ]


In [6]:
num_topics = 50

Glda_model = guidedlda.GuidedLDA(n_topics=num_topics, n_iter=100, alpha=1/num_topics, eta=0.1, 
                                 random_state=1, refresh=20)

In [7]:
tf_feature_names = vectorizer.get_feature_names()
word2id = dict((v, idx) for idx, v in enumerate(tf_feature_names))

In [13]:
word2id

{'aa': 0,
 'aaa': 1,
 'aaai': 2,
 'aaai_conference': 3,
 'aaai_doctoral_consortium': 4,
 'aaas': 5,
 'aabw': 6,
 'aac': 7,
 'aacc': 8,
 'aachen_germany': 9,
 'aacr': 10,
 'aadl': 11,
 'aae': 12,
 'aag': 13,
 'aaiw': 14,
 'aalto': 15,
 'aam': 16,
 'aamas': 17,
 'aamu': 18,
 'aanderaa': 19,
 'aao': 20,
 'aap': 21,
 'aapf': 22,
 'aapf_fellows': 23,
 'aapt': 24,
 'aar': 25,
 'aarhus_denmark': 26,
 'aarhus_university': 27,
 'aaron': 28,
 'aaronson': 29,
 'aas': 30,
 'aat': 31,
 'aau': 32,
 'aauw': 33,
 'aaws': 34,
 'ab': 35,
 'aba': 36,
 'abac': 37,
 'abaco': 38,
 'abacus': 39,
 'abalone': 40,
 'abandon': 41,
 'abandoned': 42,
 'abandoned_mine': 43,
 'abandonment': 44,
 'abate': 45,
 'abatement': 46,
 'abb': 47,
 'abbott': 48,
 'abbreviate': 49,
 'abc': 50,
 'abc_transporter': 51,
 'abc_transporters': 52,
 'abc_triblock': 53,
 'abcd': 54,
 'abd': 55,
 'abdel': 56,
 'abdomen': 57,
 'abdominal': 58,
 'abduction': 59,
 'abductive': 60,
 'abductive_reasoning': 61,
 'abe': 62,
 'abel': 63,
 'abe

In [8]:
seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[word2id[word]] = t_id

In [9]:
Glda_model.fit(doc_term_matrix, seed_topics=seed_topics, seed_confidence=0.8)

INFO:guidedlda:n_documents: 116475
INFO:guidedlda:vocab_size: 58237
INFO:guidedlda:n_words: 19922704
INFO:guidedlda:n_topics: 50
INFO:guidedlda:n_iter: 100
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:guidedlda:<0> log likelihood: -263546437
INFO:guidedlda:<20> log likelihood: -172302552
INFO:guidedlda:<40> log likelihood: -167367818
INFO:guidedlda:<60> log likelihood: -165983089
INFO:guidedlda:<80> log likelihood: -165300118
INFO:guidedlda:<99> log likelihood: -164915507


<guidedlda.guidedlda.GuidedLDA at 0x7f7d54117290>

In [15]:
print_topics(Glda_model, vectorizer, 100)


Topic 0:
('cell', 0.029794916663703205)
('gene', 0.014297343872793463)
('plant', 0.013328745573361604)
('protein', 0.011656391728918063)
('biology', 0.008384302214630063)
('molecular', 0.0076421017536541635)
('function', 0.007209596449056696)
('dna', 0.006650009338910837)
('mechanism', 0.005795677873039296)
('cellular', 0.005670731896155584)
('genetic', 0.00560452120755054)
('biological', 0.0052926902225074275)
('development', 0.005126095586662477)
('undergraduate', 0.005031051211084268)
('expression', 0.0049712480084732604)
('understand', 0.004968044265476242)
('signal', 0.004894358176544822)
('genome', 0.004852709517583584)
('process', 0.004820672087613401)
('sequence', 0.004792906314972576)
('identify', 0.004521656074558362)
('role', 0.00450670527390561)
('control', 0.004087014941296215)
('approach', 0.003998378051712043)
('response', 0.003982359336726951)
('interaction', 0.003862752931504936)
('important', 0.003825375929873056)
('model', 0.003747418183612278)
('bacteria', 0.003712

In [None]:
# doc-topic distribution -- need to adjust

doc_topic = model.transform(doc_term_matrix)
for i in range(9):
    print("top topic: {} Document: {}".format(doc_topic[i].argmax(),
                                                  ', '.join(np.array(vocab)[list(reversed(X[i,:].argsort()))[0:5]])))

In [11]:
# Function to format topics as a "list of list of strings".
# Needed for topic coherence function in Gensim

# function slightly modified from https://nlpforhackers.io/topic-modeling/

def list_topics(model, vectorizer, top_n=10):

    #input. top_n: how many words to list per topic.  If -1, then list all words.
       
    topic_words = []
    
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        #print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        if top_n == -1:   
            topic_words.append([vectorizer.get_feature_names()[i] for i in topic.argsort()[::-1]])
        else:
            topic_words.append([vectorizer.get_feature_names()[i] for i in topic.argsort()[:-top_n - 1:-1]])
        
    return topic_words

In [12]:
topics = list_topics(Glda_model, vectorizer, top_n=10)

In [13]:
cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, 
                    texts=docs, coherence='c_v', processes=10) #window_size=500 )

In [14]:
coherence = cm.get_coherence() 
print(coherence)

INFO:gensim.topic_coherence.probability_estimation:using ParallelWordOccurrenceAccumulator(processes=10, batch_size=64) to estimate probabilities from sliding windows
INFO:gensim.topic_coherence.text_analysis:1 batches submitted to accumulate stats from 64 documents (3755 virtual)
INFO:gensim.topic_coherence.text_analysis:2 batches submitted to accumulate stats from 128 documents (9645 virtual)
INFO:gensim.topic_coherence.text_analysis:3 batches submitted to accumulate stats from 192 documents (13800 virtual)
INFO:gensim.topic_coherence.text_analysis:4 batches submitted to accumulate stats from 256 documents (18870 virtual)
INFO:gensim.topic_coherence.text_analysis:5 batches submitted to accumulate stats from 320 documents (23366 virtual)
INFO:gensim.topic_coherence.text_analysis:6 batches submitted to accumulate stats from 384 documents (29174 virtual)
INFO:gensim.topic_coherence.text_analysis:7 batches submitted to accumulate stats from 448 documents (34162 virtual)
INFO:gensim.topic

0.5645480538470938


In [16]:
Glda_tc = cm.get_coherence_per_topic(with_std=True)

In [17]:
Glda_tc

[(0.7549215045123805, 0.10398905206062324),
 (0.45665844527149596, 0.09649487657780381),
 (0.5219731112740217, 0.1290153169186546),
 (0.5269226726706037, 0.21633106146605002),
 (0.7118582123410425, 0.09143509252805795),
 (0.5291958258278945, 0.2327568838267143),
 (0.4398362663597736, 0.08188502245901781),
 (0.5935920241302689, 0.18129853997045398),
 (0.5206423834177739, 0.1289478139481448),
 (0.5460067123668926, 0.14731895347687196),
 (0.6186104039562084, 0.18650536824445887),
 (0.5279692364385312, 0.17606494823983257),
 (0.45684240845793056, 0.05821801654327013),
 (0.5050122782742061, 0.14832062777042856),
 (0.49239667287986116, 0.1502489272583559),
 (0.6543605392527478, 0.1629278384212371),
 (0.6519106248441643, 0.13074402548556677),
 (0.6236047917783848, 0.24400563439338963),
 (0.5472984510553086, 0.09885931722675327),
 (0.6195292204918832, 0.23657976439381984),
 (0.6014147271570413, 0.14876048626620836),
 (0.6353195740488109, 0.1728444496144985),
 (0.6282759852068465, 0.14115456826

In [42]:
pyLDAvis.enable_notebook()

vis = pyLDAvis.sklearn.prepare(Glda_model, doc_term_matrix, vectorizer, n_jobs=10)

  if sparse and not np.issubdtype(doc_word.dtype, int):


KeyboardInterrupt: 

In [None]:
pyLDAvis.display(vis)