# Information Retrieval - Pandemic Investigation

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time
import os


from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from gensim.models.coherencemodel import CoherenceModel



unable to import 'smart_open.gcs', disabling that module


In [None]:
"""Here's the error I'm having with coherence. I'm doing the pandemic analysis (AdventunesinPandemicCohrence.ipynb),
and attempting to replicate our fitting process (NumTopicsTune) on the subset of documents,
but when I attempt to calculate coherence, I get errors based on certain tokens. 
I tried four methods to solve the problem: using the original corpus and id2word, 
calculating corpus and id2word based solely on the subset of docs, 
calculating corpus and id2word based solely on a subset of docs AND not filtering extremes in id2word,
and feeding in docs that only contain tokens found in id2word. 
Only one of those works: not filtering extremes, and I have no idea why and what the implications are here."""

In [2]:
import pickle 
os.chdir('/home/sc2pg/src/prnd/publicrd/data/prd/RND Topic Modelling') 

In [3]:
top_docs_per_method=600
file_path=('lda_data_stanford_lemma.sav')

In [4]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
            
def list_topics(model, vectorizer, top_n=10):

    #input. top_n: how many words to list per topic.  If -1, then list all words.
       
    topic_words = []
    
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        #print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        if top_n == -1:   
            topic_words.append([vectorizer.get_feature_names()[i] for i in topic.argsort()[::-1]])
        else:
            topic_words.append([vectorizer.get_feature_names()[i] for i in topic.argsort()[:-top_n - 1:-1]])
        
    return topic_words
        

In [5]:

f = open(file_path, 'rb')

[og_corpus, og_id2word, docs] = pickle.load(f)
f.close()

# corpus - word frequency in docs
# id2word - dictionary
# docs - lemmatized abstracts


In [6]:
docs = docs.loc[docs.apply(lambda x: len(x)>0)] #No duplicates removed here
docs=docs.reset_index(drop=True)
docs

0         [project, explore, game, base, metaphor, enhan...
1         [institution, science, museum, pi, steve, proj...
2         [program, small, group, conversation, citizen,...
3         [partnership, american, chemical, society, acs...
4         [amphibian, population, world, experience, dec...
                                ...                        
543409    [establish, administration_children_families, ...
543410    [mix, method, study, seek, deepen, understandi...
543411    [purpose, project, examine, long_term, effect,...
543412    [child, care, development, block, grant, ccdbg...
543413    [goal, study, understand, use, positive, behav...
Name: final_tokens, Length: 543414, dtype: object

In [7]:
# input needed for doc-term matrix creation is one string per document (not a list of strings). 

docs=docs[0:100] #A subset of the docs
text = []
i=0
for doc in docs:
    text.append(" ".join(doc))


In [14]:
#Sample model
num_topics=4
keep_only_most_common=int(len(docs)/2)
vectorizer = TfidfVectorizer(max_df=0.4, min_df=3, lowercase=False, max_features=keep_only_most_common)
doc_term_matrix = vectorizer.fit_transform(text)

nmf_model = NMF(n_components=num_topics, random_state = i)
nmf_model.fit_transform(doc_term_matrix)

# create list of topics
topics = list_topics(nmf_model, vectorizer, top_n=10)

In [15]:
#Using pickled corpus/id2word
# calculate coherence
cm = CoherenceModel(topics=topics, 
                    corpus=og_corpus,
                    dictionary=og_id2word,
                    texts=docs, 
                    coherence='c_v', 
                    processes=20) #window_size=500 ) 
print(cm)

KeyError: 'learning'

In [16]:
#Using new dictionary generated just from the subset of docs, but filtering extremes
id2word = gensim.corpora.Dictionary(docs)
keep_only_most_common=int(len(docs)/2) #LDA works best with less features than documents
id2word.filter_extremes(no_below=3, no_above=0.4, keep_n=keep_only_most_common)
corpus = [id2word.doc2bow(doc) for doc in docs]

cm = CoherenceModel(topics=topics, 
                    corpus=corpus,
                    dictionary=id2word,
                    texts=docs, 
                    coherence='c_v', 
                    processes=20) #window_size=500 ) 
print(cm)

KeyError: 'teacher'

In [17]:
#Using new dictionary generated just from the subset of docs, but filtering extremes
id2word = gensim.corpora.Dictionary(docs)
keep_only_most_common=int(len(docs)/2) #LDA works best with less features than documents
#id2word.filter_extremes(no_below=3, no_above=0.4, keep_n=keep_only_most_common)
corpus = [id2word.doc2bow(doc) for doc in docs]

cm = CoherenceModel(topics=topics, 
                    corpus=corpus,
                    dictionary=id2word,
                    texts=docs, 
                    coherence='c_v', 
                    processes=20) #window_size=500 ) 
print(cm)

Coherence_Measure(seg=<function s_one_set at 0x7f3d3e873c20>, prob=<function p_boolean_sliding_window at 0x7f3d3e87b950>, conf=<function cosine_similarity at 0x7f3d3e8863b0>, aggr=<function arithmetic_mean at 0x7f3d3e5a1b00>)


In [18]:
#Filtering documents so that they only contain tokens found in id2word

id2word = gensim.corpora.Dictionary(docs)
keep_only_most_common=int(len(docs)/2) #LDA works best with less features than documents
id2word.filter_extremes(no_below=3, no_above=0.4, keep_n=keep_only_most_common)
corpus = [id2word.doc2bow(doc) for doc in docs]

sanitized_docs=[]
for doc in docs:
    sanitized_docs.append([token for token in doc if token in list(id2word.token2id.keys())])


cm = CoherenceModel(topics=topics, 
                    corpus=corpus,
                    dictionary=id2word,
                    texts=sanitized_docs, 
                    coherence='c_v', 
                    processes=20) #window_size=500 ) 
print(cm)

KeyError: 'teacher'