# Information Retrieval - Pandemic Investigation

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from gensim.models.coherencemodel import CoherenceModel


unable to import 'smart_open.gcs', disabling that module


In [2]:
#In this script, I attempted to take the subset of documents produced by IR.ipynb (copied into here), and tune the number of topics and best fit model using coherence
#However, for some reason, coherence couldn't be calculated--this script is still broken as a result and needs to be updated to fix coherence calculations.

#Please also see the query terms, which are more expanded than "pandemic", and could improve this approach moving forward

In [3]:
import os
import pickle 
os.chdir('/home/sc2pg/src/prnd/publicrd/data/prd/RND Topic Modelling') 

In [4]:
top_docs_per_method=600
file_path=('lda_data_stanford_lemma.sav') #Entire dataset

In [5]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
        

In [6]:

# CLEANED AND PROCESSED DATA PULL

# import NSF data
#f = open('/project/biocomplexity/sdad/projects_data/ncses/prd/RND Topic Modelling/agency_data.sav', 'rb')
#f = open('nsf_stanford_lemma.sav', 'rb')

# import entire dataset
f = open(file_path, 'rb')

[corpus, id2word, docs] = pickle.load(f)
f.close()

# corpus - word frequency in docs
# id2word - dictionary
# docs - lemmatized abstracts


In [7]:
docs=docs.reset_index(drop=True)
docs = docs.loc[docs.apply(lambda x: len(x)>0)] #No duplicates removed here
corpus=[x for i,x in enumerate(corpus) if i in docs.index]
docs=docs.reset_index(drop=True)


In [8]:
len(docs)

543414

In [9]:
len(corpus)

543414

In [10]:
# input needed for doc-term matrix creation is one string per document (not a list of strings).  This is 
# already the format of df["ABSTRACT"] so nothing to do here


text = []
i=0
for doc in docs:
    text.append(" ".join(doc))
   

## Functions needed for all approaches

In [11]:
# Create query vector 

def create_query(words, terms):
    
    # words: search query words
    # terms: terms in corpus
    
    q = np.zeros(len(terms))  # number of terms

    idx = []
    for word in query_words:
        idx.append(terms.index(word))

    q[idx] = 1
    
    return q

In [12]:

def list_topics(model, vectorizer, top_n=10):

    #input. top_n: how many words to list per topic.  If -1, then list all words.
       
    topic_words = []
    
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        #print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        if top_n == -1:   
            topic_words.append([vectorizer.get_feature_names()[i] for i in topic.argsort()[::-1]])
        else:
            topic_words.append([vectorizer.get_feature_names()[i] for i in topic.argsort()[:-top_n - 1:-1]])
        
    return topic_words

In [13]:
def return_top_abstracts(df, scores, top_n):
    
    '''
    df: dataframe that contains ABSTRACT column
    scores: scores of abstracts
    top_n: return the top_n abstracts given by idx
    '''
    # sort scores in descending order
    scores_sorted_idx = np.argsort(scores)[::-1]
    
    ix = scores_sorted_idx[:top_n]
    print(ix[0:10])
    
    return ix, docs[ix]
    

In [14]:
def create_result_df(abstracts, scores):
    
    df = pd.DataFrame()
    df["abstracts"] = abstracts
    df["scores"] = scores
    
    return df

## Exact word matches - Frequency Count Document-Term Matrix

This will return all abstracts in the corpus with exact word matches to the query.  A query is just a list of words to search for.

Results will be return in sorted order of how high the query scores with each abstract. A high score means more occurences of the query words in the abstract.

In [15]:
# Create document-term matrix based on count frequencies

# create document-term matrix

vectorizer = CountVectorizer(max_df=0.4, min_df=3, lowercase=False, max_features=int(len(docs)/2))
doc_term_matrix = vectorizer.fit_transform(text)

In [16]:
terms = vectorizer.get_feature_names()

In [17]:
# CHANGE QUERY WORDS HERE

query_words=[x for x in terms if 'pandemic' in x or 'epidemic' in x]
query_words.extend(['contagion','contagious','sars','h1n1', 'outbreak','zika'])#, 'epidemic','contagion','contagious']#'h1n1','epidemic','sars']
q = create_query(query_words, terms)

In [18]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

f_scores = doc_term_matrix.dot(q)

In [19]:
sum(f_scores >0)

12065

In [20]:
# sort scores in descending order

f_scores_sorted = np.sort(f_scores)[::-1]
f_scores_sorted[:10]

array([38., 28., 27., 26., 25., 24., 23., 21., 21., 20.])

In [21]:
f_idx, f_top_abstracts = return_top_abstracts(docs, f_scores, top_docs_per_method)

[233719 143617  91351 233180 289274 232152 190307 233418 300559 342174]


In [22]:
f_df = create_result_df(text, f_scores)

## TF-IDF approach

In [23]:
# Find doc-term matrix using TF-IDF weighting

tf_idf_vectorizer = TfidfVectorizer()
tf_idf = tf_idf_vectorizer.fit_transform(text)

In [24]:
tf_idf_terms = tf_idf_vectorizer.get_feature_names()

In [25]:
# CHANGE QUERY WORDS HERE

#query_words = ['pandemic']

q = create_query(query_words, tf_idf_terms)

In [26]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

tf_idf_scores = tf_idf.dot(q)

In [27]:
sum(tf_idf_scores >0)

12065

In [28]:
# sort scores in descending order

tf_idf_scores_sorted = np.sort(tf_idf_scores)[::-1]
tf_idf_scores_sorted[:10]

array([1.73887383, 1.56009364, 1.54074833, 1.45882028, 1.32375162,
       1.15300596, 1.12886319, 1.12811907, 1.06449067, 1.05236444])

In [29]:
tfidf_idx, tfidf_top_abstracts = return_top_abstracts(docs, tf_idf_scores, top_docs_per_method)

[146156 289274 233719 146682 342174 431512  84846 233180 233418 294876]


In [30]:
tfidf_top_abstracts

146156    [influenza, virus, iav, significant, human, pa...
289274    [influenza_pandemics, foremost, international,...
233719    [influenza, virus, significant, human, pathoge...
146682    [influenza, virus, iav, significant, human, pa...
342174    [influenza_pandemics, foremost, international,...
                                ...                        
344618    [dengue, world, important, arbovirus, cause, e...
72983     [severe, acute, respiratory, syndrome_sars, re...
215604    [rapid, accurate, identification, highly, infe...
71238     [core, part, program, project, sars_coronaviru...
399885    [recent, event, pandemic_influenza, h1n1_pdm, ...
Name: final_tokens, Length: 600, dtype: object

In [31]:
tf_idf_df = create_result_df(text, tf_idf_scores)

## Latent Semantic Indexing (LSI) Approach

Uses the TF-IDF matrix.

In [32]:
# Find the Truncated SVD of the TF-IDF matrix

lsa = TruncatedSVD(n_components=500, random_state=1)
USigma = lsa.fit_transform(tf_idf)
Vtrans = lsa.components_

In [33]:
# CHANGE QUERY WORDS HERE

#query_words = ['pandemic']

q = create_query(query_words, tf_idf_terms)

# transform query to be in same space as documents
q = q.reshape(1,-1)
qhat = lsa.transform(q)

In [34]:
print(qhat.shape)
print(USigma.shape)
print(Vtrans.shape)

(1, 500)
(543414, 500)
(500, 1098073)


In [35]:
lsa_scores = pairwise_distances(qhat, USigma, metric='cosine', n_jobs=19)

In [36]:
lsa_scores.shape

(1, 543414)

In [37]:
lsa_scores

array([[1.03471822, 1.01045798, 0.99789794, ..., 1.01919306, 1.01570976,
        1.00180207]])

In [38]:
type(lsa_scores)

numpy.ndarray

In [39]:
sum(lsa_scores[0] > 0)

543414

In [40]:
lsa_scores[0]

array([1.03471822, 1.01045798, 0.99789794, ..., 1.01919306, 1.01570976,
       1.00180207])

In [41]:
# sort scores in descending order

lsa_scores_sorted = np.sort(lsa_scores[0])[::-1]
lsa_scores_sorted[:10]

array([1.12595817, 1.12595817, 1.12595817, 1.12281549, 1.12281065,
       1.11019888, 1.11003927, 1.10904275, 1.10541619, 1.1043847 ])

In [42]:
lsa_idx, lsa_top_abstracts = return_top_abstracts(docs, lsa_scores[0], top_docs_per_method)

[182549 220675  23881  74450 136650 182663 221223  18399  74273 136151]


In [43]:
lsa_top_abstracts

182549    [efficacy, antiretroviral, therapy, art, hiv, ...
220675    [efficacy, antiretroviral, therapy, art, hiv, ...
23881     [efficacy, antiretroviral, therapy, art, hiv, ...
74450     [objective, efficacy, antiretroviral, therapy,...
136650    [efficacy, antiretroviral, therapy, arv, hiv, ...
                                ...                        
272961    [recent, study, demonstrate, nuclear, factor, ...
482804    [gut, brain, connect, gut, brain, axis, gba, n...
485033    [long_term, goal, research, understand, molecu...
138427    [background, blockade, regulatory, pathway, me...
413130    [nef, protein, hiv, essential, infectivity, vi...
Name: final_tokens, Length: 600, dtype: object

In [44]:
lsa_top_abstracts.iloc[0]

['efficacy',
 'antiretroviral',
 'therapy',
 'art',
 'hiv',
 'infect',
 'individual',
 'determine',
 'restoration',
 'peripheral',
 'blood',
 'cd4',
 'cell',
 'number',
 'viral',
 'suppression',
 'peripheral',
 'blood',
 'represent',
 'total',
 'lymphocyte',
 'body',
 'contrast',
 'gut',
 'associate',
 'lymphoid',
 'tissue',
 'galt',
 'harbor',
 'lymphocyte',
 'body',
 'previous',
 'study',
 'show',
 'severe',
 'cd4',
 'cell',
 'depletion',
 'occur',
 'galt',
 'primary',
 'hiv',
 'infection',
 'cd4',
 'cell',
 'restoration',
 'galt',
 'modest',
 'slow',
 'compare',
 'peripheral',
 'blood',
 'art',
 'change',
 'galt',
 'adequately',
 'reflect',
 'peripheral',
 'blood',
 'analysis',
 'kinetic',
 'mechanism',
 'cd4',
 'cell',
 'restoration',
 'function',
 'galt',
 'follow',
 'art',
 'fully',
 'determine',
 'simian_immunodeficiency_virus',
 'siv',
 'infect',
 'rhesus_macaque',
 'provide',
 'excellent',
 'animal',
 'model',
 'study',
 'gut',
 'mucosal',
 'immune',
 'system',
 'comparison',


In [45]:
lsa_df = create_result_df(docs, lsa_scores[0])

In [46]:
lsa_df

Unnamed: 0,abstracts,scores
0,"[project, explore, game, base, metaphor, enhan...",1.034718
1,"[institution, science, museum, pi, steve, proj...",1.010458
2,"[program, small, group, conversation, citizen,...",0.997898
3,"[partnership, american, chemical, society, acs...",0.990249
4,"[amphibian, population, world, experience, dec...",0.834095
...,...,...
543409,"[establish, administration_children_families, ...",1.001070
543410,"[mix, method, study, seek, deepen, understandi...",1.005692
543411,"[purpose, project, examine, long_term, effect,...",1.019193
543412,"[child, care, development, block, grant, ccdbg...",1.015710


## Topic Modeling with relevant pandemic abstracts

In [48]:
docs_idx = np.concatenate([f_idx, tfidf_idx, lsa_idx])
docs_idx = np.unique(docs_idx)
docs_idx.shape

(1401,)

In [49]:
docs_topic=docs[docs_idx].reset_index(drop=True)

In [32]:
#From here, who knows why it doesn't work
id2wordx = gensim.corpora.Dictionary(docs_topic)

keep_only_most_common=int(len(docs_topic)/2) #LDA works best with less features than documents
#Filter words to only those found in at least a set number of documents (min_appearances)
id2wordx.filter_extremes(no_below=3, no_above=0.4, keep_n=keep_only_most_common)
corpusx = [id2word.doc2bow(doc) for doc in docs_topic]


In [34]:
mini_text=[]


for doc in docs_topic:
    mini_text.append(" ".join(doc))


In [35]:
mini_corpus=[doc for i,doc in enumerate(corpus) if i in docs_idx]

In [36]:
len(docs_topic), len(mini_corpus)

(801, 801)

In [37]:
nmf_vectorizer = TfidfVectorizer(max_df=0.4, min_df=3, lowercase=True, max_features=int(len(docs_topic)/2))
nmf_tf_idf = nmf_vectorizer.fit_transform(mini_text)

In [None]:
#Only tokens found in the dictionary? Maybe that will help
sanitized_docs=[]
for doc in docs_topic:
    san_doc=[]
    sanitized_docs.append(" ".join([token for token in doc if token in list(id2wordx.token2id.keys())]))


In [38]:
len(mini_text)

801

In [None]:
# topic modeling with NMF

#nmf_model = NMF(n_components=20, random_state=1)
#W = nmf_model.fit_transform(nmf_tf_idf)
#H = nmf_model.components_

In [39]:
# function adapted from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

def nmf_metrics(doc_term_matrix, n_topics, vectorizer, corpus, id2word, docs, rand_start):
    coherence_values = []
    
    i = rand_start
    for num_topics in n_topics:

        # create model
        nmf_model = NMF(n_components=num_topics, random_state = i)
        nmf_model.fit_transform(doc_term_matrix)
        
        # create list of topics
        topics = list_topics(nmf_model, vectorizer, top_n=10)
        
        # calculate coherence
        cm = CoherenceModel(topics=topics, 
                            corpus=corpus,
                            dictionary=id2word,
                            texts=docs, 
                            coherence='c_v', 
                            processes=10) #window_size=500 ) 
        coherence_values.append(cm.get_coherence())
        
        # output completion message
        i = i+1
        print('Number of topics =', num_topics, "complete.")

    return coherence_values

# Function to format topics as a "list of list of strings".
# Needed for topic coherence function in Gensim

# function modified from https://nlpforhackers.io/topic-modeling/



In [47]:
len(mini_text), len(mini_corpus), len(docs_topic)

(801, 801, 801)

269109

In [44]:
# create model
coherence_values=[]

for num_topics in range(5,56,5):
    doc_term_matrix_x=nmf_vectorizer.fit_transform(mini_text)
    nmf_model_x = NMF(n_components=num_topics, random_state = 0)
    nmf_model_x.fit_transform(doc_term_matrix_x)
    # create list of topics
    topics_x = list_topics(nmf_model_x, nmf_vectorizer, top_n=10)
    print(num_topics)
    cm = CoherenceModel(topics=topics_x, 
                                corpus=mini_corpus,
                                dictionary=id2word,
                                texts=docs_topic, 
                                coherence='c_v', 
                                processes=10) #window_size=500 ) 
    coherence_values.append(cm.get_coherence())

5
10
15
20


KeyError: 'zika'

In [46]:
# code copied from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# minor alterations made

n_topics = range(5,56,5)
num_runs = 10

col_names = [f"iteration {i}" for i in range(num_runs)]
nmf_c = pd.DataFrame(index = n_topics, columns = col_names)
for i in range(num_runs):
    
    print(f"Iteration {i}")
    
    # run models
    c = nmf_metrics(doc_term_matrix=doc_term_matrix_x, n_topics=n_topics, vectorizer=nmf_vectorizer, 
                         corpus=mini_corpus, id2word=id2word, docs=docs_topic, rand_start = i*len(n_topics))
    
    # save results
    nmf_c[f"iteration {i}"] = c
    nmf_c.to_pickle("./pandemic_nmf_c_intermittent.pkl")   

Iteration 0
Number of topics = 5 complete.
Number of topics = 10 complete.


KeyError: 'zika'

In [None]:
print_topics(nmf_model, nmf_vectorizer, 10)

In [None]:
id2word.token2id['zika'] #That token is definitely in the dictionary!