# Information Retrieval - Pandemic Investigation

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

from gensim.models.coherencemodel import CoherenceModel

import TextCleaning

In [None]:
'''
# CLEANED AND PROCESSED DATA PULL - processed abstracts are in docs column.  Will need to create new
# corpus and id2word

# import NSF data
#f = open('../../data/prd/RND Topic Modelling/nsf_stanford_lemma.sav', 'rb')

# import entire dataset
f = open('../../data/prd/RND Topic Modelling/lda_data_stanford_lemma.sav', 'rb')

[temp1, temp2, docs] = pickle.load(f)
f.close()

# corpus - word frequency in docs
# id2word - dictionary
# docs - lemmatized abstracts
'''

In [None]:
'''
# from Sam's code

#docs=docs.reset_index(drop=True)  
docs = docs.loc[docs.apply(lambda x: len(x)>0)] #No duplicates removed here
docs = docs.reset_index(drop=True)
'''

In [2]:
# ORIGINAL DATA PULL

# pull in original abstracts

raw_df=pd.read_csv('../../data/prd/RND Topic Modelling/abstracts_federal_reporter_combined.csv',engine='python')

# remove null abstracts and duplicates

df = TextCleaning.remove_nulls(raw_df, "ABSTRACT")
df = TextCleaning.remove_duplicates(df)

df.reset_index(inplace = True)
df.rename(columns={'index':'original index'}, inplace=True)

df.head()


3 nulls in  ABSTRACT . These rows removed.
11 duplicate abstracts removed
0 project ID duplicates - not removed


Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,FIRST_CHAR,LAST_CHAR,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_TITLE,PROJECT_TERMS,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,ORGANIZATION_NAME,CFDA_CODE,FY_TOTAL_COST
0,0,89996,"This is a project to explore Game-based, Metap...",2008,"This is a project to explore Game-based, Metap...",.,NSF,NSF,,814512,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,Achievement; analog; base; Cognitive Science; ...,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",WHEELING JESUIT UNIVERSITY,47.076,1999467.0
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Institution: Franklin Institute Science Museum...,.,NSF,NSF,,741659,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,Active Learning; Child; Computer software; des...,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",FRANKLIN INSTITUTE,47.076,1799699.0
2,2,89998,Through programs (including small group conver...,2008,Through programs (including small group conver...,.,NSF,NSF,,813522,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,Address; Age; Birth; Brain; Caregivers; Child;...,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",SCIENCE MUSEUM OF MINNESOTA,47.076,1505858.0
3,3,89999,In partnership with the American Chemical Soci...,2008,In partnership with the American Chemical Soci...,.,NSF,NSF,,838627,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,Advanced Development; American; Chemicals; Che...,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,47.049,51000.0
4,4,90000,Amphibian populations around the world are exp...,2008,Amphibian populations around the world are exp...,.,NSF,NSF,,815315,COLLABORATIVE RESEARCH: EVOLUTION OF AMPHIBIAN...,Amphibia; Central America; Communicable Diseas...,"ZAMUDIO, KELLY R",,CORNELL UNIVERSITY ITHACA,47.074,370996.0


In [3]:
docs = df["ABSTRACT"]

# input needed for doc-term matrix creation is one string per document (not a list of strings).  Original data is
# already in this form!

text = df["ABSTRACT"]

#for doc in docs:
#    text.append(" ".join(doc))    

## Functions needed for all approaches

In [4]:
# Create query vector 

def create_query(words, terms):
    
    # words: search query words
    # terms: terms in corpus
    
    q = np.zeros(len(terms))  # number of terms

    idx = []
    for word in query_words:
        idx.append(terms.index(word))

    q[idx] = 1
    
    return q

In [5]:
def return_top_abstracts(docs, scores, top_n):
    
    '''
    docs: Series that contains abstract
    scores: scores of abstracts
    top_n: return the top_n abstracts given by idx
    '''
    # sort scores in descending order
    scores_sorted_idx = np.argsort(scores)[::-1]
    
    ix = scores_sorted_idx[:top_n]
    print(ix[0:10])
    
    return ix, docs[ix]
    

In [6]:
def create_result_df(abstracts, scores):
    
    df = pd.DataFrame()
    df["abstracts"] = abstracts
    df["scores"] = scores
    
    return df

## Exact word matches - Frequency Count Document-Term Matrix

This will return all abstracts in the corpus with exact word matches to the query.  A query is just a list of words to search for.

Results will be return in sorted order of how high the query scores with each abstract. A high score means more occurences of the query words in the abstract.

In [7]:
# Create document-term matrix based on count frequencies

vectorizer = CountVectorizer()
doc_term_matrix = vectorizer.fit_transform(text)

In [8]:
terms = vectorizer.get_feature_names()

In [38]:
# CHANGE QUERY WORDS HERE

query_words = ["1918"]#['coronavirus']#'pandemic', influenza', 'mers', 'sars', 'zikv', 'denv', 'hiv', 'aids']

q = create_query(query_words, terms)

In [39]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

f_scores = doc_term_matrix.dot(q)

In [40]:
sum(f_scores >0)

156

In [41]:
# sort scores in descending order

f_scores_sorted = np.sort(f_scores)[::-1]
f_scores_sorted[:10]

array([38., 38., 36., 25., 16., 15., 14., 13., 12., 11.])

In [42]:
f_idx, f_top_abstracts = return_top_abstracts(docs, f_scores, 200)

[ 92006 145010 192544 124462 162734 236762 521109 300486  71142  48540]


In [43]:
f_top_abstracts

92006     Influenza A viruses are significant human path...
145010    Influenza A viruses are significant human path...
192544    Influenza A viruses are significant human path...
124462    DESCRIPTION (provided by applicant): Our long-...
162734    DESCRIPTION (provided by applicant): Current v...
                                ...                        
183328    This subproject is one of many research subpro...
183329    This subproject is one of many research subpro...
183330    This subproject is one of many research subpro...
183331    This subproject is one of many research subpro...
183650    This subproject is one of many research subpro...
Name: ABSTRACT, Length: 200, dtype: object

In [None]:
f_top_abstracts.iloc[9]

In [None]:
f_df = create_result_df(docs, f_scores)

In [18]:
f_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",0.0
1,Institution: Franklin Institute Science Museum...,0.0
2,Through programs (including small group conver...,0.0
3,In partnership with the American Chemical Soci...,0.0
4,Amphibian populations around the world are exp...,0.0
...,...,...
550069,The Title IV-E Prevention Services Clearinghou...,0.0
550070,This mixed-methods study seeks to deepen our u...,0.0
550071,The purpose of this project is to examine the ...,0.0
550072,The 2014 Child Care and Development Block Gran...,0.0


## TF-IDF approach

In [15]:
# Find doc-term matrix using TF-IDF weighting

tf_idf_vectorizer = TfidfVectorizer()
tf_idf = tf_idf_vectorizer.fit_transform(text)

In [16]:
tf_idf_terms = tf_idf_vectorizer.get_feature_names()

In [17]:
# CHANGE QUERY WORDS HERE

#query_words = ['pandemic']

q = create_query(query_words, tf_idf_terms)

In [None]:
# I don't need this as it created the exact same query as the create_query function

'''
# transform query to be in same space as documents
# q = q.reshape(1,-1)
qhat = tf_idf_vectorizer.transform(query_words)

temp = qhat.toarray()
qhat = np.reshape(temp, qhat.shape[1])

tf_idf_scores = pairwise_distances(qhat, tf_idf, metric='cosine', n_jobs=19)
'''

In [17]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

tf_idf_scores = tf_idf.dot(q)

In [18]:
sum(tf_idf_scores >0)

347

In [19]:
# sort scores in descending order

tf_idf_scores_sorted = np.sort(tf_idf_scores)[::-1]
tf_idf_scores_sorted[:10]

array([0.71236677, 0.53414312, 0.52442002, 0.52442002, 0.51347095,
       0.50789252, 0.49868827, 0.49668118, 0.49172994, 0.4420692 ])

In [20]:
tfidf_idx, tfidf_top_abstracts = return_top_abstracts(docs, tf_idf_scores, 200)

[394040  60618 269731 163626  22552 227002 297973 122769  56566 180857]


In [21]:
tfidf_top_abstracts

394040    This project focuses on the development of a c...
60618     DESCRIPTION (provided by applicant): Coronavir...
269731    DESCRIPTION (provided by applicant): Coronavir...
163626    DESCRIPTION (provided by applicant):  Coronavi...
22552     DESCRIPTION (provided by applicant): The long-...
                                ...                        
313192    DESCRIPTION (provided by applicant): Since the...
26379     DESCRIPTION (provided by applicant): Despite m...
24669     DESCRIPTION (provided by applicant): A Severe ...
208905    Severe Acute Respiratory Syndrome (SARS) is a ...
111411    Despite many advances in biomedical research, ...
Name: ABSTRACT, Length: 200, dtype: object

In [None]:
tfidf_top_abstracts.iloc[0]

In [None]:
tf_idf_df = create_result_df(docs, tf_idf_scores)

In [None]:
tf_idf_df

## Latent Semantic Indexing (LSI) Approach

Uses the TF-IDF matrix.

In [22]:
# Find the Truncated SVD of the TF-IDF matrix

lsa = TruncatedSVD(n_components=500, random_state=1)
USigma = lsa.fit_transform(tf_idf)
Vtrans = lsa.components_

In [23]:
# CHANGE QUERY WORDS HERE

#query_words = ['pandemic']

q = create_query(query_words, tf_idf_terms)

# transform query to be in same space as documents
q = q.reshape(1,-1)
qhat = lsa.transform(q)

In [24]:
print(qhat.shape)
print(USigma.shape)
print(Vtrans.shape)

(1, 500)
(550074, 500)
(500, 1058314)


In [25]:
lsa_scores = pairwise_distances(qhat, USigma, metric='cosine', n_jobs=19)

In [None]:
lsa_scores.shape

In [None]:
lsa_scores

In [None]:
type(lsa_scores)

In [26]:
sum(lsa_scores[0] > 0)

550074

In [None]:
lsa_scores[0]

In [27]:
# sort scores in descending order

lsa_scores_sorted = np.sort(lsa_scores[0])[::-1]
lsa_scores_sorted[:10]

array([1.17388496, 1.16942798, 1.16393621, 1.16382112, 1.16360107,
       1.16358756, 1.16242954, 1.16170126, 1.1615778 , 1.16146794])

In [28]:
lsa_idx, lsa_top_abstracts = return_top_abstracts(docs, lsa_scores[0], 200)

[ 74935 171536 140160 138426  63439 376551 115251  63937 171573 361659]


In [29]:
lsa_top_abstracts

74935     This subproject is one of many research subpro...
171536    DESCRIPTION (provided by applicant): The Kapos...
140160    Kaposi's sarcoma-associated herpesvirus (KSHV)...
138426    This subproject is one of many research subpro...
63439     DESCRIPTION (provided by applicant): Kaposi's ...
                                ...                        
30914     DESCRIPTION (provided by applicant): Each year...
377648    DESCRIPTION (provided by applicant): Infection...
484532    This project is focused on the clinical study ...
331457    DESCRIPTION (provided by applicant): Recombina...
494806    Project Summary/AbstractThe objective of this ...
Name: ABSTRACT, Length: 200, dtype: object

In [None]:
lsa_top_abstracts.iloc[4]

In [None]:
lsa_df = create_result_df(docs, lsa_scores[0])

In [None]:
lsa_df

## Topic Modeling with relevant pandemic abstracts

In [30]:
docs_ix = np.concatenate([f_idx, tfidf_idx, lsa_idx])

In [31]:
docs_idx = np.unique(docs_ix)

In [32]:
docs_idx.shape

(454,)

In [68]:
type(docs_idx)

numpy.ndarray

In [33]:
lim_docs = [text[i] for i in docs_idx]

In [71]:
lim_docs

['south_africa highest rate hiv aids world still suffer legacy apartheid pandemic put stress exist rural labor constraint cause male urban migration concurrently environmentalist raise concern deepen reliance harvest wild natural resource result hiv aids political ecology disease fitting explore rural woman girl cope labor constraint context wild resource harvesting problem silence group user dependent open access natural resource little national interest suffer epidemic community prefer speak research investigate rural woman child environmental spatial behavior cope household impact hiv aids former homeland transkei objective model varying spatial distribution wild harvesting woman girl compare individual hiv aids afflict household non_afflicted household understand contextual political ecology disease condition individual household spatiotemporal allocation wild harvesting context rural poverty poor health project hypothesize labor constraint cause pandemic influence individual spati

In [34]:
nmf_vectorizer = TfidfVectorizer(stop_words='english', #ngram_range=(1,2) , 
                                 max_df=0.4, min_df=3, lowercase=True) #, max_features=int(len(lim_docs)/2))
#nmf_vectorizer = TfidfVectorizer(max_df=0.4, min_df=3, lowercase=True, max_features=int(len(lim_docs)/2)) #original results
nmf_tf_idf = nmf_vectorizer.fit_transform(lim_docs)

In [35]:
# topic modeling with NMF

nmf_model = NMF(n_components=10, random_state=1)  # original results - 10 topic
W = nmf_model.fit_transform(nmf_tf_idf)
H = nmf_model.components_

In [36]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
        

In [37]:
print_topics(nmf_model, nmf_vectorizer, 10)


Topic 0:
('kshv', 2.0703525986184825)
('ks', 0.5274126933420551)
('endothelial', 0.39358706159536794)
('induced', 0.33024341823424397)
('kaposi', 0.3175125852749127)
('sarcoma', 0.29278704589143967)
('angiogenesis', 0.2689785637555238)
('pel', 0.26806237193550364)
('oral', 0.2651045895538214)
('cellular', 0.24019894863714092)

Topic 1:
('sars', 1.1270853060819372)
('cov', 0.8816728127565958)
('respiratory', 0.301497220050687)
('lung', 0.27723084771058504)
('acute', 0.23607193569014495)
('mice', 0.2200542987992932)
('severe', 0.21693876902259446)
('aim', 0.20014145866035704)
('immune', 0.17987529470914523)
('model', 0.15940768916873937)

Topic 2:
('hsv', 1.502118010898142)
('hiv', 0.4426401179625362)
('genital', 0.4286884309345566)
('reactivation', 0.23697118547708526)
('mucosal', 0.18454513175211307)
('shedding', 0.16091751292920795)
('women', 0.13691416504807213)
('transmission', 0.13206933103326074)
('herpes', 0.12980619722948883)
('immune', 0.11306829684089044)

Topic 3:
('hypoxia'