# Information Retrieval - Pandemic Investigation  
This notebook retrieves abstracts relevant to pandemics and then uses topic modeling to analyze the chosen abstracts.  Three info retrieval techniques are used: Literal Term Matching, TF-IDF, and Latent Semantic Indexing.  These are linear algebra techniques.  
We use the Scikit-Learn library.

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

#from gensim.models.coherencemodel import CoherenceModel

import TextCleaning

In [2]:
# pull in raw abstracts

raw_df=pd.read_csv('../../data/original/raw_abstracts.csv',engine='python')

# remove null abstracts and duplicates

df = TextCleaning.remove_nulls(raw_df, "ABSTRACT")
df = TextCleaning.remove_duplicates(df)

df.reset_index(inplace = True)
df.rename(columns={'index':'original index'}, inplace=True)

3 nulls in  ABSTRACT . These rows removed.
11 duplicate abstracts removed
0 project ID duplicates - not removed


In [3]:
len(df)

550074

In [4]:
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,FIRST_CHAR,LAST_CHAR,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_TITLE,PROJECT_TERMS,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,ORGANIZATION_NAME,CFDA_CODE,FY_TOTAL_COST
0,0,89996,"This is a project to explore Game-based, Metap...",2008,"This is a project to explore Game-based, Metap...",.,NSF,NSF,,814512,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,Achievement; analog; base; Cognitive Science; ...,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",WHEELING JESUIT UNIVERSITY,47.076,1999467.0
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Institution: Franklin Institute Science Museum...,.,NSF,NSF,,741659,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,Active Learning; Child; Computer software; des...,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",FRANKLIN INSTITUTE,47.076,1799699.0
2,2,89998,Through programs (including small group conver...,2008,Through programs (including small group conver...,.,NSF,NSF,,813522,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,Address; Age; Birth; Brain; Caregivers; Child;...,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",SCIENCE MUSEUM OF MINNESOTA,47.076,1505858.0
3,3,89999,In partnership with the American Chemical Soci...,2008,In partnership with the American Chemical Soci...,.,NSF,NSF,,838627,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,Advanced Development; American; Chemicals; Che...,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,47.049,51000.0
4,4,90000,Amphibian populations around the world are exp...,2008,Amphibian populations around the world are exp...,.,NSF,NSF,,815315,COLLABORATIVE RESEARCH: EVOLUTION OF AMPHIBIAN...,Amphibia; Central America; Communicable Diseas...,"ZAMUDIO, KELLY R",,CORNELL UNIVERSITY ITHACA,47.074,370996.0


In [5]:
# input needed for doc-term matrix creation in Scikit-Learn is one string per document (not a list of strings).  
# Original data is already in this form!

docs = df["ABSTRACT"] 

## Functions needed for all info retrieval approaches

In [6]:
# Create query vector 

def create_query(words, terms):
    
    # words: search query words
    # terms: terms in corpus
    
    q = np.zeros(len(terms))  # number of terms

    idx = []
    for word in query_words:
        idx.append(terms.index(word))

    q[idx] = 1
    
    return q

In [7]:
def return_top_abstracts(docs, scores, top_n):
    
    '''
    docs: Series that contains abstract
    scores: scores of abstracts
    top_n: return the top_n abstracts given by idx
    '''
    # sort scores in descending order
    scores_sorted_idx = np.argsort(scores)[::-1]
    
    ix = scores_sorted_idx[:top_n]
    print(ix[0:10])
    
    return ix, docs[ix]
    

In [8]:
def create_result_df(abstracts, scores):
    
    df = pd.DataFrame()
    df["abstracts"] = abstracts
    df["scores"] = scores
    
    return df

## Literal Term Matching - Frequency Count Document-Term Matrix

This will return all abstracts in the corpus with exact word matches to the query.  

Results will be return in sorted order of how high the query scores with each abstract. A high score means more occurences of the query words in the abstract.

In [9]:
# Create document-term matrix based on count frequencies

vectorizer = CountVectorizer(ngram_range=(1,2)) #  added parameter to search for bi-grams in addition to single words
doc_term_matrix = vectorizer.fit_transform(docs)

In [10]:
terms = vectorizer.get_feature_names()

### Create Query Words - list the search terms

A query is just a list of words to search for in the corpus.  We will use the same query for all three info retrieval techniques.

In [11]:
# CHANGE QUERY WORDS HERE

query_words = ["pandemic", "contagion", "infection", "quarantine", "test", "virus", "death", "vaccine", "treatment", "public health", "food supply"]
# other example: ['pandemic', influenza', 'mers', 'sars', 'zikv', 'denv', 'hiv', 'aids']
#spread, illness,  "water supply", vaccination, "social distancing"

q = create_query(query_words, terms)

In [12]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

f_scores = doc_term_matrix.dot(q)

In [13]:
sum(f_scores >0)  # how many abstracts include at least one of the query words

274409

In [14]:
# sort scores in descending order

f_scores_sorted = np.sort(f_scores)[::-1]
f_scores_sorted[:11]

array([68., 62., 59., 56., 56., 55., 53., 52., 51., 51., 51., 50., 49.,
       48., 48., 47.])

In [15]:
f_idx, f_top_abstracts = return_top_abstracts(docs, f_scores, 500)  # CHANGE NUMBER OF TOP DOCS RETURNED

[235170  92724 436402 193850 297868 147665 235592 300486 298874 394489]


Changing # of top docs to 500 didn't change the returned list of 10 numbers

In [16]:
f_top_abstracts

235170    The hemagglutinin of the 2009 pandemic H1N1 in...
92724     Dengue serotype 1 vaccine development: The liv...
436402    (A) Study pathogenesis and pathophysiology of ...
193850    (I) Disease models and pathogenesis Syrian gol...
297868    The epidemiological success of pandemic and ep...
                                ...                        
166068    DESCRIPTION (provided by applicant): A CTL-bas...
530033    Our main vaccine platform is based on recombin...
538078    The immune responses to Pneumocystis are poorl...
397260    Background: Dengue infections rank second amon...
441954    The major obstacle of basic and antiviral rese...
Name: ABSTRACT, Length: 500, dtype: object

In [17]:
f_top_abstracts.iloc[0]

'The hemagglutinin of the 2009 pandemic H1N1 influenza virus is a derivative of and is antigenically related to classical swine but not to seasonal human H1N1 viruses. We compared the A/California/7/2009 (CA/7/09) virus recommended by the WHO as the reference virus for vaccine development, with two classical swine influenza viruses A/swine/Iowa/31 (sw/IA/31) and A/New Jersey/8/1976 (NJ/76) to establish the extent of immunologic cross-reactivity and cross-protection in animal models. Primary infection with 2009 pandemic or NJ/76 viruses elicited antibodies against the CA/7/09 virus and provided complete protection from challenge with this virus in ferrets; the response in mice was variable and conferred partial protection. Although ferrets infected with sw/IA/31 virus developed low titers of cross-neutralizing antibody, they were protected from pulmonary replication of the CA/7/09 virus. The data suggest that prior exposure to antigenically related H1N1 viruses of swine-origin, by prior

In [18]:
f_df = create_result_df(docs, f_scores)

In [19]:
f_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",2.0
1,Institution: Franklin Institute Science Museum...,1.0
2,Through programs (including small group conver...,0.0
3,In partnership with the American Chemical Soci...,0.0
4,Amphibian populations around the world are exp...,2.0
...,...,...
550069,The Title IV-E Prevention Services Clearinghou...,1.0
550070,This mixed-methods study seeks to deepen our u...,0.0
550071,The purpose of this project is to examine the ...,0.0
550072,The 2014 Child Care and Development Block Gran...,0.0


## TF-IDF Document-Term Matrix

This approach is similar to Literal Term Matching using frequency counts in the document-term matrix.  However, instead of using frequency counts, the entries of the document-term matrix are weighted using TF-IDF.

In [20]:
# Find doc-term matrix using TF-IDF weighting

tf_idf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tf_idf = tf_idf_vectorizer.fit_transform(docs)

In [21]:
tf_idf_terms = tf_idf_vectorizer.get_feature_names()  # these terms are the same as the terms created from the 
                                                      # frequency count document-term matrix, so we do not need to
                                                      # recreate the query vector.

In [22]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

tf_idf_scores = tf_idf.dot(q)

In [23]:
sum(tf_idf_scores >0)   # how many abstracts include at least one of the query words

274409

In [24]:
# sort scores in descending order

tf_idf_scores_sorted = np.sort(tf_idf_scores)[::-1]
tf_idf_scores_sorted[:11]

array([1.        , 1.        , 0.63820489, 0.580568  , 0.54382679,
       0.54323994, 0.53541625, 0.53382216, 0.52973697, 0.51530088,
       0.50228369, 0.50164983, 0.50015078, 0.49486197, 0.49486197,
       0.49453011])

In [25]:
tfidf_idx, tfidf_top_abstracts = return_top_abstracts(docs, tf_idf_scores, 500)  # CHANGE NUMBER OF TOP DOCS RETURNED

[ 93772 351559 235170  62499 194186 436402 300486 145793 128720 112158]


This numeric output did not change when # of top docs was changed to 500

In [26]:
tfidf_top_abstracts

93772                                                  Test
351559                                                 test
235170    The hemagglutinin of the 2009 pandemic H1N1 in...
62499     DESCRIPTION (provided by applicant): Influenza...
194186    Influenza A viruses (IAV) are significant huma...
                                ...                        
441379    ﻿   DESCRIPTION (provided by applicant):  Pers...
285384    DESCRIPTION (provided by applicant): The earli...
292362    DESCRIPTION (provided by applicant): Influenza...
18578     DESCRIPTION (provided by applicant): We have i...
152580    Simple contagion processes underlie various ph...
Name: ABSTRACT, Length: 500, dtype: object

What are these first two with "Test" as the abstract?

In [27]:
tf_idf_df = create_result_df(docs, tf_idf_scores)

In [28]:
tf_idf_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",0.021511
1,Institution: Franklin Institute Science Museum...,0.012649
2,Through programs (including small group conver...,0.000000
3,In partnership with the American Chemical Soci...,0.000000
4,Amphibian populations around the world are exp...,0.047417
...,...,...
550069,The Title IV-E Prevention Services Clearinghou...,0.012009
550070,This mixed-methods study seeks to deepen our u...,0.000000
550071,The purpose of this project is to examine the ...,0.000000
550072,The 2014 Child Care and Development Block Gran...,0.000000


## Latent Semantic Indexing (LSI) Approach

LSI Uses the TF-IDF matrix.  LSI is a tecnique that utilizes a truncated Singular Value Decomposition of the document-term matrix.  Basically, LSI still returns relevant documents to the query; however some of the documents returned may not include the exact search terms!  LSI is finding the latent or hidden relationships in the terms.  

In [29]:
# Find the Truncated SVD of the TF-IDF matrix

lsa = TruncatedSVD(n_components=10, random_state=1)  # CHANGE THE NUMBER OF COMPONENTS - NOTE: MORE COMPONENTS 
                                                      # GIVES YOU A MORE ACCURATE APPROXIMATION OF THE DOC-TERM 
                                                      # MATRIX, BUT IS ALSO MORE EXPENSIVE AND MAY NOT LEAD TO THE 
                                                      # BEST INFO RETRIEVAL RESULTS.
USigma = lsa.fit_transform(tf_idf)
Vtrans = lsa.components_

In [30]:
# transform query to be in same space as documents

q = q.reshape(1,-1)
qhat = lsa.transform(q)

In [31]:
print(qhat.shape)
print(USigma.shape)
print(Vtrans.shape)

(1, 10)
(550074, 10)
(10, 17893410)


In [53]:
lsa_scores = pairwise_distances(qhat, USigma, metric='cosine', n_jobs=39)  # CHANGE N_JOBS TO BE NUMBER OF CORES - 1

In [54]:
lsa_scores.shape

(1, 550074)

In [55]:
lsa_scores

array([[1.004815  , 1.0282897 , 0.95058173, ..., 0.87933965, 0.80260872,
        0.84413363]])

In [56]:
type(lsa_scores)

numpy.ndarray

In [57]:
sum(lsa_scores[0] > 0)  # how many abstracts scored above 0

550074

In [58]:
lsa_scores[0]

array([1.004815  , 1.0282897 , 0.95058173, ..., 0.87933965, 0.80260872,
       0.84413363])

In [69]:
# sort scores in descending order

lsa_scores_sorted = np.sort(lsa_scores[0])[::-1]
lsa_scores_sorted[:11]

array([1.47230414, 1.40495041, 1.36192448, 1.33727175, 1.33727175,
       1.33727175, 1.33727175, 1.33727175, 1.33727175, 1.33727175,
       1.3304726 ])

In [60]:
lsa_idx, lsa_top_abstracts = return_top_abstracts(docs, lsa_scores[0], 00)  # CHANGE NUMBER OF TOP DOCS RETURNED

[147470   1691 454688 232414 104652 395190 464121 354319 300234 536228]


In [61]:
lsa_top_abstracts

147470                               KKHKHKHKHKFFFFFLLLLLLL
1691                               PHOTOCATALYTIC OXIDATION
454688                        DOE Digital Object Identifier
232414                                               #NAME?
104652                                               #NAME?
                                ...                        
419229    Planning and Evaluation Core Project Summary/A...
540078    This award provides travel support for student...
306678    The Targeted Infusion Project at Howard Univer...
148602    This project will bring NASA climate change an...
119670    Principal Investigator/Program Director (Last,...
Name: ABSTRACT, Length: 500, dtype: object

In [62]:
lsa_df = create_result_df(docs, lsa_scores[0])

In [63]:
lsa_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",1.004815
1,Institution: Franklin Institute Science Museum...,1.028290
2,Through programs (including small group conver...,0.950582
3,In partnership with the American Chemical Soci...,0.980614
4,Amphibian populations around the world are exp...,0.754310
...,...,...
550069,The Title IV-E Prevention Services Clearinghou...,0.903760
550070,This mixed-methods study seeks to deepen our u...,0.806248
550071,The purpose of this project is to examine the ...,0.879340
550072,The 2014 Child Care and Development Block Gran...,0.802609


## Create pandemics corpus

We use the results of our three information retrieval techniques to create a new, smaller corpus that only contains abstracts relevant to the query. 

In [43]:
docs_ix = np.concatenate([f_idx, tfidf_idx, lsa_idx])

In [44]:
docs_idx = np.unique(docs_ix)

In [45]:
docs_idx.shape

(989,)

In [46]:
lim_docs = [docs[i] for i in docs_idx]

In [47]:
# print out the abstracts of the pandemics corpus -- long text output!

#lim_docs

## Topic Modeling with relevant pandemic abstracts

In [64]:
# Create a TF-IDF document-term matrix for the pandemics corpus 

# TRY DIFFERENT PARAMETERS IN THE TF-IDF DOC-TERM MATRIX SET-UP
nmf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2) , 
                                 max_df=0.4, min_df=3, lowercase=True) #, max_features=int(len(lim_docs)/2))

nmf_tf_idf = nmf_vectorizer.fit_transform(lim_docs)

In [65]:
# topic modeling with NMF

nmf_model = NMF(n_components=10, random_state=1)  # TRY DIFFERENT NUMBERS OF TOPICS
W = nmf_model.fit_transform(nmf_tf_idf)
H = nmf_model.components_

In [66]:
# TRY TOPIC MODELING WITH LDA



In [67]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
        

In [68]:
print_topics(nmf_model, nmf_vectorizer, 10)


Topic 0:
('hiv', 1.4334181418838918)
('siv', 0.6209304993983029)
('responses', 0.47182203253599087)
('cd4', 0.4075446229351998)
('hiv infection', 0.36431171546324054)
('cd8', 0.36052566233285377)
('mucosal', 0.3588899950642283)
('macaques', 0.2957820124362018)
('hiv vaccine', 0.2805870738590329)
('cell responses', 0.2582350372170693)

Topic 1:
('influenza', 1.214241589713104)
('pandemic', 0.5318965271076596)
('influenza virus', 0.4481852068700742)
('1918', 0.3159713623030764)
('viruses', 0.2841014218641761)
('h1n1', 0.2723991425406249)
('influenza viruses', 0.23314547568058977)
('pandemics', 0.19383802447373089)
('2009', 0.1927449673321233)
('h5n1', 0.183262326095296)

Topic 2:
('requisition', 0.8513409657852624)
('purchase', 0.8513409657852624)
('purchase requisition', 0.8513409657852624)
('sap purchase', 0.8513409657852624)
('sap', 0.8405780848570145)
('nipah', 0.0017119218525569453)
('nipah virus', 0.0017064637495498346)
('ebola', 0.0012441419883176953)
('hamsters', 0.0011947202788