# Information Retrieval - Pandemic Investigation  
This notebook retrieves abstracts relevant to pandemics and then uses topic modeling to analyze the chosen abstracts.  Three info retrieval techniques are used: Literal Term Matching, TF-IDF, and Latent Semantic Indexing.  These are linear algebra techniques.  
We use the Scikit-Learn library.

In [9]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances


#from gensim.models.coherencemodel import CoherenceModel

import TextCleaning

In [10]:
# pull in raw abstracts

raw_df=pd.read_csv('../../data/original/raw_abstracts.csv',engine='python')

# remove null abstracts and duplicates

df = TextCleaning.remove_nulls(raw_df, "ABSTRACT")
df = TextCleaning.remove_duplicates(df)

df.reset_index(inplace = True)
df.rename(columns={'index':'original index'}, inplace=True)

3 nulls in  ABSTRACT . These rows removed.
11 duplicate abstracts removed
0 project ID duplicates - not removed


In [11]:
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,FIRST_CHAR,LAST_CHAR,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_TITLE,PROJECT_TERMS,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,ORGANIZATION_NAME,CFDA_CODE,FY_TOTAL_COST
0,0,89996,"This is a project to explore Game-based, Metap...",2008,"This is a project to explore Game-based, Metap...",.,NSF,NSF,,814512,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,Achievement; analog; base; Cognitive Science; ...,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",WHEELING JESUIT UNIVERSITY,47.076,1999467.0
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Institution: Franklin Institute Science Museum...,.,NSF,NSF,,741659,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,Active Learning; Child; Computer software; des...,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",FRANKLIN INSTITUTE,47.076,1799699.0
2,2,89998,Through programs (including small group conver...,2008,Through programs (including small group conver...,.,NSF,NSF,,813522,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,Address; Age; Birth; Brain; Caregivers; Child;...,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",SCIENCE MUSEUM OF MINNESOTA,47.076,1505858.0
3,3,89999,In partnership with the American Chemical Soci...,2008,In partnership with the American Chemical Soci...,.,NSF,NSF,,838627,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,Advanced Development; American; Chemicals; Che...,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,47.049,51000.0
4,4,90000,Amphibian populations around the world are exp...,2008,Amphibian populations around the world are exp...,.,NSF,NSF,,815315,COLLABORATIVE RESEARCH: EVOLUTION OF AMPHIBIAN...,Amphibia; Central America; Communicable Diseas...,"ZAMUDIO, KELLY R",,CORNELL UNIVERSITY ITHACA,47.074,370996.0


In [12]:
# input needed for doc-term matrix creation in Scikit-Learn is one string per document (not a list of strings).  
# Original data is already in this form!

docs = df["ABSTRACT"] 

## Functions needed for all info retrieval approaches

In [13]:
# Create query vector 

def create_query(words, terms):
    
    # words: search query words
    # terms: terms in corpus
    
    q = np.zeros(len(terms))  # number of terms

    idx = []
    for word in query_words:
        idx.append(terms.index(word))

    q[idx] = 1
    
    return q

In [14]:
def return_top_abstracts(docs, scores, top_n):
    
    '''
    docs: Series that contains abstract
    scores: scores of abstracts
    top_n: return the top_n abstracts given by idx
    '''
    # sort scores in descending order
    scores_sorted_idx = np.argsort(scores)[::-1]
    
    ix = scores_sorted_idx[:top_n]
    print(ix[0:10])
    
    return ix, docs[ix]
    

In [15]:
def create_result_df(abstracts, scores):
    
    df = pd.DataFrame()
    df["abstracts"] = abstracts
    df["scores"] = scores
    
    return df

## Literal Term Matching - Frequency Count Document-Term Matrix

This will return all abstracts in the corpus with exact word matches to the query.  

Results will be return in sorted order of how high the query scores with each abstract. A high score means more occurences of the query words in the abstract.

In [16]:
# Create document-term matrix based on count frequencies

#Countvectroizer -> document term matrix, all terms from corpus, add ngrams in this function, contact space tracing?
vectorizer = CountVectorizer()
doc_term_matrix = vectorizer.fit_transform(docs)

In [17]:
terms = vectorizer.get_feature_names()

### Create Query Words - list the search terms

A query is just a list of words to search for in the corpus.  We will use the same query for all three info retrieval techniques.

In [18]:
# CHANGE QUERY WORDS HERE

query_words = ["quarantine"] # other example: ['pandemic', influenza', 'mers', 'sars', 'zikv', 'denv', 'hiv', 'aids']

q = create_query(query_words, terms)

In [19]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

f_scores = doc_term_matrix.dot(q)

In [20]:
sum(f_scores >0)  # how many abstracts include at least one of the query words

342

In [21]:
# sort scores in descending order

f_scores_sorted = np.sort(f_scores)[::-1]
f_scores_sorted[:10]

array([9., 9., 8., 8., 6., 6., 5., 5., 5., 5.])

In [22]:
f_idx, f_top_abstracts = return_top_abstracts(docs, f_scores, 200)  # CHANGE NUMBER OF TOP DOCS RETURNED

[301209 435776 141020 234110  88600 219473 354509 141156 363604 246770]


In [23]:
f_top_abstracts

301209    Objective(s): The objectives of this project a...
435776    Objective(s): The long-term goals of our resea...
141020    Cattle fever ticks carry two different pathoge...
234110    Objective(s): The long term goals of our resea...
88600     Objective(s): The long-term objective of this ...
                                ...                        
87097     Papaya is a major fruit commodity in Hawaii wi...
232155    This research problem area will develop method...
88624     Objective(s): The primary goals for this proje...
433739    Non-techncal SummaryThe plant pathogenic bacte...
47082     DESCRIPTION (provided by applicant): The Prima...
Name: ABSTRACT, Length: 200, dtype: object

In [24]:
f_top_abstracts.iloc[0]

'Objective(s): The objectives of this project are to 1) Characterize unknown and poorly described pathogens (primarily viruses and viroids) and diseases highly significant to the USDA plant germplasm quarantine program; 2) Develop and transfer sensitive, reliable, and cost-effective methods to clientele for the rapid detection of virus and virus-like pathogens of quarantine significance; and 3) Develop and transfer methods to clientele for the therapeutic elimination of virus and virus-like pathogens from infected plant genetic resources. The objectives focus on characterizing quarantine pathogens and determining the causal agents responsible for diseases that could threaten U.S. agriculture and ecosystems, and developing tools to effectively detect and eliminate them. The research areas are developed in consultation with USDA-APHIS and the data, protocols, and tools developed by this project are shared with them. Because problems that require immediate investigation can arise rapidly 

In [25]:
f_df = create_result_df(docs, f_scores)

In [26]:
f_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",0.0
1,Institution: Franklin Institute Science Museum...,0.0
2,Through programs (including small group conver...,0.0
3,In partnership with the American Chemical Soci...,0.0
4,Amphibian populations around the world are exp...,0.0
...,...,...
550069,The Title IV-E Prevention Services Clearinghou...,0.0
550070,This mixed-methods study seeks to deepen our u...,0.0
550071,The purpose of this project is to examine the ...,0.0
550072,The 2014 Child Care and Development Block Gran...,0.0


## TF-IDF Document-Term Matrix

This approach is similar to Literal Term Matching using frequency counts in the document-term matrix.  However, instead of using frequency counts, the entries of the document-term matrix are weighted using TF-IDF.

In [27]:
# Find doc-term matrix using TF-IDF weighting
#Added this line
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer()
tf_idf = tf_idf_vectorizer.fit_transform(docs)

In [28]:
tf_idf_terms = tf_idf_vectorizer.get_feature_names()  # these terms are the same as the terms created from the 
                                                      # frequency count document-term matrix, so we do not need to
                                                      # recreate the query vector.

In [29]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

tf_idf_scores = tf_idf.dot(q)

In [30]:
sum(tf_idf_scores >0)   # how many abstracts include at least one of the query words

342

In [31]:
# sort scores in descending order

tf_idf_scores_sorted = np.sort(tf_idf_scores)[::-1]
tf_idf_scores_sorted[:10]

array([0.62632618, 0.42638331, 0.41796638, 0.37407251, 0.29760238,
       0.28751123, 0.28657967, 0.27264223, 0.26045013, 0.26017868])

In [32]:
tfidf_idx, tfidf_top_abstracts = return_top_abstracts(docs, tf_idf_scores, 200)  # CHANGE NUMBER OF TOP DOCS RETURNED

[301209 219473 234110 435776 188709  88600 348024 246770 365303 219445]


In [33]:
tfidf_top_abstracts

301209    Objective(s): The objectives of this project a...
219473    Precise metabolic phenotyping depends signific...
234110    Objective(s): The long term goals of our resea...
435776    Objective(s): The long-term goals of our resea...
188709    Objective(s): To develop and protect export ma...
                                ...                        
89749     The Pacific Northwest (PNW) represents a growi...
188305    The nursery and greenhouse industries are impo...
366911    By combining forces and unique institutional r...
87097     Papaya is a major fruit commodity in Hawaii wi...
69180     The Aerosol Biology/Small Animal Models Core, ...
Name: ABSTRACT, Length: 200, dtype: object

In [34]:
tf_idf_df = create_result_df(docs, tf_idf_scores)

In [35]:
tf_idf_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",0.0
1,Institution: Franklin Institute Science Museum...,0.0
2,Through programs (including small group conver...,0.0
3,In partnership with the American Chemical Soci...,0.0
4,Amphibian populations around the world are exp...,0.0
...,...,...
550069,The Title IV-E Prevention Services Clearinghou...,0.0
550070,This mixed-methods study seeks to deepen our u...,0.0
550071,The purpose of this project is to examine the ...,0.0
550072,The 2014 Child Care and Development Block Gran...,0.0


## Latent Semantic Indexing (LSI) Approach

LSI Uses the TF-IDF matrix.  LSI is a tecnique that utilizes a truncated Singular Value Decomposition of the document-term matrix.  Basically, LSI still returns relevant documents to the query; however some of the documents returned may not include the exact search terms!  LSI is finding the latent or hidden relationships in the terms.  

In [36]:
# Find the Truncated SVD of the TF-IDF matrix

lsa = TruncatedSVD(n_components=500, random_state=1)  # CHANGE THE NUMBER OF COMPONENTS - NOTE: MORE COMPONENTS 
                                                      # GIVES YOU A MORE ACCURATE APPROXIMATION OF THE DOC-TERM 
                                                      # MATRIX, BUT IS ALSO MORE EXPENSIVE AND MAY NOT LEAD TO THE 
                                                      # BEST INFO RETRIEVAL RESULTS.
USigma = lsa.fit_transform(tf_idf)
Vtrans = lsa.components_

In [37]:
# transform query to be in same space as documents

q = q.reshape(1,-1)
qhat = lsa.transform(q)

In [38]:
print(qhat.shape)
print(USigma.shape)
print(Vtrans.shape)

(1, 500)
(550074, 500)
(500, 1058314)


In [39]:
lsa_scores = pairwise_distances(qhat, USigma, metric='cosine', n_jobs=7)  # CHANGE N_JOBS TO BE NUMBER OF CORES - 1

In [40]:
lsa_scores.shape

(1, 550074)

In [41]:
lsa_scores

array([[0.95924589, 1.01019012, 1.0199324 , ..., 1.00819649, 0.93320879,
        0.97026889]])

In [42]:
type(lsa_scores)

numpy.ndarray

In [43]:
sum(lsa_scores[0] > 0)  # how many abstracts scored above 0

550074

In [44]:
lsa_scores[0]

array([0.95924589, 1.01019012, 1.0199324 , ..., 1.00819649, 0.93320879,
       0.97026889])

In [45]:
# sort scores in descending order

lsa_scores_sorted = np.sort(lsa_scores[0])[::-1]
lsa_scores_sorted[:10]

array([1.11921575, 1.1156047 , 1.11275628, 1.10926416, 1.10890845,
       1.10874732, 1.10840747, 1.1083724 , 1.10788953, 1.1072968 ])

In [46]:
lsa_idx, lsa_top_abstracts = return_top_abstracts(docs, lsa_scores[0], 200)  # CHANGE NUMBER OF TOP DOCS RETURNED

[ 10072 101433 259968 397977 434949  89237 204838 107078 190625 531455]


In [47]:
lsa_top_abstracts

10072     Denitrification is a critical, natural environ...
101433    Marine sponges are hosts to complex microbial ...
259968    Microbes regulate many important ecological pr...
397977    Weathering of exposed coal strata releases pot...
434949    Kentucky farmers want information on how they ...
                                ...                        
541344    Bioreactors can remove contaminants from waste...
423464    SATHE RESEARCH PROJECT IS AN EFFORT TOWARDS AC...
501399    Project SummaryRecent research in evolutionary...
153686    Nocturnal nitrogen oxide chemistry plays a cri...
220508    Carbon monoxide (CO) has been regarded as an e...
Name: ABSTRACT, Length: 200, dtype: object

In [48]:
lsa_df = create_result_df(docs, lsa_scores[0])

In [49]:
lsa_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",0.959246
1,Institution: Franklin Institute Science Museum...,1.010190
2,Through programs (including small group conver...,1.019932
3,In partnership with the American Chemical Soci...,0.967452
4,Amphibian populations around the world are exp...,0.959087
...,...,...
550069,The Title IV-E Prevention Services Clearinghou...,0.968893
550070,This mixed-methods study seeks to deepen our u...,0.982074
550071,The purpose of this project is to examine the ...,1.008196
550072,The 2014 Child Care and Development Block Gran...,0.933209


## Create pandemics corpus

We use the results of our three information retrieval techniques to create a new, smaller corpus that only contains abstracts relevant to the query. 

In [50]:
docs_ix = np.concatenate([f_idx, tfidf_idx, lsa_idx])

In [51]:
docs_idx = np.unique(docs_ix)

In [52]:
docs_idx.shape

(467,)

In [53]:
lim_docs = [docs[i] for i in docs_idx]

In [54]:
# print out the abstracts of the pandemics corpus -- long text output!

lim_docs

['Using InSAR Measurements to Study Salt Tectonics in Colorado and Crustal Deformation near Jakobshavn Isbrae, Greenland  (Graduate Student: Lin Liu) In a preliminary InSAR study using ERS-1I2 SAR data, I have found several crustal deformation signals rela',
 'While studying the nitrogen-fixing endophytes within poplar trees, the profound discovery was made that some of the isolates were yeast, a eukaryotic organism. Since it is believed that only prokaryotic organisms can utilize nitrogen from the air, this was an exciting discovery. In this small grant for exploratory research, the PIs will identify the eukaryotic genes from these nitrogen-fixing yeasts that allow growth on nitrogen-free medium, and attempt to isolate mutants that no longer grow in nitrogen-free medium.  The nif gene cluster will be analyzed and compared with known diazotrophic bacteria and they will determine whether or not the genes have eukaryotic transcription signals. Identification of the known genes required f

## Topic Modeling with relevant pandemic abstracts

In [55]:
# Create a TF-IDF document-term matrix for the pandemics corpus 

# TRY DIFFERENT PARAMETERS IN THE TF-IDF DOC-TERM MATRIX SET-UP
nmf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2) , #tfdi vector see for contact, run 1-2, bigrams - 2 word phrases occured right after the other as one token, if we had n_gram range from1-3, unigrams - 1 word token, bigrams and trigrams
                                 max_df=0.4, min_df=3, lowercase=True) #, max_features=int(len(lim_docs)/2))

nmf_tf_idf = nmf_vectorizer.fit_transform(lim_docs)

In [56]:
# topic modeling with NMF

nmf_model = NMF(n_components=10, random_state=1)  # TRY DIFFERENT NUMBERS OF TOPICS
W = nmf_model.fit_transform(nmf_tf_idf)
H = nmf_model.components_

In [57]:
# TRY TOPIC MODELING WITH LDA



In [58]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
        

In [59]:
print_topics(nmf_model, nmf_vectorizer, 10)


Topic 0:
('animal', 0.9705403215016399)
('facility', 0.7245211721994228)
('animals', 0.4345173916589957)
('housing', 0.42097741459505383)
('space', 0.4164522827839455)
('care', 0.3994752121676666)
('vivarium', 0.3040426161610711)
('rodent', 0.30317111917855216)
('services', 0.2937801822322827)
('investigators', 0.27473471771648683)

Topic 1:
('nitrogen', 1.137365367247699)
('denitrification', 0.19398821187846707)
('microbial', 0.17817105512508513)
('organic', 0.16968670314691)
('nutrient', 0.16816458569131018)
('fixation', 0.16289861650835255)
('nitrogen fixation', 0.14350017268679907)
('coastal', 0.12232635873103818)
('soil', 0.12102008494376523)
('cycling', 0.11836279975952392)

Topic 2:
('species', 0.480581776465118)
('biological control', 0.3487085559907562)
('invasive', 0.348537278510549)
('control', 0.32894108297914787)
('biological', 0.3161304351575737)
('pests', 0.25258190161233496)
('plant', 0.2111816389492534)
('native', 0.20972836399396602)
('natural', 0.20334670522077522)
