# Information Retrieval - Pandemic Investigation  
This notebook retrieves abstracts relevant to pandemics and then uses topic modeling to analyze the chosen abstracts.  Three info retrieval techniques are used: Literal Term Matching, TF-IDF, and Latent Semantic Indexing.  These are linear algebra techniques.  
We use the Scikit-Learn library.

In [4]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances


#from gensim.models.coherencemodel import CoherenceModel

import TextCleaning

In [5]:
# pull in clean abstracts

with open('../../data/final/processed_dataset_stanford_lemma.pkl', 'rb') as f:
    clean_df = pickle.load(f)

# remove null abstracts and duplicates

df = TextCleaning.remove_nulls(clean_df, "ABSTRACT")
df = TextCleaning.remove_duplicates(df)

df.reset_index(inplace = True)
df.rename(columns={'index':'original index'}, inplace=True)

0 nulls in  ABSTRACT . These rows removed.
0 duplicate abstracts removed
0 project ID duplicates - not removed


In [6]:
df.head()

Unnamed: 0,original index,original index.1,PROJECT_ID,ABSTRACT,FY,FIRST_CHAR,LAST_CHAR,DEPARTMENT,AGENCY,IC_CENTER,...,CFDA_CODE,FY_TOTAL_COST,working_abstract,nchar,Start Char,Field Count,lemma_docs_with_stop,tokened_docs_nostop,tns_bi_tri_docs,final_tokens
0,0,0,89996,"This is a project to explore Game-based, Metap...",2008,"This is a project to explore Game-based, Metap...",.,NSF,NSF,,...,47.076,1999467.0,"This is a project to explore Game-based, Metap...",2057,T,0,"[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan...","[project, explore, game, base, metaphor, enhan..."
1,1,1,89997,Institution: Franklin Institute Science Museum...,2008,Institution: Franklin Institute Science Museum...,.,NSF,NSF,,...,47.076,1799699.0,"Institution: Science Museum PI: Snyder, Steve...",1913,I,1,"[institution, science, Museum, pi, Snyder, Ste...","[institution, science, museum, pi, steve, proj...","[institution, science, museum, pi, steve, proj...","[institution, science, museum, pi, steve, proj..."
2,2,2,89998,Through programs (including small group conver...,2008,Through programs (including small group conver...,.,NSF,NSF,,...,47.076,1505858.0,Through programs (including small group conver...,1154,T,0,"[program, include, small, group, conversation,...","[program, small, group, conversation, citizen,...","[program, small, group, conversation, citizen,...","[program, small, group, conversation, citizen,..."
3,3,3,89999,In partnership with the American Chemical Soci...,2008,In partnership with the American Chemical Soci...,.,NSF,NSF,,...,47.049,51000.0,In partnership with the American Chemical Soci...,826,I,0,"[partnership, American, Chemical, Society, ACS...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs...","[partnership, american, chemical, society, acs..."
4,4,4,90000,Amphibian populations around the world are exp...,2008,Amphibian populations around the world are exp...,.,NSF,NSF,,...,47.074,370996.0,Amphibian populations around the world are exp...,1322,A,1,"[amphibian, population, world, experience, dec...","[amphibian, population, world, experience, dec...","[amphibian, population, world, experience, dec...","[amphibian, population, world, experience, dec..."


In [7]:
# input needed for doc-term matrix creation in Scikit-Learn is one string per document (not a list of strings).  
# Original data is already in this form!

docs = df["ABSTRACT"] 

## Functions needed for all info retrieval approaches

In [8]:
# Create query vector 

def create_query(words, terms):
    
    # words: search query words
    # terms: terms in corpus
    
    q = np.zeros(len(terms))  # number of terms

    idx = []
    for word in query_words:
        idx.append(terms.index(word))

    q[idx] = 1
    
    return q

In [9]:
def return_top_abstracts(docs, scores, top_n):
    
    '''
    docs: Series that contains abstract
    scores: scores of abstracts
    top_n: return the top_n abstracts given by idx
    '''
    # sort scores in descending order
    scores_sorted_idx = np.argsort(scores)[::-1]
    
    ix = scores_sorted_idx[:top_n]
    print(ix[0:10])
    
    return ix, docs[ix]
    

In [10]:
def create_result_df(abstracts, scores):
    
    df = pd.DataFrame()
    df["abstracts"] = abstracts
    df["scores"] = scores
    
    return df

## Literal Term Matching - Frequency Count Document-Term Matrix

This will return all abstracts in the corpus with exact word matches to the query.  

Results will be return in sorted order of how high the query scores with each abstract. A high score means more occurences of the query words in the abstract.

In [11]:
# Create document-term matrix based on count frequencies

#Countvectroizer -> document term matrix, all terms from corpus, add ngrams in this function, contact space tracing?
vectorizer = CountVectorizer()
doc_term_matrix = vectorizer.fit_transform(docs)

In [12]:
terms = vectorizer.get_feature_names()

### Create Query Words - list the search terms

A query is just a list of words to search for in the corpus.  We will use the same query for all three info retrieval techniques.

In [55]:
# CHANGE QUERY WORDS HERE

query_words = ["asymptomatic"] # other example: ['pandemic', influenza', 'mers', 'sars', 'zikv', 'denv', 'hiv', 'aids']

q = create_query(query_words, terms)

In [56]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

f_scores = doc_term_matrix.dot(q)

In [57]:
sum(f_scores >0)  # how many abstracts include at least one of the query words

2131

In [58]:
# sort scores in descending order

f_scores_sorted = np.sort(f_scores)[::-1]
f_scores_sorted[:10]

array([13., 13., 12., 11.,  9.,  9.,  9.,  8.,  8.,  7.])

In [59]:
f_idx, f_top_abstracts = return_top_abstracts(docs, f_scores, 200)  # CHANGE NUMBER OF TOP DOCS RETURNED

[227737 282923 319090 343012  92495 237216 532234 320130 249791  36143]


In [60]:
f_top_abstracts

227737    DESCRIPTION (provided by applicant):     Carot...
282923    DESCRIPTION (provided by applicant): Herpes si...
319090    DESCRIPTION (provided by applicant): Herpes si...
343012    DESCRIPTION (provided by applicant):     Carot...
92495     We examined the association of the onset of de...
                                ...                        
366600    DESCRIPTION (provided by applicant): Atrial Fi...
526959    Carcinoid tumors are rare and cause either no ...
431629    Summary of work: As part of our program of res...
52203     DESCRIPTION (provided by applicant): Chronic l...
302371    Despite increasingly stark evidence that infec...
Name: ABSTRACT, Length: 200, dtype: object

In [61]:
f_top_abstracts.iloc[0]

'DESCRIPTION (provided by applicant):     Carotid artery stenosis is a well-known cause of atheroembolic stroke. Stroke prevention in these patients has been the focus of intense investigation. Cognitive impairment is a more insidious but poorly understood outcome in patients with  asymptomatic  carotid stenosis who have not suffered a stroke. Cognitive function describes how a person produces and controls mental processes such as thinking, learning, and problem solving. It is an important outcome measure that affects patient well-being and their ability to live independent productive lies. It is well-known that cognitive impairment coexists in patients with stroke from carotid stenosis. However, isolated cognitive deficits in patients with asymptomatic carotid stenosis have not been looked for, and have therefore not been reported in any detail.  Asymptomatic carotid stenosis affects 2-12% of people. With 23.4 million veterans in the country, at least 1 million (4.3%) have asymptomati

In [62]:
f_df = create_result_df(docs, f_scores)

In [63]:
f_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",0.0
1,Institution: Franklin Institute Science Museum...,0.0
2,Through programs (including small group conver...,0.0
3,In partnership with the American Chemical Soci...,0.0
4,Amphibian populations around the world are exp...,0.0
...,...,...
543428,The Title IV-E Prevention Services Clearinghou...,0.0
543429,This mixed-methods study seeks to deepen our u...,0.0
543430,The purpose of this project is to examine the ...,0.0
543431,The 2014 Child Care and Development Block Gran...,0.0


## TF-IDF Document-Term Matrix

This approach is similar to Literal Term Matching using frequency counts in the document-term matrix.  However, instead of using frequency counts, the entries of the document-term matrix are weighted using TF-IDF.

In [64]:
# Find doc-term matrix using TF-IDF weighting
#Added this line
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer()
tf_idf = tf_idf_vectorizer.fit_transform(docs)

In [65]:
tf_idf_terms = tf_idf_vectorizer.get_feature_names()  # these terms are the same as the terms created from the 
                                                      # frequency count document-term matrix, so we do not need to
                                                      # recreate the query vector.

In [66]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

tf_idf_scores = tf_idf.dot(q)

In [67]:
sum(tf_idf_scores >0)   # how many abstracts include at least one of the query words

2131

In [68]:
# sort scores in descending order

tf_idf_scores_sorted = np.sort(tf_idf_scores)[::-1]
tf_idf_scores_sorted[:10]

array([0.49452739, 0.37273747, 0.36501776, 0.32871419, 0.31243357,
       0.31172143, 0.30619101, 0.29966506, 0.27971157, 0.27062352])

In [69]:
tfidf_idx, tfidf_top_abstracts = return_top_abstracts(docs, tf_idf_scores, 200)  # CHANGE NUMBER OF TOP DOCS RETURNED

[425162 319090 282923 494859  93736 190385 532234 161523 474347 428271]


In [70]:
tfidf_top_abstracts

425162    This research will develop a mathematical mode...
319090    DESCRIPTION (provided by applicant): Herpes si...
282923    DESCRIPTION (provided by applicant): Herpes si...
494859    PROJECT SUMMARYBoth the innate and acquired im...
93736     In 2002 we reported the occurence of avascular...
                                ...                        
412839    ﻿   DESCRIPTION (provided by applicant): Comme...
335116    TB diagnosis in India relies on clinical sympt...
36089     DESCRIPTION (provided by applicant): N. gonorr...
243491    DESCRIPTION (provided by applicant): TB diagno...
494738    ABSTRACTFrontotemporal dementia represents a g...
Name: ABSTRACT, Length: 200, dtype: object

In [71]:
tf_idf_df = create_result_df(docs, tf_idf_scores)

In [72]:
tf_idf_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",0.0
1,Institution: Franklin Institute Science Museum...,0.0
2,Through programs (including small group conver...,0.0
3,In partnership with the American Chemical Soci...,0.0
4,Amphibian populations around the world are exp...,0.0
...,...,...
543428,The Title IV-E Prevention Services Clearinghou...,0.0
543429,This mixed-methods study seeks to deepen our u...,0.0
543430,The purpose of this project is to examine the ...,0.0
543431,The 2014 Child Care and Development Block Gran...,0.0


## Latent Semantic Indexing (LSI) Approach

LSI Uses the TF-IDF matrix.  LSI is a tecnique that utilizes a truncated Singular Value Decomposition of the document-term matrix.  Basically, LSI still returns relevant documents to the query; however some of the documents returned may not include the exact search terms!  LSI is finding the latent or hidden relationships in the terms.  

In [73]:
# Find the Truncated SVD of the TF-IDF matrix

lsa = TruncatedSVD(n_components=500, random_state=1)  # CHANGE THE NUMBER OF COMPONENTS - NOTE: MORE COMPONENTS 
                                                      # GIVES YOU A MORE ACCURATE APPROXIMATION OF THE DOC-TERM 
                                                      # MATRIX, BUT IS ALSO MORE EXPENSIVE AND MAY NOT LEAD TO THE 
                                                      # BEST INFO RETRIEVAL RESULTS.
USigma = lsa.fit_transform(tf_idf)
Vtrans = lsa.components_

In [74]:
# transform query to be in same space as documents

q = q.reshape(1,-1)
qhat = lsa.transform(q)

In [75]:
print(qhat.shape)
print(USigma.shape)
print(Vtrans.shape)

(1, 500)
(543433, 500)
(500, 1057023)


In [76]:
lsa_scores = pairwise_distances(qhat, USigma, metric='cosine', n_jobs=7)  # CHANGE N_JOBS TO BE NUMBER OF CORES - 1

In [77]:
lsa_scores.shape

(1, 543433)

In [78]:
lsa_scores

array([[1.01167941, 1.02018858, 0.9802436 , ..., 1.00325031, 1.00938467,
        1.00192321]])

In [79]:
type(lsa_scores)

numpy.ndarray

In [80]:
sum(lsa_scores[0] > 0)  # how many abstracts scored above 0

543433

In [81]:
lsa_scores[0]

array([1.01167941, 1.02018858, 0.9802436 , ..., 1.00325031, 1.00938467,
       1.00192321])

In [82]:
# sort scores in descending order

lsa_scores_sorted = np.sort(lsa_scores[0])[::-1]
lsa_scores_sorted[:10]

array([1.09769953, 1.09258481, 1.09204986, 1.09068945, 1.09067699,
       1.08892375, 1.08882782, 1.08863311, 1.08851272, 1.08798125])

In [83]:
lsa_idx, lsa_top_abstracts = return_top_abstracts(docs, lsa_scores[0], 200)  # CHANGE NUMBER OF TOP DOCS RETURNED

[140198 513049  18200 291811  54608 431774 488246 451014 114228 426135]


In [84]:
lsa_top_abstracts

140198    This project is designed to facilitate the reg...
513049    The services that the Biophysics Core Facility...
18200     DESCRIPTION (provided by applicant): Neighborh...
291811    This project will improve soybean adapted to M...
54608     DESCRIPTION (provided by applicant): There is ...
                                ...                        
438242    ﻿   DESCRIPTION (provided by applicant): Princ...
361242    ABSTRACT The prevalence of food allergy is ris...
29661     DESCRIPTION (provided by applicant): Apolipopr...
160882    DESCRIPTION (provided by applicant): We hypoth...
296769    BACKGROUND & SCOPE The NIH Blueprint Neurother...
Name: ABSTRACT, Length: 200, dtype: object

In [85]:
lsa_df = create_result_df(docs, lsa_scores[0])

In [86]:
lsa_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",1.011679
1,Institution: Franklin Institute Science Museum...,1.020189
2,Through programs (including small group conver...,0.980244
3,In partnership with the American Chemical Soci...,1.016725
4,Amphibian populations around the world are exp...,0.915968
...,...,...
543428,The Title IV-E Prevention Services Clearinghou...,0.969645
543429,This mixed-methods study seeks to deepen our u...,1.039167
543430,The purpose of this project is to examine the ...,1.003250
543431,The 2014 Child Care and Development Block Gran...,1.009385


## Create pandemics corpus

We use the results of our three information retrieval techniques to create a new, smaller corpus that only contains abstracts relevant to the query. 

In [87]:
docs_ix = np.concatenate([f_idx, tfidf_idx, lsa_idx])

In [88]:
docs_idx = np.unique(docs_ix)

In [89]:
docs_idx.shape

(477,)

In [90]:
lim_docs = [docs[i] for i in docs_idx]

In [91]:
# print out the abstracts of the pandemics corpus -- long text output!

lim_docs

['Rejuvenating Injured Tissue for Enhanced Wound Healing: Assisting Trauma Victims Such as Astronauts Lacking Access to Emergency Medical Care  Summary: Autonomous medicine for trauma injuries is an obvious and necessary strategy to have sustained human pe',
 "DESCRIPTION (provided by applicant): This K22 Award project is designed to support the PI's transition to becoming an independent researcher at an academic institution. The PI has prepared for this research through studies in molecular biology biochemistry, microbiology, and immunology. His most recent work has shown the strong effects of soluble, multimeric CD40L- and GITRL-based molecular adjuvants on HIV DNA vaccines in mice. Using support from this K22 award, the PI would move these molecular adjuvants into adenoviral vaccines against HIV, in order to make these promising vaccines even stronger. To this end, four Specific Aims are proposed: (1) To  prepare adenoviral vectors containing HIV Gag along with novel soluble multime

## Topic Modeling with relevant pandemic abstracts

In [92]:
# Create a TF-IDF document-term matrix for the pandemics corpus 

# TRY DIFFERENT PARAMETERS IN THE TF-IDF DOC-TERM MATRIX SET-UP
nmf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2) , #tfdi vector see for contact, run 1-2, bigrams - 2 word phrases occured right after the other as one token, if we had n_gram range from1-3, unigrams - 1 word token, bigrams and trigrams
                                 max_df=0.4, min_df=3, lowercase=True) #, max_features=int(len(lim_docs)/2))

nmf_tf_idf = nmf_vectorizer.fit_transform(lim_docs)

In [93]:
# topic modeling with NMF

nmf_model = NMF(n_components=10, random_state=1)  # TRY DIFFERENT NUMBERS OF TOPICS
W = nmf_model.fit_transform(nmf_tf_idf)
H = nmf_model.components_

In [94]:
# TRY TOPIC MODELING WITH LDA



In [95]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
        

In [96]:
print_topics(nmf_model, nmf_vectorizer, 10)


Topic 0:
('infection', 0.5035075215052345)
('patients', 0.4584805340986948)
('immune', 0.2910490631265131)
('cells', 0.28900237280864405)
('risk', 0.28011306754308024)
('individuals', 0.2754118295781919)
('host', 0.25573984595436705)
('cell', 0.24592551944450705)
('genetic', 0.2440871120862163)
('aim', 0.24372964525444266)

Topic 1:
('hiv', 0.7387381928882276)
('resistance', 0.5499118781797891)
('drug', 0.4693049688266241)
('protease', 0.4015912903481949)
('inhibitors', 0.3787844272996292)
('drug resistance', 0.3396117528713032)
('hiv protease', 0.26359340460067515)
('antiviral', 0.2192466829186268)
('drugs', 0.20156621864820365)
('viral', 0.18249086136641388)

Topic 2:
('malaria', 1.1896310167427968)
('transmission', 0.41635978343345703)
('parasite', 0.2904224377801359)
('malaria transmission', 0.2214918336117923)
('vector', 0.17186099670489155)
('infection', 0.16631524774732814)
('peru', 0.16023846989391907)
('vivax', 0.13576832936370567)
('sites', 0.1348841030268517)
('infections',