# Information Retrieval - For RShiny Dashboard  
This notebook retrieves abstracts relevant to pandemics and then uses topic modeling to analyze the chosen abstracts.  Three info retrieval techniques are used: Literal Term Matching, TF-IDF, and Latent Semantic Indexing.  These are linear algebra techniques.  
We use the Scikit-Learn library.

In [2]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [5]:
# pull in data

df = pd.read_pickle("/home/kjl5t/dspg20RnD/data/final/final_dataset_7-20.pkl")

df.reset_index(inplace = True)
#df.rename(columns={'index':'original index'}, inplace=True)

In [6]:
df.head()

Unnamed: 0,index,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,...,working_abstract,Start_Char,nchar,LAST_CHAR,lemma_abstract,clean_lemmas,stopwds_removed,n_grams_added,final_tokens,final_frqwds_removed
0,0,17608,152242,The multiprotein complex y-secretase proteolyt...,2008,Active Sites; Affect; Alzheimer's Disease; Amy...,STRUCTURE OF SIGNAL PEPTIDE PEPTIDASE,HHS,NIH,,...,The multiprotein complex y-secretase proteolyt...,T,1402,g,"[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y_secretase, proteolyt...","[multiprotein, y_secretase, proteolytically_cl..."
1,1,111864,190316,DESCRIPTION (provided by applicant): The Kis...,2008,Affect; Animal Model; Axon; Behavior; Behavior...,ROLE OF KISS1 NEURONS IN THE SEASONAL AND CIRC...,HHS,NIH,,...,The Kissl gene encodes peptides called kisspep...,T,2553,y,"[Kissl, gene, encode, peptide, call, kisspepti...","[kissl, gene, encode, peptide, call, kisspepti...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin..."
2,2,22052,154213,DESCRIPTION (provided by applicant): The objec...,2008,Agreement; Antibodies; base; Binding; Biochemi...,CARBONIC ANHYDRASE AS A MODEL TO UNDERSTAND DI...,HHS,NIH,,...,The objective of this research is to understan...,T,1414,e,"[objective, research, be, understand, biophysi...","[objective, research, be, understand, biophysi...","[objective, research, understand, biophysical,...","[objective, research, understand, biophysical,...","[objective, research, understand, biophysical,...","[biophysical, basis, thermodynamics_kinetic, m..."
3,3,35004,159362,Obesity is the cause of many adverse pregnancy...,2008,African; Analysis of Variance; Asians; Birth; ...,OBESITY ON VAGAL TONE AND HBA1C DURING PREGNANCY,HHS,NIH,,...,Obesity is the cause of many adverse pregnancy...,O,1545,d,"[obesity, cause, many, adverse, pregnancyoutco...","[obesity, cause, many, adverse, pregnancyoutco...","[obesity, cause, adverse, pregnancyoutcome, re...","[obesity, cause, adverse_pregnancyoutcome, res...","[obesity, cause, adverse_pregnancyoutcome, res...","[obesity, adverse_pregnancyoutcome, great, hea..."
4,4,371628,594482,Local potato advisory groups have expressed in...,2010,cost; Health; interest; Manure; Parasitic nema...,PLANT-PARASITIC NEMATODE MANAGEMENT AS A COMPO...,USDA,NIFA,,...,Local potato advisory groups have expressed in...,L,271,s,"[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, express, interest, m..."


In [7]:
# input needed for doc-term matrix creation in Scikit-Learn is one string per document (not a list of strings).  
# Original data is already in this form!

docs = df["ABSTRACT"] 
tokens = df["final_frqwds_removed"]

## Functions needed for all info retrieval approaches

In [8]:
# Create query vector 

def create_query(words, terms):
    
    # words: search query words
    # terms: terms in corpus
    
    q = np.zeros(len(terms))  # number of terms

    idx = []
    for word in query_words:
        idx.append(terms.index(word))

    q[idx] = 1
    
    return q

In [9]:
def return_top_abstracts(docs, scores, top_n):
    
    '''
    docs: Series that contains abstract
    scores: scores of abstracts
    top_n: return the top_n abstracts given by idx, if top_n = -1 return all abstracts
    '''
    # sort scores in descending order
    scores_sorted_idx = np.argsort(scores)[::-1]
    
    if top_n == -1:
        n = sum(scores > 0)
        ix = scores_sorted_idx[:n]
    else:
        ix = scores_sorted_idx[:top_n]
    
    print(ix[0:10])
    
    return ix, docs[ix]
    

In [10]:
def create_result_df(abstracts, scores):
    
    df = pd.DataFrame()
    df["abstracts"] = abstracts
    df["scores"] = scores
    
    return df

## Literal Term Matching - Frequency Count Document-Term Matrix

This will return all abstracts in the corpus with exact word matches to the query.  

Results will be return in sorted order of how high the query scores with each abstract. A high score means more occurences of the query words in the abstract.

In [11]:
# Note - we are now using the spaCy stopwords list instead of nltk.  It is more comprehensive.

def create_stopwords():
      
    """ creates list of stopwords. stop words include the general English list and any additional we see sneaking 
    through.  """
    
    spacy_stop_words = STOP_WORDS

    # more stop words that do not add meaning to topics
    additional_stopwords = {'addition', 'specifically', 'similar','including', 'particular', 
                            'furthermore','include', 'includes','overall', 'finally', 'specific', 
                            'additional'} 
           
    sw = spacy_stop_words.union(additional_stopwords)
    
    return sw

In [12]:
# Create document-term matrix based on count frequencies

stop_words = create_stopwords()

vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words, min_df=20)
doc_term_matrix = vectorizer.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [13]:
doc_term_matrix.shape

(690814, 84347)

In [14]:
terms = vectorizer.get_feature_names()

### Create Query Words - list the search terms

A query is just a list of words to search for in the corpus.  We will use the same query for all three info retrieval techniques.

In [26]:
'artificial_intelligence' in terms

False

In [28]:
# CHANGE QUERY WORDS HERE

query_words = ['ai', 'artificial', 'intelligence'] # 'artificial_intelligence' not in terms

# coronavirus, 'mers', 'sars'] # 'pandemic', mers_cov', 'sars_cov']   #'mers', 'sars', 'zikv', 'denv', 'hiv'] #'influenza', 'aids'] 
              
q = create_query(query_words, terms)

In [29]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

f_scores = doc_term_matrix.dot(q)

In [30]:
sum(f_scores >0)  # how many abstracts include at least one of the query words

8669

In [31]:
# sort scores in descending order

f_scores_sorted = np.sort(f_scores)[::-1]
f_scores_sorted[:10]

array([25., 25., 23., 20., 18., 18., 18., 17., 17., 17.])

In [32]:
f_idx, f_top_abstracts = return_top_abstracts(docs, f_scores, 500)  # CHANGE NUMBER OF TOP DOCS RETURNED

[492111 681010 639437 328397 665068 120772 305952 265504  65398  65361]


In [33]:
f_top_abstracts

492111    Background:  Deaths from prostate cancer are t...
681010    Beef Cattle. Successful management of estrus a...
639437    As a result of the powerful innovation and app...
328397    DESCRIPTION (provided by applicant): Two third...
665068    Improving Fluid Intelligence by Training Worki...
                                ...                        
168395    Traumatic events are common occurrences in the...
69964     This STTR Phase I project, Serious Game for En...
273080    DESCRIPTION (provided by applicant): The long-...
632517    ﻿   DESCRIPTION (provided by applicant): This ...
132463    The proposed Center for American Indian and Al...
Name: ABSTRACT, Length: 500, dtype: object

## TF-IDF Document-Term Matrix

This approach is similar to Literal Term Matching using frequency counts in the document-term matrix.  However, instead of using frequency counts, the entries of the document-term matrix are weighted using TF-IDF.

In [34]:
# Find doc-term matrix using TF-IDF weighting

tf_idf_vectorizer = TfidfVectorizer(lowercase=True, stop_words=stop_words, min_df=20)
tf_idf = tf_idf_vectorizer.fit_transform(docs)

In [35]:
tf_idf_terms = tf_idf_vectorizer.get_feature_names()  # these terms are the same as the terms created from the 
                                                      # frequency count document-term matrix, so we do not need to
                                                      # recreate the query vector.

In [36]:
tf_idf_terms == terms

True

In [37]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

tf_idf_scores = tf_idf.dot(q)

In [38]:
sum(tf_idf_scores >0)   # how many abstracts include at least one of the query words

8669

In [39]:
# sort scores in descending order

tf_idf_scores_sorted = np.sort(tf_idf_scores)[::-1]
tf_idf_scores_sorted[:10]

array([1.08297506, 1.05870074, 1.05150226, 1.01949681, 0.93721633,
       0.90397893, 0.88244022, 0.83848679, 0.82627865, 0.80352009])

In [40]:
tfidf_idx, tfidf_top_abstracts = return_top_abstracts(docs, tf_idf_scores, 500)  # CHANGE NUMBER OF TOP DOCS RETURNED

[115149 293012 261070  95701 235101 209895  10300 639437 108705 571378]


In [41]:
tfidf_top_abstracts

115149    This award supports participants to EAAI-11, T...
293012    This grant supports the participation of under...
261070    This award supports participants to EAAI-10: T...
95701     This award supports participants to EAAI-12, t...
235101    This proposal will support US-based Ph.D. stud...
                                ...                        
116320    This subproject is one of many research subpro...
35355     This project supports a Doctoral Student Sympo...
197546    This protocol compares diabetes (DM) and heart...
414149    DESCRIPTION (provided by applicant): Breast ca...
166858    The purpose of this study is to investigate th...
Name: ABSTRACT, Length: 500, dtype: object

## Latent Semantic Indexing (LSI) Approach

LSI Uses the TF-IDF matrix.  LSI is a tecnique that utilizes a truncated Singular Value Decomposition of the document-term matrix.  Basically, LSI still returns relevant documents to the query; however some of the documents returned may not include the exact search terms!  LSI is finding the latent or hidden relationships in the terms.  

In [42]:
# Find the Truncated SVD of the TF-IDF matrix

lsa = TruncatedSVD(n_components=500, random_state=1)  # CHANGE THE NUMBER OF COMPONENTS - NOTE: MORE COMPONENTS 
                                                      # GIVES YOU A MORE ACCURATE APPROXIMATION OF THE DOC-TERM 
                                                      # MATRIX, BUT IS ALSO MORE EXPENSIVE AND MAY NOT LEAD TO THE 
                                                      # BEST INFO RETRIEVAL RESULTS.
USigma = lsa.fit_transform(tf_idf)
Vtrans = lsa.components_

In [43]:
# transform query to be in same space as documents

q = q.reshape(1,-1)
qhat = lsa.transform(q)

In [44]:
print(qhat.shape)
print(USigma.shape)
print(Vtrans.shape)

(1, 500)
(690814, 500)
(500, 84347)


In [45]:
lsa_scores = pairwise_distances(qhat, USigma, metric='cosine', n_jobs=20)  # CHANGE N_JOBS TO BE NUMBER OF CORES - 1

In [46]:
lsa_scores.shape

(1, 690814)

In [47]:
lsa_scores

array([[0.99294071, 0.99103517, 1.00450711, ..., 0.95513276, 0.99677847,
        1.00304953]])

In [48]:
lsa_scores[0]

array([0.99294071, 0.99103517, 1.00450711, ..., 0.95513276, 0.99677847,
       1.00304953])

In [49]:
sum(lsa_scores[0] > 0)  # how many abstracts scored above 0

690814

In [50]:
# sort scores in descending order

lsa_scores_sorted = np.sort(lsa_scores[0])[::-1]
lsa_scores_sorted[:10]

array([1.16680793, 1.16459887, 1.15476659, 1.14918613, 1.14508752,
       1.14445104, 1.14335151, 1.14311037, 1.1421273 , 1.13914895])

In [51]:
lsa_idx, lsa_top_abstracts = return_top_abstracts(docs, lsa_scores[0], 500)  # CHANGE NUMBER OF TOP DOCS RETURNED

[548568  95643 556623 672013 491186 538465 548941 381912  32728 320074]


In [52]:
lsa_top_abstracts

548568    Geology, Paleontology and Paleobiology of the ...
95643     0960160MillerThis award is funded under the Am...
556623    Interpreting the environments and mammal speci...
672013    Majority rule and minority rights are importan...
491186    Arid ecosystems cover one third of Earth's lan...
                                ...                        
243578    At a time when pharmaceutical pipelines of new...
469042    The attitudes citizens hold toward judicial in...
49463     DESCRIPTION (provided by applicant): The aryl ...
316809    WuDMS-0907913     This award is funded under t...
319863    Despite their importance as recyclers, symbion...
Name: ABSTRACT, Length: 500, dtype: object

## Create pandemics corpus

We use the results of our three information retrieval techniques to create a new, smaller corpus that only contains abstracts relevant to the query. 

In [53]:
docs_ix = np.concatenate([f_idx, tfidf_idx, lsa_idx])

In [54]:
docs_idx = np.unique(docs_ix)

In [55]:
docs_idx.shape

(1119,)

In [42]:
#lim_docs = [tokens[i] for i in docs_idx]

**create case-study corpuses**

In [46]:
pandemic_corpus = df.loc[docs_idx, :]

In [47]:
pandemic_corpus.shape

(1137, 40)

In [48]:
#pandemic_corpus.to_pickle("./pandemic_corpus.pkl")

In [49]:
lim_docs = pandemic_corpus["final_frqwds_removed"]

In [146]:
corona_corpus = df.loc[docs_idx, :]

In [147]:
corona_corpus.shape

(1012, 40)

In [155]:
#corona_corpus.to_pickle("./corona_corpus.pkl")

In [148]:
lim_docs = corona_corpus["final_frqwds_removed"]

In [56]:
ai_corpus = df.loc[docs_idx, :]

In [57]:
ai_corpus.shape

(1119, 40)

In [58]:
lim_docs = ai_corpus["final_frqwds_removed"]

In [59]:
# input needed for LDA, NMF (all from Scikit-Learn) is one string per document (not a list of strings)

text = []

for token_list in lim_docs:
    text.append(" ".join(token_list))

In [60]:
len(lim_docs)

1119

## Topic Modeling with relevant pandemic abstracts

In [61]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
        

In [62]:
# Create a TF-IDF document-term matrix for the pandemics corpus 

# TRY DIFFERENT PARAMETERS IN THE TF-IDF DOC-TERM MATRIX SET-UP
nmf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=3, lowercase=True) #, max_features=int(len(lim_docs)/2))

nmf_tf_idf = nmf_vectorizer.fit_transform(text)

In [63]:
nmf_tf_idf.shape

(1119, 5920)

In [64]:
# topic modeling with NMF

nmf_model = NMF(n_components=30, random_state=1)  # TRY DIFFERENT NUMBERS OF TOPICS
W = nmf_model.fit_transform(nmf_tf_idf)
H = nmf_model.components_

In [65]:
print_topics(nmf_model, nmf_vectorizer, 10)


Topic 0:
('youth', 1.6838253012848148)
('ai', 1.4336299491341014)
('suicide', 0.8128930023375601)
('family', 0.8021618616591655)
('prevention', 0.7715689453360194)
('substance', 0.7246083825208314)
('alcohol', 0.6931700088187136)
('adolescent', 0.6325770501296395)
('cultural', 0.5998297956773952)
('urban', 0.5854624232334149)

Topic 1:
('hf', 2.7828159665806838)
('heart', 0.4993770451817277)
('failure', 0.3662671528717993)
('clinical', 0.30580092453060814)
('patient', 0.28028129466952445)
('cardiac', 0.252053281755288)
('hospitalization', 0.1860880936865309)
('diastolic', 0.16847881490510358)
('rock1', 0.14467955307300623)
('dysfunction', 0.14447431595200974)

Topic 2:
('ahr', 1.82228747187345)
('ligand', 0.6015626094549615)
('bind', 0.21981424218395415)
('cell', 0.20505263631896875)
('gene', 0.15463846552836036)
('ahr_dependent', 0.14287212052772189)
('immune', 0.14161366838705303)
('dioxin', 0.1362486652526506)
('activation', 0.13287529482611826)
('treg', 0.1318932421995782)

Topic 

In [238]:
# TRY TOPIC MODELING WITH LDA

# create document-term matrix

lda_vectorizer = CountVectorizer(max_df=1.0, min_df=3, lowercase=True)
lda_dtm = lda_vectorizer.fit_transform(text)

In [239]:
# create model

num_topics = 30
lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=39, random_state = 0)
doc_top_dist = lda_model.fit_transform(lda_dtm)
top_term_dist = lda_model.components_

In [240]:
print_topics(lda_model, lda_vectorizer, 10)


Topic 0:
('protein', 225.55502749264147)
('receptor', 183.95759402950594)
('cell', 172.4504639973329)
('bind', 108.84685928736633)
('structural', 83.23407993256878)
('virus', 81.06300054201091)
('viral', 75.5561411680632)
('interaction', 68.26001146683492)
('human', 67.85457954982031)
('host', 64.82284755171038)

Topic 1:
('subset', 81.29454728313978)
('cd4', 66.23840025065098)
('memory', 62.37910850799191)
('effector', 40.5848650803486)
('viral', 40.11634209137843)
('cell', 36.6079502522241)
('rna', 36.412992443476384)
('orf', 35.0999999999997)
('protection', 33.610323276130394)
('protein', 29.212279854306338)

Topic 2:
('cell', 202.03669046303705)
('death', 163.99491597475762)
('protein', 84.11074125989704)
('human', 76.15368948404043)
('receptor', 61.47717954877514)
('cycle', 58.31210018295787)
('autophagy', 45.205793172633825)
('cellular', 39.855071437383906)
('virus', 39.49494201595987)
('play', 34.583390658500534)

Topic 3:
('virus', 503.9476289736178)
('viral', 325.232153571429