# Information Retrieval - For RShiny Dashboard  
This notebook retrieves abstracts relevant to pandemics and then uses topic modeling to analyze the chosen abstracts.  Three info retrieval techniques are used: Literal Term Matching, TF-IDF, and Latent Semantic Indexing.  These are linear algebra techniques.  
We use the Scikit-Learn library.

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# pull in data

df = pd.read_pickle("../../../data/final/final_dataset_7-20.pkl")

df.reset_index(inplace = True)
#df.rename(columns={'index':'original index'}, inplace=True)

In [3]:
df.head()

Unnamed: 0,index,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,...,working_abstract,Start_Char,nchar,LAST_CHAR,lemma_abstract,clean_lemmas,stopwds_removed,n_grams_added,final_tokens,final_frqwds_removed
0,0,17608,152242,The multiprotein complex y-secretase proteolyt...,2008,Active Sites; Affect; Alzheimer's Disease; Amy...,STRUCTURE OF SIGNAL PEPTIDE PEPTIDASE,HHS,NIH,,...,The multiprotein complex y-secretase proteolyt...,T,1402,g,"[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y_secretase, proteolyt...","[multiprotein, y_secretase, proteolytically_cl..."
1,1,111864,190316,DESCRIPTION (provided by applicant): The Kis...,2008,Affect; Animal Model; Axon; Behavior; Behavior...,ROLE OF KISS1 NEURONS IN THE SEASONAL AND CIRC...,HHS,NIH,,...,The Kissl gene encodes peptides called kisspep...,T,2553,y,"[Kissl, gene, encode, peptide, call, kisspepti...","[kissl, gene, encode, peptide, call, kisspepti...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin..."
2,2,22052,154213,DESCRIPTION (provided by applicant): The objec...,2008,Agreement; Antibodies; base; Binding; Biochemi...,CARBONIC ANHYDRASE AS A MODEL TO UNDERSTAND DI...,HHS,NIH,,...,The objective of this research is to understan...,T,1414,e,"[objective, research, be, understand, biophysi...","[objective, research, be, understand, biophysi...","[objective, research, understand, biophysical,...","[objective, research, understand, biophysical,...","[objective, research, understand, biophysical,...","[biophysical, basis, thermodynamics_kinetic, m..."
3,3,35004,159362,Obesity is the cause of many adverse pregnancy...,2008,African; Analysis of Variance; Asians; Birth; ...,OBESITY ON VAGAL TONE AND HBA1C DURING PREGNANCY,HHS,NIH,,...,Obesity is the cause of many adverse pregnancy...,O,1545,d,"[obesity, cause, many, adverse, pregnancyoutco...","[obesity, cause, many, adverse, pregnancyoutco...","[obesity, cause, adverse, pregnancyoutcome, re...","[obesity, cause, adverse_pregnancyoutcome, res...","[obesity, cause, adverse_pregnancyoutcome, res...","[obesity, adverse_pregnancyoutcome, great, hea..."
4,4,371628,594482,Local potato advisory groups have expressed in...,2010,cost; Health; interest; Manure; Parasitic nema...,PLANT-PARASITIC NEMATODE MANAGEMENT AS A COMPO...,USDA,NIFA,,...,Local potato advisory groups have expressed in...,L,271,s,"[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, express, interest, m..."


In [4]:
# input needed for doc-term matrix creation in Scikit-Learn is one string per document (not a list of strings).  
# Original data is already in this form!

docs = df["ABSTRACT"] 
tokens = df["final_frqwds_removed"]

## Functions needed for all info retrieval approaches

In [5]:
# Create query vector 

def create_query(words, terms):
    
    # words: search query words
    # terms: terms in corpus
    
    q = np.zeros(len(terms))  # number of terms

    idx = []
    for word in query_words:
        idx.append(terms.index(word))

    q[idx] = 1
    
    return q

In [6]:
def return_top_abstracts(docs, scores, top_n):
    
    '''
    docs: Series that contains abstract
    scores: scores of abstracts
    top_n: return the top_n abstracts given by idx, if top_n = -1 return all abstracts
    '''
    # sort scores in descending order
    scores_sorted_idx = np.argsort(scores)[::-1]
    
    if top_n == -1:
        n = sum(scores > 0)
        ix = scores_sorted_idx[:n]
    else:
        ix = scores_sorted_idx[:top_n]
    
    print(ix[0:10])
    
    return ix, docs[ix]
    

In [7]:
def create_result_df(abstracts, scores):
    
    df = pd.DataFrame()
    df["abstracts"] = abstracts
    df["scores"] = scores
    
    return df

## Literal Term Matching - Frequency Count Document-Term Matrix

This will return all abstracts in the corpus with exact word matches to the query.  

Results will be return in sorted order of how high the query scores with each abstract. A high score means more occurences of the query words in the abstract.

In [8]:
# Note - we are now using the spaCy stopwords list instead of nltk.  It is more comprehensive.

def create_stopwords():
      
    """ creates list of stopwords. stop words include the general English list and any additional we see sneaking 
    through.  """
    
    spacy_stop_words = STOP_WORDS

    # more stop words that do not add meaning to topics
    additional_stopwords = {'addition', 'specifically', 'similar','including', 'particular', 
                            'furthermore','include', 'includes','overall', 'finally', 'specific', 
                            'additional'} 
           
    sw = spacy_stop_words.union(additional_stopwords)
    
    return sw

In [9]:
# Create document-term matrix based on count frequencies

stop_words = create_stopwords()

vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words, min_df=20)
doc_term_matrix = vectorizer.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [10]:
doc_term_matrix.shape

(690814, 84347)

In [11]:
terms = vectorizer.get_feature_names()

### Create Query Words - list the search terms

A query is just a list of words to search for in the corpus.  We will use the same query for all three info retrieval techniques.

In [122]:
# CHANGE QUERY WORDS HERE

query_words = ['coronavirus']#, 'mers', 'sars'] # 'pandemic', mers_cov', 'sars_cov']   #'mers', 'sars', 'zikv', 'denv', 'hiv'] #'influenza', 'aids'] 
              
q = create_query(query_words, terms)

In [123]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

f_scores = doc_term_matrix.dot(q)

In [124]:
sum(f_scores >0)  # how many abstracts include at least one of the query words

457

In [125]:
# sort scores in descending order

f_scores_sorted = np.sort(f_scores)[::-1]
f_scores_sorted[:10]

array([13., 11., 11., 10., 10.,  7.,  7.,  7.,  7.,  7.])

In [126]:
f_idx, f_top_abstracts = return_top_abstracts(docs, f_scores, 500)  # CHANGE NUMBER OF TOP DOCS RETURNED

[327940 315175  67124  45963  44370 303289 540874 408200 265059 409427]


In [127]:
f_top_abstracts

327940    DESCRIPTION (provided by applicant): Coronavir...
315175    DESCRIPTION (provided by applicant): Human cor...
67124     DESCRIPTION (provided by applicant): The long-...
45963     DESCRIPTION (provided by applicant):  Coronavi...
44370     DESCRIPTION (provided by applicant): Coronavir...
                                ...                        
230134    Project SummaryThe Interdisciplinary Vision Tr...
230135    DESCRIPTION (provided by applicant): In this s...
230136    The BADERC Transgenic Core (PI – Bradford Lowe...
231825    DESCRIPTION (provided by applicant): The devel...
230137    DESCRIPTION (provided by applicant): Enzymes p...
Name: ABSTRACT, Length: 500, dtype: object

## TF-IDF Document-Term Matrix

This approach is similar to Literal Term Matching using frequency counts in the document-term matrix.  However, instead of using frequency counts, the entries of the document-term matrix are weighted using TF-IDF.

In [18]:
# Find doc-term matrix using TF-IDF weighting

tf_idf_vectorizer = TfidfVectorizer(lowercase=True, stop_words=stop_words, min_df=20)
tf_idf = tf_idf_vectorizer.fit_transform(docs)

In [19]:
tf_idf_terms = tf_idf_vectorizer.get_feature_names()  # these terms are the same as the terms created from the 
                                                      # frequency count document-term matrix, so we do not need to
                                                      # recreate the query vector.

In [20]:
tf_idf_terms == terms

True

In [128]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

tf_idf_scores = tf_idf.dot(q)

In [129]:
sum(tf_idf_scores >0)   # how many abstracts include at least one of the query words

457

In [130]:
# sort scores in descending order

tf_idf_scores_sorted = np.sort(tf_idf_scores)[::-1]
tf_idf_scores_sorted[:10]

array([0.59612967, 0.56165131, 0.55908117, 0.55013338, 0.55013338,
       0.52166201, 0.51974354, 0.46893319, 0.45727428, 0.43942867])

In [131]:
tfidf_idx, tfidf_top_abstracts = return_top_abstracts(docs, tf_idf_scores, 500)  # CHANGE NUMBER OF TOP DOCS RETURNED

[ 67124 265059 327940  45963  44370 548248 315175 473394 540874 227279]


In [132]:
tfidf_top_abstracts

67124     DESCRIPTION (provided by applicant): The long-...
265059    This subproject is one of many research subpro...
327940    DESCRIPTION (provided by applicant): Coronavir...
45963     DESCRIPTION (provided by applicant):  Coronavi...
44370     DESCRIPTION (provided by applicant): Coronavir...
                                ...                        
230140    ﻿   DESCRIPTION (provided by applicant): Preec...
230152    The Career Development program is directed by ...
230141    ﻿   DESCRIPTION (provided by applicant): We pr...
230142    The administrative core will serve to coordina...
231823    DESCRIPTION (provided by applicant): There is ...
Name: ABSTRACT, Length: 500, dtype: object

## Latent Semantic Indexing (LSI) Approach

LSI Uses the TF-IDF matrix.  LSI is a tecnique that utilizes a truncated Singular Value Decomposition of the document-term matrix.  Basically, LSI still returns relevant documents to the query; however some of the documents returned may not include the exact search terms!  LSI is finding the latent or hidden relationships in the terms.  

In [26]:
# Find the Truncated SVD of the TF-IDF matrix

lsa = TruncatedSVD(n_components=500, random_state=1)  # CHANGE THE NUMBER OF COMPONENTS - NOTE: MORE COMPONENTS 
                                                      # GIVES YOU A MORE ACCURATE APPROXIMATION OF THE DOC-TERM 
                                                      # MATRIX, BUT IS ALSO MORE EXPENSIVE AND MAY NOT LEAD TO THE 
                                                      # BEST INFO RETRIEVAL RESULTS.
USigma = lsa.fit_transform(tf_idf)
Vtrans = lsa.components_

In [133]:
# transform query to be in same space as documents

q = q.reshape(1,-1)
qhat = lsa.transform(q)

In [134]:
print(qhat.shape)
print(USigma.shape)
print(Vtrans.shape)

(1, 500)
(690814, 500)
(500, 84347)


In [135]:
lsa_scores = pairwise_distances(qhat, USigma, metric='cosine', n_jobs=20)  # CHANGE N_JOBS TO BE NUMBER OF CORES - 1

In [136]:
lsa_scores.shape

(1, 690814)

In [137]:
lsa_scores

array([[0.9656445 , 0.99068657, 0.98044827, ..., 1.00679923, 0.99425543,
        1.00488297]])

In [138]:
lsa_scores[0]

array([0.9656445 , 0.99068657, 0.98044827, ..., 1.00679923, 0.99425543,
       1.00488297])

In [139]:
sum(lsa_scores[0] > 0)  # how many abstracts scored above 0

690814

In [140]:
# sort scores in descending order

lsa_scores_sorted = np.sort(lsa_scores[0])[::-1]
lsa_scores_sorted[:10]

array([1.15118662, 1.15001451, 1.14881236, 1.14453546, 1.14453546,
       1.14453546, 1.14453546, 1.14078958, 1.13988056, 1.13815179])

In [141]:
lsa_idx, lsa_top_abstracts = return_top_abstracts(docs, lsa_scores[0], 500)  # CHANGE NUMBER OF TOP DOCS RETURNED

[485925 537029 171359 493901 507866 512236 500722  19912  96618 194458]


In [142]:
lsa_top_abstracts

485925    Project 2  Studies of the sand fly vector  wil...
537029    AbstractExtracellular vesicles (EVs) are cargo...
171359    We have investigated the phenotype, function, ...
493901    Project 2  Studies of the sand fly vector  wil...
507866    Project 2  Studies of the sand fly vector  wil...
                                ...                        
346736    1252182FuThe goal of the proposed work is to o...
345889    1252182FuThe goal of the proposed work is to o...
35833     This subproject is one of many research subpro...
68385     DESCRIPTION (provided by applicant): A number ...
384146    DESCRIPTION (provided by applicant):  A number...
Name: ABSTRACT, Length: 500, dtype: object

## Create pandemics corpus

We use the results of our three information retrieval techniques to create a new, smaller corpus that only contains abstracts relevant to the query. 

In [143]:
docs_ix = np.concatenate([f_idx, tfidf_idx, lsa_idx])

In [144]:
docs_idx = np.unique(docs_ix)

In [145]:
docs_idx.shape

(1012,)

In [42]:
#lim_docs = [tokens[i] for i in docs_idx]

**create case-study corpuses**

In [46]:
pandemic_corpus = df.loc[docs_idx, :]

In [47]:
pandemic_corpus.shape

(1137, 40)

In [48]:
pandemic_corpus.to_pickle("./pandemic_corpus.pkl")

In [49]:
lim_docs = pandemic_corpus["final_frqwds_removed"]

In [146]:
corona_corpus = df.loc[docs_idx, :]

In [147]:
corona_corpus.shape

(1012, 40)

In [155]:
corona_corpus.to_pickle("./corona_corpus.pkl")

In [148]:
lim_docs = corona_corpus["final_frqwds_removed"]

In [149]:
# input needed for LDA, NMF (all from Scikit-Learn) is one string per document (not a list of strings)

text = []

for token_list in lim_docs:
    text.append(" ".join(token_list))

In [150]:
len(lim_docs)

1012

## Topic Modeling with relevant pandemic abstracts

In [52]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
        

In [151]:
# Create a TF-IDF document-term matrix for the pandemics corpus 

# TRY DIFFERENT PARAMETERS IN THE TF-IDF DOC-TERM MATRIX SET-UP
nmf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=3, lowercase=True) #, max_features=int(len(lim_docs)/2))

nmf_tf_idf = nmf_vectorizer.fit_transform(text)

In [152]:
nmf_tf_idf.shape

(1012, 5252)

In [158]:
# topic modeling with NMF

nmf_model = NMF(n_components=30, random_state=1)  # TRY DIFFERENT NUMBERS OF TOPICS
W = nmf_model.fit_transform(nmf_tf_idf)
H = nmf_model.components_

In [159]:
print_topics(nmf_model, nmf_vectorizer, 10)


Topic 0:
('rna', 2.7019640472234205)
('replication', 1.8838767182028298)
('viral', 1.6131435990689662)
('protein', 1.4368932283112912)
('coronavirus', 1.3268361368060235)
('virus', 1.0330762699564786)
('coronaviruse', 0.92985372459449)
('genome', 0.7878428708335287)
('host', 0.7348722372658618)
('interaction', 0.6665080528080917)

Topic 1:
('hsv', 1.9528548414943694)
('woman', 0.28269394756273597)
('shedding', 0.23666844578105778)
('genital', 0.22517262408776506)
('reactivation', 0.21140446249874711)
('cds', 0.19136501375884643)
('infection', 0.18244958957597174)
('cell', 0.14407838765079745)
('subject', 0.1370929990203456)
('genital_hsv', 0.13302060105085622)

Topic 2:
('ebv', 1.9198800687844073)
('cell', 0.4715935988716344)
('patient', 0.2533474076440763)
('nhl', 0.23240012445805583)
('lymphoma', 0.21374570163797063)
('latent', 0.1831939635431176)
('latency', 0.17421277887102182)
('malignancy', 0.16917767440372172)
('ctl', 0.1599594978230759)
('line', 0.14526906958565947)

Topic 3:


In [238]:
# TRY TOPIC MODELING WITH LDA

# create document-term matrix

lda_vectorizer = CountVectorizer(max_df=1.0, min_df=3, lowercase=True)
lda_dtm = lda_vectorizer.fit_transform(text)

In [239]:
# create model

num_topics = 30
lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=39, random_state = 0)
doc_top_dist = lda_model.fit_transform(lda_dtm)
top_term_dist = lda_model.components_

In [240]:
print_topics(lda_model, lda_vectorizer, 10)


Topic 0:
('protein', 225.55502749264147)
('receptor', 183.95759402950594)
('cell', 172.4504639973329)
('bind', 108.84685928736633)
('structural', 83.23407993256878)
('virus', 81.06300054201091)
('viral', 75.5561411680632)
('interaction', 68.26001146683492)
('human', 67.85457954982031)
('host', 64.82284755171038)

Topic 1:
('subset', 81.29454728313978)
('cd4', 66.23840025065098)
('memory', 62.37910850799191)
('effector', 40.5848650803486)
('viral', 40.11634209137843)
('cell', 36.6079502522241)
('rna', 36.412992443476384)
('orf', 35.0999999999997)
('protection', 33.610323276130394)
('protein', 29.212279854306338)

Topic 2:
('cell', 202.03669046303705)
('death', 163.99491597475762)
('protein', 84.11074125989704)
('human', 76.15368948404043)
('receptor', 61.47717954877514)
('cycle', 58.31210018295787)
('autophagy', 45.205793172633825)
('cellular', 39.855071437383906)
('virus', 39.49494201595987)
('play', 34.583390658500534)

Topic 3:
('virus', 503.9476289736178)
('viral', 325.232153571429