# Information Retrieval - Pandemic Investigation

In [38]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

from gensim.models.coherencemodel import CoherenceModel

import TextCleaning

In [2]:
'''
# CLEANED AND PROCESSED DATA PULL

# import NSF data
f = open('/project/biocomplexity/sdad/projects_data/ncses/prd/RND Topic Modelling/agency_data.sav', 'rb')

# import entire dataset
#f = open('/project/biocomplexity/sdad/projects_data/ncses/prd/RND Topic Modelling/lda_data.sav', 'rb')

[corpus, id2word, docs] = pickle.load(f)
f.close()

# corpus - word frequency in docs
# id2word - dictionary
# docs - lemmatized abstracts

'''

In [2]:
# ORIGINAL DATA PULL

# pull in original abstracts

raw_df=pd.read_csv('~/prd/publicrd/data/prd/RND Topic Modelling/abstracts_federal_reporter_combined.csv',engine='python')

# remove null abstracts and duplicates

df = TextCleaning.remove_nulls(raw_df, "ABSTRACT")
df = TextCleaning.remove_duplicates(df)

df.reset_index(inplace = True)
df.rename(columns={'index':'original index'}, inplace=True)

3 nulls in  ABSTRACT . These rows removed.
11 duplicate abstracts removed
0 project ID duplicates - not removed


In [3]:
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,FIRST_CHAR,LAST_CHAR,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_TITLE,PROJECT_TERMS,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,ORGANIZATION_NAME,CFDA_CODE,FY_TOTAL_COST
0,0,89996,"This is a project to explore Game-based, Metap...",2008,"This is a project to explore Game-based, Metap...",.,NSF,NSF,,814512,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,Achievement; analog; base; Cognitive Science; ...,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",WHEELING JESUIT UNIVERSITY,47.076,1999467.0
1,1,89997,Institution: Franklin Institute Science Museum...,2008,Institution: Franklin Institute Science Museum...,.,NSF,NSF,,741659,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,Active Learning; Child; Computer software; des...,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",FRANKLIN INSTITUTE,47.076,1799699.0
2,2,89998,Through programs (including small group conver...,2008,Through programs (including small group conver...,.,NSF,NSF,,813522,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,Address; Age; Birth; Brain; Caregivers; Child;...,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",SCIENCE MUSEUM OF MINNESOTA,47.076,1505858.0
3,3,89999,In partnership with the American Chemical Soci...,2008,In partnership with the American Chemical Soci...,.,NSF,NSF,,838627,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,Advanced Development; American; Chemicals; Che...,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,47.049,51000.0
4,4,90000,Amphibian populations around the world are exp...,2008,Amphibian populations around the world are exp...,.,NSF,NSF,,815315,COLLABORATIVE RESEARCH: EVOLUTION OF AMPHIBIAN...,Amphibia; Central America; Communicable Diseas...,"ZAMUDIO, KELLY R",,CORNELL UNIVERSITY ITHACA,47.074,370996.0


In [5]:
# input needed for doc-term matrix creation is one string per document (not a list of strings).  This is 
# already the format of df["ABSTRACT"] so nothing to do here

'''
text = []
i=0
for doc in docs:
    text.append(" ".join(doc))
'''    

'\ntext = []\ni=0\nfor doc in docs:\n    text.append(" ".join(doc))\n'

## Functions needed for all approaches

In [4]:
# Create query vector 

def create_query(words, terms):
    
    # words: search query words
    # terms: terms in corpus
    
    q = np.zeros(len(terms))  # number of terms

    idx = []
    for word in query_words:
        idx.append(terms.index(word))

    q[idx] = 1
    
    return q

In [110]:
def return_top_abstracts(df, scores, top_n):
    
    '''
    df: dataframe that contains ABSTRACT column
    scores: scores of abstracts
    top_n: return the top_n abstracts given by idx
    '''
    # sort scores in descending order
    scores_sorted_idx = np.argsort(scores)[::-1]
    
    ix = scores_sorted_idx[:top_n]
    print(ix[0:10])
    
    return ix, df['ABSTRACT'][ix]
    

In [6]:
def create_result_df(abstracts, scores):
    
    df = pd.DataFrame()
    df["abstracts"] = abstracts
    df["scores"] = scores
    
    return df

## Exact word matches - Frequency Count Document-Term Matrix

This will return all abstracts in the corpus with exact word matches to the query.  A query is just a list of words to search for.

Results will be return in sorted order of how high the query scores with each abstract. A high score means more occurences of the query words in the abstract.

In [7]:
# Create document-term matrix based on count frequencies

vectorizer = CountVectorizer()
doc_term_matrix = vectorizer.fit_transform(df["ABSTRACT"])

In [8]:
terms = vectorizer.get_feature_names()

In [9]:
# CHANGE QUERY WORDS HERE

query_words = ['pandemic']

q = create_query(query_words, terms)

In [10]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

f_scores = doc_term_matrix.dot(q)

In [11]:
sum(f_scores >0)

1643

In [12]:
# sort scores in descending order

f_scores_sorted = np.sort(f_scores)[::-1]
f_scores_sorted[:10]

array([25., 20., 19., 18., 17., 16., 15., 14., 14., 14.])

In [111]:
f_idx, f_top_abstracts = return_top_abstracts(df, f_scores, 100)

[236762 235170 145010  92006 128050 292930 236456 236214 346289 299250]


In [112]:
f_top_abstracts

236762    Influenza A viruses are significant human path...
235170    The hemagglutinin of the 2009 pandemic H1N1 in...
145010    Influenza A viruses are significant human path...
92006     Influenza A viruses are significant human path...
128050    DESCRIPTION (provided by applicant): The 1918-...
                                ...                        
146779    We have developed a protocol for rapid product...
83990     This subproject is one of many research subpro...
405458    ﻿   DESCRIPTION (provided by applicant):  Infl...
48451     DESCRIPTION (provided by applicant): Pandemic ...
117729    DESCRIPTION (provided by applicant): Influenza...
Name: ABSTRACT, Length: 100, dtype: object

In [113]:
f_top_abstracts.iloc[9]

'(1) To develop animal, especially nonhuman primate, models that mimic human disease:We have established nonhuman primate (NHPs) models using Cynomolgus macaques for several influenza A viruses. Seasonal H1N1 & H3N2 viruses lead to either asymptomatic or fairly mild disease in Cynomolgus macaques. Gross pathology is limited to a few areas in the lungs. Histopathological investigations revealed alveolar edema and fibrin, hyaline membrane formation and type II alveolar pneumocyte hyperplasia. Animals recover quickly from the infection and clear virus within the first few days. There is limited evidence for virus shedding. (Brining 2010; Safronetz 2011)  The pandemic H2N2 strain A/Singapore/1/57 leads to a moderate infection compared to the seasonal strains. Lung infiltrates, gross pathology and histopathology are in general slightly enhanced. Animals clear the infection with a delay and fully recover. There is limited evidence for virus shedding (Richt, 2012).  We observed different degr

In [17]:
f_df = create_result_df(df["ABSTRACT"], f_scores)

In [18]:
f_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",0.0
1,Institution: Franklin Institute Science Museum...,0.0
2,Through programs (including small group conver...,0.0
3,In partnership with the American Chemical Soci...,0.0
4,Amphibian populations around the world are exp...,0.0
...,...,...
550069,The Title IV-E Prevention Services Clearinghou...,0.0
550070,This mixed-methods study seeks to deepen our u...,0.0
550071,The purpose of this project is to examine the ...,0.0
550072,The 2014 Child Care and Development Block Gran...,0.0


## TF-IDF approach

In [68]:
# Find doc-term matrix using TF-IDF weighting

tf_idf_vectorizer = TfidfVectorizer()
tf_idf = tf_idf_vectorizer.fit_transform(df["ABSTRACT"])

In [20]:
tf_idf_terms = tf_idf_vectorizer.get_feature_names()

In [23]:
# CHANGE QUERY WORDS HERE

query_words = ['pandemic']

q = create_query(query_words, tf_idf_terms)

In [None]:
# I don't need this as it created the exact same query as the create_query function

'''
# transform query to be in same space as documents
# q = q.reshape(1,-1)
qhat = tf_idf_vectorizer.transform(query_words)

temp = qhat.toarray()
qhat = np.reshape(temp, qhat.shape[1])

tf_idf_scores = pairwise_distances(qhat, tf_idf, metric='cosine', n_jobs=19)
'''

In [71]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

tf_idf_scores = tf_idf.dot(q)

In [72]:
sum(tf_idf_scores >0)

1643

In [73]:
# sort scores in descending order

tf_idf_scores_sorted = np.sort(tf_idf_scores)[::-1]
tf_idf_scores_sorted[:10]

array([0.5857311 , 0.54566878, 0.5312256 , 0.52167944, 0.51754378,
       0.48825648, 0.43052922, 0.42960531, 0.42195537, 0.39556147])

In [114]:
tfidf_idx, tfidf_top_abstracts = return_top_abstracts(df, tf_idf_scores, 100)

[346289 292930  85470 128050 236762  62499 145793 236456 148102 194186]


In [115]:
tfidf_top_abstracts

346289    PROJECT SUMMARY / ABSTRACTInfluenza pandemics ...
292930    DESCRIPTION (provided by applicant): Influenza...
85470     DESCRIPTION (provided by applicant): The pande...
128050    DESCRIPTION (provided by applicant): The 1918-...
236762    Influenza A viruses are significant human path...
                                ...                        
437135    Improvements in vaccines against influenza tha...
236028    In April 2009, the Centers for Disease Control...
354271    Influenza A viruses (IAV) are significant huma...
488951    Influenza A viruses (IAV) are significant huma...
191233    The immunogenicity and efficacy of the vaccine...
Name: ABSTRACT, Length: 100, dtype: object

In [116]:
tfidf_top_abstracts.iloc[0]

'PROJECT SUMMARY / ABSTRACTInfluenza pandemics are among the foremost international public health challenges of the 21st century.Eleven probable and three possible pandemics have occurred in the past four-hundred years. The April,2009 outbreak of a novel strain of influenza A (H1N1) in Mexico City demonstrated that infection can betransmitted globally in days and cause disproportionate morbidity and mortality in young adults, pregnantwomen, and minorities. The 2009 (H1N1) Pandemic has been defined by its transmissibility and case-fatality proportion as a  mild  pandemic; the WHO estimates that a more severe pandemic could causehundreds of millions of deaths, and overwhelm healthcare capacity. Even during this mild pandemic,hospitals and ICUs were operating near or at surge capacity during peaks of transmission. Public healthofficials have limited decision support technology to plan for or prevent healthcare utilization surge in futureinfluenza pandemics. Despite an appropriate public h

In [79]:
tf_idf_df = create_result_df(df["ABSTRACT"], tf_idf_scores)

In [80]:
tf_idf_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",0.0
1,Institution: Franklin Institute Science Museum...,0.0
2,Through programs (including small group conver...,0.0
3,In partnership with the American Chemical Soci...,0.0
4,Amphibian populations around the world are exp...,0.0
...,...,...
550069,The Title IV-E Prevention Services Clearinghou...,0.0
550070,This mixed-methods study seeks to deepen our u...,0.0
550071,The purpose of this project is to examine the ...,0.0
550072,The 2014 Child Care and Development Block Gran...,0.0


## Latent Semantic Indexing (LSI) Approach

Uses the TF-IDF matrix.

In [81]:
# Find the Truncated SVD of the TF-IDF matrix

lsa = TruncatedSVD(n_components=500, random_state=1)
USigma = lsa.fit_transform(tf_idf)
Vtrans = lsa.components_

In [85]:
# CHANGE QUERY WORDS HERE

query_words = ['pandemic']

q = create_query(query_words, tf_idf_terms)

# transform query to be in same space as documents
q = q.reshape(1,-1)
qhat = lsa.transform(q)

In [86]:
print(qhat.shape)
print(USigma.shape)
print(Vtrans.shape)

(1, 500)
(550074, 500)
(500, 1058314)


In [87]:
lsa_scores = pairwise_distances(qhat, USigma, metric='cosine', n_jobs=19)

In [88]:
lsa_scores.shape

(1, 550074)

In [89]:
lsa_scores

array([[1.00844   , 1.00255825, 0.99657046, ..., 1.01523998, 1.00458235,
        1.00618948]])

In [90]:
type(lsa_scores)

numpy.ndarray

In [91]:
sum(lsa_scores[0] > 0)

550074

In [92]:
lsa_scores[0]

array([1.00844   , 1.00255825, 0.99657046, ..., 1.01523998, 1.00458235,
       1.00618948])

In [93]:
# sort scores in descending order

lsa_scores_sorted = np.sort(lsa_scores[0])[::-1]
lsa_scores_sorted[:10]

array([1.08098967, 1.07729089, 1.07715628, 1.07566429, 1.07454822,
       1.07420224, 1.07400681, 1.0733301 , 1.06976128, 1.06955734])

In [117]:
lsa_idx, lsa_top_abstracts = return_top_abstracts(df, lsa_scores[0], 100)

[120887 410792 483108 499717 516916 119396 412678 222388 368089 444202]


In [118]:
lsa_top_abstracts

120887    Dengue fever is one of the most important emer...
410792    ﻿   DESCRIPTION (provided by applicant): Respi...
483108    ﻿DESCRIPTION (provided by applicant): Respirat...
499717    Project SummaryThis proposal is for a new R21,...
516916    We initially focused on a PIV3-based vector to...
                                ...                        
92478     This project has been designed to investigate ...
490091    PROJECT 1: Systems Analyses of Heterologous Im...
485285    CORE A PROJECT SUMMARY/ABSTRACTThe primary goa...
28495     DESCRIPTION (provided by applicant): The long ...
414783    ﻿   DESCRIPTION (provided by applicant): Respi...
Name: ABSTRACT, Length: 100, dtype: object

In [119]:
lsa_top_abstracts.iloc[4]

'We initially focused on a PIV3-based vector to express RSV F protein, given the pediatric impact of both viruses. We used an attenuated HPIV3 vaccine candidate called B/HPIV3 that we previously developed. This virus consists of bovine PIV3 in which the F and HN genes have been replaced by those of HPIV3, yielding a chimeric virus that is attenuated in primates due to the bovine backbone, and which bears the neutralization and major protective F and HN antigens of HPIV3. Both the empty B/HPIV3 vector and B/HPIV3 expressing the unmodified RSV F protein were previously shown to be well-tolerated in infants and young children. Therefore, the crucial factor of safety has already been demonstrated for this vector.  A number of strategies were evaluated previously  to optimize the immunogenicity  of the expressed RSV F protein. Evaluation of several different positions for the  RSV F gene in the vector genome identified the second gene position as generally being optimal. Evaluation of sever

In [97]:
lsa_df = create_result_df(df["ABSTRACT"], lsa_scores[0])

In [98]:
lsa_df

Unnamed: 0,abstracts,scores
0,"This is a project to explore Game-based, Metap...",1.008440
1,Institution: Franklin Institute Science Museum...,1.002558
2,Through programs (including small group conver...,0.996570
3,In partnership with the American Chemical Soci...,0.995445
4,Amphibian populations around the world are exp...,0.997222
...,...,...
550069,The Title IV-E Prevention Services Clearinghou...,1.007695
550070,This mixed-methods study seeks to deepen our u...,1.004997
550071,The purpose of this project is to examine the ...,1.015240
550072,The 2014 Child Care and Development Block Gran...,1.004582


## Topic Modeling with relevant pandemic abstracts

In [126]:
docs_ix = np.concatenate([f_idx, tfidf_idx, lsa_idx])

In [127]:
docs_idx = np.unique(docs_ix)

In [131]:
docs_idx.shape

(238,)

In [132]:
docs = df['ABSTRACT'][docs_idx]

In [133]:
nmf_vectorizer = TfidfVectorizer(max_df=0.4, min_df=3, lowercase=True, max_features=int(len(docs)/2))
nmf_tf_idf = nmf_vectorizer.fit_transform(docs)

In [137]:
# topic modeling with NMF

nmf_model = NMF(n_components=10, random_state=1)
W = nmf_model.fit_transform(nmf_tf_idf)
H = nmf_model.components_

In [135]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
        

In [138]:
print_topics(nmf_model, nmf_vectorizer, 10)


Topic 0:
('rsv', 2.795218020654589)
('airway', 0.36470339736504226)
('asthma', 0.35133118774291056)
('expression', 0.28851166079183294)
('induced', 0.2876505132268857)
('lung', 0.25092440219377565)
('infants', 0.24268855831998154)
('mice', 0.21239884659800604)
('role', 0.1996148426464383)
('severe', 0.1982233900343613)

Topic 1:
('vaccines', 0.9188059343400766)
('against', 0.7827593564023378)
('protection', 0.5629577644647044)
('antibody', 0.5498538945061914)
('vaccination', 0.47524610524449723)
('ha', 0.39533842881474995)
('responses', 0.36053509023532204)
('cross', 0.3111548988760989)
('antibodies', 0.2659231868889383)
('strain', 0.2623675857410589)

Topic 2:
('zikv', 1.9443930018333209)
('denv', 0.4013957558654288)
('aim', 0.11402717201109577)
('such', 0.08550146211377954)
('between', 0.08150924164521098)
('how', 0.08142181142621109)
('high', 0.07253149164384412)
('protein', 0.07243403627880844)
('it', 0.07020558889870795)
('other', 0.06261986279742487)

Topic 3:
('1918', 1.4010506