# Information Retrieval - Search Terms from Raw Text or Processed Text 


In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# pull in entire dataframe

df = pd.read_pickle("~/dspg20RnD/data/final/final_dataset_7-20.pkl")

df.reset_index(inplace = True)
#df.rename(columns={'index':'original index'}, inplace=True)

In [3]:
df.head()

Unnamed: 0,index,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,...,working_abstract,Start_Char,nchar,LAST_CHAR,lemma_abstract,clean_lemmas,stopwds_removed,n_grams_added,final_tokens,final_frqwds_removed
0,0,17608,152242,The multiprotein complex y-secretase proteolyt...,2008,Active Sites; Affect; Alzheimer's Disease; Amy...,STRUCTURE OF SIGNAL PEPTIDE PEPTIDASE,HHS,NIH,,...,The multiprotein complex y-secretase proteolyt...,T,1402,g,"[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y_secretase, proteolyt...","[multiprotein, y_secretase, proteolytically_cl..."
1,1,111864,190316,DESCRIPTION (provided by applicant): The Kis...,2008,Affect; Animal Model; Axon; Behavior; Behavior...,ROLE OF KISS1 NEURONS IN THE SEASONAL AND CIRC...,HHS,NIH,,...,The Kissl gene encodes peptides called kisspep...,T,2553,y,"[Kissl, gene, encode, peptide, call, kisspepti...","[kissl, gene, encode, peptide, call, kisspepti...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin..."
2,2,22052,154213,DESCRIPTION (provided by applicant): The objec...,2008,Agreement; Antibodies; base; Binding; Biochemi...,CARBONIC ANHYDRASE AS A MODEL TO UNDERSTAND DI...,HHS,NIH,,...,The objective of this research is to understan...,T,1414,e,"[objective, research, be, understand, biophysi...","[objective, research, be, understand, biophysi...","[objective, research, understand, biophysical,...","[objective, research, understand, biophysical,...","[objective, research, understand, biophysical,...","[biophysical, basis, thermodynamics_kinetic, m..."
3,3,35004,159362,Obesity is the cause of many adverse pregnancy...,2008,African; Analysis of Variance; Asians; Birth; ...,OBESITY ON VAGAL TONE AND HBA1C DURING PREGNANCY,HHS,NIH,,...,Obesity is the cause of many adverse pregnancy...,O,1545,d,"[obesity, cause, many, adverse, pregnancyoutco...","[obesity, cause, many, adverse, pregnancyoutco...","[obesity, cause, adverse, pregnancyoutcome, re...","[obesity, cause, adverse_pregnancyoutcome, res...","[obesity, cause, adverse_pregnancyoutcome, res...","[obesity, adverse_pregnancyoutcome, great, hea..."
4,4,371628,594482,Local potato advisory groups have expressed in...,2010,cost; Health; interest; Manure; Parasitic nema...,PLANT-PARASITIC NEMATODE MANAGEMENT AS A COMPO...,USDA,NIFA,,...,Local potato advisory groups have expressed in...,L,271,s,"[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, express, interest, m..."


In [4]:
# input needed for doc-term matrix creation in Scikit-Learn is one string per document (not a list of strings).  
# Original data is already in this form, but not if we search by the tokens instead of the original abstract.

#docs = df["ABSTRACT"] 
tokens = df["final_frqwds_removed"]

docs = []  # docs will contain the processed tokens in string form (1 string per abstract)

for abstract in tokens:
    docs.append(" ".join(abstract))
    
docs = pd.Series(docs)

## Functions needed for all info retrieval approaches

In [5]:
# Create query vector 

def create_query(words, terms):
    
    # words: search query words
    # terms: terms in corpus
    
    q = np.zeros(len(terms))  # number of terms

    idx = []
    for word in query_words:
        idx.append(terms.index(word))

    q[idx] = 1
    
    return q

In [6]:
def return_top_abstracts(docs, scores, top_n):
    
    '''
    docs: Series that contains abstract
    scores: scores of abstracts
    top_n: return the top_n abstracts given by idx, if top_n = -1 return all abstracts
    '''
    # sort scores in descending order
    scores_sorted_idx = np.argsort(scores)[::-1]
    
    if top_n == -1:
        n = sum(scores > 0)
        ix = scores_sorted_idx[:n]
    else:
        ix = scores_sorted_idx[:top_n]
    
    print(ix[0:10])
    
    return ix, docs[ix]
    

In [7]:
def create_result_df(abstracts, scores):
    
    df = pd.DataFrame()
    df["abstracts"] = abstracts
    df["scores"] = scores
    
    return df

## Literal Term Matching - Frequency Count Document-Term Matrix

This will return all abstracts in the corpus with exact word matches to the query.  

Results will be return in sorted order of how high the query scores with each abstract. A high score means more occurences of the query words in the abstract.

In [9]:
# Note - we are now using the spaCy stopwords list instead of nltk.  It is more comprehensive.

def create_stopwords():
      
    """ creates list of stopwords. stop words include the general English list and any additional we see sneaking 
    through.  """
    
    spacy_stop_words = STOP_WORDS

    # more stop words that do not add meaning to topics
    additional_stopwords = {'addition', 'specifically', 'similar','including', 'particular', 
                            'furthermore','include', 'includes','overall', 'finally', 'specific', 
                            'additional'} 
           
    sw = spacy_stop_words.union(additional_stopwords)
    
    return sw

In [10]:
# Create document-term matrix based on count frequencies  
# when using raw text, it is appropriate to remove stop words, processed text will already have these words removed

stop_words = create_stopwords()

vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words, min_df=0)
doc_term_matrix = vectorizer.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [11]:
doc_term_matrix.shape

(690814, 1277618)

In [12]:
terms = vectorizer.get_feature_names()

### Create Query Words - list the search terms

A query is just a list of words to search for in the corpus.  We will use the same query for all three info retrieval techniques.

In [48]:
'artificialintelligence_ai' in terms

True

In [14]:
'machine_learning' in terms

True

In [15]:
'machine_learn' in terms

False

In [30]:
'supervise' in terms  

# need to look at raw text...some bi-terms are coming up, cases could be confusing with lemmatizer

True

In [86]:
s1 = "test string"
s2 = "This is a test string for practice"

s1 in s2

s1 or 'is' in s2

'test string'

In [12]:
# tokens with - or strange symbol between words

count = 0
idx = []

for ix, abstract in enumerate(df['ABSTRACT']):
    if 'artificial intelligence' in abstract.lower(): 
        count = count + 1
        idx.append(ix)
    elif 'artificialintelligence' in abstract.lower(): 
        count = count + 1
        idx.append(ix)
    elif 'artificially intelligent' in abstract.lower(): 
        count = count + 1
        idx.append(ix)
    #else: 
        # do nothing

print(count)

975


In [13]:
idx

[206,
 207,
 2940,
 2988,
 3035,
 3104,
 3602,
 3717,
 4780,
 6518,
 6774,
 6779,
 6979,
 10300,
 10654,
 10886,
 13746,
 14454,
 15589,
 18434,
 18839,
 18951,
 19512,
 19941,
 20593,
 20611,
 20741,
 21088,
 21093,
 21159,
 21247,
 22248,
 22765,
 23022,
 23259,
 23316,
 23632,
 23668,
 24083,
 24598,
 25110,
 25896,
 26198,
 26868,
 26898,
 27012,
 31055,
 31298,
 31853,
 31989,
 32191,
 32391,
 32834,
 33061,
 33141,
 34123,
 35355,
 37756,
 38934,
 40458,
 44752,
 49120,
 51183,
 51480,
 51605,
 52543,
 52692,
 53118,
 53411,
 54155,
 55952,
 57730,
 59402,
 63563,
 63579,
 63694,
 65342,
 65361,
 65398,
 65588,
 65766,
 65849,
 66156,
 66941,
 69892,
 69964,
 70069,
 70530,
 70532,
 71277,
 71629,
 72723,
 72772,
 72821,
 73117,
 73148,
 73175,
 74076,
 75206,
 75447,
 75942,
 75950,
 76041,
 76276,
 76765,
 76771,
 76835,
 77828,
 77867,
 79039,
 79062,
 79085,
 79240,
 79252,
 79271,
 79322,
 79368,
 79635,
 79694,
 79698,
 80890,
 82236,
 82414,
 83549,
 83580,
 83585,
 83610,

In [52]:
df['ABSTRACT'][0]

"The multiprotein complex y-secretase proteolytically cleaves the intramembrane region of amyloid precursorprotein (APP), which in turn forms the plaques found in Alzheimer's disease (AD) patients. The catalyticcomponent of Y-secretase is the intramembrane aspartyl protease (IAP) called presenilin. Mutations inpresenilin are directly linked to familial early-onset AD. Another known member of the IAP family is signalpeptide peptidase (SPP), which functions to further proteolyze remnant signal peptides after they have beencleaved by signal peptidase. Knowledge of the biochemistry and function of individual SPPs are onlybeginning to be elucidated, and homologues are found in all kingdoms of life. Presenilin and SPP exhibitsignificant sequence similarity, strongly suggesting they share structural and catalytic features. Thus, amolecular understanding of the more tractable SPP will likely impact drug design for presenilin and y-secretase. The goal of this proposal is to express, characteriz

In [41]:
# CHANGE QUERY WORDS HERE

query_words = ['intelligence_ai']
'''
['artificial_intelligence', 'artificial_intelligence_ai', 
               'artificial_intelligence_machine_learning', 'artificialintelligence', 'artificially_intelligent',
               'artificial_intelligence_aaai', 'artificial_intelligence_ijcai', 'artificialintelligence_ai',
               'artificialintelligent'
              ]
'''

#'ai', 'artificial', 'intelligence' 
        
q = create_query(query_words, terms)

In [42]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

f_scores = doc_term_matrix.dot(q)

In [43]:
sum(f_scores >0)  # how many abstracts include at least one of the query words

# some are being left off from raw counts

7

In [44]:
# sort scores in descending order

f_scores_sorted = np.sort(f_scores)[::-1]
f_scores_sorted[:10]

array([1., 1., 1., 1., 1., 1., 1., 0., 0., 0.])

In [45]:
f_idx, f_top_abstracts = return_top_abstracts(docs, f_scores, -1)  # CHANGE NUMBER OF TOP DOCS RETURNED

[245852 513841 652799 451621 239767 446589 298690]


In [46]:
f_top_abstracts

245852    need 2200 americans die cardiovascular cvd day...
513841    online eyewire prove volunteer motivate recons...
652799    small_business_innovation sbir image processin...
451621    partner socially_assistive_robot sar person_de...
239767    diabete chronic presently prevent cure treat i...
446589    perceptual assessment hypernasality consider s...
298690    visual question answer vqa empower people answ...
dtype: object

In [47]:
f_scores_sorted[:1500]

array([1., 1., 1., ..., 0., 0., 0.])

In [56]:
# others - artificial intelligence typo
# 245852, 451621: analytic, augmented intelligence

df['ABSTRACT'][298690]

"The goal of a visual question answering (VQA) system is to empower people to find the answer to any question about any image. For example, a VQA system could enable blind people to address daily visual challenges such as learning whether a pair of socks match or learning what type of food is in a can. VQA services could also facilitate the creation of smarter environments, say to monitor how many defective products are on a factory assembly line at any given time. A limitation of existing VQA systems is that they do not account for the fact that a visual question may elicit different answers from different people. VQA systems could save time and reduce user frustration if they empowered users to anticipate and resolve any answer disagreements that may arise. Blind and sighted people could more rapidly and accurately learn about the diversity of human perspectives on the visual world. VQA services also could teach people how to ask visual questions that elicit the desired answer divers

In [57]:
f_top_abstracts.iloc[6]

'visual question answer vqa empower people answer question image example vqa enable blind people daily visual challenge learn pair sock match learn food vqa facilitate creation smarter environment monitor defective product factory assembly line limitation exist vqa account fact visual question elicit answer people vqa save user frustration empower user anticipate resolve answer disagreement arise blind_sight people rapidly accurately learn diversity human perspective visual world vqa teach people ask visual question elicit desire answer diversity create artificial_intelligence_ai account possible diversity answer inherent crowd intelligence_ai predict human answer disagreement occur turn enable human computer partnership challenging necessitate framework simultaneously synthesize potentially conflict perception image language possible disagreement ensure ai generalize broad range exist corpus million visual question ask blind_sight people create annotate dataset indicate answer disagre

In [85]:
# compare idx and f_idx

s_idx = set(idx)
sf_idx = set(f_idx)

In [86]:
t = s_idx.difference(sf_idx)
print(len(t))
t

12


{110153,
 201951,
 261460,
 299829,
 326924,
 359163,
 446589,
 459436,
 525369,
 614323,
 639595,
 652799}

In [87]:
t2 = sf_idx.difference(s_idx)
print(len(t2))
t2

8


{22760, 265721, 427170, 464665, 622143, 641499, 688120, 689487}

In [100]:
temp = df.iloc[446589,:]

In [101]:
print(temp['ABSTRACT'])
print(temp["final_frqwds_removed"])

# 20611, 23632: raw had Artificial Intelligence -- which didn't match with "artificial intelligence"
#265721-622143 has "artificial-intelligence" in raw, not "artificial intelligence"
#641499 has a strange character for space in raw
#689487 has "artificially-intelligent" in raw text

# tokenize raw text - lowercase, this will take care of hyphens or other strange symbols

#2940, 13746: has token artificial_intelligence_ai not "artificial_intelligence"
#689813: has token "artificial_intelligence_machine_learning"

#110153, 446589: typo with "anartificial intelligence" in raw
#201951, 261460: artificial intelligence removed from raw because it was part of the title -- need to undo this and add titles in


# need to look at list of tokens!
# check token 'intelligence_ai' - 7 docs have this, some are typos, 
#     others will be picked up by artificial_intelligence

Perceptual assessment of hypernasality is considered a critical component when evaluating the speech ofchildren with cleft lip and/or palate (CLP). However, most speech-language pathologists (SLPs) do not receiveformal training for perceptual evaluation of speech and, as a result, research shows that the subjective ratingsare inherently biased to the perceiver and exhibit considerable variability. In this project, we aim to develop anartificial intelligence (AI) algorithm that automatically evaluates speech along four dimensions deemed to becritically important by the Americleft Speech Outcomes Group (ASOG), namely speech acceptability,articulation, hypernasality, and audible nasal emissions. The AI algorithm in this project is based on an existingdatabase of speech collected as a part of an NIH-funded project to develop reliable speech outcomes byimproving the reliability of perceptual ratings by training clinicians (NIDCR DE019-01235, PI: Kathy Chapman).This database contains speech 

In [97]:
temp

index                                                                    261466
original index                                                           302297
PROJECT_ID                                                               131540
ABSTRACT                      This award will help to subsidize the particip...
FY                                                                         2010
PROJECT_TERMS                 Artificial Intelligence; Award; Commit; Educat...
PROJECT_TITLE                 THE FOURTH NORTHEAST STUDENT COLLOQUIUM ON ART...
DEPARTMENT                                                                  NSF
AGENCY                                                                      NSF
IC_CENTER                                                                   NaN
PROJECT_NUMBER                                                          1036017
PROJECT_START_DATE                                                    5/15/2010
PROJECT_END_DATE                        

In [28]:
'_' < "a"

True

In [39]:
terms[587800:587900]

['intelligencecapability',
 'intelligenceframework',
 'intelligenceoutperform',
 'intelligences',
 'intelligencesimulation',
 'intelligent',
 'intelligent_robots_systems_iros',
 'intelligent_tire',
 'intelligent_transportation',
 'intelligent_transportation_systems',
 'intelligent_tutor',
 'intelligent_tutoring',
 'intelligent_tutoring_systems',
 'intelligentcognitive',
 'intelligentdimension',
 'intelligentdisplay',
 'intelligentelectrochemical',
 'intelligenthearing',
 'intelligently',
 'intelligentlyanalyze',
 'intelligentlydesigned',
 'intelligentlyengineer',
 'intelligentpatient',
 'intelligentreminders',
 'intelligentresearch',
 'intelligentrobotics',
 'intelligentsensor',
 'intelligentsepsis',
 'intelligentshared',
 'intelligentsia',
 'intelligentsystem',
 'intelligenttechnological',
 'intelligenttherapeutic',
 'intelligenttherapy',
 'intelligenttransportation',
 'intelligenttutoring',
 'intelligentvehicle',
 'intelligenza',
 'intelligibiity',
 'intelligibility',
 'intelligibili

In [None]:
# artificial_intelligence',
# 'artificial_intelligence_ai',
# 'artificial_intelligence_machine_learning',
# artificialintelligence
# artificially_intelligent

# 'artificial_intelligence_aaai',
# 'artificial_intelligence_ijcai',
# 'artificialintelligence_ai',
# 'artificialintelligent',

#'intelligence_ai'

## TF-IDF Document-Term Matrix

This approach is similar to Literal Term Matching using frequency counts in the document-term matrix.  However, instead of using frequency counts, the entries of the document-term matrix are weighted using TF-IDF.

In [36]:
# Find doc-term matrix using TF-IDF weighting

tf_idf_vectorizer = TfidfVectorizer(lowercase=True, stop_words=stop_words, min_df=20)
tf_idf = tf_idf_vectorizer.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [37]:
tf_idf_terms = tf_idf_vectorizer.get_feature_names()  # these terms are the same as the terms created from the 
                                                      # frequency count document-term matrix, so we do not need to
                                                      # recreate the query vector.

In [38]:
tf_idf_terms == terms

True

In [39]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

tf_idf_scores = tf_idf.dot(q)

In [40]:
sum(tf_idf_scores >0)   # how many abstracts include at least one of the query words

776

In [41]:
# sort scores in descending order

tf_idf_scores_sorted = np.sort(tf_idf_scores)[::-1]
tf_idf_scores_sorted[:10]

array([0.54210581, 0.53685946, 0.40581146, 0.39018629, 0.37190372,
       0.34989933, 0.34834546, 0.34467873, 0.33076975, 0.31210131])

In [42]:
tfidf_idx, tfidf_top_abstracts = return_top_abstracts(docs, tf_idf_scores, -1)  # CHANGE NUMBER OF TOP DOCS RETURNED

[293012  10300 493912 235101 581817  51183 531638 292485 378772 202240]


In [43]:
tfidf_top_abstracts

293012    grant participation undergraduate student hold...
10300     graduate student attend workshop organize conj...
493912    unique interdisciplinary team computer scienti...
235101    phd student artificial_intelligence opportunit...
581817    live maintain dimensional shape embryonic poor...
                                ...                        
490365    intellectual_merit self assembly individual ar...
636454    sbir ii create scalable virtual learning assis...
622262    anxiety disorder common psychiatric disorder y...
528444    human brain currently powerful processor man r...
498517    human infant confront world fill ambiguity fea...
Length: 500, dtype: object

In [48]:
tfidf_top_abstracts.iloc[497]

'anxiety disorder common psychiatric disorder youth lifetime prevalence range general population anxiety disorder social anxiety disorder sad youth short impairment likelihood substance_abuse limited academic achievement attenuate occupational impaired miss social relationship emerge social skill formal peer generalization session homework_assignment efficacy element skill generalization element peer generalization homework_assignment difficult implement traditional clinical setting limit optimal dissemination youth need setting eg school outpatient facility recenly complete sttr validate interactive virtual environment ve solve need intensive behavioral practice opportunity skill generalization ve pegasys vr intensive practice social skill need formal peer clinic solution intensive parental involvement home solution indicate implement ve environment set accessible credible feasible parent clinician child participate examination indicate statistically improvement sad symptom success ne

## Latent Semantic Indexing (LSI) Approach

LSI Uses the TF-IDF matrix.  LSI is a tecnique that utilizes a truncated Singular Value Decomposition of the document-term matrix.  Basically, LSI still returns relevant documents to the query; however some of the documents returned may not include the exact search terms!  LSI is finding the latent or hidden relationships in the terms.  

In [49]:
# Find the Truncated SVD of the TF-IDF matrix

lsa = TruncatedSVD(n_components=500, random_state=1)  # CHANGE THE NUMBER OF COMPONENTS - NOTE: MORE COMPONENTS 
                                                      # GIVES YOU A MORE ACCURATE APPROXIMATION OF THE DOC-TERM 
                                                      # MATRIX, BUT IS ALSO MORE EXPENSIVE AND MAY NOT LEAD TO THE 
                                                      # BEST INFO RETRIEVAL RESULTS.
USigma = lsa.fit_transform(tf_idf)
Vtrans = lsa.components_

In [50]:
# transform query to be in same space as documents

q = q.reshape(1,-1)
qhat = lsa.transform(q)

In [51]:
print(qhat.shape)
print(USigma.shape)
print(Vtrans.shape)

(1, 500)
(690814, 500)
(500, 93578)


In [52]:
lsa_scores = pairwise_distances(qhat, USigma, metric='cosine', n_jobs=20)  # CHANGE N_JOBS TO BE NUMBER OF CORES - 1

In [53]:
lsa_scores.shape

(1, 690814)

In [54]:
lsa_scores

array([[0.99133602, 0.98604572, 0.9832503 , ..., 0.95342406, 0.97755251,
        1.00443007]])

In [55]:
lsa_scores[0]

array([0.99133602, 0.98604572, 0.9832503 , ..., 0.95342406, 0.97755251,
       1.00443007])

In [56]:
sum(lsa_scores[0] > 0)  # how many abstracts scored above 0

690814

In [57]:
# sort scores in descending order

lsa_scores_sorted = np.sort(lsa_scores[0])[::-1]
lsa_scores_sorted[:10]

array([1.14581175, 1.13709072, 1.13687709, 1.13399649, 1.13048978,
       1.12849743, 1.12779594, 1.12625484, 1.12600067, 1.12432865])

In [58]:
lsa_idx, lsa_top_abstracts = return_top_abstracts(docs, lsa_scores[0], 500)  # CHANGE NUMBER OF TOP DOCS RETURNED

[521029 423786 306306 689195 286505 541903 290054 485773 156876 421426]


In [59]:
lsa_top_abstracts

521029    background hivcontinuestobeapressingpublicheal...
423786    award pi postdoctoral supervision dr meers opp...
306306    wave instability neutral dynamo windy windy wa...
689195    math anxiety disproportionately feel woman wom...
286505    evidence practice ebps increasingly implement ...
                                ...                        
350515    administrative adm strong consistent scientifi...
345974    administrative adm strong consistent scientifi...
601476    listener combine auditory spatial binaural_cue...
490137    biomedical infrastructure biomedical health he...
107885    barrier limit exchange nutrient organism water...
Length: 500, dtype: object

In [68]:
lsa_top_abstracts.iloc[100]

'purpose qualitative quantitative safety_tolerability arikace placebo'

## Create pandemics corpus

We use the results of our three information retrieval techniques to create a new, smaller corpus that only contains abstracts relevant to the query. 

In [103]:
docs_ix = np.concatenate([f_idx, idx]) #tfidf_idx]) #, lsa_idx])

In [104]:
docs_idx = np.unique(docs_ix)

In [105]:
docs_idx.shape

(983,)

In [72]:
#lim_docs = [tokens[i] for i in docs_idx]

**create case-study corpuses**

In [106]:
ai_corpus = df.loc[docs_idx, :]

In [107]:
ai_corpus.shape

(983, 40)

In [92]:
#ai_corpus.to_pickle("./ai_corpus.pkl")

In [108]:
lim_docs = ai_corpus["final_frqwds_removed"]

In [109]:
# input needed for LDA, NMF (all from Scikit-Learn) is one string per document (not a list of strings)

text = []

for token_list in lim_docs:
    text.append(" ".join(token_list))

In [110]:
len(lim_docs)

983

## Topic Modeling with relevant pandemic abstracts

In [111]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
        

In [112]:
# Create a TF-IDF document-term matrix for the pandemics corpus 

# TRY DIFFERENT PARAMETERS IN THE TF-IDF DOC-TERM MATRIX SET-UP
nmf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=3, lowercase=True) #, max_features=int(len(lim_docs)/2))

nmf_tf_idf = nmf_vectorizer.fit_transform(text)

In [113]:
nmf_tf_idf.shape

(983, 4575)

In [114]:
# topic modeling with NMF

nmf_model = NMF(n_components=30, random_state=1)  # TRY DIFFERENT NUMBERS OF TOPICS
W = nmf_model.fit_transform(nmf_tf_idf)
H = nmf_model.components_

In [115]:
print_topics(nmf_model, nmf_vectorizer, 10)


Topic 0:
('student', 4.335909739666965)
('teacher', 1.509393141768934)
('assessment', 0.9903665447854408)
('reu', 0.9338575933603623)
('school', 0.9002331305445779)
('undergraduate', 0.8133692927150127)
('writing', 0.6977814444129545)
('science', 0.683545392370001)
('summer', 0.6331649985215962)
('course', 0.6294736552184661)

Topic 1:
('doctoral', 0.9016190799488903)
('conference', 0.7023612891211701)
('consortium', 0.6522387602303537)
('student', 0.621216650497578)
('international', 0.31419941629679143)
('participation', 0.27779084833626644)
('participant', 0.27413291634371595)
('career', 0.24542863357250755)
('travel', 0.24514402449292355)
('mentor', 0.2446137303974457)

Topic 2:
('patient', 1.6060208535521074)
('clinical', 0.7098923767707945)
('care', 0.701236389939995)
('health', 0.48025674862465845)
('healthcare', 0.2932686605274337)
('provider', 0.2829204942693648)
('nurse', 0.2801699764530682)
('medical', 0.25447451725416687)
('medication', 0.222507494128966)
('clinician', 0.1

In [126]:
# next step - look at documents containing topics. ex) like breast cancer (topic 17)

b_cancer_docs = W[:, 22]

In [127]:
sum(b_cancer_docs > 0) 

314

In [128]:
max_score = max(b_cancer_docs)
max_score

0.41751745497347914

In [129]:
b_cancer_docs[0:50]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00241766, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.01645926,
       0.        , 0.01659395, 0.00395674, 0.00144645, 0.        ,
       0.05852488, 0.        , 0.        , 0.00039138, 0.        ,
       0.00990916, 0.00176791, 0.        , 0.        , 0.00355259,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.03259026, 0.0022023 ,
       0.00289476, 0.03592832, 0.00386881, 0.00439733, 0.00685444])

In [138]:
print(lim_docs.iloc[25])

['drug', 'lengthy', 'expensive', 'undertaking', 'failure', 'drug', 'exceed', 'successful', 'drug', 'cover', 'cost', 'failure', 'prescription', 'drug', 'price', 'escalate', 'alarming', 'sign', 'stop', 'need', 'successful', 'drug', 'cover', 'failure', 'mean', 'pharmaceutical_company', 'primarily', 'devote', 'pursue', 'drug', 'candidate', 'population', 'company', 'earn', 'return_investment', 'small', 'portion', 'populace', 'nearly', 'oncology', 'cardiovascular', 'immunology', 'practice', 'usually', 'modeling', 'small', 'molecule', 'try', 'adapt', 'biologic', 'scientist', 'prediction', 'feasibility', 'optimal', 'drug', 'property', 'waste', 'pursue', 'chance', 'clinical', 'reimburse', 'payor', 'apply', 'biomath', 'value', 'question', 'middle', 'drug', 'pipeline', 'couple', 'quantitative', 'pharmacology', 'performance', 'computing', 'sophisticated', 'mathematical', 'algorithm', 'prove', 'predict', 'optimal', 'drug', 'property', 'enter', 'clinic', 'past', 'offer', 'pharma_biotech', 'alike', '

In [130]:
idx = np.where(b_cancer_docs == max_score)

In [131]:
idx

(array([275, 809]),)

In [125]:
b_cancer_docs[230:250]

array([0.00108239, 0.        , 0.0053626 , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00339288, 0.        , 0.        , 0.        ,
       0.46861872, 0.        , 0.32673669, 0.        , 0.        ])

In [139]:
print(lim_docs.iloc[25]) # breast cancer topic with AI component

['drug', 'lengthy', 'expensive', 'undertaking', 'failure', 'drug', 'exceed', 'successful', 'drug', 'cover', 'cost', 'failure', 'prescription', 'drug', 'price', 'escalate', 'alarming', 'sign', 'stop', 'need', 'successful', 'drug', 'cover', 'failure', 'mean', 'pharmaceutical_company', 'primarily', 'devote', 'pursue', 'drug', 'candidate', 'population', 'company', 'earn', 'return_investment', 'small', 'portion', 'populace', 'nearly', 'oncology', 'cardiovascular', 'immunology', 'practice', 'usually', 'modeling', 'small', 'molecule', 'try', 'adapt', 'biologic', 'scientist', 'prediction', 'feasibility', 'optimal', 'drug', 'property', 'waste', 'pursue', 'chance', 'clinical', 'reimburse', 'payor', 'apply', 'biomath', 'value', 'question', 'middle', 'drug', 'pipeline', 'couple', 'quantitative', 'pharmacology', 'performance', 'computing', 'sophisticated', 'mathematical', 'algorithm', 'prove', 'predict', 'optimal', 'drug', 'property', 'enter', 'clinic', 'past', 'offer', 'pharma_biotech', 'alike', '

In [140]:
ai_corpus["ABSTRACT"].iloc[25]

'\ufeff   DESCRIPTION (provided by applicant):  Drug development is a very lengthy and expensive undertaking. Failure rate for novel drugs exceeds 95%. Therefore, successful drugs must cover the costs of these failures. As such, prescription drug prices have escalated at an alarming rate and show no signs of stopping. The need for successful drugs to cover failures also means that pharmaceutical companies primarily devote resources to pursuing drug candidates that have a large enough population to allow the company to earn a return on its investment. Thus, diseases that affect only a small portion of the populace are not investigated nearly as much as, say, oncology, cardiovascular or immunology. Current practice usually involves taking modeling techniques developed for small molecule research and trying to adapt them to biologics. However, this approach, more often than not, does not provide the scientist with predictions around feasibility and optimal drug properties, resulting in wa

In [141]:
ai_corpus.iloc[25]

index                                                                     20611
original index                                                           891756
PROJECT_ID                                                               933594
ABSTRACT                      ﻿   DESCRIPTION (provided by applicant):  Drug...
FY                                                                         2017
PROJECT_TERMS                 absorption; Address; Adoption; Affect; Algorit...
PROJECT_TITLE                 A QUANTITATIVE SYSTEMS PHARMACOLOGY SOFTWARE P...
DEPARTMENT                                                                  HHS
AGENCY                                                                      NIH
IC_CENTER                                                                 NIGMS
PROJECT_NUMBER                                                  5R44GM116214-02
PROJECT_START_DATE                                                     5/1/2016
PROJECT_END_DATE                        

In [137]:
ai_corpus.iloc[809]

index                                                                    628125
original index                                                           844332
PROJECT_ID                                                               819994
ABSTRACT                      Social “big data” holds information with wide-...
FY                                                                         2016
PROJECT_TERMS                 Address; Affect; AIDS prevention; Algorithms; ...
PROJECT_TITLE                 MINING REAL-TIME SOCIAL MEDIA BIG DATA TO MONI...
DEPARTMENT                                                                  HHS
AGENCY                                                                      NIH
IC_CENTER                                                                 NIAID
PROJECT_NUMBER                                                1R56AI125105-01A1
PROJECT_START_DATE                                                     9/1/2016
PROJECT_END_DATE                        

In [238]:
# TRY TOPIC MODELING WITH LDA

# create document-term matrix

lda_vectorizer = CountVectorizer(max_df=1.0, min_df=3, lowercase=True)
lda_dtm = lda_vectorizer.fit_transform(text)

In [239]:
# create model

num_topics = 30
lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=39, random_state = 0)
doc_top_dist = lda_model.fit_transform(lda_dtm)
top_term_dist = lda_model.components_

In [240]:
print_topics(lda_model, lda_vectorizer, 10)


Topic 0:
('protein', 225.55502749264147)
('receptor', 183.95759402950594)
('cell', 172.4504639973329)
('bind', 108.84685928736633)
('structural', 83.23407993256878)
('virus', 81.06300054201091)
('viral', 75.5561411680632)
('interaction', 68.26001146683492)
('human', 67.85457954982031)
('host', 64.82284755171038)

Topic 1:
('subset', 81.29454728313978)
('cd4', 66.23840025065098)
('memory', 62.37910850799191)
('effector', 40.5848650803486)
('viral', 40.11634209137843)
('cell', 36.6079502522241)
('rna', 36.412992443476384)
('orf', 35.0999999999997)
('protection', 33.610323276130394)
('protein', 29.212279854306338)

Topic 2:
('cell', 202.03669046303705)
('death', 163.99491597475762)
('protein', 84.11074125989704)
('human', 76.15368948404043)
('receptor', 61.47717954877514)
('cycle', 58.31210018295787)
('autophagy', 45.205793172633825)
('cellular', 39.855071437383906)
('virus', 39.49494201595987)
('play', 34.583390658500534)

Topic 3:
('virus', 503.9476289736178)
('viral', 325.232153571429