# Information Retrieval - For RShiny Dashboard  
This notebook retrieves abstracts relevant to pandemics and then uses topic modeling to analyze the chosen abstracts.  Three info retrieval techniques are used: Literal Term Matching, TF-IDF, and Latent Semantic Indexing.  These are linear algebra techniques.  
We use the Scikit-Learn library.

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# pull in entire dataframe

df = pd.read_pickle("/home/kjl5t/dspg20RnD/data/final/final_dataset_7-20.pkl")

df.reset_index(inplace = True)
#df.rename(columns={'index':'original index'}, inplace=True)

In [3]:
df.head()

Unnamed: 0,index,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,...,working_abstract,Start_Char,nchar,LAST_CHAR,lemma_abstract,clean_lemmas,stopwds_removed,n_grams_added,final_tokens,final_frqwds_removed
0,0,17608,152242,The multiprotein complex y-secretase proteolyt...,2008,Active Sites; Affect; Alzheimer's Disease; Amy...,STRUCTURE OF SIGNAL PEPTIDE PEPTIDASE,HHS,NIH,,...,The multiprotein complex y-secretase proteolyt...,T,1402,g,"[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y-secretase, proteolyt...","[multiprotein, complex, y_secretase, proteolyt...","[multiprotein, y_secretase, proteolytically_cl..."
1,1,111864,190316,DESCRIPTION (provided by applicant): The Kis...,2008,Affect; Animal Model; Axon; Behavior; Behavior...,ROLE OF KISS1 NEURONS IN THE SEASONAL AND CIRC...,HHS,NIH,,...,The Kissl gene encodes peptides called kisspep...,T,2553,y,"[Kissl, gene, encode, peptide, call, kisspepti...","[kissl, gene, encode, peptide, call, kisspepti...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin...","[kissl, gene, encode, peptide, kisspeptin, bin..."
2,2,22052,154213,DESCRIPTION (provided by applicant): The objec...,2008,Agreement; Antibodies; base; Binding; Biochemi...,CARBONIC ANHYDRASE AS A MODEL TO UNDERSTAND DI...,HHS,NIH,,...,The objective of this research is to understan...,T,1414,e,"[objective, research, be, understand, biophysi...","[objective, research, be, understand, biophysi...","[objective, research, understand, biophysical,...","[objective, research, understand, biophysical,...","[objective, research, understand, biophysical,...","[biophysical, basis, thermodynamics_kinetic, m..."
3,3,35004,159362,Obesity is the cause of many adverse pregnancy...,2008,African; Analysis of Variance; Asians; Birth; ...,OBESITY ON VAGAL TONE AND HBA1C DURING PREGNANCY,HHS,NIH,,...,Obesity is the cause of many adverse pregnancy...,O,1545,d,"[obesity, cause, many, adverse, pregnancyoutco...","[obesity, cause, many, adverse, pregnancyoutco...","[obesity, cause, adverse, pregnancyoutcome, re...","[obesity, cause, adverse_pregnancyoutcome, res...","[obesity, cause, adverse_pregnancyoutcome, res...","[obesity, adverse_pregnancyoutcome, great, hea..."
4,4,371628,594482,Local potato advisory groups have expressed in...,2010,cost; Health; interest; Manure; Parasitic nema...,PLANT-PARASITIC NEMATODE MANAGEMENT AS A COMPO...,USDA,NIFA,,...,Local potato advisory groups have expressed in...,L,271,s,"[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, group, express, inte...","[local, potato, advisory, express, interest, m..."


In [11]:
# input needed for doc-term matrix creation in Scikit-Learn is one string per document (not a list of strings).  
# Original data is already in this form, but not if we search by the tokens instead of the original abstract.

#docs = df["ABSTRACT"] 
tokens = df["final_frqwds_removed"]

docs = []

for abstract in tokens:
    docs.append(" ".join(abstract))
    
docs = pd.Series(docs)

In [4]:
# to save time/storage -- only read in list of final tokens, ie. df["final_frqwds_removed"] - DO NOT USE
# creates a problem when indexing since this produces a list for tokens rather than a series
'''
f = open('../topic_model_tuning/coherence_vars20.sav', 'rb')

[corpus, id2word, tokens] = pickle.load(f)
f.close()

# corpus - word frequency in docs
# id2word - dictionary
# tokens - df["final_frqwds_removed"]
'''

In [5]:
# input needed for doc-term matrix creation in Scikit-Learn is one string per document (not a list of strings).  
''' 
docs = []

for abstract in tokens:
    docs.append(" ".join(abstract))
'''

## Functions needed for all info retrieval approaches

In [14]:
# Create query vector 

def create_query(words, terms):
    
    # words: search query words
    # terms: terms in corpus
    
    q = np.zeros(len(terms))  # number of terms

    idx = []
    for word in query_words:
        idx.append(terms.index(word))

    q[idx] = 1
    
    return q

In [15]:
def return_top_abstracts(docs, scores, top_n):
    
    '''
    docs: Series that contains abstract
    scores: scores of abstracts
    top_n: return the top_n abstracts given by idx, if top_n = -1 return all abstracts
    '''
    # sort scores in descending order
    scores_sorted_idx = np.argsort(scores)[::-1]
    
    if top_n == -1:
        n = sum(scores > 0)
        ix = scores_sorted_idx[:n]
    else:
        ix = scores_sorted_idx[:top_n]
    
    print(ix[0:10])
    
    return ix, docs[ix]
    

In [16]:
def create_result_df(abstracts, scores):
    
    df = pd.DataFrame()
    df["abstracts"] = abstracts
    df["scores"] = scores
    
    return df

## Literal Term Matching - Frequency Count Document-Term Matrix

This will return all abstracts in the corpus with exact word matches to the query.  

Results will be return in sorted order of how high the query scores with each abstract. A high score means more occurences of the query words in the abstract.

In [17]:
# Note - we are now using the spaCy stopwords list instead of nltk.  It is more comprehensive.

def create_stopwords():
      
    """ creates list of stopwords. stop words include the general English list and any additional we see sneaking 
    through.  """
    
    spacy_stop_words = STOP_WORDS

    # more stop words that do not add meaning to topics
    additional_stopwords = {'addition', 'specifically', 'similar','including', 'particular', 
                            'furthermore','include', 'includes','overall', 'finally', 'specific', 
                            'additional'} 
           
    sw = spacy_stop_words.union(additional_stopwords)
    
    return sw

In [18]:
# Create document-term matrix based on count frequencies

stop_words = create_stopwords()

vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words, min_df=20)
doc_term_matrix = vectorizer.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [19]:
doc_term_matrix.shape

(690814, 93578)

In [20]:
terms = vectorizer.get_feature_names()

### Create Query Words - list the search terms

A query is just a list of words to search for in the corpus.  We will use the same query for all three info retrieval techniques.

In [21]:
'artificial_intelligence' in terms

True

In [31]:
'machine_learning' in terms

True

In [32]:
'machine_learn' in terms

False

In [22]:
# CHANGE QUERY WORDS HERE

query_words = ['artificial_intelligence'] #'ai', 'artificial', 'intelligence' 

# coronavirus, 'mers', 'sars'] # 'pandemic', mers_cov', 'sars_cov']   #'mers', 'sars', 'zikv', 'denv', 'hiv'] #'influenza', 'aids'] 
              
q = create_query(query_words, terms)

In [23]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

f_scores = doc_term_matrix.dot(q)

In [24]:
sum(f_scores >0)  # how many abstracts include at least one of the query words

776

In [25]:
# sort scores in descending order

f_scores_sorted = np.sort(f_scores)[::-1]
f_scores_sorted[:10]

array([6., 5., 5., 4., 4., 4., 4., 4., 4., 3.])

In [27]:
f_idx, f_top_abstracts = return_top_abstracts(docs, f_scores, 500)  # CHANGE NUMBER OF TOP DOCS RETURNED

[493912  10300  83585 293012  19512 429296 689629 637044 689591 290517]


In [28]:
f_top_abstracts

493912    unique interdisciplinary team computer scienti...
10300     graduate student attend workshop organize conj...
83585     eager award education collaboration kindle_mat...
293012    grant participation undergraduate student hold...
19512     aaai artificial_intelligence interactive digit...
                                ...                        
520598    range contemporary emerge technology far reach...
196879    nih interdisciplinary environment bring multid...
569741    modify date modify cs george lee abstract prog...
524959    radical shift computing paradigm neuro_inspire...
386530    renewal biomedical_informatics grow significan...
Length: 500, dtype: object

In [35]:
f_top_abstracts.iloc[498]

'radical shift computing paradigm neuro_inspired computing attractive intensive image speech recognition neuro inspire architecture leverage distribute computation neuron node localize storage synaptic element neuron node today generally implement silicon transistor crossbar_array synaptic element silicon neuron power_hungry area_inefficient parallelism computing context single device efficiently emulate neuronal eg integrate fire neuromorphic_hardware exploit metal_insulator transition phenomenon strongly correlate oxide compact neuron node self oscillate oxide neuron overcome aforemention limitation silicon neuron profound society embrace artificial_intelligence instance compact neuromorphic_hardware enable intelligent processing power efficient mobile platform eg autonomous_vehicle personalize healthcare wearable device smart sensor education integration train undergraduate graduate student generation workforce interdisciplinary skill cross layer nature range engineering semiconduct

## TF-IDF Document-Term Matrix

This approach is similar to Literal Term Matching using frequency counts in the document-term matrix.  However, instead of using frequency counts, the entries of the document-term matrix are weighted using TF-IDF.

In [36]:
# Find doc-term matrix using TF-IDF weighting

tf_idf_vectorizer = TfidfVectorizer(lowercase=True, stop_words=stop_words, min_df=20)
tf_idf = tf_idf_vectorizer.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [37]:
tf_idf_terms = tf_idf_vectorizer.get_feature_names()  # these terms are the same as the terms created from the 
                                                      # frequency count document-term matrix, so we do not need to
                                                      # recreate the query vector.

In [38]:
tf_idf_terms == terms

True

In [39]:
# calculate the score for each document against the query. Docs with more occurences of the query words 
# will score higher

tf_idf_scores = tf_idf.dot(q)

In [40]:
sum(tf_idf_scores >0)   # how many abstracts include at least one of the query words

776

In [41]:
# sort scores in descending order

tf_idf_scores_sorted = np.sort(tf_idf_scores)[::-1]
tf_idf_scores_sorted[:10]

array([0.54210581, 0.53685946, 0.40581146, 0.39018629, 0.37190372,
       0.34989933, 0.34834546, 0.34467873, 0.33076975, 0.31210131])

In [42]:
tfidf_idx, tfidf_top_abstracts = return_top_abstracts(docs, tf_idf_scores, 500)  # CHANGE NUMBER OF TOP DOCS RETURNED

[293012  10300 493912 235101 581817  51183 531638 292485 378772 202240]


In [43]:
tfidf_top_abstracts

293012    grant participation undergraduate student hold...
10300     graduate student attend workshop organize conj...
493912    unique interdisciplinary team computer scienti...
235101    phd student artificial_intelligence opportunit...
581817    live maintain dimensional shape embryonic poor...
                                ...                        
490365    intellectual_merit self assembly individual ar...
636454    sbir ii create scalable virtual learning assis...
622262    anxiety disorder common psychiatric disorder y...
528444    human brain currently powerful processor man r...
498517    human infant confront world fill ambiguity fea...
Length: 500, dtype: object

In [48]:
tfidf_top_abstracts.iloc[497]

'anxiety disorder common psychiatric disorder youth lifetime prevalence range general population anxiety disorder social anxiety disorder sad youth short impairment likelihood substance_abuse limited academic achievement attenuate occupational impaired miss social relationship emerge social skill formal peer generalization session homework_assignment efficacy element skill generalization element peer generalization homework_assignment difficult implement traditional clinical setting limit optimal dissemination youth need setting eg school outpatient facility recenly complete sttr validate interactive virtual environment ve solve need intensive behavioral practice opportunity skill generalization ve pegasys vr intensive practice social skill need formal peer clinic solution intensive parental involvement home solution indicate implement ve environment set accessible credible feasible parent clinician child participate examination indicate statistically improvement sad symptom success ne

## Latent Semantic Indexing (LSI) Approach

LSI Uses the TF-IDF matrix.  LSI is a tecnique that utilizes a truncated Singular Value Decomposition of the document-term matrix.  Basically, LSI still returns relevant documents to the query; however some of the documents returned may not include the exact search terms!  LSI is finding the latent or hidden relationships in the terms.  

In [49]:
# Find the Truncated SVD of the TF-IDF matrix

lsa = TruncatedSVD(n_components=500, random_state=1)  # CHANGE THE NUMBER OF COMPONENTS - NOTE: MORE COMPONENTS 
                                                      # GIVES YOU A MORE ACCURATE APPROXIMATION OF THE DOC-TERM 
                                                      # MATRIX, BUT IS ALSO MORE EXPENSIVE AND MAY NOT LEAD TO THE 
                                                      # BEST INFO RETRIEVAL RESULTS.
USigma = lsa.fit_transform(tf_idf)
Vtrans = lsa.components_

In [50]:
# transform query to be in same space as documents

q = q.reshape(1,-1)
qhat = lsa.transform(q)

In [51]:
print(qhat.shape)
print(USigma.shape)
print(Vtrans.shape)

(1, 500)
(690814, 500)
(500, 93578)


In [52]:
lsa_scores = pairwise_distances(qhat, USigma, metric='cosine', n_jobs=20)  # CHANGE N_JOBS TO BE NUMBER OF CORES - 1

In [53]:
lsa_scores.shape

(1, 690814)

In [54]:
lsa_scores

array([[0.99133602, 0.98604572, 0.9832503 , ..., 0.95342406, 0.97755251,
        1.00443007]])

In [55]:
lsa_scores[0]

array([0.99133602, 0.98604572, 0.9832503 , ..., 0.95342406, 0.97755251,
       1.00443007])

In [56]:
sum(lsa_scores[0] > 0)  # how many abstracts scored above 0

690814

In [57]:
# sort scores in descending order

lsa_scores_sorted = np.sort(lsa_scores[0])[::-1]
lsa_scores_sorted[:10]

array([1.14581175, 1.13709072, 1.13687709, 1.13399649, 1.13048978,
       1.12849743, 1.12779594, 1.12625484, 1.12600067, 1.12432865])

In [58]:
lsa_idx, lsa_top_abstracts = return_top_abstracts(docs, lsa_scores[0], 500)  # CHANGE NUMBER OF TOP DOCS RETURNED

[521029 423786 306306 689195 286505 541903 290054 485773 156876 421426]


In [59]:
lsa_top_abstracts

521029    background hivcontinuestobeapressingpublicheal...
423786    award pi postdoctoral supervision dr meers opp...
306306    wave instability neutral dynamo windy windy wa...
689195    math anxiety disproportionately feel woman wom...
286505    evidence practice ebps increasingly implement ...
                                ...                        
350515    administrative adm strong consistent scientifi...
345974    administrative adm strong consistent scientifi...
601476    listener combine auditory spatial binaural_cue...
490137    biomedical infrastructure biomedical health he...
107885    barrier limit exchange nutrient organism water...
Length: 500, dtype: object

In [68]:
lsa_top_abstracts.iloc[100]

'purpose qualitative quantitative safety_tolerability arikace placebo'

## Create pandemics corpus

We use the results of our three information retrieval techniques to create a new, smaller corpus that only contains abstracts relevant to the query. 

In [87]:
docs_ix = np.concatenate([f_idx, tfidf_idx]) #, lsa_idx])

In [88]:
docs_idx = np.unique(docs_ix)

In [89]:
docs_idx.shape

(661,)

In [72]:
#lim_docs = [tokens[i] for i in docs_idx]

**create case-study corpuses**

In [90]:
ai_corpus = df.loc[docs_idx, :]

In [91]:
ai_corpus.shape

(661, 40)

In [92]:
#ai_corpus.to_pickle("./ai_corpus.pkl")

In [93]:
lim_docs = ai_corpus["final_frqwds_removed"]

In [94]:
# input needed for LDA, NMF (all from Scikit-Learn) is one string per document (not a list of strings)

text = []

for token_list in lim_docs:
    text.append(" ".join(token_list))

In [95]:
len(lim_docs)

661

## Topic Modeling with relevant pandemic abstracts

In [96]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
        

In [97]:
# Create a TF-IDF document-term matrix for the pandemics corpus 

# TRY DIFFERENT PARAMETERS IN THE TF-IDF DOC-TERM MATRIX SET-UP
nmf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=3, lowercase=True) #, max_features=int(len(lim_docs)/2))

nmf_tf_idf = nmf_vectorizer.fit_transform(text)

In [98]:
nmf_tf_idf.shape

(661, 3537)

In [103]:
# topic modeling with NMF

nmf_model = NMF(n_components=30, random_state=1)  # TRY DIFFERENT NUMBERS OF TOPICS
W = nmf_model.fit_transform(nmf_tf_idf)
H = nmf_model.components_

In [104]:
print_topics(nmf_model, nmf_vectorizer, 10)


Topic 0:
('robot', 2.9276555431026523)
('robotics', 1.8796799753240636)
('planning', 1.3957262296727975)
('robotic', 0.9263724100777656)
('environment', 0.5895251796196265)
('human', 0.5321522585489054)
('autonomous', 0.5262712301936743)
('execution', 0.5087147277827263)
('sensor', 0.4999294818395091)
('manufacturing', 0.4834539125793396)

Topic 1:
('student', 1.1983127483998974)
('reu', 0.4400906665204058)
('graduate', 0.3372826043302055)
('faculty', 0.334001298311603)
('summer', 0.3303538555777053)
('mentor', 0.3024311421821423)
('undergraduate', 0.29963604825409723)
('science', 0.28998847836336)
('school', 0.24223020110358717)
('rusis', 0.23107958190465744)

Topic 2:
('melanoma', 1.0048244609218435)
('spore', 0.6415026256853423)
('skin', 0.4814985220707861)
('cancer', 0.4711648935169887)
('yale', 0.41760852379198254)
('modifier', 0.3670705669140861)
('epigenetic', 0.3670705669140861)
('specimen', 0.3447241921186862)
('basal', 0.2557584887001398)
('therapy', 0.24913391894604944)

To

In [108]:
# next step - look at documents containing topics like breast cancer (topic 17)

b_cancer_docs = W[:, 17]

In [111]:
sum(b_cancer_docs > 0) 

186

In [121]:
max_score = max(b_cancer_docs)

In [115]:
b_cancer_docs[0:20]

array([0.14184447, 0.14186073, 0.        , 0.        , 0.        ,
       0.00243191, 0.01463379, 0.        , 0.        , 0.        ,
       0.        , 0.0077163 , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [120]:
print(lim_docs.iloc[2])

['literature', 'causal', 'statistical', 'inference', 'causal', 'channel', 'interest', 'net', 'direct', 'causal', 'block', 'variable', 'represent', 'net', 'useful', 'learn', 'way', 'causally', 'policy', 'purpose', 'evaluation', 'evident', 'importance', 'latest', 'theoretical', 'definition', 'identification', 'estimation', 'place', 'economics', 'introduce', 'net', 'economics', 'employ', 'familiar', 'econometric', 'language', 'ii', 'literature', 'causal', 'partial', 'point', 'identification', 'nonparametric_parametric', 'assignment', 'heterogeneous', 'iii', 'relevant', 'simulation', 'guide', 'future', 'apply', 'employ', 'framework', 'neyman', '1923', 'rubin', '1974', 'related', 'concept', 'principal', 'stratification', 'frangakis_rubin_2002', 'introduce', 'economics', 'concept', 'net', 'average', 'nate', 'average', 'mate', 'decompose', 'total', 'average', 'eat', 'related', 'concept', 'previously', 'introduce', 'robins', 'greenland', '1992', 'pearl', '2001', 'second', 'nonparametric', 'par

In [123]:
idx = np.where(b_cancer_docs == max_score)

In [124]:
idx

(array([245]),)

In [125]:
b_cancer_docs[230:250]

array([0.00108239, 0.        , 0.0053626 , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00339288, 0.        , 0.        , 0.        ,
       0.46861872, 0.        , 0.32673669, 0.        , 0.        ])

In [130]:
print(lim_docs.iloc[245]) # breast cancer topic with AI component

['clinical', 'decision', 'individual', 'patient', 'society', 'makinggood', 'healthcare', 'decision', 'paramount', 'task', 'decisionsupport', 'utilize', 'clinical', 'feature', 'genomic', 'profile', 'breast', 'cancer', 'patient', 'toassist', 'physician', 'integrate', 'patient', 'diagnostic', 'subtype', 'tumor', 'stage', 'andgrade', 'age', 'comorbidity', 'therapeutic', 'patient', 'traditional', 'clinical', 'increasingly', 'available', 'electronic', 'unprecedentedlyabundant', 'genomic', 'available', 'advanced', 'sequencing', 'technology', 'generation', 'sequencing', 'patient', 'genomic', 'likely', 'available', 'patientsin', 'foreseeable_future', 'source', 'opportunity', 'newgeneration', 'clinical', 'decision', 'achieve', 'substantial', 'progress', 'currentlypossible', 'sheer', 'magnitude', 'variable', 'million', 'present', 'formidable', 'computational', 'modeling', 'challenge', 'integrate', 'heterogeneous', 'informationin', 'clinical', 'dataset', 'genomic', 'dataset', 'present', 'arduous',

In [131]:
ai_corpus["ABSTRACT"].iloc[245]

'Critical clinical activities involve decision making. For both individual patients and for society at large, makinggood healthcare decisions is a paramount task. The objective of this research is to develop a novel decisionsupport system that utilizes both the clinical features and the genomic profile of a breast cancer patient toassist the physician in integrating information about a specific patient (diagnostic subtype, tumor stage andgrade, age, comorbidities) to make therapeutic plans for the patient. Traditional clinical data are becoming increasingly available in electronic form. Unprecedentedlyabundant genomic data are available to researchers as the results of advanced sequencing technologies suchas next generation sequencing. Patient-specific genomic data are likely to become available for most patientsin the foreseeable future. These sources of data provide significant opportunities for developing newgeneration clinical decision support systems that can achieve substantial p

In [238]:
# TRY TOPIC MODELING WITH LDA

# create document-term matrix

lda_vectorizer = CountVectorizer(max_df=1.0, min_df=3, lowercase=True)
lda_dtm = lda_vectorizer.fit_transform(text)

In [239]:
# create model

num_topics = 30
lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=39, random_state = 0)
doc_top_dist = lda_model.fit_transform(lda_dtm)
top_term_dist = lda_model.components_

In [240]:
print_topics(lda_model, lda_vectorizer, 10)


Topic 0:
('protein', 225.55502749264147)
('receptor', 183.95759402950594)
('cell', 172.4504639973329)
('bind', 108.84685928736633)
('structural', 83.23407993256878)
('virus', 81.06300054201091)
('viral', 75.5561411680632)
('interaction', 68.26001146683492)
('human', 67.85457954982031)
('host', 64.82284755171038)

Topic 1:
('subset', 81.29454728313978)
('cd4', 66.23840025065098)
('memory', 62.37910850799191)
('effector', 40.5848650803486)
('viral', 40.11634209137843)
('cell', 36.6079502522241)
('rna', 36.412992443476384)
('orf', 35.0999999999997)
('protection', 33.610323276130394)
('protein', 29.212279854306338)

Topic 2:
('cell', 202.03669046303705)
('death', 163.99491597475762)
('protein', 84.11074125989704)
('human', 76.15368948404043)
('receptor', 61.47717954877514)
('cycle', 58.31210018295787)
('autophagy', 45.205793172633825)
('cellular', 39.855071437383906)
('virus', 39.49494201595987)
('play', 34.583390658500534)

Topic 3:
('virus', 503.9476289736178)
('viral', 325.232153571429