## Haritha Guttikonda (hg5mn@virginia.edu)
#### DS 5001
#### 28 April 2020



In [36]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
from nltk.corpus import stopwords 

In [37]:
%matplotlib inline

In [38]:
def convert_corpus(docs):
    
    global TOKEN, VOCAB
    
    # Convert docs into tokens
    stop_words = set(stopwords.words('english')) 
    tokens = []
    for i, doc in enumerate(docs):
        for j, token in enumerate(doc.split()):
            term_str = re.sub(r'[\W_]+', '', token).lower()
            if term_str not in stop_words:
                tokens.append((i, j, term_str))
    TOKEN = pd.DataFrame(tokens, columns=['doc_id','token_num','term_str'])\
        .set_index(['doc_id','token_num'])
    
    # Extract vocabulary
    VOCAB = TOKEN.term_str.value_counts().to_frame().reset_index()
    VOCAB.columns = ['term_str', 'n']
    VOCAB.index.name = 'term_id'
    TOKEN['term_id'] = TOKEN.term_str.map(VOCAB.reset_index().set_index('term_str').term_id)
    

In [39]:
def init_model():
    
    global n_topics, n_docs, n_words, TOKEN, BOW, TOPIC, THETA, PHI, topic_names

    # Extract BOW from TOKEN
    BOW = TOKEN.groupby(['doc_id', 'term_id']).term_id.count()\
        .to_frame().rename(columns={'term_id':'n'})
    
    # Normalize n
    # May want normalize n to binary or log form
    
    # Create TOPIC table
    TOPIC = pd.DataFrame(index=range(n_topics))
    TOPIC.index.name = 'topic_id'
    topic_names = TOPIC.index.tolist()
    
    # Randomly assign topics to words (word = term in BOW)
    BOW['topic_id'] = TOPIC.sample(BOW.shape[0], replace=True).index
    
    # Generete topic-doc count matrix
    THETA = BOW.groupby(['topic_id', 'doc_id']).n.sum()\
        .unstack().fillna(0).astype('int')

    # Generate term-topic matrix (aka word-topic)
    PHI = BOW.groupby(['term_id', 'topic_id']).n.sum()\
        .unstack().fillna(0).astype('int')        

    # Get doc and word counts
    n_docs = THETA.shape[1]
    n_words = PHI.shape[0]
  

In [40]:
def gibbs_sample(d, w):
    
    global n_topics, n_docs, n_words, BOW, PHI, THETA, alpha, beta, topic_names
    
    # Get current topic for word in doc
    z1 = BOW.at[(d, w), 'topic_id']
    
    # Get the number of tokens
    n = BOW.at[(d, w), 'n']
    
    # Remove current assignment from the counts
    PHI.at[w, z1] -= 1
    THETA.at[z1, d] -= 1

    # Sample from the two count matrices
    weights = np.zeros(n_topics)
    for t in topic_names:
        
        # How much the topic likes the word
        P = (PHI.at[w, t] + alpha) / (PHI[t].sum() + n_words * alpha)
        
        # How much the document likes the topic
        T = (THETA.at[t, d] + beta) / (THETA[d].sum() + n_topics * beta)
        
        # Append result
        weights[t] = P * T
            
    # Draw new topic value from weighted list of topics
    pwgt = weights / weights.sum()
    z2 = np.random.choice(topic_names, p=pwgt)
    
    # Apply the value of the word to the topic  
    PHI.at[w, z2] += n
    THETA.at[z2, d] += n
        
    # Update the topic assignment
    BOW.at[(d , w), 'topic_id'] = z2

In [47]:
def show_results():
    
    global VOCAB, TOPIC, THETA, PHI, topic_names, docs
        
    P = PHI[topic_names] / PHI[topic_names].sum()
    
    P['term_str'] = VOCAB.term_str 
    for t in topic_names:
        top_terms = P.sort_values(t, ascending=False).head(10)[[t, 'term_str']]
        print('-' * 80)
        print(top_terms)
        # Add a string to TOPIC.loc[t, 'top_terms']
    
    print('-' * 80)
    
    DOC = THETA.T
    DOC['doc'] = docs
    print(DOC.head(1000))
    
    print('-' * 80)
    return DOC

In [48]:
data = pd.read_csv('../dataset/processed-files/consolidated.csv')
data = data[data['is_covid19'] == True]
data

Unnamed: 0,paper_id,text,journal,publish_year,is_covid19,section_num,publish_month
17,24a61244c69f09bc9392bbbb4a97e881ee68dc5e,African swine fever virus (ASFV) is a dsDNA vi...,Cell Discov,2020,True,0,4
35,12332e3ee5f8253ed8db253e1acb48ea3bc8f27f,OBJECTIVE: According to the WHO coronavirus di...,BMC Res Notes,2020,True,0,4
289,655b175741b74ae8c98f93fd449c88a147bee96e,"INTRODUCTION: In the beginning of 2020, an une...",Arch Acad Emerg Med,2020,True,0,3
443,188f3e97042155ac1709aa8b74c0755760c3b50d,BACKGROUND: Cancer and transplant patients wit...,Ecancermedicalscience,2020,True,0,3
456,80993091f576dc7fdbec10552b45b4af5eec2b8b,The ongoing COVID-19 epidemic continues to spr...,J Clin Med,2020,True,0,2
...,...,...,...,...,...,...,...
14555,14591bae9b4cf2cfdbd409c0dec9106a13298297,There were 200 people who were initially invit...,Med Sci Monit,2020,True,1,3
14697,8793343683237029bac548225ba51f403489cd35,Although the published parent studies enrolled...,BMJ Open,2015,True,1,3
14731,4b36607cdbc54f8006161a9a1839489dd0a51269,The majority of study participants were males ...,Int Health,2020,True,1,2
14797,606233835c3d6d195b7d230745ccb0fded626aa7,To obtain a general profile of the case distri...,Chin Med J (Engl),2020,True,1,1


In [50]:
docs = data.text.tolist()

In [51]:
n_topics = 5
n_iters = 100
alpha = .1
beta = .1

In [52]:
convert_corpus(docs)
init_model()
for i in tqdm(range(n_iters)):
    BOW.apply(lambda x: gibbs_sample(x.name[0], x.name[1]), 1) 

100%|██████████| 100/100 [1:01:40<00:00, 37.01s/it]


In [53]:
DOC = show_results()

--------------------------------------------------------------------------------
topic_id         0  term_str
term_id                     
0         0.102015          
3         0.028879         p
28        0.020698         e
34        0.017263         n
73        0.014494         l
50        0.009472   sarscov
31        0.008129   january
132       0.008007     water
24        0.007539  sarscov2
84        0.007143  positive
--------------------------------------------------------------------------------
topic_id         1  term_str
term_id                     
2         0.035521     cases
1         0.028803  patients
13        0.027034         c
0         0.018594          
6         0.014292       fig
16        0.013938      risk
91        0.013390         f
63        0.013207         h
120       0.010709  endpoint
17        0.010407  reported
--------------------------------------------------------------------------------
topic_id         2   term_str
term_id                      
4

In [56]:
DOC.to_csv('../dataset/processed-files/Topic_Distribution.csv')

# Using gensim

In [25]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [26]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to /Users/haritha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/haritha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [29]:
import random
text_data = []
with open('../dataset/processed-files/consolidated.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            #print(tokens)
            text_data.append(tokens)

In [30]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [31]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.008*"model" + 0.007*"sample" + 0.007*"result" + 0.006*"patient"')
(1, '0.011*"group" + 0.010*"virus" + 0.010*"infection" + 0.010*"figure"')
(2, '0.016*"virus" + 0.014*"cell" + 0.011*"infection" + 0.008*"show"')
(3, '0.020*"cell" + 0.016*"protein" + 0.008*"level" + 0.008*"figure"')
(4, '0.014*"figure" + 0.008*"show" + 0.007*"result" + 0.007*"peptide"')


In [35]:
topics

[(0, '0.008*"model" + 0.007*"sample" + 0.007*"result" + 0.006*"patient"'),
 (1, '0.011*"group" + 0.010*"virus" + 0.010*"infection" + 0.010*"figure"'),
 (2, '0.016*"virus" + 0.014*"cell" + 0.011*"infection" + 0.008*"show"'),
 (3, '0.020*"cell" + 0.016*"protein" + 0.008*"level" + 0.008*"figure"'),
 (4, '0.014*"figure" + 0.008*"show" + 0.007*"result" + 0.007*"peptide"')]

#### visualize with pyldavis

In [32]:
#pip install pyLDAvis

In [33]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [34]:
pyLDAvis.save_html(lda_display, 'lda.html')