In [1]:
# -*- coding: utf-8 -*-

# packages to store and manipulate data
import pandas as pd
import numpy as np

# plotting packages
import matplotlib.pyplot as plt
import seaborn as sns

# model building package
import sklearn

# package to clean text
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

In [2]:
# Fetch the NLTK stopword corpus into the local nltk_data directory; the call's
# return value is the cell result. NOTE(review): no visible cell reads this corpus
# (the vectorizer below uses stop_words='english') - confirm it is still needed.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chenanfan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import lda
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn

In [4]:
# Show the n_top_words heaviest words of every topic
def print_top_words(model, feature_names, n_top_words):
    """Print each topic's highest-weighted words, heaviest first.

    Parameters
    ----------
    model : object exposing ``components_``
        ``components_`` is iterated row by row (one row per topic); every row
        must provide ``argsort()``.
    feature_names : sequence of str
        Vocabulary, indexed with the positions returned by ``argsort()``.
    n_top_words : int
        Number of words printed for each topic.

    For every topic a ``Topic #<idx>:`` heading is printed, followed by one
    line of space-separated words. A single blank line is printed after the
    last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending: reverse it, then keep the leading entries.
        best_first = weights.argsort()[::-1][:n_top_words]
        print(" ".join(feature_names[pos] for pos in best_first))
    print()

In [64]:
# Vocabulary size cap, consumed as CountVectorizer(max_features=...) in the next cell.
n_features = 5000

# Load the cleaned tweets; the string 'NA' is passed as a missing-value marker.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df = pd.read_excel("/Users/chenanfan/Aaron_Topic_model/clean_tweet.xlsx", index_col=None, na_values=['NA'])

# Kept as the cell's last expression so the notebook actually displays the shape
# (a bare expression in the middle of a cell is evaluated and silently discarded).
df.shape

In [69]:
# Term counts for the tweet text. Tokens are not lower-cased, accents are stripped,
# English stop words are removed, and the vocabulary is limited to n_features terms
# that occur in at least 10 documents and in at most half of them.
tf_vectorizer = CountVectorizer(lowercase=False,
                                strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english',
                                max_df=0.5,
                                min_df=10)
tf = tf_vectorizer.fit_transform(df.content)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when the installed version has it, otherwise keep the
# legacy call; either way vocab stays a plain list of terms.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    vocab = list(tf_vectorizer.get_feature_names_out())
else:
    vocab = tf_vectorizer.get_feature_names()

# Fit a 15-topic LDA model for 100 iterations; random_state is fixed so the
# run can be repeated with the same seed.
model = lda.LDA(n_topics=15, n_iter=100, random_state=1)
model.fit(tf)
print('model done')

INFO:lda:n_documents: 44489
INFO:lda:vocab_size: 5000
INFO:lda:n_words: 590228
INFO:lda:n_topics: 15
INFO:lda:n_iter: 100
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -6616531
INFO:lda:<10> log likelihood: -4317095
INFO:lda:<20> log likelihood: -4042364
INFO:lda:<30> log likelihood: -3980212
INFO:lda:<40> log likelihood: -3949851
INFO:lda:<50> log likelihood: -3927702
INFO:lda:<60> log likelihood: -3914070
INFO:lda:<70> log likelihood: -3907214
INFO:lda:<80> log likelihood: -3895984
INFO:lda:<90> log likelihood: -3889617
INFO:lda:<99> log likelihood: -3885366


model done


In [70]:
# Topic-word distribution taken from the fitted model
topic_word = model.topic_word_
print("shape: {}".format(topic_word.shape))
print(vocab[:3])
print(topic_word[:, :3])
# Sanity check: print the sum of each topic's word weights.
# The topic count comes from the matrix itself instead of a hard-coded 15, so the
# cell keeps working when n_topics is changed in the fitting cell.
for n in range(topic_word.shape[0]):
    sum_pr = topic_word[n, :].sum()
    print("topic: {} sum: {}".format(n, sum_pr))

shape: (15, 5000)
['aabpbpzu', 'aan', 'aaya']
[[5.82954413e-07 5.82954413e-07 5.82954413e-07]
 [2.17822214e-07 2.17822214e-07 2.17822214e-07]
 [1.91673695e-07 1.91673695e-07 1.91673695e-07]
 [1.66378278e-07 1.66378278e-07 1.66378278e-07]
 [2.75451741e-07 2.75451741e-07 2.75451741e-07]
 [1.64209005e-07 1.64209005e-07 1.64209005e-07]
 [9.36140930e-04 2.75254611e-07 2.75254611e-07]
 [4.18532625e-07 4.18532625e-07 4.18532625e-07]
 [2.75687150e-07 2.75687150e-07 2.75687150e-07]
 [2.88001843e-07 2.88001843e-07 2.88001843e-07]
 [3.20523094e-07 3.20523094e-07 3.20523094e-07]
 [2.12318733e-07 2.12318733e-07 3.61154165e-04]
 [2.24789822e-07 2.24789822e-07 2.24789822e-07]
 [3.39190014e-07 1.18750424e-03 3.39190014e-07]
 [2.86098475e-07 2.86098475e-07 2.86098475e-07]]
topic: 0 sum: 0.9999999999999971
topic: 1 sum: 0.9999999999999298
topic: 2 sum: 0.9999999999999679
topic: 3 sum: 0.9999999999998656
topic: 4 sum: 1.0000000000000224
topic: 5 sum: 0.9999999999999072
topic: 6 sum: 1.0000000000000553
to

In [71]:
# Compute the top-N words of each topic
# (numpy is already imported as np in the notebook's first cell.)
n = 10  # number of words listed per topic
# Convert the vocabulary once; it is identical for every topic.
vocab_arr = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so walk backwards from the end to get the n largest weights.
    top_idx = np.argsort(topic_dist)[:-(n + 1):-1]
    topic_words = vocab_arr[top_idx]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))

*Topic 0
- ไวร coronavirus ncov นธ ใหม สโคโรน สโคโรนาสายพ สโคโรนา กร wuhan
*Topic 1
- coronaviru china coronavirus travel spread medic wuhan outbreak chines hong
*Topic 2
- health emerg declar public intern concern time coronavirus global com
*Topic 3
- china wuhan coronaviru death ncov coronavirus case confirm report outsid
*Topic 4
- travel china pheic coronavirus ncov coronavirusaustralia countri ban declar restrict
*Topic 5
- coronavirus wuhan wuhancoronavirus viru china outbreak peopl pneumonia coronaviru virus
*Topic 6
- ncov coronavirus twitter pic com inform transmiss novel coronaviru infect
*Topic 7
- le coronaviru coronavirus la wuhan twitter pic com en chine
*Topic 8
- coronaviru wuhan coronavirus china like look taiwan govern market virolog
*Topic 9
- wuhan coronavirus china india die evacu case student der indian
*Topic 10
- coronaviru zhong nanshan expert china day sar respiratori say wuhan
*Topic 11
- twitter pic com china coronaviru coronavirus viru flu wuhan outbreak
*

In [72]:
# Document-topic distribution taken from the fitted model
# Empty accumulator; the export cell appends one topic index per document to it.
topicList = list()
doc_topic = model.doc_topic_
print("type(doc_topic):", type(doc_topic))
print("shape:", doc_topic.shape)

type(doc_topic): <class 'numpy.ndarray'>
shape: (44489, 15)


In [73]:
# Export every document's dominant topic to an .xlsx file
# Row-wise argmax gives the index of the largest topic weight for each document.
# The list is rebuilt here rather than appended to a list created in an earlier
# cell, so re-running this cell no longer grows it past len(df) (which made the
# column assignment below fail with a length mismatch).
topicList = doc_topic.argmax(axis=1).tolist()
topicdf = pd.DataFrame(topicList)
df['topic'] = topicList
# NOTE(review): the file name says "20topics" while the model above is fit with
# n_topics=15 - confirm the intended name.
df.to_excel('Coronavirus-20topics-10words.xlsx', sheet_name='Sheet1')
print('done')

done
