In [13]:
import pandas as pd

def read_csv_column(csv_path, column_name):
    df = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)
    return df[column_name].tolist()

In [14]:
import gensim

contents = read_csv_column('./trump/processed_trump.csv', '微博正文')


new_list = []

for content in contents:
    content = content.split()
    new_content = []
    for i in content:
        if len(i) != 1:
            new_content.append(i)
    new_list.append(new_content)


In [15]:
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''

dictionary = gensim.corpora.Dictionary(new_list)

dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)

'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in new_list]

document_num = 20
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0],
                                                     dictionary[bow_doc_x[i][0]],
                                                     bow_doc_x[i][1]))

Word 143 ("关注") appears 1 time.
Word 250 ("30日") appears 1 time.
Word 251 ("BBC") appears 1 time.
Word 252 ("专家") appears 1 time.
Word 253 ("今年") appears 1 time.
Word 254 ("凝视") appears 1 time.
Word 255 ("可笑") appears 1 time.
Word 256 ("回应") appears 1 time.
Word 257 ("多次") appears 1 time.
Word 258 ("大会") appears 1 time.
Word 259 ("天下") appears 1 time.
Word 260 ("对话") appears 1 time.
Word 261 ("少女") appears 1 time.
Word 262 ("当时") appears 1 time.
Word 263 ("接受") appears 1 time.
Word 264 ("攻击") appears 1 time.
Word 265 ("显然") appears 1 time.
Word 266 ("来说") appears 1 time.
Word 267 ("根本") appears 1 time.
Word 268 ("格里塔") appears 5 time.
Word 269 ("正常") appears 1 time.
Word 270 ("死亡") appears 1 time.
Word 271 ("气候") appears 1 time.
Word 272 ("浪费") appears 2 time.
Word 273 ("焦点") appears 1 time.
Word 274 ("狠狠") appears 1 time.
Word 275 ("环保") appears 1 time.
Word 276 ("瑞典") appears 1 time.
Word 277 ("生活") appears 1 time.
Word 278 ("看来") appears 1 time.
Word 279 ("科学家") appears 1 time.
Word

In [16]:
# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# lda_model = gensim.models.LdaModel(bow_corpus,
#                                    num_topics = 10,
#                                    id2word = dictionary,
#                                    passes = 50)

# LDA multicore
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# TODO
lda_model =  gensim.models.LdaMulticore(bow_corpus,
                                   num_topics = 10,
                                   id2word = dictionary,
                                   passes = 10,
                                   workers = 3)

'''
For each topic, we will explore the words occurring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.031*"肺炎" + 0.030*"死亡" + 0.025*"病例" + 0.023*"疫苗" + 0.016*"人数" + 0.013*"福奇" + 0.012*"中心" + 0.012*"专家" + 0.010*"数据" + 0.010*"可能"


Topic: 1 
Words: 0.012*"朗普" + 0.011*"知道" + 0.009*"没有" + 0.008*"网友" + 0.006*"川普" + 0.006*"现在" + 0.006*"觉得" + 0.006*"世界" + 0.006*"记者" + 0.005*"看看"


Topic: 2 
Words: 0.027*"美元" + 0.015*"市场" + 0.013*"字节" + 0.013*"跳动" + 0.012*"经济" + 0.012*"关税" + 0.008*"刺激" + 0.007*"人民网" + 0.007*"指数" + 0.007*"黄金"


Topic: 3 
Words: 0.013*"可能" + 0.012*"问题" + 0.010*"国家" + 0.010*"没有" + 0.008*"围观" + 0.008*"已经" + 0.008*"世界" + 0.008*"现在" + 0.007*"经济" + 0.006*"政治"


Topic: 4 
Words: 0.024*"白宫" + 0.020*"报道" + 0.017*"当地" + 0.015*"TikTok" + 0.014*"竞选" + 0.010*"集会" + 0.010*"媒体" + 0.010*"纽约" + 0.009*"活动" + 0.007*"政府"


Topic: 5 
Words: 0.023*"疗法" + 0.021*"发烧" + 0.021*"巴马" + 0.020*"医生" + 0.014*"举报" + 0.009*"失职" + 0.009*"治疗" + 0.009*"见闻" + 0.008*"环球" + 0.008*"出现"


Topic: 6 
Words: 0.062*"哈哈哈" + 0.017*"外交部" + 0.014*"国际" + 0.013*"美方" + 0.013*"伊朗" + 0.013*"国家" + 0.011*"蓬佩奥" + 0.

In [25]:
trump1_content = read_csv_column('./trump/trump_10.csv', '微博正文')

# unseen_document = []
# for trump1_content in trump1_contents:
#     unseen_document = unseen_document + trump1_content.split()
unseen_document = []
item = ''
for i in range(len(trump1_content)):
    item = item + trump1_content[i]
unseen_document_raw = item.split()
for i in unseen_document_raw:
    if len(i) != 1:
        unseen_document.append(i)

bow_vector = dictionary.doc2bow(unseen_document)

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 20)))

Score: 0.20569856464862823	 Topic: 0.069*"确诊" + 0.041*"检测" + 0.033*"夫妇" + 0.029*"感染" + 0.026*"口罩" + 0.026*"阳性" + 0.021*"接受" + 0.020*"症状" + 0.019*"朗普" + 0.017*"治疗" + 0.014*"白宫" + 0.013*"抗体" + 0.013*"医院" + 0.010*"鸡尾酒" + 0.010*"助理" + 0.010*"出现" + 0.010*"良好" + 0.010*"隔离" + 0.009*"医疗" + 0.008*"康复"
Score: 0.14996275305747986	 Topic: 0.013*"可能" + 0.012*"问题" + 0.010*"国家" + 0.010*"没有" + 0.008*"围观" + 0.008*"已经" + 0.008*"世界" + 0.008*"现在" + 0.007*"经济" + 0.006*"政治" + 0.006*"认为" + 0.005*"政府" + 0.004*"连任" + 0.004*"回答" + 0.004*"政策" + 0.004*"这种" + 0.004*"希望" + 0.004*"全球" + 0.004*"中美" + 0.004*"价值"
Score: 0.14765140414237976	 Topic: 0.024*"白宫" + 0.020*"报道" + 0.017*"当地" + 0.015*"TikTok" + 0.014*"竞选" + 0.010*"集会" + 0.010*"媒体" + 0.010*"纽约" + 0.009*"活动" + 0.007*"政府" + 0.007*"接受" + 0.007*"抗议" + 0.007*"举行" + 0.006*"联邦" + 0.006*"记者" + 0.006*"进行" + 0.005*"华盛顿" + 0.005*"梅拉尼娅" + 0.005*"顾问" + 0.005*"采访"
Score: 0.10504915565252304	 Topic: 0.012*"朗普" + 0.011*"知道" + 0.009*"没有" + 0.008*"网友" + 0.006*"川普" + 0.006*"现在" + 