In [63]:
import pandas as pd

def read_csv_column(csv_path, column_name):
    df = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)
    return df[column_name].tolist()

def read_csv_with_weight(csv_path):
    df = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)
    # 转发数,评论数,点赞数
    comments = df['评论数'].tolist()
    reposts = df['转发数'].tolist()
    likes = df['点赞数'].tolist()
    weibos = df['微博正文'].tolist()
    results = []

    for row in range(len(weibos)):
        if comments[row] + reposts[row] + likes[row] > 10:
            results.append(weibos[row])

    return results


In [64]:
contents = read_csv_with_weight('./trump/preprocessed_v2.csv' )

new_list = []

for content in contents:
    content = content.split()
    new_list.append(content)


In [65]:
from gensim.corpora import Dictionary

'''
Create a dictionary from 'processed_docs' containing the number of times a word appears
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''

dictionary = Dictionary(new_list)

dictionary.filter_extremes(no_below=5, no_above=0.85, keep_n=10000)

'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in new_list]

document_num = 20
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0],
                                                     dictionary[bow_doc_x[i][0]],
                                                     bow_doc_x[i][1]))

Word 61 ("总统") appears 2 time.
Word 74 ("美国") appears 2 time.
Word 81 ("27号") appears 1 time.
Word 82 ("29号") appears 1 time.
Word 83 ("30号") appears 1 time.
Word 84 ("之间") appears 1 time.
Word 85 ("俄罗斯") appears 4 time.
Word 86 ("公共") appears 1 time.
Word 87 ("公民") appears 1 time.
Word 88 ("关系") appears 2 time.
Word 89 ("军备") appears 1 time.
Word 90 ("原因") appears 1 time.
Word 91 ("反恐") appears 1 time.
Word 92 ("发动") appears 1 time.
Word 93 ("发生") appears 1 time.
Word 94 ("发表") appears 1 time.
Word 95 ("合作") appears 1 time.
Word 96 ("场所") appears 1 time.
Word 97 ("声明") appears 2 time.
Word 98 ("安全局") appears 1 time.
Word 99 ("工作") appears 1 time.
Word 100 ("恐怖") appears 2 time.
Word 101 ("情报") appears 1 time.
Word 102 ("感谢") appears 1 time.
Word 103 ("挫败") appears 1 time.
Word 104 ("控制") appears 1 time.
Word 105 ("提供") appears 2 time.
Word 106 ("新年") appears 1 time.
Word 107 ("普京") appears 3 time.
Word 108 ("期间") appears 2 time.
Word 109 ("未来") appears 1 time.
Word 110 ("状况") appears 

In [66]:
from gensim.models import LdaMulticore

# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# lda_model = gensim.models.LdaModel(bow_corpus,
#                                    num_topics = 10,
#                                    id2word = dictionary,
#                                    passes = 50)

# LDA multicore
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# TODO
lda_model = LdaMulticore(bow_corpus,
                               num_topics = 100,
                               id2word = dictionary,
                               passes = 10,
                               workers = 4)

'''
For each topic, we will explore the words occurring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.085*"喜欢" + 0.038*"国民" + 0.033*"关心" + 0.027*"侄女" + 0.026*"家族" + 0.025*"警卫队" + 0.024*"记录" + 0.021*"超级" + 0.021*"大会" + 0.019*"小布什"


Topic: 1 
Words: 0.162*"第一" + 0.103*"夫人" + 0.086*"总统" + 0.084*"梅拉尼娅" + 0.082*"美国" + 0.033*"有史以来" + 0.023*"视频" + 0.021*"雕像" + 0.015*"福克斯" + 0.014*"堕胎"


Topic: 2 
Words: 0.111*"美国" + 0.083*"TikTok" + 0.058*"公司" + 0.028*"企业" + 0.025*"华为" + 0.022*"政府" + 0.021*"业务" + 0.014*"中国" + 0.014*"芯片" + 0.014*"用户"


Topic: 3 
Words: 0.100*"美元" + 0.046*"法案" + 0.044*"经济" + 0.042*"美国" + 0.038*"刺激" + 0.028*"计划" + 0.024*"签署" + 0.015*"失业" + 0.015*"总统" + 0.012*"威斯康星州"


Topic: 4 
Words: 0.031*"视频" + 0.030*"成立" + 0.029*"L" + 0.027*"官方" + 0.024*"美国" + 0.023*"入院" + 0.023*"北京" + 0.021*"自治区" + 0.018*"重复" + 0.018*"消毒水"


Topic: 5 
Words: 0.082*"中国" + 0.066*"蓬佩奥" + 0.060*"美国" + 0.043*"关系" + 0.041*"贸易" + 0.034*"国务卿" + 0.022*"外交" + 0.017*"政策" + 0.015*"协议" + 0.015*"协定"


Topic: 6 
Words: 0.070*"物资" + 0.034*"美国" + 0.030*"收到" + 0.029*"医疗" + 0.027*"中国" + 0.024*"30日" + 0.024

In [69]:
for i in range(10):
    # trump1_content = read_csv_column('./trump/trump_4.csv', '微博正文')
    index = str(i + 1)
    trump1_content = read_csv_with_weight('./trump/trump_' + index + '.csv')

    item = ''
    for i in range(len(trump1_content)):
        item = item + trump1_content[i]
    unseen_document = item.split()

    bow_vector = dictionary.doc2bow(unseen_document)

    print(index + "月\n")
    for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 20)))

    print("\n\n")

1月

Score: 0.1692439168691635	 Topic: 0.166*"伊朗" + 0.052*"美国" + 0.035*"袭击" + 0.018*"美军" + 0.017*"伊拉克" + 0.016*"苏莱曼尼" + 0.014*"总统" + 0.013*"报复" + 0.012*"暗杀" + 0.012*"国防部长" + 0.011*"国际" + 0.011*"埃斯珀" + 0.010*"制裁" + 0.010*"大使馆" + 0.009*"军事" + 0.009*"1月" + 0.009*"报道" + 0.008*"发出" + 0.008*"发动" + 0.008*"基地"
Score: 0.03634488955140114	 Topic: 0.046*"顾问" + 0.042*"总统" + 0.037*"报道" + 0.031*"美国" + 0.025*"白宫" + 0.022*"人士" + 0.021*"国会" + 0.020*"国家" + 0.018*"博尔顿" + 0.017*"指控" + 0.015*"调查" + 0.014*"情报" + 0.014*"否认" + 0.013*"弹劾" + 0.013*"消息" + 0.012*"知情" + 0.011*"担任" + 0.010*"山报" + 0.010*"高级" + 0.010*"美媒"
Score: 0.03547772765159607	 Topic: 0.127*"英国" + 0.051*"协议" + 0.028*"首相" + 0.024*"约翰逊" + 0.021*"法国" + 0.020*"美国" + 0.018*"政府" + 0.017*"达成" + 0.013*"报道" + 0.012*"公司" + 0.012*"中国" + 0.010*"马克龙" + 0.010*"发动机" + 0.009*"国家" + 0.009*"卫报" + 0.009*"合作" + 0.009*"霸凌" + 0.009*"贸易" + 0.007*"阶段" + 0.007*"英媒"
Score: 0.03242999315261841	 Topic: 0.077*"美国" + 0.067*"军事" + 0.050*"美军" + 0.021*"伊拉克" + 0.018*"武器" + 0.013*