In [3]:
import pandas as pd

def read_csv_column(csv_path, column_name):
    df = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)
    return df[column_name].tolist()

def read_csv_with_weight(csv_path):
    df = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)
    # 转发数,评论数,点赞数
    comments = df['评论数'].tolist()
    reposts = df['转发数'].tolist()
    likes = df['点赞数'].tolist()
    weibos = df['微博正文'].tolist()
    results = []

    for row in range(len(weibos)):
        # times = 1 + int(reposts[row]/100) + int(comments[row]/500) + int(likes[row]/1000)
        if comments[row] + reposts[row] + likes[row] > 10:
            results.append(weibos[row])
        # for time in range(times):
        #     results.append(weibos[row])
    return results


In [4]:
contents = read_csv_with_weight('./trump/preprocessed_v2.csv' )

new_list = []

for content in contents:
    content = content.split()
    new_list.append(content)


In [5]:
from gensim.corpora import Dictionary

'''
Create a dictionary from 'processed_docs' containing the number of times a word appears
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''

dictionary = Dictionary(new_list)
dictionary.filter_extremes(no_below=5, no_above=0.85, keep_n=10000)

'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in new_list]

document_num = 20
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0],
                                                     dictionary[bow_doc_x[i][0]],
                                                     bow_doc_x[i][1]))

Word 61 ("总统") appears 2 time.
Word 74 ("美国") appears 2 time.
Word 81 ("27号") appears 1 time.
Word 82 ("29号") appears 1 time.
Word 83 ("30号") appears 1 time.
Word 84 ("之间") appears 1 time.
Word 85 ("俄罗斯") appears 4 time.
Word 86 ("公共") appears 1 time.
Word 87 ("公民") appears 1 time.
Word 88 ("关系") appears 2 time.
Word 89 ("军备") appears 1 time.
Word 90 ("原因") appears 1 time.
Word 91 ("反恐") appears 1 time.
Word 92 ("发动") appears 1 time.
Word 93 ("发生") appears 1 time.
Word 94 ("发表") appears 1 time.
Word 95 ("合作") appears 1 time.
Word 96 ("场所") appears 1 time.
Word 97 ("声明") appears 2 time.
Word 98 ("安全局") appears 1 time.
Word 99 ("工作") appears 1 time.
Word 100 ("恐怖") appears 2 time.
Word 101 ("情报") appears 1 time.
Word 102 ("感谢") appears 1 time.
Word 103 ("挫败") appears 1 time.
Word 104 ("控制") appears 1 time.
Word 105 ("提供") appears 2 time.
Word 106 ("新年") appears 1 time.
Word 107 ("普京") appears 3 time.
Word 108 ("期间") appears 2 time.
Word 109 ("未来") appears 1 time.
Word 110 ("状况") appears 

In [6]:
from gensim.models import LdaMulticore

# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# lda_model = gensim.models.LdaModel(bow_corpus,
#                                    num_topics = 10,
#                                    id2word = dictionary,
#                                    passes = 50)

# LDA multicore
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# TODO
lda_model = LdaMulticore(bow_corpus,
                               num_topics = 50,
                               id2word = dictionary,
                               passes = 10,
                               workers = 4)

'''
For each topic, we will explore the words occurring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.046*"美股" + 0.031*"股市" + 0.030*"熔断" + 0.016*"首场" + 0.016*"指数" + 0.014*"稀土" + 0.013*"A股" + 0.013*"下跌" + 0.013*"美国" + 0.011*"市场"


Topic: 1 
Words: 0.095*"美国" + 0.076*"TikTok" + 0.021*"业务" + 0.019*"公司" + 0.014*"报道" + 0.013*"用户" + 0.012*"微软" + 0.012*"政府" + 0.011*"消息" + 0.011*"信息"


Topic: 2 
Words: 0.065*"美国" + 0.056*"病毒" + 0.047*"疫情" + 0.027*"新冠" + 0.024*"冠状" + 0.020*"中心" + 0.018*"新型" + 0.016*"中国" + 0.014*"肺炎" + 0.014*"政府"


Topic: 3 
Words: 0.082*"美国" + 0.021*"中国" + 0.021*"世界" + 0.020*"国家" + 0.010*"俄罗斯" + 0.008*"联合国" + 0.008*"政治" + 0.008*"武器" + 0.008*"政府" + 0.007*"国际"


Topic: 4 
Words: 0.109*"微信" + 0.062*"交易" + 0.055*"禁止" + 0.038*"美国" + 0.036*"命令" + 0.035*"禁令" + 0.035*"45" + 0.034*"行政" + 0.032*"天后" + 0.029*"回应"


Topic: 5 
Words: 0.065*"美国" + 0.057*"关税" + 0.051*"CNN" + 0.028*"加拿大" + 0.019*"加征" + 0.019*"新闻" + 0.019*"征收" + 0.018*"时间" + 0.017*"新闻网" + 0.017*"有线"


Topic: 6 
Words: 0.054*"协议" + 0.034*"美国" + 0.027*"以色列" + 0.024*"和平" + 0.024*"计划" + 0.021*"暴乱" + 0.018*"塞尔维亚" 

In [7]:
for i in range(10):
    # trump1_content = read_csv_column('./trump/trump_4.csv', '微博正文')
    index = str(i + 1)
    trump1_content = read_csv_with_weight('./trump/trump_' + index + '.csv')

    item = ''
    for j in range(len(trump1_content)):
        item = item + trump1_content[j]
    unseen_document = item.split()

    bow_vector = dictionary.doc2bow(unseen_document)

    print(index + "月")
    for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 20)))

    print("\n")

1月
Score: 0.22044894099235535	 Topic: 0.079*"伊朗" + 0.053*"美国" + 0.036*"美军" + 0.029*"军事" + 0.019*"伊拉克" + 0.018*"导弹" + 0.015*"袭击" + 0.011*"战争" + 0.011*"总统" + 0.010*"报道" + 0.010*"逮捕" + 0.009*"国际" + 0.008*"组织" + 0.008*"武器" + 0.008*"基地" + 0.008*"制裁" + 0.008*"苏莱曼尼" + 0.007*"威胁" + 0.007*"中东" + 0.007*"军队"
Score: 0.062435656785964966	 Topic: 0.064*"中国" + 0.059*"美国" + 0.027*"关系" + 0.016*"政策" + 0.014*"外交" + 0.009*"贸易" + 0.009*"蓬佩奥" + 0.008*"战略" + 0.007*"脱钩" + 0.006*"经济" + 0.006*"合作" + 0.006*"政府" + 0.006*"文章" + 0.006*"政治" + 0.006*"之间" + 0.006*"贸易战" + 0.005*"国务卿" + 0.005*"阶段" + 0.005*"利益" + 0.005*"竞争"
Score: 0.05021282657980919	 Topic: 0.082*"美国" + 0.021*"中国" + 0.021*"世界" + 0.020*"国家" + 0.010*"俄罗斯" + 0.008*"联合国" + 0.008*"政治" + 0.008*"武器" + 0.008*"政府" + 0.007*"国际" + 0.006*"军事" + 0.006*"全世界" + 0.006*"发展" + 0.005*"西方" + 0.005*"拥有" + 0.005*"强大" + 0.005*"战略" + 0.005*"社会" + 0.004*"全球" + 0.004*"利益"
Score: 0.044830091297626495	 Topic: 0.047*"佩洛西" + 0.034*"总统" + 0.033*"众议院" + 0.028*"美国" + 0.025*"国会" + 0.022