In [6]:
import pandas as pd

def read_csv_column(csv_path, column_name):
    df = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)
    return df[column_name].tolist()

def read_csv_with_weight(csv_path):
    df = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)
    # 转发数,评论数,点赞数
    comments = df['评论数'].tolist()
    reposts = df['转发数'].tolist()
    likes = df['点赞数'].tolist()
    weibos = df['微博正文'].tolist()
    results = []

    for row in range(len(weibos)):
        # times = 1 + int(reposts[row]/100) + int(comments[row]/500) + int(likes[row]/1000)
        if comments[row] + reposts[row] + likes[row] > 10:
            results.append(weibos[row])
        # for time in range(times):
        #     results.append(weibos[row])
    return results


In [7]:
contents = read_csv_with_weight('./trump/preprocessed_v2.csv' )

new_list = []

for content in contents:
    content = content.split()
    new_list.append(content)


In [8]:
from gensim.corpora import Dictionary

'''
Create a dictionary from 'processed_docs' containing the number of times a word appears
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''

dictionary = Dictionary(new_list)
dictionary.filter_extremes(no_below=5, no_above=0.85, keep_n=10000)

'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in new_list]

document_num = 20
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0],
                                                     dictionary[bow_doc_x[i][0]],
                                                     bow_doc_x[i][1]))

Word 61 ("总统") appears 2 time.
Word 74 ("美国") appears 2 time.
Word 81 ("27号") appears 1 time.
Word 82 ("29号") appears 1 time.
Word 83 ("30号") appears 1 time.
Word 84 ("之间") appears 1 time.
Word 85 ("俄罗斯") appears 4 time.
Word 86 ("公共") appears 1 time.
Word 87 ("公民") appears 1 time.
Word 88 ("关系") appears 2 time.
Word 89 ("军备") appears 1 time.
Word 90 ("原因") appears 1 time.
Word 91 ("反恐") appears 1 time.
Word 92 ("发动") appears 1 time.
Word 93 ("发生") appears 1 time.
Word 94 ("发表") appears 1 time.
Word 95 ("合作") appears 1 time.
Word 96 ("场所") appears 1 time.
Word 97 ("声明") appears 2 time.
Word 98 ("安全局") appears 1 time.
Word 99 ("工作") appears 1 time.
Word 100 ("恐怖") appears 2 time.
Word 101 ("情报") appears 1 time.
Word 102 ("感谢") appears 1 time.
Word 103 ("挫败") appears 1 time.
Word 104 ("控制") appears 1 time.
Word 105 ("提供") appears 2 time.
Word 106 ("新年") appears 1 time.
Word 107 ("普京") appears 3 time.
Word 108 ("期间") appears 2 time.
Word 109 ("未来") appears 1 time.
Word 110 ("状况") appears 

In [9]:
from gensim.models import LdaMulticore

# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# lda_model = gensim.models.LdaModel(bow_corpus,
#                                    num_topics = 10,
#                                    id2word = dictionary,
#                                    passes = 50)

# LDA multicore
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# TODO
lda_model = LdaMulticore(bow_corpus,
                               num_topics = 50,
                               id2word = dictionary,
                               passes = 10,
                               workers = 4)

'''
For each topic, we will explore the words occurring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.051*"疫苗" + 0.046*"新冠" + 0.046*"美国" + 0.027*"治疗" + 0.025*"中心" + 0.013*"药物" + 0.013*"疾控" + 0.013*"白宫" + 0.012*"疗法" + 0.012*"研究"


Topic: 1 
Words: 0.133*"印度" + 0.060*"州长" + 0.031*"访问" + 0.024*"美国" + 0.021*"莫迪" + 0.020*"政府" + 0.019*"批评" + 0.018*"感恩" + 0.014*"彭斯" + 0.011*"荷兰"


Topic: 2 
Words: 0.043*"美股" + 0.029*"市场" + 0.024*"股市" + 0.020*"指数" + 0.016*"下跌" + 0.015*"美国" + 0.013*"暴跌" + 0.013*"道指" + 0.010*"刺激" + 0.010*"500"


Topic: 3 
Words: 0.054*"美国" + 0.035*"美军" + 0.027*"导弹" + 0.024*"军事" + 0.013*"军队" + 0.011*"破坏" + 0.010*"人员" + 0.010*"海军" + 0.009*"报道" + 0.009*"总统"


Topic: 4 
Words: 0.060*"美国" + 0.029*"网友" + 0.020*"暴乱" + 0.011*"关系" + 0.010*"世界" + 0.010*"有人" + 0.009*"真的" + 0.008*"搞笑" + 0.006*"吐槽" + 0.006*"民众"


Topic: 5 
Words: 0.093*"拜登" + 0.068*"美国" + 0.066*"大选" + 0.047*"总统" + 0.029*"辩论" + 0.021*"竞选" + 0.021*"民主党" + 0.020*"候选人" + 0.019*"选举" + 0.019*"投票"


Topic: 6 
Words: 0.182*"口罩" + 0.076*"记者" + 0.023*"CNN" + 0.021*"提问" + 0.020*"新闻" + 0.017*"美国" + 0.015*"拒绝" + 0.014*

In [10]:
for i in range(10):
    # trump1_content = read_csv_column('./trump/trump_4.csv', '微博正文')
    index = str(i + 1)
    trump1_content = read_csv_with_weight('./trump/trump_' + index + '.csv')

    item = ''
    for j in range(len(trump1_content)):
        item = item + trump1_content[j]
    unseen_document = item.split()

    bow_vector = dictionary.doc2bow(unseen_document)

    print(index + "月")
    for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 20)))

    print("\n")

1月
Score: 0.2103423774242401	 Topic: 0.123*"伊朗" + 0.054*"美国" + 0.030*"伊拉克" + 0.024*"战争" + 0.019*"袭击" + 0.017*"美军" + 0.012*"制裁" + 0.012*"苏莱曼尼" + 0.012*"组织" + 0.012*"中东" + 0.011*"国际" + 0.011*"下令" + 0.010*"总统" + 0.010*"报复" + 0.010*"阿富汗" + 0.009*"暗杀" + 0.009*"发动" + 0.008*"恐怖" + 0.008*"威胁" + 0.007*"基地"
Score: 0.054512206465005875	 Topic: 0.094*"美国" + 0.038*"关税" + 0.032*"贸易" + 0.028*"协议" + 0.015*"组织" + 0.015*"联合国" + 0.014*"世界" + 0.014*"政府" + 0.013*"加征" + 0.013*"征收" + 0.011*"总统" + 0.011*"进口" + 0.010*"签署" + 0.010*"欧盟" + 0.010*"商品" + 0.009*"谈判" + 0.008*"退出" + 0.008*"世贸" + 0.007*"非法" + 0.007*"产品"
Score: 0.04247905686497688	 Topic: 0.036*"美国" + 0.031*"华为" + 0.015*"技术" + 0.015*"关系" + 0.013*"利益" + 0.013*"芯片" + 0.013*"发展" + 0.012*"企业" + 0.011*"中国" + 0.011*"科技" + 0.009*"领域" + 0.008*"政府" + 0.008*"制裁" + 0.008*"全球" + 0.008*"民主" + 0.006*"政策" + 0.006*"市场" + 0.006*"稀土" + 0.006*"打压" + 0.006*"政治"
Score: 0.04211108013987541	 Topic: 0.144*"中国" + 0.077*"美国" + 0.017*"国家" + 0.015*"世界" + 0.007*"中国人" + 0.006*"政府" +