In [39]:
import pandas as pd

def read_csv_column(csv_path, column_name):
    """Read a UTF-8 encoded CSV file and return one of its columns as a list.

    Args:
        csv_path: Path of the CSV file to read.
        column_name: Header of the column to extract.

    Returns:
        The values of ``column_name`` in file order, as a plain Python list.
    """
    frame = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)
    selected = frame[column_name]
    return selected.tolist()


def _bonus_copies(count, step):
    """Return the extra copies earned by one engagement count (one per full ``step``)."""
    # A blank CSV cell is loaded as NaN and int(NaN) raises ValueError,
    # so a missing count is treated as "no engagement".
    if pd.isna(count):
        return 0
    return int(count / step)


def read_csv_with_weight(csv_path, repost_step=100, comment_step=500, like_step=1000):
    """Load Weibo posts from a CSV and repeat each one according to its engagement.

    Every post is split on whitespace into a token list and emitted
    ``1 + reposts // repost_step + comments // comment_step + likes // like_step``
    times, so that heavily shared posts carry more weight in later modelling.

    Args:
        csv_path: Path of a UTF-8 CSV containing the columns
            '微博正文' (post text), '转发数' (reposts), '评论数' (comments)
            and '点赞数' (likes).
        repost_step: Number of reposts that earns one extra copy (must be non-zero).
        comment_step: Number of comments that earns one extra copy (must be non-zero).
        like_step: Number of likes that earns one extra copy (must be non-zero).

    Returns:
        A list of token lists, one entry per (repeated) post, in file order.
        Each entry is its own list object.

    Notes:
        Rows whose post text is missing are skipped, because there is nothing to
        tokenise. Missing engagement counts contribute no extra copies.
    """
    df = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)

    # Engagement columns: comments, reposts, likes; then the post body itself.
    comments = df['评论数'].tolist()
    reposts = df['转发数'].tolist()
    likes = df['点赞数'].tolist()
    weibos = df['微博正文'].tolist()

    weighted_docs = []
    for text, repost, comment, like in zip(weibos, reposts, comments, likes):
        # An empty text cell is loaded as NaN (a float), which has no .split().
        if pd.isna(text):
            continue

        copies = (
            1
            + _bonus_copies(repost, repost_step)
            + _bonus_copies(comment, comment_step)
            + _bonus_copies(like, like_step)
        )

        # Tokenise once per post, then emit independent copies of the token list.
        tokens = text.split()
        weighted_docs.extend(list(tokens) for _ in range(copies))

    return weighted_docs

In [40]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaMulticore


In [41]:
# Source CSV of posts; loaded as engagement-weighted, whitespace-tokenised documents.
TRUMP_CSV_PATH = './trump/trump01.csv'
trump_content = read_csv_with_weight(TRUMP_CSV_PATH)

In [42]:
# Build the token <-> id vocabulary from the tokenised posts.
dictionary = Dictionary(trump_content)

# Prune the vocabulary: drop tokens seen in fewer than 15 documents or in more
# than half of them, then keep at most the 3000 most frequent survivors.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=3000)

# Encode every post as a bag-of-words vector over the pruned vocabulary.
bow_corpus = list(map(dictionary.doc2bow, trump_content))

In [43]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = TfidfModel(corpus=bow_corpus)

# Re-weight the whole corpus with the fitted model; this is what the LDA cell trains on.
corpus_tfidf = tfidf[bow_corpus]

In [44]:
# Train a 30-topic LDA model on the TF-IDF weighted corpus.
# random_state pins the model's random initialisation so a Restart & Run All
# does not produce a different set of topics each time.
# NOTE(review): with several workers the result may still vary slightly between
# runs — confirm if exact reproducibility is required.
lda_model_tfidf = LdaMulticore(
    corpus_tfidf,
    num_topics=30,
    id2word=dictionary,
    passes=10,
    workers=4,
    random_state=42,
)

# Persist the trained model so later cells can reload it without retraining.
# Assumes the ./model directory already exists.
lda_model_tfidf.save('./model/lda_models_tfidf_1.lda')

In [45]:
# Reload the model saved by the training cell.
lda_model = LdaMulticore.load('./model/lda_models_tfidf_1.lda')

# Flatten all posts into one corpus-wide pseudo-document.
# A single pass avoids the quadratic cost of repeatedly rebuilding the list
# with `item = item + doc`.
item = [token for doc in trump_content for token in doc]

bow_vector = dictionary.doc2bow(item)

# NOTE(review): the model was trained on TF-IDF weighted vectors (corpus_tfidf)
# but is queried here with raw counts — confirm this mismatch is intended.
# Print the topics of the pseudo-document from highest to lowest score,
# each with its 10 top-weighted words.
for index, score in sorted(lda_model[bow_vector], key=lambda tup: tup[1], reverse=True):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.10087643563747406	 Topic: 0.016*"普京" + 0.013*"中国" + 0.010*"美国" + 0.009*"电话" + 0.009*"披露" + 0.009*"亲信" + 0.009*"痛斥" + 0.008*"台湾" + 0.008*"错过" + 0.007*"商务"
Score: 0.09863615781068802	 Topic: 0.034*"伊朗" + 0.021*"袭击" + 0.020*"伊拉克" + 0.020*"美军" + 0.018*"基地" + 0.015*"导弹" + 0.011*"军事" + 0.010*"美国" + 0.010*"战争" + 0.010*"空袭"
Score: 0.07081770896911621	 Topic: 0.048*"伊朗" + 0.042*"声明" + 0.041*"领导人" + 0.040*"报复" + 0.036*"发表" + 0.032*"袭击" + 0.029*"严厉" + 0.027*"3日" + 0.027*"发动" + 0.025*"伊拉克"
Score: 0.05953076854348183	 Topic: 0.051*"中国" + 0.038*"提供" + 0.030*"肺炎" + 0.025*"停止" + 0.022*"美方" + 0.022*"打压" + 0.021*"疫情" + 0.019*"英国" + 0.017*"企业" + 0.017*"外交部"
Score: 0.05686280503869057	 Topic: 0.052*"女儿" + 0.041*"喊话" + 0.039*"孩子" + 0.036*"家人" + 0.036*"收尸" + 0.032*"美军" + 0.031*"苏莱曼尼" + 0.029*"葬礼" + 0.016*"伊朗" + 0.016*"领袖"
Score: 0.05512788146734238	 Topic: 0.035*"参议院" + 0.034*"弹劾案" + 0.032*"弹劾" + 0.027*"审理" + 0.024*"审判" + 0.019*"正式" + 0.011*"投票" + 0.011*"时刻" + 0.011*"律师" + 0.010*"总统"
Score: 0.05129