In [1]:
import pandas as pd

def read_csv_column(csv_path, column_name):
    """Load a CSV file and return one of its columns as a plain Python list.

    Args:
        csv_path: Path of the UTF-8 encoded CSV file to read.
        column_name: Header label of the column to extract.

    Returns:
        list: The values of the requested column, in file order.

    Raises:
        KeyError: If ``column_name`` is not a column of the file.
    """
    frame = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)
    column = frame[column_name]
    return column.tolist()


def read_csv_with_weight(csv_path):
    """Read a Weibo CSV and return tokenised posts, repeated by engagement.

    The file must provide the columns '微博正文' (post text whose tokens are
    separated by whitespace), '转发数' (repost count), '评论数' (comment count)
    and '点赞数' (like count).

    Every post is emitted
    ``1 + int(reposts / 100) + int(comments / 500) + int(likes / 1000)``
    times, so posts with more engagement weigh more in downstream models.

    Rows whose text is missing (pandas yields a non-string such as NaN for an
    empty cell) are skipped, and missing counts are treated as 0; previously
    either case raised an exception.

    Args:
        csv_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        list[list[str]]: One token list per emitted copy of a post, in file
        order. Each copy is a distinct list object.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    df = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)

    # Engagement columns: reposts, comments, likes. Empty cells become 0 so
    # that int() below never receives NaN.
    reposts = df['转发数'].fillna(0).tolist()
    comments = df['评论数'].fillna(0).tolist()
    likes = df['点赞数'].fillna(0).tolist()
    weibos = df['微博正文'].tolist()

    weighted_docs = []
    for text, n_reposts, n_comments, n_likes in zip(weibos, reposts, comments, likes):
        # An empty text cell is read as float NaN, which has no .split().
        if not isinstance(text, str):
            continue
        copies = 1 + int(n_reposts / 100) + int(n_comments / 500) + int(n_likes / 1000)
        tokens = text.split()
        # Copy the token list per repetition so callers never share one list.
        weighted_docs.extend(list(tokens) for _ in range(copies))

    return weighted_docs

In [2]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaMulticore


In [3]:
# Load the first Trump dataset file as a list of token lists; posts are repeated
# according to their repost/comment/like counts by read_csv_with_weight.
trump_content = read_csv_with_weight('./dataset/trump/trump01.csv')

In [4]:
# Build the token <-> id vocabulary from the weighted posts, then prune it in
# place: drop tokens seen in fewer than 15 documents or in more than half of
# all documents, and keep at most the 3000 most frequent of the remainder.
dictionary = Dictionary(trump_content)
dictionary.filter_extremes(keep_n=3000, no_above=0.5, no_below=15)
# Convert every post into its sparse bag-of-words vector over that vocabulary.
bow_corpus = list(map(dictionary.doc2bow, trump_content))

In [5]:
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same
# corpus so the LDA step below trains on TF-IDF weights instead of raw counts.
tfidf = TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [6]:
# Train a 30-topic LDA model on the TF-IDF corpus (10 passes, workers=4) and
# persist it to disk.
# NOTE(review): assumes the ./model/ directory already exists — confirm.
lda_model_tfidf = LdaMulticore(corpus_tfidf, num_topics=30, id2word=dictionary, passes=10, workers=4)
lda_model_tfidf.save('./model/lda_models_tfidf_1.lda')

In [7]:
# Reload the model persisted by the training cell.
lda_model = LdaMulticore.load('./model/lda_models_tfidf_1.lda')

# Flatten every tokenised post into one long token list. A single comprehension
# avoids the quadratic cost of repeatedly rebuilding the list with `+`.
item = [token for doc in trump_content for token in doc]

# Treat the whole file as one document and get its bag-of-words vector.
bow_vector = dictionary.doc2bow(item)

# Print the topics of that combined document, highest score first.
# NOTE(review): the model was trained on TF-IDF vectors but is queried here with
# raw counts; lda_model[tfidf[bow_vector]] may be what was intended — confirm.
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.1339540034532547	 Topic: 0.047*"伊朗" + 0.037*"伊拉克" + 0.027*"袭击" + 0.027*"报复" + 0.024*"领导人" + 0.022*"声明" + 0.020*"发动" + 0.019*"3日" + 0.019*"暗杀" + 0.018*"下令"
Score: 0.09265375882387161	 Topic: 0.016*"黄金" + 0.014*"市场" + 0.013*"经济" + 0.013*"关税" + 0.011*"欧盟" + 0.011*"名单" + 0.011*"中国" + 0.010*"支持" + 0.010*"贸易" + 0.009*"美元"
Score: 0.06266501545906067	 Topic: 0.028*"伊朗" + 0.014*"文化" + 0.012*"谈判" + 0.012*"美国" + 0.011*"抗议" + 0.010*"恐怖" + 0.010*"52" + 0.010*"示威" + 0.010*"分子" + 0.009*"无条件"
Score: 0.05460808426141739	 Topic: 0.035*"达沃斯" + 0.028*"弹劾" + 0.025*"论坛" + 0.017*"民主党" + 0.016*"环保" + 0.016*"演讲" + 0.016*"少女" + 0.015*"首日" + 0.015*"边境" + 0.013*"众议院"
Score: 0.047627873718738556	 Topic: 0.037*"协议" + 0.027*"阶段" + 0.026*"签署" + 0.023*"制定" + 0.020*"第一" + 0.017*"经贸" + 0.017*"贸易" + 0.016*"隐私" + 0.014*"中国" + 0.014*"用户"
Score: 0.0471968948841095	 Topic: 0.072*"病毒" + 0.066*"疫情" + 0.060*"冠状" + 0.057*"新型" + 0.053*"中国" + 0.037*"提供" + 0.034*"肺炎" + 0.024*"应对" + 0.021*"美国" + 0.017*"武汉"
Score: 0.04623202

In [8]:
# Repeat the whole pipeline (load -> dictionary -> TF-IDF -> LDA -> report) for
# the monthly files trump01.csv .. trump10.csv.
for month_number in range(1, 11):
    # Zero-padded two-digit label used in the file names and the heading.
    month = '{:02d}'.format(month_number)
    trump_content = read_csv_with_weight('./dataset/trump/trump' + month + '.csv')

    # Vocabulary pruned to tokens in >= 15 documents and <= 50% of documents,
    # capped at the 3000 most frequent.
    dictionary = Dictionary(trump_content)
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=3000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in trump_content]
    tfidf = TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]
    lda_model_tfidf = LdaMulticore(corpus_tfidf, num_topics=30, id2word=dictionary, passes=10, workers=4)

    # Persist the month's model, then read it back from disk for scoring.
    model_path = './model/lda_models_tfidf_' + month + '.lda'
    lda_model_tfidf.save(model_path)
    lda_model = LdaMulticore.load(model_path)

    # Flatten all posts of the month into one token list in a single pass
    # (repeated `item = item + ...` was quadratic in the number of tokens).
    item = [token for doc in trump_content for token in doc]

    bow_vector = dictionary.doc2bow(item)

    print(month + '月')
    # `topic_id` keeps the month label from being rebound by this inner loop.
    # NOTE(review): the model is trained on TF-IDF vectors but queried with raw
    # counts; lda_model[tfidf[bow_vector]] may be what was intended — confirm.
    for topic_id, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(topic_id, 10)))
    print()

01月
Score: 0.12487968057394028	 Topic: 0.049*"伊朗" + 0.023*"美军" + 0.018*"导弹" + 0.018*"袭击" + 0.016*"基地" + 0.014*"伊拉克" + 0.014*"苏莱曼尼" + 0.014*"战争" + 0.012*"斩首" + 0.011*"美国"
Score: 0.12072467803955078	 Topic: 0.016*"贸易" + 0.015*"协议" + 0.012*"黄金" + 0.012*"阶段" + 0.012*"经济" + 0.011*"签署" + 0.011*"市场" + 0.010*"巨星" + 0.010*"第一" + 0.008*"致敬"
Score: 0.08322108536958694	 Topic: 0.030*"伊拉克" + 0.029*"领导人" + 0.029*"伊朗" + 0.026*"声明" + 0.025*"报复" + 0.022*"发动" + 0.021*"3日" + 0.020*"严厉" + 0.018*"袭击" + 0.016*"身亡"
Score: 0.06328310072422028	 Topic: 0.018*"关税" + 0.015*"华为" + 0.014*"英国" + 0.011*"政府" + 0.011*"中国" + 0.011*"欧盟" + 0.010*"美国" + 0.009*"欧洲" + 0.009*"进口" + 0.008*"5G"
Score: 0.05114225298166275	 Topic: 0.057*"弹劾" + 0.036*"参议院" + 0.033*"审判" + 0.027*"众议院" + 0.021*"弹劾案" + 0.017*"民主党" + 0.017*"条款" + 0.017*"国会" + 0.016*"总统" + 0.016*"投票"
Score: 0.049506478011608124	 Topic: 0.049*"女儿" + 0.041*"喊话" + 0.039*"孩子" + 0.037*"收尸" + 0.035*"家人" + 0.032*"美军" + 0.030*"苏莱曼尼" + 0.029*"葬礼" + 0.016*"哭泣" + 0.015*"领袖"
Score: