In [196]:
import pandas as pd

def read_csv_column(csv_path, column_name):
    df = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)
    return df[column_name].tolist()

In [197]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

In [198]:
def extract_topn_from_vector(feature_name_list, sorted_item_list, topn=10):
    """
    get the feature names and tf-idf score of top n items
    """

    # use only topn items from vector
    sorted_item_list = sorted_item_list[:topn]

    score_vals = []
    feature_vals = []

    # word index and corresponding tf-idf score
    for idx, score in sorted_item_list:

        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_name_list[idx])

    # create a tuples of feature,score
    # results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]

    return results

In [199]:
from sklearn.feature_extraction.text import CountVectorizer

# get the weibo contents
weibo_contents = read_csv_column('./trump/processed_trump.csv', '微博正文')

# ignore words that appear in 85% of documents
cv=CountVectorizer(max_df=0.85, max_features=10000)
word_count_vector=cv.fit_transform(weibo_contents)

In [200]:
print(type(weibo_contents[0]))
list(cv.vocabulary_.keys())[:10]

<class 'str'>


['但特朗普', '不知', '知道', '说话', '创业', '平均', '股价', '2018年', '国庆', '本来']

In [201]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [202]:
feature_names=cv.get_feature_names()

# get the document that we want to extract keywords from
trump1_content = read_csv_column('./trump/trump_10.csv', '微博正文')

In [203]:
item = ''
for i in range(len(trump1_content)):
    item = item + trump1_content[i]
# item = trump1_content[1]
# generate tf-idf for the given document
tf_idf_vector = tfidf_transformer.transform(cv.transform([item]))

# sort the tf-idf vectors by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())

# extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items, 20)

# now print the results
# print("\n=====Doc=====")
# print(period_contents)
print("\nKeywords")
for k in keywords:
    print(k, keywords[k])


Keywords
新冠 0.329
美国 0.272
哈哈哈 0.255
确诊 0.231
总统 0.197
夫妇 0.188
检测 0.146
视频 0.144
病毒 0.137
阳性 0.127
感染 0.127
白宫 0.126
朗普 0.125
拜登 0.122
接受 0.116
辩论 0.115
大选 0.114
症状 0.111
竞选 0.11
治疗 0.108
