In [22]:
import pandas as pd

def read_csv_column(csv_path, column_name):
    """Load a UTF-8 encoded CSV file and return one of its columns as a list.

    Args:
        csv_path: Path of the CSV file to read.
        column_name: Header of the column to extract.

    Returns:
        The values of the requested column, in file order, as a Python list.

    Raises:
        KeyError: If ``column_name`` is not a column of the file.
    """
    # low_memory=False makes pandas parse the file in a single pass instead of
    # in chunks.
    frame = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)
    column = frame[column_name]
    return column.tolist()

In [23]:
def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, highest value first.

    Args:
        coo_matrix: Object exposing parallel ``col`` and ``data`` sequences
            (for example a ``scipy.sparse`` matrix in COO format).

    Returns:
        A list of ``(col, data)`` tuples sorted by ``data`` in descending
        order; entries with equal ``data`` are ordered by descending ``col``.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    # Sort on (value, column) so that ties on the value are broken by column.
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

In [24]:
def extract_topn_from_vector(feature_name_list, sorted_item_list, topn):
    """Map the first ``topn`` scored items to their feature names.

    Args:
        feature_name_list: Sequence of feature names, indexable by the
            feature index stored in each item.
        sorted_item_list: Sequence of ``(feature_index, score)`` pairs,
            already in the order they should be reported.
        topn: Number of leading items to keep.

    Returns:
        A dict mapping feature name to its score rounded to 3 decimals,
        in the same order as the kept items.
    """
    # Only the leading ``topn`` pairs are considered; the caller is expected
    # to have sorted them beforehand.
    return {
        feature_name_list[feature_index]: round(weight, 3)
        for feature_index, weight in sorted_item_list[:topn]
    }

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Load the post bodies (column '微博正文') of the whole corpus; they are used to
# learn the vocabulary and the document frequencies.
weibo_contents = read_csv_column('./trump/preprocessed_v2.csv', '微博正文')

# Bag-of-words model: terms occurring in more than 85% of the documents are
# dropped, and the vocabulary is capped at 10000 terms.
cv = CountVectorizer(max_features=10000, max_df=0.85)
word_count_vector = cv.fit_transform(weibo_contents)

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the IDF weights from the corpus term counts. IDF weighting and IDF
# smoothing are both requested explicitly.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [27]:
# Vocabulary terms, indexed by the column index used in the count matrix.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on releases that do not provide it. The result is converted to
# a plain list so downstream indexing behaves the same on every version.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = list(cv.get_feature_names())

# Load the posts of the document set we want to extract keywords from.
trump1_content = read_csv_column('./trump/trump_9.csv', '微博正文')

In [28]:
# Number of highest-scoring terms to report.
TOP_N_KEYWORDS = 20

# Merge every post of the target file into one pseudo-document.
# The posts are joined with a space: plain concatenation would fuse the last
# token of one post with the first token of the next into a single token that
# the vectorizer cannot match. Non-string entries (e.g. NaN for empty cells)
# are skipped instead of raising a TypeError.
item = ' '.join(post for post in trump1_content if isinstance(post, str))

# Compute the TF-IDF vector of the merged document using the vocabulary and
# IDF weights learned from the full corpus.
tf_idf_vector = tfidf_transformer.transform(cv.transform([item]))

# Order the non-zero entries by descending TF-IDF score.
sorted_items = sort_coo(tf_idf_vector.tocoo())

# Keep only the TOP_N_KEYWORDS best-scoring terms.
keywords = extract_topn_from_vector(feature_names, sorted_items, TOP_N_KEYWORDS)

# Print each keyword followed by its rounded score.
print("\nKeywords")
for keyword, score in keywords.items():
    print(keyword, score)


Keywords
美国 0.464
中国 0.257
政府 0.179
总统 0.169
9月 0.157
拜登 0.147
tiktok 0.125
疫苗 0.106
大选 0.104
新冠 0.102
报道 0.101
疫情 0.101
关税 0.095
视频 0.094
美元 0.093
新闻 0.089
死亡 0.082
时间 0.08
记者 0.08
国家 0.078
