In [None]:
# 本模块：tf、idf值计算和文章向量计算

In [1]:
import pandas as pd
# Load the pre-processed news corpus; `new_body` is the column tokenised below.
table = pd.read_csv(filepath_or_buffer='data.csv')
table

Unnamed: 0,title,body,topic,id,new_body
0,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarner...,business,1,quarterly profit medium giant jump three month...
1,Dollar gains on Greenspan speech,The dollar has hit its highest level against t...,business,2,dollar hit high level euro almost three month ...
2,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuko...,business,3,owner oil giant ask buyer former production un...
3,High fuel prices hit BA's profits,British Airways has blamed high fuel prices fo...,business,4,blame high fuel price drop profit report resul...
4,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Domec...,business,5,share drink food firm risen speculation could ...
...,...,...,...,...,...
2220,BT program to beat dialler scams,BT is introducing two initiatives to help beat...,tech,2221,introduce two initiative help beat cost net us...
2221,Spam e-mails tempt net shoppers,Computer users across the world continue to ig...,tech,2222,computer user across world continue ignore sec...
2222,Be careful how you code,A new European directive could put software wr...,tech,2223,new directive could put software writer risk l...
2223,US cyber security chief resigns,The man making sure US computer networks are s...,tech,2224,man make sure computer network safe secure res...


In [2]:
# Split every article into its words: one output row per (article, word position).
# After reset_index(), `level_0` is the row label of the article in `table`
# and `level_1` is the position of the word inside that article.
split_word = (
    table['new_body']
    .str.split(' ', expand=True)
    .stack()
    .rename('word')
    .reset_index()
)
# Attach title and id to every word row by matching `level_0` to table's index.
table2 = table[['title', 'id']].merge(split_word, left_index=True, right_on='level_0')
table2

Unnamed: 0,title,id,level_0,level_1,word
0,Ad sales boost Time Warner profit,1,0,0,quarterly
1,Ad sales boost Time Warner profit,1,0,1,profit
2,Ad sales boost Time Warner profit,1,0,2,medium
3,Ad sales boost Time Warner profit,1,0,3,giant
4,Ad sales boost Time Warner profit,1,0,4,jump
...,...,...,...,...,...
344116,Losing yourself in online gaming,2225,2224,1146,half
344117,Losing yourself in online gaming,2225,2224,1147,term
344118,Losing yourself in online gaming,2225,2224,1148,online
344119,Losing yourself in online gaming,2225,2224,1149,game


In [3]:
# Count the total number of words in each article.
# `tabletotcnt` is indexed by `id` and has the single column `totcnt`.
tabletotcnt = table2.groupby('id')['word'].count().to_frame('totcnt')
# Drop a `totcnt` column left over from a previous run of this cell before
# merging, so that re-running the cell does not produce duplicated
# `totcnt` / `totcnt_x` / `totcnt_y` columns and break the TF cell.
table2 = table2.drop(columns='totcnt', errors='ignore').merge(tabletotcnt, on='id')
table2

Unnamed: 0,title,id,level_0,level_1,word,totcnt
0,Ad sales boost Time Warner profit,1,0,0,quarterly,183
1,Ad sales boost Time Warner profit,1,0,1,profit,183
2,Ad sales boost Time Warner profit,1,0,2,medium,183
3,Ad sales boost Time Warner profit,1,0,3,giant,183
4,Ad sales boost Time Warner profit,1,0,4,jump,183
...,...,...,...,...,...,...
344116,Losing yourself in online gaming,2225,2224,1146,half,1151
344117,Losing yourself in online gaming,2225,2224,1147,term,1151
344118,Losing yourself in online gaming,2225,2224,1148,online,1151
344119,Losing yourself in online gaming,2225,2224,1149,game,1151


In [4]:
# Compute the TF value: occurrences of a word in an article / words in the article.
# size() counts rows per (id, word) directly; counting the non-null values of
# an unrelated column such as `title` would report 0 for an article whose
# title is missing.
tablewordcnt = table2.groupby(['id', 'word']).size().to_frame('wordcnt')
tabletf = pd.merge(table2, tablewordcnt, on=['id', 'word'], how='left')
tabletf['TF'] = tabletf['wordcnt'] / tabletf['totcnt']
# Keep a single row per (article, word); the first occurrence is retained.
tabletf = tabletf.drop_duplicates(subset=['id', 'word'])

In [5]:
# Compute the IDF value and the TF*IDF value of every (article, word) pair.
import numpy as np

# Number of distinct articles.
N = tabletf['id'].nunique()

# Number of `tabletf` rows per word, stored in a one-column frame indexed by
# word; the column is then overwritten with log(N / count).
tableidf = tabletf.groupby('word')['level_0'].count().rename('IDF').to_frame()
tableidf['IDF'] = np.log(N / tableidf['IDF'])

tabletfidf = tabletf.merge(tableidf, on='word').sort_values(by=['id', 'word'])
tabletfidf['TFIDF'] = tabletfidf['TF'] * tabletfidf['IDF']
# The positional helper columns are not needed in the final table.
tabletfidf = tabletfidf.drop(columns=['level_0', 'level_1'])
tabletfidf

Unnamed: 0,title,id,word,totcnt,wordcnt,TF,IDF,TFIDF
25654,Ad sales boost Time Warner profit,1,account,183,2,0.010929,2.677074,0.029258
6994,Ad sales boost Time Warner profit,1,advert,183,1,0.005464,4.375308,0.023909
14026,Ad sales boost Time Warner profit,1,advertising,183,2,0.010929,4.375308,0.047818
26985,Ad sales boost Time Warner profit,1,already,183,1,0.005464,1.783256,0.009745
16304,Ad sales boost Time Warner profit,1,also,183,2,0.010929,0.564685,0.006171
...,...,...,...,...,...,...,...,...
42834,Losing yourself in online gaming,2225,worry,1151,3,0.002606,2.989013,0.007791
58736,Losing yourself in online gaming,2225,would,1151,7,0.006082,0.663479,0.004035
185676,Losing yourself in online gaming,2225,write,1151,8,0.006950,2.365178,0.016439
3236,Losing yourself in online gaming,2225,year,1151,9,0.007819,0.425439,0.003327


In [6]:
# Persist the TF-IDF table as comma-separated text, without the row index.
tabletfidf.to_csv(path_or_buf='tfidf.csv', index=False, sep=',')

In [7]:
# Load the vocabulary: a single text file whose entries are split on ' '.
# The context manager closes the file even if read() raises.
with open('vocab.txt', mode='r') as vocab_file:
    word_list = vocab_file.read().split(' ')
# Vocabulary size; this is the length of every article vector built below.
l = len(word_list)

In [8]:
import copy

# Build the article vector (one TF-IDF weight per vocabulary word).
def fun(x, vocab=None):
    """Return the TF-IDF vector of one article over the vocabulary.

    Parameters
    ----------
    x : DataFrame-like with 'word' and 'TFIDF' entries of equal length
        The rows of a single article, e.g. one group of
        ``tabletfidf.groupby('id')``.
    vocab : sequence of str, optional
        Vocabulary that defines the vector positions. Defaults to the
        notebook-level ``word_list``.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``len(vocab)``. Position ``i`` holds the
        TFIDF value of ``vocab[i]`` in the article, or 0.0 when the article
        does not contain that word.
    """
    if vocab is None:
        vocab = word_list
    # One pass over the article builds a word -> weight lookup, so each
    # vocabulary entry costs a dict lookup instead of a scan of the article.
    # If a word is listed more than once in `x`, the last weight wins.
    weights = dict(zip(x['word'], x['TFIDF']))
    # Plain scalars are written into the vector; no one-element Series is
    # converted to a scalar.
    return np.array([weights.get(word, 0.0) for word in vocab], dtype=float)

# Work on an independent deep copy of the TF-IDF table.
data = tabletfidf.copy(deep=True)
# Apply `fun` to the rows of each article id, one group at a time.
articles = data.groupby('id')
news_vec = articles.apply(fun)

In [10]:
# 保存
from itertools import chain
# Collect the per-article vectors as a list of rows and stack them in one
# step, instead of flattening every single value through a Python list.
news_vec_l = news_vec.tolist()
# The reshape is kept as a consistency check: it raises if the number of
# vectors differs from the number of rows in `table` or a vector is not of
# length `l`.
news_vec_np = np.array(news_vec_l).reshape(table.shape[0], l)

In [11]:
# Wrap the article-vector matrix in a DataFrame and write it out as CSV,
# keeping the row index as the first column.
news_vec_df = pd.DataFrame(data=news_vec_np)
news_vec_df.to_csv(path_or_buf='newsvec.csv', index=True, sep=',')