In [None]:
# 本模块：数据预处理

In [2]:
# 读入数据集
import pandas as pd
table = pd.read_csv("./data/all_news.csv")

In [3]:
from nltk.corpus import stopwords
import re

# 去除标点
def rid_of_specials(news):
    return re.sub('[^A-za-z]+', ' ', news).lower()

table["new_body"] = table["body"].astype(str).apply(rid_of_specials)

sw_nltk = (stopwords.words('english'))
stop_words = set(sw_nltk)

# 去除停用词
def remove_sw(x):
    x = x.split(' ')
    return  ' '.join(z for z in x if z not in stop_words)

table['new_body'] = table["new_body"].apply(remove_sw)

In [None]:
table

In [5]:
# 分词、提取词根(词形还原)
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# 获取词性
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# 提取词根
def lemmatization(x):
    tags = pos_tag(word_tokenize(x))
    return ' '.join(lemmatizer.lemmatize(word=tag[0], pos=get_wordnet_pos(tag[1])) for tag in tags)

table['new_body'] = table["new_body"].apply(lemmatization)    

In [6]:
# 去除低频词
from nltk.probability import *

# 过滤频率过低的词
def filterfreq(x):
    x = x.split()
    return ' '.join(word for word in x if fdist[word] >= 10)

word_dct = []
for news in table['body']:
    for word in news.split(' '):
        word_dct.append(word)
print(word_dct[:10])
fdist = FreqDist(word_dct)
table['new_body'] = table['new_body'].apply(filterfreq)
table

['Quarterly', 'profits', 'at', 'US', 'media', 'giant', 'TimeWarner', 'jumped', '76%', 'to']


Unnamed: 0,title,body,topic,id,new_body
0,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarner...,business,1,quarterly profit medium giant jump three month...
1,Dollar gains on Greenspan speech,The dollar has hit its highest level against t...,business,2,dollar hit high level euro almost three month ...
2,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuko...,business,3,owner oil giant ask buyer former production un...
3,High fuel prices hit BA's profits,British Airways has blamed high fuel prices fo...,business,4,blame high fuel price drop profit report resul...
4,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Domec...,business,5,share drink food firm risen speculation could ...
...,...,...,...,...,...
2220,BT program to beat dialler scams,BT is introducing two initiatives to help beat...,tech,2221,introduce two initiative help beat cost net us...
2221,Spam e-mails tempt net shoppers,Computer users across the world continue to ig...,tech,2222,computer user across world continue ignore sec...
2222,Be careful how you code,A new European directive could put software wr...,tech,2223,new directive could put software writer risk l...
2223,US cyber security chief resigns,The man making sure US computer networks are s...,tech,2224,man make sure computer network safe secure res...


In [7]:
# 保存
all_word = set()
for body in table['new_body']:
    word_list = word_tokenize(body)
    for word in word_list:
        all_word.add(word)
print(len(all_word))

3367


In [8]:
# 写入文件
file = open('vocab.txt', mode='w')
file.write(' '.join(all_word))
file.close()

In [9]:
# 保存csv
table.to_csv("data.csv", sep=',', index=False)