In [2]:
import pandas as pd

#pandasによってcsvファイルをデータフレームに変換
file = './news+aggregator/newsCorpora.csv'
df = pd.read_csv(file, sep="\t", header=None,names=["ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"])
df.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [3]:
#Dataframe.locによって指定されたpublisherに含まれていたらそのTITLEとCATEGORYを抜き出す
#reset_indexによってデータフレームのindexを振りなおす(引数dropは元indexを削除するかを指定)
publishers = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']
df_ = df.loc[df['PUBLISHER'].isin(publishers), ['TITLE', 'CATEGORY']].reset_index(drop=True)
df_.head()

Unnamed: 0,TITLE,CATEGORY
0,Europe reaches crunch point on banking union,b
1,ECB FOCUS-Stronger euro drowns out ECB's messa...,b
2,"Euro Anxieties Wane as Bunds Top Treasuries, S...",b
3,Noyer Says Strong Euro Creates Unwarranted Eco...,b
4,REFILE-Bad loan triggers key feature in ECB ba...,b


In [6]:
# 学習用、検証用、評価用に分割する
from sklearn.model_selection import train_test_split
#test_size...テストデータと訓練データのサイズをそれぞれ指定
#shuffle...分類前にデータをシャッフル
#random_state...random_seedの決定(結果が常に同じになる)
#stratify...データによるラベルの偏りをなくす

#train,vaild_testに分類(trainには全データのうち80%を割り当て)
train, valid_test = train_test_split(df_, test_size=0.2, shuffle=True, random_state=64, stratify=df_['CATEGORY'])
#vaild_testをvaild,testに分類(それぞれ全データのうち10%を割り当て)
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, random_state=64, stratify=valid_test['CATEGORY'])

# データの保存(DataFrame.to_csv)
train.to_csv('./news+aggregator/train.txt', sep='\t', index=False)
valid.to_csv('./news+aggregator/valid.txt', sep='\t', index=False)
test.to_csv('./news+aggregator/test.txt', sep='\t', index=False)

# データ数の確認
print('学習データ')
print(train['CATEGORY'].value_counts())
print('検証データ')
print(valid['CATEGORY'].value_counts())
print('評価データ')
print(test['CATEGORY'].value_counts())

学習データ
CATEGORY
b    4502
e    4223
t    1219
m     728
Name: count, dtype: int64
検証データ
CATEGORY
b    562
e    528
t    153
m     91
Name: count, dtype: int64
評価データ
CATEGORY
b    563
e    528
t    152
m     91
Name: count, dtype: int64
