In [1]:
import pandas as pd

## 50. データの入手・整形

In [2]:
df = pd.read_csv('data/NewsAggregatorDataset/newsCorpora.csv', header=None, sep='\t',
                 names=['id', 'title', 'url', 'publisher', 'category', 'story', 'hostname', 'timestamp'])
cols = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']
df = df[df['publisher'].isin(cols)]

In [3]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,id,title,url,publisher,category,story,hostname,timestamp
0,351350,Beloved Young Adult Writer Walter Dean Myers D...,http://www.contactmusic.com/article/monster-au...,Contactmusic.com,e,d8ffUOxditMOG7MLsJtv-MEuTgZwM,www.contactmusic.com,1404396394745
1,56195,Bitcoin: the New Era of Wildcat Banking,http://www.huffingtonpost.com/jason-gordo/wild...,Huffington Post,b,de0dBu9KqC-vjPMnsC-wC_kui-1SM,www.huffingtonpost.com,1396010852000
2,203544,Drone Almost Hit US Airways Jet Over Florida i...,http://www.businessweek.com/news/2014-05-10/dr...,Businessweek,b,d84jnFyV6DXI8FMbgrDu2O6JMzqRM,www.businessweek.com,1399750142749
3,107050,Summit Will Split 'Divergent' Finale 'Allegian...,http://www.huffingtonpost.com/2014/04/11/alleg...,Huffington Post,e,dgbsxQzsw6Y5ixMWUWnv2iVmooMZM,www.huffingtonpost.com,1397354660593
4,36286,UPDATE 1-'Divergent' teen warriors defeat 'Mup...,http://in.reuters.com/article/2014/03/23/boxof...,Reuters,e,d_oUk702ysXcmLMXRJe0R-XgNycCM,in.reuters.com,1395624050892


In [4]:
from sklearn.model_selection import train_test_split

train, rest = train_test_split(df, test_size=0.2, random_state=42, stratify=df['category'])
valid, test = train_test_split(rest, test_size=0.5, random_state=42, stratify=rest['category'])

## 51. 特徴量抽出

In [5]:
train['title_words'] = train['title'].apply(lambda x: x.split(' '))
train.groupby('category')['title_words', 'publisher'].describe()

  train.groupby('category')['title_words', 'publisher'].describe()


Unnamed: 0_level_0,title_words,title_words,title_words,title_words,publisher,publisher,publisher,publisher
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
b,4502,4432,"[Deals, of, the, day-, Mergers, and, acquisiti...",5,4502,5,Reuters,2512
e,4223,4179,"[The, top, films, at, the, North, American, bo...",5,4223,5,Contactmusic.com,1865
m,728,717,"[Air, Pollution, Kills, 7, Million, People, Ev...",2,728,5,Huffington Post,253
t,1219,1190,"[UPDATE, 4-Honda, and, others, recall, nearly,...",2,1219,5,Huffington Post,370


## 52. 学習



In [14]:
df.shape

(13340, 8)

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# [Choosing the right estimator — scikit\-learn 1\.0\.2 documentation](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html)
# As cheat sheet, Use LinearSVC or Naive Bayes for Text Classification.

text_lsvc_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', ())])
text_nb_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])