In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc,accuracy_score,roc_auc_score

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse
%matplotlib inline
seed = 42
import os
os.environ['OMP_NUM_THREADS'] = '4'

In [3]:
#!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
#!unzip NewsAggregatorDataset.zip

In [4]:
# 読込時のエラー回避のためダブルクォーテーションをシングルクォーテーションに置換
#!sed -e 's/"/'\''/g' ./newsCorpora.csv > ./newsCorpora_re.csv

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# データの読込
df = pd.read_csv('./input/newsCorpora_re.csv', header=None, sep='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])

# データの抽出
df = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']), ['TITLE', 'CATEGORY']]

# データの分割
df_train, df_valid_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=123, stratify=df['CATEGORY'])
df_valid, df_test = train_test_split(df_valid_test, test_size=0.5, shuffle=True, random_state=123, stratify=df_valid_test['CATEGORY'])
df_train.reset_index(drop=True, inplace=True)
df_valid.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

print(df_train.head())

                                               TITLE CATEGORY
0  REFILE-UPDATE 1-European car sales up for sixt...        b
1  Amazon Plans to Fight FTC Over Mobile-App Purc...        t
2  Kids Still Get Codeine In Emergency Rooms Desp...        m
3  What On Earth Happened Between Solange And Jay...        e
4  NATO Missile Defense Is Flight Tested Over Hawaii        b


In [7]:
vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words= None,ngram_range=(1,3),dtype=np.float32)
vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
                        stop_words= None,ngram_range=(3,6),dtype=np.float32)

In [8]:
# Word ngram vector
tr_vect = vect_word.fit_transform(df_train['TITLE'])
vl_vect = vect_word.transform(df_valid['TITLE'])
ts_vect = vect_word.transform(df_test['TITLE'])

# Character n gram vector
tr_vect_char = vect_char.fit_transform(df_train['TITLE'])
vl_vect_char = vect_char.transform(df_valid['TITLE'])
ts_vect_char = vect_char.transform(df_test['TITLE'])
gc.collect()

12

In [9]:
X = sparse.hstack([tr_vect, tr_vect_char])
x_val = sparse.hstack([vl_vect, vl_vect_char])
x_test = sparse.hstack([ts_vect, ts_vect_char])

In [10]:
# Onehot encoding
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()


In [60]:
target_col = ['CATEGORY']
y_tr = ohe.fit_transform(df_train[target_col]).toarray()
y_vl = ohe.transform(df_valid[target_col]).toarray()
y_te = ohe.transform(df_test[target_col]).toarray()
#del tr_vect, ts_vect, tr_vect_char, ts_vect_char
gc.collect()

71

In [61]:
y_vl.shape

(1336, 4)

In [62]:
prd_val = np.zeros((x_test.shape[0],y_tr.shape[1]))
prd = np.zeros((x_test.shape[0],y_te.shape[1]))
cv_score =[]
models = []
for i,col in enumerate(range(4)):
    model = LogisticRegression()
    print('Building {} model for column:{''}'.format(i,col)) 
    model.fit(X,y_tr[:,i])
    #cv_score.append(lr.score)
    prd[:,i] = model.predict_proba(x_test)[:,1]
    
    prd_val[:,i] = model.predict_proba(x_val)[:,1]
    
    models.append(model)

Building 0 model for column:0
Building 1 model for column:1
Building 2 model for column:2
Building 3 model for column:3


In [63]:
prd_val.shape

(1336, 4)

In [72]:
pred_vl = np.argmax(prd_val, axis=1)
y_vl_argmax = np.argmax(y_vl, axis=1)
pred_te = np.argmax(prd, axis=1)
y_te_argmax = np.argmax(y_te, axis=1)

print((pred_vl == y_vl_argmax).sum() / pred_vl.shape[0])
print((pred_te == y_te_argmax).sum() / pred_te.shape[0])

0.8974550898203593
0.8959580838323353


In [73]:
y_vl.shape

(1336, 4)

In [74]:
y_vl_argmax.shape

(1336,)

In [80]:
# 検証データを予測する
print('{:.4f}'.format(roc_auc_score(y_vl, prd_val, multi_class='ovo')))
print('{:.4f}'.format(log_loss(y_vl, prd_val)))

# 評価データを予測する
print('{:.4f}'.format(roc_auc_score(y_te, prd, multi_class='ovo')))
print('{:.4f}'.format(log_loss(y_te, prd)))

0.9758
0.3873
0.9829
0.3672
