In [63]:
import lightgbm as lgb
import sklearn

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc,accuracy_score,roc_auc_score
from sklearn.model_selection import StratifiedKFold
# Label encoding (class names -> integer ids)

from sklearn.preprocessing import LabelEncoder

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import sparse
import gc
import os
import random

In [40]:
def seed_everything(seed: int = 42):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and writes the seed into the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value handed to every seeding call. Defaults to 42.
    """
    # NOTE(review): hash randomisation is normally configured at interpreter
    # start-up, so this presumably only affects child processes — confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    # PyTorch is not used here; if it is added, also seed torch / torch.cuda
    # and set the cudnn deterministic/benchmark flags.


# Apply the default seed once, right after the definition.
seed_everything()

In [19]:
!mkdir input/
%cd input/
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
!unzip NewsAggregatorDataset.zip



mkdir: cannot create directory ‘input/’: File exists
/lib/modules/input
--2021-12-12 09:05:35--  https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29224203 (28M) [application/x-httpd-php]
Saving to: ‘NewsAggregatorDataset.zip.1’

 NewsAggregatorData  19%[==>                 ]   5.32M  2.90MB/s               ^C
Archive:  NewsAggregatorDataset.zip
replace 2pageSessions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [13]:
# Replace every double quote with a single quote to avoid errors when the file is read.
# Reads ./input/newsCorpora.csv and writes the result to ./input/newsCorpora_re.csv.
!sed -e 's/"/'\''/g' ./input/newsCorpora.csv > ./input/newsCorpora_re.csv
#%cd ..

In [14]:


# Load the quote-normalised corpus: tab-separated, no header row in the file.
corpus_columns = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
df = pd.read_csv('./input/newsCorpora_re.csv', sep='\t', header=None, names=corpus_columns)

# Keep only rows from the selected publishers, and only the title and label columns.
selected_publishers = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']
publisher_mask = df['PUBLISHER'].isin(selected_publishers)
df = df.loc[publisher_mask, ['TITLE', 'CATEGORY']]



In [15]:
# Preview the first rows of the filtered frame (TITLE and CATEGORY columns).
df.head()

Unnamed: 0,TITLE,CATEGORY
12,Europe reaches crunch point on banking union,b
13,ECB FOCUS-Stronger euro drowns out ECB's messa...,b
19,"Euro Anxieties Wane as Bunds Top Treasuries, S...",b
20,Noyer Says Strong Euro Creates Unwarranted Eco...,b
29,REFILE-Bad loan triggers key feature in ECB ba...,b


In [16]:
# Split the data: 80% train, then the remaining 20% halved into validation and test.
# Both splits are stratified on CATEGORY and use a fixed random_state.
df_train, df_valid_test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=123,
    stratify=df['CATEGORY'],
)
df_valid, df_test = train_test_split(
    df_valid_test,
    test_size=0.5,
    shuffle=True,
    random_state=123,
    stratify=df_valid_test['CATEGORY'],
)

# Give every split a fresh 0..n-1 index, discarding the original row labels.
df_train, df_valid, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_valid, df_test)
)

print(df_train.head())

                                               TITLE CATEGORY
0  REFILE-UPDATE 1-European car sales up for sixt...        b
1  Amazon Plans to Fight FTC Over Mobile-App Purc...        t
2  Kids Still Get Codeine In Emergency Rooms Desp...        m
3  What On Earth Happened Between Solange And Jay...        e
4  NATO Missile Defense Is Flight Tested Over Hawaii        b


In [22]:
# Word-level TF-IDF: lower-cased 1- to 3-grams with English stop words removed,
# capped at 20000 features and stored as float32.
vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words='english', ngram_range=(1, 3), dtype=np.float32)
# Character-level TF-IDF: lower-cased 3- to 6-grams, capped at 40000 features.
# stop_words is deliberately not passed: scikit-learn applies it only when
# analyzer == 'word', so with analyzer='char' it had no effect beyond a warning.
vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
                        ngram_range=(3, 6), dtype=np.float32)

In [28]:
# Headline text of each split.
titles_train, titles_valid, titles_test = (
    split['TITLE'] for split in (df_train, df_valid, df_test)
)

# Word n-gram features: fitted on the training titles, then applied to the other splits.
tr_vect = vect_word.fit_transform(titles_train)
vl_vect = vect_word.transform(titles_valid)
ts_vect = vect_word.transform(titles_test)

# Character n-gram features, fitted and applied the same way.
tr_vect_char = vect_char.fit_transform(titles_train)
vl_vect_char = vect_char.transform(titles_valid)
ts_vect_char = vect_char.transform(titles_test)

# Last expression of the cell, so the notebook displays gc.collect()'s return value.
gc.collect()

16

In [34]:
# Shape of the word n-gram TF-IDF matrix built from the training titles.
tr_vect.shape

(10684, 20000)

In [29]:
# Concatenate word and character TF-IDF features column-wise for each split.
# NOTE(review): X, x_val and x_test are assigned again by the TruncatedSVD cell
# further down, which is fitted on tr_vect only — confirm whether the stacked
# matrices are meant to feed the model.
X, x_val, x_test = (
    sparse.hstack(list(pair))
    for pair in (
        (tr_vect, tr_vect_char),
        (vl_vect, vl_vect_char),
        (ts_vect, ts_vect_char),
    )
)

In [57]:
# Encode the CATEGORY labels as integer class ids.
# The encoder is fitted on the training labels and reused for validation and test.
le = LabelEncoder()
y_tr = le.fit_transform(df_train['CATEGORY'].values)
y_vl, y_te = (
    le.transform(frame['CATEGORY'].values) for frame in (df_valid, df_test)
)

In [42]:
# Reduce the word TF-IDF matrix to 300 components with a fixed random_state.
# The projection is fitted on the training split and applied to the other two.
# NOTE(review): only the word matrices (tr_vect / vl_vect / ts_vect) are used here,
# and X / x_val / x_test replace the stacked matrices assigned in the hstack cell —
# confirm that dropping the character n-gram features is intended.
svd = TruncatedSVD(random_state=42, n_components=300)
X = svd.fit_transform(tr_vect)
x_val, x_test = (svd.transform(matrix) for matrix in (vl_vect, ts_vect))

In [43]:
# Shape of the training feature matrix currently bound to X.
X.shape

(10684, 300)

In [44]:
# Shape of the validation labels.
# NOTE(review): the saved output shows a second dimension of 4, which does not
# match labels produced by LabelEncoder.transform; it presumably comes from an
# earlier run of the notebook — re-execute to refresh.
y_vl.shape

(1336, 4)

In [49]:
# Shape of the training labels.
# NOTE(review): the saved output shows a second dimension of 4, which does not
# match labels produced by LabelEncoder.fit_transform; it presumably comes from
# an earlier run of the notebook — re-execute to refresh.
y_tr.shape

(10684, 4)

In [58]:
# Train a LightGBM classifier, constructed with no arguments, on the training
# features and the integer-encoded labels.
model = lgb.LGBMClassifier()
# Kept as the last expression so the notebook displays the fitted estimator.
model.fit(X=X, y=y_tr)

LGBMClassifier()

In [68]:
# Predict class probabilities for the validation split.
y_pred = model.predict_proba(x_val)
# Take the most likely class for each row.
y_pred_max = np.argmax(y_pred, axis=1)

# Fraction of validation rows whose predicted class equals the true label.
# accuracy_score (already imported) replaces the hand-rolled
# sum(y_vl == y_pred_max) / len(y_vl), which iterated the array in Python.
accuracy = accuracy_score(y_vl, y_pred_max)
print(accuracy)

0.8787425149700598


In [69]:
# Validation metrics from the predicted probabilities: one-vs-one ROC AUC, then log loss.
# Relies on y_pred holding the validation probabilities; the test prediction
# cell assigns the same name, so run order matters.
valid_auc = roc_auc_score(y_vl, y_pred, multi_class='ovo')
print(valid_auc)
valid_log_loss = log_loss(y_vl, y_pred)
print(valid_log_loss)

0.9514706810553059
0.3888652444277943


In [70]:
# Predict class probabilities for the test split.
y_pred = model.predict_proba(x_test)
# Take the most likely class for each row.
y_pred_max = np.argmax(y_pred, axis=1)

# Fraction of test rows whose predicted class equals the true label.
# accuracy_score (already imported) replaces the hand-rolled
# sum(y_te == y_pred_max) / len(y_te), which iterated the array in Python.
accuracy = accuracy_score(y_te, y_pred_max)
print(accuracy)

0.8839820359281437


In [71]:
# Test metrics from the predicted probabilities: one-vs-one ROC AUC, then log loss.
# Relies on y_pred holding the test probabilities assigned by the test prediction cell.
test_auc = roc_auc_score(y_te, y_pred, multi_class='ovo')
print(test_auc)
test_log_loss = log_loss(y_te, y_pred)
print(test_log_loss)

0.9547788083432888
0.36538368891460654
