<a href="https://colab.research.google.com/github/tktkbohshi/m1_study_nlp100practices/blob/main/M1_NLP_100practices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [162]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
import plotly.graph_objects as go

# 第6章: 機械学習
本章では，Fabio Gasparetti氏が公開しているNews Aggregator Data Setを用い，ニュース記事の見出しを「ビジネス」「科学技術」「エンターテイメント」「健康」のカテゴリに分類するタスク（カテゴリ分類）に取り組む．

## 50.データの入手・整形
News Aggregator Data Setをダウンロードし、以下の要領で学習データ（train.txt），検証データ（valid.txt），評価データ（test.txt）を作成せよ．
1. ダウンロードしたzipファイルを解凍し，readme.txtの説明を読む．
1. 情報源（publisher）が”Reuters”, “Huffington Post”, “Businessweek”, “Contactmusic.com”, “Daily Mail”の事例（記事）のみを抽出する．
1. 抽出された事例をランダムに並び替える．
1. 抽出された事例の80%を学習データ，残りの10%ずつを検証データと評価データに分割し，それぞれtrain.txt，valid.txt，test.txtというファイル名で保存する．ファイルには，１行に１事例を書き出すこととし，カテゴリ名と記事見出しのタブ区切り形式とせよ（このファイルは後に問題70で再利用する）．
学習データと評価データを作成したら，各カテゴリの事例数を確認せよ．

### Datasets detail
+ ID		Numeric ID
+ TITLE		News title 
+ URL		Url
+ PUBLISHER	Publisher name
+ CATEGORY	News category (b = business, t = science and technology, e = entertainment, m = health)
+ STORY		Alphanumeric ID of the cluster that includes news about the same story
+ HOSTNAME	Url hostname
+ TIMESTAMP 	Approximate time the news was published, as the number of milliseconds since the epoch 00:00:00 GMT, January 1, 1970

In [2]:
columns = ["ID","TITLE","URL","PUBLISHER","CATEGORY","STORY","HOSTNAME","TIMESTAMP"]
df_publisher = pd.read_csv("./data/NewsAggregatorDataset/newsCorpora.csv", names=columns, sep="\t")
df_publisher = df_publisher[df_publisher["PUBLISHER"].isin(["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"])]
df_publisher = df_publisher.sample(frac=1).reset_index(drop=True)
df_publisher["TITLE"] = df_publisher["TITLE"].str.lower()
df_publisher.head(5)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,65200,not another girl! hilarious moment a six-year-...,http://www.dailymail.co.uk/femail/article-2591...,Daily Mail,e,dY3JocJkjWVDL8MpqmyTwyq_EHkjM,www.dailymail.co.uk,1396130769368
1,178746,mptf - bosses of the night before the oscars b...,http://www.contactmusic.com/story/bosses-of-th...,Contactmusic.com,e,dxdvgkRptuOmHTM5AndyD2BHXMZsM,www.contactmusic.com,1399320392690
2,273002,musk's tesla to name final gigafactory site ar...,http://www.businessweek.com/news/2014-06-03/mu...,Businessweek,b,dENUOKJciS7TC6Ml-DZqMjtX7vZnM,www.businessweek.com,1401887989436
3,131593,raw oysters spike us rise in bacteria infectio...,http://www.businessweek.com/news/2014-04-17/ra...,Businessweek,m,dNRtScuBJlukRSMcbX5ArfzxKH7ZM,www.businessweek.com,1397780060938
4,158815,update 2-cash drop in the euro zone adds to im...,http://in.reuters.com/article/2014/04/24/ecb-l...,Reuters,b,diuBb8GaScP3H_M08sSdIRHIKRl3M,in.reuters.com,1398391755389


In [3]:
df_train = df_publisher[0:int(len(df_publisher)*0.8)]
df_valid = df_publisher[int(len(df_publisher)*0.8):int(len(df_publisher)*0.9)]
df_test = df_publisher[int(len(df_publisher)*0.9):int(len(df_publisher))]
df_train.to_csv("./data/outputs/train.txt",sep="\t", index=False)
df_valid.to_csv("./data/outputs/valid.txt",sep="\t", index=False)
df_test.to_csv("./data/outputs/test.txt",sep="\t", index=False)

## 51.特徴量抽出

In [147]:
tfidf_vec = TfidfVectorizer()
X_train = tfidf_vec.fit_transform(df_train["TITLE"])
X_test = tfidf_vec.transform(df_test["TITLE"])
X_valid = tfidf_vec.transform(df_valid["TITLE"])

## 52.学習

In [5]:
model = LogisticRegression(random_state=123, max_iter=10000)
model.fit(X_train, df_train["CATEGORY"])

LogisticRegression(max_iter=10000, random_state=123)

## 53.予測

In [6]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

## 54.正解率の計測

In [7]:
accuracy_train = accuracy_score(df_train["CATEGORY"], pred_train)
accuracy_test = accuracy_score(df_test["CATEGORY"], pred_test)
accuracy_train, accuracy_test

(0.9444340329835083, 0.8980509745127436)

## 55.混同行列の作成

In [8]:
labels = df_train["CATEGORY"].unique()
labels

array(['e', 'b', 'm', 't'], dtype=object)

In [9]:
confusion_matrix(df_train["CATEGORY"], pred_train)

array([[4419,   60,    2,   30],
       [  18, 4182,    0,    6],
       [  86,  129,  491,    6],
       [ 148,  105,    3,  987]], dtype=int64)

## 56.適合率，再現率，F1スコアの計測

In [10]:
precision = precision_score(df_test["CATEGORY"], pred_test, average=None, labels=labels)
precision

array([0.89347079, 0.8956229 , 0.98214286, 0.89215686])

In [11]:
recall = recall_score(df_test["CATEGORY"], pred_test, average=None, labels=labels)
recall

array([0.97196262, 0.96202532, 0.53921569, 0.63194444])

In [12]:
f1 = f1_score(df_test["CATEGORY"], pred_test, average=None, labels=labels)
f1

array([0.93106535, 0.92763731, 0.69620253, 0.7398374 ])

In [13]:
df_eval = pd.DataFrame({"Precision":precision,"Recall":recall,"F1":f1},index=labels)
df_eval.loc["マイクロ平均"] = [
    precision_score(df_test["CATEGORY"], pred_test, average="micro", labels=labels),
    recall_score(df_test["CATEGORY"], pred_test, average="micro", labels=labels),
    f1_score(df_test["CATEGORY"], pred_test, average="micro", labels=labels)
    ]
df_eval.loc["マクロ平均"] = [
    precision_score(df_test["CATEGORY"], pred_test, average="macro", labels=labels),
    recall_score(df_test["CATEGORY"], pred_test, average="macro", labels=labels),
    f1_score(df_test["CATEGORY"], pred_test, average="macro", labels=labels)
    ]
df_eval

Unnamed: 0,Precision,Recall,F1
e,0.893471,0.971963,0.931065
b,0.895623,0.962025,0.927637
m,0.982143,0.539216,0.696203
t,0.892157,0.631944,0.739837
マイクロ平均,0.898051,0.898051,0.898051
マクロ平均,0.915848,0.776287,0.823686


## 57.特徴量の重みの確認

In [42]:
df_X_train = pd.DataFrame(X_train.toarray(), columns=tfidf_vec.get_feature_names())
df_X_test = pd.DataFrame(X_test.toarray(), columns=tfidf_vec.get_feature_names())
df_X_train.head(3)

Unnamed: 0,00,07,08,09,0ff,0ut,10,100,1000,10000,...,zone,zooey,zoosk,zuckerberg,zynga,zâ,œf,œlousyâ,œpiece,œwaist
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
df_weights = pd.DataFrame(model.coef_, index=model.classes_, columns=df_X_train.columns).T
df_weights

Unnamed: 0,b,e,m,t
00,0.207972,-0.137211,-0.029906,-0.040855
07,0.075289,-0.046413,-0.012357,-0.016519
08,0.022768,-0.010603,-0.005378,-0.006787
09,0.029912,-0.013153,-0.006844,-0.009915
0ff,-0.028595,0.063282,-0.018156,-0.016530
...,...,...,...,...
zâ,-0.013246,0.053270,-0.014438,-0.025586
œf,-0.069415,0.123408,-0.019745,-0.034249
œlousyâ,-0.056550,0.101371,-0.019855,-0.024966
œpiece,-0.087821,0.116799,-0.010400,-0.018578


In [137]:
df_best10 = pd.DataFrame(index=range(1,10))
df_worst10 = pd.DataFrame()
for label in labels:
  df_best10 = pd.merge(
    df_best10,
    pd.DataFrame(df_weights[label].sort_values(ascending=False).head(10).reset_index().set_axis([f"{label}_word",f"{label}_value"], axis=1)),
    left_index=True,
    right_index=True,
    how="outer"
  )
  df_worst10 = pd.merge(
    df_worst10,
    pd.DataFrame(df_weights[label].sort_values(ascending=True).head(10).reset_index().set_axis([f"{label}_word",f"{label}_value"], axis=1)),
    left_index=True,
    right_index=True,
    how="outer"
  )
df_best10

Unnamed: 0,e_word,e_value,b_word,b_value,m_word,m_value,t_word,t_value
0,kardashian,3.343597,bank,3.36602,ebola,4.755725,google,5.220598
1,chris,2.795961,china,3.336268,cancer,3.911757,facebook,4.871147
2,her,2.697897,fed,3.277741,study,3.871911,apple,4.606009
3,kim,2.639984,ecb,3.160443,fda,3.385715,microsoft,4.018327
4,she,2.536047,stocks,2.980106,drug,3.304109,climate,3.825137
5,miley,2.504986,euro,2.723072,mers,3.24458,tesla,2.938886
6,cyrus,2.461294,ukraine,2.6873,health,2.513325,nasa,2.786496
7,star,2.394155,oil,2.650428,could,2.242806,gm,2.755585
8,movie,2.253061,update,2.638831,cases,2.210906,comcast,2.535543
9,paul,2.235613,dollar,2.489106,virus,2.186384,fcc,2.417991


In [138]:
df_worst10

Unnamed: 0,e_word,e_value,b_word,b_value,m_word,m_value,t_word,t_value
0,update,-3.441656,and,-2.333996,facebook,-1.062082,stocks,-1.496614
1,us,-3.276398,the,-2.137717,gm,-1.053594,fed,-1.15378
2,google,-2.656597,ebola,-2.066416,google,-1.03905,her,-1.131312
3,china,-2.377915,she,-2.001245,sales,-0.971496,cancer,-1.047856
4,facebook,-2.228708,her,-1.927218,apple,-0.962734,american,-1.041196
5,says,-2.154723,microsoft,-1.706848,billion,-0.885382,ecb,-1.03907
6,ceo,-2.141222,apple,-1.649707,amazon,-0.871232,drug,-0.999433
7,gm,-2.054252,kardashian,-1.642719,ceo,-0.854978,kardashian,-0.96117
8,apple,-1.993568,video,-1.630257,climate,-0.842889,ebola,-0.957469
9,study,-1.992855,star,-1.602739,twitter,-0.797898,ukraine,-0.956081


## 58.正則化パラメータの変更

In [174]:
df_regularization = pd.DataFrame(columns=["lambda","train_accuracy","valid_accuracy","test_accuracy"])
for C in tqdm(np.arange(0.1,2.0,0.2)):
    model = LogisticRegression(random_state=123, max_iter=10000,C=C)
    model.fit(X_train, df_train["CATEGORY"])
    pred_train = model.predict(X_train)
    pred_valid = model.predict(X_valid)
    pred_test = model.predict(X_test)
    df_regularization.loc[C] = [
        C,
        accuracy_score(df_train["CATEGORY"], pred_train),
        accuracy_score(df_valid["CATEGORY"], pred_valid),
        accuracy_score(df_test["CATEGORY"], pred_test)
    ]
df_regularization = df_regularization.reset_index(drop=True)
df_regularization

100%|██████████| 10/10 [00:09<00:00,  1.00it/s]


Unnamed: 0,lambda,train_accuracy,valid_accuracy,test_accuracy
0,0.1,0.791136,0.791604,0.787856
1,0.4,0.887556,0.874813,0.862069
2,0.7,0.924663,0.890555,0.890555
3,1.0,0.944434,0.898801,0.898051
4,1.3,0.958958,0.905547,0.901049
5,1.6,0.969265,0.910045,0.908546
6,1.9,0.976762,0.911544,0.912294
7,2.2,0.981259,0.913043,0.910795
8,2.5,0.984445,0.913793,0.911544
9,2.8,0.987725,0.914543,0.912294


In [175]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_regularization["lambda"], y=df_regularization["train_accuracy"], name="train_accuracy"))
fig.add_trace(go.Scatter(x=df_regularization["lambda"], y=df_regularization["valid_accuracy"], name="valid_accuracy"))
fig.add_trace(go.Scatter(x=df_regularization["lambda"], y=df_regularization["test_accuracy"], name="test_accuracy"))
fig

## 59.ハイパーパラメータの探索

TypeError: cannot concatenate object of type '<class 'numpy.float64'>'; only Series and DataFrame objs are valid