# 言語処理100本ノック 2020 第６章

Reference: https://nlp100.github.io/ja/ch06.html

In [None]:
!mkdir NewsAggregatorDataset
!cd NewsAggregatorDataset
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
!unzip NewsAggregatorDataset.zip
!cd ../

In [None]:
ifile = "NewsAggregatorDataset/newsCorpora.csv"

In [None]:
import re
import numpy as np
import pandas as pd
import seaborn as sns

## 50. データの入手・整形

News Aggregator Data Setをダウンロードし、以下の要領で学習データ（train.txt），検証データ（valid.txt），評価データ（test.txt）を作成せよ．

* ダウンロードしたzipファイルを解凍し，readme.txtの説明を読む．
* 情報源（publisher）が”Reuters”, “Huffington Post”, “Businessweek”, “Contactmusic.com”, “Daily Mail”の事例（記事）のみを抽出する．
* 抽出された事例をランダムに並び替える．
* 抽出された事例の80%を学習データ，残りの10%ずつを検証データと評価データに分割し，それぞれtrain.txt，valid.txt，test.txtというファイル名で保存する．ファイルには，１行に１事例を書き出すこととし，カテゴリ名と記事見出しのタブ区切り形式とせよ（このファイルは後に問題70で再利用する）．

学習データと評価データを作成したら，各カテゴリの事例数を確認せよ．

In [58]:
dat = pd.read_csv(ifile, sep="\t", header=None)
dat.columns = ["ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"]
dat.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [59]:
print(dat.shape)
publishers = ["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"]
dat = dat.loc[dat.PUBLISHER.isin(publishers), :]
print(dat.shape)
dat.PUBLISHER.value_counts()

(422419, 8)
(13340, 8)


Reuters             3902
Huffington Post     2455
Businessweek        2395
Contactmusic.com    2334
Daily Mail          2254
Name: PUBLISHER, dtype: int64

In [60]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

tmp = dat.loc[:,["TITLE", "CATEGORY"]]
tmp = shuffle(tmp)
train, tmp = train_test_split(tmp, test_size=0.2)
test, valid = train_test_split(tmp, test_size=0.5)

train.shape, test.shape, valid.shape

((10672, 2), (1334, 2), (1334, 2))

In [61]:
train.to_csv("NewsAggregatorDataset/train.txt", sep="\t", index=False)
test.to_csv("NewsAggregatorDataset/test.txt", sep="\t", index=False)
valid.to_csv("NewsAggregatorDataset/valid.txt", sep="\t", index=False)

## 51. 特徴量抽出

学習データ，検証データ，評価データから特徴量を抽出し，それぞれtrain.feature.txt，valid.feature.txt，test.feature.txtというファイル名で保存せよ． なお，カテゴリ分類に有用そうな特徴量は各自で自由に設計せよ．記事の見出しを単語列に変換したものが最低限のベースラインとなるであろう．

In [90]:
from nltk.tokenize import word_tokenize
from gensim import corpora
from sklearn.preprocessing import OneHotEncoder

In [181]:
# Tokenize
tokens = [word_tokenize(text) for text in train.TITLE]

# Remove (i) numbers, (ii) single-character words
tokens = [[t for t in token if (not t.isnumeric()) and (len(t) > 1)] for token in tokens]

# Create dictionary
dictionary = corpora.Dictionary(tokens)
print(dictionary)

# Filter out words that occur less than 10 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=10, no_above=0.5)
print(dictionary)

Dictionary(19395 unique tokens: ['In', 'Labeouf', 'Rehab', 'Report', 'Shia']...)
Dictionary(1894 unique tokens: ['In', 'Report', 'Shia', "'s", '1-China']...)


In [182]:
def vectorize(texts, dictionary):
    # Tokenize
    tokens = [word_tokenize(text) for text in texts]
    # Vectorize
    corpus = [dictionary.doc2bow(t) for t in tokens]
    # 
    mat = pd.DataFrame(np.zeros([len(tokens), len(dictionary)]), 
                       columns=dictionary.token2id.keys())
    for i,c_s in enumerate(corpus):
        idx = [c[0] for c in c_s]
        mat.iloc[i, idx] = 1
    return(mat)

In [184]:
x_train = vectorize(train.TITLE, dictionary)
x_test = vectorize(test.TITLE, dictionary)
x_valid = vectorize(valid.TITLE, dictionary)
x_train.iloc[:5, :5]

Unnamed: 0,In,Report,Shia,'s,1-China
0,1.0,1.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,1.0
2,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0


## 52. 学習

51で構築した学習データを用いて，ロジスティック回帰モデルを学習せよ．

In [186]:
# b = business, t = science and technology, e = entertainment, m = health
train.CATEGORY.value_counts()

b    4544
e    4160
t    1240
m     728
Name: CATEGORY, dtype: int64

In [188]:
# ここではエンタメ関連の記事をタイトルから予測することを目指す
y_train = (train.CATEGORY == "e")
y_test = (test.CATEGORY == "e")
y_valid = (valid.CATEGORY == "e")
y_train.head()

341749     True
226448    False
19723      True
134564     True
41167     False
Name: CATEGORY, dtype: bool

In [193]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## 53. 予測

52で学習したロジスティック回帰モデルを用い，与えられた記事見出しからカテゴリとその予測確率を計算するプログラムを実装せよ．

In [210]:
# 学習データ
yt_train = lr.predict(x_train)
yt_train

array([ True, False,  True, ...,  True, False, False])

In [212]:
# 評価データ
yt_test = lr.predict(x_test)
yt_test

array([False,  True,  True, ...,  True, False, False])

## 54. 正解率の計測

52で学習したロジスティック回帰モデルの正解率を，学習データおよび評価データ上で計測せよ．

In [222]:
# 学習データ
np.sum(y_train == yt_train) / len(y_train) * 100

96.56109445277362

In [223]:
# 評価データ
np.sum(y_test == yt_test) / len(y_test) * 100

91.1544227886057

## 55. 混同行列の作成

52で学習したロジスティック回帰モデルの混同行列（confusion matrix）を，学習データおよび評価データ上で作成せよ．

In [211]:
# 学習データ
perform_train = pd.DataFrame({"Measured":y_train, "Predict":yt_train}).groupby("Predict").Measured.value_counts().unstack(level=1)
perform_train

Measured,False,True
Predict,Unnamed: 1_level_1,Unnamed: 2_level_1
False,6368,223
True,144,3937


In [213]:
# 評価データ
perform_test = pd.DataFrame({"Measured":y_test, "Predict":yt_test}).groupby("Predict").Measured.value_counts().unstack(level=1)
perform_test

Measured,False,True
Predict,Unnamed: 1_level_1,Unnamed: 2_level_1
False,697,78
True,40,519


## 56. 適合率，再現率，F1スコアの計測

52で学習したロジスティック回帰モデルの適合率(precision)，再現率(recall)，F1スコアを，評価データ上で計測せよ．カテゴリごとに適合率，再現率，F1スコアを求め，カテゴリごとの性能をマイクロ平均（micro-average）とマクロ平均（macro-average）で統合せよ．

In [220]:
def performance(mat):
    precision = mat.loc[True,True] / np.sum(mat.loc[True,:])
    recall = mat.loc[True,True] / np.sum(mat.loc[:,True])
    f1score = (2 * precision * recall) / (precision + recall)
    return precision, recall, f1score

In [226]:
# 学習データ
performance(perform_train)

(0.9647145307522667, 0.9463942307692308, 0.9554665695910691)

In [227]:
# 評価データ
performance(perform_test)

(0.9284436493738819, 0.8693467336683417, 0.8979238754325258)

## 57. 特徴量の重みの確認

52で学習したロジスティック回帰モデルの中で，重みの高い特徴量トップ10と，重みの低い特徴量トップ10を確認せよ．

In [245]:
model = pd.DataFrame({"Feature":x_train.columns.tolist(), "Coefficient":lr.coef_.flatten()})
model = model.sort_values("Coefficient", ascending=False)
model.head(n=10)

Unnamed: 0,Feature,Coefficient
132,Chris,2.846989
245,Kardashian,2.635808
1490,Paul,2.384013
783,Harris,2.220136
1390,Cannes,2.217722
711,George,2.193563
381,Film,2.061563
363,Miley,2.049237
158,Met,2.044361
775,Movie,2.023829


In [246]:
model.tail(n=10)

Unnamed: 0,Feature,Coefficient
606,Apple,-1.977237
584,Billion,-2.001221
147,Bank,-2.011097
93,China,-2.048291
289,Ebola,-2.069122
142,CEO,-2.09419
1218,Obamacare,-2.106589
682,Climate,-2.223175
553,Facebook,-2.334967
409,Google,-3.022455


## 58. 正則化パラメータの変更

ロジスティック回帰モデルを学習するとき，正則化パラメータを調整することで，学習時の過学習（overfitting）の度合いを制御できる．異なる正則化パラメータでロジスティック回帰モデルを学習し，学習データ，検証データ，および評価データ上の正解率を求めよ．実験の結果は，正則化パラメータを横軸，正解率を縦軸としたグラフにまとめよ．

In [261]:
figmat = dict()
c_s = [0.01, 0.1, 0.5, 1, 5, 10]
for c in c_s:
    # Model construction
    lr = LogisticRegression(C=c, penalty="l2")
    lr.fit(x_train, y_train)
    # 
    perform_train = pd.DataFrame({"Measured":y_train, "Predict":lr.predict(x_train)}).groupby("Predict").Measured.value_counts().unstack(level=1)
    perform_test = pd.DataFrame({"Measured":y_test, "Predict":lr.predict(x_test)}).groupby("Predict").Measured.value_counts().unstack(level=1)
    figmat[c] = list(performance(perform_train)) + list(performance(perform_test))
    #
columns = ["precision_train", "recall_train", "F1score_train", "precision_test", "recall_test", "F1score_test"]
figmat = pd.DataFrame(figmat, index=columns).T
figmat

Unnamed: 0,precision_train,recall_train,F1score_train,precision_test,recall_test,F1score_test
0.01,0.850585,0.68149,0.756706,0.849785,0.663317,0.745061
0.1,0.923877,0.87524,0.898901,0.916974,0.832496,0.872695
0.5,0.956758,0.930769,0.943585,0.927536,0.857621,0.89121
1.0,0.964715,0.946394,0.955467,0.928444,0.869347,0.897924
5.0,0.976074,0.961058,0.968508,0.928826,0.874372,0.900777
10.0,0.979577,0.96851,0.974012,0.919499,0.860972,0.889273


## 59. ハイパーパラメータの探索

学習アルゴリズムや学習パラメータを変えながら，カテゴリ分類モデルを学習せよ．検証データ上の正解率が最も高くなる学習アルゴリズム・パラメータを求めよ．また，その学習アルゴリズム・パラメータを用いたときの評価データ上の正解率を求めよ．