# ウェブサービスにおけるデータサイエンス実践 第8回

## 実データを用いた機械学習実践

## 今回の内容

- 前回: 機械学習しやすいデータでの教師あり学習の演習
- 今回: ログデータからの教師あり学習

-  クリック履歴からの性別推定を例とする


## 扱うトピック
- ログの整形
- 次元削減
- ダウンサンプリング
- その他細かいテクニック

In [2]:
%matplotlib inline

import sqlite3
import pandas

import numpy

from sklearn import datasets
from sklearn.decomposition import NMF
from sklearn.model_selection import (
    cross_validate,
    GridSearchCV,
    train_test_split
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# ログからトレーニングデータを作成する

## 必要なもの

- クリック履歴
- ユーザの性別情報

In [3]:
sqlite_conn = sqlite3.connect('../resources/waseda.db')

In [4]:
sql = """
SELECT
    DISTINCT
    user_id,
    article_id
FROM
    clicks
"""
df_clicks = pandas.read_sql(sql, sqlite_conn)

# ログの整形

- このままだと、扱えないのでクリック履歴をベクトル表現で扱うことを考える
- 記事が次元を表し、クリックした記事が1、クリックしていない記事が0になるベクトル
- Pivotテーブルを作れば解決しそう
- Pivotにはvalueが必要なので、value=1にして、Pivotしたあと、fillna(0)でできる

In [5]:
df_clicks['value'] = 1

In [6]:
df_matrix = df_clicks.pivot(index='user_id', columns='article_id', values='value')

In [7]:
df_matrix = df_matrix.fillna(0).reset_index()

In [8]:
df_matrix.head()

article_id,user_id,0,1,2,3,4,5,6,7,8,...,26445,26446,26447,26448,26449,26450,26451,26452,26453,26454
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
sql = """
SELECT
    user_id,
    gender_id
FROM
    users
WHERE
    gender_id IS NOT NULL
"""
df_users = pandas.read_sql(sql, sqlite_conn)

In [10]:
df = df_matrix.merge(df_users, on='user_id')

# データの作成

- user_idは除去
- data (x)とlabel (y)を分ける
- DataFrameからvaluesでarrayを取り出す

In [11]:
x = df[df.columns[1:-1]].values
y = df['gender_id'].values

# 学習してみる

- ひとまずtrain, test splitしてロジスティクス回帰で学習

In [12]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=23)

In [13]:
sum(train_y)/len(train_y)

0.4294465106103697

In [14]:
clf_lr = LogisticRegression(solver='lbfgs')

In [15]:
cross_validate(clf_lr, train_x, train_y, cv=3, scoring=['accuracy', 'precision', 'recall'])

{'fit_time': array([5.85385513, 5.15660596, 4.80414319]),
 'score_time': array([0.0857358 , 0.07206821, 0.07928967]),
 'test_accuracy': array([0.66338583, 0.6496063 , 0.65856861]),
 'test_precision': array([0.6284153 , 0.61567878, 0.62593985]),
 'test_recall': array([0.52752294, 0.49160305, 0.50917431])}

In [16]:
clf_lr.fit(train_x, train_y)

LogisticRegression()

In [17]:
print(classification_report(clf_lr.predict(test_x), test_y))

              precision    recall  f1-score   support

         0.0       0.78      0.66      0.72       753
         1.0       0.50      0.65      0.56       390

    accuracy                           0.66      1143
   macro avg       0.64      0.65      0.64      1143
weighted avg       0.69      0.66      0.66      1143



In [19]:
joblib.dump(clf_lr, 'clf_lr.joblib')

['clf_lr.joblib']

In [20]:
clf_lr_decom = LogisticRegression(solver='lbfgs')

# 次元削減をする

- 次元削減 => 行列を元の特徴を残したまま小さなものにする
- 代表的な手法: PCA, SVD, LSI, NMFなど
- 疎なデータを扱うときに特に有効であることが多い => 次元数が多いと次元の呪いが発生する


In [21]:
model = NMF(n_components=300)

In [22]:
x_decom = model.fit_transform(x)

In [23]:
train_x_decom, test_x_decom, train_y_decom, test_y_decom = train_test_split(x_decom, y, test_size=0.2, random_state=23)

In [24]:
clf_lr_decom = LogisticRegression(solver='lbfgs')

In [25]:
cross_validate(clf_lr_decom, train_x_decom, train_y_decom, cv=3, scoring=['accuracy', 'precision', 'recall'])

{'fit_time': array([0.10808778, 0.06663299, 0.05255103]),
 'score_time': array([0.00905585, 0.00542498, 0.00459599]),
 'test_accuracy': array([0.59278689, 0.57452397, 0.57518056]),
 'test_precision': array([0.69767442, 0.52777778, 0.53535354]),
 'test_recall': array([0.09160305, 0.08715596, 0.08103976])}

In [26]:
clf_lr_decom.fit(train_x_decom, train_y_decom)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
print(classification_report(clf_lr_decom.predict(test_x_decom), test_y_decom))

              precision    recall  f1-score   support

         0.0       0.96      0.58      0.72      1067
         1.0       0.10      0.70      0.18        76

    accuracy                           0.58      1143
   macro avg       0.53      0.64      0.45      1143
weighted avg       0.91      0.58      0.68      1143



In [28]:
joblib.dump(model, 'nmf.joblib')

['nmf.joblib']

In [37]:
joblib.dump(clf_lr_decom, 'clf_lr_decom.joblib')

['clf_lr_decom.joblib']

- 次元削減によって精度が下がった
- 特に、男女の予測の偏りが強くなった
- trainingでrecallが低くなっているので、Recallが高くなるようにSVCでパラメータサーチしてみる

In [29]:
parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
]

In [31]:
clf_svc_decom = GridSearchCV( SVC() , parameters, cv=3, scoring='recall_macro')

In [32]:
clf_svc_decom.fit(train_x_decom, train_y_decom)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall_macro', verbose=0)

In [33]:
clf_svc_decom.best_params_

{'C': 1000, 'kernel': 'linear'}

In [35]:
print(classification_report(clf_svc_decom.predict(train_x_decom), train_y_decom))

              precision    recall  f1-score   support

         0.0       0.91      0.66      0.76      3591
         1.0       0.37      0.75      0.50       980

    accuracy                           0.68      4571
   macro avg       0.64      0.70      0.63      4571
weighted avg       0.79      0.68      0.71      4571



In [39]:
print(classification_report(clf_svc_decom.predict(test_x_decom), test_y_decom))

              precision    recall  f1-score   support

         0.0       0.89      0.61      0.72       927
         1.0       0.28      0.67      0.40       216

    accuracy                           0.62      1143
   macro avg       0.59      0.64      0.56      1143
weighted avg       0.77      0.62      0.66      1143



In [54]:
joblib.dump(clf_svc_decom, 'clf_svc_decom.joblib')

['clf_svc_decom.joblib']

# ダウンサンプリングする

- 男性に偏ったのは、訓練データに男性のデータが多いからと考えられる
- 訓練データ内の男性と女性のデータがおんなじになるようにする

In [46]:
df_train_decom = pandas.DataFrame(train_x_decom)

In [47]:
df_train_decom['gender_id'] = train_y_decom

In [48]:
df_train_decom_male = df_train_decom[df_train_decom['gender_id'] == 0]
df_train_decom_female = df_train_decom[df_train_decom['gender_id'] == 1]

In [50]:
df_train_decom_male = df_train_decom_male.take(numpy.random.permutation(len(df_train_decom_male)))
df_train_decom_male = df_train_decom_male.iloc[:len(df_train_decom_female), :].reset_index(drop = True)

In [51]:
df_train_decom_ds = pandas.concat([df_train_decom_male, df_train_decom_female])

In [52]:
train_x_decom_ds = df_train_decom_ds[df_train_decom_ds.columns[:-1]].values
train_y_decom_ds = df_train_decom_ds['gender_id'].values

In [53]:
clf_lr_decom_ds = LogisticRegression(solver='lbfgs')

In [55]:
cross_validate(clf_lr_decom_ds, train_x_decom_ds, train_y_decom_ds, cv=3, scoring=['accuracy', 'precision', 'recall'])

{'fit_time': array([0.09034324, 0.06142783, 0.05270076]),
 'score_time': array([0.00762963, 0.00433731, 0.00418115]),
 'test_accuracy': array([0.54503817, 0.55657492, 0.55351682]),
 'test_precision': array([0.53134963, 0.53830228, 0.53564155]),
 'test_recall': array([0.76335878, 0.79510703, 0.80428135])}

In [56]:
clf_lr_decom_ds.fit(train_x_decom_ds, train_y_decom_ds)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [57]:
print(classification_report(clf_lr_decom_ds.predict(test_x_decom), test_y_decom))

              precision    recall  f1-score   support

         0.0       0.39      0.69      0.50       364
         1.0       0.77      0.50      0.61       779

    accuracy                           0.56      1143
   macro avg       0.58      0.60      0.55      1143
weighted avg       0.65      0.56      0.57      1143



In [58]:
train_x_decom_ds.shape

(3926, 300)

In [59]:
clf_svc_decom_ds = GridSearchCV( SVC() , parameters, cv=3, scoring='recall_macro')

In [60]:
clf_svc_decom_ds.fit(train_x_decom_ds, train_y_decom_ds)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall_macro', verbose=0)

In [61]:
print(classification_report(clf_svc_decom_ds.predict(train_x_decom_ds), train_y_decom_ds))

              precision    recall  f1-score   support

         0.0       0.59      0.73      0.65      1585
         1.0       0.78      0.65      0.71      2341

    accuracy                           0.68      3926
   macro avg       0.68      0.69      0.68      3926
weighted avg       0.70      0.68      0.69      3926



In [62]:
print(classification_report(clf_svc_decom_ds.predict(test_x_decom), test_y_decom))

              precision    recall  f1-score   support

         0.0       0.55      0.68      0.61       518
         1.0       0.67      0.54      0.60       625

    accuracy                           0.61      1143
   macro avg       0.61      0.61      0.61      1143
weighted avg       0.62      0.61      0.60      1143



In [77]:
joblib.dump(clf_svc_decom_ds, 'clf_svc_decom_ds.joblib')

['clf_svc_decom_ds.joblib']

# 予測対象を絞ってみる

- いまは全データをやっている
- しかし、クリック数が少ない時は、予測がし辛いのでは？

In [64]:
user_click_num = numpy.sum(x, axis=1)

In [67]:
print(classification_report(clf_svc_decom_ds.predict(x_decom[numpy.where(user_click_num >= 10)]), y[numpy.where(user_click_num >= 10)]))

              precision    recall  f1-score   support

         0.0       0.77      0.85      0.81       984
         1.0       0.79      0.69      0.74       810

    accuracy                           0.78      1794
   macro avg       0.78      0.77      0.77      1794
weighted avg       0.78      0.78      0.77      1794



In [68]:
print(classification_report(clf_svc_decom_ds.predict(x_decom[numpy.where(user_click_num < 5)]), y[numpy.where(user_click_num < 5)]))

              precision    recall  f1-score   support

         0.0       0.42      0.68      0.52       971
         1.0       0.76      0.52      0.62      1882

    accuracy                           0.58      2853
   macro avg       0.59      0.60      0.57      2853
weighted avg       0.65      0.58      0.59      2853



In [69]:
train_x_decom_clk, test_x_decom_clk, train_y_decom_clk, test_y_decom_clk = train_test_split(x_decom[numpy.where(user_click_num >= 10)], y[numpy.where(user_click_num >= 10)], test_size=0.2, random_state=23)

In [71]:
clf_svc_decom_clk = GridSearchCV( SVC() , parameters, cv=3, scoring='recall_macro')

In [72]:
clf_svc_decom_clk.fit(train_x_decom_clk, train_y_decom_clk)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall_macro', verbose=0)

In [73]:
print(classification_report(clf_svc_decom_clk.predict(train_x_decom_clk), train_y_decom_clk))

              precision    recall  f1-score   support

         0.0       0.89      0.83      0.86       926
         1.0       0.72      0.82      0.76       509

    accuracy                           0.82      1435
   macro avg       0.81      0.82      0.81      1435
weighted avg       0.83      0.82      0.82      1435



In [74]:
print(classification_report(clf_svc_decom_clk.predict(test_x_decom_clk), test_y_decom_clk))

              precision    recall  f1-score   support

         0.0       0.83      0.78      0.81       237
         1.0       0.62      0.69      0.65       122

    accuracy                           0.75       359
   macro avg       0.73      0.74      0.73       359
weighted avg       0.76      0.75      0.75       359



In [75]:
joblib.dump(clf_svc_decom_clk, 'clf_svc_decom_clk.joblib')

['clf_svc_decom_clk.joblib']

# 次元削減する時点で、データを絞ってみる

In [188]:
x_clk = x[numpy.where(user_click_num >= 10)]
y_clk = y[numpy.where(user_click_num >= 10)]

In [185]:
model_clk = NMF(n_components=300)

In [189]:
x_clk_decom = model_clk.fit_transform(x_clk)

In [190]:
train_x_clk_decom, test_x_clk_decom, train_y_clk_decom, test_y_clk_decom = train_test_split(x_clk_decom, y_clk, test_size=0.2, random_state=23)

In [191]:
clf_lr_clk_decom = LogisticRegression(solver='lbfgs')

In [192]:
cross_validate(clf_lr_clk_decom, train_x_clk_decom, train_y_clk_decom, cv=3, scoring=['accuracy', 'precision', 'recall'])

{'fit_time': array([0.03401017, 0.02101588, 0.03772116]),
 'score_time': array([0.005198  , 0.00380397, 0.00357699]),
 'test_accuracy': array([0.60751566, 0.62343096, 0.61924686]),
 'test_precision': array([0.54761905, 0.65789474, 0.72727273]),
 'test_recall': array([0.11979167, 0.13020833, 0.08333333])}

In [194]:
df_train_clk_decom = pandas.DataFrame(train_x_clk_decom)
df_train_clk_decom['gender_id'] = train_y_clk_decom
df_train_clk_decom_male = df_train_clk_decom[df_train_clk_decom['gender_id'] == 0]
df_train_clk_decom_female = df_train_clk_decom[df_train_clk_decom['gender_id'] == 1]
df_train_clk_decom_male = df_train_clk_decom_male.take(numpy.random.permutation(len(df_train_clk_decom_male)))
df_train_clk_decom_male = df_train_clk_decom_male.iloc[:len(df_train_clk_decom_female), :].reset_index(drop = True)
df_train_clk_decom_ds = pandas.concat([df_train_clk_decom_male, df_train_clk_decom_female])

In [196]:
train_x_clk_decom_ds = df_train_clk_decom_ds[df_train_clk_decom_ds.columns[:-1]].values
train_y_clk_decom_ds = df_train_clk_decom_ds['gender_id'].values

In [197]:
clf_lr_clk_decom_ds = LogisticRegression(solver='lbfgs')

In [198]:
cross_validate(clf_lr_clk_decom_ds, train_x_clk_decom_ds, train_y_clk_decom_ds, cv=3, scoring=['accuracy', 'precision', 'recall'])

{'fit_time': array([0.0225842 , 0.02065611, 0.01859379]),
 'score_time': array([0.00286174, 0.00256801, 0.00331807]),
 'test_accuracy': array([0.65885417, 0.58333333, 0.59375   ]),
 'test_precision': array([0.70748299, 0.56666667, 0.59473684]),
 'test_recall': array([0.54166667, 0.70833333, 0.58854167])}

In [199]:
clf_lr_clk_decom_ds.fit(train_x_clk_decom_ds, train_y_clk_decom_ds)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [200]:
print(classification_report(clf_lr.predict(test_x_clk_decom), test_y_clk_decom))

              precision    recall  f1-score   support

         0.0       0.44      0.60      0.51       163
         1.0       0.52      0.36      0.42       196

    accuracy                           0.47       359
   macro avg       0.48      0.48      0.46       359
weighted avg       0.48      0.47      0.46       359



# まとめ

- ベクトル表現にして予測
- 次元削減すると、精度下がった
     - 予測結果の偏りが発生
     - 疎でなくなった分、偏りの影響が大きくなった？
- ダウンサンプリングすると精度上がった
- クリック数が高いユーザの精度が高い
- クリック数が高いユーザに絞って学習すると精度があがる
- クリック数が高いユーザに絞って次元削減してから学習すると精度が下がる
     - 次元削減のような全体の傾向が必要な場合は絞らないほうが良かった