## セクション８　汎化性能と過学習
- hold-out
- LOOCV
- K-fold  
※特徴量スケーリングでテストデータを処理するときは、学習データのパラメータを使用


In [59]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold,RepeatedKFold
from sklearn.pipeline import Pipeline

### hold-out法

In [10]:
df = sns.load_dataset('tips')
y_col = 'tip'
X = df.drop(columns=[y_col])
# 標準化のために数値カラムのリストを取得
numeric_cols = X.select_dtypes(include=np.number).columns.to_list()
X = pd.get_dummies(X, drop_first=True)
y = df[y_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [11]:
# 分割後のデータ数を確認
print(len(X_train))
print(len(X_test))

170
74


In [16]:
# 標準化
# 標準化は，データ分割の後に実施する
sc = StandardScaler()

#　元のX＿trainを更新すると厄介なので、コピーする
X_train_sc = X_train.copy()
#　ダミー変数以外の数値データを標準化する
X_train_sc[numeric_cols] = sc.fit_transform(X_train[numeric_cols]) 

X_test_sc = X_test.copy()
#　すでに学習データすでに学習データでフィットさせているので、transformだけで良い。
# fitさせるとテストデータを使って標準化することになってしまう。
X_test_sc[numeric_cols] = sc.transform(X_test[numeric_cols])


In [19]:
# 線形回帰モデル学習
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_sc, y_train)
y_pred = model.predict(X_test_sc)

In [21]:
# MSEの算出
mean_squared_error(y_test,y_pred)

0.955080898861715

### LOOCV(Leave-one-out)
- 1.LOOモジュール
- 2.Cross validationに分けて実行

In [45]:
##　自分で実装
df = sns.load_dataset('tips')
y_col = 'tip'
X = df['total_bill'].values.reshape(-1,1)
y = df[y_col]


loo = LeaveOneOut()
model = LinearRegression()
y_preds = []
#　学習データとテストデータのインデックスを出力
for train_idx,test_idx in loo.split(X):
    X_train,X_test = X[train_idx],X[test_idx]
    y_train,y_test = y[train_idx],y[test_idx]
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    y_preds.append(y_pred)
mse = mean_squared_error(y_test,y_pred)
mse

0.011624611173673088

## LOOCV模範解答

In [46]:
model = LinearRegression()
mse_list = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # モデル学習
    model.fit(X_train, y_train)
    # テストデータの予測
    y_pred = model.predict(X_test)
    # MSE
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)
print(f"MSE(LOOCV):{np.mean(mse_list)}")
print(f"std:{np.std(mse_list)}")

MSE(LOOCV):1.0675673489857438
std:2.0997944551776313


### Cross_val_scoreを使う方法

In [47]:
cv = LeaveOneOut()
scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')
print(f"MSE(LOOCV):{-np.mean(scores)}")
print(f"std:{np.std(scores)}")

MSE(LOOCV):1.0675673489857438
std:2.0997944551776313


## K-fold CV
- 一番使われる評価指標
- LOOCVよりコストが低い
- データをk個に分割して交差検証を行う

In [53]:
##　自分で実装
df = sns.load_dataset('tips')
y_col = 'tip'
X = df.drop(columns=[y_col])
# 標準化のために数値カラムのリストを取得
numeric_cols = X.select_dtypes(include=np.number).columns.to_list()
X = pd.get_dummies(X, drop_first=True)
y = df[y_col]

model = LinearRegression()
cv = KFold(n_splits=5,shuffle=True ,random_state=0)
cross_val_score(estimator=model, X=X, y=y,cv=cv, scoring='neg_mean_squared_error')

# 同様にcross_val_scoreで簡単に実施可能
print(f"MSE 5 FoldCV): {-np.mean(scores)}")
print(f"std: {np.std(scores)}")

MSE 5 FoldCV): 1.168619658041242
std: 0.18808285556593823


### Repeated k-fold cv
- kFoldを複数回実施
- n_repeats引数に回数を指定
- そのほかはkfoldと同じ

In [58]:
R_cv = RepeatedKFold(n_splits=5,n_repeats=5,random_state=0)
mse_list = cross_val_score(estimator=model, X=X, y=y,cv=R_cv, scoring='neg_mean_squared_error')
mse_list


array([-0.89391952, -1.10133223, -1.41313748, -1.34453334, -1.09017572,
       -1.14902031, -1.58845751, -1.08426032, -0.79808926, -0.91573102,
       -0.92845031, -1.73725679, -0.72522842, -1.15657763, -1.19620839,
       -0.90981594, -1.26451304, -0.84320185, -0.9138454 , -1.50926067,
       -1.01472258, -0.73786051, -1.13736158, -1.33445047, -1.54803209])

## スクラッチで交差検証を実装すると標準化をfor文中に組み込めるが、cross_val_score()を使うと組み込めない
### Pipelineの構築
- Piplelineオブジェクトを使うことで複数の処理をまとめることができる  
 引数stepsに[('処理名１',クラス()),('処理名２',クラス()),...])の形で記述

### pipeline + K-fold

In [60]:
pipeline = Pipeline(steps=[('scaler',StandardScaler()),('model',LinearRegression())])
cv = KFold(n_splits=5,shuffle=True,random_state=0)
scores =cross_val_score(pipeline,X,y,cv=cv,scoring='neg_mean_squared_error')
scores

array([-0.89391952, -1.10133223, -1.41313748, -1.34453334, -1.09017572])

In [64]:
## Pipelineなし
# 標準化 + 線形回帰
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model = LinearRegression()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
y_pred

array([2.82249035, 2.97504474, 2.8260184 , 1.38113692, 3.15154584,
       1.72121268, 2.48332645, 3.03579004, 2.75176346, 4.52560955,
       3.1133346 , 3.14781575, 2.33198109, 2.11518372, 2.93262778,
       4.27846609, 1.83157994, 2.26626275, 2.31085596, 3.24382161,
       3.81889336, 2.85616455, 2.42949782, 2.42039736, 2.20253234,
       2.42509643, 2.81777778, 4.70274951, 3.81268552, 2.38673795,
       2.29194112, 2.20803273, 2.45503466, 1.7743294 , 2.71663745,
       2.22913684, 2.72146912, 2.01205852, 5.85346207, 3.49435578,
       2.26246168, 2.20347519, 2.50905642, 4.41646769, 1.97212663,
       2.78445294, 2.65274212, 3.01652357, 2.73423023, 3.95761528,
       3.9498931 , 2.53992971, 2.71758399, 6.35620823, 1.7434279 ,
       2.33450139, 4.23562521, 3.29319236, 2.41114285, 2.20345847,
       3.72455103, 2.29099827, 3.04008335, 3.74539008, 4.01431996,
       2.26547605, 2.66047323, 3.84238482, 2.17921165, 3.87859588,
       2.59899485, 1.94814647, 3.70801825, 2.11341037])

In [65]:
# pipelineあり
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
pipeline = Pipeline(steps=[('scaler',StandardScaler()),('model',LinearRegression())])
pipeline.fit(X_train,y_train)
y_pred_p = pipeline.predict(X_test)
y_pred_p

array([2.82249035, 2.97504474, 2.8260184 , 1.38113692, 3.15154584,
       1.72121268, 2.48332645, 3.03579004, 2.75176346, 4.52560955,
       3.1133346 , 3.14781575, 2.33198109, 2.11518372, 2.93262778,
       4.27846609, 1.83157994, 2.26626275, 2.31085596, 3.24382161,
       3.81889336, 2.85616455, 2.42949782, 2.42039736, 2.20253234,
       2.42509643, 2.81777778, 4.70274951, 3.81268552, 2.38673795,
       2.29194112, 2.20803273, 2.45503466, 1.7743294 , 2.71663745,
       2.22913684, 2.72146912, 2.01205852, 5.85346207, 3.49435578,
       2.26246168, 2.20347519, 2.50905642, 4.41646769, 1.97212663,
       2.78445294, 2.65274212, 3.01652357, 2.73423023, 3.95761528,
       3.9498931 , 2.53992971, 2.71758399, 6.35620823, 1.7434279 ,
       2.33450139, 4.23562521, 3.29319236, 2.41114285, 2.20345847,
       3.72455103, 2.29099827, 3.04008335, 3.74539008, 4.01431996,
       2.26547605, 2.66047323, 3.84238482, 2.17921165, 3.87859588,
       2.59899485, 1.94814647, 3.70801825, 2.11341037])