In [25]:
import numpy as np
import pandas as pd
import seaborn as sns

import statsmodels.api as sma
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split,LeaveOneOut,cross_val_score,KFold,RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# hold-out

In [2]:
# Load seaborn's "tips" example dataset into a DataFrame.
df = sns.load_dataset('tips')
# Preview the first rows.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Target (response) column; every other column is kept as a candidate feature.
y_col = 'tip'
X = df.drop(y_col, axis=1)

In [4]:
# For reference only: extract the columns whose dtype is 'category'.
category_cols = X.select_dtypes(include='category')
category_cols.head()

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner


In [5]:
# Rebuild the raw feature frame from df instead of reading and then overwriting
# X: this keeps the cell idempotent. Re-running the original on an already
# one-hot-encoded X could change numeric_cols (e.g. on pandas versions that
# emit integer dummy columns).
X_raw = df.drop(columns=[y_col])

# Names of the numeric columns only; these are the ones standardized later.
numeric_cols = X_raw.select_dtypes(include=np.number).columns.to_list()

# One-hot encode the categorical columns. drop_first=True drops the first level
# of each category so the dummy columns are not perfectly collinear.
X = pd.get_dummies(X_raw, drop_first=True)
y = df[y_col]

In [6]:
# Hold-out split: 30% of the rows go to the test set; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [7]:
# Standardization is done after the split so that the scaler is fitted on the
# training data only (no test-set statistics leak into training).
scaler = StandardScaler()

# Work on copies so the unscaled X_train / X_test remain available.
X_train_scaled = X_train.copy()
# Standardize the numeric columns only; the one-hot dummy columns are left as-is.
X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])

# Standardize the test data with the mean / standard deviation learned from
# the training data.
X_test_scaled = X_test.copy()
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

# NOTE: no explicit constant column is added here. sklearn's LinearRegression
# fits an intercept by default, so sma.add_constant only introduced a redundant
# all-ones column. In addition, add_constant's default has_constant='skip' does
# not add the column when it detects an existing constant column, which could
# leave the train and test frames with different columns.

In [8]:
# Linear regression fitted on the standardized training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for the standardized hold-out test split.
y_pred = model.predict(X_test_scaled)

In [9]:
# Evaluate the model with the mean squared error (MSE) on the test split.
# Equivalent to: np.mean(np.square(y_test - y_pred))
mean_squared_error(y_test, y_pred)

0.9550808988617153

# LOOCV（Leave-One-Out Cross Validation）

In [10]:
# Data preparation: tip is the target, total_bill is the single feature.
y = df['tip']
# Selecting with a one-element list keeps the data two-dimensional, so
# to_numpy() yields the (n_samples, 1) array scikit-learn expects.
X = df[['total_bill']].to_numpy()

## LOOによる実装

In [11]:
# Leave-one-out CV written out by hand: each sample is the test set exactly once.
loo = LeaveOneOut()
mse_list = []
model = LinearRegression()

for train_index, test_index in loo.split(X):
    # split() yields positional indices. X is a NumPy array, so plain indexing
    # is positional; y is a pandas Series, so use .iloc to index by position
    # instead of relying on the index labels happening to equal the positions.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit on the training rows.
    model.fit(X_train, y_train)
    # Predict the single held-out row.
    y_pred = model.predict(X_test)
    # Record this fold's MSE.
    mse_list.append(mean_squared_error(y_test, y_pred))

In [12]:
# Model accuracy: mean of the per-fold MSE values collected in mse_list.
np.mean(mse_list)

1.0675673489857438

## cross_val_scoreによる実装

In [13]:
# cross_val_score runs the whole leave-one-out loop in a single call.
model = LinearRegression()
cv = LeaveOneOut()

scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv)
# The scorer returns negated MSE values, so flip the sign before averaging.
(-scores).mean()

1.0675673489857438

# K-Fold CV（K-Fold Cross Validation）

In [14]:
# Data preparation: tip is the target, total_bill is the single feature.
y = df['tip']
# A one-element column list keeps the selection 2-D: shape (n_samples, 1).
X = df[['total_bill']].to_numpy()

## k-Fold CVによる実装

In [15]:
# 5-fold CV written out by hand; rows are shuffled with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
mse_list = []
model = LinearRegression()

for train_index, test_index in kf.split(X):
    # split() yields positional indices. X is a NumPy array, so plain indexing
    # is positional; y is a pandas Series, so use .iloc to index by position
    # instead of relying on the index labels happening to equal the positions.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit on the training folds.
    model.fit(X_train, y_train)
    # Predict the held-out fold.
    y_pred = model.predict(X_test)
    # Record this fold's MSE.
    mse_list.append(mean_squared_error(y_test, y_pred))

In [16]:
# Model accuracy: mean of the per-fold MSE values collected in mse_list.
np.mean(mse_list)

1.080211088394392

## cross_val_scoreによる実装

In [17]:
# Same 5-fold evaluation, but cross_val_score replaces the explicit loop.
model = LinearRegression()
cv = KFold(n_splits=5, shuffle=True, random_state=0)

scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv)
# The scorer returns negated MSE values, so flip the sign before averaging.
(-scores).mean()

1.080211088394392

## Repeated k-Fold CVによる実装
データ分割はランダムに行われるため、分割を変えて複数回実行し、得られた評価値（例：MSE）の平均値で評価することが多い。

In [23]:
mse_list = []
model = LinearRegression()

# 5 splits x 50 repeats; random_state fixes the sequence of splits.
rkf = RepeatedKFold(n_splits=5, n_repeats=50, random_state=0)

for train_index, test_index in rkf.split(X):
    # split() yields positional indices. X is a NumPy array, so plain indexing
    # is positional; y is a pandas Series, so use .iloc to index by position
    # instead of relying on the index labels happening to equal the positions.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit on the training folds.
    model.fit(X_train, y_train)
    # Predict the held-out fold.
    y_pred = model.predict(X_test)
    # Record this fold's MSE.
    mse_list.append(mean_squared_error(y_test, y_pred))

In [24]:
# Model accuracy: mean of the MSE values over every fold of every repeat.
np.mean(mse_list)

1.073638147035722

# Pipeline

## 標準化をk-Fold CVに組み込む

In [28]:
# Chain standardization and linear regression into a single estimator.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])

In [29]:
# 5-fold CV of the whole pipeline (scaler + model), shuffled with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=0)

scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=cv)
# The scorer returns negated MSE values, so flip the sign before averaging.
(-scores).mean()

1.0802110883943916

## Pipelineの挙動を確認

In [51]:
# Without Pipeline: every step is performed by hand.

# Hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardization: learn mean / standard deviation from the training data only,
# then apply those same statistics to both the training and the test data.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the model on the scaled training data.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict on the scaled test data.
y_pred = model.predict(X_test_scaled)

# Evaluate (MSE).
mean_squared_error(y_test, y_pred)

0.8711845537539947

In [50]:
# With Pipeline: scaling and regression are handled by one estimator.

# Hold-out split (same split as the manual version: same seed and test size).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])

# fit() returns the pipeline itself, so fitting on the training split and
# predicting on the test split can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Evaluate (MSE).
mean_squared_error(y_test, y_pred)

0.8711845537539947