# hold-out

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import statsmodels.api as sma
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the seaborn "tips" example dataset and preview the first rows
df = sns.load_dataset(name='tips')
df.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Name of the target column
y_col = 'tip'
# Feature frame: every column of df except the target
X = df.drop(columns=y_col)

In [4]:
# For reference: preview only the category-dtype feature columns
category_cols = X.select_dtypes(include=['category'])
category_cols.head(5)

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner


In [5]:
# Rebuild the raw feature frame from df on every run so this cell is idempotent:
# it never reads an X that a previous run of this same cell already encoded.
# (With pandas versions whose get_dummies emits integer dummies, re-running
# the old self-referential form would add the dummy columns to numeric_cols.)
features_raw = df.drop(columns=[y_col])

# Names of the numeric columns, captured before encoding, for standardization later
numeric_cols = features_raw.select_dtypes(include=np.number).columns.to_list()

# One-hot encode the categorical columns; drop_first=True drops the first
# level of each category
X = pd.get_dummies(features_raw, drop_first=True)
y = df[y_col]

In [8]:
# Hold-out split: 30% of the rows are held back for testing;
# random_state is pinned so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [9]:
# Standardization: the scaler is fitted on the training data only,
# which is why this happens after the split
scaler = StandardScaler()

# Work on a copy so the unscaled training split stays available
X_train_scaled = X_train.copy()
# Standardize the numeric columns only; the one-hot columns are left as-is
X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
# has_constant='add' always appends the 'const' column. The default ('skip')
# omits it when the frame already contains a constant column, which could
# leave the train and test frames with different columns.
# NOTE: LinearRegression fits its own intercept by default, so 'const' is
# redundant for sklearn; it is kept so the frames retain the same columns as before.
X_train_scaled = sma.add_constant(X_train_scaled, has_constant='add')

# Apply the same treatment to the test split
X_test_scaled = X_test.copy()
# Scale with the mean / standard deviation learned from the training data
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])
X_test_scaled = sma.add_constant(X_test_scaled, has_constant='add')

# Both design matrices must expose identical columns in identical order
assert list(X_train_scaled.columns) == list(X_test_scaled.columns)

In [10]:
# Linear regression model, fitted on the standardized training split
# (fit() returns the estimator itself, so construction and fitting are chained)
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict on the held-out test split
y_pred = model.predict(X_test_scaled)

In [11]:
# Accuracy on the test split, measured as mean squared error (MSE);
# equivalent to np.mean(np.square(y_test - y_pred))
mean_squared_error(y_true=y_test, y_pred=y_pred)

0.9550808988617153