<a href="https://colab.research.google.com/github/tomonari-masada/course2025-sml/blob/main/06_linear_regression_1_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# プランナー課題６

* RMSEで評価される予測性能を、できるだけ改善してください
* test setとそれ以外の部分の分割は、変えないでください
  * test set以外の部分をどう使うかは、自由です。
  * training setとvalidation setを結合して、交差検証をしてもかまいません。
* その他、いろいろ試行錯誤してみてください。
  * リッジ回帰とLassoを使ってもいいです
  * 高次多項式特徴量を使ってもいいです（cf. `sklearn.preprocessing.PolynomialFeatures`）
* 予測手法のチューニングを尽くした上で、最後にtest setでのRMSEによる評価を実施してください。
  * test setでの評価結果を見て、チューニングに戻ってはいけません。

## 解答例

In [None]:
from tqdm.auto import tqdm
import numpy as np
from scipy import stats, special
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures

# Set the inline plotting backend's figure format to 'retina'.
%config InlineBackend.figure_format = 'retina'

# Seed NumPy's global RNG. The KFold random_state values used for
# cross-validation are drawn from it with np.random.randint, so this makes
# the fold assignments reproducible when the notebook is run top to bottom.
np.random.seed(42)

In [None]:
# Download the housing dataset archive (shell escape; requires network access).
!wget https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz
# Unpack the archive into the working directory; the next cell reads
# housing.csv, which is presumably extracted from it.
!tar zxvf housing.tgz

In [None]:
# Read the raw table and expand its categorical column(s) into 0/1 integer
# indicator columns.
df = pd.read_csv("housing.csv")
df_onehot = pd.get_dummies(df, dtype=int)

# Target: median house value. Features: every remaining column.
y = df_onehot["median_house_value"].copy()
X = df_onehot.drop(columns=['median_house_value'])

In [None]:
# First split: hold out 20% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=1234,
)
# Second split: carve 25% of the remainder off as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
  X_train,
  y_train,
  test_size=0.25,
  random_state=1234,
)

### 交差検証をするために、training setとvalidation setを合併する。

In [None]:
# Stack the validation rows under the training rows (row-wise concatenation)
# so that cross-validation can draw its folds from all non-test data.
X_train = pd.concat((X_train, X_val), axis=0)
y_train = pd.concat((y_train, y_val), axis=0)

### 目的変数の対数をとる。

In [None]:
# Work with the natural log of the target from here on. Predictions are
# mapped back with np.exp before RMSE is computed.
y_train, y_test = np.log(y_train), np.log(y_test)

### 10-foldの交差検証を3回おこなう。

In [None]:
# Three independent shuffled 10-fold splitters. Each one gets its own
# random_state drawn (in order) from NumPy's global RNG.
kfold = [
  KFold(n_splits=10, shuffle=True, random_state=np.random.randint(1, 10000))
  for _ in range(3)
]

### リッジ回帰＋min-maxスケーリング＋2次多項式特徴量

In [None]:
# Ridge regression on min-max scaled features expanded to degree-2 polynomial
# terms. Every alpha is scored by RMSE over each fold of each splitter in
# `kfold`; the mean and standard deviation across all folds are printed.
scaler = MinMaxScaler()
poly = PolynomialFeatures(2)

for alpha in 10. ** np.arange(-6, 2):
  reg = Ridge(alpha=alpha, random_state=123)
  print(f"---- Ridge regression for alpha={alpha:.2e}")
  rmses = []
  for splitter in tqdm(kfold):
    for fit_rows, holdout_rows in splitter.split(X_train):
      X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
      X_holdout, y_holdout = X_train.iloc[holdout_rows], y_train.iloc[holdout_rows]

      # Impute missing total_bedrooms with the median of the fitting rows
      # only, and reuse that same value for the held-out rows.
      bedrooms_fill = {'total_bedrooms': X_fit["total_bedrooms"].median()}
      X_fit = X_fit.fillna(bedrooms_fill)
      X_holdout = X_holdout.fillna(bedrooms_fill)

      # The scaler and the polynomial expansion are refit on each fold's
      # fitting rows and merely applied to the held-out rows.
      fit_features = poly.fit_transform(scaler.fit_transform(X_fit))
      holdout_features = poly.transform(scaler.transform(X_holdout))

      reg.fit(fit_features, y_fit)
      # Cap predictions at the largest target value seen while fitting.
      capped_pred = np.minimum(reg.predict(holdout_features), y_fit.max())
      # Targets are logs at this point, so exponentiate before scoring.
      rmses.append(root_mean_squared_error(np.exp(y_holdout), np.exp(capped_pred)))
  rmses = np.array(rmses)
  print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

### リッジ回帰＋min-maxスケーリング＋3次多項式特徴量

In [None]:
# Same cross-validated alpha sweep as for degree 2, but with degree-3
# polynomial features and a smaller range of regularisation strengths.
scaler = MinMaxScaler()
poly = PolynomialFeatures(3)

for alpha in 10. ** np.arange(-8, -2):
  reg = Ridge(alpha=alpha, random_state=123)
  print(f"---- Ridge regression for alpha={alpha:.2e}")
  rmses = []
  for splitter in tqdm(kfold):
    for fit_rows, holdout_rows in splitter.split(X_train):
      X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
      X_holdout, y_holdout = X_train.iloc[holdout_rows], y_train.iloc[holdout_rows]

      # The median used for imputation comes from the fitting rows only.
      bedrooms_fill = {'total_bedrooms': X_fit["total_bedrooms"].median()}
      X_fit = X_fit.fillna(bedrooms_fill)
      X_holdout = X_holdout.fillna(bedrooms_fill)

      # Fit the preprocessing on the fitting rows; apply it to the held-out rows.
      fit_features = poly.fit_transform(scaler.fit_transform(X_fit))
      holdout_features = poly.transform(scaler.transform(X_holdout))

      reg.fit(fit_features, y_fit)
      # Predictions above the largest fitted target are pulled down to it.
      capped_pred = np.minimum(reg.predict(holdout_features), y_fit.max())
      # Undo the log transform on both sides before computing RMSE.
      rmses.append(root_mean_squared_error(np.exp(y_holdout), np.exp(capped_pred)))
  rmses = np.array(rmses)
  print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

### リッジ回帰＋min-maxスケーリング＋3次多項式特徴量（再）
* 交差検証のfold分割（`KFold`の乱数シード）を変えて、もう一度評価する。test setの分割は変えない。

In [None]:
# Replace the splitters with three new shuffled 10-fold splitters, each
# seeded by the next draw from NumPy's global RNG, so the folds differ from
# the previous run.
kfold = [
  KFold(n_splits=10, shuffle=True, random_state=np.random.randint(1, 10000))
  for _ in range(3)
]

In [None]:
# Repeat the degree-3 Ridge alpha sweep using whatever splitters are
# currently stored in `kfold`.
scaler = MinMaxScaler()
poly = PolynomialFeatures(3)

for alpha in 10. ** np.arange(-8, -2):
  reg = Ridge(alpha=alpha, random_state=123)
  print(f"---- Ridge regression for alpha={alpha:.2e}")
  rmses = []
  for splitter in tqdm(kfold):
    for fit_rows, holdout_rows in splitter.split(X_train):
      X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
      X_holdout, y_holdout = X_train.iloc[holdout_rows], y_train.iloc[holdout_rows]

      # Impute total_bedrooms using a statistic of the fitting rows only.
      bedrooms_fill = {'total_bedrooms': X_fit["total_bedrooms"].median()}
      X_fit = X_fit.fillna(bedrooms_fill)
      X_holdout = X_holdout.fillna(bedrooms_fill)

      # Preprocessing is learned per fold from the fitting rows.
      fit_features = poly.fit_transform(scaler.fit_transform(X_fit))
      holdout_features = poly.transform(scaler.transform(X_holdout))

      reg.fit(fit_features, y_fit)
      # Limit predictions to at most the largest target seen while fitting.
      capped_pred = np.minimum(reg.predict(holdout_features), y_fit.max())
      # Score in the original (non-log) scale.
      rmses.append(root_mean_squared_error(np.exp(y_holdout), np.exp(capped_pred)))
  rmses = np.array(rmses)
  print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

* `alpha=1.0e-06`で良さそう。

## チューニング済みの予測手法をテストデータ上で評価

In [None]:
# Final evaluation: fit the chosen configuration (degree-3 features,
# alpha = 1e-6) on all non-test data and score it once on the test set.
scaler = MinMaxScaler()
poly = PolynomialFeatures(3)

alpha = 10. ** -6
reg = Ridge(alpha=alpha, random_state=123)

# Imputation value, scaling and polynomial expansion are all derived from the
# training data and then applied unchanged to the test data.
bedrooms_fill = {'total_bedrooms': X_train["total_bedrooms"].median()}
train_features = poly.fit_transform(scaler.fit_transform(X_train.fillna(bedrooms_fill)))
test_features = poly.transform(scaler.transform(X_test.fillna(bedrooms_fill)))

reg.fit(train_features, y_train)
# Cap predictions at the largest training target.
y_test_pred = np.minimum(reg.predict(test_features), y_train.max())
# y_train / y_test hold log values here, so exponentiate before scoring.
rmse = root_mean_squared_error(np.exp(y_test), np.exp(y_test_pred))
print(f'my test RMSE: {rmse:.1f}')

### baseline（何の工夫もない線形回帰）のテスト性能を確認しておく。

In [None]:
# Baseline: reload the data from scratch and redo the same 80/20 split, so
# the targets are back in their original (non-log) scale.
df = pd.read_csv("housing.csv")
df_onehot = pd.get_dummies(df, dtype=int)
y = df_onehot["median_house_value"].copy()
X = df_onehot.drop(columns=['median_house_value'])
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=1234
)

# Plain linear regression on the raw features; the only preprocessing is
# median imputation of total_bedrooms using the training median.
reg = LinearRegression()
bedrooms_fill = {'total_bedrooms': X_train["total_bedrooms"].median()}
reg.fit(X_train.fillna(bedrooms_fill), y_train)

# Cap predictions at the largest training target, as in the tuned model.
y_test_pred = np.minimum(reg.predict(X_test.fillna(bedrooms_fill)), y_train.max())
rmse = root_mean_squared_error(y_test, y_test_pred)
print(f'baseline test RMSE: {rmse:.1f}')

* これに比べれば、かなり良くなっている。