<a href="https://colab.research.google.com/github/tomonari-masada/course2023-sml/blob/main/07_linear_regression_1_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 課題7

* RMSEによって評価される予測性能を、良くして下さい
* test setとそれ以外の部分の分割は、変えないでください
 * test set以外の部分をどう使うかは、自由です。
 * training setとvalidation setをくっつけて、交差検証をしていいです。
* リッジ回帰とLassoを使ってもいいです
* 高次多項式特徴量を使ってもいいです（cf. `sklearn.preprocessing.PolynomialFeatures`）
* test setでのRMSEによる評価は最後に一回おこなうだけです

In [None]:
import numpy as np
from scipy import stats, special
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures

%config InlineBackend.figure_format = 'retina'

In [None]:
import os
import tarfile
from six.moves import urllib

# Base URL of the repository that hosts the dataset archive.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Local directory (relative path) where the archive is stored and unpacked.
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the housing tarball.
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
  """Download the housing tarball and unpack it into ``housing_path``.

  Args:
    housing_url: URL of the ``housing.tgz`` archive to download.
    housing_path: Directory that receives the archive and its extracted
      contents; it is created if it does not exist.
  """
  os.makedirs(housing_path, exist_ok=True)
  tgz_path = os.path.join(housing_path, "housing.tgz")
  urllib.request.urlretrieve(housing_url, tgz_path)
  # The context manager closes the archive even if extraction raises.
  # NOTE: extractall() trusts the member paths stored in the archive, so this
  # should only be pointed at a trusted URL.
  with tarfile.open(tgz_path) as housing_tgz:
    housing_tgz.extractall(path=housing_path)

In [None]:
# Download and unpack the dataset using the default URL and target directory.
fetch_housing_data()

In [None]:
def load_housing_data(housing_path=HOUSING_PATH):
  """Read ``housing.csv`` located in ``housing_path`` with pandas and return the result."""
  return pd.read_csv(os.path.join(housing_path, "housing.csv"))

（ここより上の詳細はフォローしなくてもいいです。）

In [None]:
# Load the CSV into a DataFrame and show its first rows.
housing = load_housing_data()
housing.head()

In [None]:
# Column dtypes and non-null counts (reveals which columns have missing values).
housing.info()

## 1) `ocean_proximity`を0/1の数値データへ変換

* pandasの`get_dummies`を使って、カテゴリカル変数`ocean_proximity`の値を0/1の数値データに変換する。

In [None]:
# One-hot encode the categorical column. dtype=int forces 0/1 integer columns;
# without it pandas >= 2.0 returns boolean (True/False) columns.
housing_dummies = pd.get_dummies(housing['ocean_proximity'], dtype=int)

In [None]:
# Inspect the one-hot encoded columns.
housing_dummies.head()

In [None]:
# Keep every column except the categorical `ocean_proximity`.
housing_num = housing.drop(columns=['ocean_proximity'])

In [None]:
# Put the numeric columns and the one-hot columns side by side (column-wise concat).
housing = pd.concat([housing_num, housing_dummies], axis=1)

In [None]:
# Inspect the combined table.
housing.head()

In [None]:
# Features: all numeric columns except the target; target: median_house_value.
# NOTE(review): X is built from `housing_num`, not from `housing`, so the
# one-hot `ocean_proximity` columns are NOT part of the features — confirm
# whether this is intended.
X = housing_num.drop('median_house_value', axis=1)
y = housing_num["median_house_value"].copy()

## 2) テストデータの欠損値を訓練データの中央値で埋める
* 本当は、テストデータ全てについて予測をさせて評価すべきなので、欠損箇所を埋める。

In [None]:
# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Split 25% of the remaining 80% (i.e. 20% of all rows) off as the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [None]:
# Shapes of the three splits.
print(X_train.shape, X_valid.shape, X_test.shape)

In [None]:
# Non-null counts per column of the training split.
X_train.info()

In [None]:
# Non-null counts per column of the validation split.
X_valid.info()

In [None]:
# Non-null counts per column of the test split.
X_test.info()

* テストセットで欠測値を含むインスタンスを単に脱落させたものも作っておく。

In [None]:
# Boolean mask: True for test rows that contain at least one missing value.
na_index = X_test.isna().any(axis=1)
# Test set restricted to complete rows. y_test_original is taken here, before
# the target is log-transformed, so it stays on the original scale.
X_test_original = X_test[~ na_index]
y_test_original = y_test[~ na_index]

* 欠測箇所を中央値で埋める
 * テストデータにだけ、total_bedroomsの値が欠けているエントリがある
 * ここでは訓練データの中央値で埋めることにする。
 * 訓練データだけから得られる情報を使って埋めているので、問題はない。

In [None]:
# Median of total_bedrooms from the training split only (Series.median skips
# NaN), so the imputation uses no information from the validation/test data.
median_total_bedrooms = X_train['total_bedrooms'].median()

# Fill missing total_bedrooms in every split with that training median.
# fillna returns a new DataFrame, which avoids assigning into a slice produced
# by train_test_split, and is a no-op for a split without missing values.
# Filling the training/validation splits as well keeps the later model fits
# from failing should those splits contain NaN.
X_train = X_train.fillna({'total_bedrooms': median_total_bedrooms})
X_valid = X_valid.fillna({'total_bedrooms': median_total_bedrooms})
X_test = X_test.fillna({'total_bedrooms': median_total_bedrooms})

* 欠測箇所がなくなっていることを確認する。

In [None]:
# Re-check the non-null counts of the test split after imputation.
X_test.info()

## 3) 目的変数の対数をとる
* RMSEで評価するときに、np.exp()を使って元の値に戻す。

In [None]:
# Move all three target vectors to the log scale in one step.
y_train, y_valid, y_test = (np.log(target) for target in (y_train, y_valid, y_test))

## 4) 交差検証をしたいので訓練データと検証データを合併して一つにする

In [None]:
# Stack the validation rows under the training rows; X_train now holds both.
X_train = pd.concat([X_train, X_valid])

In [None]:
# Shape of the merged feature table.
print(X_train.shape)

In [None]:
# Merge the (log-scale) targets in the same order as the features.
y_train = pd.concat([y_train, y_valid])

In [None]:
# Shape of the merged target vector.
print(y_train.shape)

* 交差検証は10-foldで行う。

In [None]:
# 10-fold splitter with shuffling; the fixed seed makes the folds reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=123)

## 5) 特徴量を加工する
* `sklearn.preprocessing.PolynomialFeatures`を使う

### 5-1) 比較のためにまず元データのまま交差検証を行なう

* 正則化なしの線形回帰

In [None]:
# Baseline: cross-validate plain linear regression on the raw features,
# using the folds produced by `kf`.
reg = LinearRegression()
rmses = []
print('\tRMSE:', end=' ')
for train_index, valid_index in kf.split(X_train):
  reg.fit(X_train.values[train_index], y_train.values[train_index])
  y_valid_pred = reg.predict(X_train.values[valid_index])
  # Cap predictions at the largest log-target of this fold's training part.
  y_max = y_train.values[train_index].max()
  y_valid_pred[y_valid_pred > y_max] = y_max
  # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
  # which was removed from mean_squared_error in scikit-learn 1.6.
  rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
  rmses.append(rmse)
  print(f'{rmse:.3f}', end=', ')
print()
rmses = np.array(rmses)
# Mean and standard deviation of the per-fold RMSEs.
print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

* Ridge回帰

In [None]:
# Cross-validate ridge regression on the raw features for alpha = 1e-3 ... 1e3.
for alpha in 10.0 ** np.arange(-3, 4):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    reg.fit(X_train.values[train_index], y_train.values[train_index])
    y_valid_pred = reg.predict(X_train.values[valid_index])
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

* Lasso

In [None]:
# Cross-validate Lasso on the raw features for alpha = 1e-3 ... 1e3.
for alpha in 10.0 ** np.arange(-3, 4):
  reg = Lasso(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    reg.fit(X_train.values[train_index], y_train.values[train_index])
    y_valid_pred = reg.predict(X_train.values[valid_index])
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

### 5-2) 2次の項を追加する

* 正則化なしの線形回帰

In [None]:
# Linear regression on degree-2 polynomial features of MinMax-scaled inputs.
poly = PolynomialFeatures(2)
scaler = MinMaxScaler()

reg = LinearRegression()
rmses = []
print('\tRMSE:', end=' ')
for train_index, valid_index in kf.split(X_train):
  # Scale, then expand; both transformers are fitted on the fold's training
  # part only. (The scaled features are no longer overwritten by an unscaled
  # expansion, so the scaler actually takes effect.)
  X_train_transformed = poly.fit_transform(scaler.fit_transform(X_train.values[train_index]))
  X_valid_transformed = poly.transform(scaler.transform(X_train.values[valid_index]))
  reg.fit(X_train_transformed, y_train.values[train_index])
  y_valid_pred = reg.predict(X_valid_transformed)
  # Cap predictions at the largest log-target of this fold's training part.
  y_max = y_train.values[train_index].max()
  y_valid_pred[y_valid_pred > y_max] = y_max
  # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
  # which was removed from mean_squared_error in scikit-learn 1.6.
  rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
  rmses.append(rmse)
  print(f'{rmse:.1f}', end=', ')
print()
rmses = np.array(rmses)
print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Linear regression on degree-2 polynomial features of standardized inputs.
poly = PolynomialFeatures(2)
scaler = StandardScaler()

reg = LinearRegression()
rmses = []
print('\tRMSE:', end=' ')
for train_index, valid_index in kf.split(X_train):
  # Scale, then expand; both transformers are fitted on the fold's training
  # part only. (The scaled features are no longer overwritten by an unscaled
  # expansion, so the scaler actually takes effect.)
  X_train_transformed = poly.fit_transform(scaler.fit_transform(X_train.values[train_index]))
  X_valid_transformed = poly.transform(scaler.transform(X_train.values[valid_index]))
  reg.fit(X_train_transformed, y_train.values[train_index])
  y_valid_pred = reg.predict(X_valid_transformed)
  # Cap predictions at the largest log-target of this fold's training part.
  y_max = y_train.values[train_index].max()
  y_valid_pred[y_valid_pred > y_max] = y_max
  # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
  # which was removed from mean_squared_error in scikit-learn 1.6.
  rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
  rmses.append(rmse)
  print(f'{rmse:.1f}', end=', ')
print()
rmses = np.array(rmses)
print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

* Ridge回帰

In [None]:
# Ridge on degree-2 polynomial features of MinMax-scaled inputs,
# for alpha = 1e-5 ... 1e2.
poly = PolynomialFeatures(2)
scaler = MinMaxScaler()

for alpha in 10.0 ** np.arange(-5, 3):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # Scale, then expand; both fitted on the fold's training part only.
    X_train_transformed = poly.fit_transform(scaler.fit_transform(X_train.values[train_index]))
    X_valid_transformed = poly.transform(scaler.transform(X_train.values[valid_index]))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Ridge on degree-2 polynomial features of standardized inputs,
# for alpha = 1e-5 ... 1e2.
poly = PolynomialFeatures(2)
scaler = StandardScaler()

for alpha in 10.0 ** np.arange(-5, 3):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # Scale, then expand; both fitted on the fold's training part only.
    X_train_transformed = poly.fit_transform(scaler.fit_transform(X_train.values[train_index]))
    X_valid_transformed = poly.transform(scaler.transform(X_train.values[valid_index]))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

* Lasso

In [None]:
# Lasso on degree-2 polynomial features of MinMax-scaled inputs,
# for alpha = 1e-5 ... 1e2.
poly = PolynomialFeatures(2)
scaler = MinMaxScaler()

for alpha in 10.0 ** np.arange(-5, 3):
  reg = Lasso(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # Scale, then expand; both fitted on the fold's training part only.
    X_train_transformed = poly.fit_transform(scaler.fit_transform(X_train.values[train_index]))
    X_valid_transformed = poly.transform(scaler.transform(X_train.values[valid_index]))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Lasso on degree-2 polynomial features of standardized inputs,
# for alpha = 1e-5 ... 1e2.
poly = PolynomialFeatures(2)
scaler = StandardScaler()

for alpha in 10.0 ** np.arange(-5, 3):
  reg = Lasso(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # Scale, then expand; both fitted on the fold's training part only.
    X_train_transformed = poly.fit_transform(scaler.fit_transform(X_train.values[train_index]))
    X_valid_transformed = poly.transform(scaler.transform(X_train.values[valid_index]))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

### 5-3) 3次までの項を追加する

* 正則化なしの線形回帰

In [None]:
# Linear regression on degree-3 polynomial features of MinMax-scaled inputs.
poly = PolynomialFeatures(3)
scaler = MinMaxScaler()

reg = LinearRegression()
rmses = []
print('\tRMSE:', end=' ')
for train_index, valid_index in kf.split(X_train):
  # Scale, then expand; both fitted on the fold's training part only.
  X_train_transformed = poly.fit_transform(scaler.fit_transform(X_train.values[train_index]))
  X_valid_transformed = poly.transform(scaler.transform(X_train.values[valid_index]))
  reg.fit(X_train_transformed, y_train.values[train_index])
  y_valid_pred = reg.predict(X_valid_transformed)
  # Cap predictions at the largest log-target of this fold's training part.
  y_max = y_train.values[train_index].max()
  y_valid_pred[y_valid_pred > y_max] = y_max
  # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
  # which was removed from mean_squared_error in scikit-learn 1.6.
  rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
  rmses.append(rmse)
  print(f'{rmse:.1f}', end=', ')
print()
rmses = np.array(rmses)
print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Linear regression on degree-3 polynomial features of standardized inputs.
poly = PolynomialFeatures(3)
scaler = StandardScaler()

reg = LinearRegression()
rmses = []
print('\tRMSE:', end=' ')
for train_index, valid_index in kf.split(X_train):
  # Scale, then expand; both fitted on the fold's training part only.
  X_train_transformed = poly.fit_transform(scaler.fit_transform(X_train.values[train_index]))
  X_valid_transformed = poly.transform(scaler.transform(X_train.values[valid_index]))
  reg.fit(X_train_transformed, y_train.values[train_index])
  y_valid_pred = reg.predict(X_valid_transformed)
  # Cap predictions at the largest log-target of this fold's training part.
  y_max = y_train.values[train_index].max()
  y_valid_pred[y_valid_pred > y_max] = y_max
  # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
  # which was removed from mean_squared_error in scikit-learn 1.6.
  rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
  rmses.append(rmse)
  print(f'{rmse:.1f}', end=', ')
print()
rmses = np.array(rmses)
print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

* Ridge回帰

In [None]:
# Ridge on MinMax-scaled degree-3 polynomial features, for alpha = 1e-5 ... 1e2.
poly = PolynomialFeatures(3)
scaler = MinMaxScaler()

for alpha in 10.0 ** np.arange(-5, 3):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # Note the order in this cell: expand first, then scale the expanded
    # features; both fitted on the fold's training part only.
    X_train_transformed = scaler.fit_transform(poly.fit_transform(X_train.values[train_index]))
    X_valid_transformed = scaler.transform(poly.transform(X_train.values[valid_index]))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Ridge on standardized degree-3 polynomial features, for alpha = 1e-5 ... 1e2.
poly = PolynomialFeatures(3)
scaler = StandardScaler()

for alpha in 10.0 ** np.arange(-5, 3):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # Note the order in this cell: expand first, then scale the expanded
    # features; both fitted on the fold's training part only.
    X_train_transformed = scaler.fit_transform(poly.fit_transform(X_train.values[train_index]))
    X_valid_transformed = scaler.transform(poly.transform(X_train.values[valid_index]))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

### 5-4) 4次までの項を追加する

In [None]:
# Linear regression on degree-4 polynomial features of standardized inputs.
poly = PolynomialFeatures(4)
scaler = StandardScaler()

reg = LinearRegression()
rmses = []
print('\tRMSE:', end=' ')
for train_index, valid_index in kf.split(X_train):
  # Scale, then expand; both fitted on the fold's training part only.
  X_train_transformed = poly.fit_transform(scaler.fit_transform(X_train.values[train_index]))
  X_valid_transformed = poly.transform(scaler.transform(X_train.values[valid_index]))
  reg.fit(X_train_transformed, y_train.values[train_index])
  y_valid_pred = reg.predict(X_valid_transformed)
  # Cap predictions at the largest log-target of this fold's training part.
  y_max = y_train.values[train_index].max()
  y_valid_pred[y_valid_pred > y_max] = y_max
  # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
  # which was removed from mean_squared_error in scikit-learn 1.6.
  rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
  rmses.append(rmse)
  print(f'{rmse:.1f}', end=', ')
print()
rmses = np.array(rmses)
print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Linear regression on degree-4 polynomial features of MinMax-scaled inputs.
poly = PolynomialFeatures(4)
scaler = MinMaxScaler()

reg = LinearRegression()
rmses = []
print('\tRMSE:', end=' ')
for train_index, valid_index in kf.split(X_train):
  # Scale, then expand; both fitted on the fold's training part only.
  X_train_transformed = poly.fit_transform(scaler.fit_transform(X_train.values[train_index]))
  X_valid_transformed = poly.transform(scaler.transform(X_train.values[valid_index]))
  reg.fit(X_train_transformed, y_train.values[train_index])
  y_valid_pred = reg.predict(X_valid_transformed)
  # Cap predictions at the largest log-target of this fold's training part.
  y_max = y_train.values[train_index].max()
  y_valid_pred[y_valid_pred > y_max] = y_max
  # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
  # which was removed from mean_squared_error in scikit-learn 1.6.
  rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
  rmses.append(rmse)
  print(f'{rmse:.1f}', end=', ')
print()
rmses = np.array(rmses)
print(f'mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Ridge on degree-4 polynomial features of MinMax-scaled inputs,
# for alpha = 1e-5 ... 1e2.
poly = PolynomialFeatures(4)
scaler = MinMaxScaler()

for alpha in 10.0 ** np.arange(-5, 3):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # Scale, then expand; both fitted on the fold's training part only.
    X_train_transformed = poly.fit_transform(scaler.fit_transform(X_train.values[train_index]))
    X_valid_transformed = poly.transform(scaler.transform(X_train.values[valid_index]))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Ridge on degree-4 polynomial features of standardized inputs,
# for alpha = 1e-5 ... 1e2.
poly = PolynomialFeatures(4)
scaler = StandardScaler()

for alpha in 10.0 ** np.arange(-5, 3):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # Scale, then expand; both fitted on the fold's training part only.
    X_train_transformed = poly.fit_transform(scaler.fit_transform(X_train.values[train_index]))
    X_valid_transformed = poly.transform(scaler.transform(X_train.values[valid_index]))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Ridge on a three-stage pipeline: standardize -> degree-4 polynomial
# features -> standardize again, for alpha = 1e-5 ... 1e2.
poly = PolynomialFeatures(4)
scaler = StandardScaler()
scaler2 = StandardScaler()

for alpha in 10.0 ** np.arange(-5, 3):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # All three transformers are fitted on the fold's training part only.
    X_train_transformed = scaler2.fit_transform(poly.fit_transform(scaler.fit_transform(X_train.values[train_index])))
    X_valid_transformed = scaler2.transform(poly.transform(scaler.transform(X_train.values[valid_index])))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Ridge on a three-stage pipeline: MinMax scaling -> degree-4 polynomial
# features -> MinMax scaling again, for alpha = 1e-5 ... 1e2.
poly = PolynomialFeatures(4)
scaler = MinMaxScaler()
scaler2 = MinMaxScaler()

for alpha in 10.0 ** np.arange(-5, 3):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # All three transformers are fitted on the fold's training part only.
    X_train_transformed = scaler2.fit_transform(poly.fit_transform(scaler.fit_transform(X_train.values[train_index])))
    X_valid_transformed = scaler2.transform(poly.transform(scaler.transform(X_train.values[valid_index])))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Same MinMax -> degree-4 polynomial -> MinMax pipeline, searched over the
# smaller alpha range 1e-9 ... 1e-6.
poly = PolynomialFeatures(4)
scaler = MinMaxScaler()
scaler2 = MinMaxScaler()

for alpha in 10.0 ** np.arange(-9, -5):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # All three transformers are fitted on the fold's training part only.
    X_train_transformed = scaler2.fit_transform(poly.fit_transform(scaler.fit_transform(X_train.values[train_index])))
    X_valid_transformed = scaler2.transform(poly.transform(scaler.transform(X_train.values[valid_index])))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Ridge on a three-stage pipeline: standardize -> degree-4 polynomial
# features -> MinMax scaling, for alpha = 1e-7 ... 1e0.
poly = PolynomialFeatures(4)
scaler = StandardScaler()
scaler2 = MinMaxScaler()

for alpha in 10.0 ** np.arange(-7, 1):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # All three transformers are fitted on the fold's training part only.
    X_train_transformed = scaler2.fit_transform(poly.fit_transform(scaler.fit_transform(X_train.values[train_index])))
    X_valid_transformed = scaler2.transform(poly.transform(scaler.transform(X_train.values[valid_index])))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

In [None]:
# Ridge on a three-stage pipeline: MinMax scaling -> degree-4 polynomial
# features -> standardize, for alpha = 1e-7 ... 1e0.
poly = PolynomialFeatures(4)
scaler = MinMaxScaler()
scaler2 = StandardScaler()

for alpha in 10.0 ** np.arange(-7, 1):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # All three transformers are fitted on the fold's training part only.
    X_train_transformed = scaler2.fit_transform(poly.fit_transform(scaler.fit_transform(X_train.values[train_index])))
    X_valid_transformed = scaler2.transform(poly.transform(scaler.transform(X_train.values[valid_index])))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

### 5-5) 乱数のシードを変える

* `random_state`の値を変更した10-fold交差検証をおこなって、似たような性能が出せるか、確認する。

In [None]:
# Re-create the 10-fold splitter with a different seed, so the folds differ
# from those used in the earlier experiments.
kf = KFold(n_splits=10, shuffle=True, random_state=23456)

# MinMax scaling -> degree-4 polynomial features -> MinMax scaling.
poly = PolynomialFeatures(4)
scaler = MinMaxScaler()
scaler2 = MinMaxScaler()

# Ridge over alpha = 1e-8 ... 1e-1 on the new folds.
for alpha in 10.0 ** np.arange(-8, 0):
  reg = Ridge(alpha=alpha, random_state=42)
  rmses = []
  print('\tRMSE:', end=' ')
  for train_index, valid_index in kf.split(X_train):
    # All three transformers are fitted on the fold's training part only.
    X_train_transformed = scaler2.fit_transform(poly.fit_transform(scaler.fit_transform(X_train.values[train_index])))
    X_valid_transformed = scaler2.transform(poly.transform(scaler.transform(X_train.values[valid_index])))
    reg.fit(X_train_transformed, y_train.values[train_index])
    y_valid_pred = reg.predict(X_valid_transformed)
    # Cap predictions at the largest log-target of this fold's training part.
    y_max = y_train.values[train_index].max()
    y_valid_pred[y_valid_pred > y_max] = y_max
    # RMSE on the original scale; np.sqrt(MSE) replaces `squared=False`,
    # which was removed from mean_squared_error in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(np.exp(y_train.values[valid_index]), np.exp(y_valid_pred)))
    rmses.append(rmse)
    print(f'{rmse:.3f}', end=', ')
  print()
  rmses = np.array(rmses)
  print(f'alpha={alpha:.1e} | mean RMSE: {rmses.mean():.1f} ({rmses.std():.1f})')

* `alpha=1.0e-05`で良さそう。

## 6) チューニング済みの手法をテストデータ上で評価

In [None]:
# Final model: MinMax scaling -> degree-4 polynomial features -> MinMax
# scaling, fitted on X_train (training and validation data merged earlier)
# and evaluated on the imputed test set.
poly = PolynomialFeatures(4)
scaler = MinMaxScaler()
scaler2 = MinMaxScaler()

# Transformers are fitted on the training data only and then applied to the test data.
X_train_transformed = scaler2.fit_transform(poly.fit_transform(scaler.fit_transform(X_train)))
X_test_transformed = scaler2.transform(poly.transform(scaler.transform(X_test)))

reg = Ridge(alpha=1.0e-5, random_state=42)
reg.fit(X_train_transformed, y_train)
y_test_pred = reg.predict(X_test_transformed)
# Cap predictions at the largest log-target seen in training.
y_max = y_train.max()
y_test_pred[y_test_pred > y_max] = y_max
# y_test is on the log scale, so both sides are exponentiated. np.sqrt(MSE)
# replaces `squared=False`, which was removed from mean_squared_error in
# scikit-learn 1.6.
rmse = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_test_pred)))
print(f'test RMSE: {rmse:.1f}')

* 欠測箇所を含むインスタンスを脱落させて作ったテストセットで評価する。

In [None]:
# Same final model, evaluated on the test rows that had no missing values.
poly = PolynomialFeatures(4)
scaler = MinMaxScaler()
scaler2 = MinMaxScaler()

# Transformers are fitted on the training data only and then applied to the test data.
X_train_transformed = scaler2.fit_transform(poly.fit_transform(scaler.fit_transform(X_train)))
X_test_transformed = scaler2.transform(poly.transform(scaler.transform(X_test_original)))

reg = Ridge(alpha=1.0e-5, random_state=42)
reg.fit(X_train_transformed, y_train)
y_test_pred = reg.predict(X_test_transformed)
# Cap predictions at the largest log-target seen in training.
y_max = y_train.max()
y_test_pred[y_test_pred > y_max] = y_max
# y_test_original was taken before the log transform, so only the predictions
# are exponentiated. np.sqrt(MSE) replaces `squared=False`, which was removed
# from mean_squared_error in scikit-learn 1.6.
rmse = np.sqrt(mean_squared_error(y_test_original, np.exp(y_test_pred)))
print(f'test RMSE: {rmse:.1f}')

* 何の工夫もない線形回帰だとテスト性能がどうなるか、確認しておく。

In [None]:
# Baseline for comparison: plain linear regression on the raw features,
# evaluated on the imputed test set.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_test_pred = reg.predict(X_test)
# Cap predictions at the largest log-target seen in training.
y_max = y_train.max()
y_test_pred[y_test_pred > y_max] = y_max
# y_test is on the log scale, so both sides are exponentiated. np.sqrt(MSE)
# replaces `squared=False`, which was removed from mean_squared_error in
# scikit-learn 1.6.
rmse = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_test_pred)))
print(f'test RMSE: {rmse:.1f}')

* 欠測箇所を含むインスタンスを脱落させて作ったテストセットで評価する。

In [None]:
# Baseline linear regression, evaluated on the test rows that had no missing values.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_test_pred = reg.predict(X_test_original)
# Cap predictions at the largest log-target seen in training.
y_max = y_train.max()
y_test_pred[y_test_pred > y_max] = y_max
# y_test_original was taken before the log transform, so only the predictions
# are exponentiated. np.sqrt(MSE) replaces `squared=False`, which was removed
# from mean_squared_error in scikit-learn 1.6.
rmse = np.sqrt(mean_squared_error(y_test_original, np.exp(y_test_pred)))
print(f'test RMSE: {rmse:.1f}')