<a href="https://colab.research.google.com/github/tomonari-masada/course2022-sml/blob/main/08_linear_regression_3_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2022/06/04の課題
* solubilityデータセットの、上で作った検証データに対して、できるだけ予測性能の良いモデルを見つけよう
  * Ridge回帰やLassoを使ってもいいです。
  * 特徴量はどのように加工してもいいです。（上では2値変数にPCAを使った）
* 検証データを使って見つけた最も良いモデルを、最後に一回、テストデータで評価してみよう

In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

%config InlineBackend.figure_format = 'retina'

In [2]:
# Directory holding the solubility CSV files
# (presumably a mounted Google Drive folder in Colab -- confirm before running).
PATH = '/content/drive/MyDrive/data/'

# Descriptor tables for the training file and the held-out test file.
X, X_test = (pd.read_csv(PATH + name) for name in ('solTrainX.csv', 'solTestX.csv'))

# Matching targets: only the column named 'x' of each Y file is used.
y, y_test = (pd.read_csv(PATH + name)['x'] for name in ('solTrainY.csv', 'solTestY.csv'))

In [3]:
# Columns whose name starts with one of these three-letter prefixes are
# treated as the continuous descriptors.
continuous = list(
    filter(lambda name: name.startswith(('Num', 'Hyd', 'Mol', 'Sur')), X.columns)
)
print(len(continuous), 'continuous features')
print(continuous)

20 continuous features
['MolWeight', 'NumAtoms', 'NumNonHAtoms', 'NumBonds', 'NumNonHBonds', 'NumMultBonds', 'NumRotBonds', 'NumDblBonds', 'NumAromaticBonds', 'NumHydrogen', 'NumCarbon', 'NumNitrogen', 'NumOxygen', 'NumSulfer', 'NumChlorine', 'NumHalogen', 'NumRings', 'HydrophilicFactor', 'SurfaceArea1', 'SurfaceArea2']


In [4]:
# Collect the names of the 0/1-valued variables: the columns prefixed with 'FP'.
binary = X.columns[[name.startswith('FP') for name in X.columns]]
print(len(binary), 'binary features')

208 binary features


* 訓練データと検証データを分けておく

In [5]:
# Hold out 20% of the rows as validation data; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 2値変数のインタラクションを考慮してみる

* PolynomialFeaturesを2次の設定で使う
* その上で主成分分析を適用


## Ridge回帰の場合

In [6]:
# Grid search on the single train/validation split:
# number of PCA components for the expanded binary features x Ridge alpha.
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
scaler = StandardScaler()

# The pairwise-interaction expansion depends on neither n_components nor alpha,
# so it is computed once here instead of once per PCA setting.
X_train_poly = poly.fit_transform(X_train[binary])
X_valid_poly = poly.transform(X_valid[binary])

for n_components in [150, 200, 250, 300, 350, 400]:

  pca = PCA(n_components=n_components, random_state=123)

  # PCA and the scaler are fit on the training rows only and then applied
  # unchanged to the validation rows.
  X_train_binary = pca.fit_transform(X_train_poly)
  X_train_embedded = scaler.fit_transform(np.concatenate([X_train_binary, X_train[continuous]], 1))

  X_valid_binary = pca.transform(X_valid_poly)
  X_valid_embedded = scaler.transform(np.concatenate([X_valid_binary, X_valid[continuous]], 1))

  for alpha in 10.0 ** np.arange(-3, 3):
    reg = Ridge(alpha=alpha, random_state=42)
    reg.fit(X_train_embedded, y_train)
    y_valid_pred = reg.predict(X_valid_embedded)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and absent from newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
    print(f'{n_components} components | alpha {alpha:.1e} | RMSE {rmse:.4f}')

  print('-'*40)

150 components | alpha 1.0e-03 | RMSE 0.5923
150 components | alpha 1.0e-02 | RMSE 0.5920
150 components | alpha 1.0e-01 | RMSE 0.5908
150 components | alpha 1.0e+00 | RMSE 0.5911
150 components | alpha 1.0e+01 | RMSE 0.6112
150 components | alpha 1.0e+02 | RMSE 0.6718
----------------------------------------
200 components | alpha 1.0e-03 | RMSE 0.5749
200 components | alpha 1.0e-02 | RMSE 0.5741
200 components | alpha 1.0e-01 | RMSE 0.5720
200 components | alpha 1.0e+00 | RMSE 0.5703
200 components | alpha 1.0e+01 | RMSE 0.5859
200 components | alpha 1.0e+02 | RMSE 0.6491
----------------------------------------
250 components | alpha 1.0e-03 | RMSE 0.5699
250 components | alpha 1.0e-02 | RMSE 0.5679
250 components | alpha 1.0e-01 | RMSE 0.5652
250 components | alpha 1.0e+00 | RMSE 0.5668
250 components | alpha 1.0e+01 | RMSE 0.5830
250 components | alpha 1.0e+02 | RMSE 0.6416
----------------------------------------
300 components | alpha 1.0e-03 | RMSE 0.5749
300 components | alpha

## Lassoの場合
* まだ収束していないというwarningが出ないように、max_iterを大きな値にしておく。

In [7]:
# Grid search on the single train/validation split:
# number of PCA components for the expanded binary features x Lasso alpha.
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
scaler = StandardScaler()

# The pairwise-interaction expansion depends on neither n_components nor alpha,
# so it is computed once here instead of once per PCA setting.
X_train_poly = poly.fit_transform(X_train[binary])
X_valid_poly = poly.transform(X_valid[binary])

for n_components in [150, 200, 250, 300, 350, 400]:

  pca = PCA(n_components=n_components, random_state=123)

  # PCA and the scaler are fit on the training rows only and then applied
  # unchanged to the validation rows.
  X_train_binary = pca.fit_transform(X_train_poly)
  X_train_embedded = scaler.fit_transform(np.concatenate([X_train_binary, X_train[continuous]], 1))

  X_valid_binary = pca.transform(X_valid_poly)
  X_valid_embedded = scaler.transform(np.concatenate([X_valid_binary, X_valid[continuous]], 1))

  for alpha in 10.0 ** np.arange(-4, 0):
    # A large max_iter keeps the solver from stopping before it has converged
    # (which would otherwise raise a convergence warning).
    reg = Lasso(alpha=alpha, random_state=42, max_iter=50000)
    reg.fit(X_train_embedded, y_train)
    y_valid_pred = reg.predict(X_valid_embedded)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and absent from newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
    print(f'{n_components} components | alpha {alpha:.1e} | RMSE {rmse:.4f}')

  print('-'*40)

150 components | alpha 1.0e-04 | RMSE 0.5911
150 components | alpha 1.0e-03 | RMSE 0.5877
150 components | alpha 1.0e-02 | RMSE 0.6044
150 components | alpha 1.0e-01 | RMSE 0.8761
----------------------------------------
200 components | alpha 1.0e-04 | RMSE 0.5727
200 components | alpha 1.0e-03 | RMSE 0.5651
200 components | alpha 1.0e-02 | RMSE 0.5858
200 components | alpha 1.0e-01 | RMSE 0.8772
----------------------------------------
250 components | alpha 1.0e-04 | RMSE 0.5653
250 components | alpha 1.0e-03 | RMSE 0.5642
250 components | alpha 1.0e-02 | RMSE 0.5778
250 components | alpha 1.0e-01 | RMSE 0.8774
----------------------------------------
300 components | alpha 1.0e-04 | RMSE 0.5607
300 components | alpha 1.0e-03 | RMSE 0.5551
300 components | alpha 1.0e-02 | RMSE 0.5660
300 components | alpha 1.0e-01 | RMSE 0.8773
----------------------------------------
350 components | alpha 1.0e-04 | RMSE 0.5875
350 components | alpha 1.0e-03 | RMSE 0.5704
350 components | alpha 1.0

* Ridge回帰でも、Lassoでも、コンポーネント数300あたりが良さそう。

## 交差検証をおこなう

* 訓練データと検証データの切り分け方が変わると、予測性能がどの程度変動するか、確認する。

* テストセット以外を読み直す。

In [8]:
# Re-read the non-test data (descriptors and the 'x' target column) so that
# cross-validation starts again from the complete training file.
X, y = (
    pd.read_csv(PATH + 'solTrainX.csv'),
    pd.read_csv(PATH + 'solTrainY.csv')['x'],
)

In [9]:
# 5-fold splitter shared by the cross-validation cells. Shuffling with a fixed
# integer seed means every call to kf.split() yields the same folds.
# (KFold is imported in the import cell at the top of the notebook.)
kf = KFold(n_splits=5, shuffle=True, random_state=123)

In [10]:
# 5-fold cross-validation of Ridge over alpha with the PCA size fixed at 300.
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
scaler = StandardScaler()

# The interaction expansion works on each row independently, so expand all
# rows once and pick out each fold's rows by position afterwards.
X_poly = poly.fit_transform(X[binary])

for n_components in [300]:
  pca = PCA(n_components=n_components, random_state=123)

  # PCA and scaling depend on the fold but not on alpha, so each fold's
  # matrices are built once and reused for every alpha below. Fold-local names
  # are used so the X_train / X_valid split made earlier is not overwritten.
  fold_data = []
  for train_index, valid_index in kf.split(X):
    # Fit PCA and the scaler on the fold's training rows only; the validation
    # rows are transformed with those fitted objects.
    train_binary = pca.fit_transform(X_poly[train_index])
    train_embedded = scaler.fit_transform(
        np.concatenate([train_binary, X.iloc[train_index][continuous]], 1))
    valid_binary = pca.transform(X_poly[valid_index])
    valid_embedded = scaler.transform(
        np.concatenate([valid_binary, X.iloc[valid_index][continuous]], 1))
    # kf.split() yields positions, so select the targets with .iloc as well.
    fold_data.append((train_embedded, y.iloc[train_index],
                      valid_embedded, y.iloc[valid_index]))

  for alpha in 10.0 ** np.arange(-3, 3):
    reg = Ridge(alpha=alpha, random_state=42)

    rmses = []
    print('\tRMSE:', end=' ')
    for train_embedded, train_target, valid_embedded, valid_target in fold_data:
      reg.fit(train_embedded, train_target)
      valid_pred = reg.predict(valid_embedded)
      # Take the square root explicitly: the `squared=False` argument of
      # mean_squared_error is deprecated and absent from newer scikit-learn releases.
      rmse = np.sqrt(mean_squared_error(valid_target, valid_pred))
      rmses.append(rmse)
      print(f'{rmse:.4f}', end=', ')

    print()
    rmses = np.array(rmses)
    print(f'{n_components} components | alpha {alpha:.1e} | mean RMSE {rmses.mean():.4f} ({rmses.std():.4f})')
  print('-'*40)

	RMSE: 0.5797, 0.6249, 0.6082, 0.7003, 0.6256, 
300 components | alpha 1.0e-03 | mean RMSE 0.6278 (0.0399)
	RMSE: 0.5797, 0.6280, 0.6013, 0.7009, 0.6255, 
300 components | alpha 1.0e-02 | mean RMSE 0.6271 (0.0409)
	RMSE: 0.5812, 0.6301, 0.5898, 0.7025, 0.6241, 
300 components | alpha 1.0e-01 | mean RMSE 0.6255 (0.0429)
	RMSE: 0.5873, 0.6301, 0.5842, 0.7040, 0.6228, 
300 components | alpha 1.0e+00 | mean RMSE 0.6257 (0.0433)
	RMSE: 0.5842, 0.6309, 0.5857, 0.6766, 0.6104, 
300 components | alpha 1.0e+01 | mean RMSE 0.6176 (0.0342)
	RMSE: 0.6194, 0.6838, 0.6022, 0.6908, 0.6162, 
300 components | alpha 1.0e+02 | mean RMSE 0.6425 (0.0371)
----------------------------------------


In [11]:
# 5-fold cross-validation of Lasso over alpha with the PCA size fixed at 300.
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
scaler = StandardScaler()

# The interaction expansion works on each row independently, so expand all
# rows once and pick out each fold's rows by position afterwards.
X_poly = poly.fit_transform(X[binary])

for n_components in [300]:
  pca = PCA(n_components=n_components, random_state=123)

  # PCA and scaling depend on the fold but not on alpha, so each fold's
  # matrices are built once and reused for every alpha below. Fold-local names
  # are used so the X_train / X_valid split made earlier is not overwritten.
  fold_data = []
  for train_index, valid_index in kf.split(X):
    # Fit PCA and the scaler on the fold's training rows only; the validation
    # rows are transformed with those fitted objects.
    train_binary = pca.fit_transform(X_poly[train_index])
    train_embedded = scaler.fit_transform(
        np.concatenate([train_binary, X.iloc[train_index][continuous]], 1))
    valid_binary = pca.transform(X_poly[valid_index])
    valid_embedded = scaler.transform(
        np.concatenate([valid_binary, X.iloc[valid_index][continuous]], 1))
    # kf.split() yields positions, so select the targets with .iloc as well.
    fold_data.append((train_embedded, y.iloc[train_index],
                      valid_embedded, y.iloc[valid_index]))

  for alpha in 10.0 ** np.arange(-4, 0):
    # A large max_iter keeps the solver from stopping before it has converged.
    reg = Lasso(alpha=alpha, random_state=42, max_iter=50000)

    rmses = []
    print('\tRMSE:', end=' ')
    for train_embedded, train_target, valid_embedded, valid_target in fold_data:
      reg.fit(train_embedded, train_target)
      valid_pred = reg.predict(valid_embedded)
      # Take the square root explicitly: the `squared=False` argument of
      # mean_squared_error is deprecated and absent from newer scikit-learn releases.
      rmse = np.sqrt(mean_squared_error(valid_target, valid_pred))
      rmses.append(rmse)
      print(f'{rmse:.4f}', end=', ')

    print()
    rmses = np.array(rmses)
    print(f'{n_components} components | alpha {alpha:.1e} | mean RMSE {rmses.mean():.4f} ({rmses.std():.4f})')
  print('-'*40)

	RMSE: 0.5783, 0.6279, 0.5915, 0.7033, 0.6236, 
300 components | alpha 1.0e-04 | mean RMSE 0.6249 (0.0435)
	RMSE: 0.5807, 0.6331, 0.5851, 0.7128, 0.6151, 
300 components | alpha 1.0e-03 | mean RMSE 0.6254 (0.0478)
	RMSE: 0.5991, 0.6813, 0.5683, 0.6472, 0.6028, 
300 components | alpha 1.0e-02 | mean RMSE 0.6197 (0.0397)
	RMSE: 0.9214, 0.9583, 0.8276, 0.8898, 0.9403, 
300 components | alpha 1.0e-01 | mean RMSE 0.9075 (0.0459)
----------------------------------------


* Ridge回帰のほうを、もう少しチューニングしてみる。

In [12]:
# Finer 5-fold cross-validation of Ridge: PCA sizes around 300 and alphas around 10.
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
scaler = StandardScaler()

# The interaction expansion works on each row independently, so expand all
# rows once and pick out each fold's rows by position afterwards.
X_poly = poly.fit_transform(X[binary])

for n_components in [250, 300, 350]:
  pca = PCA(n_components=n_components, random_state=123)

  # PCA and scaling depend on the fold and on n_components but not on alpha,
  # so each fold's matrices are built once per n_components and reused for
  # every alpha below. Fold-local names are used so the X_train / X_valid
  # split made earlier is not overwritten.
  fold_data = []
  for train_index, valid_index in kf.split(X):
    # Fit PCA and the scaler on the fold's training rows only; the validation
    # rows are transformed with those fitted objects.
    train_binary = pca.fit_transform(X_poly[train_index])
    train_embedded = scaler.fit_transform(
        np.concatenate([train_binary, X.iloc[train_index][continuous]], 1))
    valid_binary = pca.transform(X_poly[valid_index])
    valid_embedded = scaler.transform(
        np.concatenate([valid_binary, X.iloc[valid_index][continuous]], 1))
    # kf.split() yields positions, so select the targets with .iloc as well.
    fold_data.append((train_embedded, y.iloc[train_index],
                      valid_embedded, y.iloc[valid_index]))

  for alpha in [5.0, 10.0, 20.0]:
    reg = Ridge(alpha=alpha, random_state=42)

    rmses = []
    print('\tRMSE:', end=' ')
    for train_embedded, train_target, valid_embedded, valid_target in fold_data:
      reg.fit(train_embedded, train_target)
      valid_pred = reg.predict(valid_embedded)
      # Take the square root explicitly: the `squared=False` argument of
      # mean_squared_error is deprecated and absent from newer scikit-learn releases.
      rmse = np.sqrt(mean_squared_error(valid_target, valid_pred))
      rmses.append(rmse)
      print(f'{rmse:.4f}', end=', ')

    print()
    rmses = np.array(rmses)
    print(f'{n_components} components | alpha {alpha:.1e} | mean RMSE {rmses.mean():.4f} ({rmses.std():.4f})')
  print('-'*40)

	RMSE: 0.5916, 0.6302, 0.5924, 0.6812, 0.6243, 
250 components | alpha 5.0e+00 | mean RMSE 0.6239 (0.0327)
	RMSE: 0.5929, 0.6312, 0.5920, 0.6726, 0.6217, 
250 components | alpha 1.0e+01 | mean RMSE 0.6221 (0.0297)
	RMSE: 0.5968, 0.6363, 0.5931, 0.6679, 0.6186, 
250 components | alpha 2.0e+01 | mean RMSE 0.6225 (0.0275)
----------------------------------------
	RMSE: 0.5856, 0.6291, 0.5851, 0.6867, 0.6162, 
300 components | alpha 5.0e+00 | mean RMSE 0.6205 (0.0373)
	RMSE: 0.5842, 0.6309, 0.5857, 0.6766, 0.6104, 
300 components | alpha 1.0e+01 | mean RMSE 0.6176 (0.0342)
	RMSE: 0.5867, 0.6384, 0.5881, 0.6716, 0.6053, 
300 components | alpha 2.0e+01 | mean RMSE 0.6180 (0.0326)
----------------------------------------
	RMSE: 0.5915, 0.6541, 0.5986, 0.6726, 0.6322, 
350 components | alpha 5.0e+00 | mean RMSE 0.6298 (0.0312)
	RMSE: 0.5869, 0.6572, 0.6006, 0.6623, 0.6252, 
350 components | alpha 1.0e+01 | mean RMSE 0.6264 (0.0299)
	RMSE: 0.5867, 0.6665, 0.6049, 0.6580, 0.6193, 
350 components

## 最終評価

In [13]:
# Final evaluation: refit the whole pipeline on all non-test rows with the
# hyperparameters picked from the cross-validation runs, then score the test
# set exactly once.
n_components = 300
alpha = 1.0e+01

pca = PCA(n_components=n_components, random_state=123)
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
scaler = StandardScaler()

# Fit every preprocessing step on the training file only.
X_binary = pca.fit_transform(poly.fit_transform(X[binary]))
X_embedded = scaler.fit_transform(np.concatenate([X_binary, X[continuous]], 1))

reg = Ridge(alpha=alpha, random_state=42)
reg.fit(X_embedded, y)

# The test rows go through the already-fitted transformers (transform, never fit).
X_test_binary = pca.transform(poly.transform(X_test[binary]))
X_test_embedded = scaler.transform(np.concatenate([X_test_binary, X_test[continuous]], 1))
y_test_pred = reg.predict(X_test_embedded)

# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f'test RMSE {test_rmse:.4f}')

test RMSE 0.6788
