<a href="https://colab.research.google.com/github/tomonari-masada/course2021-sml/blob/main/08_linear_regression_3_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ケーススタディ： solubility data の実行例

Max Kuhn and Kjell Johnson. Applied Predictive Modeling. Springer, 2013. に出てくるデータセット (Section 6.1)

http://appliedpredictivemodeling.com/data

* 説明変数は下記の228個
 * Two hundred and eight binary “fingerprints” that indicate the presence or absence of a particular chemical substructure.
 * Sixteen count descriptors, such as the number of bonds or the number of bromine atoms.
 * Four continuous descriptors, such as molecular weight or surface area.

* 目的変数はlog solubility
 * 範囲は−11.6から1.6、平均は−2.7

In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

%config InlineBackend.figure_format = 'retina'

In [2]:
# Directory that holds the four solubility CSV files.
# NOTE(review): this path presumably requires Google Drive to be mounted in Colab
# before this cell runs -- no mount step is visible in the notebook; confirm.
PATH = '/content/drive/MyDrive/data/'

# Training split: the feature table, and the target taken from column 'x'.
X, y = (
    pd.read_csv(PATH + 'solTrainX.csv'),
    pd.read_csv(PATH + 'solTrainY.csv')['x'],
)

# Test split, read the same way; it is only used for the final evaluation.
X_test, y_test = (
    pd.read_csv(PATH + 'solTestX.csv'),
    pd.read_csv(PATH + 'solTestY.csv')['x'],
)

In [3]:
# Collect the names of the variables that take values other than 0/1.
# They are recognised by the first three characters of the column name.
non_binary_prefixes = ('Num', 'Hyd', 'Mol', 'Sur')
continuous = [name for name in X.columns if name.startswith(non_binary_prefixes)]

print(len(continuous), 'continuous features')
print(continuous)

20 continuous features
['MolWeight', 'NumAtoms', 'NumNonHAtoms', 'NumBonds', 'NumNonHBonds', 'NumMultBonds', 'NumRotBonds', 'NumDblBonds', 'NumAromaticBonds', 'NumHydrogen', 'NumCarbon', 'NumNitrogen', 'NumOxygen', 'NumSulfer', 'NumChlorine', 'NumHalogen', 'NumRings', 'HydrophilicFactor', 'SurfaceArea1', 'SurfaceArea2']


In [4]:
# Collect the names of the 0/1-valued variables: the columns whose name starts with 'FP'.
binary = [name for name in X.columns if name.startswith('FP')]
print(len(binary), 'binary features')

208 binary features


* 訓練データと検証データを分けておく

In [5]:
# Hold out 20% of the training rows as a validation set; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

* 元のままのデータで最小二乗法を検証データ上で評価

In [6]:
# Baseline: ordinary least squares on the untransformed features, scored on the validation split.
# Columns are rearranged so that the binary features come first, followed by the non-binary ones.
X_train = pd.concat([X_train[binary], X_train[continuous]], axis=1)
X_valid = pd.concat([X_valid[binary], X_valid[continuous]], axis=1)

reg = LinearRegression()
reg.fit(X_train, y_train)
y_valid_pred = reg.predict(X_valid)

# Take the square root explicitly: the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, so it fails on current versions.
rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
print(f'RMSE: {rmse:.4f}')

RMSE: 0.6571


* このRMSEを改善できるかどうか、いろいろ試行錯誤する。

---

## ２値変数のインタラクションを考慮してみる

* ２値変数に対してPolynomialFeaturesを2次の設定で使う（`interaction_only=True`とし、2乗の項は作らず、異なる変数どうしの積だけを追加する）
* その上で主成分分析を適用


* ただし、主成分分析のコンポーネント数（次元削減後の次元数）は、Ridge回帰やLassoのalphaと同時にチューニングする
 * 次元数と正則化の強さを別々にではなく、組み合わせて同時に探索する点が重要。

## Ridge回帰の場合

In [7]:
# Grid search over the number of PCA components and the Ridge penalty `alpha`,
# scored by RMSE on the validation split.
# PCA, PolynomialFeatures and Ridge are imported in the notebook's import cell.

# The pairwise-interaction expansion of the binary features does not depend on
# n_components, so it is computed once here instead of once per PCA setting.
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
X_train_binary_poly = poly.fit_transform(X_train[binary])
X_valid_binary_poly = poly.transform(X_valid[binary])

for n_components in [150, 200, 250, 300, 350, 400]:

  pca = PCA(n_components=n_components, random_state=123)

  # PCA is fitted on the training rows only; the non-binary columns bypass it
  # and are appended unchanged after the embedded columns.
  X_train_binary_poly_embedded = pca.fit_transform(X_train_binary_poly)
  X_train_embedded = np.concatenate([X_train_binary_poly_embedded, X_train[continuous]], 1)

  X_valid_binary_poly_embedded = pca.transform(X_valid_binary_poly)
  X_valid_embedded = np.concatenate([X_valid_binary_poly_embedded, X_valid[continuous]], 1)

  for alpha in 10.0 ** np.arange(-3, 4):
    reg = Ridge(alpha=alpha)
    reg.fit(X_train_embedded, y_train)
    y_valid_pred = reg.predict(X_valid_embedded)
    # np.sqrt(MSE) instead of `squared=False`, which was removed in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
    print(f'{n_components} components | alpha {alpha:.2e} | RMSE: {rmse:.4f}')

150 components | alpha 1.00e-03 | RMSE: 0.5924
150 components | alpha 1.00e-02 | RMSE: 0.5922
150 components | alpha 1.00e-01 | RMSE: 0.5910
150 components | alpha 1.00e+00 | RMSE: 0.5872
150 components | alpha 1.00e+01 | RMSE: 0.5913
150 components | alpha 1.00e+02 | RMSE: 0.6139
150 components | alpha 1.00e+03 | RMSE: 0.6535
200 components | alpha 1.00e-03 | RMSE: 0.5750
200 components | alpha 1.00e-02 | RMSE: 0.5747
200 components | alpha 1.00e-01 | RMSE: 0.5726
200 components | alpha 1.00e+00 | RMSE: 0.5680
200 components | alpha 1.00e+01 | RMSE: 0.5712
200 components | alpha 1.00e+02 | RMSE: 0.5865
200 components | alpha 1.00e+03 | RMSE: 0.6372
250 components | alpha 1.00e-03 | RMSE: 0.5703
250 components | alpha 1.00e-02 | RMSE: 0.5697
250 components | alpha 1.00e-01 | RMSE: 0.5669
250 components | alpha 1.00e+00 | RMSE: 0.5643
250 components | alpha 1.00e+01 | RMSE: 0.5706
250 components | alpha 1.00e+02 | RMSE: 0.5809
250 components | alpha 1.00e+03 | RMSE: 0.6312
300 component

## Lassoの場合
* まだ収束していないというwarningができるだけ出ないように、max_iterを大きな値にしておく。

In [8]:
# Grid search over the number of PCA components and the Lasso penalty `alpha`,
# scored by RMSE on the validation split.
# PCA, PolynomialFeatures and Lasso are imported in the notebook's import cell.

# The pairwise-interaction expansion of the binary features does not depend on
# n_components, so it is computed once here instead of once per PCA setting.
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
X_train_binary_poly = poly.fit_transform(X_train[binary])
X_valid_binary_poly = poly.transform(X_valid[binary])

for n_components in [150, 200, 250, 300, 350, 400]:

  pca = PCA(n_components=n_components, random_state=123)

  # PCA is fitted on the training rows only; the non-binary columns bypass it
  # and are appended unchanged after the embedded columns.
  X_train_binary_poly_embedded = pca.fit_transform(X_train_binary_poly)
  X_train_embedded = np.concatenate([X_train_binary_poly_embedded, X_train[continuous]], 1)

  X_valid_binary_poly_embedded = pca.transform(X_valid_binary_poly)
  X_valid_embedded = np.concatenate([X_valid_binary_poly_embedded, X_valid[continuous]], 1)

  for alpha in 10.0 ** np.arange(-4, 1):
    reg = Lasso(alpha=alpha, max_iter=100000)  # large max_iter to cut down on convergence warnings
    reg.fit(X_train_embedded, y_train)
    y_valid_pred = reg.predict(X_valid_embedded)
    # np.sqrt(MSE) instead of `squared=False`, which was removed in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
    print(f'{n_components} components | alpha {alpha:.2e} | RMSE: {rmse:.4f}')

150 components | alpha 1.00e-04 | RMSE: 0.5913
150 components | alpha 1.00e-03 | RMSE: 0.5865
150 components | alpha 1.00e-02 | RMSE: 0.6055
150 components | alpha 1.00e-01 | RMSE: 0.7277
150 components | alpha 1.00e+00 | RMSE: 0.9769
200 components | alpha 1.00e-04 | RMSE: 0.5730
200 components | alpha 1.00e-03 | RMSE: 0.5658
200 components | alpha 1.00e-02 | RMSE: 0.5802
200 components | alpha 1.00e-01 | RMSE: 0.7286
200 components | alpha 1.00e+00 | RMSE: 0.9769


  positive)


250 components | alpha 1.00e-04 | RMSE: 0.5666
250 components | alpha 1.00e-03 | RMSE: 0.5618
250 components | alpha 1.00e-02 | RMSE: 0.5692
250 components | alpha 1.00e-01 | RMSE: 0.7285
250 components | alpha 1.00e+00 | RMSE: 0.9769


  positive)


300 components | alpha 1.00e-04 | RMSE: 0.5677
300 components | alpha 1.00e-03 | RMSE: 0.5520
300 components | alpha 1.00e-02 | RMSE: 0.5602
300 components | alpha 1.00e-01 | RMSE: 0.7285
300 components | alpha 1.00e+00 | RMSE: 0.9769


  positive)


350 components | alpha 1.00e-04 | RMSE: 0.5952
350 components | alpha 1.00e-03 | RMSE: 0.5670
350 components | alpha 1.00e-02 | RMSE: 0.5508
350 components | alpha 1.00e-01 | RMSE: 0.7285
350 components | alpha 1.00e+00 | RMSE: 0.9769


  positive)


400 components | alpha 1.00e-04 | RMSE: 0.5777
400 components | alpha 1.00e-03 | RMSE: 0.5601
400 components | alpha 1.00e-02 | RMSE: 0.5482
400 components | alpha 1.00e-01 | RMSE: 0.7285
400 components | alpha 1.00e+00 | RMSE: 0.9769


## Lassoをさらにチューニング

In [9]:
# Second Lasso search: larger PCA sizes and a finer `alpha` grid,
# scored by RMSE on the validation split.
# PCA, PolynomialFeatures and Lasso are imported in the notebook's import cell.

# The pairwise-interaction expansion of the binary features does not depend on
# n_components, so it is computed once here instead of once per PCA setting.
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
X_train_binary_poly = poly.fit_transform(X_train[binary])
X_valid_binary_poly = poly.transform(X_valid[binary])

for n_components in [350, 400, 450, 500]:

  pca = PCA(n_components=n_components, random_state=123)

  # PCA is fitted on the training rows only; the non-binary columns bypass it
  # and are appended unchanged after the embedded columns.
  X_train_binary_poly_embedded = pca.fit_transform(X_train_binary_poly)
  X_train_embedded = np.concatenate([X_train_binary_poly_embedded, X_train[continuous]], 1)

  X_valid_binary_poly_embedded = pca.transform(X_valid_binary_poly)
  X_valid_embedded = np.concatenate([X_valid_binary_poly_embedded, X_valid[continuous]], 1)

  for alpha in [0.002, 0.005, 0.01, 0.02, 0.05]:
    reg = Lasso(alpha=alpha, max_iter=100000)
    reg.fit(X_train_embedded, y_train)
    y_valid_pred = reg.predict(X_valid_embedded)
    # np.sqrt(MSE) instead of `squared=False`, which was removed in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
    print(f'{n_components} components | alpha {alpha:.2e} | RMSE: {rmse:.4f}')

350 components | alpha 2.00e-03 | RMSE: 0.5614
350 components | alpha 5.00e-03 | RMSE: 0.5550
350 components | alpha 1.00e-02 | RMSE: 0.5508
350 components | alpha 2.00e-02 | RMSE: 0.5807
350 components | alpha 5.00e-02 | RMSE: 0.6605
400 components | alpha 2.00e-03 | RMSE: 0.5549
400 components | alpha 5.00e-03 | RMSE: 0.5469
400 components | alpha 1.00e-02 | RMSE: 0.5482
400 components | alpha 2.00e-02 | RMSE: 0.5798
400 components | alpha 5.00e-02 | RMSE: 0.6605
450 components | alpha 2.00e-03 | RMSE: 0.5705
450 components | alpha 5.00e-03 | RMSE: 0.5580
450 components | alpha 1.00e-02 | RMSE: 0.5529
450 components | alpha 2.00e-02 | RMSE: 0.5795
450 components | alpha 5.00e-02 | RMSE: 0.6605
500 components | alpha 2.00e-03 | RMSE: 0.5651
500 components | alpha 5.00e-03 | RMSE: 0.5473
500 components | alpha 1.00e-02 | RMSE: 0.5490
500 components | alpha 2.00e-02 | RMSE: 0.5797
500 components | alpha 5.00e-02 | RMSE: 0.6605


In [10]:
# Third Lasso search: PCA sizes around 400 and `alpha` between 0.003 and 0.009,
# scored by RMSE on the validation split.
# PCA, PolynomialFeatures and Lasso are imported in the notebook's import cell.

# The pairwise-interaction expansion of the binary features does not depend on
# n_components, so it is computed once here instead of once per PCA setting.
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
X_train_binary_poly = poly.fit_transform(X_train[binary])
X_valid_binary_poly = poly.transform(X_valid[binary])

for n_components in [380, 390, 400, 410, 420]:

  pca = PCA(n_components=n_components, random_state=123)

  # PCA is fitted on the training rows only; the non-binary columns bypass it
  # and are appended unchanged after the embedded columns.
  X_train_binary_poly_embedded = pca.fit_transform(X_train_binary_poly)
  X_train_embedded = np.concatenate([X_train_binary_poly_embedded, X_train[continuous]], 1)

  X_valid_binary_poly_embedded = pca.transform(X_valid_binary_poly)
  X_valid_embedded = np.concatenate([X_valid_binary_poly_embedded, X_valid[continuous]], 1)

  for alpha in [0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009]:
    reg = Lasso(alpha=alpha, max_iter=100000)
    reg.fit(X_train_embedded, y_train)
    y_valid_pred = reg.predict(X_valid_embedded)
    # np.sqrt(MSE) instead of `squared=False`, which was removed in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
    print(f'{n_components} components | alpha {alpha:.2e} | RMSE: {rmse:.4f}')

380 components | alpha 3.00e-03 | RMSE: 0.5657
380 components | alpha 4.00e-03 | RMSE: 0.5629
380 components | alpha 5.00e-03 | RMSE: 0.5601
380 components | alpha 6.00e-03 | RMSE: 0.5580
380 components | alpha 7.00e-03 | RMSE: 0.5559
380 components | alpha 8.00e-03 | RMSE: 0.5540
380 components | alpha 9.00e-03 | RMSE: 0.5538
390 components | alpha 3.00e-03 | RMSE: 0.5665
390 components | alpha 4.00e-03 | RMSE: 0.5623
390 components | alpha 5.00e-03 | RMSE: 0.5603
390 components | alpha 6.00e-03 | RMSE: 0.5587
390 components | alpha 7.00e-03 | RMSE: 0.5582
390 components | alpha 8.00e-03 | RMSE: 0.5575
390 components | alpha 9.00e-03 | RMSE: 0.5563
400 components | alpha 3.00e-03 | RMSE: 0.5507
400 components | alpha 4.00e-03 | RMSE: 0.5483
400 components | alpha 5.00e-03 | RMSE: 0.5469
400 components | alpha 6.00e-03 | RMSE: 0.5469
400 components | alpha 7.00e-03 | RMSE: 0.5462
400 components | alpha 8.00e-03 | RMSE: 0.5466
400 components | alpha 9.00e-03 | RMSE: 0.5474
410 component

### ここまで一番良かった手法で最終評価
* PCAで400次元に落とす
* Lassoのalphaを0.007にする

In [11]:
# Final evaluation on the test split with the setting chosen on the validation split:
# 400 PCA components and Lasso with alpha=0.007.
n_components = 400
pca = PCA(n_components=n_components, random_state=123)
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)

# Fit the interaction expansion and the PCA on the training rows only.
X_train_binary_poly = poly.fit_transform(X_train[binary])
X_train_binary_poly_embedded = pca.fit_transform(X_train_binary_poly)
X_train_embedded = np.concatenate([X_train_binary_poly_embedded, X_train[continuous]], 1)

reg = Lasso(alpha=0.007, max_iter=100000)
reg.fit(X_train_embedded, y_train)

# Apply the already-fitted transforms to the test rows; nothing is refitted on test data.
X_test_binary_poly_embedded = pca.transform(poly.transform(X_test[binary]))
X_test_embedded = np.concatenate([X_test_binary_poly_embedded, X_test[continuous]], 1)
y_test_pred = reg.predict(X_test_embedded)

# np.sqrt(MSE) instead of `squared=False`, which was deprecated in scikit-learn 1.4
# and removed in 1.6.
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_test_pred)))

RMSE: 0.6735017331270905
