# Regression-3: model performance

OLS、リッジ回帰、ランダムフォレスト、勾配ブースティングのアルゴリズム性能を比較してみましょう。<br>データはボストン・ハウジングデータを使いましょう。

## トレーニングデータ読み込み

In [13]:
import pandas as pd
from IPython.core.display import display
from sklearn.datasets import load_boston

X = pd.read_csv('./data/train.csv')
X.head()

Unnamed: 0,ID,Score,col1,col2,col3,col4,col5,col6,col7,col8,...,col3796,col3797,col3798,col3799,col3800,col3801,col3802,col3803,col3804,col3805
0,5,3.475628,0,4.058,0.824,0,10.267,0.728,4.403,0.05,...,1.067,0,0.0,0.115,30.395,24.541,0,0.415,0.997,0
1,8,3.601332,0,4.111,0.929,0,8.352,0.907,4.216,0.034,...,0.934,0,0.0,0.227,38.508,35.038,0,3.979,0.997,3
2,9,1.935003,0,4.139,0.833,66,9.494,0.733,4.069,0.267,...,1.722,0,0.0,0.148,27.932,19.518,0,0.849,0.999,0
3,12,3.283663,0,4.016,0.88,0,8.237,0.836,3.956,0.129,...,0.993,0,0.0,0.124,18.993,25.403,0,0.988,0.998,0
4,14,3.409121,0,4.657,0.522,0,35.882,0.383,4.234,-0.089,...,2.095,0,0.0,0.088,44.225,15.741,0,1.595,0.997,0


In [15]:
y = X['Score']
y.head()

0    3.475628
1    3.601332
2    1.935003
3    3.283663
4    3.409121
Name: Score, dtype: float64

In [16]:
X  = X.drop(['ID', 'Score'], axis=1)
X.head()

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,...,col3796,col3797,col3798,col3799,col3800,col3801,col3802,col3803,col3804,col3805
0,0,4.058,0.824,0,10.267,0.728,4.403,0.05,11,34.568,...,1.067,0,0.0,0.115,30.395,24.541,0,0.415,0.997,0
1,0,4.111,0.929,0,8.352,0.907,4.216,0.034,9,37.527,...,0.934,0,0.0,0.227,38.508,35.038,0,3.979,0.997,3
2,0,4.139,0.833,66,9.494,0.733,4.069,0.267,10,33.399,...,1.722,0,0.0,0.148,27.932,19.518,0,0.849,0.999,0
3,0,4.016,0.88,0,8.237,0.836,3.956,0.129,7,33.482,...,0.993,0,0.0,0.124,18.993,25.403,0,0.988,0.998,0
4,0,4.657,0.522,0,35.882,0.383,4.234,-0.089,24,53.268,...,2.095,0,0.0,0.088,44.225,15.741,0,1.595,0.997,0


ツリー系のアルゴリズム（ランダムフォレストや勾配ブースティングなど）を除き、通常、機械学習モデルは入力ベクトルのスケールを統一させる必要があります。ここではその処理をPipelineに組み込み対応しています。

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

# ホールドアウト
X_train,X_test,y_train,y_test = train_test_split(X,
                                                 y,
                                                 test_size=0.20,
                                                 random_state=1)

# pipeline setting
pipelines = {
     'ols': Pipeline([('scl',StandardScaler()),
                      ('est',LinearRegression())]),
     
     'ridge':Pipeline([('scl',StandardScaler()),
                       ('est',Ridge(random_state=0))]),

     'rf': Pipeline([('scl',StandardScaler()),
                     ('est',RandomForestRegressor(random_state=0))]),
     
     'gbr1': Pipeline([('scl',StandardScaler()),
                      ('est',GradientBoostingRegressor(random_state=0))]),

     'gbr2': Pipeline([('scl',StandardScaler()),
                      ('est',GradientBoostingRegressor(n_estimators=250,
                                                       random_state=0))])
}

# build and evaluate
scores = {}
for pipe_name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    scores[(pipe_name,'train')] = r2_score(y_train, pipeline.predict(X_train))
    scores[(pipe_name,'test')] = r2_score(y_test, pipeline.predict(X_test))

pd.Series(scores).unstack()

  from numpy.core.umath_tests import inner1d


Unnamed: 0,test,train
gbr1,0.4740051,0.560919
gbr2,0.5299021,0.673056
ols,-2.639079e+21,0.718904
rf,0.4811667,0.903736
ridge,0.3993156,0.674483


In [21]:
pipe =  Pipeline([('scl',StandardScaler()),
                      ('est',GradientBoostingRegressor(n_estimators=1000,
                                                       random_state=0))])

In [22]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('est', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
         ...          presort='auto', random_state=0, subsample=1.0, verbose=0,
             warm_start=False))])

In [23]:
print('train', r2_score(y_train, pipe.predict(X_train)))
print('test ', r2_score(y_test, pipe.predict(X_test)))

train 0.8680548059101914
test  0.5765084035701519


In [24]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

selector = RFE(GradientBoostingRegressor(n_estimators=200, random_state=0),
               n_features_to_select=200,
               step=.05)

selector.fit(X,y)

X_select = pd.DataFrame(selector.transform(X),
                     columns=X.columns[selector.support_])

print('X_select shape:(%i,%i)' % X_select.shape)
X_select.head()

X_select shape:(13731,200)


  if np.issubdtype(mask.dtype, np.int):


Unnamed: 0,col7,col15,col44,col46,col68,col70,col88,col99,col100,col111,...,col3692,col3699,col3701,col3722,col3748,col3753,col3760,col3791,col3794,col3797
0,4.403,0.0,2.001,1.4746,0.0,2.0,21.0,4.53,0.079,0.0,...,0.485,8.938,1.273,0.507,0.838,0.0,1.555,0.0,3.991,0.0
1,4.216,0.0,1.104,1.6038,78.705,1.0,30.0,5.463,0.388,0.0,...,0.678,4.166,1.227,0.591,0.776,0.0,0.811,0.0,4.038,0.0
2,4.069,0.0,0.559,2.4008,60.694,0.0,-19.0,3.985,0.305,0.0,...,0.635,0.0,0.782,0.722,0.987,0.0,1.097,0.0,3.779,0.0
3,3.956,0.0,1.108,1.693,18.011,0.0,7.0,4.053,0.143,0.0,...,0.619,16.894,1.233,0.35,0.955,0.0,1.847,0.0,3.832,0.0
4,4.234,3.869,1.069,1.7353,60.694,2.0,-16.0,3.827,0.06,0.0,...,0.549,4.674,0.78,0.881,0.646,0.0,1.176,0.0,3.959,0.0


In [25]:
# ホールドアウト
X_s_train,X_s_test,y_s_train,y_s_test = train_test_split(X_select,
                                                 y,
                                                 test_size=0.20,
                                                 random_state=1)


In [32]:
pipe =  Pipeline([('scl',StandardScaler()),
                      ('est',GradientBoostingRegressor(n_estimators=2000,
                                                       random_state=0))])

In [33]:
pipe.fit(X_s_train, y_s_train)

Pipeline(steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('est', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
         ...          presort='auto', random_state=0, subsample=1.0, verbose=0,
             warm_start=False))])

In [34]:
print('train', r2_score(y_s_train, pipe.predict(X_s_train)))
print('test ', r2_score(y_s_test, pipe.predict(X_s_test)))

train 0.9152663011380762
test  0.5783880520373195
