In [4]:
# jupyter notebook cell 너비 조절
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [5]:
import warnings
warnings.filterwarnings("ignore")

# Personal Project for ML/DL : [House Prices: Advanced Regression Techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)

### 이전 단계에서 전처리를 완료한 Data를 Machine Learning 알고리즘을 통해 학습하는 것을 목표로 한다.

## Metric
* Root-Mean-Squared-Error(RMSE)
$$ \text{RMSE} = \sqrt{\frac{1}{|\hat{R}|} \sum_{\hat{r}_{ui} \in \hat{R}}(r_{ui} - \hat{r}_{ui})^2} $$

In [6]:
from sklearn.externals import joblib
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [7]:
dfX_all = joblib.load('dfX_all.pkl')
dfX_test_all = joblib.load('dfX_test_all.pkl')

dfX_removed = joblib.load('dfX_removed.pkl')
dfX_test_removed = joblib.load('dfX_test_removed.pkl')

dfy = joblib.load('dfy.pkl')

print(dfX_all.shape)
print(dfX_test_all.shape)
print(dfX_removed.shape)
print(dfX_test_removed.shape)
print(dfy.shape)

(1458, 255)
(1459, 255)
(1458, 234)
(1459, 234)
(1458,)


# # Index
## Step 1 : Set criteria(Linear Regression)
## Step 2 : Single Algorithm learning
  * Ridge 회귀
  * Lasso 회귀
  * ElasticNet 회귀
  * Gradient Boosting Regression
  * Support Vector Regression
  * XGBoost Regression

## # Set Criteria t : Linear Regression 
* 가장 간단한 회귀 알고리즘은 Linear Regression
* Linear Regression은 복잡도를 제어할 수 없는 간단한 모형이기 때문에 Linear Regression으로 구한 값보다 좋으면 양호한 것으로 판단한다.

In [17]:
cv = KFold(10, shuffle=True, random_state=0)

In [6]:
linear_model = LinearRegression()

In [7]:
%%time
linear_score_all = np.sqrt(-cross_val_score(linear_model, dfX_all, dfy, scoring="neg_mean_squared_error", cv=cv))
linear_score_part = np.sqrt(-cross_val_score(linear_model, dfX_removed, dfy, scoring="neg_mean_squared_error", cv=cv))

Wall time: 1.33 s


In [8]:
print("Linear Regression(all) RMSE : {:.5f} ({:.5f})".format(linear_score_all.mean(), linear_score_all.std()))
print("Linear Regression(removed) RMSE : {:.5f} ({:.5f})".format(linear_score_part.mean(), linear_score_part.std()))

Linear Regression(all) RMSE : 0.12620 (0.01200)
Linear Regression(removed) RMSE : 0.12353 (0.01218)


* 제출 점수 확인

In [9]:
model_all = LinearRegression().fit(dfX_all, dfy)
model_removed = LinearRegression().fit(dfX_removed, dfy)

In [10]:
df_idx = pd.DataFrame(np.array(dfX_test_all.reset_index()["index"] + 1), columns=["Id"])

linear_pred_all = pd.DataFrame(np.exp(model_all.predict(dfX_test_all)), columns=["SalePrice"])
linear_pred_removed = pd.DataFrame(np.exp(model_removed.predict(dfX_test_removed)), columns=["SalePrice"])

linear_sub_all = pd.concat([df_idx, linear_pred_all], axis=1)
linear_sub_removed = pd.concat([df_idx, linear_pred_removed], axis=1)

linear_sub_all.to_csv('./linear_sub_all.csv', index=False)
linear_sub_removed.to_csv('./linear_sub_removed.csv', index=False)

* Score : submission 점수(validation과 오차)
* linear_sub_all RMSE score : 0.13514 (0.00894)
* linear_sub_removed RMSE score : 0.14042 (0.01689)

In [23]:
linear_all_sub_score = 0.13514
linear_removed_sub_score = 0.14042

In [8]:
def check_RMSE(algo_name, score_all, score_removed, gs_all, gs_removed):
    print("{}(all) RMSE : {:.5f} ({:.5f})".format(algo_name, score_all.mean(), score_all.std()))
    print("{}(removed) RMSE : {:.5f} ({:.5f})".format(algo_name, score_removed.mean(), score_removed.std()), "\n")


    print("{} parameters(all) :".format(algo_name), gs_all.best_estimator_.steps[0][1])
    print("{} parameters(removed) :".format(algo_name), gs_removed.best_estimator_.steps[0][1])

In [9]:
def store_sub_to_csv(gs_all, gs_removed, algo_name):
    pred_all = pd.DataFrame(np.exp(gs_all.best_estimator_.steps[0][1].predict(dfX_test_all)), columns=["SalePrice"])
    pred_removed = pd.DataFrame(np.exp(gs_removed.best_estimator_.steps[0][1].predict(dfX_test_removed)), columns=["SalePrice"])
    
    sub_all = pd.concat([df_idx, pred_all], axis=1)
    sub_removed = pd.concat([df_idx, pred_removed], axis=1)
    
    sub_all.to_csv('./{}_sub_all.csv'.format(algo_name), index=False)
    sub_removed.to_csv('./{}_sub_removed.csv'.format(algo_name), index=False)

## # Single Algorithm learning

### Ridge 회귀 모형
* 선형회귀 계수(weigt)에 대한 제약 조건을 추가함으로써 과최적화를 막는 정규화 선형회귀 방법 중 하나이다.
* 가중치들의 제곱합을 최소화 하는 것을 추가적인 제약 조건으로 한다.
$$ w = \text{arg}\min_w \left(\sum_{i=1}^N e_i^2 + \lambda \sum_{j_1}^M w_j^2 \right)$$
* $ \lambda \sum_{j_1}^M w_j^2 $ 이 추가된 규제항이 된다.
* `alpha` : 하이퍼모수 $\lambda$. 정규화 정도를 조절하며 크면 정규화 정도가 커지고 가중치의 값들이 작이진다. 0이 되면 일반적인 선형회귀모형이 된다.
* `max_iter` : gradient solver의 최대 반복 횟수이다.
* 규제항은 훈련하는 동안에만 비용 함수에 추가되며 성능을 평가하거나 예측할 때는 포함하지 않고 규제가 없는 성능 지표로 평가한다.
* 단점은 모든 예측 변수를 중요도에 따라 가중값만 축소시킬 뿐, 0값을 부여하지 않기 때문에 불필요한 변수가 제거되지 않고 항상 남아 있게 된다.

In [13]:
ridge_alpha = list(np.arange(0.5, 20, 0.5))
ridge_iter = list(range(1000, 3001, 500))
pipe_ridge = Pipeline([("ridge", Ridge())])
param_grid = [{"ridge__alpha" : ridge_alpha}, {"ridge__max_iter" : ridge_iter}]

ridge_gs1 = GridSearchCV(estimator=pipe_ridge, param_grid=param_grid, scoring="neg_mean_squared_error", cv=cv, n_jobs=-1)
ridge_gs2 = GridSearchCV(estimator=pipe_ridge, param_grid=param_grid, scoring="neg_mean_squared_error", cv=cv, n_jobs=-1)

In [14]:
%%time 
r_gs_all = ridge_gs1.fit(dfX_all, dfy)
r_gs_removed = ridge_gs2.fit(dfX_removed, dfy)

Wall time: 21.8 s


In [18]:
%%time
ridge_score_all = np.sqrt(-cross_val_score(r_gs_all.best_estimator_, dfX_all, dfy, scoring="neg_mean_squared_error", cv=cv))
ridge_score_removed = np.sqrt(-cross_val_score(r_gs_removed.best_estimator_, dfX_removed, dfy, scoring="neg_mean_squared_error", cv=cv))

Wall time: 553 ms


In [27]:
check_RMSE("Ridge", ridge_score_all, ridge_score_removed, r_gs_all, r_gs_removed)
store_sub_to_csv(r_gs_all, r_gs_removed, "ridge")

Ridge(all) RMSE : 0.11286 (0.01269)
Ridge(removed) RMSE : 0.11593 (0.01321) 

Ridge parameters(all) : Ridge(alpha=19.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
Ridge parameters(removed) : Ridge(alpha=11.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)


* Score : submission 점수(validation과 오차)
* ridge_sub_all RMSE score : 0.11983 (0.00697)
* ridge_sub_removed RMSE score : 0.12751 (0.01158)

In [24]:
r_all_sub_score = 0.11983
r_removed_sub_score = 0.12751

### Lasso 회귀 모형
* 선형회귀 계수(weigt)에 대한 제약 조건을 추가함으로써 과최적화를 막는 정규화 선형회귀 방법 중 하나이다.
* 가중치들의 절대값의 합을 최소화 하는 것을 추가적인 제약 조건으로 한다.

$$ w = \text{arg}\min_w \left(\sum_{i=1}^N e_i^2 + \lambda \sum_{j=1}^M | w_j | \right) $$
* $ \lambda \sum_{j_1}^M | w_j | $ 이 추가된 규제항이 된다.
* `alpha` : 하이퍼모수 $\lambda$. 정규화 정도를 조절하며 크면 정규화 정도가 커지고 가중치의 값들이 작이진다. 0이 되면 일반적인 선형회귀모형이 된다.
* Lasso의 중요한 특징은 덜 중요한 특성의 가중치를 0으로 만들어 완전히 제거하려고 한다는 점이다. 다시 말해 Lasso는 자동으로 특성 선택을 하고 희소 모델(sparse model)을 만든다. 즉, 0이 아닌 특성의 가중치가 적다.

In [28]:
lasso_alpha = list(np.arange(0.5, 20, 0.5))
pipe_lasso = Pipeline([('lasso', Lasso())])
param_grid = [{'lasso__alpha' : lasso_alpha}]

lasso_gs1 = GridSearchCV(estimator=pipe_lasso, param_grid=param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
lasso_gs2 = GridSearchCV(estimator=pipe_lasso, param_grid=param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

In [29]:
%%time 
l_gs_all = lasso_gs1.fit(dfX_all, dfy)
l_gs_removed = lasso_gs2.fit(dfX_removed, dfy)

Wall time: 15.7 s


In [30]:
%%time
lasso_score_all = np.sqrt(-cross_val_score(l_gs_all.best_estimator_, dfX_all, dfy, scoring="neg_mean_squared_error", cv=cv))
lasso_score_removed = np.sqrt(-cross_val_score(l_gs_removed.best_estimator_, dfX_removed, dfy, scoring="neg_mean_squared_error", cv=cv))

Wall time: 2.31 s


In [31]:
check_RMSE("Lasso", lasso_score_all, lasso_score_removed, l_gs_all, l_gs_removed)
store_sub_to_csv(l_gs_all, l_gs_removed, "lasso")

Lasso(all) RMSE : 0.16040 (0.01670)
Lasso(removed) RMSE : 0.16104 (0.01657) 

Lasso parameters(all) : Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
Lasso parameters(removed) : Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)


* Score : submission 점수(validation과 오차)
* lasso_sub_all RMSE score : 0.17650 (0.0161)
* lasso_sub_removed RMSE score : 0.17847 (0.01743)

In [26]:
l_all_sub_score = 0.17650
l_removed_sub_score = 0.17847

### ElasticNet 회귀 모형
* 선형회귀 계수(weigt)에 대한 제약 조건을 추가함으로써 과최적화를 막는 정규화 선형회귀 방법 중 하나이다
* 가중치의 절대값의 합과 제곱합을 동시에 제약 조건으로 가지는 것으로, Ridge 회귀와 Lasso 회귀를 절충한 모델이다.
$$ w = \text{arg}\min_w \left( \sum_{i=1}^N e_i^2 + \lambda_1 \sum_{j=1}^M | w_j | + \lambda_2 \sum_{j=1}^M w_j^2 \right) $$
* $\lambda_1$과 $\lambda_2$ 두 개의 하이퍼 모수를 가진다. 혼합 정도는 혼합 비율 r을 사용해 조절한다. r=0이면 Ridge 회귀와 같고 r=1이면 Lasso 회귀와 같다.
* `alpha` : 정규화 정도를 조절하며 크면 정규화 정도가 커지고 가중치의 값들이 작이진다. 0이 되면 일반적인 선형회귀모형이 된다.
* `l1_ratio` : $\lambda_1$과 $\lambda_2$ 의 혼합 비율

In [32]:
elastic_alpha = list(np.arange(0.5, 20, 0.5))
elastic_ratio = list(np.arange(0, 1, 0.1))
elastic_iter = list(range(1000, 3001, 500))

pipe_elastic = Pipeline([('elastic', ElasticNet())])
param_grid = [{'elastic__alpha' : elastic_alpha}, {'elastic__l1_ratio' : elastic_ratio}, {'elastic__max_iter' : elastic_iter}]

elastic_gs1 = GridSearchCV(estimator=pipe_elastic, param_grid=param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
elastic_gs2 = GridSearchCV(estimator=pipe_elastic, param_grid=param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

In [33]:
%%time 
elastic_gs_all = elastic_gs1.fit(dfX_all, dfy)
elastic_gs_removed = elastic_gs2.fit(dfX_removed, dfy)

Wall time: 50.6 s


In [34]:
%%time
elastic_score_all = np.sqrt(-cross_val_score(elastic_gs_all.best_estimator_, dfX_all, dfy, scoring="neg_mean_squared_error", cv=cv))
elastic_score_removed = np.sqrt(-cross_val_score(elastic_gs_removed.best_estimator_, dfX_removed, dfy, scoring="neg_mean_squared_error", cv=cv))

Wall time: 10.6 s


In [35]:
check_RMSE("ElasticNet", elastic_score_all, elastic_score_removed, elastic_gs_all, elastic_gs_removed)
store_sub_to_csv(elastic_gs_all, elastic_gs_removed, "elastic")

ElasticNet(all) RMSE : 0.12943 (0.01566)
ElasticNet(removed) RMSE : 0.13380 (0.01696) 

ElasticNet parameters(all) : ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.0,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
ElasticNet parameters(removed) : ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.0,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)


* Score : submission 점수(validation과 오차)
* elastic_sub_all RMSE score : 0.14259 (0.01316)
* elastic_sub_removed RMSE score : 0.15012 (0.01632)

In [27]:
elastic_all_sub_score = 0.14259
elastic_removed_sub_score = 0.15012

### Gradient Boosting Regression
* 약한 학습기(Weak Learner)를 결합하여 강한 학습기(Strong Learner)를 만드는 Boosting 방법 중 하나로 Gradien descent를 사용하여 최적의 파라미터를 찾는 Boosting 방법이다.
* Ensemble에 이전까지의 오차를 보정하도록 예측기를 순차적으로 추가한다. 
* `loss` : 손실함수(loss function)를 지정하는 파라미터. 'ls'는 least squares로 최소자승법(residual의 제곱의 합을 최소화)을 의미. 'lad'는 least absolute deviation으로 오차의 절대값의 합계를 의미한다. 'huber'는 huber loss로 ls와 lad를 절충한 것이다. 일정한 범위를 정해서 그 안에 있으면 오차를 구하고, 그 밖에 있으면 오차의 절대값을 구하는 것이다.
* `max_features` : 분할할 최적의 숫자를 지정한다.

In [36]:
gbr_loss = ['ls', 'lad', 'huber']
gbr_lr = [0.01, 0.05, 0.1, 0.5]
gbr_estimator = list(range(100, 3501, 500))
gbr_depth = range(3, 21, 1)
gbr_features = ["auto", "sqrt", "log2", None]

In [37]:
pipe_gbr = Pipeline([('gbr', GradientBoostingRegressor())])
gbr_param_grid = [{'gbr__loss' : gbr_loss}, {'gbr__learning_rate' : gbr_lr}, {'gbr__n_estimators' : gbr_estimator}, {'gbr__max_depth' : gbr_depth}
             , {'gbr__max_features' : gbr_features}]

gbr_gs1 = GridSearchCV(estimator=pipe_gbr, param_grid=gbr_param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
gbr_gs2 = GridSearchCV(estimator=pipe_gbr, param_grid=gbr_param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

In [38]:
%%time 
gbr_gs_all = gbr_gs1.fit(dfX_all, dfy)
gbr_gs_removed = gbr_gs2.fit(dfX_removed, dfy)

Wall time: 49min 32s


In [39]:
%%time
gbr_score_all = np.sqrt(-cross_val_score(gbr_gs_all.best_estimator_, dfX_all, dfy, scoring="neg_mean_squared_error", cv=cv))
gbr_score_removed = np.sqrt(-cross_val_score(gbr_gs_removed.best_estimator_, dfX_removed, dfy, scoring="neg_mean_squared_error", cv=cv))

Wall time: 9min 19s


In [40]:
check_RMSE("GBR", gbr_score_all, gbr_score_removed, gbr_gs_all, gbr_gs_removed)
store_sub_to_csv(gbr_gs_all, gbr_gs_removed, "gbr")

GBR(all) RMSE : 0.11839 (0.01267)
GBR(removed) RMSE : 0.12345 (0.01288) 

GBR parameters(all) : GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=1600, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)
GBR parameters(removed) : GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_es

In [41]:
joblib.dump(gbr_gs_all, "gbr_gs_all.pkl")
joblib.dump(gbr_gs_removed, "gbr_gs_removed.pkl")

['gbr_gs_removed.pkl']

* gbr_sub_all RMSE score : 0.13461 (0.01622)
* gbr_sub_removed RMSE score : 0.13569 (0.01224)

In [28]:
gbr_all_sub_score = 0.13461
gbr_removed_sub_score = 0.13569

### Support Vector Regression
* SVM의 maximal margin과 같은 특징들을 유지하여 Regression을 사용할 수 있다.
* SVM 회귀는 SVM 분류의 목표와는 반대로 한다. 일정한 마진 오류 안에서 두 클래스 간의 도로 폭이 가능한 최대가 되도록 하면서 제한된 마진 오류 안에서 가능한 많은 샘플이 들어가도록 학습한다.
* `kernel` : 알고리즘에 사용할 kernel 유형을 지정한다. 'rbf'는 가우시안 방사 기저 함수(Radial Basis Function)을 의미하며 비선형 특성을 다루는 함수 중 하나이다. 여기에서는 'rbf'만 사용한다. (kernel 인수의 종류로는 'linear', 'poly', 'rbf','sigmoid'가 있다.
* `gamma` : 하나의 훈련 샘플이 영향을 미치는 범위를 결정한다. 작은 값은 넓은 범위를 의미하며 큰 값은 영향을 미치는 범위가 제한적이다.
* `C` : error의 규제 parameter이다. C를 줄이면 margin의 폭이 넓어지지만 margin 오류도 커진다. C가 커지면 margin이 좁아지지만 margin 오류가 적다.
* `epsilon` : margin의 폭을 지정한다.(허용오차와 다르다.)
* `tol` : 허용오차

In [42]:
svr_gamma = ['auto_deprecated', 1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
svr_C = [0.1, 0.5, 1, 5, 10, 50, 100]
svr_epsilon = [0.0001, 0.0005,0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 1.5]
svr_tol = [0.0001, 0.001, 0.01, 0.1]

In [43]:
pipe_svr = Pipeline([('svr', SVR())])
svr_param_grid = [{'svr__gamma' : svr_gamma}, {'svr__C' : svr_C}, {'svr__epsilon' : svr_epsilon}, {'svr__tol' : svr_tol}]

svr_gs1 = GridSearchCV(estimator=pipe_svr, param_grid=svr_param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
svr_gs2 = GridSearchCV(estimator=pipe_svr, param_grid=svr_param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

In [44]:
%%time 
svr_gs_all = svr_gs1.fit(dfX_all, dfy)
svr_gs_removed = svr_gs2.fit(dfX_removed, dfy)

Wall time: 5min 43s


In [45]:
%%time
svr_score_all = np.sqrt(-cross_val_score(svr_gs_all.best_estimator_, dfX_all, dfy, scoring="neg_mean_squared_error", cv=cv))
svr_score_removed = np.sqrt(-cross_val_score(svr_gs_removed.best_estimator_, dfX_removed, dfy, scoring="neg_mean_squared_error", cv=cv))

Wall time: 18.2 s


In [46]:
check_RMSE("SVR", svr_score_all, svr_score_removed, svr_gs_all, svr_gs_removed)
store_sub_to_csv(svr_gs_all, svr_gs_removed, "svr")

SVR(all) RMSE : 0.39571 (0.02713)
SVR(removed) RMSE : 0.39548 (0.02703) 

SVR parameters(all) : SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0001,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
SVR parameters(removed) : SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0001,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


* svr_sub_all RMSE score : 0.41716 (0.02145)
* svr_sub_removed RMSE score : 0.41656 (0.02108)

In [29]:
svr_all_sub_score = 0.41716
svr_removed_sub_score = 0.41656

In [47]:
joblib.dump(svr_gs_all, "svr_gs_all.pkl")
joblib.dump(svr_gs_removed, "svr_gs_removed.pkl")

['svr_gs_removed.pkl']

### XGBoost Regression
* XGBoost는 Extreme Gradient Boosting의 약자로 Gradient Boosting 알고리즘을 핵심으로 한다.
* 병렬 처리를 사용하기 때문에 학습과 계산이 빠르고 Greedy-algorithm을 사용한 자동 가지치기로 과적합이 잘 발생하지 않는다.

In [48]:
xgb_depth = list(range(3, 21))
xgb_lr = [0.005, 0.01, 0.05, 0.1, 0.5]
xgb_estimators = list(range(100, 3501, 100))
xgb_bytree = [0.6, 0.7, 0.8, 0.9, 1]
xgb_child_weight = list(np.arange(0, 1.1, 0.1))

In [49]:
pipe_xgb = Pipeline([('xgb', XGBRegressor())])
xgb_param_grid = [{'xgb__max_depth' : xgb_depth}, {'xgb__learning_rate' : xgb_lr}, {'xgb__n_estimators' : xgb_estimators}, 
                  {'xgb__colsample_bytree' : xgb_bytree}, {'xgb__min_child_weight' : xgb_child_weight}]
xgb_gs1 = GridSearchCV(estimator=pipe_xgb, param_grid=xgb_param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
xgb_gs2 = GridSearchCV(estimator=pipe_xgb, param_grid=xgb_param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

In [50]:
%%time 
xgb_gs_all = xgb_gs1.fit(dfX_all, dfy)
xgb_gs_removed = xgb_gs2.fit(dfX_removed, dfy)

Wall time: 4h 33min 35s


In [51]:
%%time
xgb_score_all = np.sqrt(-cross_val_score(xgb_gs_all.best_estimator_, dfX_all, dfy, scoring="neg_mean_squared_error", cv=KFold(5, shuffle=True, random_state=0)))
xgb_score_removed = np.sqrt(-cross_val_score(xgb_gs_removed.best_estimator_, dfX_removed, dfy, scoring="neg_mean_squared_error", cv=KFold(5, shuffle=True, random_state=0)))

Wall time: 2min 26s


In [52]:
check_RMSE("XGB", xgb_score_all, xgb_score_removed, xgb_gs_all, xgb_gs_removed)
store_sub_to_csv(xgb_gs_all, xgb_gs_removed, "xgb")

XGB(all) RMSE : 0.11742 (0.00971)
XGB(removed) RMSE : 0.12090 (0.01061) 

XGB parameters(all) : XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)
XGB parameters(removed) : XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=700,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1

In [53]:
joblib.dump(xgb_gs_all, "xgb_gs_all.pkl")
joblib.dump(xgb_gs_removed, "xgb_gs_removed.pkl")

['xgb_gs_removed.pkl']

* xgb_sub_all RMSE score : 0.13020 (0.01278)
* xgb_sub_removed RMSE score : 0.13660 (0.0157)

In [30]:
xgb_all_sub_score = 0.13020
xgb_removed_sub_score = 0.13660

In [15]:
# xgb_gs_all = joblib.load("xgb_gs_all.pkl")
# xgb_gs_all = joblib.load("xgb_gs_removed.pkl")

### 7개 알고리즘의 Submission Score 확인

In [18]:
linear_all_best = LinearRegression()
linear_removed_best = LinearRegression()

# r_all_best = r_gs_all.best_estimator_.steps[0][1]
# r_removed_best = r_gs_removed.best_estimator_.steps[0][1]
r_all_best = Ridge(alpha=19.5, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=None, solver='auto', tol=0.001)
r_removed_best = Ridge(alpha=11.0, copy_X=True, fit_intercept=True, max_iter=None,normalize=False, random_state=None, solver='auto', tol=0.001)

# l_all_best = l_gs_all.best_estimator_.steps[0][1]
# l_removed_best = l_gs_removed.best_estimator_.steps[0][1]
l_all_best = Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None,
                   selection='cyclic', tol=0.0001, warm_start=False)
l_removed_best = Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None,
                       selection='cyclic', tol=0.0001, warm_start=False)

# elastic_all_best = elastic_gs_all.best_estimator_.steps[0][1]
# elastic_removed_best = elastic_gs_removed.best_estimator_.steps[0][1]
elastic_all_best = ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.0, max_iter=1000, normalize=False, positive=False, 
                              precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
elastic_removed_best = ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.0, max_iter=1000, normalize=False, positive=False, 
                                  precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

# gbr_all_best = gbr_gs_all.best_estimator_.steps[0][1]
# gbr_removed_best = gbr_gs_removed.best_estimator_.steps[0][1]
gbr_all_best = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None, learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
                                         max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1,
                                         min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=1600, n_iter_no_change=None, presort='auto',
                                         random_state=None, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False)
gbr_removed_best = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None, learning_rate=0.1, loss='ls', max_depth=3, 
                                             max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, 
                                             min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=3100, 
                                             n_iter_no_change=None, presort='auto', random_state=None, subsample=1.0, tol=0.0001, 
                                             validation_fraction=0.1, verbose=0, warm_start=False)

# svr_all_best = svr_gs_all.best_estimator_.steps[0][1]
# svr_removed_best = svr_gs_removed.best_estimator_.steps[0][1]
svr_all_best = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0001, kernel='rbf', max_iter=-1, shrinking=True, 
                   tol=0.001, verbose=False)
svr_removed_best = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0001, kernel='rbf', max_iter=-1, shrinking=True, 
                       tol=0.001, verbose=False)

# xgb_all_best = xgb_gs_all.best_estimator_.steps[0][1]
# xgb_removed_best = xgb_gs_removed.best_estimator_.steps[0][1]
xgb_all_best = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, 
                            importance_type='gain', learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, 
                            n_estimators=1000, n_jobs=1, nthread=None, objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1, 
                            scale_pos_weight=1, seed=None, silent=None, subsample=1, verbosity=1)
xgb_removed_best = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, 
                                importance_type='gain', learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, 
                                n_estimators=700, n_jobs=1, nthread=None, objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1, 
                                scale_pos_weight=1, seed=None, silent=None, subsample=1, verbosity=1)

In [34]:
algo_names = ["Linear Regression", "Ridge", "Lasso", "ElasticNet", "Gradient Boosting Regression", "Support Vector Regression", "XGBoost Regression"]
all_sub_scores = [linear_all_sub_score, r_all_sub_score, l_all_sub_score, elastic_all_sub_score, gbr_all_sub_score, svr_all_sub_score, xgb_all_sub_score]
removed_sub_scores = [linear_removed_sub_score, r_removed_sub_score, l_removed_sub_score, elastic_removed_sub_score, gbr_removed_sub_score, 
                      svr_removed_sub_score, xgb_removed_sub_score]

In [39]:
print("[Regression of All columns]")
for i in range(len(algo_names)):
    print("{} (all) RMSE : {}".format(algo_names[i], all_sub_scores[i]))
print("\n")
    
print("[Regression of Removed columns]")
for i in range(len(algo_names)):
    print("{} (all) RMSE : {}".format(algo_names[i], removed_sub_scores[i]))

[Regression of All columns]
Linear Regression (all) RMSE : 0.13514
Ridge (all) RMSE : 0.11983
Lasso (all) RMSE : 0.1765
ElasticNet (all) RMSE : 0.14259
Gradient Boosting Regression (all) RMSE : 0.13461
Support Vector Regression (all) RMSE : 0.41716
XGBoost Regression (all) RMSE : 0.1302


[Regression of Removed columns]
Linear Regression (all) RMSE : 0.14042
Ridge (all) RMSE : 0.12751
Lasso (all) RMSE : 0.17847
ElasticNet (all) RMSE : 0.15012
Gradient Boosting Regression (all) RMSE : 0.13569
Support Vector Regression (all) RMSE : 0.41656
XGBoost Regression (all) RMSE : 0.1366
