In [None]:
# GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=2017).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, train_target, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [None]:
### Gradient Boosting Regression

Reference
- [Gradient Boosted Regression Trees](https://www.datarobot.com/blog/gradient-boosted-regression-trees/)
- [Caifornia house price predictions with Gradient Boosted Regression Trees](https://shankarmsy.github.io/stories/gbrt-sklearn.html)

In [None]:
# =================================================
# model_selection.GridSearchCV
# =================================================
# 1) Create model
# Set params
# Scores XXX
est = GradientBoostingRegressor(n_estimators=3000)

# 2) Set the grid
param_grid = {'n_estimators':[100,1000,3000], 
              'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [4, 6, 8],
              'min_samples_leaf': [3, 5, 9, 17],
#              'min_saples_split': [5, 10, 15],
              'max_features': [1.0, 0.3, 0.1] ## not possible in our example (only 1 fx)
              }
# 3) Run GridSearch
grid = GridSearchCV(est, param_grid, n_jobs=5).fit(X_train, y_train)

# 4) Show best Params and Score
print("_"*30)
print("Result of Gridsearch")
print("Best params: ", grid.best_params_)
print("Best Estimator: ", grid.best_estimator_)
print("MSE: ", grid.best_score_)

# 5) Learning with best params
gbm_g = GradientBoostingRegressor(**grid.best_params_)
gbm_g.fit(X_train, y_train)
y_pred_gs = gbm_g.predict(X_test)

# 6) The error metric: RMSE
print("_"*30)
print("vs Prediction")
print("RMSE from local train: ", rmse(y_test, y_pred_gs))
print("MSE from local train: ", mean_squared_error(y_test, y_pred_gs))
print("R2 from local train: ", r2_score(y_test, y_pred_gs))

In [None]:
### LGBM

In [None]:
# =================================================
# model_selection.GridSearchCV
# =================================================
# 1) Create model
mdl = lgb.LGBMRegressor(boosting_type= 'gbdt', 
                        n_jobs = 5,
                        metric='RMSE'
                       )


#Best params:  {'colsample_bytree': 0.8, 'learning_rate': 0.05, 
#'max_depth': 3, 'n_estimators': 100, 'num_leaves': 5, 
#'objective': 'regression', 'reg_alpha': 1.2, 'reg_lambda': 1.4, 'subsample': 0.75}

# 2) Set params for gridsearch
gridParams = {
    'objective': ['binary','regression'],
    'num_leaves': [4,5,6], #2,10,20,100   
    'learning_rate': [0.05, 0.06], # 0.005,
    'n_estimators': [100], #8,24,
    'colsample_bytree' :[0.8, 0.85, 0.9], #0.64,
    'reg_lambda' : [1.3,1.4,1.5], #1,1.2,
    'max_depth' :[2,3,4], #1,2,5,10
    'subsample' :[0.7,0.75], 
    'reg_alpha' : [1.2], #0.1,0.51,
#    'min_split_gain' :[],
#    'subsample_for_bin' :[],
#    'max_drop' :[], 
#    'gaussian_eta' :[], 
#    'drop_rate' :[],
#    'silent' :[], 
#    'boosting_type' :['gbdt'], 
#    'min_child_weight' :[], 
#    'skip_drop' :[], 
#    'fair_c' :[], 
#    'seed' :[], 
#    'poisson_max_delta_step' :[], 
#    'subsample_freq' :[], 
#    'max_bin' :[],  #55
#    'nthread' :[], 
#    'min_child_samples' :[], 
#    'huber_delta' :[], 
#    'use_missing' :[], 
#    'uniform_drop' :[], 
#    'bagging_fraction': [] #0.8,
#    'bagging_freq': [] # 5
#    'feature_fraction': [] # 0.2319,
#    'feature_fraction_seed': [] #9
#    'bagging_seed': [] #9,
#    'min_data_in_leaf': [] #6
#    'min_sum_hessian_in_leaf': [] # 11                              
#    'xgboost_dart_mode' :[]
}

# 3) Run GridSearch
grid = GridSearchCV(mdl, gridParams, verbose=1, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

# 4) Show best Params and Score
print("_"*30)
print("Result of Gridsearch")
print("Best params: ", grid.best_params_)
print("Best Estimator: ", grid.best_estimator_)
print("MSE: ", grid.best_score_)

# 5) Learning with best params
lgm_g = lgb.LGBMRegressor(**grid.best_params_)
lgm_g.fit(X_train, y_train)
y_pred_gs = lgm_g.predict(X_test)

# 6) The error metric: RMSE
print("_"*30)
print("vs Prediction")
print("RMSE from local train: ", rmse(y_test, y_pred_gs))
print("MSE from local train: ", mean_squared_error(y_test, y_pred_gs))
print("R2 from local train: ", r2_score(y_test, y_pred_gs))


In [None]:
__ Score: __
```
______________________________
Result of Gridsearch
Best params:  {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'num_leaves': 5, 'objective': 'regression', 'reg_alpha': 1.2, 'reg_lambda': 1.4, 'subsample': 0.75}
Best Estimator:  LGBMRegressor(boosting_type='gbdt', colsample_bytree=0.8, learning_rate=0.05,
       max_bin=255, max_depth=3, metric='RMSE', min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=5, num_leaves=5, objective='regression', random_state=None,
       reg_alpha=1.2, reg_lambda=1.4, silent=True, subsample=0.75,
       subsample_for_bin=200000, subsample_freq=1)
MSE:  0.349386487223
______________________________
vs Prediction
RMSE from local train:  6.02327567741
MSE from local train:  36.2798498861
R2 from local train:  0.395531874947
```

In [None]:
### Kernel Ridge

In [None]:
# =================================================
# model_selection.GridSearchCV
# =================================================
# 1) Create model
model = KernelRidge()

# 2) Set params for gridsearch
param_grid = {
    "alpha": [1e0, 0.1, 1e-2, 1e-3],
    "gamma": np.logspace(-2, 2, 5),
    "kernel" : ['polynomial','rbf'],
    "degree" : [2,5,10,20], 
    "coef0" : [2.5,5,10,20],
}

# 3) Run GridSearch
model_ = GridSearchCV(estimator= model, param_grid= param_grid, scoring='neg_mean_squared_error',cv=5, n_jobs=-1)
model_.fit(X_train, y_train)

# 4) Show best Params and Score
print("_"*30)
print("Result of Gridsearch")
print("Best params: ", model_.best_params_)
print("Best Estimator: ", model_.best_estimator_)
print("MSE: ", model_.best_score_)

# 5) Learning with best params
krr_g = KernelRidge(**model_.best_params_)
krr_g.fit(X_train, y_train)
y_pred_gs = krr_g.predict(X_test)

# 6) The error metric: RMSE
print("_"*30)
print("vs Prediction")
print("RMSE from local train: ", rmse(y_test, y_pred_gs))
print("MSE from local train: ", mean_squared_error(y_test, y_pred_gs))
print("R2 from local train: ", r2_score(y_test, y_pred_gs))

In [None]:
### Lasso

In [None]:
Referring to followings
- [Exploring features and regression models](https://www.kaggle.com/youssefer/xgb-and-lasso-regression)
- [House Prices # Regression and Bagging techniques](https://www.kaggle.com/aarti1/house-prices-regression-and-bagging-techniques)
- [XGB and Lasso Regression](https://www.kaggle.com/youssefer/xgb-and-lasso-regression)

In [None]:
#------------------------------------------------------------
print ("10	Lasso regression")
# importance of train set size: first, we set a relevant alpha

rm_tr=[]
rm_te=[]

opti=[]
alphas=[1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1]
# alphas=np.linspace(1e-4,1e-2,20)
#X_train, X_test, y_train, y_test = train_test_split(
#     feat, price, test_size=0.8, random_state=42)

for al in alphas:
	ls=Lasso(alpha=al, copy_X=True, fit_intercept=True, max_iter=5000,
	   normalize=False, positive=False, precompute=False, random_state=111,
	   selection='cyclic', tol=0.0001, warm_start=False)
	ls.fit(X_train,y_train)
	rm_tr.append(np.sqrt(mean_squared_error(y_train,ls.predict(X_train))))
	rm_te.append(np.sqrt(mean_squared_error(y_test,ls.predict(X_test))))

plt.figure()
plt.cla()
plt.clf()
plt.plot(np.log(alphas),rm_tr,np.log(alphas),rm_te,"r")	
plt.title("Train and test error vs log(alphas)")
#plt.savefig('fig4.png')

In [None]:
# =================================================
# model_selection.GridSearchCV
# =================================================
# 1) Create model
model = Lasso()

# 2) Set params for gridsearch
param_grid = { 'alpha': [i/100000 for i in range(1,50000)]}

# 3) Run GridSearch
model_ = GridSearchCV(estimator= model, param_grid= param_grid, scoring='neg_mean_squared_error',cv=5, n_jobs=-1)
model_.fit(X_train, y_train)

# 4) Show best Params and Score
print("_"*30)
print("Result of Gridsearch")
print("Best params: ", model_.best_params_)
print("Best Estimator: ", model_.best_estimator_)
print("MSE: ", model_.best_score_)

# 5) Learning with best params
las_g = xgb.XGBRegressor(**model_.best_params_)
las_g.fit(X_train, y_train)
y_pred_gs = las_g.predict(X_test)

# 6) The error metric: RMSE
print("_"*30)
print("vs Prediction")
print("RMSE from local train: ", rmse(y_test, y_pred_gs))
print("MSE from local train: ", mean_squared_error(y_test, y_pred_gs))
print("R2 from local train: ", r2_score(y_test, y_pred_gs))

In [None]:
# =================================================
# Manual GridSearch
# =================================================
# Search `alpha` regression parameters
#x_train = x[x.train_test == 1]
lasso = Lasso(max_iter=1e2, normalize=True)
alphas = np.logspace(-5, -3, 10)
scores = []
scores_std = []
for alpha in alphas:
    lasso.alpha = alpha
    this_scores = np.sqrt(-cross_val_score(lasso, train, train_target, cv=5, scoring='mean_squared_error'))
    print('Compute alpha = {} - {}'.format(alpha, np.mean(this_scores)))
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))
plt.figure(figsize=(4, 3))
plt.semilogx(alphas, scores)
plt.semilogx(alphas, np.array(scores) + np.array(scores_std) / np.sqrt(len(train)), 'b--')
plt.semilogx(alphas, np.array(scores) - np.array(scores_std) / np.sqrt(len(train)), 'b--')
plt.ylabel('CV score')
plt.xlabel('alpha')
plt.axhline(np.max(scores), linestyle='--', color='.5')
#plt.savefig('lasso_lars.png')

In [None]:
### Elastic Net Regression

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
enr_cv = ElasticNetCV(cv=5, random_state=2017)
enr_cv.fit(X_train, y_train)
print(enr_cv.alpha_)
print(enr_cv.l1_ratio_)
#print(enr_cv.intercept_) 

In [None]:
# =================================================
# model_selection.GridSearchCV
# =================================================
# 1) Create model
regr = ElasticNet()

# 2) Set params for gridsearch
enr_params = {
    'alpha' : [0.0640, 0.0645, 0.0649678226388, 0.65],
    'l1_ratio' : [0.4,0.5,0.6,0.7,0.8,0.9]
}

# 3) Run gridsearch
grid_enr = GridSearchCV(regr,enr_params,scoring='neg_mean_squared_error',cv=5,n_jobs=-1)
grid_enr.fit(X_train, y_train)

# 4) Show best params and score
print("_"*30)
print("Result of Gridsearch")
print("Best params: ", grid_enr.best_params_)
print("Best Estimator: ", grid_enr.best_estimator_)
print("MSE: ", grid_enr.best_score_)

# 5) Learning with best params
eln_g = ElasticNet(**grid_enr.best_params_)
eln_g.fit(X_train, y_train)
y_pred_gs = eln_g.predict(X_test)

# 6) The error metric: RMSE
print("_"*30)
print("vs Prediction")
print("RMSE from local train: ", rmse(y_test, y_pred_gs))
print("MSE from local train: ", mean_squared_error(y_test, y_pred_gs))
print("R2 from local train: ", r2_score(y_test, y_pred_gs))

In [None]:
### XGBoost

In [None]:
# =================================================
# model_selection.GridSearchCV
# =================================================
# 1) xgboostモデルの作成
reg = xgb.XGBRegressor()

# 2) XGBoost params
xgb_params = {
    'objective' : ['reg:gamma','reg:linear'],
    'learning_rate' : [0.05,0.75,0.1,0.125],
    'n_estimators' : [50,100,200],
    'max_depth' : [2,4,6],
    'subsample' : [0.79,0.8,0.81,0.85],
    'colsample_bytree' : [0.9,1.0],
    'min_child_weight' : [13,14, 15, 16]
}

# 3) Run GridSearch
grid_xgb = GridSearchCV(reg,xgb_params,scoring='neg_mean_squared_error',cv=5,n_jobs=-1)
grid_xgb.fit(X_train, y_train)

# 4) Show best Params and Score
print("_"*30)
print("Result of Gridsearch")
print("Best params: ", grid_xgb.best_params_)
print("Best Estimator: ", grid_xgb.best_estimator_)
print("MSE: ", grid_xgb.best_score_)

# 5) Learning with best params
xgr_g = xgb.XGBRegressor(**grid_xgb.best_params_)
xgr_g.fit(X_train, y_train)
y_pred_gs = xgr_g.predict(X_test)

# 6) The error metric: RMSE
print("_"*30)
print("vs Prediction")
print("RMSE from local train: ", rmse(y_test, y_pred_gs))
print("MSE from local train: ", mean_squared_error(y_test, y_pred_gs))
print("R2 from local train: ", r2_score(y_test, y_pred_gs))

# 総当たり戦

In [3]:
from sklearn.datasets import load_boston
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline

boston = load_boston()
X=pd.DataFrame(boston.data[:,:], columns=boston.feature_names)
y=pd.DataFrame(boston.target[:])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

lin_1d = LinearRegression()
x_train = X_train.loc[:, ['LSTAT']].as_matrix()
lin_1d.fit(x_train, y_train)

n = np.linspace(np.min(x_train),np.max(x_train), 1000)
y_1d_fit=lin_1d.predict(n[:,np.newaxis])

# LSTAT
x_test = X_test['LSTAT'].values[:,np.newaxis]
score_1d = lin_1d.score(x_test, y_test)
print("一次式における'LSTAT'の住宅価格への決定係数は%f" % (score_1d))

一次式における'LSTAT'の住宅価格への決定係数は0.430957




In [None]:
from itertools import chain, combinations
dim=[3,4]
i = set(list(X.columns))

for di in dim:

    degree_=PolynomialFeatures(degree=di)

    for p in chain.from_iterable(combinations(i, r) for r in range(len(i)+1)):
        if len(p) == 0:
            continue
        else:
            x_train = X_train.loc[:, p].as_matrix()
            x_train_d = degree_.fit_transform(x_train)

            lin_ = LinearRegression(normalize=True)
            #normalize=Trueは入力データを正規化してトレーニングすることを意味します。    
            lin_.fit(x_train_d,y_train)

            x_test = X_test.loc[:, p].as_matrix()
            x_test_d = degree_.fit_transform(x_test)

            score_d = lin_.score(x_test_d, y_test)

            if score_d >= 0.75:
                print('d:{0} s:{1} p:{2}'.format(di,score_d, p))

d:3 s:0.7535202032485527 p:('LSTAT', 'RM', 'NOX', 'RAD', 'CHAS')
d:3 s:0.7685431811199619 p:('LSTAT', 'RM', 'RAD', 'DIS', 'TAX', 'B')


# Gridsearch

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline

boston = load_boston()
X=pd.DataFrame(boston.data[:,:], columns=boston.feature_names)
y=pd.DataFrame(boston.target[:],columns=['MEDV'])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [4]:
from sklearn.metrics import r2_score, make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

regressor = DecisionTreeRegressor()
params = {"max_depth":  list(range(1,11))}

scoring_fnc=make_scorer(r2_score)

grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc,cv=5)
grid.fit(X_train,y_train)
print(grid.best_params_)

{'max_depth': 5}


In [17]:
#for params, mean_score, scores in grid.grid_scores_:
#    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))

In [16]:
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.368496 (0.064662) with: {'max_depth': 1}
0.626112 (0.111121) with: {'max_depth': 2}
0.698623 (0.128678) with: {'max_depth': 3}
0.742889 (0.129903) with: {'max_depth': 4}
0.782156 (0.090582) with: {'max_depth': 5}
0.752935 (0.100415) with: {'max_depth': 6}
0.736055 (0.086652) with: {'max_depth': 7}
0.746791 (0.092950) with: {'max_depth': 8}
0.740257 (0.111650) with: {'max_depth': 9}
0.766889 (0.103395) with: {'max_depth': 10}


In [10]:
grid.cv_results_ 

{'mean_fit_time': array([ 0.00198293,  0.00157156,  0.00190525,  0.00142169,  0.00166516,
         0.00173316,  0.00181875,  0.0019855 ,  0.00214195,  0.00211492]),
 'mean_score_time': array([ 0.00058565,  0.00050249,  0.00049834,  0.00027561,  0.00025125,
         0.000246  ,  0.00024881,  0.00026979,  0.0002737 ,  0.00025415]),
 'mean_test_score': array([ 0.36849619,  0.62611191,  0.69862293,  0.74288852,  0.78215579,
         0.75293457,  0.73605545,  0.74679111,  0.74025721,  0.76688932]),
 'mean_train_score': array([ 0.48476147,  0.73065992,  0.83904408,  0.90665078,  0.93863857,
         0.96125492,  0.97595741,  0.98612753,  0.99196729,  0.99524738]),
 'param_max_depth': masked_array(data = [1 2 3 4 5 6 7 8 9 10],
              mask = [False False False False False False False False False False],
        fill_value = ?),
 'params': [{'max_depth': 1},
  {'max_depth': 2},
  {'max_depth': 3},
  {'max_depth': 4},
  {'max_depth': 5},
  {'max_depth': 6},
  {'max_depth': 7},
  {'max_de

# Gridsearch関数作成
# [20171106追記]
[こちら](http://chrisstrelioff.ws/sandbox/2015/06/25/decision_trees_in_python_again_cross_validation.html)を参考にDecisionTreeRegressor()に書き換えて、再度グリッドサーチを実施し、パラメータを探索しました。


In [6]:
def report(grid_scores, n_top=3):
    """Report top n_top parameters settings, default n_top=3.

    Args
    ----
    grid_scores -- output from grid or random search
    n_top -- how many to report, of top models

    Returns
    -------
    top_params -- [dict] top parameter settings found in
                  search
    """
    top_scores = sorted(grid_scores,
                        key=itemgetter(1),
                        reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print(("Mean validation score: "
               "{0:.3f} (std: {1:.3f})").format(
               score.mean_validation_score,
               np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

    return top_scores[0].parameters

In [7]:
from time import time
from operator import itemgetter

def run_gridsearch(X, y, clf, param_grid, cv=5):
    """Run a grid search for best Decision Tree parameters.

    Args
    ----
    X -- features
    y -- targets (classes)
    cf -- scikit-learn Decision Tree
    param_grid -- [dict] parameter settings to test
    cv -- fold of cross-validation, default 5

    Returns
    -------
    top_params -- [dict] from report()
    """
    grid_search = GridSearchCV(clf,
                               param_grid=param_grid,
                               cv=cv)
    start = time()
    grid_search.fit(X, y)

    print(("\nGridSearchCV took {:.2f} "
           "seconds for {:d} candidate "
           "parameter settings.").format(time() - start,
                len(grid_search.grid_scores_)))

    top_params = report(grid_search.grid_scores_, 3)
    return  top_params

In [8]:
print("-- Grid Parameter Search via 10-fold CV")

# set of parameters to test
param_grid = {"criterion": ["mse", "friedman_mse", "mae"],
              "max_depth": [None, 2, 5, 10],
              "min_samples_leaf": [2, 5, 10],
              "max_leaf_nodes": [None, 2, 5, 10, 20]
              }

dt = DecisionTreeRegressor()
ts_gs = run_gridsearch(X, y, dt, param_grid, cv=10)

-- Grid Parameter Search via 10-fold CV

GridSearchCV took 13.98 seconds for 180 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.366 (std: 0.417)
Parameters: {'criterion': 'mae', 'max_depth': 5, 'max_leaf_nodes': None, 'min_samples_leaf': 10}

Model with rank: 2
Mean validation score: 0.357 (std: 0.410)
Parameters: {'criterion': 'mae', 'max_depth': 10, 'max_leaf_nodes': None, 'min_samples_leaf': 10}

Model with rank: 3
Mean validation score: 0.354 (std: 0.416)
Parameters: {'criterion': 'mae', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 10}





In [18]:
print("\n-- Best Parameters:")
for k, v in ts_gs.items():
    print("parameter: {:<20s} setting: {}".format(k, v))


-- Best Parameters:
parameter: criterion            setting: mae
parameter: max_depth            setting: 5
parameter: max_leaf_nodes       setting: None
parameter: min_samples_leaf     setting: 10
