### Competition Series 1: Forecast use of a city bike share system 2nd version

The data come from Kaggle: https://www.kaggle.com/c/bike-sharing-demand.

Leaderboard: https://www.kaggle.com/c/bike-sharing-demand/leaderboard

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)

%matplotlib inline



### 1. import the data

In [3]:
# Read the two competition extracts from the local data/ folder.
train, test = map(pd.read_csv, ('data/bike_train.csv', 'data/bike_test.csv'))

In [4]:
# Give the test frame zero-valued placeholder target columns (presumably the
# test extract ships without them - verify against the CSV) so that train and
# test line up and can share a single feature-engineering pass.
test['casual'] = 0
test['registered'] = 0
test['count'] = 0

# Stack train on top of test with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the result is the same on older pandas versions.
data = pd.concat([train, test], ignore_index=True)

### 2. add new variables

In [5]:
# --- Calendar fields pulled out of the timestamp -------------------------
data['datetime'] = pd.to_datetime(data['datetime'])
for field in ('year', 'month', 'hour'):
    data[field] = data['datetime'].map(lambda ts, field=field: getattr(ts, field))
data['weekday'] = data['datetime'].map(lambda ts: ts.dayofweek)
# dayofweek is Monday=0, so 5 and 6 are Saturday and Sunday.
# Stored as bool here; cast to int at the end of this cell.
data['weekend'] = data['weekday'].isin([5, 6])

# --- Rush-hour indicators -------------------------------------------------
# Distance (in hours) from 14:00.
data['adverse_rush_hour'] = (data['hour'] - 14).abs()
# Commuting peaks, only counted on working days.
data['rush_hour_working'] = data['hour'].isin([7, 8, 17, 18, 19]) & (data['workingday'] == 1)
# Afternoon peak, only counted on non-working days.
data['rush_hour_nonwork'] = data['hour'].isin([12, 13, 14, 15, 16, 17]) & (data['workingday'] == 0)

# --- Running quarter index: 1-4 for 2011, 5-8 for 2012, 0 otherwise ------
data['year_part'] = 0
quarter_id = 0
for cal_year in (2011, 2012):
    for month_lo, month_hi in ((0, 3), (3, 6), (6, 9), (9, 12)):
        quarter_id += 1
        in_quarter = (data.year == cal_year) & (data.month > month_lo) & (data.month <= month_hi)
        data.loc[in_quarter, 'year_part'] = quarter_id

# --- Hour and temperature buckets, tuned separately per user type --------
# Each rule list is ordered; the n-th rule (1-based) assigns bucket n.
# Rows matching no rule keep the default bucket 0.
bucket_rules = [
    ('hour_reg', [data['hour'] < 7,
                  data['hour'] == 7,
                  data['hour'] == 8,
                  (data['hour'] > 8) & (data['hour'] < 16),
                  data['hour'].isin([16, 17]),
                  data['hour'].isin([18, 19]),
                  data['hour'] >= 20]),
    ('hour_cas', [data['hour'] <= 7,
                  data['hour'].isin([8, 9]),
                  (data['hour'] >= 10) & (data['hour'] < 20),
                  data['hour'] >= 20]),
    ('temp_reg', [data['temp'] < 13,
                  (data['temp'] >= 13) & (data['temp'] < 23),
                  (data['temp'] >= 23) & (data['temp'] < 30),
                  data['temp'] >= 30]),
    ('temp_cas', [data['temp'] < 15,
                  (data['temp'] >= 15) & (data['temp'] < 23),
                  (data['temp'] >= 23) & (data['temp'] < 30),
                  data['temp'] >= 30]),
]
for bucket_column, rules in bucket_rules:
    data[bucket_column] = 0
    for bucket, rows in enumerate(rules, 1):
        data.loc[rows, bucket_column] = bucket

# Day of month; used further down to split the frame into train and test parts.
data['day'] = data['datetime'].map(lambda ts: ts.day)

# --- Log-transformed targets: log(x + 1) keeps zero counts finite --------
for target in ('casual', 'registered', 'count'):
    data['log_' + target] = np.log(data[target] + 1)

# --- Boolean indicators -> 0/1 integers ----------------------------------
for flag in ('weekend', 'rush_hour_working', 'rush_hour_nonwork'):
    data[flag] = data[flag].map(int)

In [5]:
# Display (rows, columns) of the combined frame as a sanity check.
# NOTE(review): the recorded output shows 25 columns, while the data.columns
# listing further down shows 29 - this output looks stale; re-run after the
# feature-engineering cell.
data.shape

(17379, 25)

### 3. split the data and train the model

In [6]:
# Split the combined frame on day of month: days 1-19 versus day 20 onwards.
# NOTE(review): presumably this mirrors the competition's own train/test
# boundary, so test_new holds the rows with placeholder targets - confirm.
train_new = data.loc[data['day'] <= 19]
test_new = data.loc[data['day'] > 19]

In [7]:
# Display the column index of the combined frame (raw fields plus the
# engineered features added above).
data.columns

Index([u'datetime', u'season', u'holiday', u'workingday', u'weather', u'temp', u'atemp', u'humidity', u'windspeed', u'casual', u'registered', u'count', u'year', u'month', u'hour', u'weekday', u'weekend', u'adverse_rush_hour', u'rush_hour_working', u'rush_hour_nonwork', u'year_part', u'hour_reg', u'hour_cas', u'temp_reg', u'temp_cas', u'day', u'log_casual', u'log_registered', u'log_count'], dtype='object')

In [7]:
# Predictors shared by both the registered-user and casual-user models.
common_features = [
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
    'year', 'month', 'hour', 'weekday', 'weekend',
    'adverse_rush_hour', 'rush_hour_working', 'rush_hour_nonwork',
    'year_part',
]

# Each model additionally gets the hour/temperature buckets built for its own
# user type.
x_r = common_features + ['hour_reg', 'temp_reg']
x_c = common_features + ['hour_cas', 'temp_cas']

In [8]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold

from sklearn.grid_search import GridSearchCV

from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor, \
AdaBoostRegressor, GradientBoostingRegressor

from sklearn.svm import SVR

In [9]:
# Randomly hold out 20% of the train_new rows, by index label, for local
# evaluation. Only the two index halves are kept; the split target values
# are discarded.
train_index, test_index, _, _ = train_test_split(
    train_new.index,
    train_new['log_casual'],
    test_size=0.2,
    random_state=0
)

In [10]:
# Wrap both index halves in pandas Series before using them as .loc row selectors.
train_index, test_index = map(pd.Series, (train_index, test_index))

In [11]:
# Fitting portion of the hold-out split: casual model, then registered model.
x_train_c, y_train_c = train_new.loc[train_index, x_c], train_new.loc[train_index, 'log_casual']
x_train_r, y_train_r = train_new.loc[train_index, x_r], train_new.loc[train_index, 'log_registered']

# Evaluation portion of the hold-out split, same layout.
x_test_c, y_test_c = train_new.loc[test_index, x_c], train_new.loc[test_index, 'log_casual']
x_test_r, y_test_r = train_new.loc[test_index, x_r], train_new.loc[test_index, 'log_registered']

In [49]:
# "v_" matrices use every row of train_new / test_new (no hold-out).
v_x_train_c, v_y_train_c = train_new[x_c], train_new['log_casual']
v_x_train_r, v_y_train_r = train_new[x_r], train_new['log_registered']

# NOTE(review): if test_new consists of the rows whose targets were filled
# with 0 placeholders, these v_y_test_* series are all log(0 + 1) = 0 and
# carry no real signal - confirm before scoring against them.
v_x_test_c, v_y_test_c = test_new[x_c], test_new['log_casual']
v_x_test_r, v_y_test_r = test_new[x_r], test_new['log_registered']

#### 3.1 Performance of single models

In [28]:
def _search_and_report(label, base_model, param_grid, cv, verbose,
                       features, target, trailer, output_path):
    """Grid-search one target and append its plain-text report to ``output_path``.

    The report holds a heading, the best estimator, the heading again, one
    "mean (+/-spread) for params" line per candidate, and finally ``trailer``.
    """
    search = GridSearchCV(estimator=base_model, param_grid=param_grid,
                          scoring='mean_squared_error', cv=cv, verbose=verbose)
    search.fit(features, target)

    # Build the whole report in memory first so that a formatting error cannot
    # leave a half-written entry in the log file.
    heading = "For %s users:\n\n" % label
    report = [heading, "%s\n\n" % (search.best_estimator_,), heading]
    for params, mean_score, scores in search.grid_scores_:
        report.append("%0.3f (+/-%0.03f) for %r\n"
                      % (mean_score, scores.std() / 2, params))
    report.append(trailer)

    # The context manager closes the file even if the write fails.
    with open(output_path, 'a') as out:
        out.writelines(report)


def grid_search(base_model, param_grid, cv=5, verbose=20, output_path='model_output.txt'):
    """Tune ``base_model`` for the casual and the registered target and log the results.

    Runs one cross-validated grid search per user type on the notebook-level
    hold-out training matrices (``x_train_c``/``y_train_c`` and
    ``x_train_r``/``y_train_r``) and appends a report for each to
    ``output_path``. Scores come from the ``'mean_squared_error'`` scorer; the
    recorded logs show negative values, so values closer to zero are better.

    Parameters
    ----------
    base_model : estimator
        Unfitted scikit-learn regressor passed to GridSearchCV for each search.
    param_grid : dict
        Parameter grid forwarded to GridSearchCV.
    cv : int, default 5
        Number of cross-validation folds.
    verbose : int, default 20
        GridSearchCV verbosity; lower it to avoid flooding the notebook output.
    output_path : str, default 'model_output.txt'
        File the reports are appended to.

    Returns
    -------
    None
    """
    # Casual report ends with a blank line; the registered report ends with a
    # "$" separator that marks the end of this model's entry.
    _search_and_report("casual", base_model, param_grid, cv, verbose,
                       x_train_c, y_train_c, "\n", output_path)
    _search_and_report("registered", base_model, param_grid, cv, verbose,
                       x_train_r, y_train_r,
                       "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n", output_path)

In [30]:
# Best candidates recorded from an earlier run of this grid
# (negated MSE on the log targets, so closer to zero is better):
# ExtraTreesRegressor casual
# -0.232 (+/-0.005) for {'max_features': 'sqrt', 'n_estimators': 500}
# ExtraTreesRegressor registered
# -0.093 (+/-0.003) for {'max_features': 'log2', 'n_estimators': 300}

# NOTE(review): the next line looks like a single setting chosen for both
# targets - confirm where it is used.
# {'max_features': 'log2', 'n_estimators': 500}

# Seed the estimator so the search can be reproduced after a kernel restart;
# 0 matches the random_state used for the hold-out split.
etree = ExtraTreesRegressor(random_state=0)

param_grid = {'n_estimators': [100, 300, 500],
              'max_features': ['auto', 'sqrt', 'log2']}

grid_search(etree, param_grid)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] max_features=auto, n_estimators=100 .............................
[CV] ... max_features=auto, n_estimators=100, score=-0.264273 -   2.6s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ... max_features=auto, n_estimators=100, score=-0.282751 -   2.6s

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    2.6s
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    5.4s



[CV] max_features=auto, n_estimators=100 .............................
[CV] ... max_features=auto, n_estimators=100, score=-0.240616 -   2.7s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ... max_features=auto, n_estimators=100, score=-0.255683 -   2.7s

[Parallel(n_jobs=1)]: Done   3 jobs       | elapsed:    8.2s
[Parallel(n_jobs=1)]: Done   4 jobs       | elapsed:   11.0s



[CV] max_features=auto, n_estimators=100 .............................
[CV] ... max_features=auto, n_estimators=100, score=-0.254852 -   2.7s
[CV] max_features=auto, n_estimators=300 .............................
[CV] ... max_features=auto, n_estimators=300, score=-0.262127 -   8.3s

[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:   13.8s
[Parallel(n_jobs=1)]: Done   6 jobs       | elapsed:   22.2s



[CV] max_features=auto, n_estimators=300 .............................
[CV] ... max_features=auto, n_estimators=300, score=-0.278814 -   8.0s
[CV] max_features=auto, n_estimators=300 .............................
[CV] ... max_features=auto, n_estimators=300, score=-0.240696 -   8.4s

[Parallel(n_jobs=1)]: Done   7 jobs       | elapsed:   30.4s
[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:   38.9s



[CV] max_features=auto, n_estimators=300 .............................
[CV] ... max_features=auto, n_estimators=300, score=-0.252062 -   8.1s
[CV] max_features=auto, n_estimators=300 .............................
[CV] ... max_features=auto, n_estimators=300, score=-0.254129 -   8.0s

[Parallel(n_jobs=1)]: Done   9 jobs       | elapsed:   47.1s
[Parallel(n_jobs=1)]: Done  10 jobs       | elapsed:   55.3s



[CV] max_features=auto, n_estimators=500 .............................
[CV] ... max_features=auto, n_estimators=500, score=-0.262855 -  13.5s
[CV] max_features=auto, n_estimators=500 .............................
[CV] ... max_features=auto, n_estimators=500, score=-0.279615 -  13.4s

[Parallel(n_jobs=1)]: Done  11 jobs       | elapsed:  1.1min
[Parallel(n_jobs=1)]: Done  12 jobs       | elapsed:  1.4min



[CV] max_features=auto, n_estimators=500 .............................
[CV] ... max_features=auto, n_estimators=500, score=-0.241734 -  13.6s
[CV] max_features=auto, n_estimators=500 .............................
[CV] ... max_features=auto, n_estimators=500, score=-0.251298 -  13.4s

[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:  1.6min
[Parallel(n_jobs=1)]: Done  14 jobs       | elapsed:  1.8min



[CV] max_features=auto, n_estimators=500 .............................
[CV] ... max_features=auto, n_estimators=500, score=-0.254349 -  13.4s
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] ... max_features=sqrt, n_estimators=100, score=-0.245615 -   0.8s

[Parallel(n_jobs=1)]: Done  15 jobs       | elapsed:  2.1min
[Parallel(n_jobs=1)]: Done  16 jobs       | elapsed:  2.1min



[CV] max_features=sqrt, n_estimators=100 .............................
[CV] ... max_features=sqrt, n_estimators=100, score=-0.255232 -   0.8s
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] ... max_features=sqrt, n_estimators=100, score=-0.232270 -   0.8s

[Parallel(n_jobs=1)]: Done  17 jobs       | elapsed:  2.1min
[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:  2.1min



[CV] max_features=sqrt, n_estimators=100 .............................
[CV] ... max_features=sqrt, n_estimators=100, score=-0.236752 -   0.8s
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] ... max_features=sqrt, n_estimators=100, score=-0.228413 -   0.8s

[Parallel(n_jobs=1)]: Done  19 jobs       | elapsed:  2.1min
[Parallel(n_jobs=1)]: Done  20 jobs       | elapsed:  2.1min



[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.242051 -   2.7s
[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.254847 -   2.7s

[Parallel(n_jobs=1)]: Done  21 jobs       | elapsed:  2.2min
[Parallel(n_jobs=1)]: Done  22 jobs       | elapsed:  2.2min



[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.231935 -   2.7s
[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.233855 -   2.7s

[Parallel(n_jobs=1)]: Done  23 jobs       | elapsed:  2.3min
[Parallel(n_jobs=1)]: Done  24 jobs       | elapsed:  2.3min



[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.226132 -   2.7s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.242657 -   4.6s

[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:  2.4min
[Parallel(n_jobs=1)]: Done  26 jobs       | elapsed:  2.4min



[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.254688 -   4.5s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.229529 -   4.6s

[Parallel(n_jobs=1)]: Done  27 jobs       | elapsed:  2.5min
[Parallel(n_jobs=1)]: Done  28 jobs       | elapsed:  2.6min



[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.234349 -   4.6s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.225458 -   4.5s

[Parallel(n_jobs=1)]: Done  29 jobs       | elapsed:  2.7min
[Parallel(n_jobs=1)]: Done  30 jobs       | elapsed:  2.8min



[CV] max_features=log2, n_estimators=100 .............................
[CV] ... max_features=log2, n_estimators=100, score=-0.244393 -   0.9s
[CV] max_features=log2, n_estimators=100 .............................
[CV] ... max_features=log2, n_estimators=100, score=-0.256295 -   0.8s

[Parallel(n_jobs=1)]: Done  31 jobs       | elapsed:  2.8min
[Parallel(n_jobs=1)]: Done  32 jobs       | elapsed:  2.8min



[CV] max_features=log2, n_estimators=100 .............................
[CV] ... max_features=log2, n_estimators=100, score=-0.230639 -   0.8s
[CV] max_features=log2, n_estimators=100 .............................
[CV] ... max_features=log2, n_estimators=100, score=-0.238937 -   0.8s

[Parallel(n_jobs=1)]: Done  33 jobs       | elapsed:  2.8min
[Parallel(n_jobs=1)]: Done  34 jobs       | elapsed:  2.8min



[CV] max_features=log2, n_estimators=100 .............................
[CV] ... max_features=log2, n_estimators=100, score=-0.229361 -   0.8s
[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.243725 -   2.7s

[Parallel(n_jobs=1)]: Done  35 jobs       | elapsed:  2.8min
[Parallel(n_jobs=1)]: Done  36 jobs       | elapsed:  2.9min



[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.256960 -   2.7s
[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.229870 -   2.7s

[Parallel(n_jobs=1)]: Done  37 jobs       | elapsed:  2.9min
[Parallel(n_jobs=1)]: Done  38 jobs       | elapsed:  3.0min



[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.234521 -   2.7s
[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.226678 -   2.7s

[Parallel(n_jobs=1)]: Done  39 jobs       | elapsed:  3.0min
[Parallel(n_jobs=1)]: Done  40 jobs       | elapsed:  3.1min



[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.241080 -   4.5s
[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.256505 -   4.5s

[Parallel(n_jobs=1)]: Done  41 jobs       | elapsed:  3.2min
[Parallel(n_jobs=1)]: Done  42 jobs       | elapsed:  3.2min



[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.229612 -   4.6s
[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.233024 -   4.5s

[Parallel(n_jobs=1)]: Done  43 jobs       | elapsed:  3.3min
[Parallel(n_jobs=1)]: Done  44 jobs       | elapsed:  3.4min



[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.226762 -   4.6s
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] max_features=auto, n_estimators=100 .............................
[CV] ... max_features=auto, n_estimators=100, score=-0.099008 -   2.7s

[Parallel(n_jobs=1)]: Done  45 jobs       | elapsed:  3.5min
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  3.5min finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    2.7s



[CV] max_features=auto, n_estimators=100 .............................
[CV] ... max_features=auto, n_estimators=100, score=-0.096799 -   2.6s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ... max_features=auto, n_estimators=100, score=-0.093522 -   2.6s

[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done   3 jobs       | elapsed:    8.1s



[CV] max_features=auto, n_estimators=100 .............................
[CV] ... max_features=auto, n_estimators=100, score=-0.115647 -   2.6s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ... max_features=auto, n_estimators=100, score=-0.089818 -   2.6s

[Parallel(n_jobs=1)]: Done   4 jobs       | elapsed:   10.9s
[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:   13.6s



[CV] max_features=auto, n_estimators=300 .............................
[CV] ... max_features=auto, n_estimators=300, score=-0.098883 -   8.1s
[CV] max_features=auto, n_estimators=300 .............................
[CV] ... max_features=auto, n_estimators=300, score=-0.095676 -   8.1s

[Parallel(n_jobs=1)]: Done   6 jobs       | elapsed:   21.8s
[Parallel(n_jobs=1)]: Done   7 jobs       | elapsed:   30.0s



[CV] max_features=auto, n_estimators=300 .............................
[CV] ... max_features=auto, n_estimators=300, score=-0.093446 -   8.1s
[CV] max_features=auto, n_estimators=300 .............................
[CV] ... max_features=auto, n_estimators=300, score=-0.115038 -   8.1s

[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:   38.2s
[Parallel(n_jobs=1)]: Done   9 jobs       | elapsed:   46.4s



[CV] max_features=auto, n_estimators=300 .............................
[CV] ... max_features=auto, n_estimators=300, score=-0.089153 -   8.1s
[CV] max_features=auto, n_estimators=500 .............................
[CV] ... max_features=auto, n_estimators=500, score=-0.098791 -  13.5s

[Parallel(n_jobs=1)]: Done  10 jobs       | elapsed:   54.6s
[Parallel(n_jobs=1)]: Done  11 jobs       | elapsed:  1.1min



[CV] max_features=auto, n_estimators=500 .............................
[CV] ... max_features=auto, n_estimators=500, score=-0.095698 -  13.5s
[CV] max_features=auto, n_estimators=500 .............................
[CV] ... max_features=auto, n_estimators=500, score=-0.092373 -  13.6s

[Parallel(n_jobs=1)]: Done  12 jobs       | elapsed:  1.4min
[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:  1.6min



[CV] max_features=auto, n_estimators=500 .............................
[CV] ... max_features=auto, n_estimators=500, score=-0.114143 -  13.6s
[CV] max_features=auto, n_estimators=500 .............................
[CV] ... max_features=auto, n_estimators=500, score=-0.089255 -  13.6s

[Parallel(n_jobs=1)]: Done  14 jobs       | elapsed:  1.8min
[Parallel(n_jobs=1)]: Done  15 jobs       | elapsed:  2.1min



[CV] max_features=sqrt, n_estimators=100 .............................
[CV] ... max_features=sqrt, n_estimators=100, score=-0.092200 -   0.8s
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] ... max_features=sqrt, n_estimators=100, score=-0.096704 -   0.8s

[Parallel(n_jobs=1)]: Done  16 jobs       | elapsed:  2.1min
[Parallel(n_jobs=1)]: Done  17 jobs       | elapsed:  2.1min



[CV] max_features=sqrt, n_estimators=100 .............................
[CV] ... max_features=sqrt, n_estimators=100, score=-0.097524 -   0.8s
[CV] max_features=sqrt, n_estimators=100 .............................
[CV] ... max_features=sqrt, n_estimators=100, score=-0.107730 -   0.9s

[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:  2.1min
[Parallel(n_jobs=1)]: Done  19 jobs       | elapsed:  2.1min



[CV] max_features=sqrt, n_estimators=100 .............................
[CV] ... max_features=sqrt, n_estimators=100, score=-0.084658 -   0.8s
[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.091866 -   2.7s

[Parallel(n_jobs=1)]: Done  20 jobs       | elapsed:  2.1min
[Parallel(n_jobs=1)]: Done  21 jobs       | elapsed:  2.2min



[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.093867 -   2.8s
[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.096741 -   2.7s

[Parallel(n_jobs=1)]: Done  22 jobs       | elapsed:  2.2min
[Parallel(n_jobs=1)]: Done  23 jobs       | elapsed:  2.3min



[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.105448 -   2.7s
[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.084147 -   2.7s

[Parallel(n_jobs=1)]: Done  24 jobs       | elapsed:  2.3min
[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:  2.4min



[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.091185 -   4.6s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.093286 -   4.6s

[Parallel(n_jobs=1)]: Done  26 jobs       | elapsed:  2.4min
[Parallel(n_jobs=1)]: Done  27 jobs       | elapsed:  2.5min



[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.095044 -   4.6s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.106966 -   4.6s

[Parallel(n_jobs=1)]: Done  28 jobs       | elapsed:  2.6min
[Parallel(n_jobs=1)]: Done  29 jobs       | elapsed:  2.7min



[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.083717 -   4.6s
[CV] max_features=log2, n_estimators=100 .............................
[CV] ... max_features=log2, n_estimators=100, score=-0.091829 -   0.9s

[Parallel(n_jobs=1)]: Done  30 jobs       | elapsed:  2.8min
[Parallel(n_jobs=1)]: Done  31 jobs       | elapsed:  2.8min



[CV] max_features=log2, n_estimators=100 .............................
[CV] ... max_features=log2, n_estimators=100, score=-0.095379 -   0.9s
[CV] max_features=log2, n_estimators=100 .............................
[CV] ... max_features=log2, n_estimators=100, score=-0.096050 -   0.9s

[Parallel(n_jobs=1)]: Done  32 jobs       | elapsed:  2.8min
[Parallel(n_jobs=1)]: Done  33 jobs       | elapsed:  2.8min



[CV] max_features=log2, n_estimators=100 .............................
[CV] ... max_features=log2, n_estimators=100, score=-0.107422 -   0.8s
[CV] max_features=log2, n_estimators=100 .............................
[CV] ... max_features=log2, n_estimators=100, score=-0.085045 -   0.9s

[Parallel(n_jobs=1)]: Done  34 jobs       | elapsed:  2.8min
[Parallel(n_jobs=1)]: Done  35 jobs       | elapsed:  2.8min



[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.091707 -   2.8s
[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.094494 -   2.8s

[Parallel(n_jobs=1)]: Done  36 jobs       | elapsed:  2.9min
[Parallel(n_jobs=1)]: Done  37 jobs       | elapsed:  2.9min



[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.095347 -   2.9s
[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.104966 -   2.8s

[Parallel(n_jobs=1)]: Done  38 jobs       | elapsed:  3.0min
[Parallel(n_jobs=1)]: Done  39 jobs       | elapsed:  3.0min



[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.083554 -   2.8s
[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.091647 -   4.7s

[Parallel(n_jobs=1)]: Done  40 jobs       | elapsed:  3.1min
[Parallel(n_jobs=1)]: Done  41 jobs       | elapsed:  3.2min



[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.093786 -   4.7s
[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.096088 -   4.6s

[Parallel(n_jobs=1)]: Done  42 jobs       | elapsed:  3.3min
[Parallel(n_jobs=1)]: Done  43 jobs       | elapsed:  3.3min



[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.106635 -   4.7s
[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.082525 -   4.6s

[Parallel(n_jobs=1)]: Done  44 jobs       | elapsed:  3.4min
[Parallel(n_jobs=1)]: Done  45 jobs       | elapsed:  3.5min





[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  3.5min finished


In [31]:
# Best candidates recorded from earlier runs of this grid
# (negated MSE on the log targets, so closer to zero is better). The entries
# that mention 'loss' apparently come from a run whose grid also varied the
# loss function - that parameter is not part of the grid below.
# AdaBoostRegressor casual
# -0.487 (+/-0.008) for {'n_estimators': 100, 'learning_rate': 0.2}
# -0.490 (+/-0.007) for {'n_estimators': 300, 'loss': 'linear', 'learning_rate': 0.1}
# AdaBoostRegressor registered
# -0.376 (+/-0.010) for {'n_estimators': 50, 'learning_rate': 0.2}
# -0.380 (+/-0.012) for {'n_estimators': 50, 'loss': 'linear', 'learning_rate': 0.2}

# Seed the estimator so the search can be reproduced after a kernel restart;
# 0 matches the random_state used for the hold-out split.
ada = AdaBoostRegressor(random_state=0)

param_grid = {'n_estimators': [50, 100, 300],
              'learning_rate': [0.05, 0.1, 0.2, 0.3]}

grid_search(ada, param_grid)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] n_estimators=50, learning_rate=0.05 .............................
[CV] ... n_estimators=50, learning_rate=0.05, score=-0.579164 -   0.5s
[CV] n_estimators=50, learning_rate=0.05 .............................
[CV] ... n_estimators=50, learning_rate=0.05, score=-0.558407 -   0.5s

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    1.0s



[CV] n_estimators=50, learning_rate=0.05 .............................
[CV] ... n_estimators=50, learning_rate=0.05, score=-0.565069 -   0.4s
[CV] n_estimators=50, learning_rate=0.05 .............................
[CV] ... n_estimators=50, learning_rate=0.05, score=-0.601627 -   0.4s

[Parallel(n_jobs=1)]: Done   3 jobs       | elapsed:    1.6s
[Parallel(n_jobs=1)]: Done   4 jobs       | elapsed:    2.1s



[CV] n_estimators=50, learning_rate=0.05 .............................
[CV] ... n_estimators=50, learning_rate=0.05, score=-0.519718 -   0.5s
[CV] n_estimators=100, learning_rate=0.05 ............................
[CV] .. n_estimators=100, learning_rate=0.05, score=-0.557991 -   1.0s

[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:    2.7s
[Parallel(n_jobs=1)]: Done   6 jobs       | elapsed:    3.8s



[CV] n_estimators=100, learning_rate=0.05 ............................
[CV] .. n_estimators=100, learning_rate=0.05, score=-0.526759 -   1.1s
[CV] n_estimators=100, learning_rate=0.05 ............................
[CV] .. n_estimators=100, learning_rate=0.05, score=-0.544267 -   1.1s

[Parallel(n_jobs=1)]: Done   7 jobs       | elapsed:    5.0s
[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:    6.1s



[CV] n_estimators=100, learning_rate=0.05 ............................
[CV] .. n_estimators=100, learning_rate=0.05, score=-0.574013 -   1.0s
[CV] n_estimators=100, learning_rate=0.05 ............................
[CV] .. n_estimators=100, learning_rate=0.05, score=-0.518890 -   1.0s

[Parallel(n_jobs=1)]: Done   9 jobs       | elapsed:    7.3s
[Parallel(n_jobs=1)]: Done  10 jobs       | elapsed:    8.4s



[CV] n_estimators=300, learning_rate=0.05 ............................
[CV] .. n_estimators=300, learning_rate=0.05, score=-0.500638 -   3.2s
[CV] n_estimators=300, learning_rate=0.05 ............................
[CV] .. n_estimators=300, learning_rate=0.05, score=-0.505670 -   3.1s

[Parallel(n_jobs=1)]: Done  11 jobs       | elapsed:   11.6s
[Parallel(n_jobs=1)]: Done  12 jobs       | elapsed:   14.9s



[CV] n_estimators=300, learning_rate=0.05 ............................
[CV] .. n_estimators=300, learning_rate=0.05, score=-0.490804 -   3.2s
[CV] n_estimators=300, learning_rate=0.05 ............................
[CV] .. n_estimators=300, learning_rate=0.05, score=-0.487806 -   3.2s

[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:   18.1s
[Parallel(n_jobs=1)]: Done  14 jobs       | elapsed:   21.4s



[CV] n_estimators=300, learning_rate=0.05 ............................
[CV] .. n_estimators=300, learning_rate=0.05, score=-0.467904 -   3.1s
[CV] n_estimators=50, learning_rate=0.1 ..............................
[CV] .... n_estimators=50, learning_rate=0.1, score=-0.558139 -   0.5s

[Parallel(n_jobs=1)]: Done  15 jobs       | elapsed:   24.7s
[Parallel(n_jobs=1)]: Done  16 jobs       | elapsed:   25.2s



[CV] n_estimators=50, learning_rate=0.1 ..............................
[CV] .... n_estimators=50, learning_rate=0.1, score=-0.522096 -   0.5s
[CV] n_estimators=50, learning_rate=0.1 ..............................
[CV] .... n_estimators=50, learning_rate=0.1, score=-0.538448 -   0.4s

[Parallel(n_jobs=1)]: Done  17 jobs       | elapsed:   25.8s
[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:   26.3s



[CV] n_estimators=50, learning_rate=0.1 ..............................
[CV] .... n_estimators=50, learning_rate=0.1, score=-0.568243 -   0.4s
[CV] n_estimators=50, learning_rate=0.1 ..............................
[CV] .... n_estimators=50, learning_rate=0.1, score=-0.516958 -   0.5s

[Parallel(n_jobs=1)]: Done  19 jobs       | elapsed:   26.9s
[Parallel(n_jobs=1)]: Done  20 jobs       | elapsed:   27.4s



[CV] n_estimators=100, learning_rate=0.1 .............................
[CV] ... n_estimators=100, learning_rate=0.1, score=-0.511135 -   1.0s
[CV] n_estimators=100, learning_rate=0.1 .............................
[CV] ... n_estimators=100, learning_rate=0.1, score=-0.511653 -   1.0s

[Parallel(n_jobs=1)]: Done  21 jobs       | elapsed:   28.5s
[Parallel(n_jobs=1)]: Done  22 jobs       | elapsed:   29.6s



[CV] n_estimators=100, learning_rate=0.1 .............................
[CV] ... n_estimators=100, learning_rate=0.1, score=-0.499227 -   1.0s
[CV] n_estimators=100, learning_rate=0.1 .............................
[CV] ... n_estimators=100, learning_rate=0.1, score=-0.514140 -   1.0s

[Parallel(n_jobs=1)]: Done  23 jobs       | elapsed:   30.7s
[Parallel(n_jobs=1)]: Done  24 jobs       | elapsed:   31.8s



[CV] n_estimators=100, learning_rate=0.1 .............................
[CV] ... n_estimators=100, learning_rate=0.1, score=-0.472339 -   1.0s
[CV] n_estimators=300, learning_rate=0.1 .............................
[CV] ... n_estimators=300, learning_rate=0.1, score=-0.501253 -   3.1s

[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:   32.9s
[Parallel(n_jobs=1)]: Done  26 jobs       | elapsed:   36.1s



[CV] n_estimators=300, learning_rate=0.1 .............................
[CV] ... n_estimators=300, learning_rate=0.1, score=-0.510444 -   3.1s
[CV] n_estimators=300, learning_rate=0.1 .............................
[CV] ... n_estimators=300, learning_rate=0.1, score=-0.489110 -   3.1s

[Parallel(n_jobs=1)]: Done  27 jobs       | elapsed:   39.3s
[Parallel(n_jobs=1)]: Done  28 jobs       | elapsed:   42.5s



[CV] n_estimators=300, learning_rate=0.1 .............................
[CV] ... n_estimators=300, learning_rate=0.1, score=-0.491259 -   3.1s
[CV] n_estimators=300, learning_rate=0.1 .............................
[CV] ... n_estimators=300, learning_rate=0.1, score=-0.466961 -   3.1s

[Parallel(n_jobs=1)]: Done  29 jobs       | elapsed:   45.7s
[Parallel(n_jobs=1)]: Done  30 jobs       | elapsed:   48.9s



[CV] n_estimators=50, learning_rate=0.2 ..............................
[CV] .... n_estimators=50, learning_rate=0.2, score=-0.508539 -   0.5s
[CV] n_estimators=50, learning_rate=0.2 ..............................
[CV] .... n_estimators=50, learning_rate=0.2, score=-0.509341 -   0.5s

[Parallel(n_jobs=1)]: Done  31 jobs       | elapsed:   49.4s
[Parallel(n_jobs=1)]: Done  32 jobs       | elapsed:   50.0s



[CV] n_estimators=50, learning_rate=0.2 ..............................
[CV] .... n_estimators=50, learning_rate=0.2, score=-0.505904 -   0.5s
[CV] n_estimators=50, learning_rate=0.2 ..............................
[CV] .... n_estimators=50, learning_rate=0.2, score=-0.512027 -   0.5s

[Parallel(n_jobs=1)]: Done  33 jobs       | elapsed:   50.5s
[Parallel(n_jobs=1)]: Done  34 jobs       | elapsed:   51.1s



[CV] n_estimators=50, learning_rate=0.2 ..............................
[CV] .... n_estimators=50, learning_rate=0.2, score=-0.485336 -   0.4s
[CV] n_estimators=100, learning_rate=0.2 .............................
[CV] ... n_estimators=100, learning_rate=0.2, score=-0.499381 -   1.0s

[Parallel(n_jobs=1)]: Done  35 jobs       | elapsed:   51.6s
[Parallel(n_jobs=1)]: Done  36 jobs       | elapsed:   52.7s



[CV] n_estimators=100, learning_rate=0.2 .............................
[CV] ... n_estimators=100, learning_rate=0.2, score=-0.502798 -   1.0s
[CV] n_estimators=100, learning_rate=0.2 .............................
[CV] ... n_estimators=100, learning_rate=0.2, score=-0.490679 -   1.0s

[Parallel(n_jobs=1)]: Done  37 jobs       | elapsed:   53.8s
[Parallel(n_jobs=1)]: Done  38 jobs       | elapsed:   54.9s



[CV] n_estimators=100, learning_rate=0.2 .............................
[CV] ... n_estimators=100, learning_rate=0.2, score=-0.484551 -   1.0s
[CV] n_estimators=100, learning_rate=0.2 .............................
[CV] ... n_estimators=100, learning_rate=0.2, score=-0.464753 -   1.0s

[Parallel(n_jobs=1)]: Done  39 jobs       | elapsed:   56.0s
[Parallel(n_jobs=1)]: Done  40 jobs       | elapsed:   57.1s



[CV] n_estimators=300, learning_rate=0.2 .............................
[CV] ... n_estimators=300, learning_rate=0.2, score=-0.505555 -   3.0s
[CV] n_estimators=300, learning_rate=0.2 .............................
[CV] ... n_estimators=300, learning_rate=0.2, score=-0.522927 -   3.0s

[Parallel(n_jobs=1)]: Done  41 jobs       | elapsed:  1.0min
[Parallel(n_jobs=1)]: Done  42 jobs       | elapsed:  1.1min



[CV] n_estimators=300, learning_rate=0.2 .............................
[CV] ... n_estimators=300, learning_rate=0.2, score=-0.507722 -   3.1s
[CV] n_estimators=300, learning_rate=0.2 .............................
[CV] ... n_estimators=300, learning_rate=0.2, score=-0.500111 -   3.1s

[Parallel(n_jobs=1)]: Done  43 jobs       | elapsed:  1.1min
[Parallel(n_jobs=1)]: Done  44 jobs       | elapsed:  1.2min



[CV] n_estimators=300, learning_rate=0.2 .............................
[CV] ... n_estimators=300, learning_rate=0.2, score=-0.481292 -   3.1s
[CV] n_estimators=50, learning_rate=0.3 ..............................
[CV] .... n_estimators=50, learning_rate=0.3, score=-0.503407 -   0.5s

[Parallel(n_jobs=1)]: Done  45 jobs       | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  46 jobs       | elapsed:  1.2min



[CV] n_estimators=50, learning_rate=0.3 ..............................
[CV] .... n_estimators=50, learning_rate=0.3, score=-0.512837 -   0.4s
[CV] n_estimators=50, learning_rate=0.3 ..............................
[CV] .... n_estimators=50, learning_rate=0.3, score=-0.492478 -   0.5s

[Parallel(n_jobs=1)]: Done  47 jobs       | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  48 jobs       | elapsed:  1.2min



[CV] n_estimators=50, learning_rate=0.3 ..............................
[CV] .... n_estimators=50, learning_rate=0.3, score=-0.510157 -   0.5s
[CV] n_estimators=50, learning_rate=0.3 ..............................
[CV] .... n_estimators=50, learning_rate=0.3, score=-0.456415 -   0.5s

[Parallel(n_jobs=1)]: Done  49 jobs       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:  1.3min



[CV] n_estimators=100, learning_rate=0.3 .............................
[CV] ... n_estimators=100, learning_rate=0.3, score=-0.507915 -   1.0s
[CV] n_estimators=100, learning_rate=0.3 .............................
[CV] ... n_estimators=100, learning_rate=0.3, score=-0.508401 -   1.0s

[Parallel(n_jobs=1)]: Done  51 jobs       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done  52 jobs       | elapsed:  1.3min



[CV] n_estimators=100, learning_rate=0.3 .............................
[CV] ... n_estimators=100, learning_rate=0.3, score=-0.491454 -   1.0s
[CV] n_estimators=100, learning_rate=0.3 .............................
[CV] ... n_estimators=100, learning_rate=0.3, score=-0.486659 -   1.0s

[Parallel(n_jobs=1)]: Done  53 jobs       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done  54 jobs       | elapsed:  1.3min



[CV] n_estimators=100, learning_rate=0.3 .............................
[CV] ... n_estimators=100, learning_rate=0.3, score=-0.454407 -   1.0s
[CV] n_estimators=300, learning_rate=0.3 .............................
[CV] ... n_estimators=300, learning_rate=0.3, score=-0.530608 -   3.0s

[Parallel(n_jobs=1)]: Done  55 jobs       | elapsed:  1.4min
[Parallel(n_jobs=1)]: Done  56 jobs       | elapsed:  1.4min



[CV] n_estimators=300, learning_rate=0.3 .............................
[CV] ... n_estimators=300, learning_rate=0.3, score=-0.503868 -   1.3s
[CV] n_estimators=300, learning_rate=0.3 .............................
[CV] ... n_estimators=300, learning_rate=0.3, score=-0.500130 -   1.4s

[Parallel(n_jobs=1)]: Done  57 jobs       | elapsed:  1.4min
[Parallel(n_jobs=1)]: Done  58 jobs       | elapsed:  1.5min



[CV] n_estimators=300, learning_rate=0.3 .............................
[CV] ... n_estimators=300, learning_rate=0.3, score=-0.508317 -   3.1s
[CV] n_estimators=300, learning_rate=0.3 .............................
[CV] ... n_estimators=300, learning_rate=0.3, score=-0.490953 -   2.9s

[Parallel(n_jobs=1)]: Done  59 jobs       | elapsed:  1.5min
[Parallel(n_jobs=1)]: Done  60 jobs       | elapsed:  1.6min



Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] n_estimators=50, learning_rate=0.05 .............................
[CV] ... n_estimators=50, learning_rate=0.05, score=-0.434322 -   0.5s

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.5s



[CV] n_estimators=50, learning_rate=0.05 .............................
[CV] ... n_estimators=50, learning_rate=0.05, score=-0.432949 -   0.5s
[CV] n_estimators=50, learning_rate=0.05 .............................
[CV] ... n_estimators=50, learning_rate=0.05, score=-0.456569 -   0.5s

[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    1.1s
[Parallel(n_jobs=1)]: Done   3 jobs       | elapsed:    1.6s



[CV] n_estimators=50, learning_rate=0.05 .............................
[CV] ... n_estimators=50, learning_rate=0.05, score=-0.449709 -   0.5s
[CV] n_estimators=50, learning_rate=0.05 .............................
[CV] ... n_estimators=50, learning_rate=0.05, score=-0.424728 -   0.5s

[Parallel(n_jobs=1)]: Done   4 jobs       | elapsed:    2.2s
[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:    2.8s



[CV] n_estimators=100, learning_rate=0.05 ............................
[CV] .. n_estimators=100, learning_rate=0.05, score=-0.406543 -   1.1s
[CV] n_estimators=100, learning_rate=0.05 ............................
[CV] .. n_estimators=100, learning_rate=0.05, score=-0.375669 -   1.0s

[Parallel(n_jobs=1)]: Done   6 jobs       | elapsed:    4.0s
[Parallel(n_jobs=1)]: Done   7 jobs       | elapsed:    5.1s



[CV] n_estimators=100, learning_rate=0.05 ............................
[CV] .. n_estimators=100, learning_rate=0.05, score=-0.409040 -   1.0s
[CV] n_estimators=100, learning_rate=0.05 ............................
[CV] .. n_estimators=100, learning_rate=0.05, score=-0.426123 -   1.0s

[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:    6.3s
[Parallel(n_jobs=1)]: Done   9 jobs       | elapsed:    7.4s



[CV] n_estimators=100, learning_rate=0.05 ............................
[CV] .. n_estimators=100, learning_rate=0.05, score=-0.414756 -   1.0s
[CV] n_estimators=300, learning_rate=0.05 ............................
[CV] .. n_estimators=300, learning_rate=0.05, score=-0.377620 -   3.3s

[Parallel(n_jobs=1)]: Done  10 jobs       | elapsed:    8.5s
[Parallel(n_jobs=1)]: Done  11 jobs       | elapsed:   11.9s



[CV] n_estimators=300, learning_rate=0.05 ............................
[CV] .. n_estimators=300, learning_rate=0.05, score=-0.352230 -   3.2s
[CV] n_estimators=300, learning_rate=0.05 ............................
[CV] .. n_estimators=300, learning_rate=0.05, score=-0.399632 -   3.2s

[Parallel(n_jobs=1)]: Done  12 jobs       | elapsed:   15.2s
[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:   18.4s



[CV] n_estimators=300, learning_rate=0.05 ............................
[CV] .. n_estimators=300, learning_rate=0.05, score=-0.424614 -   3.2s
[CV] n_estimators=300, learning_rate=0.05 ............................
[CV] .. n_estimators=300, learning_rate=0.05, score=-0.375877 -   3.2s

[Parallel(n_jobs=1)]: Done  14 jobs       | elapsed:   21.7s
[Parallel(n_jobs=1)]: Done  15 jobs       | elapsed:   25.0s



[CV] n_estimators=50, learning_rate=0.1 ..............................
[CV] .... n_estimators=50, learning_rate=0.1, score=-0.382458 -   0.5s
[CV] n_estimators=50, learning_rate=0.1 ..............................
[CV] .... n_estimators=50, learning_rate=0.1, score=-0.402871 -   0.5s

[Parallel(n_jobs=1)]: Done  16 jobs       | elapsed:   25.6s
[Parallel(n_jobs=1)]: Done  17 jobs       | elapsed:   26.1s



[CV] n_estimators=50, learning_rate=0.1 ..............................
[CV] .... n_estimators=50, learning_rate=0.1, score=-0.409076 -   0.5s
[CV] n_estimators=50, learning_rate=0.1 ..............................
[CV] .... n_estimators=50, learning_rate=0.1, score=-0.428920 -   0.4s

[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:   26.7s
[Parallel(n_jobs=1)]: Done  19 jobs       | elapsed:   27.2s



[CV] n_estimators=50, learning_rate=0.1 ..............................
[CV] .... n_estimators=50, learning_rate=0.1, score=-0.404998 -   0.4s
[CV] n_estimators=100, learning_rate=0.1 .............................
[CV] ... n_estimators=100, learning_rate=0.1, score=-0.361353 -   1.0s

[Parallel(n_jobs=1)]: Done  20 jobs       | elapsed:   27.8s
[Parallel(n_jobs=1)]: Done  21 jobs       | elapsed:   28.9s



[CV] n_estimators=100, learning_rate=0.1 .............................
[CV] ... n_estimators=100, learning_rate=0.1, score=-0.359980 -   1.0s
[CV] n_estimators=100, learning_rate=0.1 .............................
[CV] ... n_estimators=100, learning_rate=0.1, score=-0.380436 -   1.0s

[Parallel(n_jobs=1)]: Done  22 jobs       | elapsed:   30.0s
[Parallel(n_jobs=1)]: Done  23 jobs       | elapsed:   31.1s



[CV] n_estimators=100, learning_rate=0.1 .............................
[CV] ... n_estimators=100, learning_rate=0.1, score=-0.409249 -   1.0s
[CV] n_estimators=100, learning_rate=0.1 .............................
[CV] ... n_estimators=100, learning_rate=0.1, score=-0.373443 -   1.0s

[Parallel(n_jobs=1)]: Done  24 jobs       | elapsed:   32.2s
[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:   33.3s



[CV] n_estimators=300, learning_rate=0.1 .............................
[CV] ... n_estimators=300, learning_rate=0.1, score=-0.388558 -   3.1s
[CV] n_estimators=300, learning_rate=0.1 .............................
[CV] ... n_estimators=300, learning_rate=0.1, score=-0.354067 -   3.1s

[Parallel(n_jobs=1)]: Done  26 jobs       | elapsed:   36.5s
[Parallel(n_jobs=1)]: Done  27 jobs       | elapsed:   39.7s



[CV] n_estimators=300, learning_rate=0.1 .............................
[CV] ... n_estimators=300, learning_rate=0.1, score=-0.386895 -   3.0s
[CV] n_estimators=300, learning_rate=0.1 .............................
[CV] ... n_estimators=300, learning_rate=0.1, score=-0.423011 -   3.0s

[Parallel(n_jobs=1)]: Done  28 jobs       | elapsed:   42.8s
[Parallel(n_jobs=1)]: Done  29 jobs       | elapsed:   45.9s



[CV] n_estimators=300, learning_rate=0.1 .............................
[CV] ... n_estimators=300, learning_rate=0.1, score=-0.366942 -   3.1s
[CV] n_estimators=50, learning_rate=0.2 ..............................
[CV] .... n_estimators=50, learning_rate=0.2, score=-0.356678 -   0.5s

[Parallel(n_jobs=1)]: Done  30 jobs       | elapsed:   49.1s
[Parallel(n_jobs=1)]: Done  31 jobs       | elapsed:   49.7s



[CV] n_estimators=50, learning_rate=0.2 ..............................
[CV] .... n_estimators=50, learning_rate=0.2, score=-0.342993 -   0.5s
[CV] n_estimators=50, learning_rate=0.2 ..............................
[CV] .... n_estimators=50, learning_rate=0.2, score=-0.388384 -   0.5s

[Parallel(n_jobs=1)]: Done  32 jobs       | elapsed:   50.3s
[Parallel(n_jobs=1)]: Done  33 jobs       | elapsed:   50.9s



[CV] n_estimators=50, learning_rate=0.2 ..............................
[CV] .... n_estimators=50, learning_rate=0.2, score=-0.403930 -   0.5s
[CV] n_estimators=50, learning_rate=0.2 ..............................
[CV] .... n_estimators=50, learning_rate=0.2, score=-0.376585 -   0.5s

[Parallel(n_jobs=1)]: Done  34 jobs       | elapsed:   51.4s
[Parallel(n_jobs=1)]: Done  35 jobs       | elapsed:   52.0s



[CV] n_estimators=100, learning_rate=0.2 .............................
[CV] ... n_estimators=100, learning_rate=0.2, score=-0.372622 -   1.0s
[CV] n_estimators=100, learning_rate=0.2 .............................
[CV] ... n_estimators=100, learning_rate=0.2, score=-0.351039 -   1.0s

[Parallel(n_jobs=1)]: Done  36 jobs       | elapsed:   53.1s
[Parallel(n_jobs=1)]: Done  37 jobs       | elapsed:   54.2s



[CV] n_estimators=100, learning_rate=0.2 .............................
[CV] ... n_estimators=100, learning_rate=0.2, score=-0.390996 -   1.0s
[CV] n_estimators=100, learning_rate=0.2 .............................
[CV] ... n_estimators=100, learning_rate=0.2, score=-0.437301 -   1.0s

[Parallel(n_jobs=1)]: Done  38 jobs       | elapsed:   55.2s
[Parallel(n_jobs=1)]: Done  39 jobs       | elapsed:   56.3s



[CV] n_estimators=100, learning_rate=0.2 .............................
[CV] ... n_estimators=100, learning_rate=0.2, score=-0.374064 -   1.0s
[CV] n_estimators=300, learning_rate=0.2 .............................
[CV] ... n_estimators=300, learning_rate=0.2, score=-0.394521 -   2.2s

[Parallel(n_jobs=1)]: Done  40 jobs       | elapsed:   57.4s
[Parallel(n_jobs=1)]: Done  41 jobs       | elapsed:   59.7s



[CV] n_estimators=300, learning_rate=0.2 .............................
[CV] ... n_estimators=300, learning_rate=0.2, score=-0.362971 -   2.9s
[CV] n_estimators=300, learning_rate=0.2 .............................
[CV] ... n_estimators=300, learning_rate=0.2, score=-0.385754 -   2.0s

[Parallel(n_jobs=1)]: Done  42 jobs       | elapsed:  1.0min
[Parallel(n_jobs=1)]: Done  43 jobs       | elapsed:  1.1min



[CV] n_estimators=300, learning_rate=0.2 .............................
[CV] ... n_estimators=300, learning_rate=0.2, score=-0.434295 -   1.6s
[CV] n_estimators=300, learning_rate=0.2 .............................
[CV] ... n_estimators=300, learning_rate=0.2, score=-0.390120 -   2.3s

[Parallel(n_jobs=1)]: Done  44 jobs       | elapsed:  1.1min
[Parallel(n_jobs=1)]: Done  45 jobs       | elapsed:  1.1min



[CV] n_estimators=50, learning_rate=0.3 ..............................
[CV] .... n_estimators=50, learning_rate=0.3, score=-0.386413 -   0.5s
[CV] n_estimators=50, learning_rate=0.3 ..............................
[CV] .... n_estimators=50, learning_rate=0.3, score=-0.351786 -   0.5s

[Parallel(n_jobs=1)]: Done  46 jobs       | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  47 jobs       | elapsed:  1.2min



[CV] n_estimators=50, learning_rate=0.3 ..............................
[CV] .... n_estimators=50, learning_rate=0.3, score=-0.397573 -   0.5s
[CV] n_estimators=50, learning_rate=0.3 ..............................
[CV] .... n_estimators=50, learning_rate=0.3, score=-0.429551 -   0.5s

[Parallel(n_jobs=1)]: Done  48 jobs       | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  49 jobs       | elapsed:  1.2min



[CV] n_estimators=50, learning_rate=0.3 ..............................
[CV] .... n_estimators=50, learning_rate=0.3, score=-0.380781 -   0.5s
[CV] n_estimators=100, learning_rate=0.3 .............................
[CV] ... n_estimators=100, learning_rate=0.3, score=-0.381309 -   0.9s

[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  51 jobs       | elapsed:  1.2min



[CV] n_estimators=100, learning_rate=0.3 .............................
[CV] ... n_estimators=100, learning_rate=0.3, score=-0.354139 -   1.0s
[CV] n_estimators=100, learning_rate=0.3 .............................
[CV] ... n_estimators=100, learning_rate=0.3, score=-0.369134 -   1.0s

[Parallel(n_jobs=1)]: Done  52 jobs       | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  53 jobs       | elapsed:  1.2min



[CV] n_estimators=100, learning_rate=0.3 .............................
[CV] ... n_estimators=100, learning_rate=0.3, score=-0.415341 -   0.9s
[CV] n_estimators=100, learning_rate=0.3 .............................
[CV] ... n_estimators=100, learning_rate=0.3, score=-0.386910 -   0.9s

[Parallel(n_jobs=1)]: Done  54 jobs       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done  55 jobs       | elapsed:  1.3min



[CV] n_estimators=300, learning_rate=0.3 .............................
[CV] ... n_estimators=300, learning_rate=0.3, score=-0.408492 -   1.8s
[CV] n_estimators=300, learning_rate=0.3 .............................
[CV] ... n_estimators=300, learning_rate=0.3, score=-0.376508 -   2.8s

[Parallel(n_jobs=1)]: Done  56 jobs       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done  57 jobs       | elapsed:  1.4min



[CV] n_estimators=300, learning_rate=0.3 .............................
[CV] ... n_estimators=300, learning_rate=0.3, score=-0.413025 -   2.4s
[CV] n_estimators=300, learning_rate=0.3 .............................
[CV] ... n_estimators=300, learning_rate=0.3, score=-0.441324 -   1.9s

[Parallel(n_jobs=1)]: Done  58 jobs       | elapsed:  1.4min
[Parallel(n_jobs=1)]: Done  59 jobs       | elapsed:  1.4min



[CV] n_estimators=300, learning_rate=0.3 .............................
[CV] ... n_estimators=300, learning_rate=0.3, score=-0.388470 -   2.1s


[Parallel(n_jobs=1)]: Done  60 jobs       | elapsed:  1.5min
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  1.5min finished


In [32]:
# Tune RandomForestRegressor over forest size and per-split feature sampling.
#
# Best settings recorded from an earlier (unseeded) run:
#   casual:     -0.231 (+/-0.005) for {'max_features': 'log2', 'n_estimators': 500}
#   registered: -0.092 (+/-0.003) for {'max_features': 'log2', 'n_estimators': 500}
#
# Random forests are stochastic: without a fixed seed every re-run produces
# slightly different fold scores (and possibly a different "best" candidate),
# so the numbers above could not be reproduced exactly. The seed is pinned here
# so that Restart & Run All gives the same scores each time; expect them to
# differ marginally from the figures recorded above.
rf = RandomForestRegressor(random_state=42)

# No backslash continuation is needed inside a brace-delimited literal.
param_grid = {
    'n_estimators': [300, 500, 700],
    'max_features': ['sqrt', 'log2'],
}

# grid_search is defined elsewhere in this notebook.
# NOTE(review): judging by the two "Fitting 5 folds ..." passes in this cell's
# output, it appears to run the search once per target (casual, registered) --
# confirm against its definition.
grid_search(rf, param_grid)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.241567 -   3.0s
[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.249664 -   2.9s

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    6.0s



[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.225183 -   3.0s
[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.230360 -   3.0s

[Parallel(n_jobs=1)]: Done   3 jobs       | elapsed:    9.1s
[Parallel(n_jobs=1)]: Done   4 jobs       | elapsed:   12.2s



[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.220779 -   2.9s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.239414 -   5.0s

[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:   15.2s
[Parallel(n_jobs=1)]: Done   6 jobs       | elapsed:   20.3s



[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.249374 -   4.9s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.223955 -   4.7s

[Parallel(n_jobs=1)]: Done   7 jobs       | elapsed:   25.3s
[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:   30.2s



[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.229111 -   4.7s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.219022 -   4.7s

[Parallel(n_jobs=1)]: Done   9 jobs       | elapsed:   35.0s
[Parallel(n_jobs=1)]: Done  10 jobs       | elapsed:   39.8s



[CV] max_features=sqrt, n_estimators=700 .............................
[CV] ... max_features=sqrt, n_estimators=700, score=-0.240195 -   6.6s
[CV] max_features=sqrt, n_estimators=700 .............................
[CV] ... max_features=sqrt, n_estimators=700, score=-0.249490 -   6.6s

[Parallel(n_jobs=1)]: Done  11 jobs       | elapsed:   46.5s
[Parallel(n_jobs=1)]: Done  12 jobs       | elapsed:   53.3s



[CV] max_features=sqrt, n_estimators=700 .............................
[CV] ... max_features=sqrt, n_estimators=700, score=-0.224563 -   6.6s
[CV] max_features=sqrt, n_estimators=700 .............................
[CV] ... max_features=sqrt, n_estimators=700, score=-0.229972 -   6.6s

[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:  1.0min
[Parallel(n_jobs=1)]: Done  14 jobs       | elapsed:  1.1min



[CV] max_features=sqrt, n_estimators=700 .............................
[CV] ... max_features=sqrt, n_estimators=700, score=-0.220251 -   6.6s
[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.240198 -   2.8s

[Parallel(n_jobs=1)]: Done  15 jobs       | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  16 jobs       | elapsed:  1.3min



[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.249766 -   2.8s
[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.223511 -   2.8s

[Parallel(n_jobs=1)]: Done  17 jobs       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:  1.4min



[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.229396 -   2.8s
[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.221819 -   2.8s

[Parallel(n_jobs=1)]: Done  19 jobs       | elapsed:  1.4min
[Parallel(n_jobs=1)]: Done  20 jobs       | elapsed:  1.5min



[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.240270 -   4.7s
[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.248791 -   4.7s

[Parallel(n_jobs=1)]: Done  21 jobs       | elapsed:  1.5min
[Parallel(n_jobs=1)]: Done  22 jobs       | elapsed:  1.6min



[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.224510 -   4.7s
[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.229079 -   4.7s

[Parallel(n_jobs=1)]: Done  23 jobs       | elapsed:  1.7min
[Parallel(n_jobs=1)]: Done  24 jobs       | elapsed:  1.8min



[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.220289 -   4.7s
[CV] max_features=log2, n_estimators=700 .............................
[CV] ... max_features=log2, n_estimators=700, score=-0.238574 -   6.6s

[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:  1.9min
[Parallel(n_jobs=1)]: Done  26 jobs       | elapsed:  2.0min



[CV] max_features=log2, n_estimators=700 .............................
[CV] ... max_features=log2, n_estimators=700, score=-0.247348 -   6.6s
[CV] max_features=log2, n_estimators=700 .............................
[CV] ... max_features=log2, n_estimators=700, score=-0.224986 -   6.7s

[Parallel(n_jobs=1)]: Done  27 jobs       | elapsed:  2.1min
[Parallel(n_jobs=1)]: Done  28 jobs       | elapsed:  2.2min



[CV] max_features=log2, n_estimators=700 .............................
[CV] ... max_features=log2, n_estimators=700, score=-0.230613 -   6.6s
[CV] max_features=log2, n_estimators=700 .............................
[CV] ... max_features=log2, n_estimators=700, score=-0.219355 -   6.6s

[Parallel(n_jobs=1)]: Done  29 jobs       | elapsed:  2.3min
[Parallel(n_jobs=1)]: Done  30 jobs       | elapsed:  2.4min



Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.089991 -   3.0s

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.4min finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    3.0s



[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.092007 -   3.0s
[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.095477 -   3.0s

[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    6.2s
[Parallel(n_jobs=1)]: Done   3 jobs       | elapsed:    9.2s



[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.100572 -   3.0s
[CV] max_features=sqrt, n_estimators=300 .............................
[CV] ... max_features=sqrt, n_estimators=300, score=-0.082733 -   3.0s

[Parallel(n_jobs=1)]: Done   4 jobs       | elapsed:   12.3s
[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:   15.4s



[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.088858 -   5.0s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.091132 -   5.0s

[Parallel(n_jobs=1)]: Done   6 jobs       | elapsed:   20.5s
[Parallel(n_jobs=1)]: Done   7 jobs       | elapsed:   25.7s



[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.094473 -   5.0s
[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.100425 -   5.0s

[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:   30.8s
[Parallel(n_jobs=1)]: Done   9 jobs       | elapsed:   36.0s



[CV] max_features=sqrt, n_estimators=500 .............................
[CV] ... max_features=sqrt, n_estimators=500, score=-0.081023 -   5.0s
[CV] max_features=sqrt, n_estimators=700 .............................
[CV] ... max_features=sqrt, n_estimators=700, score=-0.089989 -   7.0s

[Parallel(n_jobs=1)]: Done  10 jobs       | elapsed:   41.1s
[Parallel(n_jobs=1)]: Done  11 jobs       | elapsed:   48.3s



[CV] max_features=sqrt, n_estimators=700 .............................
[CV] ... max_features=sqrt, n_estimators=700, score=-0.091405 -   7.1s
[CV] max_features=sqrt, n_estimators=700 .............................
[CV] ... max_features=sqrt, n_estimators=700, score=-0.095198 -   7.1s

[Parallel(n_jobs=1)]: Done  12 jobs       | elapsed:   55.5s
[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:  1.0min



[CV] max_features=sqrt, n_estimators=700 .............................
[CV] ... max_features=sqrt, n_estimators=700, score=-0.100525 -   7.1s
[CV] max_features=sqrt, n_estimators=700 .............................
[CV] ... max_features=sqrt, n_estimators=700, score=-0.081008 -   7.0s

[Parallel(n_jobs=1)]: Done  14 jobs       | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  15 jobs       | elapsed:  1.3min



[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.090044 -   2.9s
[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.091520 -   3.0s

[Parallel(n_jobs=1)]: Done  16 jobs       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done  17 jobs       | elapsed:  1.4min



[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.095582 -   3.0s
[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.100579 -   3.0s

[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:  1.4min
[Parallel(n_jobs=1)]: Done  19 jobs       | elapsed:  1.5min



[CV] max_features=log2, n_estimators=300 .............................
[CV] ... max_features=log2, n_estimators=300, score=-0.081873 -   3.0s
[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.089701 -   5.1s

[Parallel(n_jobs=1)]: Done  20 jobs       | elapsed:  1.5min
[Parallel(n_jobs=1)]: Done  21 jobs       | elapsed:  1.6min



[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.091484 -   5.0s
[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.096180 -   5.1s

[Parallel(n_jobs=1)]: Done  22 jobs       | elapsed:  1.7min
[Parallel(n_jobs=1)]: Done  23 jobs       | elapsed:  1.8min



[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.099817 -   5.1s
[CV] max_features=log2, n_estimators=500 .............................
[CV] ... max_features=log2, n_estimators=500, score=-0.081525 -   5.0s

[Parallel(n_jobs=1)]: Done  24 jobs       | elapsed:  1.9min
[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:  2.0min



[CV] max_features=log2, n_estimators=700 .............................
[CV] ... max_features=log2, n_estimators=700, score=-0.089663 -   7.1s
[CV] max_features=log2, n_estimators=700 .............................
[CV] ... max_features=log2, n_estimators=700, score=-0.090249 -   7.1s

[Parallel(n_jobs=1)]: Done  26 jobs       | elapsed:  2.1min
[Parallel(n_jobs=1)]: Done  27 jobs       | elapsed:  2.2min



[CV] max_features=log2, n_estimators=700 .............................
[CV] ... max_features=log2, n_estimators=700, score=-0.094458 -   7.0s
[CV] max_features=log2, n_estimators=700 .............................
[CV] ... max_features=log2, n_estimators=700, score=-0.100242 -   7.0s

[Parallel(n_jobs=1)]: Done  28 jobs       | elapsed:  2.3min
[Parallel(n_jobs=1)]: Done  29 jobs       | elapsed:  2.5min



[CV] max_features=log2, n_estimators=700 .............................
[CV] ... max_features=log2, n_estimators=700, score=-0.081816 -   7.0s


[Parallel(n_jobs=1)]: Done  30 jobs       | elapsed:  2.6min
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.6min finished


In [37]:
# Tune a linear-kernel SVR over a small C / epsilon grid.
# grid_search is not defined in this part of the notebook; presumably it is the
# cross-validation helper from an earlier cell.
svr_l = SVR(kernel='linear')

param_grid = {
    'C': [0.1, 0.2, 0.3],
    'epsilon': [0.2, 0.3],
}

grid_search(svr_l, param_grid)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] epsilon=0.2, C=0.1 ..............................................
[CV] .................... epsilon=0.2, C=0.1, score=-1.610236 -   4.8s
[CV] epsilon=0.2, C=0.1 ..............................................
[CV] .................... epsilon=0.2, C=0.1, score=-1.767376 -   4.7s

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    4.8s
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    9.6s



[CV] epsilon=0.2, C=0.1 ..............................................
[CV] .................... epsilon=0.2, C=0.1, score=-1.653245 -   7.4s
[CV] epsilon=0.2, C=0.1 ..............................................
[CV] .................... epsilon=0.2, C=0.1, score=-1.033324 -   5.1s

[Parallel(n_jobs=1)]: Done   3 jobs       | elapsed:   17.1s
[Parallel(n_jobs=1)]: Done   4 jobs       | elapsed:   22.3s



[CV] epsilon=0.2, C=0.1 ..............................................
[CV] .................... epsilon=0.2, C=0.1, score=-1.590473 -   4.9s
[CV] epsilon=0.3, C=0.1 ..............................................
[CV] .................... epsilon=0.3, C=0.1, score=-2.076252 -   4.9s

[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:   27.3s
[Parallel(n_jobs=1)]: Done   6 jobs       | elapsed:   32.4s



[CV] epsilon=0.3, C=0.1 ..............................................
[CV] .................... epsilon=0.3, C=0.1, score=-1.748666 -   4.7s
[CV] epsilon=0.3, C=0.1 ..............................................
[CV] .................... epsilon=0.3, C=0.1, score=-1.637994 -   7.2s

[Parallel(n_jobs=1)]: Done   7 jobs       | elapsed:   37.1s
[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:   44.5s



[CV] epsilon=0.3, C=0.1 ..............................................
[CV] .................... epsilon=0.3, C=0.1, score=-1.769951 -   6.9s
[CV] epsilon=0.3, C=0.1 ..............................................
[CV] .................... epsilon=0.3, C=0.1, score=-1.657568 -   5.0s

[Parallel(n_jobs=1)]: Done   9 jobs       | elapsed:   51.5s
[Parallel(n_jobs=1)]: Done  10 jobs       | elapsed:   56.6s



[CV] epsilon=0.2, C=0.2 ..............................................
[CV] .................... epsilon=0.2, C=0.2, score=-5.542217 -   4.9s
[CV] epsilon=0.2, C=0.2 ..............................................
[CV] .................... epsilon=0.2, C=0.2, score=-5.225891 -   5.6s

[Parallel(n_jobs=1)]: Done  11 jobs       | elapsed:  1.0min
[Parallel(n_jobs=1)]: Done  12 jobs       | elapsed:  1.1min



[CV] epsilon=0.2, C=0.2 ..............................................
[CV] .................... epsilon=0.2, C=0.2, score=-6.098182 -   6.5s
[CV] epsilon=0.2, C=0.2 ..............................................
[CV] .................... epsilon=0.2, C=0.2, score=-3.066893 -   5.8s

[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  14 jobs       | elapsed:  1.3min



[CV] epsilon=0.2, C=0.2 ..............................................
[CV] .................... epsilon=0.2, C=0.2, score=-3.336510 -   4.8s
[CV] epsilon=0.3, C=0.2 ..............................................
[CV] .................... epsilon=0.3, C=0.2, score=-1.611206 -   4.6s

[Parallel(n_jobs=1)]: Done  15 jobs       | elapsed:  1.4min
[Parallel(n_jobs=1)]: Done  16 jobs       | elapsed:  1.5min



[CV] epsilon=0.3, C=0.2 ..............................................
[CV] .................... epsilon=0.3, C=0.2, score=-2.628181 -   4.6s
[CV] epsilon=0.3, C=0.2 ..............................................
[CV] .................... epsilon=0.3, C=0.2, score=-5.481250 -   6.2s

[Parallel(n_jobs=1)]: Done  17 jobs       | elapsed:  1.6min
[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:  1.7min



[CV] epsilon=0.3, C=0.2 ..............................................
[CV] .................... epsilon=0.3, C=0.2, score=-2.741552 -   4.6s
[CV] epsilon=0.3, C=0.2 ..............................................
[CV] .................... epsilon=0.3, C=0.2, score=-3.017880 -   4.9s

[Parallel(n_jobs=1)]: Done  19 jobs       | elapsed:  1.8min
[Parallel(n_jobs=1)]: Done  20 jobs       | elapsed:  1.8min



[CV] epsilon=0.2, C=0.3 ..............................................
[CV] .................... epsilon=0.2, C=0.3, score=-3.335429 -   4.7s
[CV] epsilon=0.2, C=0.3 ..............................................
[CV] ................... epsilon=0.2, C=0.3, score=-10.826813 -   4.8s

[Parallel(n_jobs=1)]: Done  21 jobs       | elapsed:  1.9min
[Parallel(n_jobs=1)]: Done  22 jobs       | elapsed:  2.0min



[CV] epsilon=0.2, C=0.3 ..............................................
[CV] .................... epsilon=0.2, C=0.3, score=-3.931952 -   6.5s
[CV] epsilon=0.2, C=0.3 ..............................................
[CV] .................... epsilon=0.2, C=0.3, score=-5.771926 -   4.5s

[Parallel(n_jobs=1)]: Done  23 jobs       | elapsed:  2.1min
[Parallel(n_jobs=1)]: Done  24 jobs       | elapsed:  2.2min



[CV] epsilon=0.2, C=0.3 ..............................................
[CV] .................... epsilon=0.2, C=0.3, score=-6.718074 -   4.9s
[CV] epsilon=0.3, C=0.3 ..............................................
[CV] .................... epsilon=0.3, C=0.3, score=-2.880308 -   5.0s

[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:  2.3min
[Parallel(n_jobs=1)]: Done  26 jobs       | elapsed:  2.4min



[CV] epsilon=0.3, C=0.3 ..............................................
[CV] .................... epsilon=0.3, C=0.3, score=-5.498909 -   5.6s
[CV] epsilon=0.3, C=0.3 ..............................................
[CV] .................... epsilon=0.3, C=0.3, score=-5.260030 -   4.9s

[Parallel(n_jobs=1)]: Done  27 jobs       | elapsed:  2.4min
[Parallel(n_jobs=1)]: Done  28 jobs       | elapsed:  2.5min



[CV] epsilon=0.3, C=0.3 ..............................................
[CV] .................... epsilon=0.3, C=0.3, score=-5.910420 -   4.5s
[CV] epsilon=0.3, C=0.3 ..............................................
[CV] .................... epsilon=0.3, C=0.3, score=-7.173710 -   5.0s

[Parallel(n_jobs=1)]: Done  29 jobs       | elapsed:  2.6min
[Parallel(n_jobs=1)]: Done  30 jobs       | elapsed:  2.7min



Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] epsilon=0.2, C=0.1 ..............................................
[CV] .................... epsilon=0.2, C=0.1, score=-1.679682 -   4.7s

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.7min finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    4.7s



[CV] epsilon=0.2, C=0.1 ..............................................
[CV] .................... epsilon=0.2, C=0.1, score=-0.965042 -   4.6s
[CV] epsilon=0.2, C=0.1 ..............................................
[CV] .................... epsilon=0.2, C=0.1, score=-1.800949 -   5.3s

[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    9.5s
[Parallel(n_jobs=1)]: Done   3 jobs       | elapsed:   14.9s



[CV] epsilon=0.2, C=0.1 ..............................................
[CV] .................... epsilon=0.2, C=0.1, score=-1.091001 -   5.0s
[CV] epsilon=0.2, C=0.1 ..............................................
[CV] .................... epsilon=0.2, C=0.1, score=-1.759850 -   5.1s

[Parallel(n_jobs=1)]: Done   4 jobs       | elapsed:   20.0s
[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:   25.2s



[CV] epsilon=0.3, C=0.1 ..............................................
[CV] .................... epsilon=0.3, C=0.1, score=-1.483414 -   4.8s
[CV] epsilon=0.3, C=0.1 ..............................................
[CV] .................... epsilon=0.3, C=0.1, score=-1.014828 -   4.8s

[Parallel(n_jobs=1)]: Done   6 jobs       | elapsed:   30.0s
[Parallel(n_jobs=1)]: Done   7 jobs       | elapsed:   34.9s



[CV] epsilon=0.3, C=0.1 ..............................................
[CV] .................... epsilon=0.3, C=0.1, score=-1.804031 -   5.0s
[CV] epsilon=0.3, C=0.1 ..............................................
[CV] .................... epsilon=0.3, C=0.1, score=-1.591691 -   4.5s

[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:   40.0s
[Parallel(n_jobs=1)]: Done   9 jobs       | elapsed:   44.6s



[CV] epsilon=0.3, C=0.1 ..............................................
[CV] .................... epsilon=0.3, C=0.1, score=-1.281391 -   4.6s
[CV] epsilon=0.2, C=0.2 ..............................................
[CV] .................... epsilon=0.2, C=0.2, score=-4.591968 -   4.9s

[Parallel(n_jobs=1)]: Done  10 jobs       | elapsed:   49.3s
[Parallel(n_jobs=1)]: Done  11 jobs       | elapsed:   54.3s



[CV] epsilon=0.2, C=0.2 ..............................................
[CV] .................... epsilon=0.2, C=0.2, score=-5.131189 -   4.7s
[CV] epsilon=0.2, C=0.2 ..............................................
[CV] .................... epsilon=0.2, C=0.2, score=-5.911893 -   4.7s

[Parallel(n_jobs=1)]: Done  12 jobs       | elapsed:   59.1s
[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:  1.1min



[CV] epsilon=0.2, C=0.2 ..............................................
[CV] .................... epsilon=0.2, C=0.2, score=-2.921386 -   5.2s
[CV] epsilon=0.2, C=0.2 ..............................................
[CV] .................... epsilon=0.2, C=0.2, score=-5.632405 -   5.2s

[Parallel(n_jobs=1)]: Done  14 jobs       | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  15 jobs       | elapsed:  1.2min



[CV] epsilon=0.3, C=0.2 ..............................................
[CV] .................... epsilon=0.3, C=0.2, score=-4.493208 -   5.5s
[CV] epsilon=0.3, C=0.2 ..............................................
[CV] .................... epsilon=0.3, C=0.2, score=-5.151680 -   4.7s

[Parallel(n_jobs=1)]: Done  16 jobs       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done  17 jobs       | elapsed:  1.4min



[CV] epsilon=0.3, C=0.2 ..............................................
[CV] .................... epsilon=0.3, C=0.2, score=-6.025889 -   5.0s
[CV] epsilon=0.3, C=0.2 ..............................................
[CV] .................... epsilon=0.3, C=0.2, score=-5.336505 -   4.3s

[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:  1.5min
[Parallel(n_jobs=1)]: Done  19 jobs       | elapsed:  1.6min



[CV] epsilon=0.3, C=0.2 ..............................................
[CV] .................... epsilon=0.3, C=0.2, score=-5.626155 -   5.0s
[CV] epsilon=0.2, C=0.3 ..............................................
[CV] ................... epsilon=0.2, C=0.3, score=-10.868978 -   4.6s

[Parallel(n_jobs=1)]: Done  20 jobs       | elapsed:  1.7min
[Parallel(n_jobs=1)]: Done  21 jobs       | elapsed:  1.7min



[CV] epsilon=0.2, C=0.3 ..............................................
[CV] ................... epsilon=0.2, C=0.3, score=-10.349576 -   4.8s
[CV] epsilon=0.2, C=0.3 ..............................................
[CV] ................... epsilon=0.2, C=0.3, score=-12.732390 -   5.4s

[Parallel(n_jobs=1)]: Done  22 jobs       | elapsed:  1.8min
[Parallel(n_jobs=1)]: Done  23 jobs       | elapsed:  1.9min



[CV] epsilon=0.2, C=0.3 ..............................................
[CV] .................... epsilon=0.2, C=0.3, score=-5.518989 -   4.6s
[CV] epsilon=0.2, C=0.3 ..............................................
[CV] ................... epsilon=0.2, C=0.3, score=-11.790695 -   5.1s

[Parallel(n_jobs=1)]: Done  24 jobs       | elapsed:  2.0min
[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:  2.1min



[CV] epsilon=0.3, C=0.3 ..............................................
[CV] ................... epsilon=0.3, C=0.3, score=-10.671675 -   4.7s
[CV] epsilon=0.3, C=0.3 ..............................................
[CV] ................... epsilon=0.3, C=0.3, score=-10.165667 -   4.9s

[Parallel(n_jobs=1)]: Done  26 jobs       | elapsed:  2.2min
[Parallel(n_jobs=1)]: Done  27 jobs       | elapsed:  2.2min



[CV] epsilon=0.3, C=0.3 ..............................................
[CV] ................... epsilon=0.3, C=0.3, score=-12.577310 -   5.3s
[CV] epsilon=0.3, C=0.3 ..............................................
[CV] .................... epsilon=0.3, C=0.3, score=-6.730797 -   7.3s

[Parallel(n_jobs=1)]: Done  28 jobs       | elapsed:  2.3min
[Parallel(n_jobs=1)]: Done  29 jobs       | elapsed:  2.5min



[CV] epsilon=0.3, C=0.3 ..............................................
[CV] ................... epsilon=0.3, C=0.3, score=-11.973865 -   5.1s


[Parallel(n_jobs=1)]: Done  30 jobs       | elapsed:  2.5min
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.5min finished


In [36]:
def get_pred(model_c, model_r, x_train_c, y_train_c, x_test_c, x_train_r, y_train_r, x_test_r):
    """Fit the casual and registered models and predict on their test features.

    model_c is fitted on (x_train_c, y_train_c) and used to predict x_test_c;
    model_r is fitted on (x_train_r, y_train_r) and used to predict x_test_r.
    Both models are fitted in place.

    Returns a tuple (casual_predictions, registered_predictions).
    """
    fitted_c = model_c.fit(x_train_c, y_train_c)
    casual_pred = fitted_c.predict(x_test_c)

    fitted_r = model_r.fit(x_train_r, y_train_r)
    registered_pred = fitted_r.predict(x_test_r)

    return casual_pred, registered_pred

In [12]:
def rmsle(pred_count, actual_count):
    """Root mean squared logarithmic error between predictions and actuals.

    Both inputs are shifted by +1 before taking the log. The result is rounded
    to 5 decimal places.
    """
    log_gap = np.log(pred_count + 1) - np.log(actual_count + 1)
    mean_squared_gap = np.mean(log_gap ** 2)
    return round(np.sqrt(mean_squared_gap), 5)

In [69]:
def single_model_test(model_c, model_r, file_name):
    """Fit both models on the training split, score them on the hold-out rows,
    and save the hold-out predictions.

    Reads the notebook globals x_train_c, y_train_c, x_test_c, x_train_r,
    y_train_r, x_test_r, train_new and test_index.

    Parameters
    ----------
    model_c, model_r : estimators with fit/predict
        Models for casual and registered rentals; both are fitted in place.
    file_name : str
        Short model tag; predictions go to 'test/python_<file_name>.csv'.

    Side effects: writes the CSV and prints the RMSLE for casual, registered
    and count against train_new.loc[test_index, ...].
    """
    model_c_pred, model_r_pred = get_pred(model_c, model_r, x_train_c, y_train_c, x_test_c, x_train_r, y_train_r, x_test_r)

    # Predictions are treated as log(value + 1); invert with exp(x) - 1.
    casual = np.exp(model_c_pred) - 1
    registered = np.exp(model_r_pred) - 1
    count = casual + registered
    df = pd.DataFrame({'count': count, 'casual': casual, 'registered': registered})
    dir_name = 'test/' + 'python_' + file_name + '.csv'
    df.to_csv(dir_name, index = False)

    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    for label, pred in (('casual', casual), ('registered', registered), ('count', count)):
        print(label + ' rmsle:')
        print(rmsle(pred, train_new.loc[test_index, label]))

In [68]:
def single_model_validation(model_c, model_r, file_name):
    """Refit both models on all of train_new and save predictions for test_new.

    Reads the notebook globals train_new, test_new, x_c and x_r. The targets are
    train_new['log_casual'] and train_new['log_registered']; predictions are
    mapped back with exp(x) - 1 and written to
    'validation/python_<file_name>.csv'. Nothing is returned.
    """
    log_casual_pred, log_registered_pred = get_pred(
        model_c, model_r,
        train_new[x_c], train_new['log_casual'], test_new[x_c],
        train_new[x_r], train_new['log_registered'], test_new[x_r])

    casual = np.exp(log_casual_pred) - 1
    registered = np.exp(log_registered_pred) - 1

    predictions = pd.DataFrame({'count': casual + registered, 'casual': casual, 'registered': registered})
    predictions.to_csv('validation/' + 'python_' + file_name + '.csv', index = False)

In [71]:
# etree 0.3 0.43
# NOTE(review): the two numbers above are presumably the hold-out count RMSLE
# and the Kaggle leaderboard score - confirm.
# No random_state is passed, so unless a global seed is set elsewhere a re-run
# will not reproduce the printed scores exactly.
model_c = ExtraTreesRegressor(n_estimators = 500, max_features='log2')
model_r = ExtraTreesRegressor(n_estimators = 500, max_features='log2')
# Score on the hold-out rows (train_new.loc[test_index]) and save to test/python_etree.csv.
single_model_test(model_c, model_r, 'etree')
# Refit on all of train_new and save predictions for test_new to validation/python_etree.csv.
single_model_validation(model_c, model_r, 'etree')

casual rmsle:
0.46601
registered rmsle:
0.30354
count rmsle:
0.29997


In [72]:
# ada 0.6 0.69
# NOTE(review): the two numbers above are presumably the hold-out count RMSLE
# and the Kaggle leaderboard score - confirm.
# The casual and registered models use different n_estimators (100 vs 50).
model_c = AdaBoostRegressor(n_estimators = 100, learning_rate = 0.2)
model_r = AdaBoostRegressor(n_estimators = 50, learning_rate = 0.2)
# Score on the hold-out rows (train_new.loc[test_index]) and save to test/python_ada.csv.
single_model_test(model_c, model_r, 'ada')
# Refit on all of train_new and save predictions for test_new to validation/python_ada.csv.
single_model_validation(model_c, model_r, 'ada')

casual rmsle:
0.69835
registered rmsle:
0.61753
count rmsle:
0.61399


In [73]:
# rf 0.29 0.42
# NOTE(review): the two numbers above are presumably the hold-out count RMSLE
# and the Kaggle leaderboard score - confirm.
# No random_state is passed, so unless a global seed is set elsewhere a re-run
# will not reproduce the printed scores exactly.
model_c = RandomForestRegressor(n_estimators = 700, max_features='log2')
model_r = RandomForestRegressor(n_estimators = 700, max_features='log2')
# Score on the hold-out rows (train_new.loc[test_index]) and save to test/python_rf.csv.
single_model_test(model_c, model_r, 'rf')
# Refit on all of train_new and save predictions for test_new to validation/python_rf.csv.
single_model_validation(model_c, model_r, 'rf')

casual rmsle:
0.46301
registered rmsle:
0.30158
count rmsle:
0.29902


In [74]:
# svr 0.78 0.89
# NOTE(review): the two numbers above are presumably a hold-out RMSLE and the
# Kaggle leaderboard score - confirm (the printed count RMSLE below is 0.69402).
# C = 0.01 is outside the grid searched in the SVR tuning cell (0.1, 0.2, 0.3).
model_c = SVR(kernel = 'linear', C = 0.01, epsilon = 0.2)
model_r = SVR(kernel = 'linear', C = 0.01, epsilon = 0.3)
# Score on the hold-out rows (train_new.loc[test_index]) and save to test/python_svr.csv.
single_model_test(model_c, model_r, 'svr')
# Refit on all of train_new and save predictions for test_new to validation/python_svr.csv.
single_model_validation(model_c, model_r, 'svr')

casual rmsle:
0.72718
registered rmsle:
0.70758
count rmsle:
0.69402


#### 3.2 the performance of simple ensemble

In [38]:
def get_file(method, file_name):
    """Load per-model prediction CSVs and join them side by side.

    Parameters
    ----------
    method : iterable of str
        Base names (without '.csv') of the files to read from ``file_name``.
    file_name : str
        Directory containing the CSVs; it is also used as the column prefix.

    Returns
    -------
    pandas.DataFrame
        All files concatenated column-wise, with every column renamed to
        '<file_name>_<method>_<original column>'. An empty DataFrame is
        returned when ``method`` is empty.
    """
    frames = []
    for i in method:
        preds = pd.read_csv(file_name + '/' + i + '.csv')
        # add_prefix returns a renamed copy instead of overwriting .columns by hand.
        frames.append(preds.add_prefix(file_name + '_' + i + '_'))

    if not frames:
        # pd.concat raises on an empty list; keep the original empty-frame result.
        return pd.DataFrame()

    # Concatenate once rather than growing a frame inside the loop.
    return pd.concat(frames, axis = 1)

In [40]:
# Base names of the per-model CSVs written by single_model_test / single_model_validation.
method = ['python_ada', 'python_etree', 'python_rf', 'python_svr']

In [41]:
# Hold-out predictions of every model, read from the 'test' directory.
df_t = get_file(method, 'test')

In [84]:
# Peek at the combined hold-out predictions (columns are named test_<model>_<target>).
df_t.head()

Unnamed: 0,test_python_ada_casual,test_python_ada_count,test_python_ada_registered,test_python_etree_casual,test_python_etree_count,test_python_etree_registered,test_python_rf_casual,test_python_rf_count,test_python_rf_registered,test_python_svr_casual,test_python_svr_count,test_python_svr_registered
0,18.044293,120.937032,102.89274,30.835246,251.964879,221.129633,29.96522,243.823204,213.857984,22.363279,164.354626,141.991346
1,40.69245,188.868883,148.176433,39.01985,258.476507,219.456656,34.91084,224.555372,189.644532,25.046906,208.653082,183.606176
2,32.536544,173.555958,141.019414,22.604125,233.800543,211.196418,20.290719,230.681014,210.390295,21.505515,176.718866,155.213351
3,50.453366,416.008033,365.554666,82.25068,549.318179,467.067499,88.343449,546.36698,458.023531,63.610035,801.369389,737.759353
4,31.381752,271.513481,240.131729,16.36969,360.832528,344.462838,17.948519,362.371732,344.423212,30.29352,479.278508,448.984988


In [87]:
# Correlation between the models' casual predictions on the hold-out rows
# (rowvar=0 means each column is one variable).
np.corrcoef(
    df_t[[
        'test_python_ada_casual',
        'test_python_etree_casual',
        'test_python_rf_casual',
        'test_python_svr_casual',
    ]],
    rowvar=0)

array([[ 1.        ,  0.88557477,  0.8937193 ,  0.82311731],
       [ 0.88557477,  1.        ,  0.99733153,  0.83202309],
       [ 0.8937193 ,  0.99733153,  1.        ,  0.8387068 ],
       [ 0.82311731,  0.83202309,  0.8387068 ,  1.        ]])

In [98]:
# Equal-weight mean of all four models' casual predictions, scored on the hold-out rows.
casual = df_t[[
    'test_python_ada_casual',
    'test_python_etree_casual',
    'test_python_rf_casual',
    'test_python_svr_casual',
]].mean(axis=1)
# reset_index() lines the actuals up with df_t's 0..n-1 index before comparing.
rmsle(casual, train_new.loc[test_index, 'casual'].reset_index()['casual'])

0.5366

In [99]:
# Same blend without SVR: mean of ada, etree and rf casual predictions.
casual = df_t[[
    'test_python_ada_casual',
    'test_python_etree_casual',
    'test_python_rf_casual',
]].mean(axis=1)
rmsle(casual, train_new.loc[test_index, 'casual'].reset_index()['casual'])

0.49725

In [101]:
# Extra-trees alone (mean over a single column), as a baseline for the blends.
casual = df_t[[
    'test_python_etree_casual',
]].mean(axis=1)
rmsle(casual, train_new.loc[test_index, 'casual'].reset_index()['casual'])

0.46601

In [102]:
# Random forest alone (mean over a single column), as a baseline for the blends.
casual = df_t[[
    'test_python_rf_casual',
]].mean(axis=1)
rmsle(casual, train_new.loc[test_index, 'casual'].reset_index()['casual'])

0.46301

In [112]:
# Mean of etree and rf: scores better than either model on its own.
casual = df_t[[
    'test_python_etree_casual',
    'test_python_rf_casual',
]].mean(axis=1)
rmsle(casual, train_new.loc[test_index, 'casual'].reset_index()['casual'])

0.46193

- For casual users, the mean of etree and rf scores better than either model alone and better than the larger blends.

| |mean of etree rf ada svr|mean of etree rf ada|etree|rf|mean of etree rf|
|-|:----------------------:|:------------------:|:---:|:-:|:--------------:|
|**casual**|0.**536**6|0.**497**25|0.**466**01|0.**463**01|0.**461**93|

In [104]:
# Mean of ada, etree and rf registered predictions, scored on the hold-out rows.
registered = df_t[[
    'test_python_ada_registered',
    'test_python_etree_registered',
    'test_python_rf_registered',
]].mean(axis=1)
rmsle(registered, train_new.loc[test_index, 'registered'].reset_index()['registered'])

0.36823

In [106]:
# Extra-trees alone for registered riders.
registered = df_t[[
    'test_python_etree_registered',
]].mean(axis=1)
rmsle(registered, train_new.loc[test_index, 'registered'].reset_index()['registered'])

0.30354

In [107]:
# Random forest alone for registered riders.
registered = df_t[[
    'test_python_rf_registered',
]].mean(axis=1)
rmsle(registered, train_new.loc[test_index, 'registered'].reset_index()['registered'])

0.30158

In [113]:
# Mean of etree and rf: scores better than either model on its own.
registered = df_t[[
    'test_python_etree_registered',
    'test_python_rf_registered',
]].mean(axis=1)
rmsle(registered, train_new.loc[test_index, 'registered'].reset_index()['registered'])

0.30002

In [42]:
# SVR alone for registered riders, for comparison with the tree-based models.
registered = df_t[[
    'test_python_svr_registered',
]].mean(axis=1)
rmsle(registered, train_new.loc[test_index, 'registered'].reset_index()['registered'])

0.70758

- For registered users, the mean of etree and rf scores better than either model alone and better than the blend that includes ada.

| |mean of etree rf ada|etree| rf |mean of etree rf|
|-|:------------------:|:---:|:--:|:------------:|
|**registered**|0.**368**23|0.**303**54|0.**301**58|0.**300**02|

In [110]:
# Extra-trees alone for the total count.
count = df_t[[
    'test_python_etree_count',
]].mean(axis=1)
rmsle(count, train_new.loc[test_index, 'count'].reset_index().loc[:, 'count'])

0.29997

In [111]:
# Random forest alone for the total count.
count = df_t[[
    'test_python_rf_count',
]].mean(axis=1)
rmsle(count, train_new.loc[test_index, 'count'].reset_index().loc[:, 'count'])

0.29902

In [109]:
# Mean of etree and rf: scores better than either model on its own.
count = df_t[[
    'test_python_etree_count',
    'test_python_rf_count',
]].mean(axis=1)
rmsle(count, train_new.loc[test_index, 'count'].reset_index().loc[:, 'count'])

0.29706

- For the total count, the mean of etree and rf scores better than either model alone.

| |etree|rf|mean of etree rf|
|-|:---:|:-:|:--------------:|
|**count**|0.**299**97|0.**299**02|0.**297**06|

In [127]:
# Weighted average of ada, etree and rf count predictions with weights 1 : 400 : 400,
# i.e. AdaBoost contributes almost nothing.
count = np.average(
    df_t[[
        'test_python_ada_count',
        'test_python_etree_count',
        'test_python_rf_count',
    ]],
    axis=1,
    weights=[1, 400, 400])
rmsle(count, train_new.loc[test_index, 'count'].reset_index().loc[:, 'count'])

0.29719

In [14]:
# Predictions of every model for test_new, read from the 'validation' directory.
df_v = get_file(method, 'validation')

In [15]:
# Peek at the combined predictions (columns are named validation_<model>_<target>).
df_v.head()

Unnamed: 0,validation_python_ada_casual,validation_python_ada_count,validation_python_ada_registered,validation_python_etree_casual,validation_python_etree_count,validation_python_etree_registered,validation_python_rf_casual,validation_python_rf_count,validation_python_rf_registered,validation_python_svr_casual,validation_python_svr_count,validation_python_svr_registered
0,1.646999,9.310434,7.663436,0.79598,9.222792,8.426812,0.947495,9.362391,8.414896,6.7e-05,6.031021,6.030954
1,1.542942,8.61962,7.076678,0.348914,4.852138,4.503224,0.515072,5.198991,4.68392,0.239844,6.860225,6.620381
2,1.412425,6.471536,5.059111,0.419036,2.799393,2.380357,0.666503,3.101975,2.435472,0.299639,7.077659,6.778019
3,1.314668,6.218266,4.903597,0.488847,2.41931,1.930463,0.53829,2.451748,1.913458,0.277599,7.01147,6.733871
4,1.314668,6.218266,4.903597,0.321512,2.459805,2.138293,0.351852,2.283956,1.932105,0.339216,7.233072,6.893856


In [16]:
# Correlation between the models' casual predictions on the validation files
# (rowvar=0 means each column is one variable).
np.corrcoef(
    df_v[[
        'validation_python_ada_casual',
        'validation_python_etree_casual',
        'validation_python_rf_casual',
        'validation_python_svr_casual',
    ]],
    rowvar=0)

array([[ 1.        ,  0.90120355,  0.90768658,  0.79720309],
       [ 0.90120355,  1.        ,  0.99512991,  0.81041801],
       [ 0.90768658,  0.99512991,  1.        ,  0.81662433],
       [ 0.79720309,  0.81041801,  0.81662433,  1.        ]])

In [19]:
# Submission file from the extra-trees count predictions.
# 0.43117 is the score noted by the author (presumably the leaderboard result).
df = pd.DataFrame({
    'datetime': test['datetime'],
    'count': df_v['validation_python_etree_count'],
})
df.to_csv('submission/python_etree.csv', index=False, date_format="%Y-%m-%d %H:%M:%S")

In [20]:
# Submission file from the random-forest count predictions.
# 0.41884 is the score noted by the author (presumably the leaderboard result).
df = pd.DataFrame({
    'datetime': test['datetime'],
    'count': df_v['validation_python_rf_count'],
})
df.to_csv('submission/python_rf.csv', index=False, date_format="%Y-%m-%d %H:%M:%S")

In [17]:
# Submission file from the mean of the etree and rf count predictions.
# 0.42365 is the score noted by the author (presumably the leaderboard result).
df = pd.DataFrame({
    'datetime': test['datetime'],
    'count': df_v[[
        'validation_python_etree_count',
        'validation_python_rf_count',
    ]].mean(axis=1),
})
df.to_csv('submission/python_mean.csv', index=False, date_format="%Y-%m-%d %H:%M:%S")

#### 3.3 the performance of stack ensemble

In [13]:
# out of fold prediction
def get_oof(clf, x_train, y_train, x_test, n_folds = 5):
    """Out-of-fold predictions of one base model, for use as stacking features.

    Parameters
    ----------
    clf : estimator with fit/predict
        Base model; it is refitted once per fold.
    x_train, y_train : pandas objects
        Training features and target. Rows are selected by position, so any
        index is accepted.
    x_test : array-like
        Features to predict with every fold's model.
    n_folds : int, default 5
        Number of folds passed to KFold.

    Returns
    -------
    (oof_train, oof_test)
        oof_train[i] is the prediction for training row i made by the model
        that did not see row i. oof_test is the mean over the folds of the
        predictions for x_test.
    """
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]

    oof_train = np.zeros((ntrain, ))
    oof_test_kf = np.empty((n_folds, ntest))

    kf = KFold(ntrain, n_folds = n_folds, random_state = 0)

    for i, (train_index_kf, test_index_kf) in enumerate(kf):
        # KFold yields positional indices, so select with .iloc; .loc would only
        # be correct when the index happens to be 0..n-1.
        kf_x_train = x_train.iloc[train_index_kf]
        kf_y_train = np.ravel(y_train.iloc[train_index_kf])
        kf_x_test = x_train.iloc[test_index_kf]

        clf.fit(kf_x_train, kf_y_train)

        oof_train[test_index_kf] = clf.predict(kf_x_test)
        oof_test_kf[i, :] = clf.predict(x_test)

    # Average the per-fold test predictions into one vector.
    oof_test = oof_test_kf.mean(axis = 0)
    return oof_train, oof_test

In [14]:
def get_stack(stacker, base_models, x_train, y_train, x_test):
    """Fit a stacking model on base-model out-of-fold predictions.

    Each base model contributes one column of out-of-fold training predictions
    and one column of test predictions (via get_oof). The stacker is fitted on
    the training columns against the flattened y_train, and its predictions for
    the test columns are returned.
    """
    n_models = len(base_models)
    meta_train = np.empty((x_train.shape[0], n_models))
    meta_test = np.empty((x_test.shape[0], n_models))

    for col, model in enumerate(base_models):
        train_col, test_col = get_oof(model, x_train, y_train, x_test)
        meta_train[:, col] = train_col
        meta_test[:, col] = test_col

    target = np.ravel(y_train)
    fitted_stacker = stacker.fit(meta_train, target)
    return fitted_stacker.predict(meta_test)

In [16]:
def log_rmsle(pred_count, actual_count):
    """Root mean squared error of the inputs as given, rounded to 5 decimals.

    No log transform is applied here, so this equals RMSLE only when both
    arguments are already on the log(count + 1) scale.
    """
    gap = pred_count - actual_count
    mean_squared_gap = np.mean(gap ** 2)
    return round(np.sqrt(mean_squared_gap), 5)

In [15]:
# Renumber the casual frames 0..n-1 so the positional indices produced by KFold
# in get_oof line up with the row labels.
# reset_index() also turns a Series into a one-column DataFrame; the old labels
# land in an 'index' column, which is dropped again.
x_train_c = x_train_c.reset_index().drop('index', axis = 1)
y_train_c = y_train_c.reset_index().drop('index', axis = 1)
x_test_c = x_test_c.reset_index().drop('index', axis = 1)

In [17]:
# Casual riders, extra-trees stacker: hold-out log-space RMSE 0.48141.
# Base models for the stack.
etree_c = ExtraTreesRegressor(n_estimators = 500, max_features='log2')
rf_c = RandomForestRegressor(n_estimators = 700, max_features='log2')
ada_c = AdaBoostRegressor(n_estimators = 100, learning_rate = 0.2)
# NOTE(review): no kernel is given here, so SVR falls back to its default kernel,
# unlike the single-model SVR and svr_r which pass kernel = 'linear' - confirm intended.
svr_c = SVR(C = 0.01, epsilon = 0.2)

base_models_c = [etree_c, rf_c, ada_c, svr_c]
stacker = ExtraTreesRegressor(n_estimators = 500, max_features='log2')

# Predictions stay on the log scale, so they are compared with y_test_c directly.
pred_stacker_c = get_stack(stacker, base_models_c,  x_train_c, y_train_c, x_test_c)
log_rmsle(pred_stacker_c, y_test_c)

0.48141

In [18]:
# Casual riders, random-forest stacker: hold-out log-space RMSE 0.47926.
stacker = RandomForestRegressor(n_estimators = 500, max_features='log2')

# Reuses base_models_c from the previous stacking cell; get_stack refits them.
pred_stacker_c = get_stack(stacker, base_models_c,  x_train_c, y_train_c, x_test_c)
log_rmsle(pred_stacker_c, y_test_c)

0.47926

In [19]:
# Casual riders, AdaBoost stacker: hold-out log-space RMSE 0.49262.
stacker = AdaBoostRegressor(n_estimators = 100, learning_rate = 0.2)

# Reuses base_models_c from the earlier stacking cell; get_stack refits them.
pred_stacker_c = get_stack(stacker, base_models_c,  x_train_c, y_train_c, x_test_c)
log_rmsle(pred_stacker_c, y_test_c)

0.49262

In [20]:
# Casual riders, linear-SVR stacker: hold-out log-space RMSE 0.45985.
stacker = SVR(kernel = 'linear', C = 0.01, epsilon = 0.3)

# Reuses base_models_c from the earlier stacking cell; get_stack refits them.
pred_stacker_c = get_stack(stacker, base_models_c,  x_train_c, y_train_c, x_test_c)
log_rmsle(pred_stacker_c, y_test_c)

0.45985

In [21]:
# Casual riders: average the predictions of four different stackers
# (etree, rf, ada, svr), each fitted on the same set of base models.
etree_c = ExtraTreesRegressor(n_estimators = 500, max_features='log2')
rf_c = RandomForestRegressor(n_estimators = 700, max_features='log2')
ada_c = AdaBoostRegressor(n_estimators = 100, learning_rate = 0.2)
# NOTE(review): svr_c has no kernel = 'linear', unlike the SVR stacker below - confirm intended.
svr_c = SVR(C = 0.01, epsilon = 0.2)

base_models_c = [etree_c, rf_c, ada_c, svr_c]

# Each get_stack call recomputes the base models' out-of-fold predictions.
stacker_c_etree = ExtraTreesRegressor(n_estimators = 500, max_features='log2')
pred_stacker_c_etree = get_stack(stacker_c_etree, base_models_c,  x_train_c, y_train_c, x_test_c)

stacker_c_rf = RandomForestRegressor(n_estimators = 500, max_features='log2')
pred_stacker_c_rf = get_stack(stacker_c_rf, base_models_c,  x_train_c, y_train_c, x_test_c)

stacker_c_ada = AdaBoostRegressor(n_estimators = 100, learning_rate = 0.2)
pred_stacker_c_ada = get_stack(stacker_c_ada, base_models_c,  x_train_c, y_train_c, x_test_c)

stacker_c_svr = SVR(kernel = 'linear', C = 0.01, epsilon = 0.3)
pred_stacker_c_svr = get_stack(stacker_c_svr, base_models_c,  x_train_c, y_train_c, x_test_c)

# Equal-weight mean of the four stackers, scored on the log scale.
log_rmsle((pred_stacker_c_etree + pred_stacker_c_rf + pred_stacker_c_ada + pred_stacker_c_svr) / 4.0, y_test_c)

0.46243

In [22]:
# Casual riders: average of the etree, rf and svr stackers (AdaBoost stacker left out).
# The base models are redefined with the same settings as in the previous cell.
etree_c = ExtraTreesRegressor(n_estimators = 500, max_features='log2')
rf_c = RandomForestRegressor(n_estimators = 700, max_features='log2')
ada_c = AdaBoostRegressor(n_estimators = 100, learning_rate = 0.2)
svr_c = SVR(C = 0.01, epsilon = 0.2)

base_models_c = [etree_c, rf_c, ada_c, svr_c]

stacker_c_etree = ExtraTreesRegressor(n_estimators = 500, max_features='log2')
pred_stacker_c_etree = get_stack(stacker_c_etree, base_models_c,  x_train_c, y_train_c, x_test_c)

stacker_c_rf = RandomForestRegressor(n_estimators = 500, max_features='log2')
pred_stacker_c_rf = get_stack(stacker_c_rf, base_models_c,  x_train_c, y_train_c, x_test_c)

stacker_c_svr = SVR(kernel = 'linear', C = 0.01, epsilon = 0.3)
pred_stacker_c_svr = get_stack(stacker_c_svr, base_models_c,  x_train_c, y_train_c, x_test_c)

# Equal-weight mean of the three stackers, scored on the log scale.
log_rmsle((pred_stacker_c_etree + pred_stacker_c_rf + pred_stacker_c_svr) / 3.0, y_test_c)

0.46419

In [23]:
# Casual riders: average of the rf, ada and svr stackers (extra-trees stacker left out).
# The base models are redefined with the same settings as in the previous cells.
etree_c = ExtraTreesRegressor(n_estimators = 500, max_features='log2')
rf_c = RandomForestRegressor(n_estimators = 700, max_features='log2')
ada_c = AdaBoostRegressor(n_estimators = 100, learning_rate = 0.2)
svr_c = SVR(C = 0.01, epsilon = 0.2)

base_models_c = [etree_c, rf_c, ada_c, svr_c]

stacker_c_rf = RandomForestRegressor(n_estimators = 500, max_features='log2')
pred_stacker_c_rf = get_stack(stacker_c_rf, base_models_c,  x_train_c, y_train_c, x_test_c)

stacker_c_ada = AdaBoostRegressor(n_estimators = 100, learning_rate = 0.2)
pred_stacker_c_ada = get_stack(stacker_c_ada, base_models_c,  x_train_c, y_train_c, x_test_c)

stacker_c_svr = SVR(kernel = 'linear', C = 0.01, epsilon = 0.3)
pred_stacker_c_svr = get_stack(stacker_c_svr, base_models_c,  x_train_c, y_train_c, x_test_c)

# Equal-weight mean of the three stackers, scored on the log scale.
log_rmsle((pred_stacker_c_rf + pred_stacker_c_ada + pred_stacker_c_svr) / 3.0, y_test_c)

0.46491

In [24]:
# Casual riders: average of the rf and svr stackers only.
# The base models are redefined with the same settings as in the previous cells.
etree_c = ExtraTreesRegressor(n_estimators = 500, max_features='log2')
rf_c = RandomForestRegressor(n_estimators = 700, max_features='log2')
ada_c = AdaBoostRegressor(n_estimators = 100, learning_rate = 0.2)
svr_c = SVR(C = 0.01, epsilon = 0.2)

base_models_c = [etree_c, rf_c, ada_c, svr_c]

stacker_c_rf = RandomForestRegressor(n_estimators = 500, max_features='log2')
pred_stacker_c_rf = get_stack(stacker_c_rf, base_models_c,  x_train_c, y_train_c, x_test_c)

stacker_c_svr = SVR(kernel = 'linear', C = 0.01, epsilon = 0.3)
pred_stacker_c_svr = get_stack(stacker_c_svr, base_models_c,  x_train_c, y_train_c, x_test_c)

# Equal-weight mean of the two stackers, scored on the log scale.
log_rmsle((pred_stacker_c_rf + pred_stacker_c_svr) / 2.0, y_test_c)

0.46069

- For casual users, the SVR stacker scores better than every other stacker and every average of stackers tried.

| |stacker_etree|stacker_rf|stacker_ada|stacker_svr|mean of etree rf ada svr|mean of etree rf svr|mean of rf ada svr|mean of rf svr|
|-|:-----------:|:--------:|:---------:|:---------:|:----------------------:|:------------------:|:----------------:|:---:|
|**casual**|0.**481**41|0.**479**26|0.**492**62|0.**459**85|0.**462**43|0.**464**19|0.**464**91|0.**460**69|

In [25]:
# Renumber the registered frames 0..n-1 so the positional indices produced by
# KFold in get_oof line up with the row labels.
# reset_index() also turns a Series into a one-column DataFrame; the old labels
# land in an 'index' column, which is dropped again.
x_train_r = x_train_r.reset_index().drop('index', axis = 1)
y_train_r = y_train_r.reset_index().drop('index', axis = 1)
x_test_r = x_test_r.reset_index().drop('index', axis = 1)

In [26]:
# Base models for the registered-rider stack.
# NOTE(review): etree_r / rf_r use 300 / 500 trees, while the single-model runs
# above use 500 / 700 - confirm this difference is intended.
etree_r = ExtraTreesRegressor(n_estimators = 300, max_features='log2')
rf_r = RandomForestRegressor(n_estimators = 500, max_features='log2')
ada_r = AdaBoostRegressor(n_estimators = 50, learning_rate = 0.2)
svr_r = SVR(kernel = 'linear', C = 0.01, epsilon = 0.3)

base_models_r = [etree_r, rf_r, ada_r, svr_r]

In [27]:
# Registered riders, extra-trees stacker: hold-out log-space RMSE 0.30874.
stacker = ExtraTreesRegressor(n_estimators = 500, max_features='log2')

# Predictions stay on the log scale, so they are compared with y_test_r directly.
pred_stacker_r = get_stack(stacker, base_models_r, x_train_r, y_train_r, x_test_r)
log_rmsle(pred_stacker_r, y_test_r) 

0.30874

In [28]:
# Registered riders, random-forest stacker: hold-out log-space RMSE 0.30464.
stacker = RandomForestRegressor(n_estimators = 500, max_features='log2')

pred_stacker_r = get_stack(stacker, base_models_r, x_train_r, y_train_r, x_test_r)
log_rmsle(pred_stacker_r, y_test_r) 

0.30464

In [29]:
# Registered riders, AdaBoost stacker: hold-out log-space RMSE 0.35299.
# This stacker uses 300 estimators, whereas the casual AdaBoost stacker uses 100.
stacker = AdaBoostRegressor(n_estimators = 300, learning_rate = 0.2)

pred_stacker_r = get_stack(stacker, base_models_r, x_train_r, y_train_r, x_test_r)
log_rmsle(pred_stacker_r, y_test_r) 

0.35299

In [30]:
# Linear-kernel SVR as the stacker for registered users.
stacker = SVR(kernel='linear', C=0.01, epsilon=0.3)
pred_stacker_r = get_stack(
    stacker, base_models_r,
    x_train_r, y_train_r, x_test_r
)

# Last expression of the cell, so the score is displayed as its output.
log_rmsle(pred_stacker_r, y_test_r)

0.29651

In [31]:
# Blend of all four stackers for registered users. Every stacker is built and
# run through get_stack again inside this cell, in the order etree, rf, ada, svr.
stacker_etree = ExtraTreesRegressor(n_estimators=500, max_features='log2')
pred_stacker_r_etree = get_stack(
    stacker_etree, base_models_r, x_train_r, y_train_r, x_test_r
)

stacker_rf = RandomForestRegressor(n_estimators=500, max_features='log2')
pred_stacker_r_rf = get_stack(
    stacker_rf, base_models_r, x_train_r, y_train_r, x_test_r
)

stacker_ada = AdaBoostRegressor(n_estimators=300, learning_rate=0.2)
pred_stacker_r_ada = get_stack(
    stacker_ada, base_models_r, x_train_r, y_train_r, x_test_r
)

stacker_svr = SVR(kernel='linear', C=0.01, epsilon=0.3)
pred_stacker_r_svr = get_stack(
    stacker_svr, base_models_r, x_train_r, y_train_r, x_test_r
)

# Equal-weight mean of the four prediction vectors, scored on the hold-out set.
log_rmsle(
    (pred_stacker_r_etree + pred_stacker_r_rf
     + pred_stacker_r_ada + pred_stacker_r_svr) / 4.0,
    y_test_r
)

0.29978

In [32]:
# Blend of the three tree/boosting stackers (no SVR) for registered users.
# Each stacker is rebuilt and run through get_stack again in this cell.
stacker_etree = ExtraTreesRegressor(n_estimators=500, max_features='log2')
pred_stacker_r_etree = get_stack(
    stacker_etree, base_models_r, x_train_r, y_train_r, x_test_r
)

stacker_rf = RandomForestRegressor(n_estimators=500, max_features='log2')
pred_stacker_r_rf = get_stack(
    stacker_rf, base_models_r, x_train_r, y_train_r, x_test_r
)

stacker_ada = AdaBoostRegressor(n_estimators=300, learning_rate=0.2)
pred_stacker_r_ada = get_stack(
    stacker_ada, base_models_r, x_train_r, y_train_r, x_test_r
)

# Equal-weight mean of the three prediction vectors, scored on the hold-out set.
log_rmsle(
    (pred_stacker_r_etree + pred_stacker_r_rf + pred_stacker_r_ada) / 3.0,
    y_test_r
)

0.30597

In [33]:
# Blend of the ExtraTrees, random-forest and SVR stackers for registered users.
# Each stacker is rebuilt and run through get_stack again in this cell.
stacker_etree = ExtraTreesRegressor(n_estimators=500, max_features='log2')
pred_stacker_r_etree = get_stack(
    stacker_etree, base_models_r, x_train_r, y_train_r, x_test_r
)

stacker_rf = RandomForestRegressor(n_estimators=500, max_features='log2')
pred_stacker_r_rf = get_stack(
    stacker_rf, base_models_r, x_train_r, y_train_r, x_test_r
)

stacker_svr = SVR(kernel='linear', C=0.01, epsilon=0.3)
pred_stacker_r_svr = get_stack(
    stacker_svr, base_models_r, x_train_r, y_train_r, x_test_r
)

# Equal-weight mean of the three prediction vectors, scored on the hold-out set.
log_rmsle(
    (pred_stacker_r_etree + pred_stacker_r_rf + pred_stacker_r_svr) / 3.0,
    y_test_r
)

0.29678

In [34]:
# Blend of the random-forest, AdaBoost and SVR stackers for registered users.
# Each stacker is rebuilt and run through get_stack again in this cell.
stacker_rf = RandomForestRegressor(n_estimators=500, max_features='log2')
pred_stacker_r_rf = get_stack(
    stacker_rf, base_models_r, x_train_r, y_train_r, x_test_r
)

stacker_ada = AdaBoostRegressor(n_estimators=300, learning_rate=0.2)
pred_stacker_r_ada = get_stack(
    stacker_ada, base_models_r, x_train_r, y_train_r, x_test_r
)

stacker_svr = SVR(kernel='linear', C=0.01, epsilon=0.3)
pred_stacker_r_svr = get_stack(
    stacker_svr, base_models_r, x_train_r, y_train_r, x_test_r
)

# Equal-weight mean of the three prediction vectors, scored on the hold-out set.
log_rmsle(
    (pred_stacker_r_rf + pred_stacker_r_ada + pred_stacker_r_svr) / 3.0,
    y_test_r
)

0.30267

In [36]:
# Two-way blend of the ExtraTrees and random-forest stackers for registered
# users; both are rebuilt and run through get_stack again in this cell.
stacker_etree = ExtraTreesRegressor(n_estimators=500, max_features='log2')
pred_stacker_r_etree = get_stack(
    stacker_etree, base_models_r, x_train_r, y_train_r, x_test_r
)

stacker_rf = RandomForestRegressor(n_estimators=500, max_features='log2')
pred_stacker_r_rf = get_stack(
    stacker_rf, base_models_r, x_train_r, y_train_r, x_test_r
)

# Equal-weight mean of the two prediction vectors, scored on the hold-out set.
log_rmsle(
    (pred_stacker_r_etree + pred_stacker_r_rf) / 2.0,
    y_test_r
)

0.30132

In [35]:
# Two-way blend of the random-forest and SVR stackers for registered users;
# both are rebuilt and run through get_stack again in this cell.
stacker_rf = RandomForestRegressor(n_estimators=500, max_features='log2')
pred_stacker_r_rf = get_stack(
    stacker_rf, base_models_r, x_train_r, y_train_r, x_test_r
)

stacker_svr = SVR(kernel='linear', C=0.01, epsilon=0.3)
pred_stacker_r_svr = get_stack(
    stacker_svr, base_models_r, x_train_r, y_train_r, x_test_r
)

# Equal-weight mean of the two prediction vectors, scored on the hold-out set.
log_rmsle(
    (pred_stacker_r_rf + pred_stacker_r_svr) / 2.0,
    y_test_r
)

0.29682

- For registered users, the SVR stacker scores best (0.29651), ahead of every other single stacker and every averaged blend.

| |stacker_etree|stacker_rf|stacker_ada|stacker_svr|mean of etree rf ada svr|mean of etree rf ada|mean of etree rf svr|mean of rf ada svr|mean of rf svr|
|-|:-----------:|:--------:|:---------:|:---------:|:----------------------:|:---:|:------------------:|:----------------:|:---:|
|**registered**|0.**308**74|0.**304**64|0.**352**99|0.**296**51|0.**299**78|0.**305**97|0.**296**78|0.**302**67|0.**296**82|

In [45]:
# Chosen casual-user pipeline: four base regressors under a linear-kernel SVR
# stacker. An earlier run of this configuration recorded 0.45985.
etree_c = ExtraTreesRegressor(n_estimators=500, max_features='log2')
rf_c = RandomForestRegressor(n_estimators=700, max_features='log2')
ada_c = AdaBoostRegressor(n_estimators=100, learning_rate=0.2)
svr_c = SVR(C=0.01, epsilon=0.2)

base_models_c = [
    etree_c,
    rf_c,
    ada_c,
    svr_c,
]
stacker = SVR(kernel='linear', C=0.01, epsilon=0.3)

pred_stacker_c = get_stack(
    stacker, base_models_c,
    x_train_c, y_train_c, x_test_c
)

# Last expression of the cell, so the score is displayed as its output.
log_rmsle(pred_stacker_c, y_test_c)

0.45979

In [46]:
# Chosen registered-user pipeline: four base regressors under a linear-kernel
# SVR stacker. An earlier run of this configuration recorded 0.29651.
etree_r = ExtraTreesRegressor(n_estimators=300, max_features='log2')
rf_r = RandomForestRegressor(n_estimators=500, max_features='log2')
ada_r = AdaBoostRegressor(n_estimators=50, learning_rate=0.2)
svr_r = SVR(kernel='linear', C=0.01, epsilon=0.3)

base_models_r = [
    etree_r,
    rf_r,
    ada_r,
    svr_r,
]
stacker_r = SVR(kernel='linear', C=0.01, epsilon=0.3)

pred_stacker_r = get_stack(
    stacker_r, base_models_r,
    x_train_r, y_train_r, x_test_r
)

# Last expression of the cell, so the score is displayed as its output.
log_rmsle(pred_stacker_r, y_test_r)

0.29653

In [47]:
# Score the combined count (recorded result: 0.29411). Predictions and targets
# for both user groups are exponentiated and summed; the "- 2" removes one unit
# per group.
# NOTE(review): presumably the targets were stored as log(x + 1) -- confirm
# against the preprocessing cell.
rmsle(
    np.exp(pred_stacker_c) + np.exp(pred_stacker_r) - 2,
    np.exp(y_test_c) + np.exp(y_test_r) - 2
)

0.29411

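- Validation dataset: refit the selected SVR stackers on the `v_*` data and generate the predictions used for the submission file.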

In [51]:
# Casual-user pipeline (same hyper-parameters as the chosen configuration),
# this time run on the v_* train/test frames. No score is computed here.
etree_c = ExtraTreesRegressor(n_estimators=500, max_features='log2')
rf_c = RandomForestRegressor(n_estimators=700, max_features='log2')
ada_c = AdaBoostRegressor(n_estimators=100, learning_rate=0.2)
svr_c = SVR(C=0.01, epsilon=0.2)

base_models_c = [
    etree_c,
    rf_c,
    ada_c,
    svr_c,
]
stacker = SVR(kernel='linear', C=0.01, epsilon=0.3)

pred_stacker_c = get_stack(
    stacker, base_models_c,
    v_x_train_c, v_y_train_c, v_x_test_c
)

In [52]:
# Registered-user pipeline (same hyper-parameters as the chosen configuration),
# this time run on the v_* train/test frames. No score is computed here.
etree_r = ExtraTreesRegressor(n_estimators=300, max_features='log2')
rf_r = RandomForestRegressor(n_estimators=500, max_features='log2')
ada_r = AdaBoostRegressor(n_estimators=50, learning_rate=0.2)
svr_r = SVR(kernel='linear', C=0.01, epsilon=0.3)

base_models_r = [
    etree_r,
    rf_r,
    ada_r,
    svr_r,
]
stacker_r = SVR(kernel='linear', C=0.01, epsilon=0.3)

pred_stacker_r = get_stack(
    stacker_r, base_models_r,
    v_x_train_r, v_y_train_r, v_x_test_r
)

In [53]:
# Build and write the Kaggle submission from the casual and registered
# stacker predictions (leaderboard score recorded for the original: 0.42612).
#
# Both predictions are exponentiated and summed; the "- 2" removes one unit
# per user group.
# NOTE(review): presumably the targets were stored as log(x + 1) -- confirm
# against the preprocessing cell.
#
# Fix: the back-transformed sum drops below zero whenever
# exp(pred_c) + exp(pred_r) < 2. A rental count cannot be negative, and under
# RMSLE a negative prediction is never closer to a non-negative truth than 0,
# so the total is floored at 0.
# The column order is also stated explicitly, so the file is always written as
# "datetime,count" regardless of how the dict keys are ordered.
df = pd.DataFrame(
    {
        'datetime': test.datetime,
        'count': np.maximum(np.exp(pred_stacker_c) + np.exp(pred_stacker_r) - 2, 0),
    },
    columns = ['datetime', 'count']
)
df.to_csv('submission/python_stacker_svr.csv', index = False, date_format = "%Y-%m-%d %H:%M:%S")