In [44]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression,BayesianRidge 

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from joblib import dump, load

from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.style import set_palette
# Select yellowbrick's 'sns_pastel' colour palette once, right after the imports.
set_palette('sns_pastel')

In [45]:
# Restore the persisted regressor object (presumably a RandomForestRegressor
# saved by an earlier notebook -- verify against whatever produced the file).
# NOTE(review): joblib and pandas pickles execute arbitrary code on load;
# only open these files if they come from a trusted source.
rf_regr = load('rf_regr.joblib')

# Read the bike-share frame, move its index into regular column(s) and
# discard every row that contains a missing value.
df = (
    pd.read_pickle('bikeshare.pkl')
    .reset_index()
    .dropna()
)

In [46]:
# Predictor columns used by the model: hour of day plus eight weather readings.
feature_cols = [
    'Hour',
    'Temperature(°C)',
    'Humidity(%)',
    'Wind speed (m/s)',
    'Visibility (10m)',
    'Dew point temperature(°C)',
    'Solar Radiation (MJ/m2)',
    'Rainfall(mm)',
    'Snowfall (cm)',
]

# Feature matrix and regression target (number of rented bikes).
X = df[feature_cols]
y = df['Rented Bike Count']

In [47]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [48]:
# Candidate hyper-parameters for the grid search:
# 3 forest sizes x 3 depth limits = 9 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 5, 10],
}

# Grow the trees with the mean-absolute-error split criterion.
# scikit-learn 1.0 renamed this criterion from 'mae' to 'absolute_error' and
# 1.2 removed the old spelling, so pick the name the installed version accepts.
# Without this the search fails at fit time on current scikit-learn releases.
import sklearn

_sk_major = int(sklearn.__version__.split('.')[0])
mae_criterion = 'absolute_error' if _sk_major >= 1 else 'mae'

# set_params is the estimator's supported way to change a hyper-parameter and
# rejects a misspelled parameter name instead of silently adding an attribute.
rf_regr.set_params(criterion=mae_criterion)

In [49]:
# Exhaustive search over `param_grid`, scored by R^2 with 10-fold cross-validation.
grid_rf_regr = GridSearchCV(
    estimator=rf_regr,
    param_grid=param_grid,
    scoring='r2',
    cv=10,
    n_jobs=-1,                # use every available core
    refit=True,               # retrain the best candidate on the full training set
    return_train_score=True,  # keep train scores so the train/test gap can be inspected
)

In [None]:
# Run the search on the training split only: 9 candidates x 10 folds = 90 fits,
# plus one final refit of the best candidate.
# NOTE(review): this cell carries no execution count in the saved notebook while
# the cells after it show results -- re-run top to bottom before trusting them.
grid_rf_regr.fit(X=X_train, y=y_train)

In [20]:
# Tabulate the search results: one row per hyper-parameter combination.
cv_results_df = pd.DataFrame(data=grid_rf_regr.cv_results_)

# Sanity check on the table size (rows = candidates, columns = recorded fields).
n_candidates, n_fields = cv_results_df.shape
print((n_candidates, n_fields))

(9, 32)


In [22]:
# List the recorded fields: timings, the parameters tried, and per-split plus
# aggregated (mean/std) train and test scores.
cv_results_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_max_depth', 'param_n_estimators', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'split5_test_score', 'split6_test_score',
       'split7_test_score', 'split8_test_score', 'split9_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score',
       'split0_train_score', 'split1_train_score', 'split2_train_score',
       'split3_train_score', 'split4_train_score', 'split5_train_score',
       'split6_train_score', 'split7_train_score', 'split8_train_score',
       'split9_train_score', 'mean_train_score', 'std_train_score'],
      dtype='object')

In [31]:
# Keep the five candidates with the best (numerically lowest) test-score rank.
top_five = cv_results_df.nsmallest(n=5, columns='rank_test_score')

In [32]:
# Narrow the table to the rank, the parameters and the aggregated train/test scores.
summary_cols = [
    'rank_test_score',
    'params',
    'mean_test_score',
    'std_test_score',
    'mean_train_score',
    'std_train_score',
]
top_five = top_five[summary_cols]

In [40]:
# Generalisation gap per candidate: mean train score minus mean cross-validated
# test score (both are R^2, the metric the search was scored with). A larger
# gap means the candidate fits the training folds much better than unseen folds.
# .assign() returns a new frame rather than writing a column into a frame that
# was itself produced by column-subsetting, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
top_five = top_five.assign(
    gen_error=top_five['mean_train_score'] - top_five['mean_test_score']
)

In [41]:
# Display the five best-ranked candidates together with their train/test gap.
# In the saved output the three max_depth=10 settings rank highest (mean test
# score ~0.76) but show a gap of ~0.12, while the max_depth=5 settings score
# ~0.69 with a gap of only ~0.02.
top_five

Unnamed: 0,rank_test_score,params,mean_test_score,std_test_score,mean_train_score,std_train_score,gen_error
8,1,"{'max_depth': 10, 'n_estimators': 200}",0.764567,0.019233,0.88076,0.001966,0.116193
7,2,"{'max_depth': 10, 'n_estimators': 100}",0.76357,0.019582,0.880543,0.00189,0.116973
6,3,"{'max_depth': 10, 'n_estimators': 50}",0.761735,0.018803,0.879427,0.002463,0.117692
5,4,"{'max_depth': 5, 'n_estimators': 200}",0.691393,0.022419,0.711773,0.003687,0.020379
4,5,"{'max_depth': 5, 'n_estimators': 100}",0.689573,0.02234,0.710685,0.004,0.021112
