# Block 6 Exercise 2: finding the best parameters for predicting the fare of taxi rides
We return to our Random Forest Regression and want to automatically optimize all free parameters ...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
# Load the cleaned training data saved after the wrangling / pre-processing of block I.
X = pd.read_csv('../../DATA/train_cleaned.csv')
# Columns that are removed before the feature matrix is built.
drop_columns = [
    'Unnamed: 0',
    'Unnamed: 0.1',
    'Unnamed: 0.1.1',
    'key',
    'pickup_datetime',
    'pickup_date',
    'pickup_latitude_round3',
    'pickup_longitude_round3',
    'dropoff_latitude_round3',
    'dropoff_longitude_round3',
]
X = X.drop(columns=drop_columns)
X = pd.get_dummies(X)  # one-hot encoding
# Split the regression target (the fare) off from the features.
y = X['fare_amount']
X = X.drop(columns=['fare_amount'])

### Scikit Optimize
Scikit Optimize (https://scikit-optimize.github.io/stable/index.html) is an AutoML toolbox wrapped around Scikit-Learn. It allows us to use state-of-the-art automatic hyper-parameter optimization on top of our learning algorithms.



In [3]:
# install 
!pip install scikit-optimize



### E 2.1 Bayesian Optimization of a Random Forest Regression Model
Use Bayesian Optimization with Cross-Validation (https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV) to find the best regression model. Compare:
* linear regression (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression) 
* Random Forest regression (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
* and SVM regression (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)

NOTES: this can become quite compute intensive! Hence,
* use a smaller subset of the training data to run the experiments 
* think about the range of your parameters (e.g. a larger number of trees in RF or high C-values in SVM will make models expensive)
* optimize only the following parameters per model type:
    * linear: no parameters to optimize
    * RF: #trees and depth
    * SVM: C and gamma (use RBF kernel)
* parallelize -> n_jobs
* use Colab to run the job for up to 12h


In [4]:
# Work on the first 20000 rows only, so the hyper-parameter searches stay affordable.
X_s, y_s = X.iloc[:20000], y.iloc[:20000]

#### Linear

In [5]:
from sklearn.model_selection import train_test_split

# Hold out a test split from the subset; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_s,
    y_s,
    random_state=0,
)

In [15]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train, y_train)
%store lr

Stored 'lr' (LinearRegression)


In [16]:
%store -r lr
print("test score: %s" % lr.score(X_test, y_test))

test score: 0.5005780148025794


#### RF

In [17]:
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestRegressor
opt_rf = BayesSearchCV(
    RandomForestRegressor(random_state=0),
    search_spaces={ "n_estimators": (100,1000), "max_depth": (2,100)},
    random_state=0,
    n_iter=32,
    n_jobs=6)
_ = opt_rf.fit(X_train, y_train)
%store opt_rf

Stored 'opt_rf' (BayesSearchCV)


In [18]:
%store -r opt_rf
print("val. score: %s" % opt_rf.best_score_)
print("test score: %s" % opt_rf.score(X_test, y_test))
print("best params: %s" % str(opt_rf.best_params_))

val. score: 0.808777436006692
test score: 0.806701544600705
best params: OrderedDict([('max_depth', 84), ('n_estimators', 992)])


In [4]:
import pandas as pd
%store -r opt_rf
rf_df = pd.DataFrame.from_dict(opt_rf.cv_results_)
rf_df.sort_values('rank_test_score')[['rank_test_score', 'param_n_estimators', 'param_max_depth','mean_test_score']]

Unnamed: 0,rank_test_score,param_n_estimators,param_max_depth,mean_test_score
31,1,992,71,0.808777
5,1,992,84,0.808777
30,3,986,67,0.808761
28,4,999,89,0.808757
12,4,999,65,0.808757
18,6,1000,74,0.808751
10,6,1000,98,0.808751
11,8,894,92,0.808737
8,9,907,11,0.808708
20,10,535,92,0.808665


#### SVR

In [46]:
from skopt import BayesSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

pip = make_pipeline(StandardScaler(), SVR(kernel='rbf'))
opt_svr = BayesSearchCV(
    pip,
    search_spaces={ "svr__C": (1e-3, 1e+2), "svr__gamma": (1e-6, 1)},
    random_state=0,
    n_iter=32,
    n_jobs=6)
_ = opt_svr.fit(X_train, y_train)
%store opt_svr

Stored 'opt_svr' (BayesSearchCV)


In [47]:
%store -r opt_svr
print("val. score: %s" % opt_svr.best_score_)
print("test score: %s" % opt_svr.score(X_test, y_test))
print("best params: %s" % str(opt_svr.best_params_))

val. score: 0.7885137064652358
test score: 0.8076653418392263
best params: OrderedDict([('svr__C', 42.713392298939276), ('svr__gamma', 0.06310535770609077)])


In [3]:
import pandas as pd
%store -r opt_svr
svr_df = pd.DataFrame.from_dict(opt_svr.cv_results_)
svr_df.sort_values('rank_test_score')[['rank_test_score', 'param_svr__C', 'param_svr__gamma','mean_test_score']]

Unnamed: 0,rank_test_score,param_svr__C,param_svr__gamma,mean_test_score
12,1,42.713392,0.063105,0.788514
2,2,52.869646,0.057899,0.788212
20,3,22.403272,0.082771,0.786663
26,4,8.315085,0.081545,0.785234
17,5,18.3274,0.109188,0.78135
13,6,92.948632,0.06662,0.78102
19,7,88.509578,0.084767,0.779066
25,8,98.807952,0.085422,0.777824
14,9,36.6241,0.215107,0.753811
16,10,100.0,0.178836,0.751607
