# Setup

In [2]:
# Data handling and filesystem helpers
import pandas as pd
import numpy as np
import os

# Plotting libraries; the magic below renders matplotlib figures inline
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below -- consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

# Load the data

In [3]:
# Read data/day.csv (relative to the current working directory), convert the
# `dteday` column to datetimes and order the rows chronologically.
df = (
    pd.read_csv(os.getcwd()+"/data/day.csv")
    .assign(dteday=lambda frame: pd.to_datetime(frame["dteday"]))
    .sort_values(by="dteday", ascending=True)
)
# Show the first rows as the cell output
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


# Feature selection

In [26]:
# Target variable: the `cnt` column
y = df["cnt"]

# Predictors: only the important ones (see feature importance plot)
X = df[[
    'mnth', 'weathersit', 'instant', 'season', 'weekday',
    'workingday', 'temp', 'hum', 'windspeed',
]]

# Train-Test-Split

In [27]:
# Time-based split instead of a random one: rows with yr == 1 and mnth > 5
# are held out as test data, every other row is used for training.
# NOTE(review): assuming the data covers two full calendar years, `mnth > 5`
# holds out 7 months (June-December), i.e. 17 months train / 7 months test.
# Use `mnth > 6` if an 18/6 split is what is intended.
test_idx = df[(df["yr"]==1) & (df["mnth"]>5)].index

# `test_idx` contains index *labels*, so the rows must be selected with the
# label-based .loc. The positional .iloc only gives the same rows when the
# labels happen to equal the row positions, which is not guaranteed after
# the frame has been re-sorted.
X_test = X.loc[test_idx]
y_test = y.loc[test_idx]

# .drop is label-based too, so the train and test sets cannot overlap.
X_train = X.drop(test_idx)
y_train = y.drop(test_idx)

(731, 17)

# Make it a pipeline
Reference: https://www.kdnuggets.com/2017/12/managing-machine-learning-workflows-scikit-learn-pipelines-part-1.html

In [67]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone `joblib` package and only fall back to the
# vendored copy on old installations that do not ship it separately.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, Matern, ConstantKernel
from sklearn.metrics import mean_squared_error

def _scaled_pipeline(regressor):
    """Return a two-step pipeline: a StandardScaler ('scl') followed by ``regressor`` ('reg')."""
    return Pipeline([('scl', StandardScaler()), ('reg', regressor)])


# Candidate models; each one is preceded by feature standardisation
pipe_rr = _scaled_pipeline(linear_model.Ridge())
pipe_svr_lin = _scaled_pipeline(svm.SVR(kernel='linear'))
pipe_svr_rbf = _scaled_pipeline(svm.SVR(kernel='rbf'))
pipe_rf = _scaled_pipeline(ensemble.RandomForestRegressor(random_state=42))

# Gaussian process with a linear (dot-product) kernel plus a noise term
pipe_gp1 = _scaled_pipeline(
    GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel(), random_state=42)
)

# Gaussian process with a constant + Matern + noise kernel
pipe_gp2 = _scaled_pipeline(
    GaussianProcessRegressor(
        kernel=ConstantKernel() + Matern(length_scale=2, nu=3/2) + WhiteKernel(noise_level=1),
        random_state=42,
    )
)

# All pipelines in a fixed order so they can be iterated together
pipelines = [pipe_rr, pipe_svr_lin, pipe_svr_rbf, pipe_rf, pipe_gp1, pipe_gp2]

# Display name for each pipeline, keyed by its position in `pipelines`.
# The adjacent string literals in the SVR entries are concatenated, so those
# labels read e.g. SVR(kernel=linear) without inner quotes.
pipe_dict = dict(enumerate([
    'Ridge Regression',
    'SVR(kernel=''linear'')',
    'SVR(kernel=''rbf'')',
    'Random Forest',
    'Gaussian Process 1',
    'Gaussian Process 2',
]))

# Train every candidate pipeline on the training split
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Report the error of each fitted pipeline on the held-out split.
# The number printed is the root mean squared error (lower is better), even
# though the message text calls it "accuracy".
for idx, val in enumerate(pipelines):
    y_pred = val.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    report_args = (pipe_dict[idx], rmse)
    print('%s pipeline test accuracy: %.3f' % report_args)

# Identify the pipeline with the highest score on the test data.
# For scikit-learn regressors `score` returns the R^2 coefficient of
# determination, which can be negative. Starting from -inf (instead of 0.0)
# ensures a winner is still selected when every model scores below zero.
# NOTE(review): choosing the model on the test split makes the reported test
# performance optimistic; a separate validation split would avoid that.
best_acc = float('-inf')
best_reg = 0
best_pipe = None
for idx, val in enumerate(pipelines):
    score = val.score(X_test, y_test)  # evaluate once per pipeline
    if score > best_acc:
        best_acc = score
        best_pipe = val
        best_reg = idx
# The winning index is stored in `best_reg`; the previously referenced
# `best_clf` is never defined in this notebook and raised a NameError.
print('\nRegressor with best accuracy: %s' % pipe_dict[best_reg])

# Save pipeline to file
#joblib.dump(best_pipe, 'best_pipeline.pkl', compress=1)
#print('Saved %s pipeline to file' % pipe_dict[best_reg])

Ridge Regression pipeline test accuracy: 1151.359
SVR(kernel=linear) pipeline test accuracy: 2203.455
SVR(kernel=rbf) pipeline test accuracy: 2703.958
Random Forest pipeline test accuracy: 1117.239
Gaussian Process 1 pipeline test accuracy: 2840.561
Gaussian Process 2 pipeline test accuracy: 2863.304

Regressor with best accuracy: Random Forest
