# Prediction of bike rental counts with a regression model

In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
from numpy import sqrt
import pandas as pd
import seaborn as sns
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVR

### Load Dataset

In [3]:
# Path to the hourly bike-sharing CSV. The default is the original author's
# local location; set the BIKE_SHARING_HOUR_CSV environment variable to point
# at the file on another machine instead of editing this cell.
HOUR_CSV_PATH = os.environ.get(
    'BIKE_SHARING_HOUR_CSV',
    '/Users/tanchaud/CAB_Berlin/Module_1/Project_3/Bike-Sharing-Dataset/hour.csv',
)
hour_data = pd.read_csv(HOUR_CSV_PATH)

### Data 

In [4]:
# Work on an independent (deep) copy so the raw frame loaded above stays untouched.
df = hour_data.copy(deep=True)

In [5]:
# Map the terse raw column names to readable ones.
readable_names = {
    'dteday': 'Date',
    'season': 'Season',
    'yr': 'Year',
    'mnth': 'Month',
    'hr': 'Hour',
    'holiday': 'Holiday',
    'weekday': 'Day_of_the_Week',
    'workingday': 'Working_Day',
    'weathersit': 'Weather_Situation',
    'temp': 'Temperature',
    'atemp': 'Feels_like',
    'hum': 'Humidity',
    'windspeed': 'Wind_Speed',
    'casual': 'Casual_Users',
    'registered': 'Registered_Users',
    'cnt': 'Total_Users',
}

# Columns removed after renaming: the 'instant' column and the two user-count
# columns other than Total_Users.
unused_columns = ['instant', 'Casual_Users', 'Registered_Users']

df = df.rename(columns=readable_names).drop(columns=unused_columns)

In [6]:
# Parse the date strings into datetimes and use them as the row index.
df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')

In [7]:
# Preview the first three rows of the prepared frame.
df.head(n=3)

Unnamed: 0_level_0,Season,Year,Month,Hour,Holiday,Day_of_the_Week,Working_Day,Weather_Situation,Temperature,Feels_like,Humidity,Wind_Speed,Total_Users
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40
2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32


In [8]:
# (rows, columns) of the prepared frame.
df.shape

(17379, 13)

### Train Test Split

In [10]:
# Target: the total rental count per row.
y = np.array(df['Total_Users'])

# Features: every column EXCEPT the target. Converting `df` itself (as the
# previous version did) would put Total_Users into X and leak the answer to
# every model.
X = np.array(df.drop(columns=['Total_Users']))
assert X.shape == (len(df), df.shape[1] - 1), "target column must not be part of X"

# Splitter whose folds train on earlier rows and test on later ones; this
# presumes the rows are in chronological order -- TODO confirm.
tscv = TimeSeriesSplit()
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)


### Model Selection: Cross Validation with Time Series Split

In [5]:
# Empty accumulators for the cross-validation run: RMSE values, model names
# and per-model results.
rmse, names, results = [], [], []

In [3]:
# prepare models
models = []
models.append(('OLS', LinearRegression()))
models.append(('Ridge', Ridge()))
models.append(('SVR_rbf',SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.1)))
models.append(('SVR_linear',SVR(kernel="linear", C=100, gamma="auto")))
models.append(('SVR_poly',SVR(kernel="poly", C=100, gamma="auto", degree=3, epsilon=0.1, coef0=1)))

In [None]:
# Start from empty accumulators so re-running this cell never duplicates entries.
#   names   -- one label per model
#   results -- one list of per-fold RMSE values per model (input to the boxplot)
#   rmse    -- one mean RMSE per model
rmse, names, results = [], [], []

for name, regressor in models:

    # RMSE of this model on each held-out fold.
    fold_rmse = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit Model on this fold's training window
        regressor.fit(X_train, y_train)

        # Predict with Model on the held-out window
        y_pred = regressor.predict(X_test)

        # Evaluate Model: score every fold, not just the last one
        fold_rmse.append(sqrt(mean_squared_error(y_test, y_pred)))

    # Cross validation scores
    results.append(fold_rmse)
    rmse.append(np.mean(fold_rmse))

    # Model Names
    names.append(name)

    print(f"{name}: mean RMSE = {rmse[-1]:.2f} over {len(fold_rmse)} folds")

# Visualise CV results: one box per entry of `results`, labelled with `names`.
fig, ax = plt.subplots()
fig.suptitle('Regression Algorithm Comparison using cross validation results')
ax.boxplot(results)
ax.set_ylabel('cross validation score')
ax.set_xticklabels(names)
plt.show()