In [None]:
# Run this cell to mount your Google Drive.
#from google.colab import drive
#drive.mount('/content/drive')

### Hourly Prediction of bike rental counts using linear regression

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import category_encoders as ce

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso 
from sklearn.svm import SVR

from sklearn import model_selection
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import TimeSeriesSplit
import numpy as np 
from numpy import sqrt

%matplotlib inline 

### Load Dataset

In [2]:
hour_data = pd.read_csv('/Users/tanchaud/CAB_Berlin/Module_1/Project_3/Bike-Sharing-Dataset/hour.csv')

### Data Wrangling

In [3]:
df_hour = hour_data.copy()
df_hour.head(3)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32


In [4]:
# Renaming columns
print(df_hour.shape)
df_hour = df_hour.rename(columns={'dteday':'Date','season':'Season' ,'yr':'Year', 'mnth': 'Month', 
            'hr':'Hour','holiday':'Holiday', 'weekday':'Day_of_the_Week','workingday': 'Working_Day',
            'weathersit':'Weather_Situation','temp':'Temperature',
            'atemp':'Feels_like','hum':'Humidity','windspeed': 'Wind_Speed',
            'casual':'Casual_Users','registered':'Registered_Users','cnt':'Total_Users'})

(17379, 17)


In [5]:
# Dropping columns
df_hour = df_hour.drop(columns=['instant'])

In [6]:
# Type Conversion of features 

df_hour['Date'] = pd.to_datetime(df_hour['Date'])

In [7]:
# Missing values 
df_hour.isna().sum()

Date                 0
Season               0
Year                 0
Month                0
Hour                 0
Holiday              0
Day_of_the_Week      0
Working_Day          0
Weather_Situation    0
Temperature          0
Feels_like           0
Humidity             0
Wind_Speed           0
Casual_Users         0
Registered_Users     0
Total_Users          0
dtype: int64

In [8]:
# Drop duplicates
df_hour = df_hour.drop_duplicates()

In [9]:
# Check datatypes
df_hour.dtypes

Date                 datetime64[ns]
Season                        int64
Year                          int64
Month                         int64
Hour                          int64
Holiday                       int64
Day_of_the_Week               int64
Working_Day                   int64
Weather_Situation             int64
Temperature                 float64
Feels_like                  float64
Humidity                    float64
Wind_Speed                  float64
Casual_Users                  int64
Registered_Users              int64
Total_Users                   int64
dtype: object

In [10]:
df_hour.shape

(17379, 16)

In [11]:
df_num = df_hour.select_dtypes(include='float64')

In [12]:
# Inverse Normalisation of Numeric variables

df_hour['Temperature (°C)'] = (df_hour.Temperature * 47) - 8
df_hour['Feels_like (°C)'] = (df_hour.Temperature * 50) - 16
df_hour['Humidity (%)'] = df_hour.Humidity * 100
df_hour['Wind_Speed (km/hr)'] = df_hour.Wind_Speed * 67

In [13]:
x = df_hour.groupby(by=['Month']).count()
x.tail(10)

Unnamed: 0_level_0,Date,Season,Year,Hour,Holiday,Day_of_the_Week,Working_Day,Weather_Situation,Temperature,Feels_like,Humidity,Wind_Speed,Casual_Users,Registered_Users,Total_Users,Temperature (°C),Feels_like (°C),Humidity (%),Wind_Speed (km/hr)
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3,1473,1473,1473,1473,1473,1473,1473,1473,1473,1473,1473,1473,1473,1473,1473,1473,1473,1473,1473
4,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437
5,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488
6,1440,1440,1440,1440,1440,1440,1440,1440,1440,1440,1440,1440,1440,1440,1440,1440,1440,1440,1440
7,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488,1488
8,1475,1475,1475,1475,1475,1475,1475,1475,1475,1475,1475,1475,1475,1475,1475,1475,1475,1475,1475
9,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437
10,1451,1451,1451,1451,1451,1451,1451,1451,1451,1451,1451,1451,1451,1451,1451,1451,1451,1451,1451
11,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437
12,1483,1483,1483,1483,1483,1483,1483,1483,1483,1483,1483,1483,1483,1483,1483,1483,1483,1483,1483


In [14]:
df_hour['Registered_Users'].corr(df_hour['Casual_Users'])

0.5066177004294636

In [15]:
df_hour['Registered_Users'].corr(df_hour['Total_Users'])

0.9721507308642993

In [None]:
# Labelling features for EDA

df_hour.loc[df_hour['Holiday'] == 0, 'Holiday'] = 'No'
df_hour.loc[df_hour['Holiday'] == 1, 'Holiday'] = 'Yes'

seasons = ['Winter','Spring','Summer','Fall']

for idx in range(len(seasons)):
    
    df_hour.loc[df_hour['Season'] == idx + 1, 'Season'] = seasons[idx] 
    
months = ['Jan','Feb','Mar','Apr','May','June','July','Aug','Sept','Oct','Nov','Dec']

for idx in range(len(months)):
    
    df_hour.loc[df_hour['Month'] == idx + 1, 'Month'] = months[idx]    
    
df_hour.loc[df_hour['Year'] == 0, 'Year'] = 2011
df_hour.loc[df_hour['Year'] == 1, 'Year'] = 2012

df_hour["is_weekend"] = df_hour.Day_of_the_Week.isin([0, 6])
df_hour['is_weekend'] = ['Yes' if val == True else 'No' for val in df_hour['is_weekend']]

days_of_the_week = ['Sun','Mon','Tue','Wed','Thu','Fri','Sat']

for idx in range(len(days_of_the_week)):
    df_hour.loc[df_hour['Day_of_the_Week'] == idx, 'Day_of_the_Week'] = days_of_the_week[idx]
    
df_hour.loc[df_hour['Working_Day'] == 0, 'Working_Day'] = 'No'
df_hour.loc[df_hour['Working_Day'] == 1, 'Working_Day'] = 'Yes'

weather_sit = ['partly cloudy','cloudy','light precipitation', 'heavy precipitation']

for idx in range(len(weather_sit)):
    df_hour.loc[df_hour['Weather_Situation'] == idx + 1, 'Weather_Situation'] = weather_sit[idx]

### Exploratory Data Analysis

In [None]:
df_hour.columns

In [None]:
df_hour['Weather Situation'].unique()

In [None]:
sns.set(rc = {'figure.figsize':(12,8)})
sns.scatterplot(x = df_hour['Hour'], y = df_hour['Casual_Users'])

tick_values = range(len(df_hour.Hour.unique()))
plt.xticks(tick_values)

### ML Regression

##### Preparing data for the model

In [16]:
X = df_hour
X.head(3)

Unnamed: 0,Date,Season,Year,Month,Hour,Holiday,Day_of_the_Week,Working_Day,Weather_Situation,Temperature,Feels_like,Humidity,Wind_Speed,Casual_Users,Registered_Users,Total_Users,Temperature (°C),Feels_like (°C),Humidity (%),Wind_Speed (km/hr)
0,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16,3.28,-4.0,81.0,0.0
1,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40,2.34,-5.0,80.0,0.0
2,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32,2.34,-5.0,80.0,0.0


In [17]:
# Typecasting Date column and set as df index
X['Date'] = pd.to_datetime(X['Date'])
X = X.set_index('Date', inplace=False)

In [20]:
# dropping columns 
X = X.drop(columns=['Year', 'Month', 'Temperature', 'Feels_like',
       'Humidity', 'Wind_Speed', 'Casual_Users','Registered_Users'])

In [21]:
# Typecasting 
cat_cols = ['Season', 'Holiday', 'Day_of_the_Week', 'Working_Day', 'Weather_Situation']

for col in cat_cols:
    X[col] = X[col].astype('category')

In [22]:
y = X['Total_Users']

In [23]:
X = X.drop(columns=['Total_Users'])

In [24]:
X.head(3)

Unnamed: 0_level_0,Season,Hour,Holiday,Day_of_the_Week,Working_Day,Weather_Situation,Temperature (°C),Feels_like (°C),Humidity (%),Wind_Speed (km/hr)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2011-01-01,1,0,0,6,0,1,3.28,-4.0,81.0,0.0
2011-01-01,1,1,0,6,0,1,2.34,-5.0,80.0,0.0
2011-01-01,1,2,0,6,0,1,2.34,-5.0,80.0,0.0


### Train Test Split

In [25]:
test_size = 5214

X_train = X[:-test_size]
X_test = X[-test_size:]

y_train = y[:-test_size]
y_test = y[-test_size:]

### Categorical Data Encoding

In [26]:
# Encoding categorical columns in X
ce_OHE = ce.OneHotEncoder(cols=['Season','Holiday', 'Day_of_the_Week', 'Working_Day', 'Weather_Situation'])

ce_OHE.fit(X_train)
X_train = ce_OHE.transform(X_train)
X_test = ce_OHE.transform(X_test)

### Feature Scaling 

In [28]:
# fit scaler on training data
#norm = MinMaxScaler().fit(X_train)
norm = StandardScaler().fit(X_train)

# transform training data
X_train_norm = norm.transform(X_train)

# transform testing data
X_test_norm = norm.transform(X_test)

X_train = X_train_norm
X_test = X_test_norm

### Target Variable

In [None]:
#### ENCODING OF TARGET VARIABLE

#### FEATURE SCALING OF TARGET VARIABLE 


### Model Selection: Cross Validation with Time Series Split

In [29]:
y = y_train
X = X_train
tscv = TimeSeriesSplit()
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)


In [30]:
rmse = []
names = []

In [31]:
# prepare models
models = []
models.append(('OLS', LinearRegression()))
models.append(('Ridge', Ridge()))
models.append(('Lasso',Lasso()))
models.append(('SVR_linear',SVR(kernel="linear")))

In [None]:
results = np.zeros(shape=(5,5))

fig, axes = plt.subplots(5,5)
fig.set_size_inches(10,10)

for imodel, (model_name, regressor) in enumerate(models):
  for isplit, (train_index, test_index) in enumerate(tscv.split(X)):
    fig.show()
    print(model_name, "TRAIN:", train_index[:5],'...', train_index[-5:], "TEST:", test_index[:5], '...', test_index[-5:])
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit Model
    regressor.fit(X_train, y_train)

    # Predict with Model
    y_pred = regressor.predict(X_test)

    axes[imodel, isplit].scatter( y_test, y_pred, label=f'{model_name}: split #{isplit + 1}')
    

    # Evaluate Model
    results[imodel, isplit] = sqrt(mean_squared_error(y_test,y_pred))

    # Cross validation scores 
    # results[imodel, isplit] = np.mean(rmse)

  # Model Names
  names.append(model_name)

In [None]:
# Visualise CV results
fig = plt.figure()
fig.suptitle('Regression Algorithm Comparison using cross validation results')
ax = fig.add_subplot(111)
# ax.plot(results, label=names)
for model, rmse_series in zip(names, results):
  # if model in ['SVR_rbf', 'SVR_poly']:
  #   continue
  ax.plot(rmse_series, label=model,)

ax.set_ylabel('RMSE score')
ax.legend()
ax.set_yscale('log')
ax.set_xticks(ticks=range(5),)
plt.show()

### Prediction with chosen model

In [41]:
regressor = LinearRegression()

# Fit chosen regression model to the Training set
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Evaluate Prediction Quality 
# Root mean squared error 
rmse_model = mean_squared_error(y_test, y_pred, squared=False)
print('Root mean squared error: ', rmse_model)

# The mean squared error
mse_model = mean_squared_error(y_test,y_pred)
print("Mean squared error: ", mse_model)

Root mean squared error:  199.6056027522907
Mean squared error:  39842.39665010528


In [None]:
regressor.coef_