## Importing required libraries

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from pandas_profiling import ProfileReport
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [38]:
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

import scikitplot as skplt 

In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

## Loading Datasets

In [40]:
df = pd.read_csv('data/hour_data_final.csv')

In [41]:
df.head()

Unnamed: 0,temp,humidity,windspeed,total,day,season_2,season_3,season_4,year_1,hour_1,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_1,weather_2,weather_3,weather_4
0,0.24,0.81,0.194,16,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0.22,0.8,0.194,40,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0.22,0.8,0.194,32,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0.24,0.75,0.194,13,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0.24,0.75,0.194,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [42]:
df.shape

(17377, 43)

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17377 entries, 0 to 17376
Data columns (total 43 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   temp          17377 non-null  float64
 1   humidity      17377 non-null  float64
 2   windspeed     17377 non-null  float64
 3   total         17377 non-null  int64  
 4   day           17377 non-null  int64  
 5   season_2      17377 non-null  int64  
 6   season_3      17377 non-null  int64  
 7   season_4      17377 non-null  int64  
 8   year_1        17377 non-null  int64  
 9   hour_1        17377 non-null  int64  
 10  hour_2        17377 non-null  int64  
 11  hour_3        17377 non-null  int64  
 12  hour_4        17377 non-null  int64  
 13  hour_5        17377 non-null  int64  
 14  hour_6        17377 non-null  int64  
 15  hour_7        17377 non-null  int64  
 16  hour_8        17377 non-null  int64  
 17  hour_9        17377 non-null  int64  
 18  hour_10       17377 non-nu

## Prepare set of independent variables and the dependent variable.

In [44]:
X = df.drop('total', axis = 1).values
y = df['total']

## Split the data into Train and Test sets

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((12163, 42), (12163,), (5214, 42), (5214,))

## Implementing the model 

### 1. Linear Regression:

#### Hyper Parameter Estimation:

In [46]:
lin_reg = GridSearchCV(LinearRegression(),{'fit_intercept':['True','False'],
    'n_jobs': [1,2,3,4,5,6,7,8,9,10],
    'normalize':['True','False'],
    'copy_X':['True','False'],
    'positive':['True','False']},
    cv=3)

lin_reg.fit(X_train,y_train)
print(lin_reg.best_params_)

{'copy_X': 'True', 'fit_intercept': 'True', 'n_jobs': 1, 'normalize': 'True', 'positive': 'True'}


#### Run the model:

In [47]:
lin_reg = LinearRegression(copy_X = True, normalize= True, positive = True,n_jobs = 1, fit_intercept=True)
lin_reg.fit(X_train,y_train)

LinearRegression(n_jobs=1, normalize=True, positive=True)

#### Performance of the model:

In [48]:
y_pred = lin_reg.predict(X_test)


In [49]:
cross_vall = cross_val_score(lin_reg,X,y, cv = 3)
cross_validation = cross_vall.mean()
mean_squared = metrics.mean_squared_error(y_test,y_pred)
mean_r2 = metrics.r2_score(y_test,y_pred)

In [50]:
print('Cross Validation Score: ', cross_validation)
print('Root Mean Squared error: ', np.sqrt(mean_squared))
print('R-Square score: ', mean_r2)

Cross Validation Score:  0.5717623010735478
Root Mean Squared error:  103.77901355983724
R-Square score:  0.6536311348943649


### 2. Random Forrest Regressor:

#### Hyper Parameter Estimation:

In [51]:
ranf_reg = GridSearchCV(RandomForestRegressor(),
    {'n_estimators':[10,100],
    'criterion': ["mse","mae"],
    'max_features':['auto','sqrt',"log2"]},
    cv=3)

ranf_reg.fit(X_train,y_train)
print(ranf_reg.best_params_)

{'criterion': 'mse', 'max_features': 'auto', 'n_estimators': 100}


#### Run the model:

In [61]:
ranf_reg = RandomForestRegressor(n_estimators = 100, criterion= 'mse', max_features = 'auto')
ranf_reg.fit(X_train,y_train)

RandomForestRegressor()

#### Performance of the model:

In [62]:
y_pred = ranf_reg.predict(X_test)

In [63]:
cross_vall = cross_val_score(ranf_reg,X,y, cv = 3)
cross_validation = cross_vall.mean()
mean_squared = metrics.mean_squared_error(y_test,y_pred)
mean_r2 = metrics.r2_score(y_test,y_pred)

In [64]:
print('Cross Validation Score: ', cross_validation)
print('Root Mean Squared error: ', np.sqrt(mean_squared))
print('R-Square score: ', mean_r2)

Cross Validation Score:  0.7806267073665455
Root Mean Squared error:  54.03905449169824
R-Square score:  0.9060849371849957


### 3. Extra Trees Regressor:

In [56]:
etree_reg = GridSearchCV(ExtraTreesRegressor(),
    {'n_estimators':[10,100],
    'criterion': ["mse","mae"],
    'max_features':['auto','sqrt',"log2"]},
    cv=3)

etree_reg.fit(X_train,y_train)
print(etree_reg.best_params_)

{'criterion': 'mse', 'max_features': 'auto', 'n_estimators': 100}


#### Run the model:

In [57]:
etree_reg = ExtraTreesRegressor(n_estimators = 100, criterion= 'mse', max_features = 'auto')
etree_reg.fit(X_train,y_train)

ExtraTreesRegressor()

#### Performance of the model:

In [58]:
y_pred = etree_reg.predict(X_test)

In [59]:
cross_vall = cross_val_score(etree_reg,X,y, cv = 3)
cross_validation = cross_vall.mean()
mean_squared = metrics.mean_squared_error(y_test,y_pred)
mean_r2 = metrics.r2_score(y_test,y_pred)

In [60]:
print('Cross Validation Score: ', cross_validation)
print('Root Mean Squared error: ', np.sqrt(mean_squared))
print('R-Square score: ', mean_r2)

Cross Validation Score:  0.8251720921797437
Root Mean Squared error:  49.493815983591006
R-Square score:  0.9212189710877905


### Comparision of the models 

Here I've used three different models .
- Linear Regression  
- Random Forrest Regression
- Extra Trees Regression 

From these three models the best performing scores are from the "Extra Trees Regression". 

So we will use this model to predict the demand of 'Bike Sharing Users' in our dataset.