## For a given dataset, predict the number of bikes shared based on different parameters

### Data Preprocessing

In [1]:
# importing needed libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn import metrics
import sys
sys.path.append('/home/admin1/PycharmProjects/Machine Learning from scratch/')
from ipynb.fs.full.ml_library import *

# read the bike-sharing CSV into a pandas dataframe, then preview its first five rows
bike_data = pd.read_csv('bike_sharing.csv')
bike_data.head(5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


#### Feature selection & removing outliers

In [2]:
# Feature-column selection and outlier removal were worked out step by step in the
# multi-linear model notebook; only the resulting steps are repeated here.
# drop() is used without inplace=True so the frame is rebuilt rather than mutated.
bike_data = bike_data.drop(columns=['instant','dteday','holiday','atemp','casual','registered'])
# remove_outliers comes from ml_library (star-imported above); presumably it filters
# rows whose 'cnt' value is an outlier -- confirm against that module.
bike_data = remove_outliers(bike_data, ['cnt'])

In [3]:
# show which columns are left in bike_data at this point
bike_data.columns

Index(['season', 'yr', 'mnth', 'hr', 'weekday', 'workingday', 'weathersit',
       'temp', 'hum', 'windspeed', 'cnt'],
      dtype='object')

#### Separating out feature & label columns

In [4]:
# continuous numeric columns and two-category columns are used as they are (no encoding)
x_values = bike_data[['temp','hum','windspeed','yr','workingday']].values
# 'cnt' is the label column
y_values = bike_data['cnt'].values

##### One-hot encoding the categorical columns & appending them to the feature matrix

In [5]:
# one-hot encode the multi-category columns and widen the feature matrix with the result
categorical_cols = ['season','mnth','hr','weekday','weathersit']
one_hot_encode = OneHotEncoder()

categorical_frame = bike_data[categorical_cols]
# the encoder output is densified with toarray() before being joined column-wise
new_columns = one_hot_encode.fit_transform(categorical_frame).toarray()
x_values = np.concatenate((x_values, new_columns), axis=1)

In [6]:
# shape of the feature matrix after the one-hot columns were appended
x_values.shape

(16874, 56)

#### Creating polynomial features

In [7]:
# fit a degree-2 polynomial expander on the feature matrix, then replace the matrix
# with its expanded form (the fitted `poly` object is pickled further below)
poly = PolynomialFeatures(degree=2).fit(x_values)
x_values = poly.transform(x_values)

In [8]:
# shape of the feature matrix after the degree-2 polynomial expansion
x_values.shape

(16874, 1653)

#### Splitting dataset into train set & test set

In [9]:
# 80% of the rows go to the train set; random_state is fixed so the split is repeatable
train_x_values, test_x_values, train_y_values, test_y_values = train_test_split(
    x_values,
    y_values,
    train_size=0.8,
    random_state=10,
)

### Building linear regression model

In [10]:
# create the LinearRegression estimator and fit it on the train split in one step
regressor = LinearRegression().fit(train_x_values, train_y_values)
# echo the fitted estimator so the cell still displays its parameters
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

#### Pickling of required objects & storing it into file

In [11]:
import joblib

# Persist the fitted encoder, polynomial transformer and regression model, in that
# order, into a single file. The with-block closes the handle even if a dump raises,
# which the previous open()/close() pair did not guarantee.
# NOTE(review): a reader presumably has to call joblib.load on one open handle
# three times, in this same order, to get the objects back -- confirm in the consumer.
with open('PolyRegModel2.pkl', 'wb') as file:
    joblib.dump(one_hot_encode, file)
    joblib.dump(poly, file)
    joblib.dump(regressor, file)

#### Storing predictions for train set & test set

In [12]:
# run the fitted model over both splits: train split first, then test split
train_prediction, test_prediction = (
    regressor.predict(split_x) for split_x in (train_x_values, test_x_values)
)

### Calculating errors

In [13]:
# mean absolute error between the test labels and the test predictions
mean_abs_error = metrics.mean_absolute_error(y_true=test_y_values, y_pred=test_prediction)
mean_abs_error

32.205634259259256

In [14]:
# mean squared error between the test labels and the test predictions
mean_sqr_error = metrics.mean_squared_error(y_true=test_y_values, y_pred=test_prediction)
mean_sqr_error

2137.8849543547453

In [15]:
# RMSE: square root of the test-set mean squared error held in mean_sqr_error
root_mean_sqr_error = np.sqrt(mean_sqr_error)
root_mean_sqr_error

46.237268024340985

### Evaluating model against test set

In [16]:
# R^2 score of the test predictions against the test labels
test_r2 = metrics.r2_score(test_y_values, test_prediction)
print(f'r2_score: {test_r2}')

r2_score: 0.9135201355563308
