# Multiple Linear Regression

## Importing the libraries

In [118]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [119]:
Train = pd.read_csv('Train.csv')
Test = pd.read_csv('Test.csv')
VariableDefinitions = pd.read_csv('VariableDefinitions.csv')
SampleSubmission = pd.read_csv('SampleSubmission.csv')
Riders = pd.read_csv('Riders.csv')

In [79]:
Train.head()

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Arrival at Destination - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival
0,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,...,10:39:55 AM,4,20.4,,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745
1,Order_No_25375,User_Id_2285,Bike,3,Personal,12,5,11:16:16 AM,12,5,...,12:17:22 PM,16,26.4,,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993
2,Order_No_1899,User_Id_265,Bike,3,Business,30,2,12:39:25 PM,30,2,...,1:00:38 PM,3,,,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155,455
3,Order_No_9336,User_Id_1402,Bike,3,Business,15,5,9:25:34 AM,15,5,...,10:05:27 AM,9,19.2,,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855,1341
4,Order_No_27883,User_Id_1737,Bike,1,Personal,13,1,9:55:18 AM,13,1,...,10:25:37 AM,9,15.4,,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770,1214


In [113]:
Train['Platform Type'].value_counts()

3    18054
1     2147
2      980
4       20
Name: Platform Type, dtype: int64

In [115]:
(18054)/21000

0.8597142857142858

In [80]:
Train.shape

(21201, 29)

In [81]:
#Select significant columns(or Drop insignificant columns)
df=Train[['Pickup - Weekday (Mo = 1)', 'Distance (KM)', 'Temperature', 'Rider Id', 'Time from Pickup to Arrival']]

In [82]:
X_Train = df.iloc[:,:-1]
Y_Train = df.iloc[:,4:]

In [83]:
X_Train.head()

Unnamed: 0,Pickup - Weekday (Mo = 1),Distance (KM),Temperature,Rider Id
0,5,4,20.4,Rider_Id_432
1,5,16,26.4,Rider_Id_856
2,2,3,,Rider_Id_155
3,5,9,19.2,Rider_Id_855
4,1,9,15.4,Rider_Id_770


In [84]:
X_Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21201 entries, 0 to 21200
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Pickup - Weekday (Mo = 1)  21201 non-null  int64  
 1   Distance (KM)              21201 non-null  int64  
 2   Temperature                16835 non-null  float64
 3   Rider Id                   21201 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 662.7+ KB


In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21201 entries, 0 to 21200
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Pickup - Weekday (Mo = 1)    21201 non-null  int64  
 1   Distance (KM)                21201 non-null  int64  
 2   Temperature                  16835 non-null  float64
 3   Rider Id                     21201 non-null  object 
 4   Time from Pickup to Arrival  21201 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 828.3+ KB


## Encoding categorical data

In [86]:
#Replacing NaN values with mean values in respective coumns
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X=np.array(X_Train)
imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])

In [87]:
X_Train = pd.DataFrame(X, columns = X_Train.columns)
X_Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21201 entries, 0 to 21200
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Pickup - Weekday (Mo = 1)  21201 non-null  object
 1   Distance (KM)              21201 non-null  object
 2   Temperature                21201 non-null  object
 3   Rider Id                   21201 non-null  object
dtypes: object(4)
memory usage: 662.7+ KB


In [88]:
#From somewhat reason ColumnTransformer OneHotEncoder does not work so we can try pandas GetDummies
X_Train_Dummies = pd.get_dummies(X_Train.iloc[:,-1])
X_Train = X_Train.drop('Rider Id', axis = 1)
XTD = pd.concat([X_Train, X_Train_Dummies], axis = 1)

# Again we make sure that all the column names have underscores instead of whitespaces
XTD.columns = [col.replace(" ","_") for col in XTD.columns] 

XTD.head()


Unnamed: 0,Pickup_-_Weekday_(Mo_=_1),Distance_(KM),Temperature,Rider_Id_0,Rider_Id_1,Rider_Id_10,Rider_Id_100,Rider_Id_101,Rider_Id_102,Rider_Id_103,...,Rider_Id_953,Rider_Id_954,Rider_Id_955,Rider_Id_956,Rider_Id_958,Rider_Id_959,Rider_Id_96,Rider_Id_97,Rider_Id_98,Rider_Id_99
0,5,4,20.4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,16,26.4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,3,23.2589,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,9,19.2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,9,15.4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
XTD.shape

(21201, 927)

## Splitting the dataset into the Training set and Test set

In [90]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XTD, Y_Train, test_size = 0.2, random_state = 0)

## Training the Multiple Linear Regression model on the Training set

In [98]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predicting the Test set results

In [107]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
Predict_test = pd.DataFrame(np.concatenate((y_pred, y_test),1), columns = ['Predict', 'Test'])

In [108]:
Predict_test

Unnamed: 0,Predict,Test
0,1409.454271,992.0
1,1476.289183,2400.0
2,2182.750453,3486.0
3,1309.282341,3033.0
4,1778.200725,1567.0
...,...,...
4236,1419.381050,948.0
4237,1051.874611,1.0
4238,-150.816958,173.0
4239,2030.694765,1844.0


In [109]:
from sklearn import metrics

In [117]:
(metrics.mean_squared_error(y_pred, y_test))**0.5

774.514171402951