In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
from matplotlib import style
style.use('ggplot')

# NOTE(review): this filter suppresses every warning for the rest of the
# session, including deprecation notices raised by pandas or scikit-learn.
# Consider narrowing it to specific warning categories.
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Read the emissions CSV from the working directory, then preview the
# first ten rows to check the columns that were parsed.
data = pd.read_csv('CO2_Emissions_Canada.csv')
data.head(n=10)


Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244
5,ACURA,RLX,MID-SIZE,3.5,6,AS6,Z,11.9,7.7,10.0,28,230
6,ACURA,TL,MID-SIZE,3.5,6,AS6,Z,11.8,8.1,10.1,28,232
7,ACURA,TL AWD,MID-SIZE,3.7,6,AS6,Z,12.8,9.0,11.1,25,255
8,ACURA,TL AWD,MID-SIZE,3.7,6,M6,Z,13.4,9.5,11.6,24,267
9,ACURA,TSX,COMPACT,2.4,4,AS5,Z,10.6,7.5,9.2,31,212


In [3]:
# Remove fully duplicated rows. The result is reassigned instead of mutating
# the frame with inplace=True, which keeps the step explicit and chainable.
data = data.drop_duplicates()
# Sanity check: count of duplicate rows still present (expected to be 0).
data.duplicated().sum()


0

In [4]:
# Predictor columns used by the model.
cols = ['Vehicle Class', 'Transmission','Fuel Type', 'Cylinders', 'Engine Size(L)', 'Fuel Consumption Comb (L/100 km)']
# Take an explicit copy: later cells assign transformed values back into X.
# Writing into a plain slice of `data` raises SettingWithCopyWarning and has
# ambiguous view-versus-copy semantics.
X = data[cols].copy()
# Target kept as a single-column DataFrame (double brackets), not a Series.
Y = data[['CO2 Emissions(g/km)']]
X.head()


Unnamed: 0,Vehicle Class,Transmission,Fuel Type,Cylinders,Engine Size(L),Fuel Consumption Comb (L/100 km)
0,COMPACT,AS5,Z,4,2.0,8.5
1,COMPACT,M6,Z,4,2.4,9.6
2,COMPACT,AV7,Z,4,1.5,5.9
3,SUV - SMALL,AS6,Z,6,3.5,11.1
4,SUV - SMALL,AS6,Z,6,3.5,10.6


In [5]:
from sklearn.preprocessing import OrdinalEncoder

# Replace each category of the string-valued columns with a numeric code.
# NOTE(review): the encoder is fitted on the full dataset before the
# train/test split, and ordinal codes put an order on these categories.
# Confirm both are intended for a linear model.
oc = OrdinalEncoder()
cols = ['Vehicle Class', 'Transmission', 'Fuel Type']
encoded_values = oc.fit_transform(X[cols])
X[cols] = encoded_values


In [6]:
from sklearn.preprocessing import StandardScaler

# Standardise the two continuous predictors.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test rows influence the scaling statistics.
# Confirm this is acceptable.
sc = StandardScaler()
cols = ['Engine Size(L)', 'Fuel Consumption Comb (L/100 km)']
scaled_values = sc.fit_transform(X[cols])
X[cols] = scaled_values


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=2,
)


In [8]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split.
reg = LinearRegression()
reg.fit(X=x_train, y=y_train)


In [9]:
# Predictions for the held-out rows; every metric below compares these
# against y_test.
y_pred = reg.predict(X=x_test)


# -------------------------------------------------------------
# Evaluation Metrics


## Why do we use mean squared error, rather than mean absolute error, as the cost function?

In [10]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the test-set predictions.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

12.157598046846717

In [11]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the test-set predictions.
mean_squared_error(y_true=y_test, y_pred=y_pred)

322.93924233965765

## Root mean squared error

- The square root of the mean squared error, which expresses the error in the same units as the target (g/km).

In [12]:
from sklearn.metrics import mean_squared_error

# RMSE computed as the square root of the MSE. The `squared=False` argument
# used here before is deprecated since scikit-learn 1.4 and rejected by later
# releases, while np.sqrt works on every version. With a single target column
# both formulations give the same value.
np.sqrt(mean_squared_error(y_test, y_pred))

17.970510352787915

In [13]:
# Squaring the RMSE recovers the MSE (up to floating-point rounding).
# np.sqrt replaces the `squared=False` argument, which is deprecated since
# scikit-learn 1.4 and rejected by later releases.
np.sqrt(mean_squared_error(y_test, y_pred)) ** 2

322.93924233965765

## R-squared score
## $R^2 = 1 - \dfrac{SS_{\text{res}}}{SS_{\text{tot}}}$
## $R^2 = 1 - \dfrac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$

In [14]:
from sklearn.metrics import r2_score

# Coefficient of determination of the test-set predictions.
r2_score(y_true=y_test, y_pred=y_pred)

0.901167694753238

### The estimator's default `score` method also returns the R² score

In [15]:
# Report the estimator's own score on the training rows and on the
# held-out rows, in that order.
for label, features, target in (
    ('Score on training data : ', x_train, y_train),
    ('Score on testing  data : ', x_test, y_test),
):
    print(label, reg.score(features, target))


Score on training data :  0.9032624598836975
Score on testing  data :  0.901167694753238


# K-Fold cross validation

In [16]:
from sklearn.model_selection import cross_val_score
# Score the model with 5-fold cross-validation over the full feature/target
# frames; one score per fold is returned.
cross_val_score(estimator=reg, X=X, y=Y, cv=5)

array([0.88037087, 0.88758895, 0.90857073, 0.90885784, 0.91051165])

In [17]:
# Average of the five per-fold scores.
fold_scores = cross_val_score(reg, X, Y, cv=5)
fold_scores.mean()

0.8991800070262179

In [None]:
# Mean squared error on the held-out rows (same computation as the earlier
# MSE cell).
mean_squared_error(y_true=y_test, y_pred=y_pred)