In [33]:
#import libraries

import pandas as pd
import numpy as np
import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt

from scipy import stats

In [2]:
df = pd.read_csv('./FuelConsumption.csv')
df.head()

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [3]:
df.shape

(1067, 13)

In [4]:
df.isnull().any()

MODELYEAR                   False
MAKE                        False
MODEL                       False
VEHICLECLASS                False
ENGINESIZE                  False
CYLINDERS                   False
TRANSMISSION                False
FUELTYPE                    False
FUELCONSUMPTION_CITY        False
FUELCONSUMPTION_HWY         False
FUELCONSUMPTION_COMB        False
FUELCONSUMPTION_COMB_MPG    False
CO2EMISSIONS                False
dtype: bool

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067 entries, 0 to 1066
Data columns (total 13 columns):
MODELYEAR                   1067 non-null int64
MAKE                        1067 non-null object
MODEL                       1067 non-null object
VEHICLECLASS                1067 non-null object
ENGINESIZE                  1067 non-null float64
CYLINDERS                   1067 non-null int64
TRANSMISSION                1067 non-null object
FUELTYPE                    1067 non-null object
FUELCONSUMPTION_CITY        1067 non-null float64
FUELCONSUMPTION_HWY         1067 non-null float64
FUELCONSUMPTION_COMB        1067 non-null float64
FUELCONSUMPTION_COMB_MPG    1067 non-null int64
CO2EMISSIONS                1067 non-null int64
dtypes: float64(4), int64(4), object(5)
memory usage: 108.4+ KB


In [7]:
#CO2 EMISSIONS correlation with numerical features
df.corr()

Unnamed: 0,MODELYEAR,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
MODELYEAR,,,,,,,,
ENGINESIZE,,1.0,0.934011,0.832225,0.778746,0.819482,-0.808554,0.874154
CYLINDERS,,0.934011,1.0,0.796473,0.724594,0.776788,-0.77043,0.849685
FUELCONSUMPTION_CITY,,0.832225,0.796473,1.0,0.965718,0.995542,-0.935613,0.898039
FUELCONSUMPTION_HWY,,0.778746,0.724594,0.965718,1.0,0.985804,-0.893809,0.861748
FUELCONSUMPTION_COMB,,0.819482,0.776788,0.995542,0.985804,1.0,-0.927965,0.892129
FUELCONSUMPTION_COMB_MPG,,-0.808554,-0.77043,-0.935613,-0.893809,-0.927965,1.0,-0.906394
CO2EMISSIONS,,0.874154,0.849685,0.898039,0.861748,0.892129,-0.906394,1.0


In [8]:
#Reliability of correlation
pearson_coef,p_value = stats.pearsonr(df['ENGINESIZE'],df['CO2EMISSIONS'])

pearson_coef,p_value

(0.8741543683958123, 0.0)

In [9]:
pearson_coef,p_value = stats.pearsonr(df['CYLINDERS'],df['CO2EMISSIONS'])

pearson_coef,p_value

(0.8496845920141143, 2.7709372039888274e-298)

In [10]:
pearson_coef,p_value = stats.pearsonr(df['FUELCONSUMPTION_CITY'],df['CO2EMISSIONS'])

pearson_coef,p_value

(0.8980385119353926, 0.0)

In [11]:
pearson_coef,p_value = stats.pearsonr(df['FUELCONSUMPTION_HWY'],df['CO2EMISSIONS'])

pearson_coef,p_value

(0.8617479448970434, 3.91865563e-316)

In [13]:
pearson_coef,p_value = stats.pearsonr(df['FUELCONSUMPTION_COMB'],df['CO2EMISSIONS'])

pearson_coef,p_value

(0.8921285933157566, 0.0)

In [14]:
pearson_coef,p_value = stats.pearsonr(df['FUELCONSUMPTION_COMB_MPG'],df['CO2EMISSIONS'])

pearson_coef,p_value

(-0.9063942295226132, 0.0)

In [16]:
#Create dummies
df['MAKE'] = df['MAKE'].astype('category')
df['MAKE_cat'] = df['MAKE'].cat.codes


In [17]:
df['MODEL'] = df['MODEL'].astype('category')
df['MODEL_cat'] =df['MODEL'].cat.codes

In [18]:
df['VEHICLESCLASS'] = df['VEHICLECLASS'].astype('category')
df['VEHICLESCLASS_cat']= df['VEHICLESCLASS'].cat.codes

In [19]:
df['TRANSMISSION'] =df['TRANSMISSION'].astype('category')
df['TRANSMISSION_cat'] =df['TRANSMISSION'].cat.codes

In [20]:
df['FUELTYPE'] = df['FUELTYPE'].astype('category')
df['FUELTYPE_cat'] = df['FUELTYPE'].cat.codes

In [21]:
df['FUELTYPE'] = df['FUELTYPE'].astype('category')
df['FUELTYPE_cat'] = df['FUELTYPE'].cat.codes

In [22]:
Z = np.asanyarray(df[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_CITY','FUELCONSUMPTION_HWY','FUELCONSUMPTION_COMB','FUELCONSUMPTION_COMB_MPG','MAKE_cat','MODEL_cat','VEHICLESCLASS_cat','TRANSMISSION_cat','FUELTYPE_cat']])

In [23]:
Y = df['CO2EMISSIONS'].values

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(Z,Y, test_size = 0.2, random_state=4)

print('Train set:',X_train.shape,y_train.shape)
print('Test set:', X_test.shape,y_test.shape)

Train set: (853, 11) (853,)
Test set: (214, 11) (214,)


In [25]:
X_test = np.asanyarray(X_test)
y_test = np.asanyarray(y_test)

In [26]:
lm =LinearRegression()
lm.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
Yhat = lm.predict(X_test)

In [31]:
#Evaluation

from sklearn.metrics import r2_score

In [32]:
print("Mean absolute error: %.2f"%np.mean(np.absolute(Yhat - y_test)))
print("Residual sum of squares(MSE): %.2f"%np.mean((Yhat- y_test)**2))
print('R2-score: %.2f'%r2_score(Yhat,y_test))

Mean absolute error: 11.58
Residual sum of squares(MSE): 288.30
R2-score: 0.91
