In [120]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import random

In [121]:
collect_1 = pd.read_csv('auto-mpg.csv')

In [122]:
collect_1.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

In [123]:
collect_1.isnull().sum() # We get all null values number and get to see all columns at the same time

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [124]:
collect_1 = collect_1.replace('?', 0)
collect_1['horsepower'] = collect_1['horsepower'].astype(float, errors = 'raise')
collect_1.loc[collect_1['horsepower']==0]

# Done to change horsepower dtype to string

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
32,25.0,4,98.0,0.0,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,0.0,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,0.0,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,0.0,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,0.0,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,0.0,3035,20.5,82,1,amc concord dl


In [125]:
# L2 norm used here

dfa = collect_1.drop('car name',axis=1)
x=dfa.drop('mpg',axis=1) # Independent variable
y= dfa['mpg'] # Dependent variable
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.333)

# In this step we split the dataset into X train, Xtest, Y train , Y test. 
#This will help us partition data for training and testing purposes. In ration 2/3 for training and 1/3 for testing

In [126]:
X_train

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
114,4,98.0,90.0,2265,15.5,73,2
141,4,98.0,83.0,2219,16.5,74,2
209,4,120.0,88.0,3270,21.9,76,2
324,4,85.0,65.0,2110,19.2,80,3
334,3,70.0,100.0,2420,12.5,80,3
...,...,...,...,...,...,...,...
319,4,120.0,75.0,2542,17.5,80,3
243,3,80.0,110.0,2720,13.5,77,3
54,4,72.0,69.0,1613,18.0,71,3
363,6,231.0,110.0,3415,15.8,81,1


In [127]:
from sklearn.preprocessing import Normalizer     # Inbuilt
nm = Normalizer(norm = 'l2')
X_train = nm.fit_transform(X_train)
X_test = nm.transform(X_test)

reg1 = linear_model.LinearRegression()
reg1.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [128]:
reg1.score(X_train,y_train)

0.8106892652374971

In [129]:

dfb = pd.DataFrame(reg1.coef_).T
dfb = dfb.rename(columns={0: "cylinders", 1: "displacement",2:"horsepower",3:"weight",4:"acceleration",5:"model year",6:"origin"})
dfb # to casually check result of perations


Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,-2463.243733,34.63221,-121.107388,103.756931,23.256692,897.716582,684.560212


In [130]:
coeff1 = reg1.coef_
coeff1

array([-2463.24373271,    34.63221008,  -121.10738755,   103.75693147,
          23.25669247,   897.71658161,   684.56021162])

In [131]:
from sklearn.metrics import mean_squared_error
y_pred_test = reg1.predict(X_test)
MeanSqError = mean_squared_error(y_test,y_pred_test) # Evaluating the RMSE values for the test Predictions
print('Mean Squared Error : ',MeanSquaredError)

Mean Squared Error :  11.147965363734231


In [132]:
np.random.seed(42)
linregCV = linear_model.RidgeCV(alphas=[0.001,0.01,0.01,10.0,100.0,1000.0,1000.0,1000.0],cv=5)
linregCV.fit(X_train,y_train)
linregCV.alpha_

0.001

In [133]:

linridgeCV = linear_model.Ridge(alpha=0.01,normalize=True)
linridgeCV.fit(X_train,y_train)
linridgeCV.score(X_train,y_train)

0.8100414202727876

In [134]:
coeff2 = linridgeCV.coef_
coeff2

array([-1862.05870361,    -2.78485176,  -121.26904217,  -192.53626176,
         155.07161376,   815.58110412,   723.30396269])

In [135]:
y_predCV = linridgeCV.predict(X_test) 
y_predCV

array([18.82089475, 22.20142451, 32.93433985, 29.12992204, 34.40439588,
       16.15686523, 35.98570326, 24.6702725 , 13.05066821, 28.27914723,
       17.69527381, 14.3653996 , 19.06268785, 29.77670724, 21.55701675,
       21.20108778, 26.68353037, 19.53842324, 14.85249273, 18.96391254,
       13.30009791, 32.37638235, 26.10020453, 20.17293198, 12.9026615 ,
       25.68679137, 12.87679285, 22.91169824, 20.75546507, 33.11613554,
       25.09313258, 22.71353991, 36.22569511, 26.30480056, 27.25699015,
       14.65323276, 34.2190685 , 21.03763447, 12.97466391, 31.00770584,
       27.87744128, 34.60909857, 24.61774644, 28.62980162, 20.40806859,
       21.68366457, 14.46112081, 26.6164932 , 25.52804388, 16.70094299,
       35.55451661, 20.5130999 , 31.8536226 , 25.43043032, 15.77139429,
       33.66915676, 20.36376985, 15.1305317 , 30.61352133, 33.6969068 ,
       23.49827639, 14.26827474, 36.37211475, 16.97022666, 20.09284987,
       27.92379774, 11.56753636, 22.75384736, 39.29044505, 22.77

In [136]:
MeanSqError = np.square(np.subtract(y_test,y_predCV)).mean() 
MeanSqError

10.078905193059327

In [137]:
np.random.seed(42)
lassoCV=linear_model.LassoCV(alphas=[0.0001,0.001,0.01,0.1,1],max_iter=10000,cv=10)
lassoCV.fit(X_train,y_train)
y_predlasso=lassoCV.predict(X_test)

In [138]:
lassoCV.alpha_

0.0001

In [139]:
LCV = linear_model.Lasso(alpha=0.001,normalize=True)
LCV.fit(X_train,y_train)
LCV.score(X_train,y_train)

0.8104857786484225

In [140]:
coeff3 = LCV.coef_
coeff3

array([-1868.66908674,     5.90717546,  -122.53958782,   -65.47712536,
           0.        ,   871.95693234,   496.50973391])

In [141]:
y_predlasso=lassoCV.predict(X_test)

In [142]:
MeanSqError=np.square(np.subtract(y_test,y_predlasso)).mean()

In [143]:
coeff1

array([-2463.24373271,    34.63221008,  -121.10738755,   103.75693147,
          23.25669247,   897.71658161,   684.56021162])

In [144]:
coeff2

array([-1862.05870361,    -2.78485176,  -121.26904217,  -192.53626176,
         155.07161376,   815.58110412,   723.30396269])

In [145]:
coeff3

array([-1868.66908674,     5.90717546,  -122.53958782,   -65.47712536,
           0.        ,   871.95693234,   496.50973391])

Q 2

(e)	

Coefficients value is reduced by Regularization therefore it directly effects the importance of an attribute.  Linear Regression and Ridge reduce value of coefficient that are not important, but LassoCV converts coefficients that are not valueable to 0.

Since Cross Validaton reduces the Mean Square Error [MSE] , therefore MSE is a guide which indicates best result. Hence LassoCV gives the best result going by MSE followed by RidgeCV and Linear Regression