In [2]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [3]:
# Load the raw car data; the '?' placeholders are still present at this point.
DATA_PATH = 'cars.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [4]:
# Step 1: replace the '?' placeholder with NaN.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['normalized-losses']: that form mutates a
# temporary column selection (chained assignment) and does not reliably update
# df when pandas Copy-on-Write is active.
df['normalized-losses'] = df['normalized-losses'].replace('?', np.nan)

In [5]:
# Replace the '?' placeholder with NaN and assign the result back to the column;
# replace(..., inplace=True) on a column selection is chained assignment and does
# not reliably update df when pandas Copy-on-Write is active.
df['horsepower'] = df['horsepower'].replace('?', np.nan)

In [6]:
# Step 2: the column was read as text because of the '?' placeholders;
# convert it to 64-bit floats so it can hold NaN and be averaged.
losses_as_float = df['normalized-losses'].astype(np.float64)
df['normalized-losses'] = losses_as_float

In [8]:
# Same conversion for horsepower: text column -> 64-bit floats.
horsepower_as_float = df['horsepower'].astype(np.float64)
df['horsepower'] = horsepower_as_float

In [9]:
# Imputer that fills NaN entries with the mean of their column.
from sklearn.impute import SimpleImputer

si = SimpleImputer(strategy='mean', missing_values=np.nan)

In [10]:
# Separate features and target.
# X takes an explicit copy: columns of X are overwritten further down, and
# assigning into a bare iloc slice of df triggers SettingWithCopy behaviour
# (currently hidden by the global warnings filter).
# NOTE(review): X keeps the CSV's 'Unnamed: 0' row-number column as a feature —
# confirm that this is intended.
X = df.iloc[:, :-1].copy()  # all columns except the last
Y = df.iloc[:, -1]  # only the last column (price)

In [11]:
# Fill the NaN entries of the two converted columns with their column means.
# NOTE(review): the imputer is fitted on every row of X, before the train/test
# split performed later, so test rows contribute to the imputed means.
imputed_cols = ['normalized-losses', 'horsepower']
filled_values = si.fit_transform(X[imputed_cols])
X[imputed_cols] = filled_values
X.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg
0,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27
1,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27
2,1,122.0,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154.0,19,26
3,2,164.0,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102.0,24,30
4,2,164.0,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115.0,18,22


In [12]:
# Names of the feature columns stored with object dtype (the categorical ones).
object_features = X.select_dtypes(include=[object])
cat_col = object_features.columns
cat_col

Index(['make', 'fuel-type', 'body-style', 'drive-wheels', 'engine-location',
       'engine-type'],
      dtype='object')

In [13]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    random_state=1,
    test_size=0.3,
)

In [14]:
# Encoding
# Integer-encode the categorical feature columns using categories learned from
# the training split only.
# OrdinalEncoder is scikit-learn's encoder for feature columns (LabelEncoder is
# meant for the target). With handle_unknown='use_encoded_value', a category
# that appears only in the test split is mapped to -1 instead of raising
# ValueError as LabelEncoder.transform does.
# The explicit copies make xtrain/xtest own their data before columns are
# overwritten, avoiding assignment into the frames returned by the split.
xtrain = xtrain.copy()
xtest = xtest.copy()

oe = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    dtype=np.int64,  # integer codes, matching what LabelEncoder produced
)
xtrain[cat_col] = oe.fit_transform(xtrain[cat_col])
xtest[cat_col] = oe.transform(xtest[cat_col])

In [15]:
# Inspect the test features after the categorical columns have been encoded.
xtest

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg
78,2,161.0,11,1,2,1,0,64.4,50.8,3,92,68.0,31,38
97,1,103.0,12,1,4,1,0,63.8,53.5,3,97,69.0,31,37
151,1,87.0,19,1,2,1,0,63.6,54.5,3,92,62.0,31,38
44,1,122.0,6,1,3,1,0,63.6,52.0,3,90,70.0,38,43
40,0,85.0,5,1,3,1,0,62.5,54.1,3,110,86.0,27,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,0,85.0,5,1,3,1,0,65.2,54.1,3,110,86.0,27,33
110,0,122.0,13,0,4,2,0,68.4,58.7,2,152,95.0,25,25
164,1,168.0,19,1,2,2,0,64.0,52.6,3,98,70.0,29,34
56,3,150.0,8,1,2,2,0,65.7,49.6,6,70,101.0,17,23


In [16]:
# Ordinary least-squares baseline, fitted on the training split.
# fit() returns the estimator itself, so lr is the fitted model.
lr = LinearRegression().fit(xtrain, ytrain)
lr

In [17]:
# R^2 of the linear model on the rows it was trained on.
from sklearn.metrics import r2_score

y_pred = lr.predict(xtrain)
r2_score(y_true=ytrain, y_pred=y_pred)

0.8504573774895473

In [18]:
# R^2 of the linear model on the held-out test rows.
from sklearn.metrics import r2_score

y_pred = lr.predict(xtest)
r2_score(y_true=ytest, y_pred=y_pred)

0.796556678039738

In [19]:
# Training-split R^2 via the estimator's own score method.
lr.score(X=xtrain, y=ytrain)

0.8504573774895473

In [20]:
# Test-split R^2 via the estimator's own score method.
lr.score(X=xtest, y=ytest)

0.796556678039738

In [21]:
# Regularization

# Regularization is used to reduce overfitting; Ridge and Lasso are tried below.

from sklearn.linear_model import Lasso,Ridge

In [22]:
# Ridge regression with a small penalty strength (alpha = 0.1),
# fitted on the training split. fit() returns the estimator itself.
l2 = Ridge(alpha=0.1).fit(xtrain, ytrain)
l2

In [23]:
# R^2 of the Ridge model on the held-out test rows.
l2.score(X=xtest, y=ytest)

0.7984694249678248

In [24]:
# Fitted coefficients of the current Ridge model l2.
l2.coef_

array([ 5.80531057e+01,  1.25218729e+00, -1.99664337e+02, -6.93146496e+02,
       -2.02148035e+02,  1.87554249e+03,  1.55353516e+04,  7.59674152e+02,
        3.76888292e+02,  3.02876163e+02,  9.84783068e+01, -8.63170960e+00,
        3.01108871e+02, -4.07873646e+02])

In [25]:
# Refit Ridge for each penalty strength alpha = 1..9 and report the test R^2.
# After the loop, l2 holds the model fitted with the last alpha.
# NOTE(review): alphas are compared on the test split itself, so the reported
# scores are also the ones used to choose alpha.

for alpha in range(1, 10):
    l2 = Ridge(alpha=alpha).fit(xtrain, ytrain)
    test_score = l2.score(X=xtest, y=ytest)
    print("Alpha:", alpha)
    print("Test score:", test_score)
    print("-----------------------------")


Alpha: 1
Test score: 0.8074518758147273
-----------------------------
Alpha: 2
Test score: 0.8110292248150515
-----------------------------
Alpha: 3
Test score: 0.8126933383890033
-----------------------------
Alpha: 4
Test score: 0.8136148645029297
-----------------------------
Alpha: 5
Test score: 0.8141745853539419
-----------------------------
Alpha: 6
Test score: 0.8145301242133357
-----------------------------
Alpha: 7
Test score: 0.8147582608502816
-----------------------------
Alpha: 8
Test score: 0.8149010602831954
-----------------------------
Alpha: 9
Test score: 0.8149836949253052
-----------------------------


In [26]:
# Lasso regularization

# Refit Lasso for each penalty strength alpha = 100, 110, ..., 140 and report
# the test R^2. After the loop, l1 holds the model fitted with the last alpha.
# NOTE(review): alphas are compared on the test split itself, so the reported
# scores are also the ones used to choose alpha.

for alpha in range(100, 150, 10):
    l1 = Lasso(alpha=alpha).fit(xtrain, ytrain)
    test_score = l1.score(X=xtest, y=ytest)
    print("Alpha:", alpha)
    print("Test score:", test_score)
    print("-----------------------------")

Alpha: 100
Test score: 0.8089989519118684
-----------------------------
Alpha: 110
Test score: 0.8098656626873879
-----------------------------
Alpha: 120
Test score: 0.8106487931098089
-----------------------------
Alpha: 130
Test score: 0.8113484125018899
-----------------------------
Alpha: 140
Test score: 0.8119644623062721
-----------------------------
