In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("cars.csv")
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   body-style         205 non-null    object 
 5   drive-wheels       205 non-null    object 
 6   engine-location    205 non-null    object 
 7   width              205 non-null    float64
 8   height             205 non-null    float64
 9   engine-type        205 non-null    object 
 10  engine-size        205 non-null    int64  
 11  horsepower         205 non-null    object 
 12  city-mpg           205 non-null    int64  
 13  highway-mpg        205 non-null    int64  
 14  price              205 non-null    int64  
dtypes: float64(2), int64(5), object(8)
memory usage: 24.1+ KB


In [4]:
miss_vals =["Nan","?","missing"] 
df.replace(miss_vals,np.NaN,inplace=True)

In [5]:
df.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
body-style            0
drive-wheels          0
engine-location       0
width                 0
height                0
engine-type           0
engine-size           0
horsepower            2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [6]:
df['normalized-losses'] = df['normalized-losses'].astype("float64")
df['horsepower'] = df['horsepower'].astype("float64")

df['normalized-losses'].replace(np.NaN,df['normalized-losses'].mean(),inplace=True)
df['horsepower'].replace(np.NaN,df['horsepower'].mean(),inplace=True)

df.isna().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
body-style           0
drive-wheels         0
engine-location      0
width                0
height               0
engine-type          0
engine-size          0
horsepower           0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [7]:
# Distinguishing the Numerical & Categorial values 
df_numerical = df.select_dtypes(['int64','float64'])
df_category = df.select_dtypes(object)

In [8]:
from sklearn.preprocessing import LabelEncoder

for ele in df_category:
    LE = LabelEncoder()
    df_category[ele] = LE.fit_transform(df_category[ele])

In [9]:
df_category

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,0,1,0,2,0,0
1,0,1,0,2,0,0
2,0,1,2,2,0,5
3,1,1,3,1,0,3
4,1,1,3,0,0,3
...,...,...,...,...,...,...
200,21,1,3,2,0,3
201,21,1,3,2,0,3
202,21,1,3,2,0,5
203,21,0,3,2,0,3


In [10]:
df = pd.concat([df_category,df_numerical],axis=1)

In [11]:
df.head()

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price
0,0,1,0,2,0,0,3,122.0,64.1,48.8,130,111.0,21,27,13495
1,0,1,0,2,0,0,3,122.0,64.1,48.8,130,111.0,21,27,16500
2,0,1,2,2,0,5,1,122.0,65.5,52.4,152,154.0,19,26,16500
3,1,1,3,1,0,3,2,164.0,66.2,54.3,109,102.0,24,30,13950
4,1,1,3,0,0,3,2,164.0,66.4,54.3,136,115.0,18,22,17450


In [12]:
# split X & Y
x=df.iloc[:,:-1]
x

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg
0,0,1,0,2,0,0,3,122.0,64.1,48.8,130,111.0,21,27
1,0,1,0,2,0,0,3,122.0,64.1,48.8,130,111.0,21,27
2,0,1,2,2,0,5,1,122.0,65.5,52.4,152,154.0,19,26
3,1,1,3,1,0,3,2,164.0,66.2,54.3,109,102.0,24,30
4,1,1,3,0,0,3,2,164.0,66.4,54.3,136,115.0,18,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,21,1,3,2,0,3,-1,95.0,68.9,55.5,141,114.0,23,28
201,21,1,3,2,0,3,-1,95.0,68.8,55.5,141,160.0,19,25
202,21,1,3,2,0,5,-1,95.0,68.9,55.5,173,134.0,18,23
203,21,0,3,2,0,3,-1,95.0,68.9,55.5,145,106.0,26,27


In [13]:
y=df.iloc[:,-1]
y

0      13495
1      16500
2      16500
3      13950
4      17450
       ...  
200    16845
201    19045
202    21485
203    22470
204    22625
Name: price, Length: 205, dtype: int64

In [14]:
# Applying train test split
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.20,random_state=1)

In [15]:
# Model Building
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(xtrain,ytrain)


In [16]:
train=linreg.score(xtrain,ytrain)
test=linreg.score(xtest,ytest)
print(f"Training result -:{train}")
print(f"Test result -:{test}")

Training result -:0.8539412613914047
Test result -:0.744026325854571


In [17]:
linreg.coef_

array([-1.93628369e+02, -3.96939528e+02, -1.59308387e+02,  2.03453658e+03,
        1.61641371e+04,  3.04469374e+02,  8.10627829e+01, -4.98250716e+00,
        8.12358117e+02,  2.86594382e+02,  9.72915986e+01, -1.24843308e+01,
        2.37997595e+02, -3.56739524e+02])

In [18]:
#low training error = low bias
#high testing erroe = high variance
#Overfiting

In [19]:
from sklearn.linear_model import Ridge , Lasso

In [20]:
#create l2 model(ridge)
l2=Ridge(alpha=11)
l2.fit(xtrain,ytrain)

In [21]:
train=l2.score(xtrain,ytrain)
test=l2.score(xtest,ytest)
print(f"Training result -:{train}")
print(f"Test result -:{test}")

Training result -:0.8192099407731406
Test result -:0.7748177686083775


In [22]:
#best value of alpha
for i in range(1,25):
    l2=Ridge(alpha=i)
    l2.fit(xtrain,ytrain)
    
    test=l2.score(xtest,ytest)
    print(f"alpha value : {i} Test Result : {test}")

alpha value : 1 Test Result : 0.7597037544007507
alpha value : 2 Test Result : 0.7655470158349814
alpha value : 3 Test Result : 0.7685522366994495
alpha value : 4 Test Result : 0.7703895039721605
alpha value : 5 Test Result : 0.771636176523288
alpha value : 6 Test Result : 0.7725405637365288
alpha value : 7 Test Result : 0.7732264194061071
alpha value : 8 Test Result : 0.7737624277305333
alpha value : 9 Test Result : 0.7741899891455895
alpha value : 10 Test Result : 0.7745357446711169
alpha value : 11 Test Result : 0.7748177686083775
alpha value : 12 Test Result : 0.7750488663114242
alpha value : 13 Test Result : 0.7752384410630866
alpha value : 14 Test Result : 0.7753936067841307
alpha value : 15 Test Result : 0.7755198811711731
alpha value : 16 Test Result : 0.7756216343440716
alpha value : 17 Test Result : 0.7757023891700178
alpha value : 18 Test Result : 0.7757650283597379
alpha value : 19 Test Result : 0.775811941091747
alpha value : 20 Test Result : 0.7758451292906465
alpha value

In [23]:
l3=Ridge(alpha=23)
l3.fit(xtrain,ytrain)
test=l3.score(xtest,ytest)
print(f"Test result -:{test}")

Test result -:0.7758780793379447


In [24]:
l3.coef_

array([-174.0367263 , -594.18314569, -601.16739742, 1338.06264586,
       1273.78469879,  437.31257312,  219.52478407,   -5.22890037,
        382.7693018 ,  478.61745859,  104.10152959,   21.3689452 ,
        140.51595786, -225.62742993])

In [28]:
#best value of alpha (Lass0)
for i in range(1,50):
    l1=Lasso(alpha=i)
    l1.fit(xtrain,ytrain)
    
    test=l1.score(xtest,ytest)
    print(f"alpha value : {i} Test Result : {test}")

alpha value : 1 Test Result : 0.7442680740683877
alpha value : 2 Test Result : 0.7445088555136212
alpha value : 3 Test Result : 0.7447486663380158
alpha value : 4 Test Result : 0.7449875117718192
alpha value : 5 Test Result : 0.7452253930544235
alpha value : 6 Test Result : 0.7454623074748041
alpha value : 7 Test Result : 0.7456982523133264
alpha value : 8 Test Result : 0.7459332350703951
alpha value : 9 Test Result : 0.7461672452598296
alpha value : 10 Test Result : 0.7464003033147398
alpha value : 11 Test Result : 0.7466323874920109
alpha value : 12 Test Result : 0.7468635084582822
alpha value : 13 Test Result : 0.7470936662369827
alpha value : 14 Test Result : 0.7473228474604031
alpha value : 15 Test Result : 0.747551078282638
alpha value : 16 Test Result : 0.7477783459621578
alpha value : 17 Test Result : 0.7480046359003962
alpha value : 18 Test Result : 0.7482299766973243
alpha value : 19 Test Result : 0.7484543391200867
alpha value : 20 Test Result : 0.7486777530570511
alpha valu

In [29]:
l1=Lasso(alpha=211)
l1.fit(xtrain,ytrain)
test=l1.score(xtest,ytest)
print(f"Test result -:{test}")

Test result -:0.7776542978303806


In [30]:
l1.coef_

array([-161.08636692,   -0.        , -418.67042801, 1273.97828427,
          0.        ,  283.22410782,   50.67795754,   -3.99449787,
        304.94846482,  398.57627235,  112.34535546,   16.43966854,
          0.        , -102.8559008 ])