# **Regularization Model**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.preprocessing import LabelEncoder

In [4]:
df=pd.read_csv("cars.csv")
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [5]:
'''
1.read data----basic anaysis
2.Missing values and encoding
3.Build a baseline model-
'''

'\n1.read data----basic anaysis\n2.Missing values and encoding\n3.Build a baseline model-\n'

In [6]:
#step 1 replace '?' with NAN
df['normalized-losses'].replace('?',np.nan,inplace=True)


In [7]:
df['horsepower'].replace('?',np.nan,inplace=True)

In [8]:
#step 2--changing the data type  cat colum into float
df['normalized-losses']=df['normalized-losses'].astype('float64')

In [9]:
df['horsepower']=df['normalized-losses'].astype('float64')

In [10]:
from sklearn.impute import SimpleImputer
si=SimpleImputer(missing_values=np.nan,strategy='mean')

In [11]:
#separate feature and target
X=df.iloc[:,:-1]  #all column except last
Y=df.iloc[:,-1] #only last col

In [12]:
#fit nan with mean
X[['normalized-losses','horsepower']]=si.fit_transform(X[['normalized-losses','horsepower']])
X.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg
0,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,122.0,21,27
1,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,122.0,21,27
2,1,122.0,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,122.0,19,26
3,2,164.0,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,164.0,24,30
4,2,164.0,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,164.0,18,22


In [13]:
#separating cat columns
cat_col=X.select_dtypes(object).columns
cat_col

Index(['make', 'fuel-type', 'body-style', 'drive-wheels', 'engine-location',
       'engine-type'],
      dtype='object')

In [14]:
#splitting training and test data
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.3,random_state=1)

In [15]:
#Encoding
for col in cat_col:
    le=LabelEncoder() #le is an object of label encoder
    xtrain[col]=le.fit_transform(xtrain[col])
    xtest[col]=le.transform(xtest[col])

In [16]:
xtest

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg
78,2,161.0,11,1,2,1,0,64.4,50.8,3,92,161.0,31,38
97,1,103.0,12,1,4,1,0,63.8,53.5,3,97,103.0,31,37
151,1,87.0,19,1,2,1,0,63.6,54.5,3,92,87.0,31,38
44,1,122.0,6,1,3,1,0,63.6,52.0,3,90,122.0,38,43
40,0,85.0,5,1,3,1,0,62.5,54.1,3,110,85.0,27,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,0,85.0,5,1,3,1,0,65.2,54.1,3,110,85.0,27,33
110,0,122.0,13,0,4,2,0,68.4,58.7,2,152,122.0,25,25
164,1,168.0,19,1,2,2,0,64.0,52.6,3,98,168.0,29,34
56,3,150.0,8,1,2,2,0,65.7,49.6,6,70,150.0,17,23


In [17]:
#Linear Regression model
lr=LinearRegression()
lr.fit(xtrain,ytrain)

In [18]:
#testing model on training data
from sklearn.metrics import r2_score
y_pred=lr.predict(xtrain)
r2_score(ytrain,y_pred)

0.8499931572152468

In [19]:
#testing model on testing data
from sklearn.metrics import r2_score
y_pred=lr.predict(xtest)
r2_score(ytest,y_pred)

0.800307244170246

In [20]:
lr.score(xtrain,ytrain)

0.8499931572152468

In [21]:
lr.score(xtest,ytest)

0.800307244170246

***As the r2 score for train data > r2 score test data, this is a overfit model ***

# **LECTURE START -- 24/4/2024**

In [22]:
# Regularization  :----

# Remove Overfitting we use Regularization.

from sklearn.linear_model import Lasso,Ridge

In [23]:
# Ridge regularization

l2=Ridge(0.1)
l2.fit(xtrain,ytrain)

In [24]:
l2.score(xtest,ytest)

0.801370332340412

In [25]:
l2.coef_

array([ 5.63149939e+01,  9.37526361e-01, -2.03142377e+02, -8.08511372e+02,
       -2.25421219e+02,  1.84267850e+03,  1.51214166e+04,  7.31349194e+02,
        4.06618898e+02,  3.34062304e+02,  9.46560408e+01,  9.37526363e-01,
        3.35728393e+02, -4.22460477e+02])

In [26]:
# Taking different values of error to L2 model

for alpha in range(1,10):
  l2=Ridge(alpha)
  l2.fit(xtrain,ytrain)
  test_score=l2.score(xtest,ytest)
  print("Alpha:",alpha)
  print("Test score:",test_score)
  print("-----------------------------")


Alpha: 1
Test score: 0.8063784661718955
-----------------------------
Alpha: 2
Test score: 0.8082482284216761
-----------------------------
Alpha: 3
Test score: 0.8090039293521639
-----------------------------
Alpha: 4
Test score: 0.8093440160202401
-----------------------------
Alpha: 5
Test score: 0.8094919679995507
-----------------------------
Alpha: 6
Test score: 0.8095367651885574
-----------------------------
Alpha: 7
Test score: 0.8095198513904858
-----------------------------
Alpha: 8
Test score: 0.8094630080973961
-----------------------------
Alpha: 9
Test score: 0.8093788406290283
-----------------------------


In [27]:
"""
After an alpha value 2, we can observe that there is small change in the r2 score
"""

'\nAfter an alpha value 2, we can observe that there is small change in the r2 score\n'

In [28]:
# Lasso Regularizaation

# Taking different values of error to L2 model

for alpha in range(100,150,10):
  l1=Lasso(alpha)
  l1.fit(xtrain,ytrain)
  test_score=l1.score(xtest,ytest)
  print("Alpha:",alpha)
  print("Test score:",test_score)
  print("-----------------------------")

Alpha: 100
Test score: 0.8087089626767562
-----------------------------
Alpha: 110
Test score: 0.8091467065043798
-----------------------------
Alpha: 120
Test score: 0.8095091370649387
-----------------------------
Alpha: 130
Test score: 0.8097965219954009
-----------------------------
Alpha: 140
Test score: 0.8100083403020953
-----------------------------


In [29]:
l2=Ridge(50)
l2.fit(xtest,ytest)
l2.coef_

array([  28.29697442,    0.71540709, -180.24787137, -138.60360037,
       -120.59204121,  550.2422454 ,    0.        ,  507.20246776,
         85.82763834,  -42.24480375,  115.63254527,    0.71540709,
       -242.31529729,   64.91042102])

In [30]:
l2=Ridge(2)
l2.fit(xtest,ytest)
l2.coef_

array([ 141.0635808 ,   -4.00498133, -224.35750857, -643.83110812,
       -139.34657588, 2434.11819361,    0.        ,  614.47305746,
         59.36573512, -151.19411892,  106.43363293,   -4.00498133,
       -485.74547075,  320.04845566])

In [31]:
l2=Lasso(130)
l2.fit(xtest,ytest)
l2.coef_

array([ 2.55934874e+01, -3.03476511e-02, -2.03304715e+02, -0.00000000e+00,
       -0.00000000e+00,  2.15671395e+03,  0.00000000e+00,  6.35327980e+02,
        1.14437129e+01, -0.00000000e+00,  1.07350626e+02, -4.10236123e+00,
       -2.79750133e+02,  1.41815854e+02])

# NOTE:
**1.If we taking Ridge, value we have to take is lower**
  ***Ex.*** for alpha in range(1,10):
              l2=Ridge(alpha)
              l2.fit(xtrain,ytrain)
              test_score=l2.score(xtest,ytest)
              print("Alpha:",alpha)
              print("Test score:",test_score)
              print("-----------------------------")
**2.If we taking Lasso, value we have to take is large**
  ***Ex.*** for alpha in range(100,150,10):
            l1=Lasso(alpha)
            l1.fit(xtrain,ytrain)
            test_score=l1.score(xtest,ytest)
            print("Alpha:",alpha)
            print("Test score:",test_score)
            print("-----------------------------")

# **Cross-Validation**

In [32]:
from sklearn.model_selection import cross_val_score

In [33]:
catcol=X.select_dtypes(object).columns

In [34]:
for col in catcol:
  le=LabelEncoder()
  X[col]=le.fit_transform(X[col])

In [35]:
X.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg
0,3,122.0,0,1,0,2,0,64.1,48.8,0,130,122.0,21,27
1,3,122.0,0,1,0,2,0,64.1,48.8,0,130,122.0,21,27
2,1,122.0,0,1,2,2,0,65.5,52.4,5,152,122.0,19,26
3,2,164.0,1,1,3,1,0,66.2,54.3,3,109,164.0,24,30
4,2,164.0,1,1,3,0,0,66.4,54.3,3,136,164.0,18,22


In [36]:
cross_val_score(l2,X,Y,cv=4)  #returns r2 score for all 4 parts of data

array([0.73789473, 0.84979423, 0.39436269, 0.46435112])

In [37]:
cross_val_score(l1,X,Y,cv=4)  #returns r2 score for all 4 parts of data

array([0.74142203, 0.84739925, 0.39793331, 0.4591439 ])