# Regularization 
Regularization adds a penalty term on the model coefficients to the training loss, shrinking them to reduce overfitting.

1. Ridge (L2) - Penalty is applied on the sum of squared coefficients
2. Lasso (L1) - Penalty is applied on the sum of absolute values of coefficients

In [1]:
from warnings import filterwarnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by the libraries used below — consider narrowing it.
filterwarnings("ignore")

# Step 1 - Data ingestion

In [2]:
import pandas as pd

# keep_default_na=False switches off pandas' built-in list of NA tokens, so
# only empty fields and the literal "NA" are read as missing values.
df = pd.read_csv(
    "Cars93.csv",
    keep_default_na=False,
    na_values=["", "NA"],
)
df.head()

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


# Estimate the weight of a car based on other factors

# Step 2 - Perform basic data quality checks

In [3]:
df.shape

(94, 28)

In [4]:
df.duplicated().sum()

np.int64(1)

In [5]:
# Keep the first occurrence of each duplicated row and renumber the index 0..n-1.
df = df.drop_duplicates(keep="first", ignore_index=True)
df.shape

(93, 28)

In [6]:
# Per-column count of missing values; display only the columns that have any.
m = df.isna().sum()
m.loc[m.gt(0)]

AirBags            4
Rear.seat.room     2
Luggage.room      11
dtype: int64

# Step 3 - Separate X and Y(Weight)

In [7]:
# Target is the car weight; predictors are every other column except the row id.
Y = df["Weight"]
X = df.drop(["id", "Weight"], axis=1)

In [8]:
X.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25,31,,Front,...,13.2,5,177,102,68,37,26.5,11.0,non-USA,Acura Integra
1,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,...,18.0,5,195,115,71,38,30.0,15.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,Front,...,16.9,5,180,102,67,37,28.0,14.0,non-USA,Audi 90
3,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,21.1,6,193,106,70,37,31.0,17.0,non-USA,Audi 100
4,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,...,21.1,4,186,109,69,39,27.0,13.0,non-USA,BMW 535i


In [9]:
Y.head()

0    2705
1    3560
2    3375
3    3405
4    3640
Name: Weight, dtype: int64

In [10]:
# Share of distinct values per text column relative to the number of rows;
# a ratio of 1.0 means every row has its own value.
X.select_dtypes(include="object").nunique().div(len(df))

Manufacturer       0.344086
Model              1.000000
Type               0.064516
AirBags            0.032258
DriveTrain         0.032258
Cylinders          0.064516
Man.trans.avail    0.021505
Origin             0.021505
Make               1.000000
dtype: float64

In [11]:
# Dropping high cardinality columns: Model and Make have one distinct value
# per row (ratio 1.0 above), so they cannot generalise to unseen cars.
# errors="ignore" keeps this cell re-runnable: X is reassigned from itself, so
# a second execution would otherwise raise KeyError for the missing columns.
X = X.drop(columns=["Model", "Make"], errors="ignore")

In [12]:
X.head()

Unnamed: 0,Manufacturer,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,Cylinders,...,Man.trans.avail,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin
0,Acura,Small,12.9,15.9,18.8,25,31,,Front,4,...,Yes,13.2,5,177,102,68,37,26.5,11.0,non-USA
1,Acura,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,6,...,Yes,18.0,5,195,115,71,38,30.0,15.0,non-USA
2,Audi,Compact,25.9,29.1,32.3,20,26,Driver only,Front,6,...,Yes,16.9,5,180,102,67,37,28.0,14.0,non-USA
3,Audi,Midsize,30.8,37.7,44.6,19,26,,Front,6,...,Yes,21.1,6,193,106,70,37,31.0,17.0,non-USA
4,BMW,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,4,...,Yes,21.1,4,186,109,69,39,27.0,13.0,non-USA


# Step 4 - Train Test split

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    random_state=10,
    test_size=0.2,
)

In [14]:
xtrain.head()

Unnamed: 0,Manufacturer,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,Cylinders,...,Man.trans.avail,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin
42,Honda,Compact,13.8,17.5,21.2,24,31,Driver & Passenger,Front,4,...,Yes,17.0,4,185,107,67,41,28.0,14.0,non-USA
53,Mazda,Small,10.9,11.6,12.3,28,36,,Front,4,...,Yes,14.5,5,172,98,66,36,26.5,13.0,non-USA
21,Chrysler,Large,29.5,29.5,29.5,20,26,,Front,6,...,No,16.0,6,203,110,69,44,36.0,17.0,USA
6,Buick,Large,19.9,20.8,21.7,19,28,Driver only,Front,6,...,No,18.0,6,200,111,74,42,30.5,17.0,USA
26,Dodge,Midsize,14.8,15.6,16.4,21,27,Driver only,Front,4,...,No,16.0,6,192,105,69,42,30.5,16.0,USA


In [15]:
ytrain.head()

42    3040
53    2440
21    3570
6     3470
26    3080
Name: Weight, dtype: int64

In [16]:
xtest.head()

Unnamed: 0,Manufacturer,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,Cylinders,...,Man.trans.avail,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin
34,Ford,Sporty,12.8,14.0,15.2,24,30,Driver only,Front,4,...,Yes,15.5,4,179,103,70,38,23.0,18.0,USA
90,Volkswagen,Sporty,22.9,23.3,23.7,18,25,,Front,6,...,Yes,18.5,4,159,97,66,36,26.0,15.0,non-USA
3,Audi,Midsize,30.8,37.7,44.6,19,26,,Front,6,...,Yes,21.1,6,193,106,70,37,31.0,17.0,non-USA
35,Ford,Van,14.5,19.9,25.3,15,20,Driver only,4WD,6,...,Yes,21.0,7,176,119,72,45,30.0,,USA
19,Chrylser,Large,18.4,18.4,18.4,20,28,Driver & Passenger,Front,6,...,No,18.0,6,203,113,74,40,31.0,15.0,USA


In [17]:
ytest.head()

34    2710
90    2810
3     3405
35    3735
19    3515
Name: Weight, dtype: int64

In [18]:
xtrain.shape

(74, 24)

In [19]:
xtest.shape

(19, 24)

# Step 5 - Apply preprocessing on X

In [34]:
# Numeric predictors, identified from the dtypes of the training split.
num_cols = list(xtrain.select_dtypes(include="number").columns)
num_cols

['Min.Price',
 'Price',
 'Max.Price',
 'MPG.city',
 'MPG.highway',
 'EngineSize',
 'Horsepower',
 'RPM',
 'Rev.per.mile',
 'Fuel.tank.capacity',
 'Passengers',
 'Length',
 'Wheelbase',
 'Width',
 'Turn.circle',
 'Rear.seat.room',
 'Luggage.room']

In [35]:
# Text (object-dtype) predictors, identified from the dtypes of the training split.
cat_cols = [*xtrain.select_dtypes(include="object").columns]
cat_cols

['Manufacturer',
 'Type',
 'AirBags',
 'DriveTrain',
 'Cylinders',
 'Man.trans.avail',
 'Origin']

In [36]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [37]:
# Numeric features: fill gaps with the column median, then standardise.
num_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

In [38]:
# Categorical features: fill gaps with the most frequent value, map each
# category to an integer code (categories unseen during fit become -1), then
# standardise the codes.
# NOTE(review): these columns are nominal (e.g. Manufacturer), so the integer
# codes impose an arbitrary ordering on the linear models fitted later —
# consider one-hot encoding instead.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
    StandardScaler(),
)

In [39]:
# Route numeric and categorical columns through their own pipelines; columns
# in neither list are dropped (the default remainder). set_output makes
# transform return a DataFrame instead of a NumPy array.
pre = ColumnTransformer(
    transformers=[("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)]
)
pre = pre.set_output(transform="pandas")

In [40]:
pre.fit(xtrain)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [42]:
xtrain_pre = pre.transform(xtrain)
xtrain_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,num__Turn.circle,num__Rear.seat.room,num__Luggage.room,cat__Manufacturer,cat__Type,cat__AirBags,cat__DriveTrain,cat__Cylinders,cat__Man.trans.avail,cat__Origin
42,-0.373824,-0.208643,-0.071922,0.318308,0.363084,-0.499737,-0.070927,0.583333,0.553885,0.094849,...,0.554316,0.030727,0.028068,-0.330948,-1.373237,-1.720536,-0.180439,-0.76365,0.780189,1.114641
53,-0.699007,-0.802086,-0.851002,1.066078,1.322665,-0.8714,-0.759572,0.416667,-0.190474,-0.664766,...,-0.987766,-0.456514,-0.318105,0.266373,0.474391,1.173093,-0.180439,-0.76365,0.780189,1.114641
21,1.38665,0.99836,0.654635,-0.429463,-0.596496,0.522338,0.059357,-0.75,-1.020721,-0.208997,...,1.479565,2.629344,1.066586,-0.928269,-0.757361,-0.273722,-0.180439,0.975122,-1.28174,-0.89715
6,0.310182,0.123283,-0.028154,-0.616405,-0.212664,0.986917,0.487434,-0.75,-1.431073,0.398695,...,0.862732,0.842795,1.066586,-1.286661,-0.757361,-0.273722,-0.180439,0.975122,-1.28174,-0.89715
26,-0.261692,-0.399752,-0.4921,-0.24252,-0.40458,-0.220989,-0.815408,-0.75,0.277136,-0.208997,...,0.862732,0.842795,0.720413,-0.808804,-0.141485,-0.273722,-0.180439,-0.76365,-1.28174,-0.89715


In [43]:
xtest_pre = pre.transform(xtest)
xtest_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,num__Turn.circle,num__Rear.seat.room,num__Luggage.room,cat__Manufacturer,cat__Type,cat__AirBags,cat__DriveTrain,cat__Cylinders,cat__Man.trans.avail,cat__Origin
34,-0.485956,-0.560685,-0.597145,0.318308,0.171168,-0.685568,-0.536227,0.416667,0.038559,-0.36092,...,-0.370933,-1.593409,1.412758,-0.569876,1.090267,-0.273722,-0.180439,-0.76365,0.780189,-0.89715
90,0.646578,0.374742,0.14692,-0.803348,-0.788412,0.057759,0.63633,0.916667,0.124447,0.550618,...,-0.987766,-0.618927,0.374241,1.699942,1.090267,1.173093,-0.180439,0.975122,0.780189,1.114641
3,1.532421,1.823145,1.976445,-0.616405,-0.596496,0.057759,0.524658,0.416667,0.410739,1.340618,...,-0.67935,1.005208,1.066586,-1.525589,-0.141485,-0.273722,-0.180439,0.975122,0.780189,1.114641
35,-0.295332,0.032758,0.28698,-1.364176,-1.747992,0.24359,0.022133,-0.75,-0.45768,1.310233,...,1.787981,0.680381,0.028068,-0.569876,1.706143,-0.273722,-2.087934,0.975122,0.780189,-0.89715
19,0.141983,-0.118118,-0.317026,-0.429463,-0.212664,0.522338,0.171029,0.083333,-0.629455,0.398695,...,0.2459,1.005208,0.374241,-1.764517,-0.757361,-1.720536,-0.180439,0.975122,-1.28174,-0.89715


# Fit base linear regression model

In [44]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on all preprocessed features.
model = LinearRegression().fit(xtrain_pre, ytrain)
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [45]:
model.intercept_

np.float64(3087.567567567568)

In [46]:
model.coef_

array([ 2.04868780e+03, -4.55409525e+03,  2.57740449e+03,  5.58788601e+01,
       -1.43406804e+02, -1.23522037e+02,  2.74147784e+02, -1.04815627e+02,
       -5.08096266e+01,  1.03621300e+02,  1.88463806e+01,  5.62951231e+01,
        2.40399164e+02, -4.42426717e+01,  2.84784672e+01, -2.98992049e+01,
        5.08191340e+01,  3.88278970e+00,  5.37264242e+01, -1.62021287e+01,
       -1.07346585e+01,  1.44934812e+01,  2.04583427e+01,  4.99720069e+01])

In [48]:
# R2 on both splits for the baseline model; gen_err is the absolute gap
# between the train and test scores.
train_r2 = model.score(xtrain_pre, ytrain)
test_r2 = model.score(xtest_pre, ytest)
gen_err = abs(train_r2 - test_r2)
print(
    f"Train R2 : {train_r2:.2%}",
    f"Test R2 : {test_r2:.2%}",
    f"Generalization error : {gen_err:.2%}",
    sep="\n",
)

Train R2 : 97.25%
Test R2 : 82.17%
Generalization error : 15.08%


# Feature selection

In [49]:
from sklearn.feature_selection import SequentialFeatureSelector

# Forward selection wrapped around a plain linear model: features are added
# one at a time based on cross-validated score. With
# n_features_to_select="auto" and no tol, half of the features are kept.
base_model = LinearRegression()
sel = SequentialFeatureSelector(
    estimator=base_model,
    direction="forward",
    n_features_to_select="auto",
)
sel.fit(xtrain_pre, ytrain)

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [50]:
sel.get_feature_names_out()

array(['num__MPG.highway', 'num__Horsepower', 'num__RPM',
       'num__Rev.per.mile', 'num__Fuel.tank.capacity', 'num__Length',
       'num__Wheelbase', 'num__Luggage.room', 'cat__Manufacturer',
       'cat__Type', 'cat__Cylinders', 'cat__Origin'], dtype=object)

In [51]:
xtrain_pre_sel = sel.transform(xtrain_pre)
xtrain_pre_sel[0:5]

array([[ 0.36308441, -0.07092689,  0.58333333,  0.55388468,  0.09484922,
         0.07553707,  0.4504151 ,  0.02806804, -0.33094792, -1.3732369 ,
        -0.76364966,  1.11464086],
       [ 1.32266464, -0.75957166,  0.41666667, -0.19047443, -0.66476576,
        -0.83279622, -0.91311426, -0.31810451,  0.26637272,  0.47439093,
        -0.76364966,  1.11464086],
       [-0.59649582,  0.05935726, -0.75      , -1.02072113, -0.20899677,
         1.33322932,  0.90492489,  1.06658569, -0.92826856, -0.75736096,
         0.97512187, -0.89714996],
       [-0.21266373,  0.48743374, -0.75      , -1.43107295,  0.39869522,
         1.12361394,  1.05642815,  1.06658569, -1.28666095, -0.75736096,
         0.97512187, -0.89714996],
       [-0.40457977, -0.81540773, -0.75      ,  0.27713578, -0.20899677,
         0.56463961,  0.14740858,  0.72041314, -0.80880444, -0.14148501,
        -0.76364966, -0.89714996]])

In [53]:
xtest_pre_sel = sel.transform(xtest_pre)
xtest_pre_sel[0:5]

array([[ 0.17116836, -0.53622741,  0.41666667,  0.03855914, -0.36091977,
        -0.34369368, -0.15559794,  1.41275824, -0.56987618,  1.09026688,
        -0.76364966, -0.89714996],
       [-0.78841186,  0.6363299 ,  0.91666667,  0.12444673,  0.55061821,
        -1.7411295 , -1.06461752,  0.37424059,  1.69994226,  1.09026688,
         0.97512187,  1.11464086],
       [-0.59649582,  0.52465778,  0.41666667,  0.4107387 ,  1.3406178 ,
         0.6345114 ,  0.29891184,  1.06658569, -1.5255892 , -0.14148501,
         0.97512187,  1.11464086],
       [-1.74799209,  0.02213321, -0.75      , -0.45768026,  1.3102332 ,
        -0.55330905,  2.26845425,  0.02806804, -0.56987618,  1.70614282,
         0.97512187, -0.89714996],
       [-0.21266373,  0.17102938,  0.08333333, -0.62945544,  0.39869522,
         1.33322932,  1.35943468,  0.37424059, -1.76451746, -0.75736096,
         0.97512187, -0.89714996]])

# Check results after feature selection

In [54]:
# Refit ordinary least squares using only the features kept by the selector.
model2 = LinearRegression().fit(xtrain_pre_sel, ytrain)
model2

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [56]:
model2.intercept_

np.float64(3087.5675675675675)

In [55]:
model2.coef_

array([-126.23827432,  188.46646328,  -61.70682334,  -25.75935582,
         78.65055422,   44.18149908,  222.05694178,   25.17503785,
          8.05856146,   59.04286602,  -18.30179931,   40.72901962])

In [57]:
# Same report as for the baseline, now for the model fitted on the selected features.
train_r2, test_r2 = (
    model2.score(xtrain_pre_sel, ytrain),
    model2.score(xtest_pre_sel, ytest),
)
gen_err = abs(test_r2 - train_r2)
print(
    f"Train R2 : {train_r2:.2%}",
    f"Test R2 : {test_r2:.2%}",
    f"Generalization error : {gen_err:.2%}",
    sep="\n",
)

Train R2 : 96.60%
Test R2 : 86.79%
Generalization error : 9.81%


# Ridge 
Penalty is applied on square of coefficients

In [58]:
from sklearn.linear_model import Ridge

# First Ridge attempt with a small, hand-picked penalty strength.
model3 = Ridge(alpha=0.1).fit(xtrain_pre_sel, ytrain)
model3

0,1,2
,alpha,0.1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [59]:
model3.score(xtrain_pre_sel, ytrain)

0.9659999476696532

In [60]:
model3.score(xtest_pre_sel, ytest)

0.8691743503267384

In [61]:
from sklearn.model_selection import GridSearchCV

# Candidate penalty strengths, from almost no shrinkage to very strong shrinkage.
params = {"alpha": [0.01, 0.05, 0.1, 1, 10, 50, 100, 200, 500]}

# Cross-validated search over alpha, scored by R2, on the selected training features.
base_ridge = Ridge(random_state=42)
gscv_ridge = GridSearchCV(estimator=base_ridge, param_grid=params, scoring="r2")
gscv_ridge.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Ridge(random_state=42)
,param_grid,"{'alpha': [0.01, 0.05, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [62]:
gscv_ridge.best_params_

{'alpha': 1}

In [63]:
gscv_ridge.best_score_

np.float64(0.9487786917560944)

In [64]:
best_ridge = gscv_ridge.best_estimator_
best_ridge

0,1,2
,alpha,1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [65]:
# R2 on both splits for the tuned Ridge model; gen_err is the absolute gap
# between the train and test scores.
train_r2 = best_ridge.score(xtrain_pre_sel, ytrain)
test_r2 = best_ridge.score(xtest_pre_sel, ytest)
gen_err = abs(train_r2 - test_r2)
print(f"Train R2 : {train_r2:.2%}")
print(f"Test R2 : {test_r2:.2%}")
# Label spelled exactly like the other model reports in this notebook so the
# printed results line up when compared.
print(f"Generalization error : {gen_err:.2%}")

Train R2 : 96.58%
Test R2 : 87.81%
Generalization Error : 8.77%


# Lasso Model

In [66]:
from sklearn.linear_model import Lasso

# Candidate penalty strengths for the L1 penalty.
params2 = {
    "alpha": [0.01, 0.05, 0.1, 1, 10, 50, 100, 200]
}

# The untuned Lasso template gets its own name (mirroring base_ridge) instead
# of reusing model3, which already refers to the fitted Ridge(alpha=0.1) model
# scored earlier; rebinding it would make those cells score an unfitted Lasso
# if they were re-run.
base_lasso = Lasso(random_state=42)
gscv_lasso = GridSearchCV(base_lasso, params2, cv=5, scoring="r2")
gscv_lasso.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Lasso(random_state=42)
,param_grid,"{'alpha': [0.01, 0.05, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [67]:
gscv_lasso.best_score_

np.float64(0.9486952161781591)

In [68]:
gscv_lasso.best_params_

{'alpha': 1}

In [69]:
best_lasso = gscv_lasso.best_estimator_
best_lasso

0,1,2
,alpha,1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [70]:
# R2 on both splits for the tuned Lasso model; gen_err is the absolute gap
# between the train and test scores.
train_r2 = best_lasso.score(xtrain_pre_sel, ytrain)
test_r2 = best_lasso.score(xtest_pre_sel, ytest)
gen_err = abs(train_r2 - test_r2)
print(
    "\n".join(
        [
            f"Train R2 : {train_r2:.2%}",
            f"Test R2 : {test_r2:.2%}",
            f"Generalization error : {gen_err:.2%}",
        ]
    )
)

Train R2 : 96.59%
Test R2 : 87.88%
Generalization error : 8.71%


# Choose the Ridge model because it has the (marginally) highest cross-validation score

In [71]:
gscv_ridge.best_score_

np.float64(0.9487786917560944)

In [72]:
gscv_lasso.best_score_

np.float64(0.9486952161781591)

# Out of sample prediction

In [73]:
# Parse the scoring file with the same missing-value rules as the training
# data: only empty fields and the literal "NA" count as missing. With pandas'
# default NA token list, literal strings such as "None" would also be read as
# NaN here, and the categorical imputer would then silently replace them with
# the most frequent training category.
xnew = pd.read_csv("sample.csv", na_values=["", "NA"], keep_default_na=False)
xnew

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,15.0,6,190,106,65,37,31.0,17.0,non-USA,Audi 100
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,15.2,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,16.5,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,20.0,2,169,96,69,37,,,non-USA,Mazda RX-7
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,12.4,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox


In [74]:
xnew_pre = pre.transform(xnew)
xnew_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,num__Turn.circle,num__Rear.seat.room,num__Luggage.room,cat__Manufacturer,cat__Type,cat__AirBags,cat__DriveTrain,cat__Cylinders,cat__Man.trans.avail,cat__Origin
0,1.532421,1.823145,1.976445,-0.616405,-0.596496,0.057759,0.524658,0.416667,0.410739,-0.512843,...,-0.67935,1.005208,1.066586,-1.525589,-0.141485,-0.273722,-0.180439,0.975122,0.780189,1.114641
1,-0.867205,-0.852378,-0.807234,0.131365,0.363084,-0.685568,-0.629288,-0.083333,0.658858,-0.452074,...,-0.062517,-0.943754,-0.318105,0.983157,-1.373237,-0.273722,-0.180439,-0.76365,0.780189,-0.89715
2,-0.418677,-0.369577,-0.317026,-0.24252,-0.020748,-0.499737,-0.629288,-0.083333,0.525255,-0.057074,...,0.2459,0.19314,0.720413,-1.047733,-0.141485,-0.273722,-0.180439,-0.76365,-1.28174,-0.89715
3,1.723046,1.300111,0.917246,-0.990291,-0.788412,-1.335979,2.069456,2.083333,0.00993,1.006387,...,-0.67935,-0.131687,0.028068,0.266373,1.090267,-0.273722,1.727057,-2.502421,0.780189,1.114641
4,-0.945698,-1.053545,-1.096106,0.50525,0.746917,-0.8714,-1.169036,0.416667,0.439368,-1.302842,...,-1.604599,-0.618927,-1.356622,1.699942,0.474391,-0.273722,-0.180439,-0.76365,0.780189,1.114641


In [75]:
xnew_pre_sel = sel.transform(xnew_pre)
xnew_pre_sel[0:5]

array([[-0.59649582,  0.52465778,  0.41666667,  0.4107387 , -0.51284277,
         0.42489603,  0.29891184,  1.06658569, -1.5255892 , -0.14148501,
         0.97512187,  1.11464086],
       [ 0.36308441, -0.62928752, -0.08333333,  0.6588584 , -0.45207357,
        -0.20395009, -0.45860447, -0.31810451,  0.98315749, -1.3732369 ,
        -0.76364966, -0.89714996],
       [-0.02074768, -0.62928752, -0.08333333,  0.52525548, -0.05707377,
         0.98387036,  0.60191837,  0.72041314, -1.04773269, -0.14148501,
        -0.76364966, -0.89714996],
       [-0.78841186,  2.06945551,  2.08333333,  0.00992995,  1.00638721,
        -1.04241159, -1.21612078,  0.02806804,  0.26637272,  1.09026688,
        -2.50242119,  1.11464086],
       [ 0.7469165 , -1.16903612,  0.41666667,  0.43936789, -1.30284235,
        -1.46164234, -1.67063057, -1.35662215,  1.69994226,  0.47439093,
        -0.76364966,  1.11464086]])

In [76]:
preds = best_ridge.predict(xnew_pre_sel)
preds

array([3300.2903705 , 2664.92140017, 3109.36913408, 3328.45984932,
       2277.14963959])

In [77]:
# Attach the predictions, rounded to 2 decimals, as a new "Weight" column.
# This modifies xnew in place.
xnew["Weight"] = preds.round(2)

In [78]:
xnew

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make,Weight
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,6,190,106,65,37,31.0,17.0,non-USA,Audi 100,3300.29
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird,2664.92
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina,3109.37
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,2,169,96,69,37,,,non-USA,Mazda RX-7,3328.46
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox,2277.15


In [79]:
xnew.to_csv("Results.csv", index=False)

# Save the model

In [80]:
import joblib

# Persist the fitted preprocessor. Inference in this notebook applies
# pre.transform, then sel.transform, then the ridge model's predict, so the
# preprocessor, the selector and the model must all be saved and reloaded together.
joblib.dump(pre, "pre.joblib")

['pre.joblib']

In [81]:
joblib.dump(sel, "sel.joblib")

['sel.joblib']

In [82]:
joblib.dump(best_ridge, "ridge_model.joblib")

['ridge_model.joblib']

In [83]:
# Reload the saved preprocessor and display it to confirm the file can be read back.
# joblib files are pickle-based: only load artifacts that come from a trusted source.
p = joblib.load("pre.joblib")
p

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [84]:
s = joblib.load("sel.joblib")
s

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [85]:
m = joblib.load("ridge_model.joblib")
m

0,1,2
,alpha,1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42
