# Forward selection

![image.png](attachment:image.png)

### Feature selection is done to avoid overfitting in data

In [1]:
from warnings import filterwarnings

filterwarnings("ignore")

# Step 1 - Read the dataset

In [2]:
import pandas as pd
df = pd.read_csv("Cars93.csv", keep_default_na=False, na_values=["", "NA"])
df.head()

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


# Step 2 - Perform basic data quality checks

In [3]:
df.shape

(94, 28)

In [4]:
df.duplicated().sum()

np.int64(1)

In [5]:
df = df.drop_duplicates(keep="first").reset_index(drop="first")
df.head()

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


In [6]:
m = df.isna().sum()
m[m > 0]

AirBags            4
Rear.seat.room     2
Luggage.room      11
dtype: int64

# Step 3 - Seperate X and Y(Weight)

In [7]:
X = df.drop(columns = ["id", "Weight"])
Y = df["Weight"]

In [8]:
X.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25,31,,Front,...,13.2,5,177,102,68,37,26.5,11.0,non-USA,Acura Integra
1,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,...,18.0,5,195,115,71,38,30.0,15.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,Front,...,16.9,5,180,102,67,37,28.0,14.0,non-USA,Audi 90
3,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,21.1,6,193,106,70,37,31.0,17.0,non-USA,Audi 100
4,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,...,21.1,4,186,109,69,39,27.0,13.0,non-USA,BMW 535i


In [9]:
Y.head()

0    2705
1    3560
2    3375
3    3405
4    3640
Name: Weight, dtype: int64

In [10]:
X.select_dtypes(include="object").nunique() / len(df)

Manufacturer       0.344086
Model              1.000000
Type               0.064516
AirBags            0.032258
DriveTrain         0.032258
Cylinders          0.064516
Man.trans.avail    0.021505
Origin             0.021505
Make               1.000000
dtype: float64

In [11]:
X = X.drop(columns = ["Model", "Make"])
X.head()

Unnamed: 0,Manufacturer,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,Cylinders,...,Man.trans.avail,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin
0,Acura,Small,12.9,15.9,18.8,25,31,,Front,4,...,Yes,13.2,5,177,102,68,37,26.5,11.0,non-USA
1,Acura,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,6,...,Yes,18.0,5,195,115,71,38,30.0,15.0,non-USA
2,Audi,Compact,25.9,29.1,32.3,20,26,Driver only,Front,6,...,Yes,16.9,5,180,102,67,37,28.0,14.0,non-USA
3,Audi,Midsize,30.8,37.7,44.6,19,26,,Front,6,...,Yes,21.1,6,193,106,70,37,31.0,17.0,non-USA
4,BMW,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,4,...,Yes,21.1,4,186,109,69,39,27.0,13.0,non-USA


# Step 4 - Apply train test split

In [12]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=10)

In [13]:
xtrain.head()

Unnamed: 0,Manufacturer,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,Cylinders,...,Man.trans.avail,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin
42,Honda,Compact,13.8,17.5,21.2,24,31,Driver & Passenger,Front,4,...,Yes,17.0,4,185,107,67,41,28.0,14.0,non-USA
53,Mazda,Small,10.9,11.6,12.3,28,36,,Front,4,...,Yes,14.5,5,172,98,66,36,26.5,13.0,non-USA
21,Chrysler,Large,29.5,29.5,29.5,20,26,,Front,6,...,No,16.0,6,203,110,69,44,36.0,17.0,USA
6,Buick,Large,19.9,20.8,21.7,19,28,Driver only,Front,6,...,No,18.0,6,200,111,74,42,30.5,17.0,USA
26,Dodge,Midsize,14.8,15.6,16.4,21,27,Driver only,Front,4,...,No,16.0,6,192,105,69,42,30.5,16.0,USA


In [14]:
ytrain.head()

42    3040
53    2440
21    3570
6     3470
26    3080
Name: Weight, dtype: int64

In [15]:
xtest.head()

Unnamed: 0,Manufacturer,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,Cylinders,...,Man.trans.avail,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin
34,Ford,Sporty,12.8,14.0,15.2,24,30,Driver only,Front,4,...,Yes,15.5,4,179,103,70,38,23.0,18.0,USA
90,Volkswagen,Sporty,22.9,23.3,23.7,18,25,,Front,6,...,Yes,18.5,4,159,97,66,36,26.0,15.0,non-USA
3,Audi,Midsize,30.8,37.7,44.6,19,26,,Front,6,...,Yes,21.1,6,193,106,70,37,31.0,17.0,non-USA
35,Ford,Van,14.5,19.9,25.3,15,20,Driver only,4WD,6,...,Yes,21.0,7,176,119,72,45,30.0,,USA
19,Chrylser,Large,18.4,18.4,18.4,20,28,Driver & Passenger,Front,6,...,No,18.0,6,203,113,74,40,31.0,15.0,USA


In [16]:
ytest.head()

34    2710
90    2810
3     3405
35    3735
19    3515
Name: Weight, dtype: int64

In [17]:
xtrain.shape

(74, 24)

In [18]:
xtest.shape

(19, 24)

# Step 5 - Apply Preprocessing on X

In [19]:
cat_cols = xtrain.select_dtypes(include="object").columns.tolist()
print(cat_cols)

['Manufacturer', 'Type', 'AirBags', 'DriveTrain', 'Cylinders', 'Man.trans.avail', 'Origin']


In [20]:
num_cols = xtrain.select_dtypes(include="number").columns.tolist()
print(num_cols)

['Min.Price', 'Price', 'Max.Price', 'MPG.city', 'MPG.highway', 'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile', 'Fuel.tank.capacity', 'Passengers', 'Length', 'Wheelbase', 'Width', 'Turn.circle', 'Rear.seat.room', 'Luggage.room']


In [21]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [22]:
num_pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

In [23]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    StandardScaler()
)

In [24]:
pre = ColumnTransformer(
    [
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ]
).set_output(transform = "pandas")

In [25]:
pre.fit(xtrain)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [26]:
xtrain_pre = pre.transform(xtrain)
xtrain_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,num__Turn.circle,num__Rear.seat.room,num__Luggage.room,cat__Manufacturer,cat__Type,cat__AirBags,cat__DriveTrain,cat__Cylinders,cat__Man.trans.avail,cat__Origin
42,-0.373824,-0.208643,-0.071922,0.318308,0.363084,-0.499737,-0.070927,0.583333,0.553885,0.094849,...,0.554316,0.030727,0.028068,-0.330948,-1.373237,-1.720536,-0.180439,-0.76365,0.780189,1.114641
53,-0.699007,-0.802086,-0.851002,1.066078,1.322665,-0.8714,-0.759572,0.416667,-0.190474,-0.664766,...,-0.987766,-0.456514,-0.318105,0.266373,0.474391,1.173093,-0.180439,-0.76365,0.780189,1.114641
21,1.38665,0.99836,0.654635,-0.429463,-0.596496,0.522338,0.059357,-0.75,-1.020721,-0.208997,...,1.479565,2.629344,1.066586,-0.928269,-0.757361,-0.273722,-0.180439,0.975122,-1.28174,-0.89715
6,0.310182,0.123283,-0.028154,-0.616405,-0.212664,0.986917,0.487434,-0.75,-1.431073,0.398695,...,0.862732,0.842795,1.066586,-1.286661,-0.757361,-0.273722,-0.180439,0.975122,-1.28174,-0.89715
26,-0.261692,-0.399752,-0.4921,-0.24252,-0.40458,-0.220989,-0.815408,-0.75,0.277136,-0.208997,...,0.862732,0.842795,0.720413,-0.808804,-0.141485,-0.273722,-0.180439,-0.76365,-1.28174,-0.89715


In [27]:
xtest_pre = pre.transform(xtest)
xtest_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,num__Turn.circle,num__Rear.seat.room,num__Luggage.room,cat__Manufacturer,cat__Type,cat__AirBags,cat__DriveTrain,cat__Cylinders,cat__Man.trans.avail,cat__Origin
34,-0.485956,-0.560685,-0.597145,0.318308,0.171168,-0.685568,-0.536227,0.416667,0.038559,-0.36092,...,-0.370933,-1.593409,1.412758,-0.569876,1.090267,-0.273722,-0.180439,-0.76365,0.780189,-0.89715
90,0.646578,0.374742,0.14692,-0.803348,-0.788412,0.057759,0.63633,0.916667,0.124447,0.550618,...,-0.987766,-0.618927,0.374241,1.699942,1.090267,1.173093,-0.180439,0.975122,0.780189,1.114641
3,1.532421,1.823145,1.976445,-0.616405,-0.596496,0.057759,0.524658,0.416667,0.410739,1.340618,...,-0.67935,1.005208,1.066586,-1.525589,-0.141485,-0.273722,-0.180439,0.975122,0.780189,1.114641
35,-0.295332,0.032758,0.28698,-1.364176,-1.747992,0.24359,0.022133,-0.75,-0.45768,1.310233,...,1.787981,0.680381,0.028068,-0.569876,1.706143,-0.273722,-2.087934,0.975122,0.780189,-0.89715
19,0.141983,-0.118118,-0.317026,-0.429463,-0.212664,0.522338,0.171029,0.083333,-0.629455,0.398695,...,0.2459,1.005208,0.374241,-1.764517,-0.757361,-1.720536,-0.180439,0.975122,-1.28174,-0.89715


# Step 6 - Apply the Linear regression

In [28]:
from sklearn.linear_model import LinearRegression

model1 = LinearRegression()
model1.fit(xtrain_pre, ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [29]:
train_r2 = model1.score(xtrain_pre, ytrain)
test_r2 = model1.score(xtest_pre, ytest)
gen_err = abs(train_r2 - test_r2)
print(f"Train R2 : {train_r2:.2%}")
print(f"Test R2 : {test_r2:.2%}")
print(f"Generalization Error : {gen_err:.2%}")

Train R2 : 97.25%
Test R2 : 82.17%
Generalization Error : 15.08%


# Step 7 - Backward elimination

In [30]:
from sklearn.feature_selection import SequentialFeatureSelector

base_model = LinearRegression()
sel = SequentialFeatureSelector(base_model, n_features_to_select="auto", direction="backward")
sel.fit(xtrain_pre, ytrain)

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'backward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [31]:
sel.get_feature_names_out()

array(['num__Min.Price', 'num__Price', 'num__MPG.highway',
       'num__EngineSize', 'num__Horsepower', 'num__RPM',
       'num__Rev.per.mile', 'num__Fuel.tank.capacity', 'num__Length',
       'num__Wheelbase', 'cat__Type', 'cat__Origin'], dtype=object)

In [32]:
xtrain_pre_sel = sel.transform(xtrain_pre)
xtrain_pre_sel[0:5]

array([[-0.37382411, -0.20864295,  0.36308441, -0.49973664, -0.07092689,
         0.58333333,  0.55388468,  0.09484922,  0.07553707,  0.4504151 ,
        -1.3732369 ,  1.11464086],
       [-0.69900714, -0.80208604,  1.32266464, -0.87140007, -0.75957166,
         0.41666667, -0.19047443, -0.66476576, -0.83279622, -0.91311426,
         0.47439093,  1.11464086],
       [ 1.38664954,  0.99835994, -0.59649582,  0.5223378 ,  0.05935726,
        -0.75      , -1.02072113, -0.20899677,  1.33322932,  0.90492489,
        -0.75736096, -0.89714996],
       [ 0.31018158,  0.12328284, -0.21266373,  0.98691709,  0.48743374,
        -0.75      , -1.43107295,  0.39869522,  1.12361394,  1.05642815,
        -0.75736096, -0.89714996],
       [-0.26169203, -0.39975175, -0.40457977, -0.22098907, -0.81540773,
        -0.75      ,  0.27713578, -0.20899677,  0.56463961,  0.14740858,
        -0.14148501, -0.89714996]])

In [33]:
xtest_pre_sel = sel.transform(xtest_pre)
xtest_pre_sel[0:5]

array([[-0.48595619, -0.56068546,  0.17116836, -0.68556836, -0.53622741,
         0.41666667,  0.03855914, -0.36091977, -0.34369368, -0.15559794,
         1.09026688, -0.89714996],
       [ 0.64657782,  0.37474178, -0.78841186,  0.05775851,  0.6363299 ,
         0.91666667,  0.12444673,  0.55061821, -1.7411295 , -1.06461752,
         1.09026688,  1.11464086],
       [ 1.53242124,  1.82314524, -0.59649582,  0.05775851,  0.52465778,
         0.41666667,  0.4107387 ,  1.3406178 ,  0.6345114 ,  0.29891184,
        -0.14148501,  1.11464086],
       [-0.29533165,  0.03275762, -1.74799209,  0.24359022,  0.02213321,
        -0.75      , -0.45768026,  1.3102332 , -0.55330905,  2.26845425,
         1.70614282, -0.89714996],
       [ 0.14198346, -0.11811774, -0.21266373,  0.5223378 ,  0.17102938,
         0.08333333, -0.62945544,  0.39869522,  1.33322932,  1.35943468,
        -0.75736096, -0.89714996]])

# Build the model on preprocessed and selected features

In [34]:
final_model = LinearRegression()
final_model.fit(xtrain_pre_sel, ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [35]:
train_r2 = final_model.score(xtrain_pre_sel, ytrain)
test_r2 = final_model.score(xtest_pre_sel, ytest)
gen_err = abs(train_r2 - test_r2)
print(f"Train R2 : {train_r2:.2%}")
print(f"Test R2 : {test_r2:.2%}")
print(f"Generalization Error : {gen_err:.2%}")

Train R2 : 96.92%
Test R2 : 84.06%
Generalization Error : 12.86%


# Evaluate model in detail

In [36]:
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score
)

def evaluate_model(model, x, y):
    ypred = model.predict(x)
    rmse = root_mean_squared_error(y, ypred)
    mae = mean_absolute_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    r2 = r2_score(y, ypred)
    print(f"RMSE : {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"MAPE : {mape:.2%}")
    print(f"R2 : {r2:.2%}")

In [37]:
print("Training Results : ")
evaluate_model(final_model, xtrain_pre_sel, ytrain)

Training Results : 
RMSE : 107.04
MAE : 88.49
MAPE : 2.98%
R2 : 96.92%


In [38]:
print("Testing Results : ")
evaluate_model(final_model, xtest_pre_sel, ytest)

Testing Results : 
RMSE : 192.42
MAE : 144.06
MAPE : 4.86%
R2 : 84.06%


# Out of sample prediction

In [39]:
xnew = pd.read_csv("sample.csv")
xnew.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,15.0,6,190,106,65,37,31.0,17.0,non-USA,Audi 100
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,15.2,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,16.5,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,20.0,2,169,96,69,37,,,non-USA,Mazda RX-7
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,12.4,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox


In [40]:
xnew_pre = pre.transform(xnew)
xnew_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,num__Turn.circle,num__Rear.seat.room,num__Luggage.room,cat__Manufacturer,cat__Type,cat__AirBags,cat__DriveTrain,cat__Cylinders,cat__Man.trans.avail,cat__Origin
0,1.532421,1.823145,1.976445,-0.616405,-0.596496,0.057759,0.524658,0.416667,0.410739,-0.512843,...,-0.67935,1.005208,1.066586,-1.525589,-0.141485,-0.273722,-0.180439,0.975122,0.780189,1.114641
1,-0.867205,-0.852378,-0.807234,0.131365,0.363084,-0.685568,-0.629288,-0.083333,0.658858,-0.452074,...,-0.062517,-0.943754,-0.318105,0.983157,-1.373237,-0.273722,-0.180439,-0.76365,0.780189,-0.89715
2,-0.418677,-0.369577,-0.317026,-0.24252,-0.020748,-0.499737,-0.629288,-0.083333,0.525255,-0.057074,...,0.2459,0.19314,0.720413,-1.047733,-0.141485,-0.273722,-0.180439,-0.76365,-1.28174,-0.89715
3,1.723046,1.300111,0.917246,-0.990291,-0.788412,-1.335979,2.069456,2.083333,0.00993,1.006387,...,-0.67935,-0.131687,0.028068,0.266373,1.090267,-0.273722,1.727057,-2.502421,0.780189,1.114641
4,-0.945698,-1.053545,-1.096106,0.50525,0.746917,-0.8714,-1.169036,0.416667,0.439368,-1.302842,...,-1.604599,-0.618927,-1.356622,1.699942,0.474391,-0.273722,-0.180439,-0.76365,0.780189,1.114641


In [41]:
xnew_pre_sel = sel.transform(xnew_pre)
xnew_pre_sel[0:5]

array([[ 1.53242124,  1.82314524, -0.59649582,  0.05775851,  0.52465778,
         0.41666667,  0.4107387 , -0.51284277,  0.42489603,  0.29891184,
        -0.14148501,  1.11464086],
       [-0.86720526, -0.85237783,  0.36308441, -0.68556836, -0.62928752,
        -0.08333333,  0.6588584 , -0.45207357, -0.20395009, -0.45860447,
        -1.3732369 , -0.89714996],
       [-0.41867694, -0.36957667, -0.02074768, -0.49973664, -0.62928752,
        -0.08333333,  0.52525548, -0.05707377,  0.98387036,  0.60191837,
        -0.14148501, -0.89714996],
       [ 1.72304578,  1.30011066, -0.78841186, -1.33597936,  2.06945551,
         2.08333333,  0.00992995,  1.00638721, -1.04241159, -1.21612078,
         1.09026688,  1.11464086],
       [-0.94569771, -1.05354498,  0.7469165 , -0.87140007, -1.16903612,
         0.41666667,  0.43936789, -1.30284235, -1.46164234, -1.67063057,
         0.47439093,  1.11464086]])

In [42]:
preds = final_model.predict(xnew_pre_sel)
preds[0:5]

array([3255.60214796, 2700.74236997, 3132.76680149, 3469.66709788,
       2222.19057096])

In [43]:
xnew["Weight_pred"] = preds.round(2)

In [44]:
xnew

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make,Weight_pred
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,6,190,106,65,37,31.0,17.0,non-USA,Audi 100,3255.6
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird,2700.74
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina,3132.77
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,2,169,96,69,37,,,non-USA,Mazda RX-7,3469.67
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox,2222.19


In [45]:
xnew.to_csv("backward_res.csv", index=False)

# Save the model object, preprocess and selector

In [46]:
import joblib

joblib.dump(pre, "pre.joblib")

['pre.joblib']

In [47]:
joblib.dump(sel, "backward_sel.joblib")

['backward_sel.joblib']

In [48]:
joblib.dump(final_model, "backward_model.joblib")

['backward_model.joblib']

# Load the model

In [49]:
p = joblib.load("pre.joblib")
p

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [50]:
s = joblib.load("forward_sel.joblib")
s

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [51]:
m = joblib.load("forward_model.joblib")
m

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False
