# Pipeline in sklearn
pipelines are used to combine multiples steps

In [1]:
from warnings import filterwarnings

filterwarnings("ignore")

# Step 1 - Data Ingestion

In [2]:
import pandas as pd

df = pd.read_csv("Cars93.csv", na_values=["", "NA"], keep_default_na=False)
df.head()

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


# Predict the weights of cars based on other columnst
# Target Feature - Weight

# Step 2 - Perform basic data quality checks

In [3]:
df.shape

(94, 28)

In [4]:
# check for row duplicates
df.duplicated().sum()

np.int64(1)

In [5]:
# drop duplicate
df = df.drop_duplicates(keep="first").reset_index(drop=True)
df.shape

(93, 28)

In [7]:
# check missing values
m = df.isna().sum()
m

id                     0
Manufacturer           0
Model                  0
Type                   0
Min.Price              0
Price                  0
Max.Price              0
MPG.city               0
MPG.highway            0
AirBags                4
DriveTrain             0
Cylinders              0
EngineSize             0
Horsepower             0
RPM                    0
Rev.per.mile           0
Man.trans.avail        0
Fuel.tank.capacity     0
Passengers             0
Length                 0
Wheelbase              0
Width                  0
Turn.circle            0
Rear.seat.room         2
Luggage.room          11
Weight                 0
Origin                 0
Make                   0
dtype: int64

In [8]:
m[m > 0]

AirBags            4
Rear.seat.room     2
Luggage.room      11
dtype: int64

# Step 3 - Seperate X and Y(Weight)

In [9]:
X = df.drop(columns=["id", "Weight"])
Y = df["Weight"]

In [10]:
X.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25,31,,Front,...,13.2,5,177,102,68,37,26.5,11.0,non-USA,Acura Integra
1,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,...,18.0,5,195,115,71,38,30.0,15.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,Front,...,16.9,5,180,102,67,37,28.0,14.0,non-USA,Audi 90
3,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,21.1,6,193,106,70,37,31.0,17.0,non-USA,Audi 100
4,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,...,21.1,4,186,109,69,39,27.0,13.0,non-USA,BMW 535i


In [11]:
Y.head()

0    2705
1    3560
2    3375
3    3405
4    3640
Name: Weight, dtype: int64

In [16]:
X.select_dtypes(include="object").nunique()

Manufacturer       32
Model              93
Type                6
AirBags             3
DriveTrain          3
Cylinders           6
Man.trans.avail     2
Origin              2
Make               93
dtype: int64

In [14]:
def get_high_card_cols(df: pd.DataFrame) -> list[str]:
    cat_unique = df.select_dtypes(include="object").nunique()
    card = cat_unique / len(df)
    high_card = card[card >= 0.9]
    print(high_card)
    high_cols = high_card.index.tolist()
    return high_cols

In [15]:
high_card_cols = get_high_card_cols(X)
print(high_card_cols)

Model    1.0
Make     1.0
dtype: float64
['Model', 'Make']


In [17]:
X = X.drop(columns=high_card_cols)
X.head()

Unnamed: 0,Manufacturer,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,Cylinders,...,Man.trans.avail,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin
0,Acura,Small,12.9,15.9,18.8,25,31,,Front,4,...,Yes,13.2,5,177,102,68,37,26.5,11.0,non-USA
1,Acura,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,6,...,Yes,18.0,5,195,115,71,38,30.0,15.0,non-USA
2,Audi,Compact,25.9,29.1,32.3,20,26,Driver only,Front,6,...,Yes,16.9,5,180,102,67,37,28.0,14.0,non-USA
3,Audi,Midsize,30.8,37.7,44.6,19,26,,Front,6,...,Yes,21.1,6,193,106,70,37,31.0,17.0,non-USA
4,BMW,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,4,...,Yes,21.1,4,186,109,69,39,27.0,13.0,non-USA


# Step 4 - Apply preprocessing pipeline on X

In [19]:
num_cols = X.select_dtypes(include="number").columns.tolist()
print(num_cols)

['Min.Price', 'Price', 'Max.Price', 'MPG.city', 'MPG.highway', 'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile', 'Fuel.tank.capacity', 'Passengers', 'Length', 'Wheelbase', 'Width', 'Turn.circle', 'Rear.seat.room', 'Luggage.room']


In [20]:
cat_cols = X.select_dtypes(include="object").columns.tolist()
print(cat_cols)

['Manufacturer', 'Type', 'AirBags', 'DriveTrain', 'Cylinders', 'Man.trans.avail', 'Origin']


In [18]:
from sklearn.pipeline import make_pipeline  # To combine multiple steps
from sklearn.impute import SimpleImputer  # To replaces missing values
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # Preprocessing
from sklearn.compose import ColumnTransformer  # To combine multiple pipelines

In [None]:
num_pipe = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

In [None]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first"),
)

In [None]:
pre = ColumnTransformer(
    [
        ("num", num_pipe, num_cols), 
        ("cat", cat_pipe, cat_cols)
    ]
).set_output(transform="pandas")

pre.fit(X)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [26]:
# Transform data using above pipeline
X_pre = pre.transform(X)
X_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__AirBags_None,cat__DriveTrain_Front,cat__DriveTrain_Rear,cat__Cylinders_4,cat__Cylinders_5,cat__Cylinders_6,cat__Cylinders_8,cat__Cylinders_rotary,cat__Man.trans.avail_Yes,cat__Origin_non-USA
0,-0.485787,-0.37572,-0.282465,0.471312,0.360925,-0.841022,-0.073484,1.717489,1.12953,-1.062184,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1.388017,1.497844,1.531409,-0.781032,-0.770514,0.515869,1.078322,0.369586,0.005661,0.409445,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
2,1.008658,0.998227,0.948052,-0.423219,-0.581941,0.128186,0.540813,0.369586,-0.105713,0.072197,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,1.359872,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
4,0.755752,1.091905,1.303535,-0.065407,0.172352,0.806631,1.231897,0.706562,0.430909,1.359872,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


# Step 5 - Train Test split
test size = 0.2 to 0.4

In [78]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    X_pre, Y, test_size=0.2, random_state=21
)

In [79]:
xtrain.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__AirBags_None,cat__DriveTrain_Front,cat__DriveTrain_Rear,cat__Cylinders_4,cat__Cylinders_5,cat__Cylinders_6,cat__Cylinders_8,cat__Cylinders_rotary,cat__Man.trans.avail_Yes,cat__Origin_non-USA
57,1.365026,1.28967,1.185041,-0.423219,-0.016221,-0.356418,-0.265452,-0.304365,0.18791,-0.663618,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
31,-1.003095,-0.979424,-0.911397,0.1135,0.172352,-0.841022,-0.323043,2.054464,0.157535,-1.062184,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
62,0.606307,0.685966,0.729294,-0.781032,-0.959087,0.322027,1.116716,1.212025,-0.247462,0.716035,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
29,0.043016,-0.021825,-0.063707,-0.423219,-0.204794,0.806631,1.347077,0.87505,-0.71321,0.409445,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
51,1.985795,1.726835,1.449374,-0.781032,-0.581941,1.872759,1.27029,-1.146804,-0.996708,1.022624,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [73]:
ytrain.head()

57    2920
31    2530
62    3730
29    3490
51    4055
Name: Weight, dtype: int64

In [74]:
xtest.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__AirBags_None,cat__DriveTrain_Front,cat__DriveTrain_Rear,cat__Cylinders_4,cat__Cylinders_5,cat__Cylinders_6,cat__Cylinders_8,cat__Cylinders_rotary,cat__Man.trans.avail_Yes,cat__Origin_non-USA
23,-1.003095,-0.85452,-0.701753,0.1135,-0.016221,-0.453339,-0.975733,-0.809828,0.532158,-0.816912,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
86,0.203957,0.332071,0.4285,-0.781032,-1.336233,-0.259498,-0.111878,-0.472853,0.370159,0.961306,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
91,0.537333,0.332071,0.145937,-0.244313,-0.204794,-0.356418,-0.572601,0.201098,-0.237337,-0.265051,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
21,1.422504,1.039862,0.692834,-0.423219,-0.581941,0.612789,0.060893,-0.809828,-1.108083,-0.203734,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
17,0.100495,-0.073868,-0.209546,-0.959938,-0.581941,2.260442,0.502419,-1.820755,-1.988953,1.942392,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [75]:
ytest.head()

23    2670
86    3785
91    2985
21    3570
17    3910
Name: Weight, dtype: int64

In [76]:
xtrain.shape

(74, 64)

In [77]:
xtest.shape

(19, 64)

# Step 6 - Model Building

In [80]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(xtrain, ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [81]:
model.intercept_

np.float64(3523.015378962659)

In [82]:
model.coef_

array([ 7.12212393e+03, -1.54355635e+04,  8.86749716e+03,  5.00676137e+00,
       -1.96248452e+02, -1.72195953e+02,  2.18990416e+02, -1.01240272e+02,
       -9.82918419e+01, -7.15652573e+01, -1.87890179e+01,  1.23960452e+02,
        1.40245110e+02,  1.46630116e+02, -2.79198764e+01, -4.96161337e+01,
        1.23076470e+02, -2.00775222e+01,  4.04828046e+02, -1.98024742e+02,
       -6.92192730e+01,  1.69794756e+01,  0.00000000e+00, -1.23142521e+01,
       -3.38587130e+01, -2.33198096e+01,  1.19980040e+00,  3.27801322e+02,
        5.42679913e+01,  1.21813043e+02, -1.28351110e+02,  1.24625901e+02,
       -2.15006137e+02, -9.76514709e+01, -7.09903501e+02,  4.29717366e+00,
       -3.43057874e+01, -4.40566494e+01, -1.64937780e+02,  4.79656697e+02,
        6.99422932e+01, -4.54364613e+02,  1.36087964e+01,  1.19319453e+01,
        3.41060513e-13, -8.92584268e+01,  1.12873235e+02,  5.13064123e+01,
       -2.60204740e+02, -2.25080841e+02,  3.34158069e+01, -2.17820097e+02,
        2.23174657e+02,  

In [83]:
coefs = pd.Series(model.coef_, index=xtrain.columns)
coefs

num__Min.Price               7122.123931
num__Price                 -15435.563466
num__Max.Price               8867.497160
num__MPG.city                   5.006761
num__MPG.highway             -196.248452
                                ...     
cat__Cylinders_6             -504.389436
cat__Cylinders_8             -893.370279
cat__Cylinders_rotary       -1141.731531
cat__Man.trans.avail_Yes     -165.286524
cat__Origin_non-USA           130.996471
Length: 64, dtype: float64

In [84]:
coefs.sort_values(ascending=False)

num__Max.Price                      8867.497160
num__Min.Price                      7122.123931
cat__Manufacturer_Plymouth           479.656697
cat__Manufacturer_BMW                404.828046
cat__Manufacturer_Geo                327.801322
                                       ...     
cat__Cylinders_6                    -504.389436
cat__Manufacturer_Mercedes-Benz     -709.903501
cat__Cylinders_8                    -893.370279
cat__Cylinders_rotary              -1141.731531
num__Price                        -15435.563466
Length: 64, dtype: float64

# Step 7 - Model evaluation

In [86]:
# R2 score for train
r2_train = model.score(xtrain, ytrain)
print(f"R2 train : {r2_train:.2%}")

R2 train : 99.60%


In [87]:
# R2 score for test data (unseen to model)
r2_test = model.score(xtest, ytest)
print(f"R2 Test : {r2_test:.2%}")

R2 Test : 87.04%


In [88]:
# Generalization error
gen_err = abs(r2_train - r2_test)
print(f"Generalization error : {gen_err:.2%}")

Generalization error : 12.57%


In [89]:
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
)


def evaluate_model(model, x, y):
    ypred = model.predict(x)
    rmse = root_mean_squared_error(y, ypred)
    mae = mean_absolute_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    r2 = r2_score(y, ypred)
    print(f"RMSE : {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"MAPE : {mape:.2%}")
    print(f"R2 : {r2:.2%}")

In [90]:
print("Training result : ")
evaluate_model(model, xtrain, ytrain)

Training result : 
RMSE : 36.62
MAE : 24.38
MAPE : 0.84%
R2 : 99.60%


In [91]:
ypred_train = model.predict(xtrain)
ypred_train[0:5]

array([2908.56762846, 2528.78721545, 3624.9436603 , 3514.4374059 ,
       4043.48728307])

In [92]:
ytrain.head()

57    2920
31    2530
62    3730
29    3490
51    4055
Name: Weight, dtype: int64

In [93]:
print("Testing Results : ")
evaluate_model(model, xtest, ytest)

Testing Results : 
RMSE : 216.84
MAE : 175.60
MAPE : 5.42%
R2 : 87.04%


In [94]:
ypred_test = model.predict(xtest)

ypred_test[0:5]

array([2637.20122033, 3879.06895962, 3384.57157424, 3605.21495874,
       3705.20582955])

In [95]:
ytest.head()

23    2670
86    3785
91    2985
21    3570
17    3910
Name: Weight, dtype: int64

# From above i can conclude model is good and can be used for out of sample prediction

# Step  8 - Out of Sample prediction

In [97]:
xnew = pd.read_csv("sample.csv", na_values=["", "NA"], keep_default_na=False)
xnew

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,15.0,6,190,106,65,37,31.0,17.0,non-USA,Audi 100
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,15.2,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,16.5,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,20.0,2,169,96,69,37,,,non-USA,Mazda RX-7
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,12.4,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox


In [98]:
pre

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [99]:
xnew_pre = pre.transform(xnew)
xnew_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__AirBags_None,cat__DriveTrain_Front,cat__DriveTrain_Rear,cat__Cylinders_4,cat__Cylinders_5,cat__Cylinders_6,cat__Cylinders_8,cat__Cylinders_rotary,cat__Man.trans.avail_Yes,cat__Origin_non-USA
0,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,-0.510323,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,-0.888138,-0.875337,-0.829362,0.1135,0.360925,-0.647181,-0.649388,-0.135877,0.673908,-0.449005,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.428309,-0.37572,-0.318925,-0.244313,-0.016221,-0.453339,-0.649388,-0.135877,0.532158,-0.050439,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.767376,1.352122,0.966282,-0.959938,-0.770514,-1.325626,2.134145,2.054464,-0.014589,1.022624,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,-0.968608,-1.083511,-1.130155,0.471312,0.738071,-0.841022,-1.206095,0.369586,0.441034,-1.307455,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


### Predict results for new cars

In [101]:
preds = model.predict(xnew_pre)
preds

array([3429.56782691, 2665.76676634, 3197.55853826, 2895.        ,
       2195.19178545])

In [102]:
xnew["Weight_pred"] = preds.round(2)
xnew

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make,Weight_pred
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,6,190,106,65,37,31.0,17.0,non-USA,Audi 100,3429.57
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird,2665.77
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina,3197.56
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,2,169,96,69,37,,,non-USA,Mazda RX-7,2895.0
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox,2195.19


In [103]:
xnew.to_csv("results.csv", index=False)

# Step 9 - Saving preprocessor and model object

In [104]:
pre

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [105]:
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [106]:
import joblib

joblib.dump(pre, "pre.joblib")

['pre.joblib']

In [107]:
joblib.dump(model, "weight_model.joblib")

['weight_model.joblib']

# Load the pre and model

In [108]:
p = joblib.load("pre.joblib")
p

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [109]:
m = joblib.load("weight_model.joblib")
m

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [110]:
m.score(xtrain, ytrain)

0.9960137382218888

In [111]:
m.score(xtest, ytest)

0.8703581386679461