# Pipeline

1. num_pipe -> SimpleImputer (mean, median), StandardScaler
2. cat_pipe -> SimpleImputer (most_frequent) , OneHotEncoder 

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

# Step 1 - Data Ingestion

In [3]:
import pandas as pd
df = pd.read_csv("Cars93.csv", keep_default_na=False, na_values=["", "NA"])
df.head()

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


# Try to estimate Weight of car based on other factors

# Step 2 - Perform basic data quality checks

In [4]:
df.shape

(94, 28)

In [6]:
df.duplicated().sum()

np.int64(1)

In [7]:
df = df.drop_duplicates(keep="first").reset_index(drop=True)
df.shape

(93, 28)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  93 non-null     int64  
 1   Manufacturer        93 non-null     object 
 2   Model               93 non-null     object 
 3   Type                93 non-null     object 
 4   Min.Price           93 non-null     float64
 5   Price               93 non-null     float64
 6   Max.Price           93 non-null     float64
 7   MPG.city            93 non-null     int64  
 8   MPG.highway         93 non-null     int64  
 9   AirBags             89 non-null     object 
 10  DriveTrain          93 non-null     object 
 11  Cylinders           93 non-null     object 
 12  EngineSize          93 non-null     float64
 13  Horsepower          93 non-null     int64  
 14  RPM                 93 non-null     int64  
 15  Rev.per.mile        93 non-null     int64  
 16  Man.trans.

In [9]:
df.isna().sum()

id                     0
Manufacturer           0
Model                  0
Type                   0
Min.Price              0
Price                  0
Max.Price              0
MPG.city               0
MPG.highway            0
AirBags                4
DriveTrain             0
Cylinders              0
EngineSize             0
Horsepower             0
RPM                    0
Rev.per.mile           0
Man.trans.avail        0
Fuel.tank.capacity     0
Passengers             0
Length                 0
Wheelbase              0
Width                  0
Turn.circle            0
Rear.seat.room         2
Luggage.room          11
Weight                 0
Origin                 0
Make                   0
dtype: int64

In [11]:
df.select_dtypes(include = "object").nunique() / len(df)

Manufacturer       0.344086
Model              1.000000
Type               0.064516
AirBags            0.032258
DriveTrain         0.032258
Cylinders          0.064516
Man.trans.avail    0.021505
Origin             0.021505
Make               1.000000
dtype: float64

In [12]:
df = df.drop(columns = ["Model", "Make"])

# Step 3 - Seperate X and Y(Weight)

In [13]:
X = df.drop(columns = ["id", "Weight"])
Y = df["Weight"]

In [15]:
X.head()

Unnamed: 0,Manufacturer,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,Cylinders,...,Man.trans.avail,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin
0,Acura,Small,12.9,15.9,18.8,25,31,,Front,4,...,Yes,13.2,5,177,102,68,37,26.5,11.0,non-USA
1,Acura,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,6,...,Yes,18.0,5,195,115,71,38,30.0,15.0,non-USA
2,Audi,Compact,25.9,29.1,32.3,20,26,Driver only,Front,6,...,Yes,16.9,5,180,102,67,37,28.0,14.0,non-USA
3,Audi,Midsize,30.8,37.7,44.6,19,26,,Front,6,...,Yes,21.1,6,193,106,70,37,31.0,17.0,non-USA
4,BMW,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,4,...,Yes,21.1,4,186,109,69,39,27.0,13.0,non-USA


In [16]:
Y.head()

0    2705
1    3560
2    3375
3    3405
4    3640
Name: Weight, dtype: int64

# Step 4 - Apply Train Test Split

In [17]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

In [18]:
xtrain.head()

Unnamed: 0,Manufacturer,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,Cylinders,...,Man.trans.avail,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin
65,Nissan,Van,16.7,19.1,21.5,17,23,,Front,6,...,No,20.0,7,190,112,74,41,27.0,,non-USA
15,Chevrolet,Van,14.7,16.3,18.0,18,23,,Front,6,...,No,20.0,7,178,110,74,44,30.5,,USA
68,Oldsmobile,Midsize,14.2,16.3,18.4,23,31,Driver only,Front,4,...,No,16.5,5,190,105,70,42,28.0,16.0,USA
78,Saturn,Small,9.2,11.1,12.9,28,38,Driver only,Front,4,...,Yes,12.8,5,176,102,68,40,26.5,12.0,USA
30,Ford,Small,6.9,7.4,7.9,31,33,,Front,4,...,Yes,10.0,4,141,90,63,33,26.0,12.0,USA


In [19]:
ytrain.head()

65    4100
15    3715
68    2890
78    2495
30    1845
Name: Weight, dtype: int64

In [20]:
xtest.head()

Unnamed: 0,Manufacturer,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,Cylinders,...,Man.trans.avail,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin
40,Honda,Sporty,17.0,19.8,22.7,24,31,Driver & Passenger,Front,4,...,Yes,15.9,4,175,100,70,39,23.5,8.0,non-USA
22,Dodge,Small,7.9,9.2,10.6,29,33,,Front,4,...,Yes,13.2,5,174,98,66,32,26.5,11.0,USA
55,Mazda,Van,16.6,19.1,21.7,18,24,,4WD,6,...,No,19.6,7,190,110,72,39,27.5,,non-USA
72,Pontiac,Small,8.2,9.0,9.9,31,41,,Front,4,...,Yes,13.2,4,177,99,66,35,25.5,17.0,USA
0,Acura,Small,12.9,15.9,18.8,25,31,,Front,4,...,Yes,13.2,5,177,102,68,37,26.5,11.0,non-USA


In [21]:
ytest.head()

40    2865
22    2270
55    3735
72    2350
0     2705
Name: Weight, dtype: int64

In [22]:
xtrain.shape

(74, 24)

In [23]:
xtest.shape

(19, 24)

# Step 5 - Create the pipeline on X

In [25]:
cat_cols = X.select_dtypes(include="object").columns.tolist()
cat_cols

['Manufacturer',
 'Type',
 'AirBags',
 'DriveTrain',
 'Cylinders',
 'Man.trans.avail',
 'Origin']

In [26]:
num_cols = X.select_dtypes(include="number").columns.tolist()
num_cols

['Min.Price',
 'Price',
 'Max.Price',
 'MPG.city',
 'MPG.highway',
 'EngineSize',
 'Horsepower',
 'RPM',
 'Rev.per.mile',
 'Fuel.tank.capacity',
 'Passengers',
 'Length',
 'Wheelbase',
 'Width',
 'Turn.circle',
 'Rear.seat.room',
 'Luggage.room']

In [24]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [29]:
num_pipe = make_pipeline(
    SimpleImputer(strategy="mean"), # Replace missing values by mean,
    StandardScaler() # Scale the data 
)

In [30]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"), # Replace missing values by mode
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
)

In [31]:
pre = ColumnTransformer(
    [
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ]
).set_output(transform="pandas")

In [32]:
pre.fit(xtrain)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [33]:
xtrain_pre = pre.transform(xtrain)
xtrain_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__AirBags_None,cat__DriveTrain_Front,cat__DriveTrain_Rear,cat__Cylinders_4,cat__Cylinders_5,cat__Cylinders_6,cat__Cylinders_8,cat__Cylinders_rotary,cat__Man.trans.avail_Yes,cat__Origin_non-USA
65,-0.04529,-0.042952,-0.039482,-0.926566,-1.134441,0.307021,0.205378,-0.666653,-0.427741,1.004804,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
15,-0.27966,-0.3357,-0.355094,-0.74985,-1.134441,1.093845,0.603669,-0.666653,-1.218799,1.004804,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
68,-0.338253,-0.3357,-0.319024,0.133731,0.378147,-0.479803,-0.654093,0.006826,0.627002,-0.063961,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
78,-0.924178,-0.879377,-0.814985,1.017312,1.701662,-0.774863,-1.178161,-0.329914,-0.258982,-1.193798,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
30,-1.193703,-1.266224,-1.265859,1.547461,0.756294,-1.364981,-1.639341,-0.329914,1.861052,-2.04881,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [34]:
xtest_pre = pre.transform(xtest)
xtest_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__AirBags_None,cat__DriveTrain_Front,cat__DriveTrain_Rear,cat__Cylinders_4,cat__Cylinders_5,cat__Cylinders_6,cat__Cylinders_8,cat__Cylinders_rotary,cat__Man.trans.avail_Yes,cat__Origin_non-USA
40,-0.010135,0.030236,0.068728,0.310447,0.378147,-0.38145,0.394042,1.017044,1.238753,-0.247178,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
22,-1.076518,-1.078028,-1.022387,1.194028,0.756294,-1.168275,-1.031422,1.353784,2.145833,-1.071653,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
55,-0.057009,-0.042952,-0.021447,-0.74985,-0.945368,0.307021,0.289229,-0.329914,-0.058581,0.882659,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
72,-1.041363,-1.098939,-1.08551,1.547461,2.268882,-1.069922,-1.408751,0.680305,1.818862,-1.071653,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
0,-0.490593,-0.377522,-0.282954,0.487164,0.378147,-0.873216,-0.025212,1.858893,1.312585,-1.071653,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


# Step 6 - Model building

In [36]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(xtrain_pre, ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [38]:
model.intercept_

np.float64(3406.0829917621913)

In [37]:
model.coef_

array([ 1451.98593536, -2907.19838271,  1762.03881281,  -224.51313847,
         -32.88694122,   -31.21156302,   221.61570431,   -95.15238653,
         -45.01364027,  -145.81534813,   109.31669442,   -11.46791352,
          83.76378683,   201.14314408,   -14.69248882,   -57.27261228,
          70.51537822,   -22.32984663,  -150.78674523,  -421.62553168,
          40.7142687 ,   394.90719852,   105.51916383,    29.91341113,
         134.33140622,   -49.24084954,   526.56950633,   534.75141963,
         218.15571189,  -385.77357541,    88.04916735,  -282.77543861,
          12.08302666,  -783.7464714 ,    -4.34194821,    -4.40759825,
         155.22267216,   -69.28961866,   263.44057463,   -15.39711597,
        -299.50474669,   -36.78835142,    32.21761144,   391.6748307 ,
         188.25156458,   121.11354468,  -185.48799686,  -306.63657691,
        -177.83896961,    49.94136466,  -115.97415566,  -110.52962506,
         204.1889185 ,   191.82187309,   -87.41897318,    48.02542955,
      

# Step 7 - Model evaluation

In [39]:
model.score(xtrain_pre, ytrain)

0.9962358571591687

In [40]:
model.score(xtest_pre, ytest)

0.8203969231394308

In [41]:
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score
)

def evaluate_model(model, x, y):
    ypred = model.predict(x)
    rmse = root_mean_squared_error(y, ypred)
    mae = mean_absolute_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    r2 = r2_score(y, ypred)
    print(f"RMSE : {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"MAPE : {mape:.2%}")
    print(f"R2 : {r2:.2%}")

In [42]:
print("Train Results : ")
evaluate_model(model, xtrain_pre, ytrain)

Train Results : 
RMSE : 36.08
MAE : 25.50
MAPE : 0.82%
R2 : 99.62%


In [43]:
print("Test Results : ")
evaluate_model(model, xtest_pre, ytest)

Test Results : 
RMSE : 243.52
MAE : 194.49
MAPE : 6.60%
R2 : 82.04%


# Step 8 Inference stage - Out of sample prediction

In [45]:
xnew = pd.read_csv("sample.csv")
xnew

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,15.0,6,190,106,65,37,31.0,17.0,non-USA,Audi 100
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,15.2,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,16.5,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,20.0,2,169,96,69,37,,,non-USA,Mazda RX-7
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,12.4,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox


In [46]:
pre

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [47]:
xnew_pre = pre.transform(xnew)
xnew_pre

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__AirBags_None,cat__DriveTrain_Front,cat__DriveTrain_Rear,cat__Cylinders_4,cat__Cylinders_5,cat__Cylinders_6,cat__Cylinders_8,cat__Cylinders_rotary,cat__Man.trans.avail_Yes,cat__Origin_non-USA
0,1.607018,1.901737,2.043556,-0.573134,-0.567221,0.110315,0.645595,0.511935,0.563718,-0.522003,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,-0.900741,-0.879377,-0.824003,0.133731,0.378147,-0.676509,-0.654093,0.006826,0.837951,-0.460931,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.432001,-0.377522,-0.319024,-0.219701,0.0,-0.479803,-0.654093,0.006826,0.690287,-0.063961,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.806232,1.358061,0.952441,-0.926566,-0.756294,-1.364981,2.3855,2.195633,0.120725,1.004804,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,-0.98277,-1.088483,-1.12158,0.487164,0.756294,-0.873216,-1.262012,0.511935,0.59536,-1.315942,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [48]:
preds = model.predict(xnew_pre)
preds

array([3481.81409757, 2590.90242693, 3208.51815944, 2895.        ,
       2255.44985   ])

In [49]:
xnew["Weight_pred"] = preds.round(2)

In [50]:
xnew.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make,Weight_pred
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,6,190,106,65,37,31.0,17.0,non-USA,Audi 100,3481.81
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird,2590.9
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina,3208.52
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,2,169,96,69,37,,,non-USA,Mazda RX-7,2895.0
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox,2255.45


In [51]:
xnew.to_csv("res.csv", index=False)

# Step 9 - Save the model object

In [52]:
import joblib

joblib.dump(model, "model.joblib")

['model.joblib']

In [53]:
joblib.dump(pre, "pre.joblib")

['pre.joblib']

# Joblib load

In [54]:
m = joblib.load("model.joblib")
m

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [55]:
p = joblib.load("pre.joblib")
p

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'
