### Automated Pipeline with Scikit learn

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

### Step 1 : Read the Dataset

In [2]:
import pandas as pd
df = pd.read_csv("Cars93.csv",na_values=["","NA"],keep_default_na=False)
df.head()

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


### Step 2 : BASIC DATA QUALITY CHECK

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  93 non-null     int64  
 1   Manufacturer        93 non-null     object 
 2   Model               93 non-null     object 
 3   Type                93 non-null     object 
 4   Min.Price           93 non-null     float64
 5   Price               93 non-null     float64
 6   Max.Price           93 non-null     float64
 7   MPG.city            93 non-null     int64  
 8   MPG.highway         93 non-null     int64  
 9   AirBags             89 non-null     object 
 10  DriveTrain          93 non-null     object 
 11  Cylinders           93 non-null     object 
 12  EngineSize          93 non-null     float64
 13  Horsepower          93 non-null     int64  
 14  RPM                 93 non-null     int64  
 15  Rev.per.mile        93 non-null     int64  
 16  Man.trans.

In [5]:
m = df.isna().sum()
m[m>0]

AirBags            4
Rear.seat.room     2
Luggage.room      11
dtype: int64

In [6]:
df.duplicated().sum()

0

### Seperate X and Y

Price ~ Remaining features

In [8]:
x = df.drop(columns=["id","Price"])
y = df[["Price"]]

In [9]:
x.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,Cylinders,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,18.8,25,31,,Front,4,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,Acura,Legend,Midsize,29.2,38.7,18,25,Driver & Passenger,Front,6,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,Audi,90,Compact,25.9,32.3,20,26,Driver only,Front,6,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,Audi,100,Midsize,30.8,44.6,19,26,,Front,6,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,BMW,535i,Midsize,23.7,36.2,22,30,Driver only,Rear,4,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


In [10]:
y.head()

Unnamed: 0,Price
0,15.9
1,33.9
2,29.1
3,37.7
4,30.0


### Step 4 : Preprocessing Pipeline

In [11]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [18]:
cat = list(x.columns[x.dtypes == "object"])
con = list(x.columns[x.dtypes != "object"])


In [19]:
cat

['Manufacturer',
 'Model',
 'Type',
 'AirBags',
 'DriveTrain',
 'Cylinders',
 'Man.trans.avail',
 'Origin',
 'Make']

In [20]:
con

['Min.Price',
 'Max.Price',
 'MPG.city',
 'MPG.highway',
 'EngineSize',
 'Horsepower',
 'RPM',
 'Rev.per.mile',
 'Fuel.tank.capacity',
 'Passengers',
 'Length',
 'Wheelbase',
 'Width',
 'Turn.circle',
 'Rear.seat.room',
 'Luggage.room',
 'Weight']

In [21]:
num_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

In [23]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore",sparse_output=False)
)

In [25]:
pre = ColumnTransformer(
    [("num",num_pipe,con),
     ("cat",cat_pipe,cat)]
).set_output(transform="pandas")

In [26]:
x_pre = pre.fit_transform(x)
x_pre

Unnamed: 0,num__Min.Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,num__Passengers,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
0,-0.485787,-0.282465,0.471312,0.360925,-0.841022,-0.073484,1.717489,1.129530,-1.062184,-0.083243,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.388017,1.531409,-0.781032,-0.770514,0.515869,1.078322,0.369586,0.005661,0.409445,-0.083243,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.008658,0.948052,-0.423219,-0.581941,0.128186,0.540813,0.369586,-0.105713,0.072197,-0.083243,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.571949,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,1.359872,0.884457,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.755752,1.303535,-0.065407,0.172352,0.806631,1.231897,0.706562,0.430909,1.359872,-1.050944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,-0.060445,0.073018,-0.959938,-1.524806,-0.162577,-0.668585,-1.315292,1.180155,1.359872,1.852158,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
89,0.054512,0.045673,-0.244313,0.172352,-0.647181,-0.188665,0.875050,0.714407,0.562740,-0.083243,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
90,0.663786,0.164167,-0.781032,-0.770514,0.128186,0.655993,0.875050,0.106911,0.562740,-1.050944,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
91,0.537333,0.145937,-0.244313,-0.204794,-0.356418,-0.572601,0.201098,-0.237337,-0.265051,-0.083243,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Step 5 : Train Test Split

In [27]:
from sklearn.model_selection import train_test_split
xtrain ,xtest,ytrain ,ytest = train_test_split(x_pre,y,test_size=0.2,random_state=42)
xtrain.head()

Unnamed: 0,num__Min.Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,num__Passengers,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
65,-0.04895,-0.036362,-0.959938,-1.14766,0.322027,0.13768,-0.809828,-0.541086,1.022624,1.852158,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,-0.278864,-0.355385,-0.781032,-1.14766,1.097393,0.502419,-0.809828,-1.300456,1.022624,1.852158,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68,-0.336343,-0.318925,0.1135,0.360925,-0.453339,-0.649388,-0.135877,0.471409,-0.050439,-0.083243,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,-0.91113,-0.820247,1.008032,1.680937,-0.744101,-1.129307,-0.472853,-0.379087,-1.18482,-0.083243,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,-1.175531,-1.275994,1.544751,0.738071,-1.325626,-1.551637,-0.472853,1.656027,-2.04327,-1.050944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
ytrain.head()

Unnamed: 0,Price
65,19.1
15,16.3
68,16.3
78,11.1
30,7.4


### Step 6 : Model Building

In [30]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain,ytrain)

In [32]:
model.score(xtrain,ytrain)

1.0

In [33]:
model.intercept_

array([19.75520972])

In [34]:
model.coef_

array([[ 3.50240810e+00,  5.23680512e+00, -2.66693503e-01,
        -2.26007303e-03,  1.97505143e-01,  4.33611518e-01,
         5.03825987e-02,  1.69490345e-01,  3.88098558e-02,
        -3.14803664e-02,  1.00902194e-02,  1.82703660e-01,
        -4.48068622e-01, -7.19595282e-02,  1.02994587e-01,
         5.11811889e-02,  2.15910616e-02,  2.41990850e-02,
         1.71165390e-01,  3.71404666e-02, -5.72024738e-02,
         2.81994821e-01, -1.66145571e-01, -2.97747398e-02,
         6.94406205e-02, -2.32400277e-01, -2.80603257e-01,
        -9.65820017e-02,  7.74409331e-02, -8.04486535e-02,
        -3.02956852e-01,  2.07938503e-01,  3.81773879e-02,
         5.13581590e-01, -5.19908100e-02,  6.62922105e-01,
        -1.44450340e-01,  3.25574915e-02, -3.03356664e-01,
        -2.68109313e-02,  2.30097882e-01, -6.79338892e-02,
         7.51192060e-02,  1.46043427e-01, -1.30636923e-01,
        -6.73166256e-02, -4.94875296e-02, -1.31619597e-01,
         5.35355902e-02,  1.85853971e-01,  1.29000702e-0

### Step 7 : Evaluating the model

In [39]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_absolute_percentage_error,r2_score
def evaluate(model,x,y):
    # predict the test data
    ypred = model.predict(x)
    mse = mean_squared_error(y,ypred)
    rmse = mse ** (1/2)
    mae = mean_absolute_error(y,ypred)
    mape = mean_absolute_percentage_error(y,ypred)
    r2 = r2_score(y,ypred)
    print(f"MSE : {mse:.2f}")
    print(f"RMSE : {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"MAPE : {mape:.4f}")
    print(f"R2 Score: {r2:.4f}")

In [40]:
evaluate(model,xtrain,ytrain)

MSE : 0.00
RMSE : 0.00
MAE : 0.00
MAPE : 0.0000
R2 Score: 1.0000


In [41]:
evaluate(model,xtest,ytest)

MSE : 0.17
RMSE : 0.41
MAE : 0.33
MAPE : 0.0186
R2 Score: 0.9982


### above result of R2 score for both the model is above >0.8 so the model is good and can be used for out of sample data

### Step 8 : Out of sample data

In [46]:
xnew = pd.read_csv("sample2.csv",na_values=["","NA"],keep_default_na=False)
xnew

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,Cylinders,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make,Weight
0,Audi,100,Midsize,30.8,44.6,19,26,,Front,6,...,6,190,106,65,37,31.0,17.0,non-USA,Audi 100,3259.250718
1,Pontiac,Sunbird,Compact,9.4,12.8,23,31,,Front,4,...,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird,2575.0
2,Chevrolet,Lumina,Midsize,13.4,18.4,21,29,,Front,4,...,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina,3195.0
3,Mazda,RX-7,Sporty,32.5,32.5,17,25,Driver only,Rear,rotary,...,2,169,96,69,37,,,non-USA,Mazda RX-7,2895.0
4,Volkswagen,Fox,Small,8.7,9.5,25,33,,Front,4,...,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox,2240.0


In [47]:
pre

In [48]:
xnew_pre = pre.transform(xnew)
xnew_pre

Unnamed: 0,num__Min.Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,num__Passengers,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
0,1.571949,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,-0.510323,0.884457,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.888138,-0.829362,0.1135,0.360925,-0.647181,-0.649388,-0.135877,0.673908,-0.449005,-0.083243,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.428309,-0.318925,-0.244313,-0.016221,-0.453339,-0.649388,-0.135877,0.532158,-0.050439,0.884457,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.767376,0.966282,-0.959938,-0.770514,-1.325626,2.134145,2.054464,-0.014589,1.022624,-2.986345,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.968608,-1.130155,0.471312,0.738071,-0.841022,-1.206095,0.369586,0.441034,-1.307455,-1.050944,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [50]:
y_pred = model.predict(xnew_pre)
y_pred.round(2)

array([[38.22],
       [11.1 ],
       [15.9 ],
       [32.5 ],
       [ 9.1 ]])