In [27]:
import numpy as np
import pandas as pd
import re

import warnings 
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any deprecation/convergence warnings raised by later cells - confirm that is intended.
warnings.filterwarnings('ignore')

In [28]:
# Load the car-listing dataset (path is relative to the working directory) and preview the first rows.
data = pd.read_csv('car_price_prediction_updated.csv')
data.head()

Unnamed: 0,Price,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Wheel,Color,Airbags,Age
0,13328,1399.0,LEXUS,RX 450,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,Left wheel,Silver,12,15
1,16621,1018.0,CHEVROLET,Equinox,Jeep,No,Petrol,3.0,192000,6.0,Tiptronic,4x4,Left wheel,Black,8,14
2,8467,906.838128,HONDA,FIT,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,Right-hand drive,Black,2,19
3,3607,862.0,FORD,Escape,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,Left wheel,White,0,14
4,11726,446.0,HONDA,FIT,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,Left wheel,Silver,4,11


In [29]:
# List the distinct 'Wheel' labels; the encoder for this column needs them spelled exactly.
data['Wheel'].unique()

array(['Left wheel', 'Right-hand drive'], dtype=object)

In [30]:
# Show column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             19237 non-null  int64  
 1   Levy              19237 non-null  float64
 2   Manufacturer      19237 non-null  object 
 3   Model             19237 non-null  object 
 4   Category          19237 non-null  object 
 5   Leather interior  19237 non-null  object 
 6   Fuel type         19237 non-null  object 
 7   Engine volume     19237 non-null  object 
 8   Mileage           19237 non-null  int64  
 9   Cylinders         19237 non-null  float64
 10  Gear box type     19237 non-null  object 
 11  Drive wheels      19237 non-null  object 
 12  Wheel             19237 non-null  object 
 13  Color             19237 non-null  object 
 14  Airbags           19237 non-null  int64  
 15  Age               19237 non-null  int64  
dtypes: float64(2), int64(4), object(10)
memo

In [31]:
def clean_model_name(text):
    """Normalise a car model label.

    The value is upper-cased, every character other than A-Z, 0-9 or a space
    is turned into a space, and runs of whitespace are collapsed to a single
    space. Missing values (anything ``pd.isna`` reports as NA) are returned
    unchanged.
    """
    if pd.isna(text):
        return text

    upper_label = str(text).upper().strip()
    # Punctuation becomes a separator rather than being deleted outright.
    spaced_label = re.sub(r'[^A-Z0-9 ]', ' ', upper_label)
    tokens = spaced_label.split()
    return " ".join(tokens)

# Normalise every model label so case/punctuation variants collapse to one spelling.
data['Model'] = data['Model'].apply(clean_model_name)

In [32]:
# Upper-case and trim manufacturer names so the same brand is not split across spellings.
data['Manufacturer'] = data['Manufacturer'].str.upper().str.strip()

def clean_engine_volume(value):
    """Parse an engine-volume entry into ``(volume, is_turbo)``.

    ``volume`` is the first number found in the text as a float (0.0 when the
    text contains no number); ``is_turbo`` is 1 when the text contains
    "turbo" (case-insensitive) and 0 otherwise.
    """
    text = str(value).lower()

    first_number = re.search(r"[-+]?\d*\.\d+|\d+", text)
    if first_number:
        volume = float(first_number.group(0))
    else:
        volume = 0.0

    is_turbo = int('turbo' in text)
    return volume, is_turbo

# Split 'Engine volume' into a numeric displacement and a turbo flag.
# The parsed (volume, is_turbo) tuples are expanded with a single DataFrame
# construction instead of building one pd.Series per row: this is much faster
# and keeps Is_Turbo as an integer 0/1 column instead of coercing it to float.
engine_features = pd.DataFrame(
    data['Engine volume'].map(clean_engine_volume).tolist(),
    index=data.index,
    columns=['Engine_Volume_Num', 'Is_Turbo'],
)
data[['Engine_Volume_Num', 'Is_Turbo']] = engine_features

# Keep only listings priced above 100 and renumber the rows.
data = data[data['Price'] > 100].reset_index(drop=True)

# Force Levy to numeric (unparseable entries become NaN) and fill gaps with the median.
data['Levy'] = pd.to_numeric(data['Levy'], errors='coerce')
data['Levy'] = data['Levy'].fillna(data['Levy'].median())

print("Cleaning Complete.")
print(data[['Manufacturer', 'Engine_Volume_Num', 'Is_Turbo', 'Price']].head())

Cleaning Complete.
  Manufacturer  Engine_Volume_Num  Is_Turbo  Price
0        LEXUS                3.5       0.0  13328
1    CHEVROLET                3.0       0.0  16621
2        HONDA                1.3       0.0   8467
3         FORD                2.5       0.0   3607
4        HONDA                1.3       0.0  11726


In [33]:
# Remove the raw 'Engine volume' text and the derived turbo flag from the frame.
data = data.drop(columns=['Is_Turbo', 'Engine volume'])

In [34]:
# Give the parsed engine volume its final, human-readable column name.
data = data.rename(columns={'Engine_Volume_Num': 'Engine Volume'})

In [35]:
# Target is the listing price; the features are every remaining column.
y = data['Price']
X = data.drop(columns='Price')

In [38]:
# Rank the int64/float64 columns by skewness, most right-skewed first.
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns

skewness = (
    data.loc[:, numeric_cols]
    .skew()
    .sort_values(ascending=False)
)
skewness

Price            135.145253
Mileage           41.140698
Levy               6.502211
Engine Volume      2.200250
Age                2.118242
Cylinders          2.080278
Airbags            0.075980
dtype: float64

In [39]:
# Display the cleaned frame (the notebook renders a truncated head/tail view).
data

Unnamed: 0,Price,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Mileage,Cylinders,Gear box type,Drive wheels,Wheel,Color,Airbags,Age,Engine Volume
0,13328,1399.000000,LEXUS,RX 450,Jeep,Yes,Hybrid,186005,6.0,Automatic,4x4,Left wheel,Silver,12,15,3.5
1,16621,1018.000000,CHEVROLET,EQUINOX,Jeep,No,Petrol,192000,6.0,Tiptronic,4x4,Left wheel,Black,8,14,3.0
2,8467,906.838128,HONDA,FIT,Hatchback,No,Petrol,200000,4.0,Variator,Front,Right-hand drive,Black,2,19,1.3
3,3607,862.000000,FORD,ESCAPE,Jeep,Yes,Hybrid,168966,4.0,Automatic,4x4,Left wheel,White,0,14,2.5
4,11726,446.000000,HONDA,FIT,Hatchback,Yes,Petrol,91901,4.0,Automatic,Front,Left wheel,Silver,4,11,1.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18851,8467,906.838128,MERCEDES-BENZ,CLK 200,Coupe,Yes,CNG,300000,4.0,Manual,Rear,Left wheel,Silver,5,26,2.0
18852,15681,831.000000,HYUNDAI,SONATA,Sedan,Yes,Petrol,161600,4.0,Tiptronic,Front,Left wheel,Red,8,14,2.4
18853,26108,836.000000,HYUNDAI,TUCSON,Jeep,Yes,Diesel,116365,4.0,Automatic,Front,Left wheel,Grey,4,15,2.0
18854,5331,1288.000000,CHEVROLET,CAPTIVA,Jeep,Yes,Diesel,51258,4.0,Automatic,Front,Left wheel,Black,4,18,2.0


In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

## Column groups: each list names the columns that receive one kind of preprocessing.

# Low-cardinality categoricals -> one-hot encoded.
one_hot = ['Category', 'Fuel type', 'Gear box type', 'Drive wheels', 'Color']

# Skewed numeric columns -> Yeo-Johnson power transform.
power_transform = ['Levy', 'Mileage', 'Cylinders', 'Age', 'Engine Volume']

# Roughly symmetric numeric column -> plain standardisation.
standard_transform = ['Airbags']

# Two-valued steering-side column -> encoded with an explicit category order.
ordinal_tranform = ['Wheel']

# Yes/No column -> encoded as 0/1.
binary_tranform = ['Leather interior']

# High-cardinality categoricals -> handled by a separate encoder.
high_cardinal = ['Manufacturer', 'Model']

## One preprocessing pipeline per column group.

# Median-impute, then scale to zero mean / unit variance.
standard_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Median-impute, then apply a Yeo-Johnson power transform.
power_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson'))
])

# Mode-impute, then one-hot encode; the first level of each feature is dropped
# and the encoder is configured to ignore categories unseen during fit.
one_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(
        drop='first',
        handle_unknown='ignore',
        sparse_output=False
    ))
])

# Mode-impute, then map 'No' -> 0 and 'Yes' -> 1 (order fixed by `categories`).
binary_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[['No', 'Yes']], dtype=int))
])

# Mode-impute, then map 'Left wheel' -> 0 and 'Right-hand drive' -> 1.
ordinal_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(
        categories=[['Left wheel', 'Right-hand drive']],
        dtype=int
    ))
])


In [41]:
## Manufacturer and Model have too many distinct values for label, one-hot or ordinal encoding, so a custom transformer (BaseEstimator + TransformerMixin) converts them to numeric frequencies instead.

from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Replace each category with its relative frequency in the fitted data.

    For every column seen during ``fit`` the share of rows taken by each
    distinct value is stored. ``transform`` substitutes each value with that
    share; values that were not seen during ``fit`` (including missing
    values) are encoded as 0.

    Attributes
    ----------
    freq_maps : dict
        Maps column label -> pandas Series of (value -> relative frequency).
        Empty until ``fit`` has been called.
    """

    def __init__(self):
        self.freq_maps = {}

    def fit(self, X, y=None):
        """Learn the relative frequency of every value in every column of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        frame = pd.DataFrame(X)
        # Rebuild the mapping from scratch so a refit on different columns
        # never keeps entries left over from an earlier fit.
        self.freq_maps = {
            col: frame[col].value_counts(normalize=True)
            for col in frame.columns
        }
        return self

    def transform(self, X):
        """Return a float array with every value replaced by its fitted frequency.

        Raises ``KeyError`` if ``X`` contains a column label that was not
        present during ``fit``.
        """
        frame = pd.DataFrame(X)
        # Build a new frame instead of assigning columns into `frame`:
        # pd.DataFrame(X) can share its data with the caller's object, so
        # column assignment could write through to the caller's input.
        encoded = pd.DataFrame(
            {
                col: frame[col].map(self.freq_maps[col]).fillna(0)
                for col in frame.columns
            },
            index=frame.index,
        )
        return encoded.to_numpy(dtype=float)


# Mode-impute the high-cardinality labels, then replace them with their frequencies.
high_card_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('freq', FrequencyEncoder()),
])

In [42]:
# Preprocessor for the linear/kernel models: every column group goes through
# its own sub-pipeline; columns not listed in any group are passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('standard', standard_pipe, standard_transform),
        ('power', power_pipe, power_transform),
        ('onehot', one_pipe, one_hot),
        ('binary', binary_pipe, binary_tranform),
        ('ordinal', ordinal_pipe, ordinal_tranform),
        ('high_card', high_card_pipe, high_cardinal),
    ],
    remainder='passthrough',
)


In [43]:
# Model log1p(Price) instead of the raw, heavily skewed price; predictions are mapped back with np.expm1.
y = np.log1p(data['Price'])

In [44]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge, RidgeCV, LassoCV
from sklearn.svm import SVR

In [45]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

In [46]:
# Baseline linear / kernel regressors, all with default hyper-parameters.
models = dict(
    LinearRegression=LinearRegression(),
    Lasso=Lasso(),
    ElasticNet=ElasticNet(),
    Ridge=Ridge(),
    SVR=SVR(),
    RidgeCV=RidgeCV(),
    LassoCV=LassoCV(),
)

# Fit each model on the log1p target and score it on the held-out rows.
# Metrics are computed after mapping both predictions and targets back to the
# price scale with expm1, so they are not on the same scale as an R2 computed
# directly on the log target.
for name, model in models.items():
    pipe = Pipeline(steps=[('preprocess', transformer), ('model', model)])
    pipe.fit(X_train, y_train)

    y_test_pred_log = pipe.predict(X_test)
    y_test_pred = np.expm1(y_test_pred_log)
    y_test_real = np.expm1(y_test)

    print(name)
    print("Test  MAE:", mean_absolute_error(y_test_real, y_test_pred))
    print("Test  R2 :", r2_score(y_test_real, y_test_pred))
    print("="*40)

LinearRegression
Test  MAE: 15148.677536031259
Test  R2 : -0.0005089359273589
Lasso
Test  MAE: 16943.626546901454
Test  R2 : -0.0013567130356824109
ElasticNet
Test  MAE: 16943.626546901454
Test  R2 : -0.0013567130356824109
Ridge
Test  MAE: 15148.749712492607
Test  R2 : -0.0005000455988801722
SVR
Test  MAE: 11037.82300851795
Test  R2 : 0.0004218552595683933
RidgeCV
Test  MAE: 15148.316486475771
Test  R2 : -0.00050761143473399
LassoCV
Test  MAE: 15144.797105605629
Test  R2 : -0.0004874066446056702


In [47]:
# Tree-based models get the numeric columns imputed only, with no scaling or power transform.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Same categorical handling as the linear preprocessor; all numeric columns share num_pipe.
ensemble_transformer = ColumnTransformer(
    transformers=[
        ('num', num_pipe, power_transform + standard_transform),
        ('onehot', one_pipe, one_hot),
        ('binary', binary_pipe, binary_tranform),
        ('ordinal', ordinal_pipe, ordinal_tranform),
        ('high_card', high_card_pipe, high_cardinal),
    ],
    remainder='passthrough',
)


In [48]:
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor,
    GradientBoostingRegressor, HistGradientBoostingRegressor
)

from xgboost import XGBRegressor

# Tree-based regressors to compare. Every estimator is seeded so repeated runs
# give the same scores (matching the random_state=42 used for the split and the
# tuning cells), and the two bagged forests use all CPU cores because they are
# the slowest to fit.
ensemble_models = {
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42, n_jobs=-1),
    'Ada Boost': AdaBoostRegressor(random_state=42),
    'Extra Trees': ExtraTreesRegressor(random_state=42, n_jobs=-1),
    'Gradient Boost': GradientBoostingRegressor(random_state=42),
    'Hist Gradient Boost': HistGradientBoostingRegressor(random_state=42),
    'XG Boost': XGBRegressor(random_state=42)
}

In [49]:
# Fit each tree-based model on the log1p target and score it on the held-out
# rows; metrics are reported on the price scale after undoing the log with expm1.
for name, model in ensemble_models.items():
    pipe = Pipeline(steps=[('preprocess', ensemble_transformer), ('model', model)])
    pipe.fit(X_train, y_train)

    y_test_pred_log = pipe.predict(X_test)
    y_test_pred = np.expm1(y_test_pred_log)
    y_test_real = np.expm1(y_test)

    print(name)
    print("Test  MAE:", mean_absolute_error(y_test_real, y_test_pred))
    print("Test  R2 :", r2_score(y_test_real, y_test_pred))
    print("="*40)

Decision Tree
Test  MAE: 10475.517875627747
Test  R2 : 0.00033615377917584066
Random Forest
Test  MAE: 9378.496788377444
Test  R2 : 0.0008300697223340148
Ada Boost
Test  MAE: 15644.171303148722
Test  R2 : -0.0014010430860802447


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform, uniform
# Randomised hyper-parameter search for an RBF-kernel SVR behind the linear-model preprocessor.
svr_pipe = Pipeline(steps=[
    ('preprocess', transformer),
    ('model', SVR(kernel='rbf')),
])

svr_param_dist = {
    'model__C': loguniform(1e-1, 1e3),
    'model__gamma': loguniform(1e-3, 1e-1),
    # scipy's uniform(loc, scale) samples from [loc, loc + scale], i.e. [0.01, 0.21] here.
    'model__epsilon': uniform(0.01, 0.2),
}

# 30 sampled configurations, each scored by 5-fold cross-validated R2 on the training split.
svr_random = RandomizedSearchCV(
    estimator=svr_pipe,
    param_distributions=svr_param_dist,
    n_iter=30,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
)

svr_random.fit(X_train, y_train)

In [None]:
# Randomised hyper-parameter search for a random forest behind the tree-model preprocessor.
rf_pipe = Pipeline(steps=[
    ('preprocess', ensemble_transformer),
    ('model', RandomForestRegressor(random_state=42, n_jobs=-1)),
])

rf_param_dist = {
    'model__n_estimators': [200, 400, 600, 800],
    'model__max_depth': [None, 10, 15, 20, 25],
    'model__min_samples_leaf': [1, 3, 5, 10],
    'model__max_features': ['sqrt', 0.5, 0.7],
}

# 25 sampled configurations, each scored by 5-fold cross-validated R2 on the training split.
rf_random = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=rf_param_dist,
    n_iter=25,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
)

rf_random.fit(X_train, y_train)

In [None]:
# Randomised hyper-parameter search for extra-trees behind the tree-model preprocessor.
et_pipe = Pipeline(steps=[
    ('preprocess', ensemble_transformer),
    ('model', ExtraTreesRegressor(random_state=42, n_jobs=-1)),
])

et_param_dist = {
    'model__n_estimators': [300, 500, 700, 900],
    'model__max_depth': [None, 15, 20, 25],
    'model__min_samples_leaf': [1, 3, 5],
    'model__max_features': ['sqrt', 0.5, 0.7],
}

# 25 sampled configurations, each scored by 5-fold cross-validated R2 on the training split.
et_random = RandomizedSearchCV(
    estimator=et_pipe,
    param_distributions=et_param_dist,
    n_iter=25,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
)

et_random.fit(X_train, y_train)

In [20]:
# Collect the three fitted searches so their results are reported in one uniform loop.
hyper_tune_models = {
    'SVR': svr_random,
    'RandomForest': rf_random,
    'ExtraTrees': et_random
}

# best_score_ is the best mean cross-validated R2 (scoring='r2'), computed on
# the log1p(Price) target the searches were fitted with, so it is not on the
# same scale as the expm1-based test metrics printed by the evaluation loops.
for name, rs in hyper_tune_models.items():
    print(name)
    print("Best CV R2:", rs.best_score_)
    print("Best Params:", rs.best_params_)
    print("="*40)

SVR
Best CV R2: 0.43959983808100445
Best Params: {'model__C': 15.375920235481747, 'model__epsilon': 0.04697089110510541, 'model__gamma': 0.08692991511139551}
RandomForest
Best CV R2: 0.6390123263595453
Best Params: {'model__n_estimators': 800, 'model__min_samples_leaf': 1, 'model__max_features': 0.5, 'model__max_depth': None}
ExtraTrees
Best CV R2: 0.6118763923426973
Best Params: {'model__n_estimators': 300, 'model__min_samples_leaf': 1, 'model__max_features': 0.7, 'model__max_depth': 25}


In [21]:
# Final candidate: random forest using the best parameters reported by the randomised search.
final_rf = Pipeline(steps=[
    ('preprocess', ensemble_transformer),
    ('model', RandomForestRegressor(
        n_estimators=800,
        max_depth=None,
        min_samples_leaf=1,
        max_features=0.5,
        random_state=42,
        n_jobs=-1,
    )),
])

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R2 over the full dataset, scored on the log1p target.
cv_scores = cross_val_score(
    estimator=final_rf,
    X=X,
    y=y,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)

print("CV R2 scores:", cv_scores)
print("Mean CV R2   :", np.mean(cv_scores))
print("Std CV R2    :", np.std(cv_scores))

CV R2 scores: [0.68129674 0.64111305 0.66941387 0.66869558 0.65193693]
Mean CV R2   : 0.6624912335234491
Std CV R2    : 0.01420200090883984


In [24]:
import joblib

# cross_val_score only fits clones of the estimator, so final_rf itself is
# still unfitted at this point. Fit it on the full dataset before exporting;
# otherwise the pickle would hold a pipeline that cannot predict.
# The saved model predicts log1p(Price); apply np.expm1 to get prices.
final_rf.fit(X, y)
joblib.dump(final_rf, "random_forest_car_price_model.pkl")

['random_forest_car_price_model.pkl']