In [1]:
import numpy as np
import pandas as pd
import re

import warnings 
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('car_price_prediction_updated.csv')
data.head()

Unnamed: 0,Price,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Wheel,Color,Airbags,Age
0,13328,1399.0,LEXUS,RX 450,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,Left wheel,Silver,12,15
1,16621,1018.0,CHEVROLET,Equinox,Jeep,No,Petrol,3.0,192000,6.0,Tiptronic,4x4,Left wheel,Black,8,14
2,8467,906.838128,HONDA,FIT,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,Right-hand drive,Black,2,19
3,3607,862.0,FORD,Escape,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,Left wheel,White,0,14
4,11726,446.0,HONDA,FIT,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,Left wheel,Silver,4,11


In [3]:
data['Wheel'].unique()

array(['Left wheel', 'Right-hand drive'], dtype=object)

In [4]:
data['Manufacturer'].value_counts()

Manufacturer
HYUNDAI          3769
TOYOTA           3662
MERCEDES-BENZ    2076
FORD             1111
CHEVROLET        1069
                 ... 
TESLA               1
PONTIAC             1
SATURN              1
ASTON MARTIN        1
GREATWALL           1
Name: count, Length: 65, dtype: int64

In [5]:
data['Color'].value_counts()

Color
Black            5033
White            4489
Silver           3792
Grey             2375
Blue             1396
Red               639
Green             322
Orange            253
Brown             187
Carnelian red     179
Golden            145
Beige             134
Sky blue          122
Yellow            106
Purple             39
Pink               26
Name: count, dtype: int64

In [6]:
data['Category'].value_counts()

Category
Sedan          8736
Jeep           5473
Hatchback      2847
Minivan         647
Coupe           532
Universal       364
Microbus        306
Goods wagon     233
Pickup           52
Cabriolet        36
Limousine        11
Name: count, dtype: int64

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             19237 non-null  int64  
 1   Levy              19237 non-null  float64
 2   Manufacturer      19237 non-null  object 
 3   Model             19237 non-null  object 
 4   Category          19237 non-null  object 
 5   Leather interior  19237 non-null  object 
 6   Fuel type         19237 non-null  object 
 7   Engine volume     19237 non-null  object 
 8   Mileage           19237 non-null  int64  
 9   Cylinders         19237 non-null  float64
 10  Gear box type     19237 non-null  object 
 11  Drive wheels      19237 non-null  object 
 12  Wheel             19237 non-null  object 
 13  Color             19237 non-null  object 
 14  Airbags           19237 non-null  int64  
 15  Age               19237 non-null  int64  
dtypes: float64(2), int64(4), object(10)
memo

In [9]:
# 1. Filter Manufacturer (Remove counts < 10)
mfr_counts = data['Manufacturer'].value_counts()
mfr_to_keep = mfr_counts[mfr_counts >= 10].index
data = data[data['Manufacturer'].isin(mfr_to_keep)]

# 2. Filter Color (Remove counts < 40)
color_counts = data['Color'].value_counts()
colors_to_keep = color_counts[color_counts >= 40].index
data = data[data['Color'].isin(colors_to_keep)]

# 3. Filter Category (Remove counts < 10)
cat_counts = data['Category'].value_counts()
cats_to_keep = cat_counts[cat_counts >= 10].index
data = data[data['Category'].isin(cats_to_keep)]

# Reset the index after removing rows
data.reset_index(drop=True, inplace=True)

In [10]:
def clean_model_name(text):
    if pd.isna(text):
        return text
    text = str(text).upper().strip()
    text = re.sub(r'[^A-Z0-9 ]', ' ', text)
    text = " ".join(text.split())
    return text

data['Model'] = data['Model'].apply(clean_model_name)

data['Model'] = data['Model'].astype(str).str.strip().str.upper()

# 2. Identify models with more than 2 occurrences
model_counts = data['Model'].value_counts()
models_to_keep = model_counts[model_counts > 2].index

# 3. Overwrite 'df' with the filtered data (save in the same variable)
data = data[data['Model'].isin(models_to_keep)].reset_index(drop=True)

In [11]:
data['Manufacturer'] = data['Manufacturer'].str.upper().str.strip()

def clean_engine_volume(value):
    value = str(value).lower()
    is_turbo = 1 if 'turbo' in value else 0
    
    numeric_part = re.findall(r"[-+]?\d*\.\d+|\d+", value)
    volume = float(numeric_part[0]) if numeric_part else 0.0
    return volume, is_turbo

data[['Engine_Volume_Num', 'Is_Turbo']] = data['Engine volume'].apply(
    lambda x: pd.Series(clean_engine_volume(x))
)


data = data[data['Price'] > 100].reset_index(drop=True)


data['Levy'] = pd.to_numeric(data['Levy'], errors='coerce')
data['Levy'] = data['Levy'].fillna(data['Levy'].median())

print("Cleaning Complete.")
print(data[['Manufacturer', 'Engine_Volume_Num', 'Is_Turbo', 'Price']].head())

Cleaning Complete.
  Manufacturer  Engine_Volume_Num  Is_Turbo  Price
0        LEXUS                3.5       0.0  13328
1    CHEVROLET                3.0       0.0  16621
2        HONDA                1.3       0.0   8467
3         FORD                2.5       0.0   3607
4        HONDA                1.3       0.0  11726


In [12]:
data = data.drop(columns=['Is_Turbo', 'Engine volume'], axis=1)

In [13]:
data.rename(columns={
    'Engine_Volume_Num': 'Engine Volume'
}, inplace=True)

In [14]:
X = data.drop(['Price'] , axis=1)
y = data['Price']

In [15]:
## checking numeric columns skewness
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns

skewness = data[numeric_cols].skew().sort_values(ascending=False)
skewness

Price            131.183421
Mileage           39.747699
Levy               4.790899
Engine Volume      2.304769
Cylinders          2.113759
Age                1.559201
Airbags            0.092468
dtype: float64

In [29]:
# Use the 99th percentile as a "ceiling" to remove extreme outliers
price_limit = data['Price'].quantile(0.99)    # ~87,000
mileage_limit = data['Mileage'].quantile(0.99) # ~575,000
levy_limit = data['Levy'].quantile(0.99)       # ~2,100

data = data[
    (data['Price'] > 500) &            # Remove placeholders like $1 or $100
    (data['Price'] <= price_limit) &   # Remove extreme luxury/error prices
    (data['Mileage'] <= mileage_limit) &
    (data['Levy'] <= levy_limit)
].reset_index(drop=True)

In [30]:
data

Unnamed: 0,Price,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Mileage,Cylinders,Gear box type,Drive wheels,Wheel,Color,Airbags,Age,Engine Volume
0,13328,1399.000000,LEXUS,RX 450,Jeep,Yes,Hybrid,186005,6.0,Automatic,4x4,Left wheel,Silver,12,15,3.5
1,16621,1018.000000,CHEVROLET,EQUINOX,Jeep,No,Petrol,192000,6.0,Tiptronic,4x4,Left wheel,Black,8,14,3.0
2,8467,906.838128,HONDA,FIT,Hatchback,No,Petrol,200000,4.0,Variator,Front,Right-hand drive,Black,2,19,1.3
3,3607,862.000000,FORD,ESCAPE,Jeep,Yes,Hybrid,168966,4.0,Automatic,4x4,Left wheel,White,0,14,2.5
4,11726,446.000000,HONDA,FIT,Hatchback,Yes,Petrol,91901,4.0,Automatic,Front,Left wheel,Silver,4,11,1.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15879,5802,1055.000000,MERCEDES-BENZ,E 350,Sedan,Yes,Diesel,107800,6.0,Automatic,Rear,Left wheel,Grey,12,12,3.5
15880,8467,906.838128,MERCEDES-BENZ,CLK 200,Coupe,Yes,CNG,300000,4.0,Manual,Rear,Left wheel,Silver,5,26,2.0
15881,15681,831.000000,HYUNDAI,SONATA,Sedan,Yes,Petrol,161600,4.0,Tiptronic,Front,Left wheel,Red,8,14,2.4
15882,26108,836.000000,HYUNDAI,TUCSON,Jeep,Yes,Diesel,116365,4.0,Automatic,Front,Left wheel,Grey,4,15,2.0


In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

## tranformation of each column accordling for model training

one_hot = ['Category', 'Fuel type', 'Gear box type', 'Drive wheels', 'Color']

power_transform = ['Levy', 'Mileage', 'Cylinders', 'Age', 'Engine Volume']

standard_transform = ['Airbags']

ordinal_tranform = ['Wheel']

binary_tranform = ['Leather interior']

high_cardinal = ['Manufacturer', 'Model']

## pipelines for each of the column as per need 

standard_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

power_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson'))
])

one_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(
        drop='first',
        handle_unknown='ignore',
        sparse_output=False
    ))
])

binary_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[['No', 'Yes']], dtype=int))
])

ordinal_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(
        categories=[['Left wheel', 'Right-hand drive']],
        dtype=int
    ))
])


In [32]:
## cannot be done labelencoding, onehot and ordinal for manufacture and model so using base estimator and transformer mixin for converting to numeric

from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.freq_maps = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        for col in X.columns:
            self.freq_maps[col] = X[col].value_counts(normalize=True)
        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        for col in X.columns:
            X[col] = X[col].map(self.freq_maps[col]).fillna(0)
        return X.values


high_card_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('freq', FrequencyEncoder())
])

In [33]:
transformer = ColumnTransformer([
    ('standard', standard_pipe, standard_transform),
    ('power', power_pipe, power_transform),
    ('onehot', one_pipe, one_hot),
    ('binary', binary_pipe, binary_tranform),
    ('ordinal', ordinal_pipe, ordinal_tranform),
    ('high_card', high_card_pipe, high_cardinal)
], remainder='passthrough')


In [34]:
y = np.log1p(data['Price'])

In [35]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge, RidgeCV, LassoCV
from sklearn.svm import SVR

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(
    X,y, test_size=0.3,
    random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [17602, 15884]

In [37]:
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "Ridge": Ridge(),
    "SVR": SVR(),
    "RidgeCV": RidgeCV(),
    "LassoCV": LassoCV()
}

for name, model in models.items():

    pipe = Pipeline([
        ('preprocess', transformer),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)
    y_test_pred_log  = pipe.predict(X_test)

    y_test_pred  = np.expm1(y_test_pred_log)

    y_test_real  = np.expm1(y_test)

    print(name)
    print("Test  MAE:", mean_absolute_error(y_test_real, y_test_pred))
    print("Test  R2 :", r2_score(y_test_real, y_test_pred))
    print("="*40)

LinearRegression
Test  MAE: 10373.729959036513
Test  R2 : 0.022239241228328654
Lasso
Test  MAE: 12168.141537607822
Test  R2 : -0.23243195354582036
ElasticNet
Test  MAE: 12168.141537607822
Test  R2 : -0.23243195354582036
Ridge
Test  MAE: 10364.469040018352
Test  R2 : 0.027060404339395716
SVR
Test  MAE: 5957.861891526152
Test  R2 : 0.5722655959155871
RidgeCV
Test  MAE: 10372.059539174996
Test  R2 : 0.02306619891137418
LassoCV
Test  MAE: 10349.802673558435
Test  R2 : 0.031276650815446394


In [38]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))
])

ensemble_transformer = ColumnTransformer([
    ('num', num_pipe, power_transform + standard_transform),
    ('onehot', one_pipe, one_hot),
    ('binary', binary_pipe, binary_tranform),
    ('ordinal', ordinal_pipe, ordinal_tranform),
    ('high_card', high_card_pipe, high_cardinal)
], remainder='passthrough')


In [39]:
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor,
    GradientBoostingRegressor, HistGradientBoostingRegressor
)

from xgboost import XGBRegressor

ensemble_models = {
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Ada Boost': AdaBoostRegressor(),
    'Extra Trees': ExtraTreesRegressor(),
    'Gradient Boost': GradientBoostingRegressor(),
    'Hist Gradient Boost': HistGradientBoostingRegressor(),
    'XG Boost': XGBRegressor()
}

In [40]:
for name, model in ensemble_models.items():

    pipe = Pipeline([
        ('preprocess', ensemble_transformer),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)
    y_test_pred_log  = pipe.predict(X_test)

    y_test_pred  = np.expm1(y_test_pred_log)

    y_test_real  = np.expm1(y_test)

    print(name)
    print("Test  MAE:", mean_absolute_error(y_test_real, y_test_pred))
    print("Test  R2 :", r2_score(y_test_real, y_test_pred))
    print("="*40)

Decision Tree
Test  MAE: 5092.093788240761
Test  R2 : 0.5562275141315467
Random Forest
Test  MAE: 4236.668169807295
Test  R2 : 0.689828904529564
Ada Boost
Test  MAE: 14728.008011320122
Test  R2 : -0.5921699618223126
Extra Trees
Test  MAE: 4359.611329233347
Test  R2 : -1.0373461159741528
Gradient Boost
Test  MAE: 6695.6271501024175
Test  R2 : 0.4666439784752139
Hist Gradient Boost
Test  MAE: 5427.086687452937
Test  R2 : 0.6269555575825394
XG Boost
Test  MAE: 5452.0416275079315
Test  R2 : 0.31041711982273335


In [22]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform, uniform
svr_pipe = Pipeline([
    ('preprocess', transformer),
    ('model', SVR(kernel='rbf'))
])

svr_param_dist = {
    'model__C': loguniform(1e-1, 1e3),
    'model__gamma': loguniform(1e-3, 1e-1),
    'model__epsilon': uniform(0.01, 0.2)
}
svr_random = RandomizedSearchCV(
    svr_pipe,
    svr_param_dist,
    n_iter=30,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42
)

svr_random.fit(X_train, y_train)

KeyboardInterrupt: 

In [43]:
rf_pipe = Pipeline([
    ('preprocess', ensemble_transformer),
    ('model', RandomForestRegressor(
        random_state=42,
        n_jobs=-1
    ))
])
from sklearn.model_selection import RandomizedSearchCV

rf_param_dist = {
    'model__n_estimators': [200, 400, 600, 800],
    'model__max_depth': [None, 10, 15, 20, 25],
    'model__min_samples_leaf': [1, 3, 5, 10],
    'model__max_features': ['sqrt', 0.5, 0.7]
}

rf_random = RandomizedSearchCV(
    rf_pipe,
    rf_param_dist,
    n_iter=25,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42
)

rf_random.fit(X_train, y_train)

In [42]:
et_pipe = Pipeline([
    ('preprocess', ensemble_transformer),
    ('model', ExtraTreesRegressor(
        random_state=42,
        n_jobs=-1
    ))
])

et_param_dist = {
    'model__n_estimators': [300, 500, 700, 900],
    'model__max_depth': [None, 15, 20, 25],
    'model__min_samples_leaf': [1, 3, 5],
    'model__max_features': ['sqrt', 0.5, 0.7]
}

et_random = RandomizedSearchCV(
    et_pipe,
    et_param_dist,
    n_iter=25,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42
)

et_random.fit(X_train, y_train)

NameError: name 'RandomizedSearchCV' is not defined

In [None]:
hyper_tune_models = {
    'SVR': svr_random,
    'RandomForest': rf_random,
    'ExtraTrees': et_random
}

for name, rs in hyper_tune_models.items():
    print(name)
    print("Best CV R2:", rs.best_score_)
    print("Best Params:", rs.best_params_)
    print("="*40)

SVR
Best CV R2: 0.43959983808100445
Best Params: {'model__C': 15.375920235481747, 'model__epsilon': 0.04697089110510541, 'model__gamma': 0.08692991511139551}
RandomForest
Best CV R2: 0.6390123263595453
Best Params: {'model__n_estimators': 800, 'model__min_samples_leaf': 1, 'model__max_features': 0.5, 'model__max_depth': None}
ExtraTrees
Best CV R2: 0.6118763923426973
Best Params: {'model__n_estimators': 300, 'model__min_samples_leaf': 1, 'model__max_features': 0.7, 'model__max_depth': 25}


In [44]:
final_rf = Pipeline([
    ('preprocess', ensemble_transformer),
    ('model', RandomForestRegressor(
        n_estimators=800,
        max_depth=None,
        min_samples_leaf=1,
        max_features=0.5,
        random_state=42,
        n_jobs=-1
    ))
])

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    final_rf,
    X,
    y,
    cv=5,
    scoring='r2',
    n_jobs=-1
)

print("CV R2 scores:", cv_scores)
print("Mean CV R2   :", np.mean(cv_scores))
print("Std CV R2    :", np.std(cv_scores))

ValueError: Found input variables with inconsistent numbers of samples: [17602, 15884]

In [None]:
import joblib
joblib.dump(final_rf, "random_forest_car_price_model.pkl")

['random_forest_car_price_model.pkl']