In [2]:
# %%
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


In [20]:

# Load dataset
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# (the bare open(...) call previously left it open).
with open('Cleaned_Cars.pkl', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

# Predictors and target
X = dataset[['name', 'company', 'year', 'kms_driven', 'fuel_type']]
y = dataset['Price']  # raw price is the target; no log-transform is applied

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Preprocessing: one-hot encode the three string-valued columns and pass every
# other column through unchanged. handle_unknown='ignore' lets transform()
# accept categories that were not present when the encoder was fitted.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
column_trans = make_column_transformer(
    (categorical_encoder, ['name', 'company', 'fuel_type']),
    remainder='passthrough',
)

# Model: gradient-boosted trees via XGBoost.
# XGBRegressor is imported with the other dependencies in the notebook's
# import cell rather than in the middle of this cell.
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=6,
    random_state=42,  # fixed seed for reproducible training
)




In [21]:
# Chain the preprocessing step and the regressor into a single estimator
pipe = make_pipeline(column_trans, model)

# Fit on the training split; the fitted pipeline is the cell's displayed value
pipe.fit(X=x_train, y=y_train)







The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [26]:
# Predictions of the fitted pipeline on the held-out split
y_pred = pipe.predict(X=x_test)
y_pred

array([  97725.76 ,  306346.78 ,  299474.4  ,  314066.12 ,  424834.56 ,
        446373.97 ,  106874.914,  304837.66 ,  520046.97 ,  184660.1  ,
        369308.06 ,  112136.23 ,  611110.   ,  117516.32 , 1830381.9  ,
        354574.12 ,  615004.44 ,  310983.38 ,  452069.7  ,  297487.   ,
        371967.2  ,   84458.65 ,  497073.75 ,  640275.1  ,  276576.28 ,
        361202.6  ,  493664.62 ,  606780.6  ,  297449.06 ,  566846.44 ,
        590311.5  ,  409424.53 ,  110127.164,   90053.266,  525870.2  ,
        188721.38 ,  409431.12 ,  934380.4  ,  424646.44 ,  775867.06 ,
        167362.55 ,  457239.16 ,   70377.375,  368470.9  ,  360771.75 ,
        278493.34 ,  506042.97 ,   89883.04 ,  127874.375,  346399.5  ,
        254240.78 , 1113898.2  ,  427087.4  ,  424796.62 , 1544752.   ,
        597648.7  ,  926084.44 ,  192744.05 ,  391593.03 ,  255014.61 ,
        297487.   ,  501085.56 ,  156325.61 ,   68657.34 ,  448157.16 ,
        676709.9  ,  192726.34 ,  370731.03 ,  576048.56 ,  2161

In [23]:
# Evaluate the held-out predictions with each metric and print one line per metric
for label, metric_fn in (
    ("R2 Score:", r2_score),
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
):
    print(label, metric_fn(y_test, y_pred))

R2 Score: 0.5115096569061279
MAE: 133848.21875
MSE: 99476668416.0


In [25]:
# Save model
# Serialize the fitted pipeline (preprocessing + regressor) to disk.
# The context manager guarantees the file is flushed and closed once the dump
# completes (the bare open(...) call previously left the handle open).
with open('Model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)