In [5]:
import pandas as pd
import mlflow
from pycaret.regression import *
import warnings
warnings.filterwarnings('ignore')

In [6]:
df = pd.read_csv('../data/processed/clean_used_car_prices.csv')

In [7]:
exp = setup(
    data=df,
    target='Price',
    normalize=True,
    rare_to_value=0.01,
    bin_numeric_features=['Mileage','Engine','Car_Age'],
    session_id=42
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Price
2,Target type,Regression
3,Original data shape,"(5146, 15)"
4,Transformed data shape,"(5146, 45)"
5,Transformed train set shape,"(3602, 45)"
6,Transformed test set shape,"(1544, 45)"
7,Numeric features,8
8,Categorical features,6
9,Rows with missing values,0.0%


In [8]:
top_models = compare_models(n_select=5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,77425.6676,15231909569.4629,122833.3946,0.9042,0.1852,0.1387,0.456
lightgbm,Light Gradient Boosting Machine,82900.5478,17371012206.513,131187.7732,0.891,0.1983,0.148,0.134
xgboost,Extreme Gradient Boosting,83650.3328,18074670796.8,134028.2625,0.8864,0.2031,0.1492,0.088
rf,Random Forest Regressor,86443.6589,18645772789.712,135990.4258,0.883,0.2069,0.159,0.189
et,Extra Trees Regressor,87057.9049,19347886125.4951,138750.7305,0.8782,0.2058,0.1574,0.185
gbr,Gradient Boosting Regressor,95611.8263,20474971090.4305,142459.1806,0.8716,0.2293,0.1759,0.087
dt,Decision Tree Regressor,112588.9421,30641513782.339,174592.1711,0.8076,0.2678,0.2025,0.039
llar,Lasso Least Angle Regression,129426.8556,33039638310.2624,181449.4332,0.7924,0.4727,0.3043,0.038
lasso,Lasso Regression,129428.4525,33052013624.3147,181479.6735,0.7923,0.4721,0.3043,0.24
ridge,Ridge Regression,129429.251,33068675972.6575,181520.964,0.7922,0.471,0.3043,0.035


In [9]:
mlflow.set_tracking_uri("http://localhost:5000")  # or your MLflow URI
mlflow.set_experiment("Used_Car_Price_Prediction")

for model in top_models:
    model_name = str(model).split("(")[0]  # get class name
    eval_result = predict_model(model)
    metrics = pull()  # metrics from PyCaret table
    
    with mlflow.start_run(run_name=model_name):
        # Log parameters
        mlflow.log_param("target", "Price")
        mlflow.log_param("normalize", True)
        mlflow.log_param("rare_to_value", 0.01)
        mlflow.log_param("bin_numeric_features", "Mileage,Engine,Car_Age")
        mlflow.log_param("session_id", 42)
        
        # Log metrics
        mlflow.log_metric("R2", metrics["R2"])
        mlflow.log_metric("MAE", metrics["MAE"])
        mlflow.log_metric("RMSE", metrics["RMSE"])
        
        # Log model
        mlflow.sklearn.log_model(model, "model")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,72750.6669,12541422613.0043,111988.4932,0.921,0.1795,0.1325




🏃 View run <catboost.core.CatBoostRegressor object at 0x0000019FA89C9270> at: http://localhost:5000/#/experiments/874352189977763051/runs/ba2181b1e72641b1bb43a86599a5b40b
🧪 View experiment at: http://localhost:5000/#/experiments/874352189977763051


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,80065.5161,15454322875.7901,124315.4169,0.9026,0.1985,0.1462




🏃 View run LGBMRegressor at: http://localhost:5000/#/experiments/874352189977763051/runs/0553409991d54f42b4ba4f4b97847130
🧪 View experiment at: http://localhost:5000/#/experiments/874352189977763051


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,78760.6328,15379539968.0,124014.2734,0.9031,0.2005,0.1438




🏃 View run XGBRegressor at: http://localhost:5000/#/experiments/874352189977763051/runs/decac54bcfbb45b9ae8df2d8a7daa881
🧪 View experiment at: http://localhost:5000/#/experiments/874352189977763051


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,81164.032,15859577029.2961,125934.8126,0.9,0.1971,0.1508




🏃 View run RandomForestRegressor at: http://localhost:5000/#/experiments/874352189977763051/runs/8398be36ffa74e4c83373592581cef10
🧪 View experiment at: http://localhost:5000/#/experiments/874352189977763051


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,80378.3128,15120819841.2187,122966.7428,0.9047,0.1987,0.1508




🏃 View run ExtraTreesRegressor at: http://localhost:5000/#/experiments/874352189977763051/runs/ae3a4fb7f98344eab6b22edcf24d7ea2
🧪 View experiment at: http://localhost:5000/#/experiments/874352189977763051


In [10]:
best_model = top_models[0]
tuned_model = tune_model(best_model, optimize='R2', fold=5)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,98587.0993,23681842055.9586,153889.0576,0.8389,0.3859,0.2017
1,88269.6326,15278557334.6968,123606.4615,0.9059,0.3203,0.1787
2,106157.9925,26330700683.4428,162267.3741,0.8274,0.4178,0.1985
3,98658.1889,20770228318.9095,144118.7993,0.8693,0.3472,0.1894
4,105346.5043,23290115331.9701,152610.9935,0.8717,0.3145,0.1911
Mean,99403.8835,21870288744.9956,147298.5372,0.8626,0.3571,0.1919
Std,6420.7082,3737752359.9817,13169.2704,0.0276,0.0394,0.008


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [11]:
final_model = finalize_model(tuned_model)

In [12]:
evaluate_model(tuned_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [13]:
predictions = predict_model(tuned_model, data=df)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,57112.2286,7035977623.1329,83880.7345,0.9561,0.1452,0.1084


In [14]:
final_metrics = pull()

input_example = df.head(1)
from mlflow.models.signature import infer_signature
signature = infer_signature(input_example, final_model.predict(input_example))

with mlflow.start_run(run_name="Final_Best_Model"):
    # Parameters
    mlflow.log_param("model", str(best_model).split("(")[0])
    mlflow.log_param("tuned", True)
    mlflow.log_param("optimize_metric", "R2")

    # Metrics
    mlflow.log_metric("R2", final_metrics["R2"])
    mlflow.log_metric("MAE", final_metrics["MAE"])
    mlflow.log_metric("RMSE", final_metrics["RMSE"])

    # Model
    mlflow.sklearn.log_model(
        sk_model=final_model,
        artifact_path="model",
        registered_model_name="UsedCarPriceModel",
        input_example=input_example,
        signature=signature
    )



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'UsedCarPriceModel' already exists. Creating a new version of this model...
2025/08/29 13:47:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: UsedCarPriceModel, version 7


🏃 View run Final_Best_Model at: http://localhost:5000/#/experiments/874352189977763051/runs/21aa03de6e2a4d8f88cb8c414427396b
🧪 View experiment at: http://localhost:5000/#/experiments/874352189977763051


Created version '7' of model 'UsedCarPriceModel'.


In [15]:
save_model(final_model, '../models/catboost_used_car_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['Year', 'Kilometers_Driven',
                                              'Mileage', 'Engine', 'Power',
                                              'Seats', 'Car_Age', 'km/year'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['Location', 'Fuel_Type',
                                              'Transmission', 'Owner_Type',
                                              'Brand', 'Model'],
                                     transformer=SimpleImpute...
                 ('bin_numeric_features',
                  TransformerWrapper(include=['Mileage', 'Engine', 'Car_Age'],
                                     transformer=KBinsDiscretizer(encode='ordinal',
                                                                  strategy='kmeans'))),
             