In [None]:
import pandas as pd
import numpy as np
import joblib

# Read the preprocessed train / test splits from disk
train_df = pd.read_csv("./dataset/processed-data/final_train.csv")
test_df = pd.read_csv("./dataset/processed-data/final_test.csv")

# Name of the label column in the training frame
target_col = "LOG_RESALE_PRICE"

# Split the training frame into label and features; the test frame is
# copied as-is so later edits to X_test never touch test_df.
y_train = train_df[target_col]
X_train = train_df.drop(target_col, axis=1)
X_test = test_df.copy()

In [12]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

def evaluate_model(model, X, y_log, model_name="Model",
                   n_splits=5, shuffle=True, random_state=42,
                   log_type="log1p", show_per_fold=True):
    """Cross-validate ``model`` on a log-transformed target and print metrics.

    For every K-fold split the model is fit on the training part and scored
    on the held-out part with three metrics:

    * RMSE in log space,
    * RMSE after inverting the log transform,
    * MAPE (in percent) after inverting the log transform.

    The mean and standard deviation of each metric across folds are printed.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        NOTE: the object is refit *in place* on every fold. When this
        function returns, ``model`` holds the fit from the last fold, i.e.
        it was trained on a subset of ``X``. Refit on the full data
        afterwards if the model is going to be saved or used for inference.
    X : object supporting positional ``.iloc`` row selection
        Feature table.
    y_log : object supporting positional ``.iloc`` selection
        Target values, already log-transformed, row-aligned with ``X``.
    model_name : str
        Label used in the summary lines.
    n_splits, shuffle, random_state
        Forwarded to ``KFold``. ``random_state`` is only forwarded when
        ``shuffle`` is true, because ``KFold`` rejects a seed when it is
        not shuffling.
    log_type : {"log1p", "log"}
        Which transform produced ``y_log``; selects ``np.expm1`` or
        ``np.exp`` as the inverse.
    show_per_fold : bool
        Also print one metrics line per fold.

    Returns
    -------
    None
        Results are printed only.

    Raises
    ------
    ValueError
        If ``log_type`` is not ``"log1p"`` or ``"log"``. Raised before any
        model is fit.
    """
    # Validate up front so a typo fails immediately instead of after the
    # first (possibly expensive) fold has already been trained.
    if log_type == "log1p":
        inverse_log = np.expm1
    elif log_type == "log":
        inverse_log = np.exp
    else:
        raise ValueError("log_type must be 'log1p' or 'log'")

    # KFold raises ValueError when given a seed together with shuffle=False,
    # so the seed is dropped in that case.
    kf = KFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )
    log_rmse_list, real_rmse_list, mape_list = [], [], []

    # Lower bound applied to the back-transformed values before scoring;
    # keeps the MAPE denominator strictly positive.
    eps = 1e-9

    for fold, (tr_idx, va_idx) in enumerate(kf.split(X), 1):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr_log, y_va_log = y_log.iloc[tr_idx], y_log.iloc[va_idx]

        # Refit the caller's model on this fold's training rows (in place).
        model.fit(X_tr, y_tr_log)
        preds_log = model.predict(X_va)

        rmse_log = mean_squared_error(y_va_log, preds_log) ** 0.5

        # Back to the original scale for the interpretable metrics.
        y_va_raw_safe = np.clip(inverse_log(y_va_log), eps, None)
        preds_raw_safe = np.clip(inverse_log(preds_log), eps, None)

        rmse_real = mean_squared_error(y_va_raw_safe, preds_raw_safe) ** 0.5
        mape = mean_absolute_percentage_error(y_va_raw_safe, preds_raw_safe) * 100

        log_rmse_list.append(rmse_log)
        real_rmse_list.append(rmse_real)
        mape_list.append(mape)

        if show_per_fold:
            print(f"Fold {fold}: logRMSE={rmse_log:.5f}, realRMSE={rmse_real:.2f}, MAPE={mape:.2f}%")

    print(f"[{model_name}] logRMSE={np.mean(log_rmse_list):.5f} ± {np.std(log_rmse_list):.5f}")
    print(f"[{model_name}] realRMSE={np.mean(real_rmse_list):.2f} ± {np.std(real_rmse_list):.2f}")
    print(f"[{model_name}] MAPE={np.mean(mape_list):.2f}% ± {np.std(mape_list):.2f}%")


# RandomForest

In [13]:
import os

from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
    n_estimators=800,
    max_depth=20,
    min_samples_leaf=4,
    max_features=0.5,
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)

# Cross-validate first: evaluate_model refits the estimator it receives on
# every fold, so running it after the final fit would leave rf_model trained
# on only the last fold's training subset.
evaluate_model(rf_model, X_train, y_train, "Random Forest")

# Final fit on the full training set, so the in-memory model and the pickled
# one are the same fully-trained estimator.
rf_model.fit(X_train, y_train)

# joblib.dump does not create missing directories.
os.makedirs("./best_models", exist_ok=True)
# Trailing ';' keeps the notebook from echoing the returned list of file names.
joblib.dump(rf_model, "./best_models/best_rf.pkl");

Fold 1: logRMSE=0.05513, realRMSE=30307.18, MAPE=4.09%
Fold 2: logRMSE=0.05528, realRMSE=30510.66, MAPE=4.12%
Fold 3: logRMSE=0.05526, realRMSE=30360.48, MAPE=4.12%
Fold 4: logRMSE=0.05448, realRMSE=30115.87, MAPE=4.05%
Fold 5: logRMSE=0.05502, realRMSE=30689.36, MAPE=4.12%
[Random Forest] logRMSE=0.05503 ± 0.00029
[Random Forest] realRMSE=30396.71 ± 193.30
[Random Forest] MAPE=4.10% ± 0.03%


# CatBoost

In [14]:
import os

from catboost import CatBoostRegressor

# NOTE(review): fit() is never given an eval_set in this section and the
# training logs show all 3000 iterations running, so early_stopping_rounds
# presumably never triggers -- confirm, and pass a validation set if early
# stopping is actually wanted.
cat_model = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=3.0,
    bagging_temperature=0.8,
    random_strength=1.0,
    border_count=128,
    subsample=0.8,
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=42,
    verbose=500,
    task_type="CPU",
    early_stopping_rounds=100
)

# Cross-validate first; evaluate_model refits the estimator on every fold.
evaluate_model(cat_model, X_train, y_train, "CatBoost")

# Train on the full training set before saving. Previously the model was
# dumped before any fit, so the pickle contained an untrained estimator.
cat_model.fit(X_train, y_train)

# joblib.dump does not create missing directories.
os.makedirs("./best_models", exist_ok=True)
# Trailing ';' keeps the notebook from echoing the returned list of file names.
joblib.dump(cat_model, "./best_models/best_cat.pkl");

0:	learn: 0.3367216	total: 65.5ms	remaining: 3m 16s
500:	learn: 0.0603675	total: 6.51s	remaining: 32.5s
1000:	learn: 0.0524124	total: 12.2s	remaining: 24.5s
1500:	learn: 0.0491851	total: 17.6s	remaining: 17.6s
2000:	learn: 0.0471734	total: 22.9s	remaining: 11.4s
2500:	learn: 0.0457231	total: 28.3s	remaining: 5.64s
2999:	learn: 0.0445667	total: 33.5s	remaining: 0us
Fold 1: logRMSE=0.04868, realRMSE=25620.09, MAPE=3.63%
0:	learn: 0.3362725	total: 9.99ms	remaining: 30s
500:	learn: 0.0603454	total: 6.34s	remaining: 31.6s
1000:	learn: 0.0523582	total: 11.6s	remaining: 23.2s
1500:	learn: 0.0491331	total: 16.5s	remaining: 16.4s
2000:	learn: 0.0471709	total: 21.4s	remaining: 10.7s
2500:	learn: 0.0457252	total: 26.9s	remaining: 5.36s
2999:	learn: 0.0445554	total: 36.7s	remaining: 0us
Fold 2: logRMSE=0.04890, realRMSE=25956.89, MAPE=3.66%
0:	learn: 0.3366688	total: 18.3ms	remaining: 54.9s
500:	learn: 0.0603926	total: 6.73s	remaining: 33.6s
1000:	learn: 0.0523791	total: 13.6s	remaining: 27.1s
150