In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import clone

# Read the preprocessed training and test tables from disk
train_df = pd.read_csv("./dataset/processed-data/final_train.csv")
test_df = pd.read_csv("./dataset/processed-data/final_test.csv")

# Column holding the regression target
target_col = "LOG_RESALE_PRICE"

# Target vector first, then the feature table with the target removed;
# the test features are an independent copy of the test table.
y_train = train_df[target_col]
X_train = train_df.drop(columns=[target_col])
X_test = test_df.copy()

In [32]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

def evaluate_model(model, X, y_log, model_name="Model", 
                   n_splits=5, shuffle=True, random_state=42, 
                   log_type="log1p", show_per_fold=True):
    """Run K-fold cross-validation on a log-scale target and print the scores.

    For every fold the model is fitted on the training part, used to predict
    the validation part, and scored with RMSE on the log scale plus RMSE and
    MAPE (in percent) after mapping targets and predictions back to the
    original scale. The mean and standard deviation over folds are printed
    at the end.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Fitted in place. When this function returns, ``model`` holds the fit
        from the LAST fold only, not a fit on all of ``X``.
    X : feature table
        Objects exposing ``.iloc`` are sliced by position with it; anything
        else is converted with ``np.asarray`` and indexed directly.
    y_log : target values on the log scale, sliced the same way as ``X``.
    model_name : str
        Label used in the summary lines.
    n_splits, shuffle, random_state
        Forwarded to ``KFold``. ``random_state`` is only forwarded when
        ``shuffle`` is true, because ``KFold`` raises if a seed is given
        without shuffling.
    log_type : {"log1p", "log"}
        Which transform produced ``y_log``; selects ``np.expm1`` or ``np.exp``
        as the inverse.
    show_per_fold : bool
        Print one line of metrics per fold when true.

    Raises
    ------
    ValueError
        If ``log_type`` is neither ``"log1p"`` nor ``"log"``. Checked before
        any model fitting takes place.
    """
    # Validate up front so a typo does not cost a full model fit first.
    inverse_by_type = {"log1p": np.expm1, "log": np.exp}
    if log_type not in inverse_by_type:
        raise ValueError("log_type must be 'log1p' or 'log'")
    inverse_log = inverse_by_type[log_type]

    def take_rows(data, idx):
        # Positional row selection for pandas objects and plain arrays alike.
        return data.iloc[idx] if hasattr(data, "iloc") else np.asarray(data)[idx]

    # KFold rejects a non-None random_state when shuffle is False.
    kf = KFold(n_splits=n_splits, shuffle=shuffle,
               random_state=random_state if shuffle else None)
    log_rmse_list, real_rmse_list, mape_list = [], [], []

    for fold, (tr_idx, va_idx) in enumerate(kf.split(X), 1):
        X_tr, X_va = take_rows(X, tr_idx), take_rows(X, va_idx)
        y_tr_log, y_va_log = take_rows(y_log, tr_idx), take_rows(y_log, va_idx)

        model.fit(X_tr, y_tr_log)
        preds_log = model.predict(X_va)

        rmse_log = mean_squared_error(y_va_log, preds_log) ** 0.5

        # Back to the original scale for the interpretable metrics.
        y_va_raw = inverse_log(y_va_log)
        preds_raw = inverse_log(preds_log)

        # Floor both at a tiny positive value so MAPE never divides by zero.
        eps = 1e-9
        y_va_raw_safe = np.clip(y_va_raw, eps, None)
        preds_raw_safe = np.clip(preds_raw, eps, None)

        rmse_real = mean_squared_error(y_va_raw_safe, preds_raw_safe) ** 0.5
        mape = mean_absolute_percentage_error(y_va_raw_safe, preds_raw_safe) * 100

        log_rmse_list.append(rmse_log)
        real_rmse_list.append(rmse_real)
        mape_list.append(mape)

        if show_per_fold:
            print(f"Fold {fold}: logRMSE={rmse_log:.5f}, realRMSE={rmse_real:.2f}, MAPE={mape:.2f}%")

    print(f"[{model_name}] logRMSE={np.mean(log_rmse_list):.5f} ± {np.std(log_rmse_list):.5f}")
    print(f"[{model_name}] realRMSE={np.mean(real_rmse_list):.2f} ± {np.std(real_rmse_list):.2f}")
    print(f"[{model_name}] MAPE={np.mean(mape_list):.2f}% ± {np.std(mape_list):.2f}%")


In [33]:
from sklearn.linear_model import LinearRegression

# Baseline: plain least-squares fit with no scaling or regularisation step.
model = LinearRegression()
evaluate_model(model, X_train, y_train, model_name="Linear Regression")

Fold 1: logRMSE=0.11068, realRMSE=58547.92, MAPE=8.62%
Fold 2: logRMSE=0.11084, realRMSE=59486.32, MAPE=8.62%
Fold 3: logRMSE=0.11032, realRMSE=59101.41, MAPE=8.58%
Fold 4: logRMSE=0.11072, realRMSE=58750.62, MAPE=8.62%
Fold 5: logRMSE=0.11018, realRMSE=74167.84, MAPE=8.59%
[Linear Regression] logRMSE=0.11055 ± 0.00025
[Linear Regression] realRMSE=62010.82 ± 6086.90
[Linear Regression] MAPE=8.60% ± 0.02%


In [46]:
# Lasso with cross-validated alpha (plain LassoCV on standardized features, not adaptive lasso)
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Candidate penalties: 50 log-spaced values from 1e-4 to 1e2.
# NOTE(review): the recorded run selects 1e-4, the lower edge of this grid;
# consider extending the grid downwards.
alphas = np.logspace(-4, 2, 50)  
m = Pipeline([
    ("scaler", StandardScaler()),
    ("lasso", LassoCV(alphas=alphas, cv=5, max_iter=10000, tol=1e-4, random_state=0))
])
# Fit on the full training set to select alpha and obtain the coefficients.
m.fit(X_train, y_train)

best_alpha = m.named_steps["lasso"].alpha_
coef = m.named_steps["lasso"].coef_
print(f"Best alpha: {best_alpha}")
# evaluate_model fits the estimator it receives on every fold. Scoring an
# unfitted copy keeps `m` as the full-data fit whose alpha was printed above,
# instead of silently replacing it with the fit from the last CV fold.
evaluate_model(clone(m), X_train, y_train, "Lasso model")

Best alpha: 0.0001
Fold 1: logRMSE=0.11069, realRMSE=58552.32, MAPE=8.62%
Fold 2: logRMSE=0.11084, realRMSE=59472.47, MAPE=8.62%
Fold 3: logRMSE=0.11032, realRMSE=59107.78, MAPE=8.58%
Fold 4: logRMSE=0.11072, realRMSE=58754.99, MAPE=8.62%
Fold 5: logRMSE=0.11018, realRMSE=73997.68, MAPE=8.59%
[Lasso model] logRMSE=0.11055 ± 0.00025
[Lasso model] realRMSE=61977.05 ± 6018.50
[Lasso model] MAPE=8.60% ± 0.02%


In [48]:
from sklearn.linear_model import RidgeCV

# Candidate penalties: 50 log-spaced values from 1e-4 to 1e4.
alphas = np.logspace(-4, 4, 50)  

ridge_model = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", RidgeCV(alphas=alphas, cv=5))
])

# Fit on the full training set to select alpha and obtain the coefficients.
ridge_model.fit(X_train, y_train)

best_alpha = ridge_model.named_steps["ridge"].alpha_
coef = ridge_model.named_steps["ridge"].coef_

print(f"Best alpha: {best_alpha:.6f}")
print(f"Non-zero coefficients: {(coef != 0).sum()}/{len(coef)}")

# evaluate_model fits the estimator it receives on every fold. Scoring an
# unfitted copy keeps `ridge_model` as the full-data fit reported above,
# instead of leaving it overwritten by the fit from the last CV fold.
evaluate_model(clone(ridge_model), X_train, y_train, "Ridge Regression")


Best alpha: 11.513954
Non-zero coefficients: 36/36
Fold 1: logRMSE=0.11068, realRMSE=58545.56, MAPE=8.62%
Fold 2: logRMSE=0.11084, realRMSE=59483.08, MAPE=8.62%
Fold 3: logRMSE=0.11032, realRMSE=59098.28, MAPE=8.58%
Fold 4: logRMSE=0.11072, realRMSE=58750.12, MAPE=8.62%
Fold 5: logRMSE=0.11018, realRMSE=74085.52, MAPE=8.59%
[Ridge Regression] logRMSE=0.11055 ± 0.00025
[Ridge Regression] realRMSE=61992.51 ± 6054.91
[Ridge Regression] MAPE=8.60% ± 0.02%


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Bagged forest: 800 trees, depth capped at 20, at least 4 samples per leaf,
# half of the features considered at each split.
rf_model = RandomForestRegressor(
    n_estimators=800,
    max_depth=20,
    min_samples_leaf=4,
    max_features=0.5,
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)

# Full-data fit; this is the model to use for any later prediction.
rf_model.fit(X_train, y_train)

# evaluate_model fits the estimator it receives on every fold, which would
# discard the full-data fit above and leave rf_model trained on the last
# fold's training split only. Cross-validate an unfitted copy instead.
evaluate_model(clone(rf_model), X_train, y_train, "Random Forest")

模型已训练完成
树数量: 800, 最大深度: 20
Fold 1: logRMSE=0.05513, realRMSE=30307.18, MAPE=4.09%
Fold 2: logRMSE=0.05528, realRMSE=30510.66, MAPE=4.12%
Fold 3: logRMSE=0.05526, realRMSE=30360.48, MAPE=4.12%
Fold 4: logRMSE=0.05448, realRMSE=30115.87, MAPE=4.05%
Fold 5: logRMSE=0.05502, realRMSE=30689.36, MAPE=4.12%
[Random Forest] logRMSE=0.05503 ± 0.00029
[Random Forest] realRMSE=30396.71 ± 193.30
[Random Forest] MAPE=4.10% ± 0.03%


In [None]:
# XGBoost
from xgboost import XGBRegressor

# Gradient-boosted trees, scored with the shared cross-validation helper.
xgb_model = XGBRegressor(
    # boosting schedule
    learning_rate=0.05,
    n_estimators=1500,
    # tree size and row/column subsampling
    max_depth=6,
    colsample_bytree=0.8,
    subsample=0.8,
    # regularisation
    reg_lambda=1.0,
    # runtime settings
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
)
# evaluate_model fits the estimator it is given on each fold, so afterwards
# xgb_model holds the fit from the last fold's training split.
evaluate_model(xgb_model, X_train, y_train, model_name="XGBoost")

Fold 1: logRMSE=0.04966, realRMSE=26183.68, MAPE=3.74%
Fold 2: logRMSE=0.05065, realRMSE=26991.55, MAPE=3.82%
Fold 3: logRMSE=0.05112, realRMSE=27559.32, MAPE=3.79%
Fold 4: logRMSE=0.04988, realRMSE=26796.97, MAPE=3.74%
Fold 5: logRMSE=0.05447, realRMSE=30036.01, MAPE=4.11%
[XGBoost] logRMSE=0.05115 ± 0.00174
[XGBoost] realRMSE=27513.51 ± 1335.62
[XGBoost] MAPE=3.84% ± 0.14%


In [None]:

# evaluate_model leaves xgb_model holding only the fit from its last CV fold,
# so refit on the complete training set before predicting for submission.
xgb_model.fit(X_train, y_train)

# Predict on exactly the training feature columns, in the same order; this
# also drops any extra test-only column such as an identifier.
preds_log = xgb_model.predict(X_test[X_train.columns])

# Invert the log1p target transform (evaluate_model's default log_type).
preds_raw = np.expm1(preds_log)

# Use the Id column's VALUES when it exists; the original assigned the
# literal string "Id", which would be broadcast to every row.
id_col = test_df["Id"] if "Id" in test_df.columns else test_df.index

submission = pd.DataFrame({
    "Id": np.asarray(id_col),
    "Predicted": preds_raw
})

# Make sure the output directory exists before writing.
out_path = Path("submission-result/submission_xgb.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(out_path, index=False)

In [55]:
# LightGBM
from lightgbm import LGBMRegressor
# LightGBM gradient-boosted trees, scored with the shared CV helper.
LGBM_model = LGBMRegressor(
    # task definition
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    # boosting schedule
    learning_rate=0.03,
    n_estimators=1500,
    # tree shape: up to 50 leaves, no explicit depth limit
    num_leaves=50,
    max_depth=-1,
    min_child_samples=20,
    # row/column subsampling
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    # regularisation
    reg_alpha=0.1,
    reg_lambda=0.3,
    # runtime settings
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
# evaluate_model fits the estimator it is given on each fold, so afterwards
# LGBM_model holds the fit from the last fold's training split.
evaluate_model(LGBM_model, X_train, y_train, model_name="LightGBM")

Fold 1: logRMSE=0.04989, realRMSE=26440.42, MAPE=3.73%
Fold 2: logRMSE=0.04990, realRMSE=26620.04, MAPE=3.76%
Fold 3: logRMSE=0.05056, realRMSE=27239.22, MAPE=3.78%
Fold 4: logRMSE=0.04948, realRMSE=26470.80, MAPE=3.72%
Fold 5: logRMSE=0.05169, realRMSE=28091.59, MAPE=3.90%
[LightGBM] logRMSE=0.05030 ± 0.00077
[LightGBM] realRMSE=26972.41 ± 629.69
[LightGBM] MAPE=3.78% ± 0.07%


In [56]:
from catboost import CatBoostRegressor
# CatBoost gradient-boosted trees, scored with the shared CV helper.
cat_model = CatBoostRegressor(
    # objective and reported metric
    loss_function="RMSE",        # RMSE is the usual choice for regression
    eval_metric="RMSE",
    # boosting schedule
    iterations=3000,             # plays the role of n_estimators
    learning_rate=0.03,          # learning rate
    # tree structure and regularisation
    depth=8,                     # tree depth (5-10 usually works well)
    l2_leaf_reg=3.0,             # L2 regularisation term (1-10)
    border_count=128,            # number of bins for continuous features
    # randomisation / sampling
    bagging_temperature=0.8,     # bagging sampling strength
    random_strength=1.0,         # extra randomness in feature scoring
    subsample=0.8,               # row sampling ratio
    # runtime settings
    random_seed=42,
    verbose=200,                 # log every 200 iterations
    task_type="CPU",             # switch to "GPU" if one is available
    # Intended as early stopping against overfitting. NOTE(review):
    # evaluate_model calls fit() without an eval_set and the recorded log
    # runs to iteration 2999, so this setting appears to have no effect here.
    early_stopping_rounds=100,
)
# evaluate_model fits the estimator it is given on each fold, so afterwards
# cat_model holds the fit from the last fold's training split.
evaluate_model(cat_model, X_train, y_train, model_name="CatBoost")

0:	learn: 0.3367216	total: 60.6ms	remaining: 3m 1s
200:	learn: 0.0767770	total: 1.04s	remaining: 14.5s
400:	learn: 0.0639139	total: 1.94s	remaining: 12.6s
600:	learn: 0.0579043	total: 2.79s	remaining: 11.1s
800:	learn: 0.0545920	total: 3.63s	remaining: 9.96s
1000:	learn: 0.0524124	total: 4.49s	remaining: 8.96s
1200:	learn: 0.0508892	total: 5.33s	remaining: 7.99s
1400:	learn: 0.0497183	total: 6.56s	remaining: 7.49s
1600:	learn: 0.0487293	total: 7.49s	remaining: 6.54s
1800:	learn: 0.0478931	total: 8.39s	remaining: 5.58s
2000:	learn: 0.0471734	total: 9.27s	remaining: 4.63s
2200:	learn: 0.0465443	total: 10.2s	remaining: 3.69s
2400:	learn: 0.0459806	total: 11.1s	remaining: 2.77s
2600:	learn: 0.0454617	total: 12s	remaining: 1.84s
2800:	learn: 0.0450016	total: 12.9s	remaining: 916ms
2999:	learn: 0.0445667	total: 13.8s	remaining: 0us
Fold 1: logRMSE=0.04868, realRMSE=25620.09, MAPE=3.63%
0:	learn: 0.3362725	total: 5.33ms	remaining: 16s
200:	learn: 0.0761440	total: 916ms	remaining: 12.8s
400:	l

In [57]:
# evaluate_model leaves cat_model holding only the fit from its last CV fold,
# so refit on the complete training set before predicting for submission.
cat_model.fit(X_train, y_train)

# Predict on exactly the training feature columns, in the same order; this
# also drops any extra test-only column such as an identifier.
preds_log = cat_model.predict(X_test[X_train.columns])

# Invert the log1p target transform (evaluate_model's default log_type).
preds_raw = np.expm1(preds_log)

# Use the Id column's VALUES when it exists; the original assigned the
# literal string "Id", which would be broadcast to every row.
id_col = test_df["Id"] if "Id" in test_df.columns else test_df.index

submission = pd.DataFrame({
    "Id": np.asarray(id_col),
    "Predicted": preds_raw
})

# Make sure the output directory exists before writing.
out_path = Path("submission-result/submission_cat.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(out_path, index=False)