# Train XGBoost from Parquet (Categorical dtypes)

- Read the training data from `X_clean.parquet` (which preserves `category` dtypes) and `y.csv`;
- Use `XGBRegressor(enable_categorical=True)` to train the model;
- Optionally fit on a **log1p-transformed target** (`LOG_TARGET=True`);
- Perform a **train/validation split** and report RMSE and R²;
- Save the trained model as `xgb_model.joblib`.

> Prerequisite: run `clean_only_xgb.ipynb` first to generate `xgb_clean_outputs/X_clean.parquet` and `y.csv`.


In [10]:

# Optional: install the dependencies this notebook imports (uncomment to run)
# %pip install -U xgboost scikit-learn numpy joblib pandas pyarrow


In [11]:

import os, json
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from xgboost import XGBRegressor

# ==== Path configuration (adjust to your directory layout) ====
DATA_DIR = "../xgb_clean_outputs"
X_PARQUET = f"{DATA_DIR}/X_clean.parquet"
Y_CSV     = f"{DATA_DIR}/y.csv"
OUT_MODEL = "../xgb_model/xgb_model.joblib"

# Training configuration
SEED = 42            # used for both the train/validation split and the model's random_state
TEST_SIZE = 0.2      # fraction of rows held out for validation
LOG_TARGET = False   # if True, fit on log1p(y); predictions are mapped back with expm1
# Hyperparameters passed to XGBRegressor
N_ESTIMATORS = 2000
LEARNING_RATE = 0.05
MAX_DEPTH = 6
SUBSAMPLE = 0.8
COLSAMPLE_BYTREE = 0.8
REG_ALPHA = 0.0
REG_LAMBDA = 1.0

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The RMSE as a built-in ``float`` (so it serialises cleanly to JSON).
    """
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))


## 1) Read the data

In [12]:

# Features are read from Parquet (the format is used here to keep pandas
# `category` dtypes); the target is the "y" column of the CSV file.
X = pd.read_parquet(X_PARQUET)
y = pd.read_csv(Y_CSV)["y"].to_numpy()

# Fail fast with a clear message if the two files are out of sync (e.g. one of
# them was regenerated without the other) instead of erroring later in the split.
if len(X) != len(y):
    raise ValueError(
        f"Row count mismatch: X has {len(X)} rows but y has {len(y)} values"
    )

print(X.dtypes.head())
print("X shape:", X.shape, "| y shape:", y.shape)


LotFrontage    float64
LotArea          int64
OverallQual      int64
OverallCond      int64
YearBuilt        int64
dtype: object
X shape: (1460, 82) | y shape: (1460,)


## 2) Train/validation split (optional: log1p target)

In [13]:

# Hold out TEST_SIZE of the rows for validation; SEED makes the split reproducible.
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)

# Only the training target is (optionally) log1p-transformed; y_va is left untouched.
y_tr_fit = np.log1p(y_tr) if LOG_TARGET else y_tr

# Display the resulting shapes as the cell output.
X_tr.shape, X_va.shape, y_tr.shape, y_va.shape


((1168, 82), (292, 82), (1168,), (292,))

## 3) Train the XGBoost model

In [14]:

# Build the regressor from the constants defined in the configuration cell.
model = XGBRegressor(
    # Boosting schedule
    n_estimators=N_ESTIMATORS,
    learning_rate=LEARNING_RATE,
    # Tree shape and row/column sampling
    max_depth=MAX_DEPTH,
    subsample=SUBSAMPLE,
    colsample_bytree=COLSAMPLE_BYTREE,
    # L1 / L2 regularisation
    reg_alpha=REG_ALPHA,
    reg_lambda=REG_LAMBDA,
    # Engine settings: histogram tree method with categorical support enabled
    tree_method="hist",
    enable_categorical=True,
    random_state=SEED,
    n_jobs=-1,
)

# Fit on the training split; y_tr_fit is log1p-transformed when LOG_TARGET is True.
model.fit(X_tr, y_tr_fit)

# Show the fitted estimator as the cell output.
model


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,True


## 4) Evaluate on the validation set (RMSE / R²)

In [15]:

# Predict on the validation split. The raw output is on the scale the model was
# fitted on, so it is only a log-scale prediction when LOG_TARGET is True.
pred_log = model.predict(X_va)
if LOG_TARGET:
    # Invert the log1p transform so metrics are computed on the original scale.
    pred = np.expm1(pred_log)
else:
    pred = pred_log

# Collect plain-Python values so the dict can be written to JSON.
metrics = dict(
    rmse=rmse(y_va, pred),
    r2=float(r2_score(y_va, pred)),
    log_target=bool(LOG_TARGET),
)
metrics


{'rmse': 25522.873819380136, 'r2': 0.9150730967521667, 'log_target': False}

## 5) Save the model and metrics

In [16]:

# The metrics file sits next to the model: "<model path without extension>_metrics.json".
metrics_path = os.path.splitext(OUT_MODEL)[0] + "_metrics.json"

# Neither joblib.dump nor open() creates missing parent directories, so make sure
# the output folder exists (this matters on a fresh checkout). The guard skips
# makedirs when OUT_MODEL has no directory component, where dirname() returns "".
out_dir = os.path.dirname(OUT_MODEL)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Persist the fitted estimator and the validation metrics.
dump(model, OUT_MODEL)
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2, ensure_ascii=False)

print("[OK] Saved model to:", OUT_MODEL)
print("[OK] Saved metrics to:", metrics_path)


[OK] Saved model to: ../xgb_model/xgb_model.joblib
[OK] Saved metrics to: ../xgb_model/xgb_model_metrics.json


## 6) (Optional) train a final model on full training data

In [17]:

# Optionally refit with the same hyperparameters on ALL rows, so that the final
# model also learns from the rows held out for validation earlier.
# Note: nothing is held out here, so no validation metrics are produced for this model.
TRAIN_FULL = False  # set to True to train on the full dataset

if TRAIN_FULL:
    # Apply the same optional target transform as for the train/validation model.
    y_fit_full = np.log1p(y) if LOG_TARGET else y
    model_full = XGBRegressor(
        n_estimators=N_ESTIMATORS,
        learning_rate=LEARNING_RATE,
        max_depth=MAX_DEPTH,
        subsample=SUBSAMPLE,
        colsample_bytree=COLSAMPLE_BYTREE,
        reg_alpha=REG_ALPHA,
        reg_lambda=REG_LAMBDA,
        random_state=SEED,
        tree_method="hist",
        enable_categorical=True,
        n_jobs=-1
    )
    model_full.fit(X, y_fit_full)

    # Saved next to the validation model as "<model path without extension>_full.joblib".
    out_full = os.path.splitext(OUT_MODEL)[0] + "_full.joblib"
    # joblib.dump does not create missing parent directories; create the folder
    # so this cell does not depend on an earlier cell having done so.
    out_full_dir = os.path.dirname(out_full)
    if out_full_dir:
        os.makedirs(out_full_dir, exist_ok=True)
    dump(model_full, out_full)
    print("[OK] Saved full-data model to:", out_full)
else:
    print("Skipped. Set TRAIN_FULL=True to train on the full dataset.")


Skipped. Set TRAIN_FULL=True to train on the full dataset.
