In [1]:
import pandas as pd
# Load the pre-processed training and validation splits from their Parquet files.
# Kept as two separate statements so a failure on the second read still leaves
# `df_train` available in the kernel.
df_train = pd.read_parquet(path='processed/train.snappy.parquet')
df_val = pd.read_parquet(path='processed/val.snappy.parquet')

In [2]:
# XGBoost local (sin Ray) usando el mismo batch: df_train/df_val ya cargados arriba
import os
import time
import numpy as np
import pandas as pd

try:
    import xgboost as xgb
except Exception as exc:
    # Any failure while importing xgboost is re-raised as ImportError with an
    # install hint; the original exception stays attached as the cause.
    raise ImportError(
        "No se pudo importar xgboost. Instala con: pip install xgboost"
    ) from exc

# Name of the label column expected in both splits.
TARGET = "attack"

# Fail fast if either frame lacks the label column; the message previews the
# first 30 column names of the offending frame to help spot a naming mismatch.
assert TARGET in df_train.columns, (
    f"No existe columna target '{TARGET}' en df_train. Columnas: {list(df_train.columns)[:30]}..."
)
assert TARGET in df_val.columns, (
    f"No existe columna target '{TARGET}' en df_val. Columnas: {list(df_val.columns)[:30]}..."
)

# Thread configuration: use every available core by default, but honour a
# pre-set OMP_NUM_THREADS (positive integer) as an explicit limit instead of
# overwriting it, so the thread count can actually be capped from outside.
# NOTE(review): numpy/pandas are already imported when this runs, so libraries
# that read these variables only at load time may not see the new values; the
# explicit `nthread` arguments handed to XGBoost later in the cell do not
# depend on the environment.
try:
    _requested_threads = int(os.environ.get("OMP_NUM_THREADS", ""))
except ValueError:
    # Unset or not an integer -> fall back to all cores.
    _requested_threads = 0

# The name `cpu_count` is kept because the rest of the cell uses it as the
# effective thread count.
cpu_count = _requested_threads if _requested_threads > 0 else (os.cpu_count() or 1)

# Propagate the effective thread count to the usual threading-related variables.
for var in (
    "OMP_NUM_THREADS",
    "MKL_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "NUMEXPR_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "XGBOOST_NUM_THREADS",
    "XGB_NUM_THREADS",
    "RAYON_NUM_THREADS",
):
    os.environ[var] = str(cpu_count)

print(f"[local-xgboost] cpu_count={cpu_count} | OMP_NUM_THREADS={os.environ.get('OMP_NUM_THREADS')}")

# Hyper-parameters replicated (per the original note) from
# k3s/kuberay/schemas/xgboost_params.py, copied here so Ray does not need to be imported.
params = {
    "objective": "multi:softprob",
    "eval_metric": ["mlogloss", "merror"],
    "booster": "gbtree",
    "tree_method": "hist",
    "verbosity": 1,
    "eta": 0.3,
    "max_depth": 6,
    "min_child_weight": 1,
    "subsample": 1.0,
    "lambda": 1.0,
    "alpha": 0.0,
    # threading
    "nthread": cpu_count,
}

# Infer num_class from the current batch (train + validation labels together).
# XGBoost requires every label to lie in [0, num_class), so counting distinct
# values alone is not enough when class ids are non-contiguous or a class is
# absent from this batch: the value is raised to (largest label + 1) when needed.
# For contiguous labels 0..K-1 this is identical to the plain distinct count.
# The labels are cast to int64, the same cast used when the label vectors are built.
_all_labels = pd.concat([df_train[TARGET], df_val[TARGET]], axis=0).astype("int64")
num_class = max(int(_all_labels.nunique()), int(_all_labels.max()) + 1)
params["num_class"] = num_class
print(f"[local-xgboost] num_class={num_class}")

def _split_features_labels(frame):
    """Return (feature columns, label column cast to int64) for one split."""
    return frame.drop(columns=[TARGET]), frame[TARGET].astype("int64")


# Build the DMatrix objects directly from the current batch, without further preprocessing.
X_train, y_train = _split_features_labels(df_train)
X_val, y_val = _split_features_labels(df_val)

dtrain = xgb.DMatrix(data=X_train, label=y_train, nthread=cpu_count)
dval = xgb.DMatrix(data=X_val, label=y_val, nthread=cpu_count)

# Number of boosting rounds; per the original note this equals the schema default.
num_boost_round = 100

# Time the training call with a monotonic high-resolution clock.
# The validation DMatrix is evaluated under the name "validation" and metrics
# are logged every 10 rounds.
t0 = time.perf_counter()
booster = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=[(dval, "validation")],
    verbose_eval=10,
)
dt = time.perf_counter() - t0
print(f"[local-xgboost] train_time_sec={dt:.2f}")

# Quick validation metric: accuracy and its complement (classification error).
probs = booster.predict(dval)
# A 1-D prediction array is thresholded at 0.5; a 2-D array is reduced to the
# index of the highest-scoring column per row.
y_pred = (
    (probs > 0.5).astype("int64")
    if probs.ndim == 1
    else np.argmax(probs, axis=1).astype("int64")
)
y_true = y_val.to_numpy()
acc = float((y_true == y_pred).mean())
merror = 1.0 - acc
print(f"[local-xgboost] val_accuracy={acc:.4f} | val_merror={merror:.4f}")

[local-xgboost] cpu_count=24 | OMP_NUM_THREADS=24
[local-xgboost] num_class=6
[0]	validation-mlogloss:0.75514	validation-merror:0.00833
[10]	validation-mlogloss:0.04549	validation-merror:0.00400
[20]	validation-mlogloss:0.01370	validation-merror:0.00433
[30]	validation-mlogloss:0.01207	validation-merror:0.00433
[40]	validation-mlogloss:0.01193	validation-merror:0.00400
[50]	validation-mlogloss:0.01185	validation-merror:0.00400
[60]	validation-mlogloss:0.01187	validation-merror:0.00400
[70]	validation-mlogloss:0.01194	validation-merror:0.00400
[80]	validation-mlogloss:0.01197	validation-merror:0.00367
[90]	validation-mlogloss:0.01202	validation-merror:0.00367
[99]	validation-mlogloss:0.01213	validation-merror:0.00367
[local-xgboost] train_time_sec=14.74
[local-xgboost] val_accuracy=0.9963 | val_merror=0.0037
