# Notebook 2 — PSO-Tuned LightGBM Training

This notebook consumes the balanced parquet generated in Notebook 1 and performs PSO-guided hyperparameter tuning for LightGBM. The resulting model + validation metrics are stored under `artifacts/`.

## Workflow

1. Import configuration + helper modules.
2. Load the processed dataset and create deterministic train/val/test splits.
3. Launch PSO search to optimize key LightGBM hyperparameters.
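4. Train the final model on the combined train + validation data and log validation performance (note: because the final model has seen the validation rows, this score is in-sample and optimistic).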
5. Persist the model (`artifacts/models/lightgbm_pso.pkl`) and metrics JSON.

In [None]:
from pathlib import Path
import sys
import json

import joblib
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def find_project_root(start: Path, marker: str = "src") -> Path:
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Parameters
    ----------
    start : Path
        Directory to begin searching from (typically the working directory).
    marker : str
        Name of the sub-directory that identifies the project root.

    Returns
    -------
    Path
        The first ancestor (including ``start``) holding a ``marker`` directory,
        or ``start`` itself when none is found, which keeps the previous
        behaviour of treating the working directory as the project root.
    """
    start = Path(start)
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start


# Walk upward from the working directory so the notebook also works when the
# kernel is started from a sub-folder of the repository.
PROJECT_ROOT = find_project_root(Path.cwd())
SRC_DIR = PROJECT_ROOT / "src"
if SRC_DIR.exists():
    # Later cells import `src.config` / `src.models...`; resolving the `src`
    # package requires its *parent* directory on sys.path. SRC_DIR is still
    # appended as before so any flat imports relying on it keep working.
    for path_entry in (str(PROJECT_ROOT), str(SRC_DIR)):
        if path_entry not in sys.path:
            sys.path.append(path_entry)

print(f"Using project root: {PROJECT_ROOT}")

In [None]:
from src.config import DEFAULT_CONFIG_PATH, load_config

# Load the project configuration and read the processed dataset location from it.
config = load_config(DEFAULT_CONFIG_PATH)
processed_path = Path(config.data.processed_file)

# Only read the parquet when it is present; otherwise stop with a pointer to
# the notebook that is expected to produce it.
if processed_path.exists():
    print(processed_path)
    df = pd.read_parquet(processed_path)
else:
    raise FileNotFoundError(f"Processed dataset not found at {processed_path}. Run Notebook 1 first.")

# Preview the first rows as the cell output.
df.head()

In [None]:
# Separate the target column from the feature matrix.
label_col = config.data.label_column
y = df[label_col]
X = df.drop(columns=[label_col])

# The same seed drives both splits.
split_seed = config.training.random_state

# Stage 1: hold out the test partition, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=config.training.test_size, stratify=y, random_state=split_seed
)

# Stage 2: split what remains into train and validation partitions.
# val_size is applied to the post-test remainder, not to the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=config.training.val_size, stratify=y_train, random_state=split_seed
)

# Row counts per partition, shown as the cell output.
len(X_train), len(X_val), len(X_test)

### Run PSO search + final training

Set `RUN_TRAINING=True` to launch the PSO loop. Training can take several minutes depending on the number of particles/iterations in the YAML config.

In [None]:
from src.models.pso_lightgbm import PSOLightGBMTuner

# Opt-in switch for the PSO search, which the notebook text describes as
# potentially taking several minutes; both results stay None when skipped.
RUN_TRAINING = False
best_params = None
model = None

if not RUN_TRAINING:
    print("Skipping PSO training. Set RUN_TRAINING=True to execute.")
else:
    tuner = PSOLightGBMTuner(config)
    # The search itself is given only the training partition.
    best_params = tuner.fit(X_train, y_train)
    # The final fit receives the train and validation rows stacked together.
    model = tuner.train_best_model(
        pd.concat([X_train, X_val]),
        pd.concat([y_train, y_val]),
    )
    print("Best params:\n", best_params)

In [None]:
if model is not None:
    # NOTE: the final model is fit on train + validation rows in the training
    # cell, so this validation report is an in-sample (optimistic) score. It is
    # kept so existing consumers of validation_report.json continue to work.
    val_preds = model.predict(X_val)
    report = classification_report(y_val, val_preds, output_dict=True)
    print("Validation macro F1:", report["macro avg"]["f1-score"])

    # The test partition was never passed to the tuner or the final fit in this
    # notebook, so it provides the unbiased estimate of generalisation.
    test_preds = model.predict(X_test)
    test_report = classification_report(y_test, test_preds, output_dict=True)
    print("Held-out test macro F1:", test_report["macro avg"]["f1-score"])

    # Normalise config paths to Path objects once so `/` and the filesystem
    # helpers behave the same whether the config stores str or Path values.
    model_path = Path(config.paths.models_path) / "lightgbm_pso.pkl"
    metrics_dir = Path(config.paths.metrics_path)
    metrics_path = metrics_dir / "validation_report.json"
    test_metrics_path = metrics_dir / "test_report.json"

    model_path.parent.mkdir(parents=True, exist_ok=True)
    metrics_dir.mkdir(parents=True, exist_ok=True)

    joblib.dump(model, model_path)
    metrics_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
    test_metrics_path.write_text(json.dumps(test_report, indent=2), encoding="utf-8")
    print(f"Saved model to {model_path}")
    print(f"Saved metrics to {metrics_path} and {test_metrics_path}")
else:
    print("Model not trained in this session.")