## Import Standard Libraries

In [None]:
from pathlib import Path
from typing import List, Optional, Tuple, Callable, Union, Any, Dict, Type
from dataclasses import dataclass
from abc import ABC, abstractmethod
from copy import deepcopy
from enum import Enum

import optuna
from optuna.samplers import TPESampler
from optuna.trial import Trial
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost as cat
from catboost import Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
import yaml

## Configuration Classes

In [None]:
@dataclass
class DatasetConfig:
    """Column-level description of a tabular dataset.

    The preprocessors and the model pipeline in this file read these fields
    to decide which columns to encode, scale, select as features, or treat
    as the label.
    """
    # Feature columns selected from the preprocessed frame at prediction time.
    training_col_names: List[str]
    # Name of the label column.
    target_col_name: str
    # Columns handed to the standard scaler.
    numerical_col_names: List[str]
    # Columns handed to the ordinal encoder.
    categorical_col_names: List[str]
    # Optional free-form dataset label; not read by the code in this file.
    name: Optional[str] = None

## Base Classes

In [None]:
class BaseKtoolsModel(ABC):
    """Abstract interface shared by every model wrapper.

    Concrete subclasses provide ``fit`` and ``predict``. The base class only
    holds the underlying model object and a flag recording whether fitting
    has happened.
    """

    def __init__(self) -> None:
        self.model = None
        self._fitted = False

    @abstractmethod
    def fit(
        self,
        X,
        y,
        validation_set: Optional[Tuple] = None,
        weights: Optional[np.ndarray] = None,
    ) -> "BaseKtoolsModel":
        """Train on ``X``/``y`` and return the fitted wrapper.

        Args:
            X: Training features.
            y: Training targets.
            validation_set: Optional ``(X_val, y_val)`` pair.
            weights: Optional per-sample training weights.
        """

    @abstractmethod
    def predict(self, X) -> np.ndarray:
        """Return predictions for ``X``."""

    @property
    def fitted(self) -> bool:
        """Whether the wrapper has been marked as fitted."""
        return self._fitted


class BasePreprocessor(ABC):
    """Abstract fit/transform step that operates on DataFrames.

    Subclasses implement ``fit`` and ``transform``; ``fit_transform`` is
    provided here as the composition of the two.
    """

    name = "base-preprocessor"

    def __init__(self, config: DatasetConfig):
        self.config = config
        self._fitted = False

    @abstractmethod
    def fit(self, data: pd.DataFrame) -> "BasePreprocessor":
        """Learn whatever state the step needs from ``data`` and return self."""

    @abstractmethod
    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Apply the step to ``data`` and return the resulting frame."""

    def fit_transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Fit on ``data`` and then transform that same frame."""
        fitted_step = self.fit(data)
        return fitted_step.transform(data)

    @property
    def fitted(self) -> bool:
        """Whether the step has been marked as fitted."""
        return self._fitted

## Utility Functions

In [None]:
def infer_task(y: Union[np.ndarray, pd.Series]) -> str:
    """Guess the learning task from the target values.

    Any value with a fractional part means regression; otherwise exactly two
    distinct values mean binary classification and more than two mean
    multiclass classification.

    Args:
        y: Target values as a NumPy array or pandas Series.

    Returns:
        ``"regression"``, ``"binary_classification"`` or
        ``"multiclass_classification"``.

    Raises:
        ValueError: If the target holds a single distinct whole-number value.
    """
    values = y.to_numpy() if isinstance(y, pd.Series) else y
    values = values.flatten()

    distinct_count = np.unique(values).shape[0]
    fractional_present = np.any(values % 1 != 0)

    if fractional_present:
        print("Target contains float values. Inferring regression task.")
        return "regression"
    if distinct_count == 2:
        print("Target contains two unique values. Inferring binary classification task.")
        return "binary_classification"
    if distinct_count > 2:
        print("Target contains more than two unique values. Inferring multiclass classification task.")
        return "multiclass_classification"

    raise ValueError(
        "Unable to infer task type from target values. Is there only one target value?"
    )


# Recursive mapping type used for the parsed YAML grid: string keys whose
# values are either further nested mappings or arbitrary leaf values.
NestedDict = dict[str, "NestedDict | Any"]
# A callable that receives an Optuna trial and returns one sampled value.
TrialSampler = Callable[[Trial], Any]


def load_optuna_grid(
    path: str,
    model_type: str,
    extra_samplers: Dict[str, TrialSampler] | None = None,
) -> Callable[[Trial], Dict[str, Any]]:
    """
    Load an Optuna parameter grid from a YAML file.

    Each entry under ``model_type`` must carry a ``type`` of ``int``,
    ``float``, ``categorical`` or ``fixed``. Fixed entries are not sampled:
    their ``value`` is returned as-is and also recorded on the trial as a
    user attribute.

    Args:
        path: Path to the YAML file containing parameter grids.
        model_type: Key in the YAML file for the model's parameter grid.
        extra_samplers: Additional tunable parameters as Optuna callables.
            Each callable takes a Trial and returns a sampled value.
            Example: {"weight": lambda t: t.suggest_float("weight", 0.5, 2.0)}

    Returns:
        A callable that takes an Optuna Trial and returns sampled parameters.

    Raises:
        ValueError: If the file holds no grid for ``model_type``, or (when the
            returned callable runs) an entry has an unsupported ``type``.
    """
    with open(path, "r", encoding="utf-8") as f:
        loaded = yaml.safe_load(f)

    # An empty file parses to None and a bare `model_type:` key maps to None;
    # both are reported as a missing grid instead of crashing on .get()/len().
    param_grid: NestedDict = (
        loaded.get(model_type) if isinstance(loaded, dict) else None
    )
    if not param_grid:
        raise ValueError(f"No parameter grid found for model type: {model_type}")

    def param_grid_getter(trial: Trial) -> Dict[str, Any]:
        unpacked = {}
        for param_name, param_info in param_grid.items():
            dtype = param_info.get("type")
            if dtype == "int":
                unpacked[param_name] = trial.suggest_int(
                    param_name,
                    param_info["low"],
                    param_info["high"],
                )
            elif dtype == "float":
                unpacked[param_name] = trial.suggest_float(
                    param_name,
                    param_info["low"],
                    param_info["high"],
                    log=param_info.get("log", False),
                )
            elif dtype == "categorical":
                unpacked[param_name] = trial.suggest_categorical(
                    param_name,
                    param_info["choices"],
                )
            elif dtype == "fixed":
                # set_user_attr returns None, so its result must not be used
                # as the parameter value; record it and pass the value through.
                fixed_value = param_info["value"]
                trial.set_user_attr(param_name, fixed_value)
                unpacked[param_name] = fixed_value
            else:
                raise ValueError(f"Unsupported parameter type: {dtype}")

        if extra_samplers:
            for param_name, sampler in extra_samplers.items():
                unpacked[param_name] = sampler(trial)

        return unpacked

    return param_grid_getter

## Preprocessing Classes

In [None]:
class CategoricalEncoder(BasePreprocessor):
    """Ordinal-encodes the configured categorical columns.

    The encoded columns are returned with pandas ``category`` dtype. Cells
    that were missing in the input are set to ``encoded_missing_value``.
    """

    name = "categorical-encoder"

    def __init__(
        self,
        config: DatasetConfig,
        handle_unknown: str = "use_encoded_value",
        unknown_value: int = -2,
        encoded_missing_value: int = -1,
        **encoder_kwargs,
    ) -> None:
        """Build the underlying ``OrdinalEncoder``.

        Args:
            config: Dataset description; ``categorical_col_names`` is used.
            handle_unknown: Forwarded to ``OrdinalEncoder``.
            unknown_value: Forwarded to ``OrdinalEncoder``.
            encoded_missing_value: Forwarded to ``OrdinalEncoder`` and also
                written into originally-missing cells by ``transform``.
            **encoder_kwargs: Extra keyword arguments for ``OrdinalEncoder``.
        """
        super().__init__(config)
        self.encode_missing_value = encoded_missing_value
        ordinal_kwargs = dict(
            handle_unknown=handle_unknown,
            unknown_value=unknown_value,
            encoded_missing_value=encoded_missing_value,
        )
        self.encoder = OrdinalEncoder(**ordinal_kwargs, **encoder_kwargs)

    def fit(self, data: pd.DataFrame) -> "CategoricalEncoder":
        """Fit the encoder on the categorical columns of ``data``."""
        cat_cols = self.config.categorical_col_names
        self.encoder.fit(data[cat_cols])
        self._fitted = True
        return self

    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``data`` with the categorical columns encoded."""
        cat_cols = self.config.categorical_col_names
        encoded = data.copy()
        # Remember where the input was missing before the values are replaced.
        was_missing = encoded[cat_cols].isna()
        codes = self.encoder.transform(encoded[cat_cols])
        encoded[cat_cols] = codes.astype(int)
        restored = encoded[cat_cols].where(~was_missing, self.encode_missing_value)
        encoded[cat_cols] = restored.astype("category")
        return encoded


class StandardScale(BasePreprocessor):
    """Applies a ``StandardScaler`` to the configured numerical columns."""

    name = "standard-scaler"

    def __init__(self, config: DatasetConfig) -> None:
        super().__init__(config)
        self.scaler = StandardScaler()

    def fit(self, data: pd.DataFrame) -> "StandardScale":
        """Fit the scaler on the numerical columns of ``data``."""
        num_cols = self.config.numerical_col_names
        self.scaler.fit(data[num_cols])
        self._fitted = True
        return self

    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``data`` with the numerical columns scaled."""
        num_cols = self.config.numerical_col_names
        scaled = data.copy()
        scaled[num_cols] = self.scaler.transform(scaled[num_cols])
        return scaled


class PreprocessingPipeline:
    """Runs a list of preprocessors one after another, in list order."""

    def __init__(self, preprocessors: List[BasePreprocessor]) -> None:
        self.preprocessors = preprocessors

    def train_pipe(self, data: pd.DataFrame) -> pd.DataFrame:
        """Fit each step on the output of the previous one and transform it."""
        current = data
        for step in self.preprocessors:
            current = step.fit_transform(current)
        return current

    def inference_pipe(self, data: pd.DataFrame) -> pd.DataFrame:
        """Transform ``data`` through every step without refitting."""
        current = data
        for step in self.preprocessors:
            current = step.transform(current)
        return current

## Model Classes

In [None]:
class DefaultObjective(Enum):
    """Default objectives for different tasks.

    Member names match the task identifiers returned by ``infer_task``;
    member values are the objective strings that ``LGBMModel.fit`` places
    into its LightGBM parameters when the caller supplied no ``objective``.
    """
    regression = "regression"
    binary_classification = "binary"
    multiclass_classification = "multiclass"


class LGBMModel(BaseKtoolsModel):
    """LightGBM model wrapper built on ``lgb.train``.

    Args:
        num_boost_round: Number of boosting rounds passed to ``lgb.train``.
        early_stopping_rounds: Early-stopping patience, applied only when a
            validation set is given to ``fit``. ``None`` disables it.
        random_state: Seed placed in the LightGBM parameters.
        verbose: LightGBM verbosity placed in the parameters.
        n_jobs: Thread count placed in the parameters.
        callbacks: Optional list of callbacks forwarded to ``lgb.train``.
        **lgb_param_grid: Any further LightGBM parameters; these override the
            three defaults above when the keys collide.
    """
    def __init__(
        self,
        num_boost_round: int = 100,
        early_stopping_rounds: Union[int, None] = 20,
        random_state: int = 129,
        verbose: int = -1,
        n_jobs: int = 1,
        callbacks: Optional[List[Any]] = None,
        **lgb_param_grid,
    ) -> None:
        super().__init__()
        self._num_boost_round = num_boost_round
        self._verbose = verbose
        self._n_jobs = n_jobs
        # Own copy per instance: a shared default list would leak callbacks
        # between unrelated models.
        self._callbacks = list(callbacks) if callbacks is not None else []
        self.early_stopping_rounds = early_stopping_rounds

        self._lgb_param_grid = {
            "verbose": verbose,
            "random_state": random_state,
            "n_jobs": n_jobs,
            **lgb_param_grid,
        }

    def fit(
        self,
        X,
        y,
        validation_set: Optional[Tuple] = None,
        weights: Optional[np.ndarray] = None,
    ) -> "LGBMModel":
        """Train the booster.

        When no ``objective`` was supplied, one is inferred from ``y`` via
        ``infer_task`` and stored on the instance (together with
        ``num_class`` for multiclass targets).

        Args:
            X: Training features.
            y: Training targets.
            validation_set: Optional ``(X_val, y_val)`` pair; enables early
                stopping when ``early_stopping_rounds`` is not ``None``.
            weights: Optional per-sample weights for the training set.

        Returns:
            The fitted wrapper.
        """
        if "objective" not in self._lgb_param_grid:
            task_id = infer_task(y)
            self._lgb_param_grid["objective"] = DefaultObjective[task_id].value
            if task_id == "multiclass_classification":
                self._lgb_param_grid["num_class"] = np.unique(y).shape[0]

        # Per-call copy so early-stopping settings from this fit do not leak
        # into a later fit that has no validation set.
        params = dict(self._lgb_param_grid)

        train_data = lgb.Dataset(X, label=y, weight=weights)
        # The training set is always passed as the first entry of valid_sets.
        eval_sets = [train_data]
        eval_names = ["train"]
        if validation_set is not None:
            X_val, y_val = validation_set
            eval_sets.append(lgb.Dataset(X_val, label=y_val, reference=train_data))
            eval_names.append("valid")
            if self.early_stopping_rounds is not None:
                params["early_stopping_rounds"] = self.early_stopping_rounds

        self.model = lgb.train(
            params=params,
            train_set=train_data,
            num_boost_round=self._num_boost_round,
            valid_sets=eval_sets,
            valid_names=eval_names,
            callbacks=self._callbacks,
        )
        self._fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Return the booster's predictions for ``X``.

        Raises:
            ValueError: If called before ``fit``.
        """
        if self.model is None:
            raise ValueError("Model is not fitted yet. Please call 'fit' first.")
        return self.model.predict(X)

In [None]:
class XGBoostModel(BaseKtoolsModel):
    """XGBoost model wrapper built on ``xgb.train`` and ``DMatrix``."""

    class DefaultObjective(Enum):
        """Objective used per inferred task when none is supplied."""
        regression = "reg:squarederror"
        binary_classification = "binary:logistic"
        multiclass_classification = "multi:softprob"

    def __init__(
        self,
        eval_verbosity: bool = False,
        num_boost_round: int = 100,
        early_stopping_rounds: Union[int, None] = 20,
        random_state: int = 129,
        verbosity: int = 0,
        n_jobs: int = 1,
        **xgb_param_grid,
    ) -> None:
        """Store training settings and assemble the booster parameters.

        ``xgb_param_grid`` entries override the ``verbosity``,
        ``random_state`` and ``n_jobs`` defaults on key collisions.
        """
        super().__init__()
        self._early_stopping_rounds = early_stopping_rounds
        self._n_jobs = n_jobs
        self._verbosity = verbosity
        self._num_boost_round = num_boost_round
        self._eval_verbosity = eval_verbosity

        base_params = {
            "verbosity": verbosity,
            "random_state": random_state,
            "n_jobs": n_jobs,
        }
        self._xgb_param_grid = {**base_params, **xgb_param_grid}

    def fit(
        self,
        X,
        y,
        validation_set: Optional[Tuple] = None,
        weights: Optional[np.ndarray] = None,
    ) -> "XGBoostModel":
        """Train the booster.

        An objective is inferred from ``y`` and stored on the instance when
        none was supplied. ``early_stopping_rounds`` is forwarded only when a
        validation set is given.
        """
        booster_params = self._xgb_param_grid
        if "objective" not in booster_params:
            task = infer_task(y)
            booster_params["objective"] = self.DefaultObjective[task].value
            if task == "multiclass_classification":
                booster_params["num_class"] = np.unique(y).shape[0]

        dtrain = xgb.DMatrix(X, label=y, enable_categorical=True, weight=weights)
        watchlist = [(dtrain, "train")]
        optional_kwargs = {}
        if validation_set is not None:
            X_val, y_val = validation_set
            dvalid = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)
            watchlist.append((dvalid, "eval"))
            optional_kwargs["early_stopping_rounds"] = self._early_stopping_rounds

        self.model = xgb.train(
            params=booster_params,
            dtrain=dtrain,
            evals=watchlist,
            num_boost_round=self._num_boost_round,
            verbose_eval=self._eval_verbosity,
            **optional_kwargs,
        )
        self._fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Wrap ``X`` in a ``DMatrix`` and return the booster's predictions."""
        return self.model.predict(xgb.DMatrix(X, enable_categorical=True))


class CatBoostModel(BaseKtoolsModel):
    """CatBoost model wrapper built on ``cat.train`` and ``Pool``."""

    class DefaultObjective(Enum):
        """Loss function used per inferred task when none is supplied."""
        regression = "RMSE"
        binary_classification = "Logloss"
        multiclass_classification = "MultiClass"

    def __init__(
        self,
        num_boost_round: int = 100,
        early_stopping_rounds: Optional[int] = 20,
        random_state: int = 129,
        verbose: bool = False,
        allow_writing_files: bool = False,
        **catboost_params,
    ) -> None:
        """Store training settings and assemble the CatBoost parameters.

        ``catboost_params`` entries override the ``random_seed``, ``verbose``
        and ``allow_writing_files`` defaults on key collisions.
        """
        super().__init__()
        self.model: Union[cat.CatBoost, None] = None
        # False until fit() replaces it with the inferred task identifier.
        self._task: Union[str, bool] = False
        self._early_stopping_rounds = early_stopping_rounds
        self._allow_writing_files = allow_writing_files
        self._verbose = verbose
        self._num_boost_round = num_boost_round

        base_params = {
            "random_seed": random_state,
            "verbose": verbose,
            "allow_writing_files": allow_writing_files,
        }
        self._catboost_params = {**base_params, **catboost_params}

    def fit(
        self,
        X,
        y,
        validation_set: Optional[Tuple] = None,
        weights: Optional[np.ndarray] = None,
    ) -> "CatBoostModel":
        """Train the model.

        The task is always inferred from ``y`` and remembered for ``predict``.
        Columns of a DataFrame ``X`` with ``category`` dtype are declared as
        categorical features. ``early_stopping_rounds`` is forwarded only when
        a validation set is given.
        """
        task = infer_task(y)
        self._task = task
        if "loss_function" not in self._catboost_params:
            self._catboost_params["loss_function"] = self.DefaultObjective[task].value

        if isinstance(X, pd.DataFrame):
            self.cat_col_names = [
                column for column in X.columns if X[column].dtype == "category"
            ]
        else:
            self.cat_col_names = []

        train_pool = Pool(
            data=X, label=y, cat_features=self.cat_col_names, weight=weights
        )
        optional_kwargs: Dict[Any, Any] = {"eval_set": None}
        if validation_set is not None:
            X_val, y_val = validation_set
            optional_kwargs["eval_set"] = Pool(
                data=X_val, label=y_val, cat_features=self.cat_col_names
            )
            optional_kwargs["early_stopping_rounds"] = self._early_stopping_rounds

        self.model = cat.train(
            params=self._catboost_params,
            dtrain=train_pool,
            num_boost_round=self._num_boost_round,
            **optional_kwargs,
        )
        self._fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Return predictions for ``X``.

        Binary tasks return the second probability column, multiclass tasks
        the full probability matrix, and anything else the model's default
        prediction.

        Raises:
            ValueError: If called before ``fit``.
        """
        if self.model is None:
            raise ValueError("Model is not fitted yet. Please call 'fit' first.")
        pool = Pool(data=X, cat_features=self.cat_col_names)
        if self._task == "binary_classification":
            return self.model.predict(pool, prediction_type="Probability")[:, 1]
        if self._task == "multiclass_classification":
            return self.model.predict(pool, prediction_type="Probability")
        return self.model.predict(pool)


class CatBoostModel(BaseKtoolsModel):
    """CatBoost model wrapper"""
    class DefaultObjective(Enum):
        regression = "RMSE"
        binary_classification = "Logloss"
        multiclass_classification = "MultiClass"

    def __init__(
        self,
        num_boost_round: int = 100,
        early_stopping_rounds: Optional[int] = 20,
        random_state: int = 129,
        verbose: bool = False,
        allow_writing_files: bool = False,
        **catboost_params,
    ) -> None:
        super().__init__()
        self.model: Union[cat.CatBoost, None] = None
        self._task: bool = False
        self._num_boost_round = num_boost_round
        self._verbose = verbose
        self._allow_writing_files = allow_writing_files
        self._early_stopping_rounds = early_stopping_rounds

        self._catboost_params = {
            "random_seed": random_state,
            "verbose": verbose,
            "allow_writing_files": allow_writing_files,
            **catboost_params,
        }

    def fit(
        self,
        X,
        y,
        validation_set: Optional[Tuple] = None,
        weights: Optional[np.ndarray] = None,
    ) -> "CatBoostModel":
        task_id = infer_task(y)
        self._task = task_id
        if "loss_function" not in self._catboost_params:
            self._catboost_params["loss_function"] = self.DefaultObjective[task_id].value

        self.cat_col_names = (
            [col for col in X.columns if X[col].dtype == "category"]
            if isinstance(X, pd.DataFrame)
            else []
        )
        train_params: Dict[Any, Any] = {"eval_set": None}
        train_pool = Pool(
            data=X, label=y, cat_features=self.cat_col_names, weight=weights
        )
        if validation_set is not None:
            X_val, y_val = validation_set
            train_params["eval_set"] = Pool(
                data=X_val, label=y_val, cat_features=self.cat_col_names
            )
            train_params["early_stopping_rounds"] = self._early_stopping_rounds

        train_params = {
            "params": self._catboost_params,
            "dtrain": train_pool,
            "num_boost_round": self._num_boost_round,
            **train_params,
        }
        self.model = cat.train(**train_params)
        self._fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        if self.model is None:
            raise ValueError("Model is not fitted yet. Please call 'fit' first.")
        test_pool = Pool(data=X, cat_features=self.cat_col_names)
        if self._task == "binary_classification":
            y_pred = self.model.predict(test_pool, prediction_type="Probability")[:, 1]
        elif self._task == "multiclass_classification":
            y_pred = self.model.predict(test_pool, prediction_type="Probability")
        else:
            y_pred = self.model.predict(test_pool)
        return y_pred

## Model Pipeline

In [None]:
class ModelPipeline:
    """Pipeline for model training with preprocessing.

    Args:
        model: Model wrapper to train and predict with.
        config: Dataset description; ``training_col_names`` selects the
            features and ``target_col_name`` the label.
        preprocessor: Preprocessing pipeline applied before the model. When
            omitted, each instance gets its own empty pipeline.
    """
    def __init__(
        self,
        model: BaseKtoolsModel,
        config: DatasetConfig,
        preprocessor: Optional[PreprocessingPipeline] = None,
    ) -> None:
        self.model = model
        self.config = config
        # Build the default per instance; a default constructed in the
        # signature would be one object shared by every pipeline.
        self.preprocessor = (
            preprocessor if preprocessor is not None else PreprocessingPipeline([])
        )

    def _split_features_target(self, data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
        """Return the configured feature columns and the target column."""
        features = data[self.config.training_col_names]
        target = data[self.config.target_col_name]
        return features, target

    def fit(
        self,
        train_data: pd.DataFrame,
        validation_data: Optional[pd.DataFrame] = None,
        weights: Optional[Union[pd.Series, np.ndarray]] = None,
    ) -> "ModelPipeline":
        """Fit the preprocessors on ``train_data`` and train the model.

        Features are selected with ``config.training_col_names``, the same
        selection ``predict`` uses, so the model sees identical columns in
        identical order at training and inference time.

        Args:
            train_data: Training frame including the target column.
            validation_data: Optional validation frame including the target
                column; transformed with the already-fitted preprocessors.
            weights: Optional per-sample weights for the training rows.

        Returns:
            The fitted pipeline.
        """
        train_data = self.preprocessor.train_pipe(train_data)
        X_train, y_train = self._split_features_target(train_data)

        validation_set: Optional[Tuple[pd.DataFrame, pd.Series]] = None
        if validation_data is not None:
            validation_data = self.preprocessor.inference_pipe(validation_data)
            validation_set = self._split_features_target(validation_data)

        self.model.fit(
            X=X_train, y=y_train, validation_set=validation_set, weights=weights
        )
        return self

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Preprocess ``data`` and return the model's predictions."""
        data = self.preprocessor.inference_pipe(data)
        X_test = data[self.config.training_col_names]
        return self.model.predict(X_test)

## Optuna Hyperparameter Optimizer

In [None]:
class OptunaHyperparameterOptimizer:
    """
    Hyperparameter search driven by an Optuna study with a TPE sampler.

    Args:
        model_type: Key of the model's grid inside the YAML file.
        grid_yaml_path: Path to YAML file containing parameter search space.
        extra_samplers: Additional tunable parameters as Optuna callables.
        timeout: Maximum optimization time in seconds.
        direction: Optimization direction ("maximize" or "minimize").
        n_trials: Number of trials to run.
        study_name: Name for the Optuna study.
        explore_fraction: Fraction of ``n_trials`` used as the sampler's
            startup trials.
        save_study: Whether to persist the study to a SQLite file named
            after the study.
        load_if_exists: Forwarded to ``optuna.create_study``.
        catch_exceptions: Exception types forwarded to ``study.optimize`` as
            ``catch``.
        verbose: Whether to print a start banner.
        random_state: Seed for the sampler.
    """

    def __init__(
        self,
        model_type: str,
        grid_yaml_path: str,
        extra_samplers: Dict[str, TrialSampler] | None = None,
        timeout: int = 3600,
        direction: str = "maximize",
        n_trials: int = 100,
        study_name: str = "ml_experiment",
        explore_fraction: float = 0.1,
        save_study: bool = False,
        load_if_exists: bool = True,
        catch_exceptions: Tuple[Type[Exception], ...] = (),
        verbose: bool = False,
        random_state: int = 42,
    ) -> None:
        # The grid file is read once here; the builder is reused per trial.
        self._param_space_builder = load_optuna_grid(
            grid_yaml_path,
            model_type,
            extra_samplers=extra_samplers,
        )
        self.study: optuna.Study | None = None
        self._random_state = random_state
        self._verbose = verbose
        self._catch_exceptions = catch_exceptions
        self._load_if_exists = load_if_exists
        self._save_study = save_study
        self._explore_fraction = explore_fraction
        self._study_name = study_name
        self._n_trials = n_trials
        self._direction = direction
        self._timeout = timeout

    def optimize(
        self,
        *args: Any,
        tunable_func: Callable[..., float],
    ) -> dict[str, Any]:
        """
        Create the study and run the search.

        Args:
            *args: Positional arguments passed to tunable_func.
            tunable_func: Function called as ``tunable_func(*args,
                **hyperparameters)`` that returns a score.

        Returns:
            The study's ``best_params`` dictionary.
        """
        if self._verbose:
            banner = "#" * 100
            print(banner)
            print("Starting Optuna Optimizer")
            print(banner)

        startup_trials = int(self._n_trials * self._explore_fraction)
        tpe_sampler = TPESampler(
            n_startup_trials=startup_trials,
            seed=self._random_state,
        )

        if self._save_study:
            storage_url = f"sqlite:///{self._study_name}.db"
        else:
            storage_url = None

        self.study = optuna.create_study(
            sampler=tpe_sampler,
            study_name=self._study_name,
            direction=self._direction,
            storage=storage_url,
            load_if_exists=self._load_if_exists,
        )

        def objective(trial: optuna.Trial) -> float:
            sampled = self._param_space_builder(trial)
            return tunable_func(*args, **sampled)

        self.study.optimize(
            objective,
            n_trials=self._n_trials,
            timeout=self._timeout,
            catch=self._catch_exceptions,
        )

        return self.study.best_params

## Load Data

Load your training and test data. This example uses the diabetes prediction dataset with the cv-experimentation pattern.

In [None]:
# Update these paths to match your dataset location
DATA_PATH = Path("./data/diabetes_prediction/")
TARGET = "diagnosed_diabetes"

# ID split for separating data sources: row position in train.csv where the
# source label switches (rows before it are labelled 1, the rest 0, in the
# next cell).
split_id = 678000

# Load data
# original.csv is read with its default index; train/test use their first
# column as the index.
orig_data = pd.read_csv(DATA_PATH / "original.csv")
train_data = pd.read_csv(DATA_PATH / "train.csv", index_col=0)
# The test set gets a constant source label of 0, stored as a categorical column.
test_data = pd.read_csv(DATA_PATH / "test.csv", index_col=0).assign(data=0)
test_data["data"] = test_data["data"].astype("category")

print(f"Original data shape: {orig_data.shape}")
print(f"Train shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")

## Prepare Data with Source Labels

Add data source labels for stratified CV and validation filtering.

In [None]:
# Ensure column alignment: every training column must also exist in the
# original data. An explicit check names the offending columns and still runs
# under `python -O`, unlike an assert.
missing_in_orig = train_data.columns.difference(orig_data.columns)
if len(missing_in_orig) > 0:
    raise ValueError(
        f"Columns missing from original data: {missing_in_orig.to_list()}"
    )

# Add data source labels
# Drop columns the training data does not have, then tag the original data as source 2.
orig_data = orig_data.drop(columns=orig_data.columns.difference(train_data.columns).to_list())
orig_data = orig_data.assign(data=2)
# Training rows before position `split_id` are source 1, the rest source 0.
# The label is built by row position and assigned by name, so it does not
# depend on "data" being the last column of the frame.
is_first_source = np.arange(len(train_data)) < split_id
train_data = train_data.assign(data=np.where(is_first_source, 1.0, 0.0))

# Create stratification categories (target + data source)
categories_of_interest = train_data[TARGET].astype(str) + "_" + train_data["data"].astype(str)

print(f"Categories distribution:\n{categories_of_interest.value_counts()}")

## Configure Dataset

In [None]:
# Identify column types
# Every non-target column is a training feature, including the "data" source label.
training_col_names = train_data.drop(columns=TARGET).columns.tolist()
# Numeric feature columns; the numeric "data" label column is included here.
numerical_col_names = (
    train_data.drop(columns=TARGET)
    .select_dtypes(include=["number"])
    .columns.tolist()
)
# Only object-dtype columns are treated as categorical.
categorical_col_names = train_data.select_dtypes(
    include=["object"]
).columns.tolist()

# Create dataset configuration
config = DatasetConfig(
    training_col_names=training_col_names,
    numerical_col_names=numerical_col_names,
    categorical_col_names=categorical_col_names,
    target_col_name=TARGET,
)

print(f"Numerical features ({len(numerical_col_names)}): {numerical_col_names}")
print(f"\nCategorical features ({len(categorical_col_names)}): {categorical_col_names}")

## Define the Tunable Cross-Validation Function

In [None]:
def cv_tunable_func(
    train_data: pd.DataFrame,
    test_data: pd.DataFrame,
    categories_of_interest: pd.Series,
    config: DatasetConfig,
    preprocessors: List[BasePreprocessor],
    model_class: Type[BaseKtoolsModel],
    sample_weight_data0: float = 1.5,  # Tunable weight for data == 0 samples
    **model_params,
) -> float:
    """
    Cross-validation function to be tuned by Optuna.

    This function follows the cv-experimentation pattern:
    - Stratified K-Fold on (target + data source)
    - Validation fold filtered to data == 0.0 only
    - Sample weights based on data source (tunable)

    Args:
        train_data: Training DataFrame with 'data' column for source labels.
        test_data: Test DataFrame. Not read inside this function; kept so the
            positional call signature stays unchanged.
        categories_of_interest: Series for stratification (target + data source).
        config: DatasetConfig with column information.
        preprocessors: List of preprocessors to apply; each fold works on
            deep copies so the originals are never fitted.
        model_class: Model class to instantiate (e.g., LGBMModel).
        sample_weight_data0: Weight for training samples where data == 0;
            all other training samples get weight 1.0.
        **model_params: Hyperparameters to pass to the model.

    Returns:
        Mean ROC AUC score across folds.
    """
    # Single source of truth for the fold count; the mean below is taken over
    # the collected scores rather than a second hard-coded divisor.
    n_splits = 5
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    fold_scores: List[float] = []

    for train_index, val_index in kfold.split(train_data, categories_of_interest):
        train_fold: pd.DataFrame = train_data.iloc[train_index].copy()
        val_fold: pd.DataFrame = train_data.iloc[val_index]

        # Filter validation to data == 0.0 only (matching cv-experimentation pattern)
        val_fold = val_fold[val_fold["data"] == 0.0]

        # Convert data column to category
        train_fold["data"] = train_fold["data"].astype("category")
        # Give the validation column the same categorical dtype so both
        # frames reach the preprocessors and the model with matching types.
        val_fold = val_fold.assign(
            data=val_fold["data"].astype(train_fold["data"].dtype)
        )

        # Sample weights: tunable weight for data == 0
        weights = np.where(train_fold["data"] == 0, sample_weight_data0, 1.0)

        # Create fresh preprocessor pipeline for each fold
        preprocessor_pipeline = PreprocessingPipeline(
            preprocessors=[deepcopy(p) for p in preprocessors]
        )

        # Create model pipeline
        pipe = ModelPipeline(
            model=model_class(**model_params),
            config=config,
            preprocessor=preprocessor_pipeline,
        )

        # Fit and predict
        pipe.fit(train_fold, validation_data=val_fold, weights=weights)
        y_pred = pipe.predict(val_fold)

        # Calculate score
        fold_scores.append(roc_auc_score(val_fold[config.target_col_name], y_pred))

    return float(np.mean(fold_scores))

## Run Hyperparameter Optimization

Create the optimizer and run optimization with the tunable CV function.

In [None]:
# Specify the path to your parameter grid YAML file
GRID_YAML_PATH = "ktools/hyperopt/grids/lgbm.yml"  # Update with your grid path
MODEL_TYPE = "base"  # The key in the YAML file
MODEL_CLASS = LGBMModel  # Change to XGBoostModel or CatBoostModel as needed

# Extra samplers for tuning non-model parameters (e.g., sample weights)
# Range for sample_weight_data0 as sampled below (uniform, not log scale):
#   - 1.0: Equal weight (no preference for data==0 samples)
#   - 5.0: Strong up-weight (5x importance for data==0 samples)
# Values below 1.0 (down-weighting data==0 samples) are not explored with
# this range; lower the bound if that should be part of the search.
extra_samplers = {
    "sample_weight_data0": lambda t: t.suggest_float("sample_weight_data0", 1.0, 5.0),
}

# Create optimizer with updated signature
optimizer = OptunaHyperparameterOptimizer(
    model_type=MODEL_TYPE,
    grid_yaml_path=GRID_YAML_PATH,
    extra_samplers=extra_samplers,
    timeout=42000,  # ~11.7 hours, adjust as needed
    direction="maximize",
    n_trials=300,  # Increase for better optimization
    study_name="kaggle_cv_optimizer",
    explore_fraction=0.1,
    save_study=False,
    load_if_exists=True,
    catch_exceptions=(),
    verbose=True,
    random_state=42,
)

print("Optimizer configured:")
print(f"  Model type: {MODEL_TYPE}")
print(f"  Grid path: {GRID_YAML_PATH}")
print(f"  Max trials: {optimizer._n_trials}")
print(f"  Timeout: {optimizer._timeout}s")
print(f"  Extra samplers: {list(extra_samplers.keys())}")

In [None]:
# Run optimization - note: config is now passed explicitly to cv_tunable_func
# The positional arguments below are forwarded, in this order, to
# cv_tunable_func; the sampled hyperparameters are added as keyword arguments.
best_params = optimizer.optimize(
    train_data,
    test_data,
    categories_of_interest,
    config,  # Pass config explicitly
    [CategoricalEncoder(config), StandardScale(config)],
    MODEL_CLASS,
    tunable_func=cv_tunable_func,
)

print("\n" + "=" * 100)
print("OPTIMIZATION COMPLETE!")
print("=" * 100)
print(f"\nBest Score: {optimizer.study.best_value:.6f}")
print(f"\nBest Parameters:")
for key, value in best_params.items():
    print(f"  {key}: {value}")

## Summary

This notebook provides a complete workflow for hyperparameter optimization using the cv-experimentation pattern:

**Key Features:**
1. **Stratified K-Fold CV** on combined (target + data source) categories
2. **Validation filtering** to only include `data == 0.0` samples
3. **Sample weighting** with higher weight for `data == 0` samples
4. **Tunable function pattern** for flexible optimization

**How It Works:**
1. `cv_tunable_func` encapsulates the entire CV loop
2. `OptunaHyperparameterOptimizer.optimize()` receives this function via `tunable_func=`
3. Optuna suggests hyperparameters which are passed as `**model_params`
4. The function returns the mean CV score for Optuna to optimize

**Supported Models:**
- `LGBMModel` (LightGBM)
- `XGBoostModel` (XGBoost)  
- `CatBoostModel` (CatBoost)

**Parameter Grid YAML Format:**
```yaml
base:  # or your model_type key
  param_name:
    type: int  # or float, categorical, fixed
    low: 1
    high: 10
    # For float, add: log: true (optional)
    # For categorical, use: choices: [val1, val2, ...]
    # For fixed, use: value: some_value
```

**Extra Samplers (Optional):**
For parameters not in the YAML grid, use `extra_samplers`:
```python
extra_samplers = {
    "weight_power": lambda t: t.suggest_float("weight_power", 0.5, 2.0),
}
load_optuna_grid(path, model_type, extra_samplers=extra_samplers)
```