# Pipeline Unit Tests
This notebook uses the provided synthetic data to automatically validate the main pipeline utilities.

In [None]:
import pandas as pd
from sklearn.model_selection import KFold
import numpy as np
import config
from model_definitions import (
    get_base_regressors, get_base_classifiers,
    get_meta_regressor_candidates, get_meta_classifier_candidates,
    optimize_lgbm_reg, optimize_lgbm_cls,
    select_best_stack)
from utils import run_optuna_study
from outer_cv import run_outer_cv_loop
from pipeline_steps import aggregate_cv_results, select_final_model, evaluate_on_test_set

# Shrink the cross-validation and tuning budget so the notebook finishes
# quickly; these overrides are applied to the shared `config` module.
test_overrides = {
    "N_SPLITS_OUTER_CV": 2,
    "OPTUNA_TRIALS_MAIN": 2,
    "OPTUNA_TRIALS_OTHER": 1,
    "TUNE_ALL_BASE_MODELS": False,
}
for option, value in test_overrides.items():
    setattr(config, option, value)

# Read the synthetic dataset. Every column except the last two is a feature;
# the second-to-last column is the regression target and the last column is
# the classification target.
df = pd.read_csv(config.DATA_FILE)
X, y_reg, y_cls = df.iloc[:, :-2], df.iloc[:, -2], df.iloc[:, -1]
print(f"Data shape: {df.shape}")


In [None]:
# Build the outer-CV splitter from the (test-reduced) config and materialise
# its (train_indices, test_indices) pairs so they can be inspected.
kf = KFold(n_splits=config.N_SPLITS_OUTER_CV, shuffle=True, random_state=config.RANDOM_STATE)
splits = list(kf.split(X))
print(f'Number of folds: {len(splits)}')
assert len(splits) == config.N_SPLITS_OUTER_CV

n_samples = len(X)

# Every training index must be a valid row position.
combined = np.concatenate([fold[0] for fold in splits])
assert set(combined) <= set(range(n_samples))

# Within each fold the train and test rows must be disjoint and, together,
# account for every row of X.
for fold_no, (train_idx, test_idx) in enumerate(splits):
    assert set(train_idx).isdisjoint(test_idx), (
        f'Fold {fold_no}: train and test indices overlap'
    )
    assert len(train_idx) + len(test_idx) == n_samples, (
        f'Fold {fold_no}: train + test sizes do not add up to {n_samples}'
    )

# Across folds, each row must be held out exactly once.
held_out = np.concatenate([fold[1] for fold in splits])
assert sorted(held_out.tolist()) == list(range(n_samples)), (
    'Held-out folds do not cover every row exactly once'
)
print('Cross validation splitter works as expected.')


In [None]:
# Smoke-test the Optuna wrapper on small slices of the data: rows 0-99 and
# rows 100-149 are passed as two separate (X, y) pairs.
# NOTE(review): presumably these are the train and validation sets expected
# by run_optuna_study -- confirm against its signature.
first_rows = slice(None, 100)
next_rows = slice(100, 150)
study = run_optuna_study(
    optimize_lgbm_reg,
    X.iloc[first_rows], y_reg.iloc[first_rows],
    X.iloc[next_rows], y_reg.iloc[next_rows],
    n_trials=2,
    direction='minimize',
    study_name='test_lgbm_reg',
)

# A usable study must expose a best trial.
best_trial = study.best_trial
assert best_trial is not None
print("Optuna returned best params:", study.best_params)


In [None]:
# Fit every base regressor on rows 0-149 and predict rows 150-159. Each model
# must return one prediction per requested row; the assertion message names
# the offending model so a failure is immediately attributable.
base_regs = get_base_regressors()
for name, model in base_regs.items():
    model.fit(X.iloc[:150], y_reg.iloc[:150])
    preds = model.predict(X.iloc[150:160])
    assert len(preds) == 10, (
        f"Regressor {name!r} returned {len(preds)} predictions, expected 10"
    )
print("Regression base models train and predict successfully.")

# Same check for the base classifiers, using the classification target.
base_clfs = get_base_classifiers()
for name, model in base_clfs.items():
    model.fit(X.iloc[:150], y_cls.iloc[:150])
    preds = model.predict(X.iloc[150:160])
    assert len(preds) == 10, (
        f"Classifier {name!r} returned {len(preds)} predictions, expected 10"
    )
print("Classification base models train and predict successfully.")


In [None]:
# Model pools, meta-learner candidates and tuning objectives handed to the
# outer CV loop. Only the 'LGBM' entry has an optimisation function here.
MODEL_REGRESSORS = get_base_regressors()
MODEL_CLASSIFIERS = get_base_classifiers()
META_REG_CANDS = get_meta_regressor_candidates()
META_CLS_CANDS = get_meta_classifier_candidates()
OPT_FUNCS_REG = {'LGBM': optimize_lgbm_reg}
OPT_FUNCS_CLS = {'LGBM': optimize_lgbm_cls}

# Run the outer CV loop with the splitter `kf` built in an earlier cell. The
# compute-device callback always reports CPU settings, and tuning of all base
# models is switched off. The call returns nine values, unpacked below.
(
    outer_results_reg,
    outer_results_cls,
    feats_list,
    best_params_reg,
    best_params_cls,
    fold_scalers,
    fold_selectors,
    models_reg,
    models_cls,
) = run_outer_cv_loop(
    X,
    y_reg,
    y_cls,
    kf,
    X.columns.tolist(),
    MODEL_REGRESSORS=MODEL_REGRESSORS,
    MODEL_CLASSIFIERS=MODEL_CLASSIFIERS,
    STACKING_META_REGRESSOR_CANDIDATES=META_REG_CANDS,
    STACKING_META_CLASSIFIER_CANDIDATES=META_CLS_CANDS,
    OPTIMIZATION_FUNCTIONS_REG=OPT_FUNCS_REG,
    OPTIMIZATION_FUNCTIONS_CLS=OPT_FUNCS_CLS,
    run_optuna_study=run_optuna_study,
    select_best_stack=select_best_stack,
    get_compute_device_params=lambda: {'xgb_tree_method': 'hist', 'lgbm_device': 'cpu'},
    TUNE_ALL_BASE_MODELS=False,
)
print('Outer CV completed')

# The regression results must hold one 'R2' entry per outer fold.
r2_count = len(outer_results_reg['R2'])
assert r2_count == config.N_SPLITS_OUTER_CV


In [None]:
# Aggregate the per-fold results; the return value is kept in `agg` but not
# inspected further in this cell.
agg = aggregate_cv_results(outer_results_reg, outer_results_cls)

# Select the final models from the outer CV artefacts. The call returns seven
# values, unpacked below.
selection = select_final_model(
    outer_results_reg,
    outer_results_cls,
    models_reg,
    models_cls,
    fold_scalers,
    fold_selectors,
    feats_list,
    config.N_SPLITS_OUTER_CV,
)
(
    final_reg,
    final_cls,
    final_scaler,
    final_selectors,
    sel_feats_final,
    best_fold_reg,
    best_fold_cls,
) = selection

# Both a final regressor and a final classifier must have been selected.
assert not (final_reg is None or final_cls is None)
print('Final models selected')


In [None]:
# Exercise the evaluation step, using the first 40 rows of the same dataset
# as a stand-in test set together with the final models, scaler, selectors
# and selected features chosen in the previous cell.
eval_rows = slice(None, 40)
evaluation = evaluate_on_test_set(
    X.iloc[eval_rows],
    y_reg.iloc[eval_rows],
    y_cls.iloc[eval_rows],
    final_reg,
    final_cls,
    final_scaler,
    final_selectors,
    sel_feats_final,
    X.columns.tolist(),
)
(
    X_test_scaled_df,
    X_test_sel_reg_df,
    X_test_sel_cls_df,
    y_pred_reg_test,
    y_pred_cls_test,
) = evaluation

# Predictions must come back for both the regression and classification tasks.
assert not (y_pred_reg_test is None or y_pred_cls_test is None)
print('Evaluation step succeeded')
