In [None]:
# =============================================================================
# FOLD PARAMETER - SET THIS BEFORE RUNNING
# =============================================================================
# Index of the single fold this run trains and evaluates. It is used to index
# the list of StratifiedKFold splits, so it must lie in 0..N_FOLDS-1.
CURRENT_FOLD = 0  # Set to 0, 1, 2, 3, or 4 for 5-fold CV
N_FOLDS = 5  # number of stratified CV splits
RANDOM_STATE = 42  # seed passed to both the fold splitter and the FLAML search

# FLAML specific parameters
FLAML_TIME_BUDGET = 60  # seconds per model search
# Learner names FLAML is allowed to search over.
FLAML_ESTIMATOR_LIST = ["lgbm", "xgboost", "xgb_limitdepth", "catboost", "rf", "extra_tree"]

In [None]:
# =============================================================================
# IMPORTS
# =============================================================================
from pathlib import Path
from typing import List, Optional, Tuple, Union
from dataclasses import dataclass
from abc import ABC, abstractmethod
from enum import Enum
import warnings

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from flaml import AutoML

warnings.filterwarnings('ignore')

In [None]:
# =============================================================================
# KTOOLS DEPENDENCIES (INLINED FOR KAGGLE)
# =============================================================================

# Array-like inputs accepted by the model wrappers.
T = Union[np.ndarray, pd.DataFrame]


def pd_to_np(x):
    """Return ``x.to_numpy()`` for a DataFrame; hand anything else back untouched."""
    if isinstance(x, pd.DataFrame):
        return x.to_numpy()
    return x


def infer_task(y: np.ndarray | pd.Series) -> str:
    """
    Will infer binary, multiclass classification or regression based on the target values.
    """
    if isinstance(y, pd.Series):
        y = y.to_numpy()
    y = y.flatten()

    nuniques = np.unique(y).shape[0]
    has_floats = np.any(y % 1 != 0)

    if has_floats:
        print("Target contains float values. Inferring regression task.")
        return "regression"
    elif nuniques == 2:
        print("Target contains two unique values. Inferring binary classification task.")
        return "binary_classification"
    elif nuniques > 2:
        print("Target contains more than two unique values. Inferring multiclass classification task.")
        return "multiclass_classification"

    raise ValueError(
        "Unable to infer task type from target values. Is there only one target value?"
    )


@dataclass
class DatasetConfig:
    """Column-role description of a tabular dataset, shared by the preprocessors and the model pipeline."""

    # Feature columns selected from the preprocessed frame before predicting.
    training_col_names: List[str]
    # Name of the label column; split off from the features when fitting.
    target_col_name: str
    # Columns handed to the standard scaler.
    numerical_col_names: List[str]
    # Columns handed to the ordinal (categorical) encoder.
    categorical_col_names: List[str]
    # Optional free-form label for the dataset; not read by the code in this file.
    name: Optional[str] = None


class BasePreprocessor(ABC):
    """Abstract fit/transform step configured by a ``DatasetConfig``."""

    name = "base-preprocessor"

    def __init__(self, config: DatasetConfig):
        self.config = config
        self._fitted = False

    @property
    def fitted(self) -> bool:
        """Whether ``fit`` has completed on this instance."""
        return self._fitted

    @abstractmethod
    def fit(self, data: pd.DataFrame) -> "BasePreprocessor":
        """Learn whatever state the step needs from ``data`` and return ``self``."""

    @abstractmethod
    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return the transformed version of ``data``."""

    def fit_transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Fit on ``data`` and immediately transform that same frame."""
        fitted_step = self.fit(data)
        return fitted_step.transform(data)


class CategoricalEncoder(BasePreprocessor):
    """Ordinal-encodes the configured categorical columns and returns them as ``category`` dtype."""

    name = "categorical-encoder"

    def __init__(
        self,
        config: DatasetConfig,
        handle_unknown: str = "use_encoded_value",
        unknown_value: int = -2,
        encoded_missing_value: int = -1,
        **encoder_kwargs,
    ) -> None:
        super().__init__(config)
        self.encode_missing_value = encoded_missing_value
        encoder_settings = {
            "handle_unknown": handle_unknown,
            "unknown_value": unknown_value,
            "encoded_missing_value": encoded_missing_value,
            **encoder_kwargs,
        }
        self.encoder = OrdinalEncoder(**encoder_settings)

    def fit(self, data: pd.DataFrame) -> "CategoricalEncoder":
        """Fit the ordinal encoder on the categorical columns of ``data``."""
        cat_cols = self.config.categorical_col_names
        self.encoder.fit(data[cat_cols])
        self._fitted = True
        return self

    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``data`` with the categorical columns replaced by integer codes."""
        cat_cols = self.config.categorical_col_names
        encoded = data.copy()
        # Remember where the raw values were missing before they get encoded.
        was_missing = encoded[cat_cols].isna()
        codes = self.encoder.transform(encoded[cat_cols]).astype(int)
        encoded[cat_cols] = codes
        # Re-apply the missing-value code on those cells, then cast to category.
        encoded[cat_cols] = (
            encoded[cat_cols]
            .mask(was_missing, self.encode_missing_value)
            .astype("category")
        )
        return encoded


class StandardScale(BasePreprocessor):
    """Applies a scikit-learn ``StandardScaler`` to the configured numerical columns."""

    name = "standard-scaler"

    def __init__(self, config: DatasetConfig) -> None:
        super().__init__(config)
        self.scaler = StandardScaler()

    def fit(self, data: pd.DataFrame) -> "StandardScale":
        """Fit the scaler on the numerical columns of ``data``."""
        num_cols = self.config.numerical_col_names
        self.scaler.fit(data[num_cols])
        self._fitted = True
        return self

    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``data`` whose numerical columns have been scaled."""
        num_cols = self.config.numerical_col_names
        scaled = data.copy()
        scaled[num_cols] = self.scaler.transform(scaled[num_cols])
        return scaled


class PreprocessingPipeline:
    """Runs an ordered list of preprocessors, feeding each step's output into the next."""

    def __init__(self, preprocessors: List[BasePreprocessor]) -> None:
        self.preprocessors = preprocessors

    def train_pipe(self, data: pd.DataFrame) -> pd.DataFrame:
        """Fit every step on the running frame and return the final result."""
        current = data
        for step in self.preprocessors:
            current = step.fit_transform(current)
        return current

    def inference_pipe(self, data: pd.DataFrame) -> pd.DataFrame:
        """Apply every step's ``transform`` in order, without refitting."""
        current = data
        for step in self.preprocessors:
            current = step.transform(current)
        return current

In [None]:
# =============================================================================
# FLAML MODEL (INLINED)
# =============================================================================

# Metric name used for each inferred task id when the caller does not pick one.
task_to_default_metric = dict(
    regression="mse",
    binary_classification="roc_auc",
    multiclass_classification="accuracy",
)


class DefaultObjective(Enum):
    """Maps a task id (member name) to the task string handed to FLAML (member value)."""

    regression = "regression"
    # Both classification flavours share one value, so Enum treats
    # ``multiclass_classification`` as an alias of ``binary_classification``.
    # Lookup by name (``DefaultObjective[task_id].value``) works for both, but
    # iterating the enum yields only two members.
    binary_classification = "classification"
    multiclass_classification = "classification"


class FLAMLModel:
    """
    Thin wrapper around ``flaml.AutoML`` with a ``fit`` / ``predict`` interface.

    ``task`` is expected to be one of the ids returned by ``infer_task``
    ("regression", "binary_classification", "multiclass_classification").
    When left as ``None`` it is inferred from the target on the first ``fit``
    call. Any other string is forwarded to FLAML unchanged, but ``predict``
    only knows how to handle the three ids above.
    """

    def __init__(
        self,
        time_budget: float = 60,
        metric: Optional[str] = None,
        task: Optional[str] = None,
        n_jobs: int = -1,
        estimator_list: Optional[List[str]] = None,
        verbose: int = 0,
        seed: int = 42,
        **extra_params,
    ) -> None:
        """
        Args:
            time_budget: Search budget in seconds, also passed to ``AutoML.fit``.
            metric: Metric name for FLAML; a per-task default is used when ``None``.
            task: Task id (see class docstring) or ``None`` to infer it in ``fit``.
            n_jobs: Forwarded to FLAML.
            estimator_list: Learners FLAML may search over; ``None`` selects
                lgbm, xgboost, xgb_limitdepth, catboost, rf and extra_tree.
            verbose: Forwarded to FLAML.
            seed: Forwarded to FLAML.
            **extra_params: Any further ``AutoML`` settings.
        """
        # Build the default per instance instead of sharing one list object
        # between every FLAMLModel created without an explicit estimator_list.
        if estimator_list is None:
            estimator_list = [
                "lgbm",
                "xgboost",
                "xgb_limitdepth",
                "catboost",
                "rf",
                "extra_tree",
            ]
        self._random_state = seed
        self._task = task
        self._time_budget = time_budget
        self._fitted = False
        self.model = None
        self._model_parameters = {
            "time_budget": time_budget,
            "metric": metric,
            "task": task,
            "n_jobs": n_jobs,
            "estimator_list": estimator_list,
            "verbose": verbose,
            "seed": seed,
            **extra_params,
        }

    def fit(
        self,
        X: T,
        y: T,
        validation_set: Optional[Tuple[T, T]] = None,
        weights: Optional[T] = None,
    ) -> "FLAMLModel":
        """
        Run the FLAML search on ``X`` / ``y``.

        Args:
            X: Training features.
            y: Training target.
            validation_set: Optional ``(X_val, y_val)`` pair forwarded to FLAML.
            weights: Accepted for interface compatibility but ignored (a
                warning is printed).

        Returns:
            ``self``.
        """
        if self._task is None:
            self._task = infer_task(y)

        if self._task in DefaultObjective.__members__:
            # Translate the task id into FLAML's own task name, both when it was
            # inferred and when the caller passed it explicitly.
            self._model_parameters["task"] = DefaultObjective[self._task].value
            # Only fall back to the default metric when the caller did not choose one.
            if self._model_parameters.get("metric") is None:
                self._model_parameters["metric"] = task_to_default_metric[self._task]

        if weights is not None:
            # NOTE(review): FLAML's docs mention passing sample_weight through
            # fit kwargs - confirm before relying on this message.
            print("Warning: FLAML does not currently support sample weights. Ignoring weights.")

        self.model = AutoML(**self._model_parameters)

        fitting_kwargs = {}

        if validation_set is not None:
            X_val, y_val = validation_set
            fitting_kwargs["X_val"] = pd_to_np(X_val)
            fitting_kwargs["y_val"] = pd_to_np(y_val)

        # DataFrames are converted to plain ndarrays here, so column names and
        # pandas dtypes (e.g. ``category``) are not passed on to FLAML.
        fitting_kwargs["X_train"] = pd_to_np(X)
        fitting_kwargs["y_train"] = pd_to_np(y)

        self.model.fit(
            **fitting_kwargs,
            time_budget=self._time_budget,
        )
        self._fitted = True
        return self

    def predict(self, X: T) -> np.ndarray:
        """
        Predict with the fitted search result.

        Returns:
            ``predict`` output for regression, the second ``predict_proba``
            column for binary classification, and the full ``predict_proba``
            matrix for multiclass classification.

        Raises:
            RuntimeError: If called before ``fit``.
            NotImplementedError: If the task is not one of the three known ids.
        """
        if self.model is None:
            raise RuntimeError("FLAMLModel.predict() called before fit().")

        X = pd_to_np(X)
        if self._task == "regression":
            return self.model.predict(X)
        if self._task == "binary_classification":
            return self.model.predict_proba(X)[:, 1]
        if self._task == "multiclass_classification":
            return self.model.predict_proba(X)
        raise NotImplementedError

    @property
    def fitted(self) -> bool:
        """Whether ``fit`` has completed at least once."""
        return self._fitted

In [None]:
# =============================================================================
# MODEL PIPELINE (INLINED)
# =============================================================================

class ModelPipeline:
    """Couples a preprocessing pipeline with a model exposing ``fit`` / ``predict``."""

    def __init__(
        self,
        model,
        config: DatasetConfig,
        preprocessor: Optional[PreprocessingPipeline] = None,
    ) -> None:
        """
        Args:
            model: Object with ``fit(X=, y=, validation_set=, weights=)`` and ``predict(X)``.
            config: Column roles; ``training_col_names`` and ``target_col_name`` are used here.
            preprocessor: Optional preprocessing pipeline; an empty one is used when ``None``.
        """
        self.model = model
        self.config = config
        self.preprocessor = preprocessor if preprocessor is not None else PreprocessingPipeline([])

    def _split_features_target(self, data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
        """Select the configured feature columns and the target column from ``data``."""
        features = data[self.config.training_col_names]
        target = data[self.config.target_col_name]
        return features, target

    def fit(
        self,
        train_data: pd.DataFrame,
        validation_data: Optional[pd.DataFrame] = None,
        weights: Optional[Union[pd.Series, np.ndarray]] = None,
    ) -> "ModelPipeline":
        """
        Fit the preprocessors on ``train_data``, then fit the model.

        The feature matrix is selected with ``config.training_col_names``,
        exactly as ``predict`` does, so the model sees the same columns in the
        same order at training and inference time.

        Returns:
            ``self``.
        """
        train_data = self.preprocessor.train_pipe(train_data)
        X_train, y_train = self._split_features_target(train_data)

        validation_set = None
        if validation_data is not None:
            # Validation rows only go through already-fitted preprocessors.
            validation_data = self.preprocessor.inference_pipe(validation_data)
            validation_set = self._split_features_target(validation_data)

        self.model.fit(
            X=X_train, y=y_train, validation_set=validation_set, weights=weights
        )
        return self

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Preprocess ``data`` with the fitted steps and return the model's predictions."""
        data = self.preprocessor.inference_pipe(data)
        X_test = data[self.config.training_col_names]
        return self.model.predict(X_test)

In [None]:
# =============================================================================
# DATA LOADING
# =============================================================================

# Directory holding original.csv, train.csv and test.csv.
DATA_PATH = Path("./data/diabetes_prediction/")
# Name of the label column.
TARGET = "diagnosed_diabetes"

# Row-position cut-off: the first `split_id` training rows are flagged data=1
# and the remaining rows data=0. That flag is later combined with the target
# to build the stratification labels.
split_id = 678000

In [None]:
# The original dataset keeps its default index; train/test use their first
# CSV column as the index.
orig_data = pd.read_csv(DATA_PATH / "original.csv")
train_data = pd.read_csv(DATA_PATH / "train.csv", index_col=0)

# Every test row gets the constant flag data=0, stored as a categorical column.
test_data = pd.read_csv(DATA_PATH / "test.csv", index_col=0)
test_data["data"] = 0
test_data["data"] = test_data["data"].astype("category")

In [None]:
# Sanity check: intersecting the training columns with the original dataset's
# columns must give back the training columns unchanged, i.e. every training
# column also exists in orig_data.
assert (train_data.columns == train_data.columns.intersection(orig_data.columns)).all()

In [None]:
# Notebook display: dtypes of the columns that exist only in the original dataset.
orig_data[orig_data.columns.difference(train_data.columns)].dtypes

In [None]:
# Flag rows by position: the first `split_id` rows get data=1, the rest data=0.
# The column is built in one shot and assigned by name rather than through
# `iloc[..., -1]`, so re-running this cell after more columns have been
# appended cannot overwrite whichever column happens to be last.
train_data = train_data.assign(
    data=np.where(np.arange(len(train_data)) < split_id, 1.0, 0.0)
)

In [None]:
# =============================================================================
# FEATURE ENGINEERING
# =============================================================================

TARGET = 'diagnosed_diabetes'
BASE = [col for col in train_data.columns if col not in ('id', TARGET, 'data')]

# Names of the generated features, in creation order.
ORIG = []

# For every base column, look up the per-group mean of each listed column as
# observed in the original dataset and attach it to train and test rows.
for col in BASE:
    for tgt in (TARGET, 'glucose_fasting', 'glucose_postprandial', 'hba1c'):
        new_mean_col_name = f"orig_mean_{tgt}_grouped_by_{col}"
        mean_map = orig_data.groupby(col)[tgt].mean().rename(new_mean_col_name)

        print(col, tgt)
        # Left merge keeps every train/test row; unmatched groups get NaN.
        train_data = train_data.merge(mean_map, on=col, how='left')
        test_data = test_data.merge(mean_map, on=col, how='left')
        ORIG.append(new_mean_col_name)

print(len(ORIG), 'Orig Features Created!!')

In [None]:
# =============================================================================
# DATASET CONFIG
# =============================================================================

# Every column except the target is a model feature.
training_col_names = train_data.drop(columns=TARGET).columns.tolist()

numerical_col_names = (
    train_data.drop(columns=TARGET)
    .select_dtypes(include=["number"])
    .columns.tolist()
)
categorical_col_names = train_data.select_dtypes(
    include=["object"]
).columns.tolist()

config = DatasetConfig(
    training_col_names=training_col_names,
    # The ORIG features are group means, hence numeric, and are normally
    # already picked up by select_dtypes above. Merge the two lists while
    # dropping repeats (order preserved) so no column is listed twice for the
    # scaler.
    numerical_col_names=list(dict.fromkeys(numerical_col_names + ORIG)),
    categorical_col_names=categorical_col_names,
    target_col_name=TARGET,
)

print(f"Training columns: {len(training_col_names)}")
print(f"Numerical columns: {len(numerical_col_names)}")
print(f"Categorical columns: {len(categorical_col_names)}")

In [None]:
# Stratification labels: "<target>_<data flag>" per row, used as the `y`
# argument of StratifiedKFold.split. The target column is referenced through
# TARGET rather than a repeated string literal.
categories_of_interest = train_data[TARGET].astype(str) + "_" + train_data["data"].astype(str)

In [None]:
# =============================================================================
# SINGLE FOLD TRAINING
# =============================================================================

# Fail fast on an invalid fold id: a negative value would otherwise silently
# pick a split from the end of the list and label the saved files with it.
if not 0 <= CURRENT_FOLD < N_FOLDS:
    raise ValueError(
        f"CURRENT_FOLD must be between 0 and {N_FOLDS - 1}, got {CURRENT_FOLD}."
    )

print(f"\n{'='*60}")
print(f"TRAINING FOLD {CURRENT_FOLD} of {N_FOLDS}")
print(f"{'='*60}\n")

# Scaling runs first, then categorical encoding; both are fitted on the
# training fold only, inside pipe.fit().
preprocessors = [StandardScale(config), CategoricalEncoder(config)]

# Get fold indices (stratified on the combined target / data-flag label)
kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
fold_splits = list(kfold.split(train_data, categories_of_interest))

train_index, val_index = fold_splits[CURRENT_FOLD]

print(f"Train size: {len(train_index)}, Validation size: {len(val_index)}")

# Prepare fold data
train_fold: pd.DataFrame = train_data.iloc[train_index].copy()
val_fold: pd.DataFrame = train_data.iloc[val_index].copy()
# Only validation rows flagged data == 0 are used for early validation and scoring.
subsetval_fold = val_fold[val_fold["data"] == 0.0]

train_fold["data"] = train_fold["data"].astype("category")
# Both branches are 1.0, so every training row currently gets the same weight.
weights = np.where(train_fold["data"] == 0, 1.0, 1.0)

# Create pipeline with FLAML model
pipe = ModelPipeline(
    model=FLAMLModel(
        time_budget=FLAML_TIME_BUDGET,
        estimator_list=FLAML_ESTIMATOR_LIST,
        seed=RANDOM_STATE,
        verbose=1,
    ),
    config=config,
    preprocessor=PreprocessingPipeline(preprocessors=preprocessors),
)

# Fit model
print(f"\nFitting FLAML model (time budget: {FLAML_TIME_BUDGET}s)...")
pipe.fit(train_fold, validation_data=subsetval_fold, weights=weights)

# Predictions: scored subset, test set, and the full validation fold (OOF)
y_pred = pipe.predict(subsetval_fold)
test_pred = pipe.predict(test_data)
oof_pred = pipe.predict(val_fold)

# Score
score = roc_auc_score(subsetval_fold[TARGET], y_pred)
print(f"\nFold {CURRENT_FOLD} ROC AUC Score: {score:.6f}")

In [None]:
# =============================================================================
# SAVE PREDICTIONS
# =============================================================================

save_path = Path("./data/diabetes_prediction/")

# Output locations for this fold
oof_dir = save_path / "oofs" / "flaml"
test_dir = save_path / "test_preds" / "flaml"
oof_file = oof_dir / f"oof_preds_fold_{CURRENT_FOLD}.csv"
test_file = test_dir / f"test_preds_fold_{CURRENT_FOLD}.csv"
score_file = oof_dir / f"score_fold_{CURRENT_FOLD}.csv"

# Create directories if they don't exist
for out_dir in (oof_dir, test_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

# OOF predictions, keyed by the positional row index of each validation row
oof_df = pd.DataFrame(
    {"index": val_index, f"oof_pred_fold_{CURRENT_FOLD}": oof_pred}
)
oof_df.to_csv(oof_file, index=False)

# Test predictions, one row per test row in file order
test_df = pd.DataFrame({f"test_pred_fold_{CURRENT_FOLD}": test_pred})
test_df.to_csv(test_file, index=False)

# Single-row summary of this fold's score
score_df = pd.DataFrame(
    {
        "fold": [CURRENT_FOLD],
        "roc_auc_score": [score],
        "time_budget": [FLAML_TIME_BUDGET],
    }
)
score_df.to_csv(score_file, index=False)

print(f"\nSaved predictions for fold {CURRENT_FOLD}:")
print(f"  - OOF: {oof_file}")
print(f"  - Test: {test_file}")
print(f"  - Score: {score_file}")

In [None]:
# =============================================================================
# HELPER: COMBINE ALL FOLDS (RUN AFTER ALL FOLDS ARE COMPLETE)
# =============================================================================

def _read_fold_csvs(directory: Path, stem: str, n_folds: int) -> list:
    """Read ``{stem}_fold_{k}.csv`` from ``directory`` for every fold k that exists on disk."""
    candidates = (directory / f"{stem}_fold_{fold}.csv" for fold in range(n_folds))
    return [pd.read_csv(path) for path in candidates if path.exists()]


def combine_all_folds(save_path: Path, n_folds: int = 5):
    """
    Combine OOF predictions, test predictions and scores from all folds.
    Run this after all folds have been processed.

    Each of the three outputs is written only when files for all ``n_folds``
    folds are present; otherwise a "Missing folds" message is printed for it.

    Args:
        save_path: Root directory containing ``oofs/flaml`` and ``test_preds/flaml``.
        n_folds: Number of folds expected on disk.

    Writes:
        ``oofs/flaml/combined_oof_preds.csv``: rows of all folds sorted by
            ``index``, with the per-fold columns plus a single ``oof_pred``
            column holding each row's prediction.
        ``test_preds/flaml/combined_test_preds.csv``: one column per fold plus
            their row-wise mean in ``test_pred_mean``.
        ``oofs/flaml/combined_scores.csv``: the per-fold score rows.
    """
    oof_dir = save_path / "oofs" / "flaml"
    test_dir = save_path / "test_preds" / "flaml"

    # Combine OOF predictions
    oof_dfs = _read_fold_csvs(oof_dir, "oof_preds", n_folds)
    if len(oof_dfs) == n_folds:
        combined_oof = pd.concat(oof_dfs, ignore_index=True)
        combined_oof = combined_oof.sort_values("index").reset_index(drop=True)
        # Each row is filled in exactly one per-fold column and NaN in the
        # others; coalesce them into one directly usable prediction column.
        fold_cols = [
            col for col in combined_oof.columns if str(col).startswith("oof_pred_fold_")
        ]
        if fold_cols:
            combined_oof["oof_pred"] = combined_oof[fold_cols].bfill(axis=1).iloc[:, 0]
        combined_oof.to_csv(oof_dir / "combined_oof_preds.csv", index=False)
        print(f"Combined OOF predictions saved: {combined_oof.shape}")
    else:
        print(f"Missing folds. Found {len(oof_dfs)} of {n_folds} folds.")

    # Combine test predictions (average)
    test_dfs = _read_fold_csvs(test_dir, "test_preds", n_folds)
    if len(test_dfs) == n_folds:
        combined_test = pd.concat([df.iloc[:, 0] for df in test_dfs], axis=1)
        combined_test["test_pred_mean"] = combined_test.mean(axis=1)
        combined_test.to_csv(test_dir / "combined_test_preds.csv", index=False)
        print(f"Combined test predictions saved: {combined_test.shape}")
    else:
        print(f"Missing folds. Found {len(test_dfs)} of {n_folds} folds.")

    # Combine scores
    score_dfs = _read_fold_csvs(oof_dir, "score", n_folds)
    if len(score_dfs) == n_folds:
        combined_scores = pd.concat(score_dfs, ignore_index=True)
        mean_score = combined_scores["roc_auc_score"].mean()
        combined_scores.to_csv(oof_dir / "combined_scores.csv", index=False)
        print(f"\nMean ROC AUC across all folds: {mean_score:.6f}")
        print(combined_scores)
    else:
        print(f"Missing folds. Found {len(score_dfs)} of {n_folds} folds.")

# Uncomment to run after all folds are complete:
# combine_all_folds(save_path, N_FOLDS)

In [None]:
# Print best model info from FLAML
# print(f"\n{'='*60}")
# print("FLAML BEST MODEL INFO")
# print(f"{'='*60}")
# print(f"Best estimator: {pipe.model.model.best_estimator}")
# print(f"Best config: {pipe.model.model.best_config}")
# print(f"Best validation score: {pipe.model.model.best_loss}")