In [None]:
# Install dependencies if missing (uncomment to run; %pip installs into the active kernel's environment)
# %pip install catboost scikit-learn pandas numpy

import catboost
import sklearn
import pandas as pd
import numpy as np

# Report the library versions this run uses, one per line.
print(
    f"CatBoost Version: {catboost.__version__}",
    f"Scikit-learn Version: {sklearn.__version__}",
    sep="\n",
)

In [None]:
import pandas as pd
import numpy as np
import gc
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import roc_auc_score

# ============================================
# 1. CONFIGURATION
# ============================================
SEEDS = [42, 123, 777]          # one full cross-validation run per seed; predictions are averaged
N_FOLDS = 10                    # number of stratified folds per seed
EARLY_STOPPING_ROUNDS = 50      # passed to CatBoost's early stopping in the training loop

# Fixed CatBoost hyper-parameters shared by every fold and seed
CATBOOST_PARAMS = dict(
    iterations=2000,            # upper bound; early stopping is expected to end training sooner
    learning_rate=0.02,
    depth=6,
    l2_leaf_reg=3,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,             # default only; the training loop supplies the seed for each run
    allow_writing_files=False,  # keep the working directory clean
    thread_count=-1,            # use all CPU cores
    task_type='CPU',            # train on CPU (author's note: chosen for strict determinism)
)

# ============================================
# 2. DATA PREPARATION
# ============================================
# Read the raw train and test tables with pandas' default CSV settings.
train_df, test_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

# Helper: Basic Feature Engineering (The robust ones we found earlier)
def base_feature_eng(df):
    """Return a copy of `df` extended with three derived columns.

    Added columns (appended in this order):
        TG_HDL_Ratio: triglycerides / (hdl_cholesterol + 1)
        MAP:          (systolic_bp + 2 * diastolic_bp) / 3
        BMI_Age:      bmi * age

    The input frame itself is left unmodified.
    """
    return df.assign(
        # The +1 in the denominator keeps the ratio finite when hdl_cholesterol is 0.
        TG_HDL_Ratio=lambda d: d['triglycerides'] / (d['hdl_cholesterol'] + 1),
        MAP=lambda d: (d['systolic_bp'] + 2 * d['diastolic_bp']) / 3,
        BMI_Age=lambda d: d['bmi'] * d['age'],
    )

# Target vector, then the engineered feature matrices for train and test.
# The 'id' column (and the target, for train) is dropped before engineering.
y = train_df['diagnosed_diabetes']
X = base_feature_eng(train_df.drop(columns=['diagnosed_diabetes', 'id']))
X_test_base = base_feature_eng(test_df.drop(columns=['id']))

# Split the feature names by dtype: int64/float64 columns are binned later,
# object/bool columns are passed to CatBoost as categorical features.
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(X.select_dtypes(include=['object', 'bool']).columns)

# ============================================
# 3. TRAINING LOOP
# ============================================

# Seed-averaged predictions: out-of-fold for the training rows, fold- and
# seed-averaged for the test rows.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test_base))

# Loop-invariant names: one "<col>_bin" feature per numeric column. CatBoost
# treats the bins as categorical, alongside the original categorical columns.
binned_cols = [f"{c}_bin" for c in num_cols]
full_cat_features = cat_cols + binned_cols


def _append_bins(frame, binner, num_cols, binned_cols):
    """Return `frame` with integer bin codes appended as `binned_cols`.

    `binner` must already be fitted; it is only used to transform
    `frame[num_cols]`. The raw columns are kept untouched. The ordinal codes
    are cast to int because they are handed to CatBoost as categorical
    features, which must be integers or strings.
    """
    codes = binner.transform(frame[num_cols]).astype(int)
    bins = pd.DataFrame(codes, columns=binned_cols, index=frame.index)
    return pd.concat([frame, bins], axis=1)


# One full stratified CV run per seed; the runs are averaged with equal weight.
for seed in SEEDS:
    print(f"\n{'='*20}\nRunning Seed: {seed}\n{'='*20}")

    # Per-seed copy of the params so the shared CATBOOST_PARAMS config is never mutated.
    seed_params = {**CATBOOST_PARAMS, 'random_seed': seed}

    # The same seed drives both the fold assignment and the model.
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)

    seed_oof_preds = np.zeros(len(X))
    seed_test_preds = np.zeros(len(X_test_base))

    for train_idx, val_idx in skf.split(X, y):
        # 1. Split data (positional indexing already returns new frames).
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # 2. In-fold quantile binning: the bin edges are fitted on the training
        #    part only, so validation and test rows cannot leak into them.
        binner = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile', subsample=None)
        binner.fit(X_train[num_cols])

        X_train_final = _append_bins(X_train, binner, num_cols, binned_cols)
        X_val_final = _append_bins(X_val, binner, num_cols, binned_cols)
        X_test_final = _append_bins(X_test_base, binner, num_cols, binned_cols)

        # 3. CatBoost pools (raw features + binned features).
        train_pool = Pool(X_train_final, y_train, cat_features=full_cat_features)
        val_pool = Pool(X_val_final, y_val, cat_features=full_cat_features)
        test_pool = Pool(X_test_final, cat_features=full_cat_features)

        # 4. Train.
        # NOTE: the best iteration is selected on the same fold that is scored
        # below, so the reported OOF AUC is somewhat optimistic.
        model = CatBoostClassifier(**seed_params)
        model.fit(
            train_pool,
            eval_set=val_pool,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            use_best_model=True,
            verbose=False  # Silence fold output to keep notebook clean
        )

        # 5. Predict: each training row is predicted exactly once per seed;
        #    test predictions are averaged over the folds.
        seed_oof_preds[val_idx] = model.predict_proba(val_pool)[:, 1]
        seed_test_preds += model.predict_proba(test_pool)[:, 1] / N_FOLDS

    # Fold this seed's predictions into the seed-averaged ensemble.
    oof_preds += seed_oof_preds / len(SEEDS)
    test_preds += seed_test_preds / len(SEEDS)

    print("  Seed completed.")
    # Exact OOF score of this seed's run alone (not the running ensemble).
    print(f"  Seed {seed} OOF AUC: {roc_auc_score(y, seed_oof_preds):.5f}")

# ============================================
# 4. EVALUATION & SAVING
# ============================================

# Score the seed-averaged out-of-fold predictions against the training target.
final_auc = roc_auc_score(y, oof_preds)
print(f"\nFinal OOF ROC-AUC Score: {final_auc:.5f}")

# Persist the OOF predictions next to the true labels (for local comparison).
oof_df = train_df[['id']].assign(diagnosed_diabetes=y, prediction=oof_preds)
oof_df.to_csv('catboost_final_oof.csv', index=False)
print("Saved OOF predictions to 'catboost_final_oof.csv'")

# Write the submission file: one averaged prediction per test id.
submission = test_df[['id']].assign(diagnosed_diabetes=test_preds)
submission.to_csv('submission.csv', index=False)
print("Saved final submission to 'submission.csv'")
display(submission.head())


Running Seed: 42


