In [1]:
# ============================================================
# CatBoost
# ============================================================

# ---------- General imports ----------
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, math, json
from tqdm import tqdm

# Raise pandas' display limits so frames with hundreds of rows/columns are
# printed in full instead of being elided with "...".
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier, Pool

import shap
# NOTE(review): this suppresses every warning for the rest of the kernel
# session, including deprecation warnings from the libraries imported above.
warnings.filterwarnings('ignore')

# Single seed shared by Python's `random`, NumPy's global RNG and (further down)
# the CatBoost `random_seed` parameter.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# ---------- Helpers ----------
def seed_everything(seed=43):
    """Seed Python's `random` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: value passed to both RNGs and written (as a string) to the
            PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both seeders take the same single argument, so apply them in one pass.
    for reseed in (random.seed, np.random.seed):
        reseed(seed)

def make_dt_m(transaction_dt: pd.Series, days_per_month: int = 30) -> pd.Series:
    """Convert a TransactionDT column (elapsed seconds) into a month index.

    Args:
        transaction_dt: elapsed-seconds values; entries that cannot be parsed
            as numbers are treated as missing.
        days_per_month: length of one "month" bucket in days.

    Returns:
        int16 Series holding the elapsed time divided by the bucket length,
        truncated to an integer; missing entries become 0.
    """
    seconds_per_month = 3600*24*days_per_month
    elapsed_seconds = pd.to_numeric(transaction_dt, errors='coerce')
    months_elapsed = elapsed_seconds / seconds_per_month
    return months_elapsed.fillna(0).astype(np.int16)

# Seed through the helper with the notebook-wide SEED (the helper's own default
# of 43 is not used); this also exports PYTHONHASHSEED.
seed_everything(SEED)

In [2]:
# ---------- DATA LOAD ----------
# Reads the preprocessed train/test frames plus the list of columns that the
# preprocessing step marked for removal, then derives the model feature list.
print('Loading preprocessed data...')
DATA_DIR = "../data/processed"

# Load preprocessed features from kyakovlev's kernel.
# NOTE(review): unpickling executes code embedded in the file; only load
# pickles produced by a trusted pipeline.
print("Loading preprocessed features...")
train = pd.read_pickle(f"{DATA_DIR}/train_df.pkl")
test = pd.read_pickle(f"{DATA_DIR}/test_df.pkl")
remove_features_df = pd.read_pickle(f"{DATA_DIR}/remove_features.pkl")

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Features to remove: {len(remove_features_df)}")

# Get features to remove
remove_features = list(remove_features_df['features_to_remove'].values)
print(f"Removing {len(remove_features)} features")

# Build final feature list (exclude removed features). A set makes each
# membership test O(1) instead of scanning the list once per column.
removed_lookup = set(remove_features)
all_features = [col for col in train.columns if col not in removed_lookup]

# Exclude target, ID and fold-group columns
EXCLUDE_COLS = {'TransactionID', 'isFraud', 'DT_M'}
FEATURES = [col for col in all_features if col not in EXCLUDE_COLS]

# Fail fast: test[FEATURES] is only evaluated after the long training run, so
# a column missing from the test frame must be caught here, not hours later.
missing_in_test = [col for col in FEATURES if col not in test.columns]
assert not missing_in_test, f"Features missing from test data: {missing_in_test[:20]}"

# Show a short preview instead of dumping every feature name into the output.
FEATURE_PREVIEW_COUNT = 20
print(f"Final feature count: {len(FEATURES)}")
print(f"First {min(FEATURE_PREVIEW_COUNT, len(FEATURES))} features: {FEATURES[:FEATURE_PREVIEW_COUNT]}")

Loading preprocessed data...
Loading preprocessed features...
Train shape: (590540, 791)
Test shape: (506691, 791)
Features to remove: 19
Removing 19 features
Final feature count: 772
['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V6

In [3]:
# ---------- Prepare target and features ----------
TARGET = 'isFraud'
y = train[TARGET].astype('int8')

# Number of pseudo-month groups used when no time column is available; 6
# matches the n_splits passed to GroupKFold further down in this notebook.
N_FALLBACK_GROUPS = 6

# Create DT_M for GroupKFold if not already present
if 'DT_M' not in train.columns and 'TransactionDT' in train.columns:
    print("Creating DT_M from TransactionDT...")
    train['DT_M'] = make_dt_m(train['TransactionDT'])
elif 'DT_M' not in train.columns:
    print("Warning: Neither DT_M nor TransactionDT found. Using row-based groups.")
    # Fallback: contiguous pseudo-groups by row order. `i * k // n` always
    # yields exactly k near-equal groups, whereas `i // (n // k)` divides by
    # zero for n < k and spills leftover rows into an extra (k+1)-th group
    # whenever n is not a multiple of k.
    n_rows = len(train)
    train['DT_M'] = (np.arange(n_rows) * N_FALLBACK_GROUPS // max(n_rows, 1)).astype('int16')

# ---------- Identify categorical features ----------
# A column is treated as categorical when it is object/category typed, or when
# it is one of the known categorical candidates below and its cardinality is
# small enough.
print("Analyzing feature types...")

# Candidates with more distinct values than this are reported as "too many".
MAX_CANDIDATE_CARDINALITY = 1000
# Candidates at or below this cardinality are accepted as categorical.
MAX_CATEGORICAL_CARDINALITY = 200

CAT_FEATURES = []

# Known categorical candidates by name
potential_cat_features = [
    'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
    'DeviceType', 'DeviceInfo'
] + [f'id_{i}' for i in range(12, 39)]
potential_cat_lookup = set(potential_cat_features)

# nunique() is a full pass over a column, so it is computed only for columns
# that can actually become categorical and cached for the summary below.
cat_unique_counts = {}

for col in FEATURES:
    if col not in train.columns:
        continue
    col_dtype = train[col].dtype

    # String/object or pandas Categorical columns are always categorical
    if col_dtype == 'object' or col_dtype.name == 'category':
        CAT_FEATURES.append(col)
        cat_unique_counts[col] = train[col].nunique()
        print(f"  {col}: object/category -> categorical")
        continue

    # Everything else must be a named candidate
    if col not in potential_cat_lookup:
        continue

    unique_count = train[col].nunique()
    if unique_count > MAX_CANDIDATE_CARDINALITY:
        print(f"  {col}: {unique_count} unique (too many) -> numeric")
    elif unique_count == 0:
        # nunique() ignores NaN, so zero distinct values means the column is entirely null
        print(f"  {col}: all null -> skipping")
    elif unique_count <= MAX_CATEGORICAL_CARDINALITY:
        CAT_FEATURES.append(col)
        cat_unique_counts[col] = unique_count
        print(f"  {col}: {col_dtype}, {unique_count} unique -> categorical")
    else:
        print(f"  {col}: {col_dtype}, {unique_count} unique -> numeric (too many categories)")

print(f'\nIdentified {len(CAT_FEATURES)} categorical features')
if CAT_FEATURES:
    print("Categorical features:")
    for cat in CAT_FEATURES:
        print(f"  - {cat} ({train[cat].dtype}, {cat_unique_counts[cat]} unique)")

Analyzing feature types...
  ProductCD: float64, 5 unique -> categorical
  card1: 9117 unique (too many) -> numeric
  card2: float16, 496 unique -> numeric (too many categories)
  card3: float16, 96 unique -> categorical
  card4: float32, 4 unique -> categorical
  card5: float16, 83 unique -> categorical
  card6: float32, 3 unique -> categorical
  addr1: float16, 332 unique -> numeric (too many categories)
  addr2: float16, 74 unique -> categorical
  P_emaildomain: object/category -> categorical
  R_emaildomain: object/category -> categorical
  M1: float16, 2 unique -> categorical
  M2: float16, 2 unique -> categorical
  M3: float16, 2 unique -> categorical
  M4: float64, 3 unique -> categorical
  M5: float16, 2 unique -> categorical
  M6: float16, 2 unique -> categorical
  M7: float16, 2 unique -> categorical
  M8: float16, 2 unique -> categorical
  M9: float16, 2 unique -> categorical
  id_12: float64, 2 unique -> categorical
  id_13: float16, 54 unique -> categorical
  id_14: float1

In [4]:
# ---------- Handle missing values and convert categorical features ----------
# Numerical features keep their NaNs; every categorical feature is rewritten in
# both train and test as a string column in which missing entries carry the
# MISSING_TOKEN label.
print("\nHandling missing values and categorical conversion...")

# Report missing values BEFORE converting categorical features
print("Missing values per feature (top 10):")
missing_counts = train[FEATURES].isnull().sum().sort_values(ascending=False)
print(missing_counts.head(10))

print("\nProcessing missing values...")

# For numerical features, NaN is left in place for CatBoost to handle
cat_feature_lookup = set(CAT_FEATURES)
numerical_features = [f for f in FEATURES if f not in cat_feature_lookup]
print(f"Numerical features: keeping NaN for CatBoost to handle ({len(numerical_features)} features)")

MISSING_TOKEN = 'missing'


def _all_integral(values: pd.Series) -> bool:
    """Return True when every non-null entry of `values` is a finite whole number."""
    present = values.dropna().astype('float64')
    return bool(np.isfinite(present).all() and (present == present.round()).all())


def _as_category_strings(values: pd.Series, render: str) -> pd.Series:
    """Render `values` as strings, labelling missing entries with MISSING_TOKEN.

    Args:
        values: column to convert (numeric, object or pandas Categorical).
        render: 'int' writes whole numbers without a decimal point, 'float'
            writes the float64 representation, anything else uses str().

    Returns:
        Series of strings with the same index as `values`.
    """
    # Missing entries are tracked with a mask rather than a sentinel such as
    # -999 or the text 'nan', so a genuine value equal to the sentinel can
    # never be relabelled as missing.
    absent = values.isna()
    if render == 'int':
        # The 0 fill only lets the integer cast succeed; those positions are
        # overwritten with MISSING_TOKEN below.
        text = values.fillna(0).astype('int64').astype(str)
    elif render == 'float':
        text = values.astype('float64').astype(str)
    else:
        text = values.astype(object).astype(str)
    return text.mask(absent, MISSING_TOKEN)


print(f"\nHandling categorical features ({len(CAT_FEATURES)} features)...")
for cat_col in CAT_FEATURES:
    if cat_col not in train.columns:
        continue
    original_dtype = train[cat_col].dtype
    print(f"  Processing {cat_col} ({original_dtype}, {train[cat_col].nunique()} unique)")

    if original_dtype.name == 'category':
        print("    Converting pandas Categorical to string")
        render = 'text'
    elif pd.api.types.is_float_dtype(original_dtype) or pd.api.types.is_integer_dtype(original_dtype):
        if pd.api.types.is_float_dtype(original_dtype):
            print("    Converting float to string")
        else:
            print("    Converting int to string")
        # Whole-number codes are written as "150" rather than "150.0".
        # Non-integral codes keep their full value: rounding them to int would
        # merge distinct categories into one. The decision looks at train AND
        # test so both frames use the same string format.
        if _all_integral(train[cat_col]) and _all_integral(test[cat_col]):
            render = 'int'
        else:
            render = 'float'
    else:  # object type
        print("    Converting object to string")
        render = 'text'

    train[cat_col] = _as_category_strings(train[cat_col], render)
    test[cat_col] = _as_category_strings(test[cat_col], render)

print("Categorical features converted successfully!")
print("Sample categorical values:")
for cat_col in CAT_FEATURES[:3]:  # Show first 3 as examples
    unique_vals = train[cat_col].unique()[:5]  # First 5 unique values
    print(f"  {cat_col}: {unique_vals}")


Handling missing values and categorical conversion...
Missing values per feature (top 10):
id_23    588860
id_24    585793
id_25    585408
id_08    585385
id_07    585385
id_21    585381
id_26    585377
id_22    585371
id_27    585371
dist2    552913
dtype: int64

Processing missing values...
Numerical features: keeping NaN for CatBoost to handle (730 features)

Handling categorical features (42 features)...
  Processing ProductCD (float64, 5 unique)
    Converting float to string
  Processing card3 (float16, 96 unique)
    Converting float to string
  Processing card4 (float32, 4 unique)
    Converting float to string
  Processing card5 (float16, 83 unique)
    Converting float to string
  Processing card6 (float32, 3 unique)
    Converting float to string
  Processing addr2 (float16, 74 unique)
    Converting float to string
  Processing P_emaildomain (category, 60 unique)
    Converting pandas Categorical to string
  Processing R_emaildomain (category, 61 unique)
    Converting pan

In [5]:
# ========== GroupKFold by month ==========
# Trains one CatBoost classifier per fold, where folds are built from the DT_M
# group column, collects out-of-fold predictions, saves every fold model and
# reports per-fold, size-weighted and pooled OOF AUC.
assert 'DT_M' in train.columns, "DT_M not found. Check data preprocessing."
groups = train['DT_M'].values
# Despite the name `skf`, this is a GroupKFold: rows sharing a DT_M value are
# kept together in the same fold (no stratification on the target).
skf = GroupKFold(n_splits=6)
# Materialised as a list because the folds are reused after this loop.
folds = list(skf.split(train[FEATURES], y, groups=groups))

# Class imbalance: negative/positive ratio over the full training target, so
# every fold uses the same weight. max(pos, 1) avoids a division by zero.
pos = int(y.sum())
neg = int(len(y) - pos)
scale_pos_weight = (neg / max(pos, 1))
print(f"\nOverall fraud rate: {y.mean():.4f} ({y.mean()*100:.2f}%) | scale_pos_weight: {scale_pos_weight:.2f}")

# CatBoost parameters (shared by every fold)
cb_params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "learning_rate": 0.03,
    "depth": 8,
    "l2_leaf_reg": 3.0,
    "iterations": 5000,  # Reasonable cap with early stopping
    "random_seed": SEED,
    "thread_count": -1,
    "logging_level": "Silent",
    "task_type": "CPU",  # Change to 'GPU' if available
    "use_best_model": True,
    "scale_pos_weight": scale_pos_weight,
}

# Create directories for artifacts
ARTIFACT_DIR = "catboost_artifacts"
SUB_DIR = "submissions"
os.makedirs(ARTIFACT_DIR, exist_ok=True)
os.makedirs(SUB_DIR, exist_ok=True)

print("\n" + "="*60)
print("TRAINING CATBOOST WITH GROUPKFOLD (DT_M)")
print("="*60)

models = []
# Out-of-fold predictions: each row is filled once, by the model that did not train on it.
oof = np.zeros(len(train), dtype=float)
fold_scores, fold_sizes = [], []

for fold_id, (tr_idx, va_idx) in enumerate(tqdm(folds, desc="Folds")):
    # Only the group of the first validation row is reported.
    # NOTE(review): if DT_M ever has more distinct values than n_splits, a fold
    # holds several groups and this label shows just one of them.
    va_month = int(groups[va_idx][0])
    print(f"\nFold {fold_id} | holdout month = {va_month} | train={len(tr_idx)} valid={len(va_idx)}")
    
    X_tr = train.iloc[tr_idx][FEATURES]
    y_tr = y.iloc[tr_idx]
    X_va = train.iloc[va_idx][FEATURES]
    y_va = y.iloc[va_idx]
    
    # Create CatBoost pools
    dtr = Pool(X_tr, label=y_tr, cat_features=CAT_FEATURES)
    dva = Pool(X_va, label=y_va, cat_features=CAT_FEATURES)
    
    # Train model.
    # NOTE(review): the validation fold is used both as the early-stopping
    # eval_set and for the AUC reported below, so the fold AUC is measured on
    # the data that selected the best iteration.
    model = CatBoostClassifier(**cb_params)
    model.fit(
        dtr,
        eval_set=dva,
        early_stopping_rounds=200,
        verbose=200
    )
    
    # Predict on validation set (probability of the positive class)
    va_pred = model.predict_proba(dva)[:,1]
    oof[va_idx] = va_pred
    auc = roc_auc_score(y_va, va_pred)
    fold_scores.append(auc)
    fold_sizes.append(len(va_idx))
    print(f"Fold {fold_id} AUC: {auc:.4f} | Best iter: {model.get_best_iteration()}")
    
    # Save model
    model.save_model(f"{ARTIFACT_DIR}/catboost_fold{fold_id}.cbm")
    models.append(model)
    
    # Clean up memory: the per-fold frames and pools are rebuilt on the next iteration
    del X_tr, y_tr, X_va, y_va, dtr, dva, va_pred
    gc.collect()

# Calculate final scores: mean of fold AUCs weighted by validation-fold size,
# plus a single AUC computed over all pooled out-of-fold predictions.
fold_scores = np.array(fold_scores, dtype=float)
weights = np.array(fold_sizes, dtype=float) / np.sum(fold_sizes)
weighted_cv = float(np.sum(fold_scores * weights))
oof_auc = roc_auc_score(y, oof)

print("\n" + "#"*20)
print(f"CatBoost OOF CV = {oof_auc:.4f}")
print("Fold AUCs:", np.round(fold_scores, 4))
print("Weighted mean AUC:", f"{weighted_cv:.4f}")


Overall fraud rate: 0.0350 (3.50%) | scale_pos_weight: 27.58

TRAINING CATBOOST WITH GROUPKFOLD (DT_M)


Folds:   0%|          | 0/6 [00:00<?, ?it/s]


Fold 0 | holdout month = 12 | train=453219 valid=137321
Fold 0 AUC: 0.9128 | Best iter: 1932


Folds:  17%|█▋        | 1/6 [55:32<4:37:43, 3332.63s/it]


Fold 1 | holdout month = 15 | train=488908 valid=101632


Folds:  33%|███▎      | 2/6 [1:25:57<2:43:02, 2445.59s/it]

Fold 1 AUC: 0.9342 | Best iter: 1401

Fold 2 | holdout month = 13 | train=497955 valid=92585


Folds:  50%|█████     | 3/6 [1:57:23<1:49:31, 2190.34s/it]

Fold 2 AUC: 0.9335 | Best iter: 1446

Fold 3 | holdout month = 17 | train=501214 valid=89326


Folds:  67%|██████▋   | 4/6 [2:22:35<1:04:05, 1922.53s/it]

Fold 3 AUC: 0.9294 | Best iter: 1127

Fold 4 | holdout month = 14 | train=504519 valid=86021
Fold 4 AUC: 0.9368 | Best iter: 1710


Folds:  83%|████████▎ | 5/6 [2:59:31<33:48, 2028.21s/it]  


Fold 5 | holdout month = 16 | train=506885 valid=83655
Fold 5 AUC: 0.9455 | Best iter: 1919


Folds: 100%|██████████| 6/6 [3:40:27<00:00, 2204.53s/it]


####################
CatBoost OOF CV = 0.9311
Fold AUCs: [0.9128 0.9342 0.9335 0.9294 0.9368 0.9455]
Weighted mean AUC: 0.9304





In [6]:
# ---------- Save artifacts ----------
# Persists OOF predictions and run metadata, averages feature importance over
# the fold models, then averages the fold models' test predictions into a
# submission file and writes a short run summary.

# A run that produced no models would otherwise write an all-zero submission.
assert len(models) > 0, "No trained models available; run the training cell first."

np.save(f"{ARTIFACT_DIR}/catboost_oof_predictions.npy", oof)

metadata = {
    "model_type": "catboost_groupkfold_preprocessed",
    # float(...) guarantees a plain Python float for the JSON encoder.
    "oof_auc": float(oof_auc),
    "weighted_cv_auc": weighted_cv,
    "fold_scores": fold_scores.tolist(),
    "fold_weights": weights.tolist(),
    "n_folds": len(folds),
    "params": cb_params,
    "features": FEATURES,
    "categorical_features": CAT_FEATURES,
    "validation_method": "GroupKFold with DT_M",
    "best_iterations": [int(m.get_best_iteration()) for m in models],
    "data_source": "kyakovlev_preprocessed",
    "features_removed": len(remove_features)
}

with open(f"{ARTIFACT_DIR}/catboost_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)

# Feature importance (averaged across folds). Each model is evaluated against
# a Pool rebuilt from its own training rows.
print("\nCalculating feature importance...")
importances = []
for (tr_idx, _), m in zip(folds, models):
    tr_pool = Pool(train.iloc[tr_idx][FEATURES], label=y.iloc[tr_idx], cat_features=CAT_FEATURES)
    imp = m.get_feature_importance(tr_pool, type="PredictionValuesChange")
    importances.append(np.array(imp, dtype=float))
    # Release the pool before the next fold's pool is built.
    del tr_pool; gc.collect()

fi_values = np.mean(np.vstack(importances), axis=0)
fi_df = pd.DataFrame({"feature": FEATURES, "importance": fi_values}).sort_values("importance", ascending=False)
fi_df.to_csv(f"{ARTIFACT_DIR}/catboost_feature_importance.csv", index=False)

print("Top 10 most important features:")
print(fi_df.head(10))

# ---------- Test predictions + submission ----------
print("\n" + "="*30)
print("GENERATING SUBMISSION")
print("="*30)

# Create test pool
DTEST = Pool(test[FEATURES], cat_features=CAT_FEATURES)

# Generate predictions: unweighted mean of the fold models' positive-class probabilities
test_preds = np.zeros(len(test), dtype=float)
for i, m in enumerate(models):
    p = m.predict_proba(DTEST)[:,1]
    test_preds += p / len(models)
    print(f"Fold {i} test preds | mean={p.mean():.5f} std={p.std():.5f}")

# Refuse to persist predictions that contain NaN/inf.
assert np.isfinite(test_preds).all(), "Test predictions contain non-finite values."

np.save(f"{ARTIFACT_DIR}/catboost_test_predictions.npy", test_preds)

# Create submission file
if 'TransactionID' in test.columns:
    submission = pd.DataFrame({
        'TransactionID': test['TransactionID'],
        'isFraud': test_preds
    })
elif test.index.name == 'TransactionID':
    # The IDs were moved into the index; recover them from there.
    submission = pd.DataFrame({
        'TransactionID': test.index.values,
        'isFraud': test_preds
    })
else:
    # Best-effort fallback: row numbers are NOT real transaction IDs, so say so
    # loudly instead of silently writing a file that cannot be scored.
    print("Warning: TransactionID not found in test data; using row numbers as IDs. "
          "The submission will not match real transactions.")
    submission = pd.DataFrame({
        'TransactionID': range(len(test_preds)),
        'isFraud': test_preds
    })

sub_path = f"{SUB_DIR}/submission_catboost_preprocessed_cv{oof_auc:.4f}.csv"
submission.to_csv(sub_path, index=False)
print(f"Saved submission: {sub_path}")

# Final summary
summary = {
    "model_name": "catboost_groupkfold_preprocessed",
    "performance": {
        "oof_auc": float(oof_auc),
        "weighted_cv_auc": weighted_cv,
        "fold_aucs": fold_scores.tolist()
    },
    "data_info": {
        "n_features": len(FEATURES),
        "n_categorical": len(CAT_FEATURES),
        "features_removed": len(remove_features),
        "data_source": "kyakovlev_preprocessed"
    },
    "files_saved": [
        "catboost_oof_predictions.npy",
        "catboost_test_predictions.npy",
        "catboost_metadata.json",
        "catboost_feature_importance.csv"
    ] + [f"catboost_fold{i}.cbm" for i in range(len(models))]
}

with open(f"{ARTIFACT_DIR}/catboost_ensemble_summary.json", "w") as f:
    json.dump(summary, f, indent=2)

print("\n" + "="*60)
print("CATBOOST GROUPKFOLD COMPLETE")
print("="*60)
print(f"Final OOF AUC: {oof_auc:.4f}")
print(f"Number of features used: {len(FEATURES)}")
print(f"Number of categorical features: {len(CAT_FEATURES)}")
print(f"Features removed by preprocessing: {len(remove_features)}")


Calculating feature importance...
Top 10 most important features:
           feature  importance
711      C1_fq_enc    2.112462
47              M5    1.668027
26             C13    1.478920
27             C14    1.256796
14              C1    1.227671
12   P_emaildomain    0.993845
48              M6    0.976447
723     C13_fq_enc    0.916317
679      D2_scaled    0.798277
2            card1    0.765258

GENERATING SUBMISSION
Fold 0 test preds | mean=0.10694 std=0.18043
Fold 1 test preds | mean=0.13298 std=0.19028
Fold 2 test preds | mean=0.13045 std=0.19023
Fold 3 test preds | mean=0.14955 std=0.19738
Fold 4 test preds | mean=0.11930 std=0.18564
Fold 5 test preds | mean=0.11293 std=0.18078
Saved submission: submissions/submission_catboost_preprocessed_cv0.9311.csv

CATBOOST GROUPKFOLD COMPLETE
Final OOF AUC: 0.9311
Number of features used: 772
Number of categorical features: 42
Features removed by preprocessing: 19
