
# IEEE-CIS Fraud Detection: Boosting Pipeline (LightGBM, XGBoost, CatBoost)

Этот ноутбук демонстрирует **полный ход работы** — от загрузки и анализа данных до обучения трёх моделей бустинга с корректной обработкой категориальных признаков и временного среза.

**Модели:**
- LightGBM (нативные категории)
- XGBoost (OOF target encoding для категориальных)
- CatBoost (нативные категории)

> **Примечание:** ноутбук рассчитан на локальный запуск или на окружение с достаточным объёмом памяти (8–16 ГБ и более) и запасом времени на выполнение. Для экономии ресурсов предусмотрены опции downcast/подвыборки.


## 1. Установка и импорты

In [None]:

# !pip -q install kaggle lightgbm xgboost catboost category_encoders pandas matplotlib scikit-learn pyarrow fastparquet
import os, gc, math, json, warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import category_encoders as ce

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

# Single seed reused below for NumPy's global RNG, the KFold splitter and the
# three boosting libraries.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


## 2. Загрузка датасета (Kaggle) или использование локального пути

In [None]:

# Опционально: загрузка с Kaggle (нужно добавить kaggle.json в ~/.kaggle)
# from pathlib import Path
# kaggle_json = Path.home() / ".kaggle" / "kaggle.json"
# if kaggle_json.exists():
#     !kaggle competitions download -c ieee-fraud-detection -p ./data
#     !unzip -n ./data/ieee-fraud-detection.zip -d ./data
# else:
#     print("⚠️ Kaggle API не настроен. Поместите файлы train_transaction.csv, train_identity.csv, "
#           "test_transaction.csv, test_identity.csv и sample_submission.csv в папку ./data")

DATA_DIR = "./data"

# Expected competition files, all resolved relative to DATA_DIR.
train_tr_path, train_id_path, test_tr_path, test_id_path, sub_path = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        "train_transaction.csv",
        "train_identity.csv",
        "test_transaction.csv",
        "test_identity.csv",
        "sample_submission.csv",
    )
)

# Print a check mark or a cross next to each path depending on whether it exists.
for path in (train_tr_path, train_id_path, test_tr_path, test_id_path, sub_path):
    mark = "✓" if os.path.exists(path) else "✗"
    print(mark, path)


## 3. Функции для экономии памяти и чтения

In [None]:

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Integer columns are cast to the narrowest unsigned type when every value
    is non-negative, otherwise to the narrowest signed type. A candidate type
    is accepted only if BOTH the column minimum and maximum fit into it, so
    values are never wrapped around. Float columns go through
    ``pd.to_numeric(..., downcast='float')``. Columns of any other dtype
    (including ``object``) are deliberately left untouched so the caller keeps
    control over categorical handling.

    Args:
        df: DataFrame to shrink; it is modified in place.
        verbose: When true, print memory usage before and after.

    Returns:
        The same ``df`` object, for call chaining.
    """
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        kind = df[col].dtype.kind
        if kind in ('i', 'u'):
            if df[col].size == 0:
                # min()/max() of an empty column are NaN; nothing to downcast.
                continue
            # Plain Python ints make the bound comparisons exact for every width.
            c_min = int(df[col].min())
            c_max = int(df[col].max())
            candidates = unsigned_types if c_min >= 0 else signed_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        print(f"Mem. {start_mem:.2f} MB → {end_mem:.2f} MB")
    return df

def read_csv_safely(path):
    """Read the CSV at ``path`` with pandas' own dtype inference, then downcast.

    The frame is passed through ``reduce_mem_usage`` and that function's
    return value is handed back to the caller.
    """
    return reduce_mem_usage(pd.read_csv(path))


## 4. Загрузка и объединение данных

In [None]:

# Load the four competition tables; numeric columns are downcast while reading.
train_tr = read_csv_safely(train_tr_path)
train_id = read_csv_safely(train_id_path)
test_tr  = read_csv_safely(test_tr_path)
test_id  = read_csv_safely(test_id_path)

# In the Kaggle release test_identity.csv names its columns "id-01", "id-02", ...
# while train_identity.csv uses "id_01", "id_02", ... Without this rename the
# test frame lacks the training feature names and `test[features]` fails later.
# The rename is a no-op when the names already use underscores.
test_id = test_id.rename(
    columns=lambda name: 'id_' + name[3:] if name.startswith('id-') else name
)

# Left joins keep every transaction; identity columns are NaN when absent.
train = train_tr.merge(train_id, how='left', on='TransactionID')
test  = test_tr.merge(test_id,  how='left', on='TransactionID')

print("train:", train.shape, "test:", test.shape)
train.head(3)


## 5. Быстрый EDA

In [None]:

target_col = 'isFraud'

# Class balance: mean of the binary target.
fraud_rate = train[target_col].mean()
print("Target mean:", fraud_rate)

# The ten columns with the largest share of missing values.
print("Missing ratio (train top 10):")
miss = train.isna().mean().sort_values(ascending=False).head(10)
display(miss.to_frame('missing_ratio').T)

# Column counts by broad dtype family.
n_numeric = train.select_dtypes(include=[np.number]).shape[1]
n_object = train.select_dtypes(include=['object']).shape[1]
print("numeric cols:", n_numeric, "object cols:", n_object)

# Rough heuristic: every object-dtype column is a categorical candidate.
candidate_cats = [name for name, dtype in train.dtypes.items() if dtype == 'object']
print("candidate categorical:", len(candidate_cats))
candidate_cats[:20]


## 6. Базовый Feature Engineering (временные фичи, логи, частоты)

In [None]:

def add_time_features(df):
    """Add calendar-like features derived from ``TransactionDT`` in place.

    ``TransactionDT`` is treated as a count of seconds from an arbitrary
    "zero" moment. When the column is present, four columns are added:
    ``DT`` (a copy of it), ``DT_day`` (whole days, int32), ``DT_hour``
    (hour within the day, int16) and ``DT_dayofweek`` (``DT_day`` modulo 7,
    int8). Without the column the frame is returned unchanged.
    """
    if 'TransactionDT' not in df.columns:
        return df

    seconds = df['TransactionDT']
    day_index = (seconds // 86400).astype('int32')

    df['DT'] = seconds
    df['DT_day'] = day_index
    df['DT_hour'] = ((seconds // 3600) % 24).astype('int16')
    df['DT_dayofweek'] = (day_index % 7).astype('int8')
    return df

def add_amount_features(df):
    """Add ``TransactionAmt_log1p`` (log(1 + amount)) in place when possible.

    The amount is converted to float before the transform. If the frame has
    no ``TransactionAmt`` column it is returned unchanged.
    """
    if 'TransactionAmt' not in df.columns:
        return df

    amount = df['TransactionAmt'].astype(float)
    df['TransactionAmt_log1p'] = np.log1p(amount)
    return df

def freq_encode(train, test, cols):
    """Attach ``<col>_fq`` count features to both frames, in place.

    For every column in ``cols`` the value counts are computed on ``train``
    only (missing values are counted as their own value) and then mapped onto
    the same column of ``train`` and of ``test``.

    Returns:
        The ``(train, test)`` pair that was passed in.
    """
    for col in cols:
        counts = train[col].value_counts(dropna=False)
        fq_name = col + '_fq'
        for frame in (train, test):
            frame[fq_name] = frame[col].map(counts)
    return train, test

# Derive the time and amount features for both frames (time first, then amount).
train = add_amount_features(add_time_features(train))
test = add_amount_features(add_time_features(test))

# Example frequency encodings for card1 / addr1 / e-mail domains, limited to
# the columns that are actually present in train.
freq_candidates = ('card1', 'addr1', 'P_emaildomain', 'R_emaildomain')
freq_cols = [name for name in freq_candidates if name in train.columns]
train, test = freq_encode(train, test, freq_cols)


## 7. Выбор признаков и временной сплит train/valid

In [None]:

# Columns kept out of the feature matrix:
#   * the target and the row identifier;
#   * absolute time columns (TransactionDT, its verbatim copy DT, and DT_day).
#     With a time-based split every validation/test row has a time value beyond
#     the training range, so splits learned on them cannot generalise.
#     The cyclic features DT_hour / DT_dayofweek stay in.
drop_cols = [target_col, 'TransactionID', 'TransactionDT', 'DT', 'DT_day']

features = [c for c in train.columns if c not in drop_cols]

# Time-based split on TransactionDT: the most recent tail is the validation set.
# An explicit raise is used because `assert` is stripped under `python -O`.
if 'TransactionDT' not in train.columns:
    raise KeyError("TransactionDT column is required for the time-based split")
cutoff = np.quantile(train['TransactionDT'], 0.85)  # 85% train, 15% valid
trn_idx = train['TransactionDT'] < cutoff
val_idx = ~trn_idx

X_tr = train.loc[trn_idx, features].reset_index(drop=True)
y_tr = train.loc[trn_idx, target_col].astype('int8').reset_index(drop=True)
X_va = train.loc[val_idx, features].reset_index(drop=True)
y_va = train.loc[val_idx, target_col].astype('int8').reset_index(drop=True)

X_te = test[features].reset_index(drop=True)

print(X_tr.shape, X_va.shape, X_te.shape)


## 8. Определение категориальных столбцов

In [None]:

# Heuristic: object-dtype features are categorical, everything else is numeric.
cat_cols = [name for name in features if train[name].dtype == 'object']
num_cols = [name for name in features if train[name].dtype != 'object']

print("num_cols:", len(num_cols), "cat_cols:", len(cat_cols))
cat_cols[:20]


## 9. LightGBM (нативные категории)

In [None]:

def cast_category(df, cols):
    """Convert each column listed in ``cols`` to pandas ``category`` dtype.

    The frame is modified in place and also returned.
    """
    for name in cols:
        as_category = df[name].astype('category')
        df[name] = as_category
    return df

# Work on copies so the shared X_tr / X_va / X_te frames keep their object
# columns for the other models.
X_tr_lgb = cast_category(X_tr.copy(), cat_cols)
X_va_lgb = cast_category(X_va.copy(), cat_cols)
X_te_lgb = cast_category(X_te.copy(), cat_cols)

lgb_train = lgb.Dataset(X_tr_lgb, label=y_tr, categorical_feature=cat_cols, free_raw_data=False)
lgb_valid = lgb.Dataset(X_va_lgb, label=y_va, categorical_feature=cat_cols, free_raw_data=False)

lgb_params = dict(
    objective='binary',
    metric='auc',
    learning_rate=0.05,
    num_leaves=256,
    max_depth=-1,
    subsample=0.8,
    # Row subsampling is only applied when a bagging frequency is set;
    # without it `subsample` is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    min_data_in_leaf=64,
    reg_alpha=0.1,
    reg_lambda=0.1,
    verbose=-1,
    seed=RANDOM_STATE,
)

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of `lgb.train`
# were removed in LightGBM 4.0.
lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train','valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=200),
    ],
)

# Score the hold-out tail with the best iteration found by early stopping.
va_pred_lgb = lgb_model.predict(X_va_lgb, num_iteration=lgb_model.best_iteration)
print("LightGBM AUC:", roc_auc_score(y_va, va_pred_lgb))
print("LightGBM PR-AUC:", average_precision_score(y_va, va_pred_lgb))


In [None]:

# Feature importances: keep the 40 features with the largest gain.
imp = (
    pd.DataFrame({
        'feature': lgb_model.feature_name(),
        'importance': lgb_model.feature_importance(importance_type='gain'),
    })
    .sort_values('importance', ascending=False)
    .head(40)
)

fig, ax = plt.subplots(figsize=(8, 10))
# barh draws from the bottom up, so reverse the order to put the top feature first.
ax.barh(imp['feature'].iloc[::-1], imp['importance'].iloc[::-1])
ax.set_title('LightGBM Feature Importance (gain, top-40)')
fig.tight_layout()
plt.show()


## 10. XGBoost (OOF Target Encoding для категориальных)

In [None]:

def oof_target_encode(X, y, X_valid, X_test, cols, n_splits=5, smoothing=20, random_state=RANDOM_STATE, add_noise=0.0):
    """Replace categorical columns by out-of-fold target encodings.

    Training rows are encoded out of fold: for each of ``n_splits`` shuffled
    K-fold splits a ``ce.TargetEncoder`` is fitted on the other folds and
    applied to the held-out fold, so no row is encoded with an encoder that
    saw its own label. ``X_valid`` and ``X_test`` are encoded by one encoder
    fitted on all of ``X``.

    Args:
        X: Training features; not modified.
        y: Training target, positionally aligned with ``X``.
        X_valid: Validation features; not modified.
        X_test: Test features; not modified.
        cols: Names of the categorical columns to encode.
        n_splits: Number of K-fold splits for the out-of-fold encoding.
        smoothing: ``smoothing`` argument passed to ``ce.TargetEncoder``.
        random_state: Seed for the K-fold shuffle and for the optional noise.
        add_noise: Standard deviation of Gaussian noise added to the TRAINING
            encodings only, as regularisation; ``0`` disables it.

    Returns:
        ``(X_train_enc, X_valid_enc, X_test_enc)``: new frames in which every
        column ``c`` of ``cols`` is dropped and a float32 column ``c + "_te"``
        is appended.
    """
    X = X.copy()
    X_valid = X_valid.copy()
    X_test = X_test.copy()

    # Out-of-fold encodings for the training rows, filled fold by fold.
    oof = pd.DataFrame(index=X.index)
    for c in cols:
        oof[c] = np.nan

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for fit_idx, hold_idx in kf.split(X):
        fold_encoder = ce.TargetEncoder(cols=cols, smoothing=smoothing)
        fold_encoder.fit(X.iloc[fit_idx], y.iloc[fit_idx])
        oof.iloc[hold_idx] = fold_encoder.transform(X.iloc[hold_idx])[cols].values

    # Final encoder on the full training part, used for validation and test.
    te_full = ce.TargetEncoder(cols=cols, smoothing=smoothing)
    te_full.fit(X, y)
    X_valid_te = te_full.transform(X_valid)[cols]
    X_test_te  = te_full.transform(X_test)[cols]

    # Regularising noise, drawn from a generator seeded by `random_state` so
    # repeated calls give identical encodings regardless of the global RNG state.
    if add_noise > 0:
        rng = np.random.default_rng(random_state)
        oof[cols] = oof[cols] + rng.normal(0, add_noise, size=oof[cols].shape)

    # Drop the raw categorical columns (drop() already returns a new frame)
    # and append the encoded "_te" variants.
    X_te_tr = X.drop(columns=cols)
    X_te_va = X_valid.drop(columns=cols)
    X_te_te = X_test.drop(columns=cols)
    for c in cols:
        X_te_tr[c + "_te"] = oof[c].astype(np.float32)
        X_te_va[c + "_te"] = X_valid_te[c].astype(np.float32)
        X_te_te[c + "_te"] = X_test_te[c].astype(np.float32)

    return X_te_tr, X_te_va, X_te_te

# Categorical columns are replaced by (noisy) out-of-fold target encodings.
X_tr_xgb, X_va_xgb, X_te_xgb = oof_target_encode(X_tr, y_tr, X_va, X_te, cat_cols, n_splits=5, smoothing=20, add_noise=0.01)

dtr = xgb.DMatrix(X_tr_xgb, label=y_tr)
dva = xgb.DMatrix(X_va_xgb, label=y_va)
dte = xgb.DMatrix(X_te_xgb)

xgb_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=1.0,
    tree_method='hist',
    seed=RANDOM_STATE,
)

# The validation set is listed last: early stopping monitors the last entry of `evals`.
xgb_model = xgb.train(
    xgb_params,
    dtr,
    num_boost_round=20000,
    evals=[(dtr,'train'), (dva,'valid')],
    early_stopping_rounds=300,
    verbose_eval=200
)

# `Booster.best_ntree_limit` was removed in XGBoost 2.0. `best_iteration` is
# zero-based, so the half-open range ends at best_iteration + 1.
va_pred_xgb = xgb_model.predict(dva, iteration_range=(0, xgb_model.best_iteration + 1))
print("XGBoost AUC:", roc_auc_score(y_va, va_pred_xgb))
print("XGBoost PR-AUC:", average_precision_score(y_va, va_pred_xgb))


## 11. CatBoost (нативные категории, Ordered Target Statistics)

In [None]:

# Positions of the categorical features among the X_* columns.
cat_idx = [X_tr.columns.get_loc(c) for c in cat_cols]


def _catboost_ready(df, cols, missing_token='__MISSING__'):
    """Return a copy of ``df`` whose ``cols`` are plain strings without NaN.

    CatBoost rejects NaN / float values in categorical features, so missing
    entries are replaced by ``missing_token`` before the cast to ``str``.
    """
    out = df.copy()
    for c in cols:
        out[c] = out[c].fillna(missing_token).astype(str)
    return out


X_tr_cb = _catboost_ready(X_tr, cat_cols)
X_va_cb = _catboost_ready(X_va, cat_cols)
X_te_cb = _catboost_ready(X_te, cat_cols)

train_pool = Pool(data=X_tr_cb, label=y_tr, cat_features=cat_idx)
valid_pool = Pool(data=X_va_cb, label=y_va, cat_features=cat_idx)
test_pool  = Pool(data=X_te_cb, cat_features=cat_idx)

cb_model = CatBoostClassifier(
    iterations=20000,
    learning_rate=0.05,
    depth=8,
    loss_function='Logloss',
    eval_metric='AUC',
    l2_leaf_reg=3.0,
    random_seed=RANDOM_STATE,
    od_type='Iter',
    od_wait=300,
    verbose=200
)

# use_best_model trims the model to the best iteration on the eval set.
cb_model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

va_pred_cb = cb_model.predict_proba(valid_pool)[:,1]
print("CatBoost AUC:", roc_auc_score(y_va, va_pred_cb))
print("CatBoost PR-AUC:", average_precision_score(y_va, va_pred_cb))


## 12. Сравнение и блендинг

In [None]:

from sklearn.metrics import roc_curve

# Validation predictions of the three models, keyed by display name.
val_preds = {
    'LightGBM': va_pred_lgb,
    'XGBoost': va_pred_xgb,
    'CatBoost': va_pred_cb,
}
scores = {model_name: roc_auc_score(y_va, pred) for model_name, pred in val_preds.items()}
print("AUC scores:", scores)

# Simple blend: unweighted mean of the three predictions.
va_blend = (va_pred_lgb + va_pred_xgb + va_pred_cb) / 3.0
for label, metric in (("Blend AUC:", roc_auc_score), ("Blend PR-AUC:", average_precision_score)):
    print(label, metric(y_va, va_blend))


## 13. Предсказание на test и сабмит (без переобучения на всём трейне)

In [None]:

# For a final submission the models could be refitted on the whole training
# part (or on the full train). This cell only runs inference on test with the
# models trained above and averages their predictions.

# LightGBM
te_pred_lgb = lgb_model.predict(X_te_lgb, num_iteration=lgb_model.best_iteration)

# XGBoost: `best_ntree_limit` was removed in XGBoost 2.0; `best_iteration` is
# zero-based, hence the + 1 for the half-open range.
te_pred_xgb = xgb_model.predict(dte, iteration_range=(0, xgb_model.best_iteration + 1))

# CatBoost
te_pred_cb = cb_model.predict_proba(test_pool)[:,1]

# Blend: unweighted mean of the three models.
te_blend = (te_pred_lgb + te_pred_xgb + te_pred_cb) / 3.0

sub = pd.read_csv(sub_path)
# Attach predictions by TransactionID rather than by position, so a differently
# ordered sample submission cannot silently misalign the rows.
pred_by_id = pd.Series(te_blend, index=test['TransactionID'].to_numpy())
sub['isFraud'] = sub['TransactionID'].map(pred_by_id)
if sub['isFraud'].isna().any():
    raise ValueError("sample_submission contains TransactionID values that are missing from test")
out_path = "./ieee_blend_submission.csv"
sub.to_csv(out_path, index=False)
out_path



## 14. Советы по улучшению
- Более аккуратный **time-based CV** (rolling/expanding window).
- Более богатый **feature engineering** (emaildomain группировки, браузеры/ОС, расстояния D*, взаимодействия C/V/*, device info).
- **Hyperopt/Optuna** для подбора гиперпараметров.
- Раздельные модели по «сценам» (например, по наличию identity-фичей).
- Построение **SHAP**/Permutation importance для интерпретации.
- Учет **class weights** (дисбаланс) и **focal loss** (в CatBoost/XGB).
