ml 02.11

 ⁃ bias variance 

Сложный датасет: нелинейные данные и большой объём.

Анализ данных:
 1. пропуски, 
 2. константные признаки, 
 3. корреляция
 4. zero split method: на всех признаках строим любой максимально глубокий бустинг с подбором параметров (много глубоких деревьев); после этого выводим feature importance и удаляем признаки с нулевой важностью.
 5. для каждой пары скоррелированных признаков выводим корреляцию с таргетом и удаляем тот, у которого меньше.
 6. Все три вида бустинга: LightGBM, XGBoost, CatBoost (из официальных библиотек). Для каждого используем любой удобный подбор параметров на валидации (сплит 70/15/15). Тестируем на тестовых данных.
 7. Проверка всех метрик: accuracy, recall, precision, ROC AUC; поиграться с cut-off (построить кривую precision/recall и подобрать порог).

In [1]:
#!pip install kaggle

In [2]:
#!kaggle competitions download -c ieee-fraud-detection
#!python -m venv sklearn-env
#!sklearn-env\Scripts\activate с
#!pip install -U scikit-learn

In [3]:
import os, gc, math, json, re, warnings
warnings.filterwarnings('ignore')

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
from sklearn.metrics import (
    accuracy_score, average_precision_score, precision_score, recall_score,
    roc_auc_score, roc_curve
)
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [6]:
#!pip install lightgbm

In [7]:
#!pip install xgboost

In [8]:
#!pip install catboost

In [9]:
#!pip install category-encoders

In [10]:
import category_encoders as ce

In [11]:
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

# Single seed reused across the notebook; it is also pushed into NumPy's
# global RNG so np.random.* calls are repeatable between runs.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [12]:
# Folder that holds the unpacked competition CSV files.
DATA_DIR = "ieee-fraud-detection"

# Resolve every input file relative to DATA_DIR in a single pass.
(
    train_tr_path,
    train_id_path,
    test_tr_path,
    test_id_path,
    sub_path,
) = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "train_transaction.csv",
        "train_identity.csv",
        "test_transaction.csv",
        "test_identity.csv",
        "sample_submission.csv",
    )
)


In [13]:
def _narrowest_int_dtype(lo, hi):
    """Return the smallest NumPy integer dtype whose range contains [lo, hi]."""
    if lo >= 0:
        family = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        family = (np.int8, np.int16, np.int32, np.int64)
    for candidate in family:
        bounds = np.iinfo(candidate)
        # Both ends of the value range must fit, not just the minimum.
        if bounds.min <= lo and hi <= bounds.max:
            return candidate
    return family[-1]


def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of *df* in place to save memory.

    Integer columns are cast to the narrowest integer dtype that holds both
    the column minimum and maximum (unsigned when the minimum is >= 0).
    Float columns go through ``pd.to_numeric(..., downcast='float')``.
    Object columns are left untouched because they are not necessarily
    categorical features.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: When true, print the memory footprint before and after.

    Returns:
        The same DataFrame object, for call chaining.
    """
    # deep=True also counts the payload of object values, not only the
    # container overhead.
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        series = df[col]
        kind = series.dtype.kind
        if kind in ('i', 'u'):
            if series.empty:
                # No values means no min/max to size the dtype from.
                continue
            # Plain Python ints compare exactly against the iinfo bounds.
            lo, hi = int(series.min()), int(series.max())
            df[col] = series.astype(_narrowest_int_dtype(lo, hi))
        elif kind == 'f':
            df[col] = pd.to_numeric(series, downcast='float')
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        print(f"Mem. {start_mem:.2f} MB → {end_mem:.2f} MB")
    return df

def read_csv_safely(path):
    """Read the CSV at *path* and return it after reduce_mem_usage downcasting."""
    # No dtype is passed, so pandas infers the column types itself; the
    # downcast is applied to the loaded frame afterwards.
    return reduce_mem_usage(pd.read_csv(path))


In [14]:
def fix_id_columns(df):
    """Return *df* with columns named ``id-<digits>`` renamed to ``id_<digits>``.

    Columns that do not match the pattern keep their names. The input frame
    is not modified; ``DataFrame.rename`` returns a new one.
    """
    new_names = {
        name: re.sub(r'^id-(\d+)$', r'id_\1', name)
        for name in df.columns
    }
    return df.rename(columns=new_names)

In [15]:
# Load the four competition tables and the sample submission with pandas'
# default dtype inference.
train_tr, train_id, test_tr, test_id, sub = map(
    pd.read_csv,
    (train_tr_path, train_id_path, test_tr_path, test_id_path, sub_path),
)

# Left-join the identity table onto the transactions (every transaction row is
# kept), then normalise the id column names so train and test agree.
train = fix_id_columns(pd.merge(train_tr, train_id, how='left', on='TransactionID'))
test = fix_id_columns(pd.merge(test_tr, test_id, how='left', on='TransactionID'))

print("train:", train.shape, "test:", test.shape, "sub", sub.shape)

train: (590540, 434) test: (506691, 433) sub (506691, 2)


In [16]:
target_col = 'isFraud'

# Share of positive (fraud) rows in the training set.
print(train[target_col].mean())

# Ten columns with the highest fraction of missing values.
miss = (
    train.isna()
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
# NOTE(review): the frame built here is discarded (not the last expression
# of the cell), so only the print below shows the ratios.
miss.to_frame('miss_ratio')

print(miss)

# Column counts: total, numeric and object-typed.
print('tot_cols: ', len(train.columns))
print('num_cols: ', len(train.select_dtypes(include=[np.number]).columns))
print('obj_cols: ', len(train.select_dtypes(include=['object']).columns))

# Object-typed columns are the candidates for categorical handling.
cat_candidates = [
    name for name in train.columns
    if train[name].dtype == 'object'
]

0.03499000914417313
id_24    0.991962
id_25    0.991310
id_07    0.991271
id_08    0.991271
id_21    0.991264
id_26    0.991257
id_22    0.991247
id_27    0.991247
id_23    0.991247
dist2    0.936284
dtype: float64
tot_cols:  434
num_cols:  403
obj_cols:  31


In [17]:
# TransactionDT is treated as seconds elapsed since some "zero" moment;
# derive calendar-like features from it.
def add_time_features(df):
    """Add DT, DT_day, DT_hour and DT_dayofweek columns derived from TransactionDT.

    The frame is modified in place and returned. Frames without a
    ``TransactionDT`` column are returned unchanged.
    """
    if 'TransactionDT' not in df.columns:
        return df

    seconds = df['TransactionDT']
    df['DT'] = seconds
    # Whole days since the zero moment.
    df['DT_day'] = (seconds // (24*60*60)).astype('int32')
    # Hour within the day, 0..23.
    df['DT_hour'] = ((seconds // (60*60)) % 24).astype('int16')
    # Position in a 7-day cycle; which weekday 0 maps to is not known here.
    df['DT_dayofweek'] = (df['DT_day'] % 7).astype('int8')
    return df

def add_amount_features(df):
    """Add ``TransactionAmt_log1p`` = log(1 + TransactionAmt) to *df* in place.

    Frames without a ``TransactionAmt`` column are returned unchanged.
    """
    if 'TransactionAmt' not in df.columns:
        return df

    amounts = df['TransactionAmt'].astype(float)
    df['TransactionAmt_log1p'] = np.log1p(amounts)
    return df

def freq_encode(train, test, cols):
    """Add a ``<col>_fq`` frequency column to both frames for every col in *cols*.

    Frequencies are counted on *train* only (missing values included) and
    mapped onto both frames, so values unseen in train become NaN in test.
    Both frames are modified in place and returned as ``(train, test)``.
    """
    for name in cols:
        counts = train[name].value_counts(dropna=False)
        for frame in (train, test):
            frame[name + '_fq'] = frame[name].map(counts)
    return train, test

# Run both feature builders over each split; they work per frame, so the
# order in which the splits are processed does not matter.
train, test = (
    add_amount_features(add_time_features(frame))
    for frame in (train, test)
)

# Примеры частотных энкодингов для card1/addr1/emaildomain при наличии
# freq_cols = [c for c in ['card1','addr1','P_emaildomain','R_emaildomain'] if c in train.columns]
# train, test = freq_encode(train, test, freq_cols)


In [18]:
# Everything except the target and the row identifier is a model feature.
drop_cols = [target_col, 'TransactionID']
features = [c for c in train.columns if c not in drop_cols]

# Time-based split on TransactionDT: rows at or beyond the 85% quantile form
# the validation tail, everything earlier is used for fitting.
cutoff = np.quantile(train['TransactionDT'], 0.85)
trn_idx = train['TransactionDT'] < cutoff
val_idx = ~trn_idx

X_tr, X_va = (
    train.loc[mask, features].reset_index(drop=True)
    for mask in (trn_idx, val_idx)
)
y_tr, y_va = (
    train.loc[mask, target_col].astype('int8').reset_index(drop=True)
    for mask in (trn_idx, val_idx)
)

# The test frame gets the same feature columns in the same order.
X_te = test.loc[:, features].reset_index(drop=True)


In [23]:
# Summary statistics of the training features, shown as the cell output.
X_tr.describe()

Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,id_22,id_24,id_25,id_26,id_32,DT,DT_day,DT_hour,DT_dayofweek,TransactionAmt_log1p
count,501959.0,501959.0,501959.0,494639.0,501136.0,498706.0,445541.0,445541.0,198630.0,33283.0,...,4524.0,4153.0,4495.0,4518.0,68562.0,501959.0,501959.0,501959.0,501959.0,501959.0
mean,6126156.0,134.652819,9873.604133,362.801326,153.259281,199.572179,290.604427,86.780525,119.980345,236.128083,...,16.020778,12.773417,329.757508,148.590527,26.572387,6126156.0,70.307692,13.823922,2.944398,4.383301
std,3824121.0,237.845691,4901.362169,157.956761,11.418906,40.938366,101.891595,2.819424,372.595205,538.79091,...,6.933404,2.244228,99.068855,32.371031,3.762732,3824121.0,44.260131,7.658789,2.018744,0.935103
min,86400.0,0.251,1000.0,100.0,100.0,100.0,100.0,10.0,0.0,0.0,...,10.0,11.0,100.0,100.0,0.0,86400.0,1.0,0.0,0.0,0.223943
25%,2487993.0,42.9775,6019.0,214.0,150.0,166.0,204.0,87.0,3.0,7.0,...,14.0,11.0,321.0,119.0,24.0,2487993.0,28.0,6.0,1.0,3.783678
50%,5973411.0,68.911,9633.0,361.0,150.0,226.0,299.0,87.0,8.0,37.0,...,14.0,11.0,321.0,147.0,24.0,5973411.0,69.0,16.0,3.0,4.247223
75%,9409264.0,125.0,14135.0,512.0,150.0,226.0,330.0,87.0,25.0,218.0,...,14.0,15.0,371.0,169.0,32.0,9409264.0,108.0,20.0,5.0,4.836282
max,13151840.0,31937.391,18396.0,600.0,231.0,237.0,540.0,102.0,10286.0,11623.0,...,44.0,26.0,548.0,216.0,32.0,13151840.0,152.0,23.0,6.0,10.371564


In [19]:
# Object-typed features are handled as categorical; every other feature is
# treated as numeric.
cat_cols = [c for c in features if train[c].dtype == 'object']
num_cols = [c for c in features if c not in cat_cols]

# print("num_cols:", len(num_cols), "cat_cols:", len(cat_cols))
# cat_cols[:20]


In [None]:
# Convert object columns to the pandas 'category' dtype
def cast_category(df, cols):
    """Cast every column in *cols* of *df* to 'category', in place; return *df*."""
    for name in cols:
        column = df[name]
        df[name] = column.astype('category')
    return df

# Cast the categorical columns on copies so the shared X_tr / X_va / X_te
# frames keep their object columns for the other models.
X_tr_lgb = cast_category(X_tr.copy(), cat_cols)
X_va_lgb = cast_category(X_va.copy(), cat_cols)
X_te_lgb = cast_category(X_te.copy(), cat_cols)

lgb_train = lgb.Dataset(X_tr_lgb, label=y_tr, categorical_feature=cat_cols, free_raw_data=False)
lgb_valid = lgb.Dataset(X_va_lgb, label=y_va, categorical_feature=cat_cols, free_raw_data=False)

lgb_params = dict(
    objective='binary',
    metric='auc',
    learning_rate=0.05,
    num_leaves=256,
    max_depth=-1,
    subsample=0.8,
    # Row subsampling is only active when the bagging frequency is non-zero;
    # without this line subsample=0.8 has no effect.
    subsample_freq=1,
    colsample_bytree=0.8,
    min_data_in_leaf=64,
    reg_alpha=0.1,
    reg_lambda=0.1,
    verbose=-1,
    seed=RANDOM_STATE,
)

# num_boost_round is only an upper bound: the early-stopping callback halts
# training once the validation AUC has not improved for 300 rounds (the same
# patience the XGBoost and CatBoost cells use) and sets
# lgb_model.best_iteration, which the predict calls rely on.
lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train','valid'],
    callbacks=[lgb.early_stopping(stopping_rounds=300)],
)

va_pred_lgb = lgb_model.predict(X_va_lgb, num_iteration=lgb_model.best_iteration)
print("LightGBM AUC:", roc_auc_score(y_va, va_pred_lgb))
print("LightGBM PR-AUC:", average_precision_score(y_va, va_pred_lgb))


In [None]:

# Feature importances: keep the 40 features with the largest total gain
imp = pd.DataFrame({
    'feature': lgb_model.feature_name(),
    'importance': lgb_model.feature_importance(importance_type='gain')
})
imp = imp.sort_values('importance', ascending=False).head(40)

# Rows are reversed so the most important feature is drawn at the top of the
# horizontal bar chart.
plt.figure(figsize=(8, 10))
plt.barh(y=imp['feature'][::-1], width=imp['importance'][::-1])
plt.title('LightGBM Feature Importance (gain, top-40)')
plt.tight_layout()
plt.show()


In [None]:

# Out-of-fold (OOF) target encoding for cat_cols
def oof_target_encode(X, y, X_valid, X_test, cols, n_splits=5, smoothing=20, random_state=RANDOM_STATE, add_noise=0.0):
    """Replace the columns in *cols* with target-encoded ``<col>_te`` features.

    Training rows are encoded out-of-fold: each row's value comes from a
    ``ce.TargetEncoder`` fitted on the other folds of a shuffled ``KFold``.
    Validation and test rows are encoded by one encoder fitted on all of *X*.

    Args:
        X: Training features; not modified (a copy is encoded).
        y: Training target, positionally aligned with *X*.
        X_valid: Validation features; not modified.
        X_test: Test features; not modified.
        cols: Names of the columns to encode and then drop.
        n_splits: Number of KFold splits used for the OOF encoding.
        smoothing: Passed through to ``ce.TargetEncoder``.
        random_state: Seed of the KFold shuffle.
        add_noise: Standard deviation of Gaussian noise added to the training
            OOF values only; 0 disables it. The noise is drawn from NumPy's
            global RNG, not from *random_state*.

    Returns:
        Tuple ``(X_train_enc, X_valid_enc, X_test_enc)`` of new frames in
        which *cols* are dropped and float32 ``<col>_te`` columns are added.
    """
    # One copy per input: these are both the frames the encoders read from
    # and the frames that receive the new columns, so the callers' data stays
    # untouched without a second deep copy.
    X_te_tr = X.copy()
    X_te_va = X_valid.copy()
    X_te_te = X_test.copy()

    oof = pd.DataFrame(index=X_te_tr.index)
    for c in cols:
        oof[c] = np.nan

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for tr_idx, va_idx in kf.split(X_te_tr):
        # The fold encoder is discarded after use; only its OOF output is kept.
        te = ce.TargetEncoder(cols=cols, smoothing=smoothing)
        te.fit(X_te_tr.iloc[tr_idx], y.iloc[tr_idx])
        oof.iloc[va_idx] = te.transform(X_te_tr.iloc[va_idx])[cols].values

    # Final encoder fitted on the full training part, used for valid/test.
    te_full = ce.TargetEncoder(cols=cols, smoothing=smoothing)
    te_full.fit(X_te_tr, y)
    X_valid_te = te_full.transform(X_te_va)[cols]
    X_test_te = te_full.transform(X_te_te)[cols]

    # Optional noise as regularisation of the training encodings.
    if add_noise > 0:
        noise = np.random.normal(0, add_noise, size=oof[cols].shape)
        oof[cols] = oof[cols] + noise

    # Attach the encoded features.
    for c in cols:
        X_te_tr[c + "_te"] = oof[c].astype(np.float32)
        X_te_va[c + "_te"] = X_valid_te[c].astype(np.float32)
        X_te_te[c + "_te"] = X_test_te[c].astype(np.float32)

    # Drop the raw categorical columns so only the encoded variants remain.
    X_te_tr = X_te_tr.drop(columns=cols)
    X_te_va = X_te_va.drop(columns=cols)
    X_te_te = X_te_te.drop(columns=cols)

    return X_te_tr, X_te_va, X_te_te

# Categorical columns are replaced by out-of-fold target encodings before
# the frames are wrapped in DMatrix objects.
X_tr_xgb, X_va_xgb, X_te_xgb = oof_target_encode(X_tr, y_tr, X_va, X_te, cat_cols, n_splits=5, smoothing=20, add_noise=0.01)

dtr = xgb.DMatrix(X_tr_xgb, label=y_tr)
dva = xgb.DMatrix(X_va_xgb, label=y_va)
dte = xgb.DMatrix(X_te_xgb)

xgb_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=1.0,
    tree_method='hist',
    seed=RANDOM_STATE,
)

# 'valid' is listed last in evals, which is the set early stopping monitors.
xgb_model = xgb.train(
    xgb_params,
    dtr,
    num_boost_round=20000,
    evals=[(dtr,'train'), (dva,'valid')],
    early_stopping_rounds=300,
    verbose_eval=200
)

# Booster.best_ntree_limit was removed in XGBoost 2.0. best_iteration is
# 0-based and iteration_range is half-open, hence the +1.
va_pred_xgb = xgb_model.predict(dva, iteration_range=(0, xgb_model.best_iteration + 1))
print("XGBoost AUC:", roc_auc_score(y_va, va_pred_xgb))
print("XGBoost PR-AUC:", average_precision_score(y_va, va_pred_xgb))


In [None]:

# CatBoost only accepts integer or string values in categorical columns, and
# the object columns here contain NaN. Missing values are therefore replaced
# by an explicit token and everything is cast to str on copies, so the shared
# X_tr / X_va / X_te frames stay unchanged.
def _to_catboost_frame(df, cols, missing_token="__NA__"):
    """Return a copy of *df* whose *cols* hold strings with no missing values."""
    out = df.copy()
    for c in cols:
        out[c] = out[c].fillna(missing_token).astype(str)
    return out

X_tr_cb = _to_catboost_frame(X_tr, cat_cols)
X_va_cb = _to_catboost_frame(X_va, cat_cols)
X_te_cb = _to_catboost_frame(X_te, cat_cols)

# CatBoost takes the positions of the categorical features within the frame.
cat_idx = [X_tr_cb.columns.get_loc(c) for c in cat_cols]

train_pool = Pool(data=X_tr_cb, label=y_tr, cat_features=cat_idx)
valid_pool = Pool(data=X_va_cb, label=y_va, cat_features=cat_idx)
test_pool  = Pool(data=X_te_cb, cat_features=cat_idx)

cb_model = CatBoostClassifier(
    iterations=20000,
    learning_rate=0.05,
    depth=8,
    loss_function='Logloss',
    eval_metric='AUC',
    l2_leaf_reg=3.0,
    random_seed=RANDOM_STATE,
    # Overfitting detector: stop after 300 iterations without improvement.
    od_type='Iter',
    od_wait=300,
    verbose=200
)

# use_best_model keeps the iteration that scored best on the eval set.
cb_model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

va_pred_cb = cb_model.predict_proba(valid_pool)[:,1]
print("CatBoost AUC:", roc_auc_score(y_va, va_pred_cb))
print("CatBoost PR-AUC:", average_precision_score(y_va, va_pred_cb))


In [None]:

# average_precision_score is used below, so it is imported here together
# with the existing roc_curve import.
from sklearn.metrics import average_precision_score, roc_curve

# Validation ROC AUC of each individual model.
scores = {
    'LightGBM': roc_auc_score(y_va, va_pred_lgb),
    'XGBoost':  roc_auc_score(y_va, va_pred_xgb),
    'CatBoost': roc_auc_score(y_va, va_pred_cb),
}
print("AUC scores:", scores)

# Simple blend: unweighted mean of the three predicted probabilities.
va_blend = (va_pred_lgb + va_pred_xgb + va_pred_cb) / 3.0
print("Blend AUC:", roc_auc_score(y_va, va_blend))
print("Blend PR-AUC:", average_precision_score(y_va, va_blend))


In [None]:

# For a cleaner result the models could be refit on the whole training part
# (or on the full train set). Below is inference on test with the current
# models, followed by averaging of their predictions.

# LightGBM
te_pred_lgb = lgb_model.predict(X_te_lgb, num_iteration=lgb_model.best_iteration)

# XGBoost: Booster.best_ntree_limit was removed in XGBoost 2.0.
# best_iteration is 0-based and iteration_range is half-open, hence the +1.
te_pred_xgb = xgb_model.predict(dte, iteration_range=(0, xgb_model.best_iteration + 1))

# CatBoost
te_pred_cb = cb_model.predict_proba(test_pool)[:,1]

# Blend: unweighted mean, same as on validation
te_blend = (te_pred_lgb + te_pred_xgb + te_pred_cb) / 3.0

# Re-read the sample submission so the output keeps its row order and columns.
sub = pd.read_csv(sub_path)
sub['isFraud'] = te_blend
out_path = "./ieee_blend_submission.csv"
sub.to_csv(out_path, index=False)
out_path
