ml 02.11

 ⁃ bias variance 

Сложный датасет с нелинейными данными и большим объёмом.

Анализ данных:
 1. пропуски, 
 2. константные признаки, 
 3. корреляция
 4. zero split method: на всех признаках строим любой максимально глубокий бустинг с подбором параметров (много глубоких деревьев), после чего выводим feature importance и удаляем признаки с нулевой важностью.
 5. для каждой пары скоррелированных признаков выводим корреляцию с таргетом и удаляем тот, у которого меньше.
 6. Все три вида бустинга: LightGBM, XGBoost, CatBoost (из официальных библиотек). Для каждого используем любой удобный подбор параметров на валидации (разбиение 70+15+15). Тестируем на тестовой части данных.
 7. Проверка всех метрик: accuracy, recall, precision, ROC AUC; поиграться с cut-off (построить кривую precision/recall и подобрать порог).

In [4]:
#!pip install kaggle

In [5]:
#!kaggle competitions download -c ieee-fraud-detection
#!python -m venv sklearn-env
#!sklearn-env\Scripts\activate
#!pip install -U scikit-learn

In [6]:
import os, gc, math, json, warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and library warnings — consider
# narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, roc_curve
)
from sklearn.model_selection  import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [9]:
#!pip install lightgbm

In [10]:
#!pip install xgboost

In [11]:
#!pip install catboost

In [12]:
#!pip install category-encoders

In [13]:
import category_encoders as ce

In [14]:
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

# Fixed seed, applied here to NumPy's global RNG so NumPy-based randomness is
# repeatable between runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [15]:
# Directory that holds the competition CSV files.
DATA_DIR = "ieee-fraud-detection"

# Full paths to each CSV, built in one pass from the file names.
train_tr_path, train_id_path, test_tr_path, test_id_path, sub_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "train_transaction.csv",
        "train_identity.csv",
        "test_transaction.csv",
        "test_identity.csv",
        "sample_submission.csv",
    )
)



In [16]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns are cast to the smallest unsigned type (when every value is
    non-negative) or signed type whose inclusive range covers BOTH the column
    minimum and maximum. Float columns go through
    ``pd.to_numeric(..., downcast='float')``. Columns of any other dtype
    (e.g. ``object``) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the memory footprint before and after.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Candidate dtypes ordered from narrowest to widest.
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    # deep=True accounts for the size of the stored values themselves,
    # not only of the containers.
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type.kind in ('i', 'u'):
            if df[col].empty:
                # No values means no range to inspect; keep the dtype as is.
                continue
            # Plain Python ints make the range comparisons exact for any width.
            c_min = int(df[col].min())
            c_max = int(df[col].max())
            candidates = unsigned_types if c_min >= 0 else signed_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Both ends must fit: checking only the minimum would let a
                # large maximum overflow the chosen type and wrap around.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f':
            df[col] = pd.to_numeric(df[col], downcast='float')
        # Object columns are deliberately not converted to category here,
        # so the caller keeps control over categorical handling.
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        print(f"Mem. {start_mem:.2f} MB → {end_mem:.2f} MB")
    return df

def read_csv_safely(path):
    """Read the CSV at ``path`` and return it after numeric downcasting.

    No explicit dtypes are passed, so pandas infers them; the resulting frame
    is then handed to ``reduce_mem_usage``.
    """
    raw_frame = pd.read_csv(path)
    compact_frame = reduce_mem_usage(raw_frame)
    return compact_frame


In [17]:
# Load the four tables in a fixed order: train transaction, train identity,
# test transaction, test identity.
train_tr, train_id, test_tr, test_id = (
    read_csv_safely(csv_path)
    for csv_path in (train_tr_path, train_id_path, test_tr_path, test_id_path)
)

# Left-join identity onto transactions so every transaction row is kept.
train = pd.merge(train_tr, train_id, how='left', on='TransactionID')
test = pd.merge(test_tr, test_id, how='left', on='TransactionID')

print("train:", train.shape, "test:", test.shape)
train.head(3)

Mem. 2062.07 MB → 1203.22 MB
Mem. 143.14 MB → 129.94 MB
Mem. 1771.84 MB → 1038.31 MB
Mem. 140.08 MB → 127.09 MB
train: (590540, 434) test: (506691, 433)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the merged train frame.
train.describe()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
count,590540.0,590540.0,590540.0,590540.0,590540.0,581607.0,588975.0,586281.0,524834.0,524834.0,...,139369.0,45113.0,139318.0,139261.0,5159.0,5169.0,4747.0,5132.0,5163.0,77586.0
mean,3282270.0,0.03499,7372311.0,135.027161,9898.734658,362.555511,153.194946,199.2789,290.733826,86.800652,...,189.45137,14.237337,353.128174,403.882568,368.269806,16.002708,12.800927,329.608917,149.070312,26.508596
std,170474.4,0.183755,4617224.0,239.162689,4901.170153,157.817963,11.343591,41.332325,101.700386,2.77353,...,30.377136,1.561116,141.11203,152.158493,198.849014,6.897755,2.372468,97.462585,32.101933,3.737366
min,2987000.0,0.0,86400.0,0.251,1000.0,100.0,100.0,100.0,100.0,10.0,...,100.0,10.0,100.0,100.0,100.0,10.0,11.0,100.0,100.0,0.0
25%,3134635.0,0.0,3027058.0,43.320999,6019.0,214.0,150.0,166.0,204.0,87.0,...,166.0,13.0,266.0,256.0,252.0,14.0,11.0,321.0,119.0,24.0
50%,3282270.0,0.0,7306528.0,68.769001,9678.0,361.0,150.0,226.0,299.0,87.0,...,166.0,15.0,341.0,472.0,252.0,14.0,11.0,321.0,149.0,24.0
75%,3429904.0,0.0,11246620.0,125.0,14184.0,512.0,150.0,226.0,330.0,87.0,...,225.0,15.0,427.0,533.0,486.5,14.0,15.0,371.0,169.0,32.0
max,3577539.0,1.0,15811130.0,31937.390625,18396.0,600.0,231.0,237.0,540.0,102.0,...,229.0,29.0,671.0,661.0,854.0,44.0,26.0,548.0,216.0,32.0


In [19]:
# Scratch check: display the representable range of int16.
np.iinfo(np.int16)

iinfo(min=-32768, max=32767, dtype=int16)

In [41]:
target_col = 'isFraud'
# Mean of the binary target, i.e. the share of rows labelled 1.
print(train[target_col].mean())

# Ten columns with the highest share of missing values.
miss = train.isna().mean().sort_values(ascending=False).head(10)
# A bare expression in the middle of a cell is evaluated and thrown away,
# so the table has to be printed explicitly to show up in the output.
print(miss.to_frame('miss_ratio'))

print('tot_cols: ', train.shape[1])
print('num_cols: ', train.select_dtypes(include=[np.number]).shape[1])
print('obj_cols: ', train.select_dtypes(include=['object']).shape[1])

# Object-dtype columns are the candidates for categorical handling.
cat_candidates = [c for c in train.columns if train[c].dtype == 'object']
cat_candidates

0.03499000914417313
tot_cols:  434
num_cols:  403
obj_cols:  31


['ProductCD',
 'card4',
 'card6',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'id_12',
 'id_15',
 'id_16',
 'id_23',
 'id_27',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38',
 'DeviceType',
 'DeviceInfo']

In [None]:

# TransactionDT — секунды с начала «нулевого» времени. Сделаем фичи:
def add_time_features(df):
    """Add columns derived from ``TransactionDT`` to ``df`` in place.

    When the column is present, four columns are written: ``DT`` (a copy of
    ``TransactionDT``), ``DT_day`` (whole 86400-second periods, int32),
    ``DT_hour`` (whole 3600-second periods modulo 24, int16) and
    ``DT_dayofweek`` (``DT_day`` modulo 7, int8). When it is absent the frame
    is returned unchanged.

    Returns the same ``df`` object.
    """
    if 'TransactionDT' not in df.columns:
        return df

    seconds_per_hour = 60 * 60
    seconds_per_day = 24 * seconds_per_hour

    df['DT'] = df['TransactionDT']
    elapsed = df['DT']
    df['DT_day'] = (elapsed // seconds_per_day).astype('int32')
    df['DT_hour'] = ((elapsed // seconds_per_hour) % 24).astype('int16')
    # Position inside a 7-day cycle counted from the time origin.
    df['DT_dayofweek'] = (df['DT_day'] % 7).astype('int8')
    return df

def add_amount_features(df):
    """Add ``TransactionAmt_log1p`` = log(1 + TransactionAmt) to ``df`` in place.

    The amount is cast to float before the transform. If ``TransactionAmt`` is
    absent the frame is returned unchanged. Returns the same ``df`` object.
    """
    if 'TransactionAmt' not in df.columns:
        return df

    amount = df['TransactionAmt'].astype(float)
    df['TransactionAmt_log1p'] = np.log1p(amount)
    return df

def freq_encode(train, test, cols):
    """Add a ``<col>_fq`` frequency column to both frames for each column in ``cols``.

    Frequencies are counted on ``train`` only (missing values are counted as
    their own entry) and then mapped onto both ``train`` and ``test``. Both
    frames are modified in place and returned as a ``(train, test)`` tuple.
    """
    for col in cols:
        train_counts = train[col].value_counts(dropna=False)
        encoded_name = col + '_fq'
        for frame in (train, test):
            frame[encoded_name] = frame[col].map(train_counts)
    return train, test

# Time-derived features first, then amount features; train is processed
# before test at each step.
train, test = (add_time_features(split) for split in (train, test))
train, test = (add_amount_features(split) for split in (train, test))

# Example frequency encodings for card1 / addr1 / email domains, restricted to
# the columns that are actually present in train.
freq_cols = [
    name
    for name in ['card1', 'addr1', 'P_emaildomain', 'R_emaildomain']
    if name in train.columns
]
train, test = freq_encode(train, test, freq_cols)
