In [None]:
%pip install imblearn

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score


# import our custom missing-value filler
from nan_filler import FillNan

Загрузим и подготовим данные

In [None]:
df = pd.read_parquet('merge_df.parquet')

# Split ONCE, before anything is fitted. Previously the data was split (0.3),
# imputed, concatenated and split again (0.33), so rows of the final test set
# had been used to fit the imputer. test_size=0.33 keeps the final proportions.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('isFraud', axis=1),
    df['isFraud'],
    test_size=0.33,
    random_state=42,
)

# Fill missing values: statistics are learned from the training rows only
# and then applied unchanged to the test rows.
filler = FillNan(num_filler="median", cat_filler="constant", drop_highly_missed=True)
filler.fit(X_train)
X_train = filler.transform(X_train)
X_test = filler.transform(X_test)

# One-hot encode train and test together so both get the same dummy columns.
# NOTE(review): assumes FillNan.transform keeps row count and order, as the
# original code did when it concatenated X and y in parallel; confirm.
n_train = len(X_train)
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

X_ohe = pd.get_dummies(X, drop_first=True)

# Cut the encoded frame back at the train/test boundary by position;
# y_train / y_test from the split above already match these rows.
X_train = X_ohe.iloc[:n_train]
X_test = X_ohe.iloc[n_train:]

In [None]:
# Shape of the one-hot-encoded training split as (rows, columns).
X_train.shape

(395661, 2818)

In [None]:
# Persist the held-out split: the loading cell further down reads
# 'X_test.parquet' and 'y_test.parquet', and no other cell writes them,
# so a fresh Restart & Run All needs these two lines to be active.
X_test.to_parquet('X_test.parquet')
pd.DataFrame(y_test).to_parquet('y_test.parquet')

# The un-resampled training split is not read back later; uncomment to keep a copy.
# X_train.to_parquet('X_train.parquet')
# pd.DataFrame(y_train).to_parquet('y_train.parquet')

Сначала при помощи оверсемплинга (SMOTE) восполним меньший класс до пропорции 2 к 1 (законные к мошенническим)

In [None]:
#!c1.8
# Oversample the minority class with SMOTE; sampling_strategy=0.5 targets
# a minority/majority ratio of 0.5 after resampling.
sm = SMOTE(
    sampling_strategy=0.5,
    k_neighbors=5,
    random_state=42,
)
X_train_sampl, y_train_sampl = sm.fit_resample(X_train, y_train)

# Class counts before and after resampling
counts_before = y_train.value_counts()
counts_after = y_train_sampl.value_counts()
print(f'До отбора: \n{counts_before}')
print(f'После отбора: \n{counts_after}')

До отбора: 
0    381821
1     13840
Name: isFraud, dtype: int64
После отбора: 
0    381821
1    190910
Name: isFraud, dtype: int64


Теперь окончательно выровняем количество классов

In [None]:
#!c1.8

# Randomly drop rows with the sampler's default strategy; the printed counts
# below show the resulting class sizes.
rus = RandomUnderSampler(random_state=42)
X_train_sampl2, y_train_sampl2 = rus.fit_resample(X_train_sampl, y_train_sampl)

# Class counts before and after under-sampling
counts_before = y_train_sampl.value_counts()
counts_after = y_train_sampl2.value_counts()
print(f'До отбора: \n{counts_before}')
print(f'\nПосле отбора: \n{counts_after}')

До отбора: 
0    381821
1    190910
Name: isFraud, dtype: int64

После отбора: 
1    190910
0    190910
Name: isFraud, dtype: int64


In [None]:
# Cache the balanced training set on disk so resampling does not have to be repeated.
X_train_sampl2.to_parquet('X_train_sampled.parquet')
# The target is a Series; wrap it into a one-column frame before writing.
y_train_sampl2.to_frame().to_parquet('y_train_sampled.parquet')

In [None]:
#!g1.1
# Reload the cached splits: balanced training data and the untouched test data.
# Targets come back as one-column DataFrames, not Series.
X_train, X_test = (
    pd.read_parquet(path)
    for path in ('X_train_sampled.parquet', 'X_test.parquet')
)
y_train, y_test = (
    pd.read_parquet(path)
    for path in ('y_train_sampled.parquet', 'y_test.parquet')
)

In [None]:
#!g1.1
# Show the training frame that was just reloaded from parquet.
X_train

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D8,D9,D10,D11,D12,D13,D14,D15,V1,V2,...,DeviceInfo_rv:31.0,DeviceInfo_rv:33.0,DeviceInfo_rv:35.0,DeviceInfo_rv:37.0,DeviceInfo_rv:38.0,DeviceInfo_rv:39.0,DeviceInfo_rv:41.0,DeviceInfo_rv:42.0,DeviceInfo_rv:43.0,DeviceInfo_rv:44.0,DeviceInfo_rv:45.0,DeviceInfo_rv:46.0,DeviceInfo_rv:47.0,DeviceInfo_rv:48.0,DeviceInfo_rv:49.0,DeviceInfo_rv:50.0,DeviceInfo_rv:51.0,DeviceInfo_rv:52.0,DeviceInfo_rv:52.9,DeviceInfo_rv:53.0,DeviceInfo_rv:54.0,DeviceInfo_rv:55.0,DeviceInfo_rv:56.0,DeviceInfo_rv:57.0,DeviceInfo_rv:58.0,DeviceInfo_rv:59.0,DeviceInfo_rv:60.0,DeviceInfo_rv:61.0,DeviceInfo_verykool,DeviceInfo_verykoolS5005,DeviceInfo_verykoolS5019,DeviceInfo_verykoolS5524,DeviceInfo_verykoolS5525,DeviceInfo_verykoolS5530 Build/LMY47I,DeviceInfo_verykools4009,DeviceInfo_verykools5004,DeviceInfo_verykools5034,DeviceInfo_verykools5035,DeviceInfo_vivo,DeviceInfo_xs-Z47b7VqTMxs
0,3532405,14401100,54.500000,2157,111.000000,150.000000,226.000000,272.000000,87.0,8.0,1.000000,1.000000,0.0,0.000000,0.0,1.000000,0.000000,0.000000,1.0,0.000000,1.000000,0.000000,1.000000,1.000000,0.000000,97.000000,8.000000,0.000000,10.000000,0.000000,38.708332,0.666666,0.000000,43.0,0.000000,0.000000,0.000000,0.000000,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,3228623,5706887,159.950000,5699,555.000000,150.000000,226.000000,436.000000,87.0,8.0,87.000000,91.000000,0.0,0.000000,79.0,74.000000,0.000000,0.000000,65.0,0.000000,68.000000,0.000000,419.000000,74.000000,81.000000,81.000000,2.000000,26.000000,10.000000,0.000000,38.708332,0.666666,411.000000,42.0,0.000000,0.000000,0.000000,411.000000,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3238552,6015554,68.500000,12932,361.000000,150.000000,226.000000,325.000000,87.0,9.0,1.000000,1.000000,0.0,0.000000,0.0,1.000000,0.000000,0.000000,1.0,0.000000,1.000000,0.000000,1.000000,1.000000,0.000000,97.000000,8.000000,0.000000,10.000000,0.000000,38.708332,0.666666,0.000000,415.0,0.000000,0.000000,0.000000,0.000000,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3291884,7573787,161.000000,6530,206.000000,150.000000,126.000000,264.000000,87.0,8.0,1.000000,1.000000,0.0,0.000000,0.0,1.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,97.000000,0.000000,384.000000,0.000000,0.000000,38.708332,0.666666,0.000000,42.0,0.000000,0.000000,0.000000,0.000000,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,3400038,10433918,108.950000,18132,567.000000,150.000000,117.000000,191.000000,87.0,8.0,1.000000,1.000000,0.0,0.000000,1.0,0.000000,0.000000,0.000000,1.0,0.000000,1.000000,0.000000,1.000000,1.000000,0.000000,97.000000,8.000000,26.000000,10.000000,0.000000,38.708332,0.666666,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381815,3059418,1619690,14.824581,9633,270.806876,185.000000,138.000000,299.000000,87.0,8.0,10.392937,15.571747,0.0,7.937639,0.0,7.937639,7.937639,11.634108,0.0,8.785874,16.027045,16.027045,5.089405,3.392937,8.089405,22.355391,9.696468,8.937639,6.785874,8.937639,32.960202,0.691960,0.000000,42.0,8.937639,0.000000,48.349349,0.000000,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
381816,3537052,14518265,553.619630,8140,453.000000,150.000000,226.000000,485.000000,87.0,8.0,1.000000,1.000000,0.0,0.000000,0.0,1.000000,0.000000,0.000000,1.0,0.000000,1.000000,1.000000,3.000000,1.000000,1.000000,1.000000,0.000000,0.000000,0.000000,0.000000,38.708332,0.666666,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
381817,3109954,2413221,45.666172,6697,350.122294,183.680660,226.000000,299.980081,87.0,8.0,1.000000,1.000000,0.0,1.000000,0.0,1.000000,0.962305,1.000000,0.0,1.000000,1.000000,0.962305,1.000000,1.000000,0.000000,97.000000,8.000000,0.980081,10.000000,0.000000,38.708332,0.666666,0.565431,42.0,0.000000,0.000000,163.591777,1.922467,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
381818,3321920,8257192,51.158174,2851,529.488574,183.552267,226.000000,295.980442,87.0,8.0,1.248183,1.124091,0.0,1.000000,0.0,1.000000,0.958636,1.041364,0.0,1.041364,1.041364,0.958636,0.958636,0.958636,0.000000,97.000000,8.000000,1.075459,10.000000,0.000000,37.133061,0.664943,0.620457,42.0,0.000000,0.000000,0.000000,2.109554,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Стандартизуем данные

In [None]:
#!g1.1
# Learn per-column mean/std on the training features only,
# then apply the same transformation to train and test.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Создадим валидационную выборку

In [None]:
#!g1.1
# Hold out a quarter of the (resampled, scaled) training data for validation.
# The cell overwrites X_train / y_train, so re-running it splits again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=42,
)
X_train.shape, X_val.shape

((286365, 2818), (95455, 2818))

Напишем функции для обучения и оценки нейронной сети

In [None]:
#!g1.1

# Binary cross-entropy on probabilities (the model output is fed to it directly).
loss_func = nn.BCELoss()

def test(model, loader, last):
    """Evaluate `model` on every batch of `loader` without updating its weights.

    ROC-AUC is computed once over all collected scores. Averaging per-batch
    ROC-AUC values (the previous behaviour) is not the ROC-AUC of the dataset
    and raises as soon as one batch happens to contain a single class.

    Args:
        model: module called as ``model(data)``; its output is passed to the
            module-level ``loss_func`` and used as the ranking score.
        loader: iterable yielding ``(data, target)`` batches.
        last: when truthy, also return labels and 0/1 predictions
            (score >= 0.5) for every evaluated sample.

    Returns:
        Tuple ``(loss, roc_auc, true, pred)``: mean of the per-batch losses,
        ROC-AUC over the whole loader, and two lists of ints that stay empty
        unless ``last`` is truthy. For an empty loader both numbers are NaN.

    Raises:
        ValueError: from ``roc_auc_score`` if the whole loader contains only
            one class.
    """
    loss_log = []
    score_chunks = []
    label_chunks = []
    model.eval()

    # Nothing in the evaluation loop needs gradients.
    with torch.no_grad():
        for data, target in loader:
            data = data.to(device)
            target = target.to(device)

            probs = model(data)
            loss_log.append(loss_func(probs, target).item())

            # Flatten so (batch, 1) and (batch,) outputs are handled alike.
            score_chunks.append(probs.cpu().numpy().ravel())
            label_chunks.append(target.cpu().numpy().ravel())

    if not loss_log:
        return float('nan'), float('nan'), [], []

    scores = np.concatenate(score_chunks)
    labels = np.concatenate(label_chunks)
    roc_auc = roc_auc_score(labels, scores)

    true, pred = [], []
    if last:
        pred = (scores >= 0.5).astype(int).tolist()
        true = labels.astype(int).tolist()

    return np.mean(loss_log), roc_auc, true, pred

def train_epoch(model, optimizer, train_loader, last):
    """Run one optimisation pass over `train_loader`.

    Args:
        model: module being trained; called as ``model(batch)``.
        optimizer: optimizer whose ``zero_grad`` / ``step`` are called per batch.
        train_loader: iterable yielding ``(data, target)`` batches.
        last: when truthy, also collect labels and 0/1 predictions
            (score >= 0.5) for every sample seen during the pass.

    Returns:
        Tuple ``(loss_log, roc_auc_log, true, pred)``: per-batch losses,
        per-batch ROC-AUC values, and two lists that stay empty unless
        ``last`` is truthy.
    """
    model.train()
    batch_losses, batch_aucs = [], []
    labels_seen, preds_seen = [], []

    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)

        # Standard step: reset gradients, forward, backward, update.
        optimizer.zero_grad()
        probs = model(features)
        batch_loss = loss_func(probs, labels)
        batch_loss.backward()
        optimizer.step()

        labels_cpu = labels.cpu()
        probs_np = probs.cpu().detach().numpy()
        batch_aucs.append(roc_auc_score(labels_cpu, probs_np))
        batch_losses.append(batch_loss.item())

        if not last:
            continue
        preds_seen.extend(1 if p >= 0.5 else 0 for p in probs_np)
        labels_seen.extend(int(t) for t in labels_cpu)

    return batch_losses, batch_aucs, labels_seen, preds_seen

def train(model, optimizer, n_epochs, train_loader, val_loader, scheduler=None):
    """Train `model` for `n_epochs`, validating and logging after every epoch.

    Each epoch calls ``train_epoch`` and then ``test`` on the validation
    loader and prints loss and ROC-AUC. On the final epoch precision, recall
    and F1 are printed for both train and validation data.

    Args:
        model: module to train.
        optimizer: optimizer passed through to ``train_epoch``.
        n_epochs: number of epochs to run.
        train_loader: loader with training batches.
        val_loader: loader with validation batches.
        scheduler: optional object whose ``step()`` is called once per epoch.

    Returns:
        Tuple ``(train_loss_log, train_roc_auc_log, val_loss_log, val_roc_auc_log)``:
        the train logs hold one entry per batch, the validation logs one per epoch.
    """
    train_loss_log, train_roc_auc_log = [], []
    val_loss_log, val_roc_auc_log = [], []
    separator = '---------' * 5

    for epoch in range(n_epochs):
        # Predictions are only collected on the final epoch.
        is_last = epoch == n_epochs - 1
        train_loss, train_roc_auc, train_true, train_pred = train_epoch(
            model, optimizer, train_loader, last=is_last
        )
        val_loss, val_roc_auc, val_true, val_pred = test(model, val_loader, last=is_last)

        train_loss_log.extend(train_loss)
        train_roc_auc_log.extend(train_roc_auc)
        val_loss_log.append(val_loss)
        val_roc_auc_log.append(val_roc_auc)

        epoch_train_loss = np.mean(train_loss)
        epoch_train_auc = np.mean(train_roc_auc)
        print(f"Epoch {epoch}")
        print(f" train loss: {epoch_train_loss}, train roc_auc: {epoch_train_auc}")
        print(f" val loss: {val_loss}, val roc_auc: {val_roc_auc}\n")

        if is_last:
            final_reports = (
                ('Final train metrics:', epoch_train_auc, train_true, train_pred),
                ('Final val metrics:', val_roc_auc, val_true, val_pred),
            )
            for title, auc, labels, preds in final_reports:
                print(separator)
                print(title)
                print('\t* ROC-AUC:', auc)
                print('\t* Precision:', precision_score(labels, preds))
                print('\t* Recall:', recall_score(labels, preds))
                print('\t* F1:', f1_score(labels, preds))

        if scheduler is not None:
            scheduler.step()

    return train_loss_log, train_roc_auc_log, val_loss_log, val_roc_auc_log

Определим архитектуру нейронной сети

In [None]:
#!g1.1

class FraudNet(nn.Module):
    """Fully connected binary classifier that outputs a probability per row.

    Layers: in_features -> 1500 -> 2000 -> (dropout 0.25) -> 1000 -> 800 -> 1,
    with ReLU between the linear layers and a final Sigmoid, so the output
    lies in [0, 1].

    Args:
        in_features: number of input columns. Defaults to 2818, the width
            that used to be hard-coded, so ``FraudNet()`` behaves as before.
    """

    def __init__(self, in_features=2818):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=1500),
            nn.ReLU(),
            nn.Linear(in_features=1500, out_features=2000),
            nn.ReLU(),
            nn.Dropout(p=0.25),
            nn.Linear(in_features=2000, out_features=1000),
            nn.ReLU(),
            nn.Linear(in_features=1000, out_features=800),
            nn.ReLU(),
            nn.Linear(in_features=800, out_features=1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return the output of the sequential stack, shape ``(batch, 1)`` for ``(batch, in_features)`` input."""
        return self.model(x)

Приводим данные в необходимый вид

In [None]:
#!g1.1

# Wrap the numpy features / one-column target frames into float32 tensors.
train_features = torch.tensor(X_train, dtype=torch.float32)
train_targets = torch.tensor(np.array(y_train), dtype=torch.float32)
val_features = torch.tensor(X_val, dtype=torch.float32)
val_targets = torch.tensor(np.array(y_val), dtype=torch.float32)

train_dataset = TensorDataset(train_features, train_targets)
val_dataset = TensorDataset(val_features, val_targets)

# Both loaders share everything except shuffling (train only).
_loader_options = dict(batch_size=64, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)

Обучаем на GPU (если он недоступен, используем CPU)

In [None]:
#!g1.1
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


Обучим модель на тренировочных данных. Каждую эпоху будем выводить значение функции потерь и roc-auc. В конце выведем все интересующие нас метрики

In [None]:
#!g1.1
# Seed torch so weight init, dropout and loader shuffling are repeatable
# (best effort: some GPU kernels remain non-deterministic).
torch.manual_seed(42)

net = FraudNet().to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

# No LR scheduler is defined anywhere in this notebook, so passing
# `scheduler=scheduler` raised NameError on a fresh kernel. train() defaults
# to scheduler=None; create one here and pass it explicitly if needed.
train_loss_log, train_roc_auc_log, val_loss_log, val_roc_auc_log = train(
    net, optimizer, 8, train_loader, val_loader
)

Epoch 0
 train loss: 0.1545298684949274, train roc_auc: 0.9829483957591046
 val loss: 0.1264393324639159, val roc_auc: 0.9880433519938958

Epoch 1
 train loss: 0.11312901081059482, train roc_auc: 0.9902418864340533
 val loss: 0.11223757623765997, val roc_auc: 0.9906240406822513

Epoch 2
 train loss: 0.10016499227526801, train roc_auc: 0.9928363318013407
 val loss: 0.10324770643315889, val roc_auc: 0.9909169300179506

Epoch 3
 train loss: 0.0894537660738651, train roc_auc: 0.994234819614875
 val loss: 0.09919898165109, val roc_auc: 0.9922002903105268

Epoch 4
 train loss: 0.07644243474852024, train roc_auc: 0.9953804508582066
 val loss: 0.09495557427822035, val roc_auc: 0.9929579745233595

Epoch 5
 train loss: 0.06992296557883894, train roc_auc: 0.9961301573733558
 val loss: 0.09032188893056127, val roc_auc: 0.992983766149072

Epoch 6
 train loss: 0.07639700152677115, train roc_auc: 0.9964361483006382
 val loss: 0.09696399741812332, val roc_auc: 0.9937047834931261

Epoch 7
 train loss: 

Посмотрим результат на тесте

In [None]:
#!g1.1
# Convert the held-out features / targets to float32 tensors on the training device.
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test = torch.tensor(np.array(y_test), dtype=torch.float32).to(device)

In [None]:
#!g1.1
net.eval()

# Forward the test set in chunks instead of one giant batch, so peak
# activation memory does not grow with the size of the test set.
EVAL_BATCH_SIZE = 4096
with torch.no_grad():
    logits = torch.cat([net(chunk) for chunk in torch.split(X_test, EVAL_BATCH_SIZE)])

# Targets were built from a one-column DataFrame and the network has a single
# output unit, so both arrays are column vectors. Flatten them: iterating a
# 2-D array and calling int() on each size-1 row is deprecated in NumPy >= 1.25.
y_test_np = y_test.cpu().numpy().ravel()
scores = logits.cpu().numpy().ravel()

roc_auc = roc_auc_score(y_test_np, scores)

y_test_true = y_test_np.astype(int).tolist()
y_pred = (scores >= 0.5).astype(int).tolist()

print('* ROC_AUC:', roc_auc)
print('* precision:', precision_score(y_test_true, y_pred))
print('* recall:', recall_score(y_test_true, y_pred))
print('* f1:', f1_score(y_test_true, y_pred))

* ROC_AUC: 0.9950980392156863
* precision: 0.9166666666666666
* recall: 0.9166666666666666
* f1: 0.9166666666666666
