# Обработка датасета

Ссылка на датасет: https://www.kaggle.com/datasets/vagifa/ethereum-frauddetection-dataset

In [None]:
# Path to the CSV file with the Ethereum fraud-detection dataset linked above.
# NOTE(review): the relative "drive/MyDrive" prefix presumably points at a
# mounted Google Drive — confirm before running elsewhere.
data = "drive/MyDrive/transaction_dataset.csv"

In [None]:
!pip install catboost

import pandas as pd

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import metrics



Удалим из датасета **нечисловые** признаки, признаки, связанные со смарт-контрактами (данные о них слабо связаны с подозрительностью транзакции), и признаки, в названии которых есть «ERC» (данные о токенах ERC20). Некоторые из оставшихся признаков разделим на соответствующее время наблюдения ("*Time Diff between first and last (Mins)*"), увеличенное на 1 — так исключается деление на ноль при нулевом времени наблюдения. Избавимся также и от **адресов кошельков**.

In [None]:
# Load the raw CSV and discard the row-identifier columns, the wallet address
# and the smart-contract-related features.
dataset = pd.read_csv(data).drop(
    columns=[
        'Unnamed: 0',
        'Index',
        'Address',
        'min value sent to contract',
        'max val sent to contract',
        'avg value sent to contract',
        'Number of Created Contracts',
        'total ether sent contracts',
    ],
)

In [None]:
# Keep only numeric feature columns. Selecting by numeric dtype (instead of
# comparing each dtype against "object") also excludes string, categorical and
# datetime columns, which the model cannot consume as plain numeric features.
dataset = dataset.select_dtypes(include="number")

In [None]:
# Drop every column whose name contains 'ERC', using a boolean mask over the
# column labels.
dataset = dataset.loc[:, ~dataset.columns.str.contains('ERC')]

In [None]:
# Show which columns remain after the filtering steps above.
dataset.columns

Index(['FLAG', 'Avg min between sent tnx', 'Avg min between received tnx',
       'Time Diff between first and last (Mins)', 'Sent tnx', 'Received Tnx',
       'Unique Received From Addresses', 'Unique Sent To Addresses',
       'min value received', 'max value received ', 'avg val received',
       'min val sent', 'max val sent', 'avg val sent',
       'total transactions (including tnx to create contract',
       'total Ether sent', 'total ether received', 'total ether balance'],
      dtype='object')

In [None]:
# Name of the observation-window column (both aliases are kept).
t = time = "Time Diff between first and last (Mins)"

# Count and volume features that are rescaled by the observation window.
to_be_divided = [
    'Sent tnx',
    'Received Tnx',
    'Unique Received From Addresses',
    'Unique Sent To Addresses',
    'total transactions (including tnx to create contract',
    'total Ether sent',
    'total ether received',
]

# Divide each listed column, row by row, by (observation window + 1).
# The divisor is passed positionally, one value per row.
dataset[to_be_divided] = dataset[to_be_divided].div(
    (dataset[t] + 1).to_numpy(),
    axis=0,
)

In [None]:
# Preview the first rows after the rescaling step.
dataset.head()

Unnamed: 0,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Unique Received From Addresses,Unique Sent To Addresses,min value received,max value received,avg val received,min val sent,max val sent,avg val sent,total transactions (including tnx to create contract,total Ether sent,total ether received,total ether balance
0,0,844.26,1093.71,704785.63,0.001023,0.000126,5.7e-05,0.000167,0.0,45.806785,6.589513,0.0,31.22,1.200681,0.001149,0.001228,0.000832,-279.224419
1,0,12709.07,2958.44,1218216.73,7.7e-05,7e-06,4e-06,1.1e-05,0.0,2.613269,0.385685,0.0,1.8,0.032844,8.4e-05,3e-06,3e-06,-0.001819
2,0,246194.54,2434.02,516729.3,4e-06,1.9e-05,1.9e-05,4e-06,0.113119,1.165453,0.358906,0.05,3.538616,1.794308,2.3e-05,7e-06,7e-06,0.000441
3,0,10219.6,15785.09,397555.9,6.3e-05,2.3e-05,1.8e-05,3.3e-05,0.0,500.0,99.48884,0.0,450.0,70.001834,8.6e-05,0.004402,0.002252,-854.646303
4,0,36.61,10707.77,382472.42,0.012022,5.2e-05,1.8e-05,5e-05,0.0,12.802411,2.671095,0.0,9.0,0.022688,0.012077,0.000273,0.00014,-50.896986


# GridSearch

Для подбора гиперпараметров воспользуемся **GridSearch**.

In [None]:
# Target is the FLAG column; every other column is a feature.
y = dataset['FLAG']
X = dataset.drop(columns='FLAG')

# Search grid: 5 learning rates x 4 iteration counts x 2 early-stopping
# windows = 40 candidates; 'verbose' is fixed to silence CatBoost's own log.
parameters = {
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25],
    'iterations': list(range(50, 2050, 500)),
    'early_stopping_rounds': [20, 50],
    'verbose': [False],
}

# NOTE(review): fit() is called without an eval_set, so early_stopping_rounds
# presumably has no effect inside this search — confirm against CatBoost docs.
Grid_CBC = GridSearchCV(
    estimator=CatBoostClassifier(),
    param_grid=parameters,
    cv=2,
    verbose=100,
)
Grid_CBC.fit(X, y)

Fitting 2 folds for each of 40 candidates, totalling 80 fits
[CV 1/2; 1/40] START early_stopping_rounds=20, iterations=50, learning_rate=0.05, verbose=False
[CV 1/2; 1/40] END early_stopping_rounds=20, iterations=50, learning_rate=0.05, verbose=False;, score=0.923 total time=   0.3s
[CV 2/2; 1/40] START early_stopping_rounds=20, iterations=50, learning_rate=0.05, verbose=False
[CV 2/2; 1/40] END early_stopping_rounds=20, iterations=50, learning_rate=0.05, verbose=False;, score=0.925 total time=   0.3s
[CV 1/2; 2/40] START early_stopping_rounds=20, iterations=50, learning_rate=0.1, verbose=False
[CV 1/2; 2/40] END early_stopping_rounds=20, iterations=50, learning_rate=0.1, verbose=False;, score=0.930 total time=   0.6s
[CV 2/2; 2/40] START early_stopping_rounds=20, iterations=50, learning_rate=0.1, verbose=False
[CV 2/2; 2/40] END early_stopping_rounds=20, iterations=50, learning_rate=0.1, verbose=False;, score=0.930 total time=   0.3s
[CV 1/2; 3/40] START early_stopping_rounds=20, iter

In [None]:
# Report the best mean cross-validated score and the parameters that produced it.
best_score, best_params = Grid_CBC.best_score_, Grid_CBC.best_params_
print("Лучший результат: ", best_score)
print("Гиперпараметры: ", best_params)

Лучший результат:  0.9484807519788264
Гиперпараметры:  {'early_stopping_rounds': 20, 'iterations': 1050, 'learning_rate': 0.2, 'verbose': False}


# Кросс-валидация

Для оценки качества модели проведём **кросс-валидацию** (5 стратифицированных фолдов, метрика — Logloss).

In [None]:
from catboost import cv

# Build a fresh dict from the best grid-search parameters plus the loss
# function, so that Grid_CBC.best_params_ itself is not mutated.
params = {**Grid_CBC.best_params_, 'loss_function': 'Logloss'}

# 5-fold stratified, shuffled cross-validation with a fixed partition seed.
cv_data = cv(
    params=params,
    pool=Pool(X, label=y),
    fold_count=5,
    shuffle=True,
    partition_random_seed=42,
    stratified=True,
    verbose=True
)

Training on fold [0/5]
0:	learn: 0.4705609	test: 0.4673513	best: 0.4673513 (0)	total: 7.37ms	remaining: 7.73s
1:	learn: 0.3580657	test: 0.3565407	best: 0.3565407 (1)	total: 14.3ms	remaining: 7.49s
2:	learn: 0.2943990	test: 0.2925620	best: 0.2925620 (2)	total: 21.9ms	remaining: 7.65s
3:	learn: 0.2465900	test: 0.2477703	best: 0.2477703 (3)	total: 31.5ms	remaining: 8.23s
4:	learn: 0.2226400	test: 0.2278852	best: 0.2278852 (4)	total: 39.5ms	remaining: 8.26s
5:	learn: 0.2082101	test: 0.2155616	best: 0.2155616 (5)	total: 47.3ms	remaining: 8.23s
6:	learn: 0.1988576	test: 0.2095164	best: 0.2095164 (6)	total: 54.8ms	remaining: 8.16s
7:	learn: 0.1884765	test: 0.2013060	best: 0.2013060 (7)	total: 63ms	remaining: 8.21s
8:	learn: 0.1817843	test: 0.1958935	best: 0.1958935 (8)	total: 71.8ms	remaining: 8.31s
9:	learn: 0.1740596	test: 0.1894057	best: 0.1894057 (9)	total: 82.3ms	remaining: 8.56s
10:	learn: 0.1686139	test: 0.1854634	best: 0.1854634 (10)	total: 95.6ms	remaining: 9.03s
11:	learn: 0.1650697

In [None]:
# Display the per-iteration cross-validation results returned by cv().
cv_data

Unnamed: 0,iterations,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.470697,0.003084,0.470286,0.002001
1,1,0.358166,0.009485,0.357025,0.005507
2,2,0.294478,0.005971,0.293030,0.002086
3,3,0.257343,0.007569,0.253271,0.004188
4,4,0.232893,0.004923,0.227198,0.006283
...,...,...,...,...,...
179,179,0.122807,0.010410,0.041792,0.003270
180,180,0.122834,0.010361,0.041765,0.003304
181,181,0.122804,0.010416,0.041744,0.003331
182,182,0.122819,0.010389,0.041706,0.003380


# Обучение и выгрузка

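Обучим теперь модель на **оптимальных гиперпараметрах** и **90% датасета** (оставшиеся 10% используем как отложенную выборку) и выгрузим ее. Выведем **полезную информацию**: важности признаков, classification report, матрицу ошибок и ROC-AUC.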

In [None]:
X = dataset.drop(['FLAG'], axis=1)
y = dataset.FLAG

# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

model = CatBoostClassifier(**params)
# NOTE(review): the hold-out set is also passed as eval_set, so the metrics
# printed below are measured on data the training procedure has seen —
# consider a separate validation split.
model.fit(X_train, y_train, eval_set=(X_test, y_test))

model.save_model('CatBoost.model')

# Predict once and reuse the result for every label-based metric.
y_pred = model.predict(X_test)
# ROC-AUC ranks samples by a continuous score, so use the predicted
# probability of the positive class (column 1) instead of hard 0/1 labels.
y_score = model.predict_proba(X_test)[:, 1]

print('\n')

print('Важности признаков')
print(model.get_feature_importance(prettified=True))
print()

print("Classification Report (метрики)")
print(metrics.classification_report(y_test, y_pred))
print()

print('Confusion matrix')
print(metrics.confusion_matrix(y_test, y_pred, normalize='true'))
print()

print('ROC-AUC')
print(metrics.roc_auc_score(y_test, y_score))



Важности признаков
                                           Feature Id  Importances
0                      Unique Received From Addresses    18.914515
1             Time Diff between first and last (Mins)    12.218182
2                                    avg val received     8.801237
3                                 total ether balance     6.664180
4                            Avg min between sent tnx     6.568129
5                            Unique Sent To Addresses     6.159585
6                        Avg min between received tnx     5.972923
7                                        Received Tnx     5.562519
8                                        min val sent     5.209479
9                                            Sent tnx     5.171643
10                                 min value received     4.989135
11                               total ether received     3.134620
12  total transactions (including tnx to create co...     2.818302
13                                max val