In [None]:
%pip install category_encoders
%pip install PyOD
import pandas as pd
import numpy as np
import random
from pyod.models.iforest import IForest
from pyod.models.ecod import ECOD
from sklearn.model_selection import train_test_split
from category_encoders.leave_one_out import LeaveOneOutEncoder
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score
from nan_filler import FillNan

# Seed both global RNGs. Seeding only Python's `random` is not enough here:
# scikit-learn-style estimators created without `random_state` fall back to
# NumPy's global generator, which `random.seed` does not touch.
random.seed(123)
np.random.seed(123)

### Загрузка данных

In [None]:
# Load the merged dataset and drop the transaction identifier column.
df = pd.read_parquet('merge_df.parquet').drop(columns='TransactionID')

# Split into train and test parts (30% of the rows go to test).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='isFraud'),
    df['isFraud'],
    test_size=0.3,
    random_state=123,
)

# Fill in missing values. The filler is fitted on X_train only; note that at
# this point X_train still contains the rows that become the validation set below.
filler = FillNan(num_filler="median", cat_filler="constant", drop_highly_missed=True)
filler.fit(X_train)
X_train, X_test = filler.transform(X_train), filler.transform(X_test)

# Carve a validation set (30%) out of the training part.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=123
)

Обрабатываем категориальные признаки

In [None]:
# Names of the object-dtype columns of the training frame; this list is what
# the categorical encoder receives as `cols`.
categor_cols = [col for col in X_train.columns if X_train[col].dtypes == 'O']

In [None]:
# Leave-one-out target encoding of the categorical columns.
# `sigma` makes the encoder add random noise to the training encoding, so
# `random_state` is pinned: without it every run produces different features
# and the metrics reported below cannot be reproduced.
loo = LeaveOneOutEncoder(cols=categor_cols, sigma=0.48, random_state=123)

# Fit on the training split only; validation and test are transformed without
# access to their targets.
# NOTE: the three frames are overwritten in place, so re-run the data-loading
# cell before re-running this one.
X_train = loo.fit_transform(X_train, y_train)
X_val = loo.transform(X_val)
X_test = loo.transform(X_test)

### Вероятностные модели

* ECOD - Обнаружение выбросов при помощи эмпирической функции распределения и оценки вероятности нахождения конкретных данных в одном из хвостов распределения.

In [None]:
# Fit ECOD on the training split (fit() returns the fitted detector itself).
ecod = ECOD(contamination=0.03, n_jobs=-1).fit(X_train)

# Score the validation split: predict() gives the labels used for
# precision/recall/F1, decision_function() gives the scores used for ROC-AUC.
ecod_pred = ecod.predict(X_val.values)
ecod_pred_prob = ecod.decision_function(X_val.values)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   14.3s remaining:   14.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   16.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   17.0s remaining:   17.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   20.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   16.7s remaining:   16.7s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   20.2s finished


In [None]:
# ECOD on the validation split: ROC-AUC is computed from the outlier scores,
# the remaining metrics from the predicted labels.
print('* ROC-AUC:', roc_auc_score(y_val, ecod_pred_prob))
for caption, metric in (
    ('* Precision:', precision_score),
    ('* Recall:', recall_score),
    ('* F1:', f1_score),
):
    print(caption, metric(y_val, ecod_pred))

* ROC-AUC: 0.7834893546677868
* Precision: 0.27706812652068125
* Recall: 0.21475719000471477
* F1: 0.24196547144754316


* COPOD - выявление аномалий при помощи копулы - многомерной функции распределения, на основании которой моделируются зависимости данных.

In [None]:
from pyod.models.copod import COPOD

# Fit COPOD on the training split (fit() returns the fitted detector itself).
copod = COPOD(contamination=0.03, n_jobs=-1).fit(X_train)

# NOTE(review): the COPOD results are stored under the ecod_* names, which
# overwrites the ECOD predictions computed earlier. The names are kept because
# the metrics cell that follows reads them.
ecod_pred = copod.predict(X_val.values)
ecod_pred_prob = copod.decision_function(X_val.values)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   14.6s remaining:   14.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   17.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   18.0s remaining:   18.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   21.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   17.6s remaining:   17.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   21.1s finished


In [None]:
# COPOD on the validation split. Despite their names, ecod_pred and
# ecod_pred_prob hold the COPOD results here (they are reassigned in the COPOD cell).
print('* ROC-AUC:', roc_auc_score(y_val, ecod_pred_prob))
for caption, metric in (
    ('* Precision:', precision_score),
    ('* Recall:', recall_score),
    ('* F1:', f1_score),
):
    print(caption, metric(y_val, ecod_pred))

* ROC-AUC: 0.7824193053080911
* Precision: 0.2791456541085731
* Recall: 0.22182932578972184
* F1: 0.2472087219230264


### Модели, основанные на оценке плотности

* HBOS - выявление аномалий при помощи построения гистограмм на основании плотности каждого признака

In [None]:
#!c1.8
from pyod.models.hbos import HBOS

# Histogram-based detector, bound to a name that matches the algorithm
# instead of the unrelated `svm`.
hbos = HBOS(contamination=0.03)
hbos.fit(X_train)

# The svm_* result names are kept because the metrics cell that follows reads them.
svm_pred = hbos.predict(X_val.values)
svm_pred_prob = hbos.decision_function(X_val.values)

In [None]:
#!c1.8
# HBOS on the validation split: ROC-AUC from the outlier scores, the other
# metrics from the predicted labels (both stored under the svm_* names).
print('* ROC-AUC:', roc_auc_score(y_val, svm_pred_prob))
for caption, metric in (
    ('* Precision:', precision_score),
    ('* Recall:', recall_score),
    ('* F1:', f1_score),
):
    print(caption, metric(y_val, svm_pred))

* ROC-AUC: 0.6979634862344064
* Precision: 0.2823090178058587
* Recall: 0.23173031588873172
* F1: 0.2545313309166235


* LOF - метод локального уровня выброса - оценка выбросов при помощи сравнения плотностей, посчитанных как расстояния до k ближайших соседей.

In [None]:
#!c1.8
from pyod.models.lof import LOF

# Local Outlier Factor detector, bound to a name that matches the algorithm
# instead of the unrelated `svm`.
lof = LOF(contamination=0.03, n_jobs=-1)
lof.fit(X_train)

# The svm_* result names are kept because the metrics cell that follows reads them.
svm_pred = lof.predict(X_val.values)
svm_pred_prob = lof.decision_function(X_val.values)


In [None]:
#!c1.8
# LOF on the validation split: ROC-AUC from the outlier scores, the other
# metrics from the predicted labels (both stored under the svm_* names).
print('* ROC-AUC:', roc_auc_score(y_val, svm_pred_prob))
for caption, metric in (
    ('* Precision:', precision_score),
    ('* Recall:', recall_score),
    ('* F1:', f1_score),
):
    print(caption, metric(y_val, svm_pred))

* ROC-AUC: 0.5998727379286085
* Precision: 0.09729295095148753
* Recall: 0.08557284299858557
* F1: 0.09105731844976796


### Линейные модели

* PCA - использует нормализованную ошибку восстановления в качестве оценки аномалии.

In [None]:
from pyod.models.pca import PCA

# PCA-based detector, bound to a name that matches the algorithm instead of
# the unrelated `svm`.
pca = PCA(contamination=0.03)
pca.fit(X_train)

# The svm_* result names are kept because the metrics cell that follows reads them.
svm_pred = pca.predict(X_val.values)
svm_pred_prob = pca.decision_function(X_val.values)

In [None]:
# PCA on the validation split: ROC-AUC from the outlier scores, the other
# metrics from the predicted labels (both stored under the svm_* names).
print('* ROC-AUC:', roc_auc_score(y_val, svm_pred_prob))
for caption, metric in (
    ('* Precision:', precision_score),
    ('* Recall:', recall_score),
    ('* F1:', f1_score),
):
    print(caption, metric(y_val, svm_pred))

* ROC-AUC: 0.7583015579672098
* Precision: 0.25338491295938104
* Recall: 0.21617161716171618
* F1: 0.23330365093499555


### Ансамбли

* Isolation forest - выявление аномалий при помощи оценки средней глубины листа для каждого объекта.

In [None]:
# The detector must be bound to `forest`: that is the name the calls below use.
# Binding it to any other name raises NameError on a fresh kernel.
# random_state is pinned so the randomly built trees, and therefore the
# reported metrics, are reproducible.
# NOTE(review): contamination is left at the library default, while the other
# detectors in this notebook pass 0.03 - confirm the difference is intended.
forest = IForest(random_state=123)
forest.fit(X_train.values)

# Labels feed precision/recall/F1, scores feed ROC-AUC in the metrics cell.
forest_pred = forest.predict(X_val.values)
forest_pred_prob = forest.decision_function(X_val.values)

In [None]:
# Isolation Forest on the validation split: ROC-AUC from the outlier scores,
# the other metrics from the predicted labels.
print('* ROC-AUC:', roc_auc_score(y_val, forest_pred_prob))
for caption, metric in (
    ('* Precision:', precision_score),
    ('* Recall:', recall_score),
    ('* F1:', f1_score),
):
    print(caption, metric(y_val, forest_pred))

* ROC-AUC: 0.7686692872594973
* Precision: 0.16354479590033574
* Recall: 0.4363507779349364
* F1: 0.23791773778920308


### Результаты

 Model | ROC-AUC | F1-score |precision_score | recall_score
 --- | --- | --- | ---| ---
 ECOD | 0.78 | 0.24 | 0.28 | 0.21
 COPOD | 0.78 | 0.25 | 0.28 | 0.22
 HBOS | 0.70 | 0.25 | 0.28 | 0.23
 LOF | 0.60 | 0.09 | 0.10 | 0.09
 PCA | 0.76 | 0.23 | 0.25 | 0.22
 Isolation forest | 0.77 | 0.24 | 0.16 | 0.44