In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
from xgboost import XGBClassifier
import xgboost as xgb
import os
import sys

# Put the parent of the current working directory on the import path so the
# `src` package imported just below can be resolved from this notebook.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

from src.drift import ModelDriftExplainer

# config
# Let wide / long frames display up to 500 columns and 500 rows.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

# Single seed reused by the split cells and the model.
seed = 2021


# IEEE fraud data

Download the data with the Kaggle CLI if it is set up on your computer:

In [None]:
# Uncomment to fetch the competition files with the Kaggle CLI (it must already be configured).
# NOTE(review): the load cell reads 'data/train_transaction.csv', so the downloaded archive
# presumably has to be extracted into ./data first — confirm.
#!kaggle competitions download -c ieee-fraud-detection

Otherwise, you can download the data from https://www.kaggle.com/c/ieee-fraud-detection/data (you will probably have to accept the competition rules first).

In [None]:
# Load the transaction table; the path is relative to the notebook's working directory.
df = pd.read_csv('data/train_transaction.csv')

In [None]:
# (rows, columns) of the table as loaded, before any filtering.
print(df.shape)

In [None]:
# Number of missing entries in each column (a Series indexed by column name).
missing_values = df.isna().sum(axis='index')

In [None]:
# Keep only the columns that have fewer than 10000 missing values.
has_few_missing = missing_values < 10000
selected_columns = missing_values.index[has_few_missing].tolist()

In [None]:
# Clean the table in one pass:
#   1. keep the mostly-complete columns and drop every row that still has a missing value;
#   2. keep only rows whose card6 is 'debit' or 'credit';
#   3. encode card6 as an integer flag: 1 for 'credit', 0 for 'debit'
#      (`assign` returns a new frame, so the result is independent of the loaded one).
# NOTE: this cell overwrites `df` and is not idempotent — once card6 holds 0/1, the
# isin(['debit', 'credit']) filter matches nothing and a second run yields an empty frame.
# Re-run the load cell before re-running this one.
df = (
    df[selected_columns]
    .dropna(axis=0, how='any')
    .loc[lambda frame: frame['card6'].isin(['debit', 'credit']), :]
    .assign(card6=lambda frame: (frame['card6'] == 'credit').astype(int))
)

In [None]:
# (rows, columns) remaining after the column selection and row filtering above.
df.shape

# Minimal feature engineering

In [None]:
# Preview the first five rows of the cleaned table.
df.head()

In [None]:
def is_sorted(x):
    """Return True when the values of ``x`` are in non-decreasing order.

    Parameters
    ----------
    x : array-like
        One-dimensional sequence of numeric values (list, NumPy array, pandas Series...).

    Returns
    -------
    bool
        True if every consecutive difference is >= 0. Empty and single-element
        inputs have no consecutive pair and therefore count as sorted.
    """
    # np.diff yields x[i + 1] - x[i]; bool() turns the NumPy scalar into a plain Python bool.
    return bool((np.diff(x) >= 0).all())

In [None]:
# Check whether the rows are in non-decreasing TransactionDT order; the splits below
# use shuffle=False, so they depend on the current row order.
is_sorted(df['TransactionDT'])

In [None]:
# Inspect the dtype of each remaining column.
df.dtypes

In [None]:
# Columns that are NOT fed to the model; everything else in `df` is a feature.
# 'isFraud' is the target used by the split cells. 'card6' is deliberately not
# excluded: it was converted to a 0/1 flag earlier.
excluded_columns = {'TransactionID', 'isFraud', 'TransactionDT', 'ProductCD', 'card4'}
features = [col for col in df.columns if col not in excluded_columns]

In [None]:
# Order-preserving hold-out: without shuffling, the first 75% of rows become df_temp
# and the last 25% become df_prod. random_state has no effect when shuffle=False;
# it is kept so the call stays unchanged.
df_temp, df_prod = train_test_split(
    df,
    test_size=0.25,
    shuffle=False,
    random_state=seed,
)

In [None]:
# Second order-preserving split: the first two thirds of df_temp are used for training
# and the last third for validation.
target_temp = df_temp['isFraud'].values
X_train, X_valid, y_train, y_valid = train_test_split(
    df_temp[features],
    target_temp,
    test_size=1/3,
    shuffle=False,
    random_state=seed,
)

In [None]:
# Feature matrix and target array for the held-out "production" slice.
X_prod = df_prod[features]
y_prod = df_prod['isFraud'].values

# Build model

In [None]:
# Hyper-parameters of the gradient-boosted tree classifier.
# n_estimators is only an upper bound: the fit call uses early stopping.
xgb_params = dict(
    n_estimators=1000,
    booster="gbtree",
    objective="binary:logistic",
    learning_rate=0.2,
    max_depth=6,
    use_label_encoder=False,
    seed=seed,
)
clf = XGBClassifier(**xgb_params)

In [None]:
# Train on the training split and evaluate AUC and log-loss on the validation split,
# logging every 10 rounds; training stops after 20 rounds without improvement.
# NOTE(review): recent XGBoost releases expect eval_metric / early_stopping_rounds on the
# estimator constructor rather than on fit() — confirm against the installed version.
clf.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=20,
    verbose=10,
    eval_metric=['auc', 'logloss'],
)

# Detection of data drift

In [None]:
# Wrap the trained classifier in the project's ModelDriftExplainer (from src.drift).
drift_explainer = ModelDriftExplainer(clf)

In [None]:
# Give the explainer the two samples to compare: dataset 1 = validation split,
# dataset 2 = held-out "production" split, each with its target values.
# NOTE(review): presumably dataset 1 is the reference sample — confirm in src.drift.
drift_explainer.fit(X1=X_valid, X2=X_prod, y1=y_valid, y2=y_prod)

In [None]:
# Prediction drift between the two fitted samples: plot first, then show the value
# returned by get_prediction_drift() as the cell output.
drift_explainer.plot_prediction_drift()
drift_explainer.get_prediction_drift()

In [None]:
# Target drift between the two fitted samples: plot first, then show the value
# returned by get_target_drift() as the cell output.
drift_explainer.plot_target_drift()
drift_explainer.get_target_drift()

In [None]:
# Compare model performance on the two samples: log-loss of the predicted class
# probabilities on the validation split, then on the production split.
loss_valid = log_loss(y_valid, clf.predict_proba(X_valid))
print(f'log_loss valid: {loss_valid}')
loss_prod = log_loss(y_prod, clf.predict_proba(X_prod))
print(f'log_loss prod: {loss_prod}')

In [None]:
# Drift of a single feature, selected with the integer 0.
# NOTE(review): 0 presumably selects the feature by position (the first model input) — confirm in src.drift.
drift_explainer.plot_feature_drift(0)
drift_explainer.get_feature_drift(0)

In [None]:
# Drift of the 'card6' feature, selected by name. card6 is a 0/1 flag, so the plot is
# requested with as_discrete=True.
# NOTE(review): as_discrete presumably switches to a categorical-style plot — confirm in src.drift.
drift_explainer.plot_feature_drift('card6', as_discrete=True)
drift_explainer.get_feature_drift('card6')