In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
from xgboost import XGBClassifier
import xgboost as xgb
import os
import sys

# Make the parent of the current working directory importable so that the
# local `src` package can be imported. The membership check keeps this cell
# idempotent: re-running it does not append a duplicate entry to sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.drift import ModelDriftExplainer, AdversarialDriftExplainer

# Notebook configuration: pandas display limits and the shared random seed.
display_limits = {
    'display.max_columns': 500,
    'display.max_rows': 500,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

# Seed reused by the stochastic steps of the notebook
seed = 2021

# IEEE fraud data

Download the data with the Kaggle CLI if it is set up on your computer:

In [None]:
#!kaggle competitions download -c ieee-fraud-detection

Otherwise, you can download the data here: https://www.kaggle.com/c/ieee-fraud-detection/data (you will probably have to accept the competition rules first).

In [None]:
# Load the training transactions from the local `data/` directory
# (the path is relative to the notebook's working directory).
df = pd.read_csv('data/train_transaction.csv')

In [None]:
# (number of rows, number of columns) of the raw table
print(df.shape)

# Preprocessing

In [None]:
# Number of missing entries in each column (displayed as the cell output)
missing_values = df.isna().sum()
missing_values

In [None]:
# Keep the columns whose number of missing values is below the threshold
max_missing = 10000
selected_columns = [name for name in df.columns if missing_values[name] < max_missing]

In [None]:
# Restrict to the selected columns and drop every row with a missing value
df = df.loc[:, selected_columns].dropna()
# Keep only the transactions made with a debit or a credit card
is_debit_or_credit = df['card6'].isin(['debit', 'credit'])
df = df.loc[is_debit_or_credit, :].copy()

In [None]:
# Preview the first rows of the filtered table
df.head()

In [None]:
def is_sorted(x):
    """Return True when the values of ``x`` are in non-decreasing order.

    Parameters
    ----------
    x : array-like
        One-dimensional sequence of numeric values (list, numpy array,
        pandas Series, ...).

    Returns
    -------
    bool
        True if every consecutive difference is >= 0. Empty and
        single-element inputs have no consecutive pair and give True.
    """
    # np.diff yields x[i + 1] - x[i]; any NaN difference compares False,
    # so input containing NaN is reported as not sorted.
    return bool((np.diff(x) >= 0).all())

In [None]:
# Check that rows are in non-decreasing TransactionDT order; the splits in the
# next section use shuffle=False, so they depend on the row order.
is_sorted(df['TransactionDT'])

# Sampling

In [None]:
# Columns that are not used as model inputs (includes the `isFraud` target)
excluded_columns = {'TransactionID', 'isFraud', 'TransactionDT', 'ProductCD', 'card4', 'card6'}
features = [name for name in df.columns if name not in excluded_columns]

In [None]:
# Positional split (shuffle=False): the first 75% of rows go to df_temp and the
# last 25% to df_prod, which is used as the production sample below.
df_temp, df_prod = train_test_split(df.copy(), test_size=0.25, shuffle=False)

In [None]:
# Number of debit vs. credit transactions in the non-production part
df_temp['card6'].value_counts()

In [None]:
# Train and validation sets are built from the credit-card rows of df_temp only.
# The split is positional (shuffle=False): first 2/3 for training, last 1/3 for validation.
is_credit = df_temp['card6'].values == 'credit'
X_train, X_valid, y_train, y_valid = train_test_split(
    df_temp.loc[is_credit, features].copy(),
    df_temp.loc[is_credit, 'isFraud'].values,
    test_size=1/3,
    shuffle=False,
    random_state=seed,
)

In [None]:
# Production features and target (all rows of df_prod, debit and credit alike)
X_prod = df_prod[features]
y_prod = df_prod['isFraud'].values

# Build model

In [None]:
# Gradient-boosted tree classifier for the binary fraud target
clf_params = {
    'n_estimators': 1000,
    'booster': "gbtree",
    'objective': "binary:logistic",
    'learning_rate': 0.1,
    'max_depth': 6,
    'use_label_encoder': False,
    'seed': seed,
}
clf = XGBClassifier(**clf_params)

In [None]:
# Fit on the training set while monitoring AUC and log-loss on the validation set.
# early_stopping_rounds=20 stops boosting once the validation score has not
# improved for 20 rounds; verbose=10 logs the evaluation metrics every 10 rounds.
clf.fit(X=X_train, y=y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=20,
        verbose=10, eval_metric=['auc', 'logloss'])

# Detection of data drift

In [None]:
# Wrap the trained classifier in a model drift explainer
drift_explainer = ModelDriftExplainer(clf)

In [None]:
# Dataset 1 is the validation sample and dataset 2 the production sample.
# Targets are passed too (presumably required by the target-drift methods — confirm).
drift_explainer.fit(X1=X_valid, X2=X_prod, y1=y_valid, y2=y_prod)

In [None]:
# Plot, then report, the drift of the model predictions between validation and production data
drift_explainer.plot_prediction_drift(figsize=(7, 5))
drift_explainer.get_prediction_drift()

In [None]:
# Plot, then report, the drift of the target between validation and production data
drift_explainer.plot_target_drift()
drift_explainer.get_target_drift()

In [None]:
# Predicted class probabilities, computed once per dataset and reused by both metrics
proba_valid = clf.predict_proba(X_valid)
proba_prod = clf.predict_proba(X_prod)

# Log-loss uses the full probability matrix
print(f'log_loss valid: {log_loss(y_valid, proba_valid)}')
print(f'log_loss prod: {log_loss(y_prod, proba_prod)}')

# AUC uses the probability of the positive class (second column)
print(f'AUC valid: {roc_auc_score(y_valid, proba_valid[:, 1])}')
print(f'AUC prod: {roc_auc_score(y_prod, proba_prod[:, 1])}')

# Explain data drift

In [None]:
# Plot the tree-based drift values of the explainer; type='node_size' selects
# the node-size variant (see the ModelDriftExplainer documentation for its definition).
drift_explainer.plot_tree_based_drift_values(type='node_size')

In [None]:
# Plot, then report, the drift of feature 'D1' between validation and production data
drift_explainer.plot_feature_drift('D1')
drift_explainer.get_feature_drift('D1')

In [None]:
# Plot, then report, the drift of feature 'C13' between validation and production data
drift_explainer.plot_feature_drift('C13')
drift_explainer.get_feature_drift('C13')

In [None]:
# Plot, then report, the drift of feature 'C2' between validation and production data
drift_explainer.plot_feature_drift('C2')
drift_explainer.get_feature_drift('C2')

In [None]:
# Plot, then report, the drift of feature 'TransactionAmt' between validation and production data
drift_explainer.plot_feature_drift('TransactionAmt')
drift_explainer.get_feature_drift('TransactionAmt')

In [None]:
# Feature importances of the model, indexed by feature name, largest first
importances = pd.DataFrame(clf.feature_importances_, index=X_train.columns)
importances.sort_values(by=0, ascending=False)

# Correction of data drift

## Correction on validation dataset

In [None]:
# Weights computed with the adversarial method: an AdversarialDriftExplainer is
# fitted on validation vs. production data, restricted to four features, and
# returns correction weights for the validation samples.
# The notebook-level `seed` is used instead of a repeated literal so that
# changing the seed in the config cell also changes this step.
# NOTE(review): max_ratio=10 presumably bounds the spread of the weights —
# confirm against the AdversarialDriftExplainer documentation.
sample_weights_valid_adversarial = (AdversarialDriftExplainer(feature_subset=['D1', 'C13', 'C2', 'TransactionAmt'],
                                                              seed=seed)
                                    .fit(X_valid, X_prod)
                                    .get_adversarial_correction_weights(max_ratio=10))

In [None]:
# Fit a second drift explainer on the same data, with the adversarial weights
# applied to dataset 1 (validation), to measure the drift after reweighting.
drift_explainer2 = ModelDriftExplainer(clf).fit(X1=X_valid, X2=X_prod, y1=y_valid, y2=y_prod,
                                                sample_weights1=sample_weights_valid_adversarial)

In [None]:
# The drift in the distribution of predictions seems to be well corrected
# by the reweighting of the validation samples.
drift_explainer2.plot_prediction_drift()
drift_explainer2.get_prediction_drift()

In [None]:
# The target distribution is also rebalanced in the right direction
# by the reweighting of the validation samples.
drift_explainer2.plot_target_drift()
drift_explainer2.get_target_drift()

In [None]:
# The weighted validation loss is closer to the production loss, but a gap remains.
# Probabilities are computed once per dataset; only validation metrics are weighted.
proba_valid = clf.predict_proba(X_valid)
proba_prod = clf.predict_proba(X_prod)

print(f'log_loss valid: {log_loss(y_valid, proba_valid, sample_weight=sample_weights_valid_adversarial)}')
print(f'log_loss prod: {log_loss(y_prod, proba_prod)}')

print(f'AUC valid: {roc_auc_score(y_valid, proba_valid[:, 1], sample_weight=sample_weights_valid_adversarial)}')
print(f'AUC prod: {roc_auc_score(y_prod, proba_prod[:, 1])}')

## Correction on validation dataset and train dataset (in order to retrain the model)

In [None]:
# Adversarial correction weights for the training samples: the explainer is
# fitted on training vs. production data, restricted to four features.
# The notebook-level `seed` is used instead of a repeated literal so that
# changing the seed in the config cell also changes this step.
# NOTE(review): max_ratio=10 presumably bounds the spread of the weights —
# confirm against the AdversarialDriftExplainer documentation.
sample_weights_train_adversarial = (AdversarialDriftExplainer(feature_subset=['D1', 'C13', 'C2', 'TransactionAmt'],
                                                              seed=seed)
                                    .fit(X_train, X_prod)
                                    .get_adversarial_correction_weights(max_ratio=10))

In [None]:
# Second classifier, to be retrained with drift-correction sample weights
clf2_params = {
    'n_estimators': 1000,
    'booster': "gbtree",
    'objective': "binary:logistic",
    'learning_rate': 0.1,
    'max_depth': 6,
    'use_label_encoder': False,
    'seed': seed,
}
clf2 = XGBClassifier(**clf2_params)

In [None]:
# A gamma power factor (exponent 0.3) smooths the training weights before fitting;
# the validation set used for evaluation is weighted with its own adversarial weights.
smoothed_train_weights = sample_weights_train_adversarial**0.3
clf2.fit(X=X_train,
         y=y_train,
         sample_weight=smoothed_train_weights,
         eval_set=[(X_valid, y_valid)],
         sample_weight_eval_set=[sample_weights_valid_adversarial],
         eval_metric=['auc', 'logloss'],
         early_stopping_rounds=20,
         verbose=10)

In [None]:
# With the reweighting we see a small improvement in performance on production data,
# but is it significant?
proba_valid_clf2 = clf2.predict_proba(X_valid)
proba_prod_clf2 = clf2.predict_proba(X_prod)

print(f'log_loss valid: {log_loss(y_valid, proba_valid_clf2, sample_weight=sample_weights_valid_adversarial)}')
print(f'log_loss prod: {log_loss(y_prod, proba_prod_clf2)}')

print(f'AUC valid: {roc_auc_score(y_valid, proba_valid_clf2[:, 1], sample_weight=sample_weights_valid_adversarial)}')
print(f'AUC prod: {roc_auc_score(y_prod, proba_prod_clf2[:, 1])}')