In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell is executed so that edits to local source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
from xgboost import XGBClassifier
import xgboost as xgb
import os
import sys

# TODO: handle this by running setup.py ?
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

from src.drift.drift_explainer import DriftExplainer

# Notebook configuration
# Let pandas display up to 500 columns and 500 rows before truncating output.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

# Seed reused by the splitting, sampling and model cells of this notebook.
seed = 2021

# IEEE fraud data

If you have set up a Kaggle API token, you can download the data by uncommenting and running the command below:

In [None]:
# Shell command that downloads the competition files with the Kaggle CLI.
# It is left commented out; remove the leading '#' to run it.
#!kaggle competitions download -c ieee-fraud-detection

Otherwise, download the data manually from https://www.kaggle.com/c/ieee-fraud-detection/data (you will probably have to accept the competition rules first).

In [None]:
# Load the transaction table. The path is relative to the notebook's working directory.
train_transaction_path = 'data/train_transaction.csv'
df = pd.read_csv(train_transaction_path)

In [None]:
# (number of rows, number of columns) of the table as loaded.
print(df.shape)

# Preprocessing

In [None]:
# Number of missing entries per column (one value for each column of df).
missing_values = df.isna().sum()
missing_values

In [None]:
# Keep only the columns with fewer than 10000 missing values.
# missing_values is indexed by df's column names, so a boolean mask on it
# yields the retained names in their original column order.
selected_columns = missing_values[missing_values < 10000].index.tolist()

In [None]:
# Restrict to the selected columns, then drop every row that still contains
# a missing value (dropna defaults: axis=0, how='any').
df = df.loc[:, selected_columns].dropna()

# Keep only debit / credit cards. .copy() makes the result an independent
# frame so that later column assignments do not target a slice.
is_debit_or_credit = df['card6'].isin(['debit', 'credit'])
df = df[is_debit_or_credit].copy()

In [None]:
# Binary-encode card6: 1 where the value is 'credit', 0 otherwise.
# NOTE: this cell is not idempotent -- running it a second time maps every row to 0.
df['card6'] = df['card6'].eq('credit').astype(int)

In [None]:
# Preview the first rows of the preprocessed table.
df.head()

In [None]:
def is_sorted(x):
    """Return True when the values of ``x`` are in non-decreasing order.

    Parameters
    ----------
    x : array-like
        Sequence of numbers (list, NumPy array, pandas Series, ...).

    Returns
    -------
    bool
        True if every element is greater than or equal to its predecessor.
        Empty and single-element inputs count as sorted because there is no
        pair of neighbours to compare.
    """
    # np.diff yields consecutive differences; any negative one means a drop.
    return bool((np.diff(x) >= 0).all())

In [None]:
# Check that rows are in non-decreasing TransactionDT order. The splits
# further down use shuffle=False and therefore rely on the existing row order.
is_sorted(df['TransactionDT'])

# Sampling

In [None]:
# Model inputs: every column except the identifier, the target (isFraud),
# TransactionDT, and the ProductCD / card4 columns.
excluded_columns = {'TransactionID', 'isFraud', 'TransactionDT', 'ProductCD', 'card4'}
features = [col for col in df.columns if col not in excluded_columns]

In [None]:
# Preview the model input columns only.
df[features].head()

In [None]:
# Hold out the last 25% of rows as the "production" set. shuffle=False keeps
# the existing row order, so random_state has no effect on this split.
df_temp, df_prod = train_test_split(
    df.copy(),
    test_size=0.25,
    shuffle=False,
    random_state=seed,
)

In [None]:
# Distribution of card6 (1 = credit, 0 = debit) in df_temp before down-sampling.
df_temp['card6'].value_counts()

In [None]:
# Down-sample df_temp: keep every row with card6 == 1 plus a random subset of
# the other rows. randint's upper bound (9) is exclusive, so each remaining
# row is kept with probability 1/9.
# NOTE(review): presumably done to create an artificial card6 shift between
# df_temp and df_prod -- confirm intent.
np.random.seed(seed)
n_rows = df_temp.shape[0]
keep_at_random = np.random.randint(low=0, high=9, size=n_rows) == 0
is_credit = df_temp['card6'].values == 1
df_temp = df_temp.loc[keep_at_random | is_credit, :]

In [None]:
# Distribution of card6 (1 = credit, 0 = debit) in df_temp after down-sampling.
df_temp['card6'].value_counts()

In [None]:
# Split df_temp in row order (shuffle=False): the first two thirds become the
# training set and the last third the validation set.
X_temp = df_temp[features].copy()
y_temp = df_temp['isFraud'].values
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp,
    y_temp,
    test_size=1/3,
    shuffle=False,
    random_state=seed,
)

In [None]:
# Features and target of the held-out production split.
X_prod = df_prod[features]
y_prod = df_prod['isFraud'].values

# Build model

In [None]:
# Hyper-parameters of the baseline gradient-boosted tree classifier.
clf_params = dict(
    n_estimators=1000,  # upper bound; the fit cell uses early stopping
    booster="gbtree",
    objective="binary:logistic",
    learning_rate=0.2,
    max_depth=6,
    use_label_encoder=False,
    seed=seed,
)
clf = XGBClassifier(**clf_params)

In [None]:
# Train on the training split while evaluating AUC and log-loss on the
# validation split; early stopping uses a patience of 20 rounds and progress
# is logged every 10 rounds.
# NOTE(review): newer xgboost releases expect early_stopping_rounds and
# eval_metric on the estimator constructor rather than on fit() -- confirm
# against the installed version.
validation_set = [(X_valid, y_valid)]
clf.fit(
    X=X_train,
    y=y_train,
    eval_set=validation_set,
    early_stopping_rounds=20,
    verbose=10,
    eval_metric=['auc', 'logloss'],
)

# Detection of data drift

In [None]:
# Instantiate the project's DriftExplainer (imported from src.drift.drift_explainer).
drift_explainer = DriftExplainer()

In [None]:
# Fit with the validation split as dataset 1 and the production split as
# dataset 2 (features and targets), together with the trained model.
# Presumably the drift reported afterwards is measured from dataset 1 to
# dataset 2 -- confirm in DriftExplainer.
drift_explainer.fit(clf, X1=X_valid, X2=X_prod, y1=y_valid, y2=y_prod)

In [None]:
# Prediction drift between the two fitted datasets: draw the plot first, then
# show the value returned by get_prediction_drift() as the cell output.
drift_explainer.plot_prediction_drift()
drift_explainer.get_prediction_drift()

In [None]:
# Target drift between the two fitted datasets: draw the plot first, then
# show the value returned by get_target_drift() as the cell output.
drift_explainer.plot_target_drift()
drift_explainer.get_target_drift()

In [None]:
# Unweighted log-loss of clf on the validation and production splits.
valid_loss = log_loss(y_valid, clf.predict_proba(X_valid))
prod_loss = log_loss(y_prod, clf.predict_proba(X_prod))
print(f'log_loss valid: {valid_loss}')
print(f'log_loss prod: {prod_loss}')

# Explanation of data drift

In [None]:
# Plot the per-feature contributions computed by the explainer.
# NOTE(review): type='node_size' presumably selects how contributions are
# measured -- see DriftExplainer.plot_feature_contribs for the exact meaning.
drift_explainer.plot_feature_contribs(type='node_size')

In [None]:
# Drift of the binary card6 feature, plotted as a discrete variable
# (as_discrete=True); the second call's return value is the cell output.
drift_explainer.plot_feature_drift('card6', as_discrete=True)
drift_explainer.get_feature_drift('card6')

In [None]:
# Drift of the C5 feature: plot, then show the value returned by get_feature_drift().
drift_explainer.plot_feature_drift('C5')
drift_explainer.get_feature_drift('C5')

In [None]:
# Drift of the TransactionAmt feature: plot, then show the value returned by get_feature_drift().
drift_explainer.plot_feature_drift('TransactionAmt')
drift_explainer.get_feature_drift('TransactionAmt')

In [None]:
# Feature importances reported by the fitted model, one row per training
# column, sorted from most to least important.
importances = pd.DataFrame(data=clf.feature_importances_, index=X_train.columns)
importances.sort_values(by=0, ascending=False)

# Correction of data drift

## Correction on validation dataset

In [None]:
# Weights computed with the adversarial method, using only 'card6' as the
# feature subset. With return_object=True the call returns two values: the
# weights and a second object bound to drift_corrector.
# Presumably one weight per row of the validation split (dataset 1 of
# drift_explainer) -- confirm in DriftExplainer.
# Other candidate features for the subset: 'C5', 'TransactionAmt', 'D1', 'C13'.
sample_weights_valid_adversarial, drift_corrector = drift_explainer.get_adversarial_correction_weights(
    feature_subset=['card6'], return_object=True) #  , 'C5', 'TransactionAmt', 'D1', 'C13'


### Study the drift with the new weights on validation data

In [None]:
# Second explainer on the same validation / production pair, this time
# passing the adversarial correction weights for dataset 1 (validation).
drift_explainer2 = DriftExplainer()
drift_explainer2.fit(clf, X1=X_valid, X2=X_prod, y1=y_valid, y2=y_prod,
                     sample_weights1=sample_weights_valid_adversarial)

In [None]:
# Prediction drift with the weighted validation data; the author notes it is
# much better than without the correction weights.
drift_explainer2.plot_prediction_drift()
drift_explainer2.get_prediction_drift()

In [None]:
# Target drift with the weighted validation data: plot, then show the value
# returned by get_target_drift() as the cell output.
drift_explainer2.plot_target_drift()
drift_explainer2.get_target_drift()

In [None]:
# The weighted validation loss now replicates much better what happens in
# production (the production loss itself stays unweighted).
weighted_valid_loss = log_loss(
    y_valid,
    clf.predict_proba(X_valid),
    sample_weight=sample_weights_valid_adversarial,
)
prod_loss = log_loss(y_prod, clf.predict_proba(X_prod))
print(f'log_loss valid: {weighted_valid_loss}')
print(f'log_loss prod: {prod_loss}')

## Correction on validation dataset and train dataset (in order to retrain the model)

In [None]:
# Third explainer: training split as dataset 1 versus production as dataset 2.
# No targets are passed here, only the features.
drift_explainer3 = DriftExplainer()
drift_explainer3.fit(clf, X1=X_train, X2=X_prod)

In [None]:
# Weights computed with the adversarial method from the train-vs-production
# explainer, again using only 'card6' as the feature subset.
# Note: drift_corrector is rebound here, replacing the object returned by the
# earlier validation-set correction.
# Other candidate features for the subset: 'C5', 'TransactionAmt', 'D1', 'C13'.
sample_weights_train_adversarial, drift_corrector = drift_explainer3.get_adversarial_correction_weights(
    feature_subset=['card6'], return_object=True) #, 'C5', 'TransactionAmt', 'D1', 'C13'


In [None]:
# Hyper-parameters of the classifier retrained with drift-correction weights.
# NOTE(review): max_depth is 5 here but 6 for the first model (clf) --
# confirm the difference is intentional.
clf2_params = dict(
    n_estimators=1000,  # upper bound; the fit cell uses early stopping
    booster="gbtree",
    objective="binary:logistic",
    learning_rate=0.2,
    max_depth=5,
    use_label_encoder=False,
    seed=seed,
)
clf2 = XGBClassifier(**clf2_params)

In [None]:
# Retrain with the drift-correction weights. The training weights are raised
# to the power 0.1 (the "gamma factor as in the book" per the original
# author), which compresses positive weights towards 1. The validation
# weights are passed unchanged for the evaluation set.
validation_set = [(X_valid, y_valid)]
clf2.fit(
    X=X_train,
    y=y_train,
    sample_weight=sample_weights_train_adversarial**0.1,
    eval_set=validation_set,
    sample_weight_eval_set=[sample_weights_valid_adversarial],
    early_stopping_rounds=20,
    verbose=10,
    eval_metric=['auc', 'logloss'],
)

In [None]:
# Log-loss of the retrained model: weighted on validation, unweighted on production.
weighted_valid_loss_clf2 = log_loss(
    y_valid,
    clf2.predict_proba(X_valid),
    sample_weight=sample_weights_valid_adversarial,
)
prod_loss_clf2 = log_loss(y_prod, clf2.predict_proba(X_prod))
print(f'log_loss valid: {weighted_valid_loss_clf2}')
print(f'log_loss prod: {prod_loss_clf2}')