In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
from xgboost import XGBClassifier

import cinnamon
from cinnamon.drift import ModelDriftExplainer, AdversarialDriftExplainer

# Display configuration: let pandas render up to 500 columns and 500 rows
# before truncating a displayed frame.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

# Seed shared by the train/test split and the XGBoost classifiers below.
seed = 2021

# IEEE fraud data

Download the data with the Kaggle CLI if it is set up on your computer:

In [None]:
#!kaggle competitions download -c ieee-fraud-detection

Otherwise, you can download the data from https://www.kaggle.com/c/ieee-fraud-detection/data (you will have to accept the competition rules first).

In [None]:
# Load the raw transaction table. The relative path assumes the competition
# files were extracted into a local `data/` directory.
df = pd.read_csv('data/train_transaction.csv')

In [None]:
# (number of rows, number of columns) of the raw table, before any filtering
print(df.shape)

# Preprocessing

In [None]:
# number of missing entries in each column (one value per column name)
missing_values = df.isnull().sum()
missing_values

In [None]:
# keep only the columns that have fewer than 10000 missing values
selected_columns = [col for col, n_missing in missing_values.items() if n_missing < 10000]

In [None]:
# restrict the frame to the retained columns, then discard every row that
# still contains at least one missing value (dropna defaults: rows, how='any')
df = df.loc[:, selected_columns].dropna()

In [None]:
# for the variable 'card6', keep only the rows whose modality is 'debit' or 'credit'
df = df[df['card6'].isin(['debit', 'credit'])].copy()

In [None]:
# preview the first rows of the cleaned table
df.head()

# Sampling

We replicate a typical production situation where we would have:
- training data
- test data
- production data

Also, we introduce some data drift on the variable `card6` by keeping only the transactions made with credit cards in the training and test data. In a real application, this would correspond to the case where we are not able to observe the fraud label (the target) for debit card transactions.

This data drift corresponds to a case of censoring. Generally it would correspond to concept drift.

In [None]:
# Feature selection: every column except the ones listed below is used as a
# model input. 'isFraud' is the target; the other excluded columns are
# presumably identifier / time / categorical fields, so that only numerical
# features remain (TODO confirm dtypes).
non_feature_columns = ['TransactionID', 'isFraud', 'TransactionDT',
                       'ProductCD', 'card4', 'card6']
features = [col for col in df.columns if col not in non_feature_columns]

In [None]:
# we do a time split (shuffle=False) to separate df_temp (training-test data)
# from df_prod (production data): the last 25% of the rows go to df_prod
# (assumes the rows of df are in chronological order — TODO confirm)
df_temp, df_prod = train_test_split(df.copy(), test_size=0.25, shuffle=False)

In [None]:
# the majority of transactions are made with debit cards
# (number of rows per 'card6' modality in the training-test data)
df_temp['card6'].value_counts()

In [None]:
# keep only the credit card transactions of the train-test data (all debit
# card rows are dropped), then do a time split (shuffle=False) to separate
# train data (first two thirds) from test data (last third)
is_credit = df_temp['card6'].values == 'credit'
X_train, X_test, y_train, y_test = train_test_split(
    df_temp.loc[is_credit, features].copy(),
    df_temp.loc[is_credit, 'isFraud'].values,
    test_size=1/3,
    shuffle=False,
    random_state=seed,
)

In [None]:
# production data: no filter on 'card6' is applied here, so both debit and
# credit card transactions are kept
X_prod = df_prod[features]
y_prod = df_prod['isFraud'].values

# Build model

In [None]:
# hyper-parameters of the gradient-boosted tree classifier; n_estimators is
# an upper bound because early stopping is requested when fitting
xgb_params = {
    'n_estimators': 1000,
    'booster': "gbtree",
    'objective': "binary:logistic",
    'learning_rate': 0.1,
    'max_depth': 6,
    'use_label_encoder': False,
    'seed': seed,
}
clf = XGBClassifier(**xgb_params)

In [None]:
# fit on the training set while monitoring AUC and log-loss on the test set;
# training stops early once the monitored score has not improved for 20 rounds,
# and the evaluation log is printed every 10 rounds
# NOTE(review): passing early_stopping_rounds / eval_metric to fit() targets
# the older xgboost API — confirm against the pinned xgboost version
clf.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=['auc', 'logloss'],
    early_stopping_rounds=20,
    verbose=10,
)

# Detection of data drift

We do detect a data drift in this case. Our three indicators:

- distribution of predictions
- distribution of target labels
- performance metrics

show a data drift

In [None]:
# wrap the fitted classifier in CinnaMon's model drift explainer
drift_explainer = ModelDriftExplainer(clf)

In [None]:
# dataset 1 is the test set, dataset 2 is the production set (features and labels)
drift_explainer.fit(X1=X_test, X2=X_prod, y1=y_test, y2=y_prod)

In [None]:
# indicator 1: distribution of the model predictions, test vs production
cinnamon.plot_prediction_drift(drift_explainer, figsize=(7, 5), bins=100)
drift_explainer.get_prediction_drift()

In [None]:
# indicator 2: distribution of the target label, test vs production
cinnamon.plot_target_drift(drift_explainer)
drift_explainer.get_target_drift()

In [None]:
# indicator 3: performance metrics on test and production data
# (each probability matrix is computed once and reused for both metrics)
proba_test = clf.predict_proba(X_test)
proba_prod = clf.predict_proba(X_prod)

print(f'log_loss test: {log_loss(y_test, proba_test)}')
print(f'log_loss prod: {log_loss(y_prod, proba_prod)}')

# column 1 holds the predicted probability of the positive class
print(f'AUC test: {roc_auc_score(y_test, proba_test[:, 1])}')
print(f'AUC prod: {roc_auc_score(y_prod, proba_prod[:, 1])}')

# Explain data drift

In [None]:
# plot drift values in order to identify the features that have the highest impact on data drift
# (drift importances are computed here with the 'node_size' method)
cinnamon.plot_tree_based_drift_importances(drift_explainer, type='node_size')

In [None]:
# feature with the highest drift value: 'D1'
# (distribution in test vs production data, followed by the drift measures)
cinnamon.plot_feature_drift(drift_explainer, 'D1', bins=100)
drift_explainer.get_feature_drift('D1')

In [None]:
# drift of feature 'C13' between test and production data
cinnamon.plot_feature_drift(drift_explainer, 'C13', bins=100)
drift_explainer.get_feature_drift('C13')

In [None]:
# drift of feature 'C2' between test and production data
cinnamon.plot_feature_drift(drift_explainer, 'C2', bins=100)
drift_explainer.get_feature_drift('C2')

In [None]:
# drift of feature 'TransactionAmt' between test and production data
cinnamon.plot_feature_drift(drift_explainer, 'TransactionAmt', bins=100)
drift_explainer.get_feature_drift('TransactionAmt')

In [None]:
# feature importance of the model, one row per feature, most important first
(
    pd.DataFrame(data=clf.feature_importances_, index=X_train.columns)
    .sort_values(by=0, ascending=False)
)

# Correction of data drift

## Correction on test dataset

We apply our methodology which uses adversarial learning to correct data drift between test and prod data.

We then check our three indicators of data drift in order to see if we get improvement.

In [None]:
# Weights computed with the adversarial method, so that the re-weighted test
# set looks more like the production set.
# feature_subset=['D1', 'C13']: only the two top-ranked features in terms of
# drift value are used; 'C2' and 'TransactionAmt' (next in the ranking) are
# deliberately left out.
# The notebook-level `seed` is reused instead of a duplicated literal, and
# max_ratio=10 is passed to the weight computation (presumably a cap on the
# spread of the weights — see the CinnaMon documentation).
sample_weights_test_adversarial = (
    AdversarialDriftExplainer(feature_subset=['D1', 'C13'], seed=seed)
    .fit(X_test, X_prod)
    .get_adversarial_correction_weights(max_ratio=10)
)

In [None]:
# same drift analysis between test and production data, but the test samples
# are now re-weighted with the adversarial correction weights
drift_explainer2 = ModelDriftExplainer(clf).fit(X1=X_test, X2=X_prod, y1=y_test, y2=y_prod,
                                                sample_weights1=sample_weights_test_adversarial)

In [None]:
# the drift on the distribution of predictions is lowered thanks to our technique
cinnamon.plot_prediction_drift(drift_explainer2, bins=100)
drift_explainer2.get_prediction_drift()

In [None]:
# the target distribution is also re-balanced in the right direction
cinnamon.plot_target_drift(drift_explainer2)
drift_explainer2.get_target_drift()

In [None]:
# valid loss is closer to prod loss, but there is still a difference
# ("valid" below refers to the test set re-weighted with the adversarial weights)
proba_test = clf.predict_proba(X_test)
proba_prod = clf.predict_proba(X_prod)

print(f'log_loss valid: {log_loss(y_test, proba_test, sample_weight=sample_weights_test_adversarial)}')
print(f'log_loss prod: {log_loss(y_prod, proba_prod)}')

print(f'AUC valid: {roc_auc_score(y_test, proba_test[:, 1], sample_weight=sample_weights_test_adversarial)}')
print(f'AUC prod: {roc_auc_score(y_prod, proba_prod[:, 1])}')

## Correction on test dataset and train dataset (in order to retrain the model)

We apply the same adversarial strategy on training data.

With the model retrained on samples re-weighted with these new weights, we observe no obvious improvement in model performance on production data. This needs to be further investigated.

In [None]:
# Weights computed with the adversarial method on training data, so that the
# re-weighted training set looks more like the production set.
# Only 'D1' and 'C13' are used as the feature subset; 'C2' and 'TransactionAmt'
# are deliberately left out.
# The notebook-level `seed` is reused instead of a duplicated literal.
sample_weights_train_adversarial = (
    AdversarialDriftExplainer(feature_subset=['D1', 'C13'], seed=seed)
    .fit(X_train, X_prod)
    .get_adversarial_correction_weights(max_ratio=10)
)

In [None]:
# second classifier, built with the same hyper-parameters as the first one,
# to be trained on the re-weighted samples
xgb_params_reweighted = {
    'n_estimators': 1000,
    'booster': "gbtree",
    'objective': "binary:logistic",
    'learning_rate': 0.1,
    'max_depth': 6,
    'use_label_encoder': False,
    'seed': seed,
}
clf2 = XGBClassifier(**xgb_params_reweighted)

In [None]:
# train a new classifier with the reweighted samples
# a power factor of 0.3 is applied to the training weights to smooth them;
# the evaluation set (test data) uses the unsmoothed test adversarial weights
smoothed_train_weights = sample_weights_train_adversarial**0.3
clf2.fit(
    X=X_train,
    y=y_train,
    sample_weight=smoothed_train_weights,
    eval_set=[(X_test, y_test)],
    sample_weight_eval_set=[sample_weights_test_adversarial],
    eval_metric=['auc', 'logloss'],
    early_stopping_rounds=20,
    verbose=10,
)

In [None]:
# with the reweighting, we see a small improvement in performance on production data,
# but is it significant?
proba_test_reweighted = clf2.predict_proba(X_test)
proba_prod_reweighted = clf2.predict_proba(X_prod)

print(f'log_loss test: {log_loss(y_test, proba_test_reweighted, sample_weight=sample_weights_test_adversarial)}')
print(f'log_loss prod: {log_loss(y_prod, proba_prod_reweighted)}')

print(f'AUC test: {roc_auc_score(y_test, proba_test_reweighted[:, 1], sample_weight=sample_weights_test_adversarial)}')
print(f'AUC prod: {roc_auc_score(y_prod, proba_prod_reweighted[:, 1])}')