In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
from xgboost import XGBClassifier

from cinnamon.drift import ModelDriftExplainer, AdversarialDriftExplainer

# config
# raise the pandas display limits (columns and rows) to 500
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

# single seed reused for the sampling and the model training
seed = 2021

# IEEE fraud data

Download the data with the Kaggle CLI if it is set up on your computer:

In [None]:
# uncomment the next line to download the competition data with the kaggle CLI
#!kaggle competitions download -c ieee-fraud-detection

Otherwise, you can download the data here: https://www.kaggle.com/c/ieee-fraud-detection/data (you will have to accept the competition rules).

In [None]:
# load the training transactions table from the local data folder
df = pd.read_csv(filepath_or_buffer='data/train_transaction.csv')

In [None]:
# (number of rows, number of columns) of the loaded table
print((len(df.index), len(df.columns)))

# Preprocessing

In [None]:
# number of missing values in each column
missing_values = df.isna().sum()
missing_values

In [None]:
# keep only the columns with fewer than 10000 missing values
selected_columns = missing_values[missing_values < 10000].index.tolist()

In [None]:
# restrict to the selected columns, then discard every row that still has a missing value
df = df.loc[:, selected_columns].dropna()

In [None]:
# for the variable 'card6', keep only the rows whose modality is 'debit' or 'credit',
# then encode it as an integer: 1 for 'credit', 0 for 'debit'
df = df[df['card6'].isin(['debit', 'credit'])].copy()
df['card6'] = df['card6'].eq('credit').astype(int)

In [None]:
# preview the first rows of the preprocessed table
df.head(n=5)

# Sampling

We replicate a typical production situation where we would have:
- training data
- test data
- production data

Also, we introduce some data drift on the variable `card6` by using downsampling. This data drift corresponds to covariate shift.

In [None]:
# select features by keeping only numerical features:
# identifier, target, time stamp and the two string columns are removed from the column list
features = df.columns.drop(
    ['TransactionID', 'isFraud', 'TransactionDT', 'ProductCD', 'card4'],
    errors='ignore',
).tolist()

In [None]:
# time split (shuffle=False): the first 75% of the rows go to df_temp (train-test data),
# the last 25% go to df_prod (production data)
df_temp, df_prod = train_test_split(
    df.copy(),
    test_size=0.25,
    shuffle=False,
    random_state=seed,
)

In [None]:
# distribution of 'card6' in df_temp before downsampling
df_temp.loc[:, 'card6'].value_counts()

In [None]:
# in df_temp, we downsample the modality '0' to introduce covariate shift
# (distribution before and after sampling are given in cell above and below)
np.random.seed(seed)
# one integer draw in [0, 9) per row: a row with card6 == 0 is kept only when its draw is 0
random_draws = np.random.randint(low=0, high=9, size=len(df_temp))
keep_row = (random_draws == 0) | (df_temp['card6'].to_numpy() == 1)
df_temp = df_temp.loc[keep_row, :]

In [None]:
# distribution of 'card6' in df_temp after downsampling
df_temp.loc[:, 'card6'].value_counts()

In [None]:
# time split (shuffle=False) of df_temp to separate training data from test data:
# the last third of the rows is used as test data
X_train, X_test, y_train, y_test = train_test_split(
    df_temp.loc[:, features].copy(),
    df_temp['isFraud'].to_numpy(),
    test_size=1/3,
    shuffle=False,
    random_state=seed,
)

In [None]:
# features and target labels of the production data
X_prod = df_prod[features]
y_prod = df_prod['isFraud'].values

# Build model

In [None]:
# gradient boosted trees for the binary fraud target
clf = XGBClassifier(
    objective="binary:logistic",
    booster="gbtree",
    n_estimators=1000,
    learning_rate=0.2,
    max_depth=6,
    use_label_encoder=False,
    seed=seed,
)

In [None]:
# train on the training data; the test data is passed as evaluation set
# (metrics: auc and logloss, early_stopping_rounds=20, log printed every 10 rounds)
clf.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=['auc', 'logloss'],
    early_stopping_rounds=20,
    verbose=10,
)

# Detection of data drift

We do detect a data drift in this case. Our three indicators:
- distribution of predictions
- distribution of target labels
- performance metrics

show a data drift

In [None]:
# build a model drift explainer around the trained classifier, then fit it
# with the test data as first dataset and the prod data as second dataset
drift_explainer = ModelDriftExplainer(clf)
drift_explainer.fit(
    X1=X_test,
    X2=X_prod,
    y1=y_test,
    y2=y_prod,
)

In [None]:
# first indicator: drift of the model predictions between test and prod data
# (plot with 100 bins, then the values returned by get_prediction_drift)
drift_explainer.plot_prediction_drift(bins=100)
drift_explainer.get_prediction_drift()

In [None]:
# second indicator: drift of the target labels between test and prod data
drift_explainer.plot_target_drift()
drift_explainer.get_target_drift()

In [None]:
# third indicator: log loss of the model on test data and on prod data
test_log_loss = log_loss(y_test, clf.predict_proba(X_test))
print(f'log_loss test: {test_log_loss}')
prod_log_loss = log_loss(y_prod, clf.predict_proba(X_prod))
print(f'log_loss prod: {prod_log_loss}')

# Explanation of data drift

In [None]:
# plot drift values in order to identify the features that have the highest impact on data drift
drift_explainer.plot_tree_based_drift_values(type='node_size')

In [None]:
# the feature with the first drift value is 'card6': the one we voluntarily added drift to.
# 'card6' only takes the values 0 and 1, hence as_discrete=True for the plot
drift_explainer.plot_feature_drift('card6', as_discrete=True)
drift_explainer.get_feature_drift('card6')

In [None]:
# feature importance of the model, from the most to the least important feature
# interestingly, 'card6' feature is quite far in the list of important features in the model
(
    pd.DataFrame(data=clf.feature_importances_, index=X_train.columns)
    .sort_values(by=0, ascending=False)
)

In [None]:
# drift of the feature 'C5' between test and prod data
drift_explainer.plot_feature_drift('C5')
drift_explainer.get_feature_drift('C5')

In [None]:
# drift of the feature 'TransactionAmt' between test and prod data
drift_explainer.plot_feature_drift('TransactionAmt')
drift_explainer.get_feature_drift('TransactionAmt')

# Correction of data drift

## Correction on test dataset

We apply our methodology which uses adversarial learning to correct data drift between test and prod data.

We then check our three indicators of data drift in order to see if we get an improvement.

In [None]:
# weights computed with the adversarial method (test data vs prod data)
# feature_subset=['card6']: only the first feature in terms of drift value is selected here
# seed=seed: reuse the notebook-level seed rather than repeating its literal value,
# so that changing the seed in the config cell is applied here as well
sample_weights_test_adversarial = (AdversarialDriftExplainer(feature_subset=['card6'], seed=seed)
                                    .fit(X_test, X_prod)
                                    .get_adversarial_correction_weights(max_ratio=10))

In [None]:
# study the drift between test and prod data again, this time with the
# adversarial weights applied to the test samples (first dataset)
drift_explainer2 = ModelDriftExplainer(clf)
drift_explainer2.fit(
    X1=X_test,
    X2=X_prod,
    y1=y_test,
    y2=y_prod,
    sample_weights1=sample_weights_test_adversarial,
)

In [None]:
# with the new weights on test data, the distributions of predictions are much closer
drift_explainer2.plot_prediction_drift(bins=100)
drift_explainer2.get_prediction_drift()

In [None]:
# the distribution of the target is also corrected
drift_explainer2.plot_target_drift()
drift_explainer2.get_target_drift()

In [None]:
# we now replicate much better what happens in production (also in terms of log loss):
# the test log loss is computed with the adversarial weights, the prod log loss without weights
weighted_test_log_loss = log_loss(
    y_test,
    clf.predict_proba(X_test),
    sample_weight=sample_weights_test_adversarial,
)
print(f'log_loss test: {weighted_test_log_loss}')
prod_log_loss = log_loss(y_prod, clf.predict_proba(X_prod))
print(f'log_loss prod: {prod_log_loss}')

## Correction on validation dataset and train dataset (in order to retrain the model)

We apply the same adversarial strategy on training data.

With the new weights, we observe that the model trained on re-weighted data does not seem to perform better than the model trained on non-weighted data.

In [None]:
# weights computed with the adversarial method on training data (training data vs prod data)
# feature_subset=['card6']: the adversarial correction only uses the 'card6' feature
# seed=seed: reuse the notebook-level seed rather than repeating its literal value,
# so that changing the seed in the config cell is applied here as well
sample_weights_train_adversarial = (AdversarialDriftExplainer(feature_subset=['card6'], seed=seed, verbosity=False)
                                    .fit(X_train, X_prod)
                                    .get_adversarial_correction_weights(max_ratio=10))

In [None]:
# second classifier, to be trained on the re-weighted samples.
# Its hyperparameters are the same as those of the first classifier `clf`
# (in particular max_depth=6), so that the comparison between the two models
# only reflects the effect of the sample weights.
clf2 = XGBClassifier(n_estimators=1000,
                     booster="gbtree",
                     objective="binary:logistic",
                     learning_rate=0.2,
                     max_depth=6,
                     use_label_encoder=False,
                     seed=seed)

In [None]:
# train a new classifier with the reweighted samples:
# adversarial weights are applied to the training rows and to the evaluation (test) rows
clf2.fit(
    X=X_train,
    y=y_train,
    sample_weight=sample_weights_train_adversarial,
    eval_set=[(X_test, y_test)],
    sample_weight_eval_set=[sample_weights_test_adversarial],
    eval_metric=['auc', 'logloss'],
    early_stopping_rounds=20,
    verbose=10,
)

In [None]:
# we see no improvement on prod loss when we train with both train and test datasets reweighted
# (test log loss computed with the adversarial weights, prod log loss without weights)
weighted_test_log_loss_clf2 = log_loss(
    y_test,
    clf2.predict_proba(X_test),
    sample_weight=sample_weights_test_adversarial,
)
print(f'log_loss test: {weighted_test_log_loss_clf2}')
prod_log_loss_clf2 = log_loss(y_prod, clf2.predict_proba(X_prod))
print(f'log_loss prod: {prod_log_loss_clf2}')