In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
from xgboost import XGBClassifier

import cinnamon
from cinnamon.drift import ModelDriftExplainer

# pandas display limits: allow up to 500 columns and 500 rows in rendered output
display_limits = {'display.max_columns': 500, 'display.max_rows': 500}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

# random seed reused by the data splits and the model below
seed = 2021

# IEEE fraud data

Download the data with the Kaggle CLI if it is set up on your computer:

In [None]:
#!kaggle competitions download -c ieee-fraud-detection

Otherwise, you can download the data from https://www.kaggle.com/c/ieee-fraud-detection/data (you will have to accept the competition rules).

In [None]:
# load the transaction table from data/train_transaction.csv
# (path is relative to the notebook's working directory)
df = pd.read_csv('data/train_transaction.csv')

In [None]:
# (rows, columns) of the raw table, before any preprocessing
print(df.shape)

# Preprocessing

In [None]:
# number of missing (NaN) entries in each column of df
missing_values = df.isna().sum()
missing_values

In [None]:
# keep only the columns that have fewer than 10000 missing values
max_missing = 10000
selected_columns = missing_values.index[missing_values < max_missing].tolist()

In [None]:
# restrict df to the selected columns, then discard every row
# that still contains at least one missing value
df = df.loc[:, selected_columns].dropna()

In [None]:
# for the variable 'card6', keep only the rows whose value is 'debit' or 'credit',
# then encode the column as an integer: 1 for 'credit', 0 for 'debit'
is_debit_or_credit = df['card6'].isin(['debit', 'credit'])
df = df[is_debit_or_credit].copy()
df['card6'] = df['card6'].eq('credit') * 1

In [None]:
# (rows, columns) after preprocessing, followed by a preview of the first rows
print(df.shape)
df.head()

# Sampling

We replicate a typical production situation where we would have:
- training data
- test data
- production data

In [None]:
# feature list: every remaining column of df except the target ('isFraud') and the
# other columns named below, so that only numerical features are kept
excluded_columns = {'TransactionID', 'isFraud', 'TransactionDT', 'ProductCD', 'card4'}
features = [col for col in df.columns if col not in excluded_columns]

In [None]:
# time split (shuffle=False) to separate df_temp (train + test data)
# from df_prod (production data, 25% of the rows)
# NOTE(review): this is a time split only if the rows of df are in chronological order - confirm
df_temp, df_prod = train_test_split(
    df,
    test_size=0.25,
    shuffle=False,
    random_state=seed,
)

In [None]:
# second time split (shuffle=False), this time inside df_temp:
# training data versus test data (one third of df_temp), with 'isFraud' as the target
X_train, X_test, y_train, y_test = train_test_split(
    df_temp[features],
    df_temp['isFraud'].values,
    test_size=1/3,
    shuffle=False,
    random_state=seed,
)

In [None]:
# production features and target, built the same way as the train / test ones
X_prod = df_prod[features]
y_prod = df_prod['isFraud'].values

# Build model

In [None]:
# Gradient-boosted tree classifier with a binary logistic objective.
# The seed is passed through `random_state`, the documented scikit-learn-API
# parameter of XGBClassifier, instead of the `seed` keyword argument.
clf = XGBClassifier(n_estimators=1000,
                    booster="gbtree",
                    objective="binary:logistic",
                    learning_rate=0.2,
                    max_depth=6,
                    use_label_encoder=False,
                    random_state=seed)

In [None]:
# fit on the training split; (X_test, y_test) is the evaluation set, evaluated with
# 'auc' and 'logloss', and early stopping is enabled with early_stopping_rounds=20
# NOTE(review): passing early_stopping_rounds / eval_metric to fit() matches the
# xgboost 1.x API - confirm against the xgboost version in use
clf.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=['auc', 'logloss'],
    early_stopping_rounds=20,
    verbose=10,
)

# Detection of data drift

We do not detect any data drift in the example below.

In [None]:
# wrap the fitted classifier in a ModelDriftExplainer and fit it with the
# test split as dataset 1 and the production split as dataset 2
drift_explainer = ModelDriftExplainer(clf)
drift_explainer.fit(
    X1=X_test,
    y1=y_test,
    X2=X_prod,
    y2=y_prod,
)

In [None]:
# plot the prediction drift between the two datasets given to drift_explainer.fit
# (bins=100), then show the value returned by get_prediction_drift() as the cell output
# observation: no apparent drift in distribution of predictions
cinnamon.plot_prediction_drift(drift_explainer, bins=100)
drift_explainer.get_prediction_drift()

In [None]:
# plot the target drift between the two datasets given to drift_explainer.fit,
# then show the value returned by get_target_drift() as the cell output
# observation: no apparent drift in distribution of target labels
cinnamon.plot_target_drift(drift_explainer)
drift_explainer.get_target_drift()

In [None]:
# compare the log loss of the classifier on the test split and on the production split
# observation: no apparent drift in performance metrics
test_log_loss = log_loss(y_test, clf.predict_proba(X_test))
print(f'log_loss test: {test_log_loss}')
prod_log_loss = log_loss(y_prod, clf.predict_proba(X_prod))
print(f'log_loss prod: {prod_log_loss}')

In [None]:
# data drift of feature 0 (first column), addressed here by its position:
# plot it with bins=100, then show the value returned by get_feature_drift(0)
cinnamon.plot_feature_drift(drift_explainer, 0, bins=100)
drift_explainer.get_feature_drift(0)

In [None]:
# data drift of the feature 'card6', addressed here by its name and plotted with
# as_discrete=True, then show the value returned by get_feature_drift('card6')
cinnamon.plot_feature_drift(drift_explainer, 'card6', as_discrete=True)
drift_explainer.get_feature_drift('card6')