The data is split at random into training and validation sets, so no data drift should be detected in this experiment.

In [None]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell runs, so edits to the local `src` package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import xgboost as xgb
import os
import sys

# TODO: install the package (e.g. via setup.py) instead of patching sys.path here.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

from src.drift.drift_explainer import DriftExplainer

# Breast Cancer Data

In [None]:
# Load scikit-learn's bundled breast cancer dataset. The returned object's
# `data`, `feature_names` and `target` attributes are used to build X and y.
dataset = datasets.load_breast_cancer()

In [None]:
# Feature table with named columns, plus the label array exactly as shipped
# with the dataset.
X, y = (
    pd.DataFrame(data=dataset.data, columns=dataset.feature_names),
    dataset.target,
)

In [None]:
# Preview the first rows of the feature table as a quick sanity check.
X.head()

# Build XGBoost model

In [None]:
# Hold out 30% of the rows for validation. The split is seeded so the drift
# figures reported below are reproducible from one run to the next; without a
# fixed random_state every execution would compare different samples.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=2021
)

In [None]:
# Gradient-boosted tree classifier for the binary target.
# n_estimators acts as an upper bound on the number of trees when training
# is run with early stopping.
clf = XGBClassifier(
    n_estimators=1000,
    booster="gbtree",
    objective="binary:logistic",
    learning_rate=0.05,
    max_depth=6,
    use_label_encoder=False,
    seed=2021,
)

In [None]:
# Train on the training split and monitor the validation split; training stops
# once the validation metric has not improved for 20 rounds, and progress is
# printed every 10 rounds.
# NOTE(review): recent XGBoost releases expect early_stopping_rounds on the
# estimator constructor rather than on fit() - confirm against the installed
# version before upgrading.
clf.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=20,
    verbose=10,
)

In [None]:
#clf.save_model('models/xgboost_binary_classif_breast_cancer')

In [None]:
#clf = XGBClassifier()
#clf.load_model('models/xgboost_binary_classif_breast_cancer')

# Analyze data drift

In [None]:
# Create the drift explainer; it is fitted on the two splits in the next cell.
drift_explainer = DriftExplainer()

In [None]:
# Dataset 1 is the training split and dataset 2 the validation split. Both
# come from the same random split of one dataset.
drift_explainer.fit(clf, X1=X_train, X2=X_valid, y1=y_train, y2=y_valid)

In [None]:
# Prediction drift with the explainer's default prediction type (none given).
drift_explainer.plot_prediction_drift()

In [None]:
# Presumably the numeric counterpart of the plot above - confirm against
# DriftExplainer.
drift_explainer.get_prediction_drift()

In [None]:
# Same comparison with prediction_type='proba' (presumably the predicted
# probabilities - confirm against DriftExplainer).
drift_explainer.plot_prediction_drift(prediction_type='proba')

In [None]:
drift_explainer.get_prediction_drift(prediction_type='proba')

In [None]:
# Per-feature contributions, 'size_diff' variant.
# NOTE(review): the meaning of 'size_diff' vs 'mean_diff' is defined by
# DriftExplainer and is not visible in this notebook.
drift_explainer.plot_feature_contribs(type='size_diff')

In [None]:
drift_explainer.get_feature_contribs(type='size_diff')

In [None]:
# Per-feature contributions, 'mean_diff' variant.
drift_explainer.plot_feature_contribs(type='mean_diff')

In [None]:
drift_explainer.get_feature_contribs(type='mean_diff')

In [None]:
# Drift of the target between the two datasets (y1 vs y2 passed to fit).
drift_explainer.plot_target_drift()

In [None]:
drift_explainer.get_target_drift()

In [None]:
# The argument 0 presumably selects the first feature by position - confirm
# against DriftExplainer.plot_feature_drift.
drift_explainer.plot_feature_drift(0)

In [None]:
# Presumably returns drift results for every feature - confirm.
drift_explainer.get_feature_drifts()

# With base XGBoost API

In [None]:
# Fresh 70/30 split for the base-API run. It is seeded so this section gives
# the same train/validation samples, and therefore the same drift figures,
# on every execution.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=2021
)

In [None]:
# Wrap each split in XGBoost's native DMatrix container (features + labels),
# as required by xgb.train.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

In [None]:
# Booster parameters for the native xgb.train API.
param = {
    "max_depth": 6,
    "learning_rate": 0.05,
    "objective": "binary:logistic",
    "booster": "gbtree",
    "seed": 2021,
}

In [None]:
# Train a native Booster for up to 1000 rounds. Both splits are evaluated each
# round and progress is printed every 10 rounds; training stops after 20
# rounds without improvement.
bst = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    early_stopping_rounds=20,
    verbose_eval=10,
)

In [None]:
# Fit a fresh explainer on the natively trained Booster: dataset 1 is the
# training split, dataset 2 the validation split.
drift_explainer = DriftExplainer()
drift_explainer.fit(
    bst,
    X1=X_train,
    X2=X_valid,
    y1=y_train,
    y2=y_valid,
)