In [None]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import xgboost as xgb

from cinnamon.drift import ModelDriftExplainer

# Breast Cancer Data

In [None]:
# Load scikit-learn's bundled breast cancer dataset; the following cells read
# its `data`, `feature_names` and `target` attributes.
dataset = datasets.load_breast_cancer()

In [None]:
# Label vector and feature matrix; the features are wrapped in a DataFrame so
# every column carries its feature name.
y = dataset.target
X = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

In [None]:
# Preview the first rows of the feature frame.
X.head()

# Build XGBoost model

In [None]:
# Random 70/30 train/test split with a fixed seed so the notebook is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2021,
)

In [None]:
# Gradient-boosted tree classifier for the binary target.
# n_estimators is only an upper bound: the fit call uses early stopping.
# The seed is passed through the scikit-learn wrapper's documented
# `random_state` parameter (rather than the native `seed` booster kwarg), which
# is consistent with the `random_state` used for train_test_split.
clf = XGBClassifier(
    n_estimators=1000,
    booster="gbtree",
    objective="binary:logistic",
    learning_rate=0.05,
    max_depth=6,
    use_label_encoder=False,
    random_state=2021,
)

In [None]:
# Train with early stopping monitored on the held-out split; the evaluation
# metric is logged every 10 boosting rounds.
# NOTE(review): recent xgboost releases appear to expect `early_stopping_rounds`
# on the XGBClassifier constructor instead of fit() - confirm against the
# xgboost version this notebook is pinned to.
clf.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=20,
    verbose=10,
)

# Analyze data drift with ModelDriftExplainer

The experiment uses a random train/test split, so we should not detect any data drift.

In [None]:
# Wrap the trained classifier in a ModelDriftExplainer.
drift_explainer = ModelDriftExplainer(clf)

In [None]:
# Fit the explainer on the two samples: the training split is passed as
# dataset 1 (X1, y1) and the test split as dataset 2 (X2, y2).
drift_explainer.fit(X1=X_train, X2=X_test, y1=y_train, y2=y_test)

In [None]:
# Prediction drift with the default prediction type (logit predictions):
# no apparent drift in the distributions, as expected for a random split.
# The plot is drawn first; the `get_` call is the last expression, so its
# return value is shown as the cell output.
drift_explainer.plot_prediction_drift()
drift_explainer.get_prediction_drift()

In [None]:
# Same check on the distributions of predicted probabilities
# (prediction_type='proba'): no apparent drift either.
drift_explainer.plot_prediction_drift(prediction_type='proba')
drift_explainer.get_prediction_drift(prediction_type='proba')

In [None]:
# Same check on the distributions of predicted classes
# (prediction_type='class'): no apparent drift either.
drift_explainer.plot_prediction_drift(prediction_type='class')
drift_explainer.get_prediction_drift(prediction_type='class')

In [None]:
# Target drift: no apparent drift in the distributions of the target labels
# (y1 vs. y2 as passed to `fit`).
drift_explainer.plot_target_drift()
drift_explainer.get_target_drift()

In [None]:
# Drift of the model's performance metrics between the two samples
# (presumably computed from the labels given to `fit` - confirm in the
# cinnamon documentation).
drift_explainer.get_performance_metrics_drift()

In [None]:
# Plot the drift values computed with the tree-based approach, using
# type='node_size'. This is for demonstration purposes only, since there is no
# data drift to explain in this experiment.
drift_explainer.plot_tree_based_drift_values(type='node_size')

In [None]:
# Drift values from the tree-based approach with type='node_size', returned
# as the cell output.
drift_explainer.get_tree_based_drift_values(type='node_size')

In [None]:
# Plot the tree-based drift values again, this time with type='mean_norm'.
drift_explainer.plot_tree_based_drift_values(type='mean_norm')

In [None]:
# Drift values from the tree-based approach with type='mean_norm', returned
# as the cell output.
drift_explainer.get_tree_based_drift_values(type='mean_norm')

In [None]:
# Plot, then return, the drift values computed with the model-agnostic
# approach (type='mean'). This is for demonstration purposes only, since there
# is no data drift to explain in this experiment.
drift_explainer.plot_model_agnostic_drift_values(type='mean')
drift_explainer.get_model_agnostic_drift_values(type='mean')

In [None]:
# Drift of a single feature, selected by its column name.
drift_explainer.plot_feature_drift('mean perimeter')
drift_explainer.get_feature_drift('mean perimeter')

In [None]:
# Drift of a single feature, this time selected with an integer
# (presumably the column position in X - confirm in the cinnamon documentation).
drift_explainer.plot_feature_drift(4)
drift_explainer.get_feature_drift(4)

In [None]:
# Drift results for all features at once, returned as the cell output.
drift_explainer.get_feature_drifts()