In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from cinnamon.drift import OutputDriftDetector

# Iris Data

In [None]:
# Load the Iris dataset bundled with scikit-learn
dataset = datasets.load_iris()

In [None]:
# Keep the target array as-is and wrap the feature matrix in a DataFrame
# whose columns are labelled with the dataset's feature names.
y = dataset.target
X = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

In [None]:
# Preview the first rows of the feature DataFrame
X.head()

In [None]:
# This is a multiclass classification problem with 3 classes
np.unique(y, return_counts=True)

# Build XGBoost model

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=2021,
)

In [None]:
# Gradient-boosted tree classifier. n_estimators acts as an upper bound on the
# number of boosting rounds, since the fit call below requests early stopping.
clf = XGBClassifier(
    booster="gbtree",
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.05,
    seed=2021,
    use_label_encoder=False,
)

In [None]:
# Train while monitoring the held-out split: stop once the evaluation metric has
# not improved for 20 rounds, and log the metric every 10 rounds.
# NOTE(review): recent xgboost releases expect early_stopping_rounds on the
# estimator constructor rather than on fit() - confirm against the pinned version.
clf.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=20,
    verbose=10,
)

# Analyze data drift with OutputDriftDetector

The experiment uses a random train/test split, so we should not detect any data drift.

### prediction_type == "proba"

In [None]:
# Detector configured for a classification task with probability predictions
output_drift_detector = OutputDriftDetector(
    prediction_type='proba',
    task='classification',
)

In [None]:
# Compare predicted class probabilities on the train split (sample 1) with those
# on the test split (sample 2), together with the matching targets.
proba_train = clf.predict_proba(X_train)
proba_test = clf.predict_proba(X_test)
output_drift_detector.fit(proba_train, proba_test, y1=y_train, y2=y_test)

In [None]:
# no apparent drift in distributions of predicted probabilities
output_drift_detector.plot_prediction_drift()
output_drift_detector.get_prediction_drift()

In [None]:
# Plot the target drift, then display the value returned by get_target_drift()
output_drift_detector.plot_target_drift()
output_drift_detector.get_target_drift()

In [None]:
# Display the performance metrics drift reported by the fitted detector
output_drift_detector.get_performance_metrics_drift()

In [None]:
# Inspect the class names stored on the fitted detector
output_drift_detector.class_names

### prediction_type == "label"

In [None]:
# Detector configured for a classification task with hard label predictions
output_drift_detector2 = OutputDriftDetector(
    prediction_type='label',
    task='classification',
)

In [None]:
# Compare predicted labels on the train split (sample 1) with those on the test
# split (sample 2). Targets are passed by keyword (y1/y2), matching the other
# fit calls in this notebook, so they cannot be bound to the wrong parameter.
output_drift_detector2.fit(clf.predict(X_train), clf.predict(X_test),
                           y1=y_train, y2=y_test)

In [None]:
# Plot the prediction drift for predicted labels, then display the value
# returned by get_prediction_drift()
output_drift_detector2.plot_prediction_drift()
output_drift_detector2.get_prediction_drift()

In [None]:
# Plot the target drift, then display the value returned by get_target_drift()
output_drift_detector2.plot_target_drift()
output_drift_detector2.get_target_drift()

In [None]:
# Display the performance metrics drift reported by the fitted detector
output_drift_detector2.get_performance_metrics_drift()

### prediction_type == "raw"

In [None]:
# Detector configured for a classification task with raw (untransformed) predictions
output_drift_detector3 = OutputDriftDetector(
    prediction_type='raw',
    task='classification',
)

In [None]:
# Compare raw margin scores (output_margin=True) on the train split (sample 1)
# with those on the test split (sample 2), together with the matching targets.
margin_train = clf.predict(X_train, output_margin=True)
margin_test = clf.predict(X_test, output_margin=True)
output_drift_detector3.fit(margin_train, margin_test, y1=y_train, y2=y_test)

In [None]:
# no apparent drift in distributions of raw (margin) predictions
output_drift_detector3.plot_prediction_drift()
output_drift_detector3.get_prediction_drift()

In [None]:
# Plot the target drift, then display the value returned by get_target_drift()
output_drift_detector3.plot_target_drift()
output_drift_detector3.get_target_drift()

In [None]:
# Display the performance metrics drift reported by the fitted detector
output_drift_detector3.get_performance_metrics_drift()