In [None]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_openml
from sklearn.compose import make_column_selector as selector, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

import cinnamon
from cinnamon.drift import ModelDriftExplainer

# Seed passed as `random_state` to the train/test split so the split is reproducible.
RANDOM_SEED = 2021

# Housing Data

In [None]:
# Optional workaround, intentionally left commented out: when enabled it replaces
# the default HTTPS context with an unverified one (certificate checks are skipped),
# so only uncomment it if the dataset download fails with an SSL certificate error.
#import os, ssl
#if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
#    getattr(ssl, '_create_unverified_context', None)):
#    ssl._create_default_https_context = ssl._create_unverified_context

If you run into an SSL certificate error when executing the next cell (the `fetch_openml` download), uncommenting and running the cell above should solve the problem.
See https://github.com/scikit-learn/scikit-learn/issues/10201#issuecomment-365734582

In [None]:
# Fetch the OpenML dataset named "house_prices"; as_frame=True requests pandas objects.
ames_housing = fetch_openml(name="house_prices", as_frame=True)

In [None]:
# Build the working feature table from the fetched data, using its feature names as columns.
ames_housing_df = pd.DataFrame(
    data=ames_housing.data,
    columns=ames_housing.feature_names,
)

In [None]:
# Print the (n_rows, n_columns) shape, then display the first rows as the cell output.
print(ames_housing_df.shape)
ames_housing_df.head()

In [None]:
# Inspect the dtype of every column (object-dtype columns are the ones treated as categorical later on).
ames_housing_df.dtypes

In [None]:
# Identify the columns that contain at least one missing value; they are dropped in the next cell.
dropped_columns = ames_housing_df.columns[ames_housing_df.isna().any(axis=0)]
dropped_columns

In [None]:
# Remove the columns listed in `dropped_columns` (those containing missing values).
# The name is rebound instead of using `inplace=True`, and errors="ignore" is set,
# so that re-running this cell no longer raises KeyError for columns that were
# already removed by a previous execution.
ames_housing_df = ames_housing_df.drop(columns=dropped_columns, errors="ignore")

In [None]:
# Number of distinct non-null values in each remaining column.
# `DataFrame.nunique` is the built-in equivalent of counting the entries of
# `value_counts()` per column, without a Python-level lambda for every column.
ames_housing_df.nunique(axis=0)

In [None]:
# Select the categorical (object-dtype) columns of the feature table.
# The one-hot-encoding preprocessor itself is built in the pipeline section,
# right before it is used, so it is not constructed a first time here.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(ames_housing_df)

In [None]:
# Positional index of every categorical column within the feature table,
# in the same order as `categorical_columns`.
cat_feature_indices = list(map(ames_housing_df.columns.to_list().index, categorical_columns))

# Build linear model pipeline 

In [None]:
# Hold out 30% of the rows as the test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    ames_housing_df,
    ames_housing.target,
    test_size=0.3,
    random_state=RANDOM_SEED,
)

In [None]:
# One-hot encode the categorical columns selected earlier (`categorical_columns`)
# and forward every other column to the model unchanged.
# ColumnTransformer defaults to remainder="drop": without the explicit
# remainder="passthrough" below, all non-categorical (numeric) features were
# silently discarded and the regression was fitted on the one-hot columns only.
# NOTE(review): passthrough also forwards identifier-like numeric columns, if the
# data contains any - confirm whether such columns should be dropped beforehand.
preprocessor = ColumnTransformer(
    [
        ('one-hot-encoder', OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ],
    remainder="passthrough",
)

# Chain the preprocessing step with an ordinary least-squares regressor.
pipe = make_pipeline(preprocessor, LinearRegression())

In [None]:
# Train the preprocessing + regression pipeline on the training split.
pipe.fit(X_train, y_train)

# ModelDriftExplainer

The experiment uses a random train/test split, so we should not detect any data drift.

In [None]:
# CinnaMon supports a regression pipeline as long as its `pipe.predict` method is implemented.
drift_explainer = ModelDriftExplainer(pipe, task='regression')

In [None]:
# Fit the drift explainer on the training split versus the test split.
# `cat_feature_indices` was already computed right after the categorical columns
# were selected, and the feature columns have not changed since, so it is reused
# here instead of being rebuilt with a copy-pasted expression.
drift_explainer.fit(X_train, X_test, y_train, y_test, cat_feature_indices=cat_feature_indices)

In [None]:
# No apparent drift between the distributions of predictions.
# The plot is drawn first; the value returned by get_prediction_drift() is the cell output.
cinnamon.plot_prediction_drift(drift_explainer)
drift_explainer.get_prediction_drift()

In [None]:
# No apparent drift between the distributions of target labels.
# The plot is drawn first; the value returned by get_target_drift() is the cell output.
cinnamon.plot_target_drift(drift_explainer)
drift_explainer.get_target_drift()

In [None]:
# The performance metrics differ between the two datasets, which could be read as
# data drift; in fact the gap comes from comparing the training set (which the model
# was fitted on) with the test set.
drift_explainer.get_performance_metrics_drift()

In [None]:
# Plot the drift importances computed with the model-agnostic approach, using
# type='wasserstein' (n=40 presumably limits the plot to 40 features - confirm
# against the CinnaMon API), then display the values behind the plot as the cell output.
cinnamon.plot_model_agnostic_drift_importances(drift_explainer, type='wasserstein', n=40)
drift_explainer.get_model_agnostic_drift_importances(type='wasserstein')

In [None]:
# Drift report for a single feature; 0 is presumably the positional index of the
# first feature column - TODO confirm against the CinnaMon API.
drift_explainer.get_feature_drift(0)

In [None]:
# Presumably the same drift report for every feature at once - confirm against the CinnaMon API.
drift_explainer.get_feature_drifts()