In [1]:
import sys

# Make the project root importable so the `src.*` imports in the next cell resolve.
# NOTE(review): this absolute path only exists inside the dev container; consider
# deriving it from an environment variable or the notebook location.
PROJECT_ROOT = "/workspaces/mlops-practice"

# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import os

import joblib
from datetime import datetime

import numpy as np
import pandas as pd
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import train_test_validation, model_evaluation

from src.common.constants import (
    ARTIFACT_PATH,
    DATA_PATH,
)
from src.preprocess import CAT_FEATURES


With transform="pandas", `func` should return a DataFrame to follow the set_output API.



In [3]:
# Notebook-wide settings.
# LABEL_NAME: name of the target column in the house-rent CSVs.
LABEL_NAME = "rent"
# DATE: today's date as YYYYMMDD; used as the prefix of the saved drift report file.
DATE = f"{datetime.now():%Y%m%d}"

In [4]:
# Columns that are skipped when reading either CSV.
_IGNORED_COLUMNS = ("area_locality", "posted_on", "id")


def _is_kept(column_name):
    """Return True for every CSV column except the ignored ones."""
    return column_name not in _IGNORED_COLUMNS


# Load the training data first, then the newly collected data, with the same column filter.
train_df, new_df = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name), usecols=_is_kept)
    for csv_name in ("house_rent_train.csv", "house_rent_new.csv")
)

## Data Drift

In [5]:
# Wrap both raw frames as deepchecks Datasets using identical label / categorical settings.
shared_dataset_args = {"label": LABEL_NAME, "cat_features": CAT_FEATURES}

train_set = Dataset(train_df, **shared_dataset_args)
new_set = Dataset(new_df, **shared_dataset_args)

In [6]:
# Run deepchecks' train/test validation suite, comparing the training data
# (train side) with the newly collected data (test side).
validation_suite = train_test_validation()
suite_result = validation_suite.run(
    train_dataset=train_set,
    test_dataset=new_set,
)

In [7]:
# Directory under the artifact root where the drift report is saved.
DRIFT_DETECTION_PATH = os.path.join(ARTIFACT_PATH, "drift_detection")

# exist_ok=True replaces the separate existence check: the cell stays safe to
# re-run and there is no gap between checking for and creating the directory.
os.makedirs(DRIFT_DETECTION_PATH, exist_ok=True)

In [8]:
# For every check that did not pass, print its header followed by the details
# of its first condition result.
for failed_check in suite_result.get_not_passed_checks():
    first_condition = failed_check.conditions_results[0]
    print(failed_check.header, first_condition.details, sep="\n")

Feature Drift
Failed for 3 out of 8 columns.
Found 3 categorical columns with Cramer's V above threshold: {'area_type': '0.23', 'city': '0.21', 'point_of_contact': '0.31'}
Label Drift
Label's drift score Kolmogorov-Smirnov is 0.25
Multivariate Drift
Found drift value of: 0.42, corresponding to a domain classifier AUC of: 0.71


In [9]:
# Save the suite result as an HTML report whose file name is prefixed with the run date.
drift_report_file = os.path.join(
    DRIFT_DETECTION_PATH, f"{DATE}_drift_detection.html"
)
suite_result.save_as_html(drift_report_file)

'/workspaces/mlops-practice/artifacts/drift_detection/20230613_drift_detection.html'

In [10]:
# Render the suite result inline in the notebook (the saved output is a widget Accordion).
suite_result.show()

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_I0I2SY25JYF4FS7VEA661MP3Q">Train Test Validat…

---

## Model Drift

In [11]:
from src.preprocess import preprocess_pipeline

In [12]:
# Load the persisted model artifact; the evaluation cell below indexes it with "regr".
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
model_file = os.path.join(ARTIFACT_PATH, "model.pkl")
model = joblib.load(model_file)

In [13]:
# Targets are log1p-transformed for both the training and the new data.
y_train = np.log1p(train_df[LABEL_NAME])
# Fit the preprocessing on the training data only.
x_train = preprocess_pipeline.fit_transform(
    X=train_df.drop([LABEL_NAME], axis=1),
    y=y_train,
)

y_new = np.log1p(new_df[LABEL_NAME])
# Reuse the preprocessing fitted on the training data. Calling fit_transform here
# would refit it on the new data, so the two feature sets would no longer be
# preprocessed the same way and the train-vs-new comparison would be unreliable.
x_new = preprocess_pipeline.transform(
    new_df.drop([LABEL_NAME], axis=1)
)

In [14]:
# Rebuild both deepchecks Datasets from the preprocessed features and the
# log-transformed targets (this replaces the raw-data Datasets created earlier).
train_set, new_set = (
    Dataset(features, label=target, cat_features=CAT_FEATURES)
    for features, target in ((x_train, y_train), (x_new, y_new))
)

In [15]:
# Run deepchecks' model evaluation suite on the preprocessed datasets, using
# the "regr" entry of the loaded model artifact as the model under test.
evaluation_suite = model_evaluation()
regressor = model["regr"]
suite_result = evaluation_suite.run(
    train_dataset=train_set,
    test_dataset=new_set,
    model=regressor,
)



In [16]:
# Render the current suite result inline (the saved output is a widget Accordion).
suite_result.show()

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_NK3ZUKFUZU0OPQT63MAXH02DF">Model Evaluation S…

: 

In [76]:
# Report every check that did not pass, with the details of its first condition.
# NOTE(review): the saved execution count (76) and the saved output (drift checks)
# suggest this cell was last run out of order; re-run it after the evaluation
# suite so the output matches the current `suite_result`.
for failed_check in suite_result.get_not_passed_checks():
    first_details = failed_check.conditions_results[0].details
    print(f"The following test failed!\n{failed_check.header}: {first_details}\n")

The following test failed!
Feature Drift: Failed for 3 out of 8 columns.
Found 3 categorical columns with Cramer's V above threshold: {'area_type': '0.23', 'city': '0.21', 'point_of_contact': '0.31'}

The following test failed!
Label Drift: Label's drift score Kolmogorov-Smirnov is 0.25

The following test failed!
Multivariate Drift: Found drift value of: 0.42, corresponding to a domain classifier AUC of: 0.71

