# Load packages

In [1]:
from pathlib import Path

from validation import TrainDataset
from validation import ServeDataset
from validation import Validator
from validation.enums import ConstraintType

2021-11-16 14:22:27.659907: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-16 14:22:27.659921: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Load common parameters

In [2]:
# Shared settings for the whole notebook.

# GCP project that hosts the BigQuery table queried below.
PROJECT = "mightyhive-data-science-poc"
# Fully qualified BigQuery table id (project.dataset.table).
TABLE_ID = f"{PROJECT}.data_drift_demo.bike_sharing"
# JSON file under the user's ~/.ssh directory.
# NOTE(review): presumably a GCP service-account key -- confirm; it is not
# referenced by any cell shown in this notebook.
SERVICE_PATH = Path.home() / ".ssh" / "mightyhive.json"
# Bucket and model file names; neither is referenced by the cells shown here.
BUCKET_NAME = "data-drift-detection"
MODEL_FILE_NAME = "model.sav"

# Initialize TrainDataset class

In [3]:
# Training window: rows dated 2011-01-01 through 2011-01-28 (BETWEEN is
# inclusive). Weekday and hour are derived from the `datetime` column.
train_query = f"""
    SELECT
      temp,
      atemp,
      humidity,
      windspeed,
      season,
      holiday,
      workingday,
      EXTRACT(DAYOFWEEK FROM datetime) AS weekday,
      EXTRACT(HOUR FROM datetime) AS hour,
    FROM
      `{TABLE_ID}`
    WHERE DATE(datetime) BETWEEN "2011-01-01" and "2011-01-28"
    """

train = TrainDataset.from_bigquery(train_query)

[DEBUG] Looking for cache in GCS


# Modify autogenerated schema

In [4]:
# Display the autogenerated schema of the training dataset; the recorded
# output lists Type, Presence, Valency and Domain for each feature.
train.show_schema()

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'temp',FLOAT,required,,-
'atemp',FLOAT,required,,-
'humidity',INT,required,,-
'windspeed',FLOAT,required,,-
'season',INT,required,,-
'holiday',INT,required,,-
'workingday',INT,required,,-
'weekday',INT,required,,-
'hour',INT,required,,-


**Schema-related modifications**

In [5]:
# Schema adjustments applied to the training dataset:
#   * the features listed first get ConstraintType.is_categorical = True
#   * temp and humidity get a ConstraintType.max value
categorical_features = ("season", "holiday", "workingday", "weekday", "hour")
upper_bounds = {"temp": 50, "humidity": 100}

constraints = [
    {"feature": name, "kind": ConstraintType.is_categorical, "value": True}
    for name in categorical_features
] + [
    {"feature": name, "kind": ConstraintType.max, "value": limit}
    for name, limit in upper_bounds.items()
]

# Register every constraint on the training dataset, in list order.
for spec in constraints:
    train.add_schema_constraint(**spec)

**Data-drift-related modifications**

In [6]:
# Numerical features that receive a drift threshold of 0.1 each.
drift_monitored = ("temp", "humidity", "windspeed")

constraints = [
    {"feature": name, "kind": ConstraintType.numerical_drift_threshold, "value": 0.1}
    for name in drift_monitored
]

# Register every drift threshold on the training dataset, in list order.
for spec in constraints:
    train.add_schema_constraint(**spec)

# Initialize serving dataset

In [7]:
# Serving window: rows dated 2011-01-29 through 2011-02-25 (BETWEEN is
# inclusive), selecting the same columns as the training query.
serve_query = f"""
    SELECT
      temp,
      atemp,
      humidity,
      windspeed,
      season,
      holiday,
      workingday,
      EXTRACT(DAYOFWEEK FROM datetime) AS weekday,
      EXTRACT(HOUR FROM datetime) AS hour,
    FROM
      `{TABLE_ID}`
    WHERE DATE(datetime) BETWEEN "2011-01-29" and "2011-02-25"
    """

serve = ServeDataset.from_bigquery(serve_query)

[DEBUG] Looking for cache in GCS


# Initialize validator class

In [8]:
# Pair the training and serving datasets in one Validator; the cells below
# call show_stats, validate_schema and detect_drift on it.
validator = Validator(train, serve)

In [9]:
# NOTE(review): presumably renders statistics for the train and serve
# datasets; no output was captured in this export -- confirm.
validator.show_stats()

In [10]:
# Binding the return value to `_` keeps it from being echoed as the cell's
# output. NOTE(review): visual=True presumably makes the call display its
# findings itself -- confirm.
_ = validator.validate_schema(visual=True)

In [11]:
# Run drift detection. The cells below read the result as a mapping whose
# "driftSkewInfo" entry holds one record per monitored feature.
result = validator.detect_drift()

In [12]:
# Dump each raw drift record (feature path plus its skew measurements).
for drift_record in result["driftSkewInfo"]:
    print(drift_record)

{'path': {'step': ['humidity']}, 'skewMeasurements': [{'type': 'JENSEN_SHANNON_DIVERGENCE', 'value': 0.1003878704306455, 'threshold': 0.1}]}
{'path': {'step': ['temp']}, 'skewMeasurements': [{'type': 'JENSEN_SHANNON_DIVERGENCE', 'value': 0.13301297312630075, 'threshold': 0.1}]}
{'path': {'step': ['windspeed']}, 'skewMeasurements': [{'type': 'JENSEN_SHANNON_DIVERGENCE', 'value': 0.03758136452220929, 'threshold': 0.1}]}


In [13]:
 for info in result["driftSkewInfo"]:
    feature_name = info["path"]["step"][0]
    is_drifted = info["skewMeasurements"][0]["threshold"] < info["skewMeasurements"][0]["value"]
    print(f"feature {feature_name} drfited: {is_drifted}")

feature humidity drfited: True
feature temp drfited: True
feature windspeed drfited: False
