# Import custom classes

In [1]:
from src.logics.validation import TrainDataset
from src.logics.validation import ServeDataset
from src.logics.validation import Validator

from src.logics.enums import ConstraintType

2021-10-07 11:33:26.061543: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-07 11:33:26.061558: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Initialize the datasets from BigQuery

In [2]:
# Single source of truth for the feature query. The training and serving
# queries select the same columns from the same table and differ only in the
# date window, so the window is the only templated part.
SESSIONS_QUERY_TEMPLATE = (
    """
    SELECT
        IFNULL(totals.bounces, 0) AS bounces,
        IFNULL(totals.hits, 0) AS hits,
        IFNULL(totals.pageviews, 0) as pageviews,
        IFNULL(totals.timeOnSite, 0) as timeOnSite,
        IFNULL(totals.transactions, 0) as transactions,
        trafficSource.source as source,
        geoNetwork.city as city,
        channelGrouping as channelGrouping,
    FROM
        `mightyhive-data-science-poc.all_sample_tables.assignment_google_analytics`
    WHERE
        date BETWEEN '{start_date}' AND '{end_date}'
    """
)


def build_sessions_query(start_date, end_date):
    """Return the feature query restricted to an inclusive date window.

    Args:
        start_date: First date of the window, e.g. ``"2020-06-01"``.
        end_date: Last date of the window, e.g. ``"2020-06-02"``.

    Returns:
        The SQL text with both dates substituted into the ``BETWEEN`` clause.

    Note:
        The dates are interpolated into the SQL verbatim, so only pass
        trusted, hard-coded values here (never user input).
    """
    return SESSIONS_QUERY_TEMPLATE.format(start_date=start_date, end_date=end_date)


# Training window: two days in June 2020.
train_query = build_sessions_query("2020-06-01", "2020-06-02")

# Serving window: two days in April 2021.
serve_query = build_sessions_query("2021-04-01", "2021-04-02")

In [3]:
# Build both dataset wrappers from their BigQuery queries; the training
# dataset is created first, then the serving dataset.
train, serve = (
    TrainDataset.from_bigquery(train_query),
    ServeDataset.from_bigquery(serve_query),
)

[DEBUG] Looking for cache in GCS
[DEBUG] Looking for cache in GCS


In [4]:
# Echo the dataset as the cell's last expression; the output is its default object repr.
train

<src.logics.validation.TrainDataset at 0x7f27c6ca2d90>

# Train dataset related methods

## Display schema and statistics

In [5]:
# Render the training schema: a per-feature table (type, presence, valency,
# domain) followed by the list of values for each named domain.
train.show_schema()

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'bounces',INT,required,,-
'hits',INT,required,,-
'pageviews',INT,required,,-
'timeOnSite',INT,required,,-
'transactions',INT,required,,-
'source',BYTES,required,,-
'city',BYTES,required,,-
'channelGrouping',STRING,required,,'channelGrouping'


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'channelGrouping',"'(Other)', 'Affiliates', 'Direct', 'Display', 'Email', 'Local Listing', 'Organic Search', 'Paid Search', 'Referral', 'Social'"


In [6]:
# Show the statistics for the training dataset.
train.show_stats()

## Modify inferred schema
For the list of schema properties that can be modified, please refer to the README.

In [7]:
# Manual overrides layered on top of the training schema. Each entry names the
# feature, the kind of constraint, and the value to enforce for it.
constraints = [
    dict(feature="pageviews", kind=ConstraintType.max, value=100),
    dict(feature="pageviews", kind=ConstraintType.datatype, value="INT"),
    dict(feature="pageviews", kind=ConstraintType.numerical_drift_threshold, value=0.01),
    dict(feature="hits", kind=ConstraintType.datatype, value="FLOAT"),
    dict(feature="bounces", kind=ConstraintType.is_categorical, value=True),
]

# Register the overrides one at a time, in list order.
for constraint in constraints:
    train.add_schema_constraint(
        feature=constraint["feature"],
        kind=constraint["kind"],
        value=constraint["value"],
    )

In [8]:
# Re-display the schema to confirm the constraints added in the previous cell:
# 'hits' is now FLOAT and 'pageviews' carries "max: 100".
train.show_schema()

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'bounces',INT,required,,-
'hits',FLOAT,required,,-
'pageviews',INT,required,,max: 100
'timeOnSite',INT,required,,-
'transactions',INT,required,,-
'source',BYTES,required,,-
'city',BYTES,required,,-
'channelGrouping',STRING,required,,'channelGrouping'


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'channelGrouping',"'(Other)', 'Affiliates', 'Direct', 'Display', 'Email', 'Local Listing', 'Organic Search', 'Paid Search', 'Referral', 'Social'"


# Serve dataset related methods

In [9]:
# Show the statistics for the serving dataset.
serve.show_stats()

# Validate serve dataset statistics and drift

In [10]:
# Pair the training and serving datasets in one Validator; the following cells
# use it for statistics, schema validation and drift detection.
validator = Validator(train, serve)

In [11]:
# Show statistics through the validator (presumably training vs. serving side
# by side -- confirm against the Validator implementation).
validator.show_stats()

In [12]:
# Validate against the training schema and render the anomalies table
# (presumably what visual=True enables -- confirm). The return value is bound
# to `_` so it is not echoed as the cell's output.
_ = validator.validate_schema(visual=True)

  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'pageviews',Out-of-range values,Unexpectedly large value: 291.
'hits',Expected data of type: FLOAT but got INT,


In [13]:
# Run drift detection between training and serving data. For 'pageviews' the
# reported Jensen-Shannon divergence is compared with the 0.01 threshold set
# in the constraints cell. The return value is bound to `_` so it is not
# echoed as the cell's output.
_ = validator.detect_drift(visual=True)

  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'pageviews',Multiple errors,"Unexpectedly large value: 361. The approximate Jensen-Shannon divergence between training and serving is 0.0871842 (up to six significant digits), above the threshold 0.01."
'hits',Expected data of type: FLOAT but got INT,
