In [1]:
import os.path
import pandas as pd
import numpy as np

In [2]:
# using the new v2 session will expose a few experimental parts of whylogs
from whylogs.v2 import get_or_create_session
from whylogs.v2 import MetricPlugin
from whylogs.v2.core.columnprofile_configuration import ColumnProfileConfiguration
from whylogs.v2.core.statistics import SchemaTracker, HllSketch

In [3]:
# Obtain the whylogs session; the loggers used later in this notebook are opened from it.
session = get_or_create_session()

WARN: Missing config


In [4]:
# Strip every automatic column tracker and statistic from the loggers created by
# this session. After this call nothing is logged unless trackers are configured
# for specific columns or custom metric plugins are added.
session.with_no_column_trackers()

# Track only schema and cardinality for the column named 'Text'.
text_column_name = "Text"
column_profiles = {
    text_column_name: ColumnProfileConfiguration(trackers=[SchemaTracker(), HllSketch()]),
}

session.with_custom_column_profiles(column_profiles)


<bound method Session.with_custom_column_profiles of metadata:
  input_path: ''
  output_path: output
  path_template: <string.Template object at 0x7f2daee92940>
pipeline: default-pipeline
project: default-project
verbose: false
with_rotation_time: null
writers:
- filename_template: <string.Template object at 0x7f2daee92850>
  formats:
  - OutputFormat.json
  - OutputFormat.flat
  - OutputFormat.protobuf
  output_path: output
  path_template: <string.Template object at 0x7f2daee92910>
>

In [5]:
# Show the working directory, because the data file path used below is relative to it.
print(f"Current working directory: {os.getcwd()}")

Current working directory: /home/jamie/projects/whylogs/examples


In [6]:
# Sample text file to profile; the path is relative to the current working directory.
data_file = "data/custom_metric_sample.txt"


In [7]:
# Load the sample file as a one-column frame: the first non-blank line is the
# column header and every following non-blank line becomes one row.
# pd.read_csv(..., delimiter='\n') is deliberately avoided: newer pandas releases
# raise ValueError when the line terminator is passed as the separator.
with open(data_file, encoding="utf-8") as text_file:
    # Blank lines are dropped, matching read_csv's default skip_blank_lines=True.
    text_lines = [line.rstrip("\n") for line in text_file if line.strip()]

if not text_lines:
    raise ValueError(f"No data found in {data_file!r}")

data = pd.DataFrame({text_lines[0]: text_lines[1:]})
data.head()

Unnamed: 0,Text
0,Much Ado About Nothing
1,ACT I
2,SCENE I. Before LEONATO'S house.
3,"Enter LEONATO, HERO, and BEATRICE, with a Mess..."
4,LEONATO


In [8]:
from whylogs.logs import display_logging

# Placeholder so `profile` exists before the logging cell below assigns it.
profile = None

# Send whylogs log records to the notebook output at DEBUG level.
display_logging('debug')


2021-09-29 16:39:20,953 - whylogs.logs - DEBUG - whylogs.logs logging -> stdout at level DEBUG


In [9]:
# Profile the dataframe with a logger opened from the customised session and keep
# a handle on the resulting profile so later cells can inspect it.
with session.logger(dataset_name='test.data') as ylog:
    profile = ylog.profile
    # Print whether this profile reports a custom column configuration.
    print(profile.has_custom_profile_config)
    # A bare `profile.configured_column_names` expression was dropped from here:
    # inside a `with` block it displays nothing, and a later cell shows the value.
    ylog.log_dataframe(data)


True
2021-09-29 16:39:21,029 - whylogs.v2.core.datasetprofile - INFO - We got trackers: [<whylogs.v2.core.statistics.schematracker.SchemaTracker object at 0x7f2daeea8070>, <whylogs.v2.core.statistics.hllsketch.HllSketch object at 0x7f2daeea81c0>] and type False
2021-09-29 16:39:21,031 - whylogs.v2.core.columnprofile - INFO - Using custom set of trackers ['SchemaTracker', 'HllSketch'] on column Text


In [10]:
# Show the repr of the profile captured from the logger above.
profile

<whylogs.v2.core.datasetprofile.DatasetProfile at 0x7f2dae830bb0>

In [11]:
# Print whether the profile reports a custom configuration, then display its configured column names.
print(profile.has_custom_profile_config)
profile.configured_column_names

True


['Text']

In [12]:
# NOTE(review): repeats the column-name check from the previous cell; candidate for removal.
profile.configured_column_names

['Text']

In [13]:
# Build the summary for the first profile and display it.
summary = profile.to_summary()
summary

properties {
  schema_major_version: 1
  schema_minor_version: 3
  session_id: "9b4661f1-418e-4f24-b65b-573cfe798c7b"
  session_timestamp: 1632933560631
  tags {
    key: "name"
    value: "test.data"
  }
}
columns {
  name: "Text"
  trackers {
    key: "HllSketch"
    value {
      name: "HllSketch"
      type_index: 9
      unique_count {
        estimate: 2657.375089447257
        upper: 2692.1586733030113
        lower: 2623.437636452387
      }
    }
  }
  trackers {
    key: "SchemaTracker"
    value {
      name: "SchemaTracker"
      type_index: 4
      schema {
        inferred_type {
          type: STRING
          ratio: 1.0
        }
        type_counts {
          key: "STRING"
          value: 3683
        }
      }
    }
  }
}

In [14]:
# Here is a sample custom metric that counts how often a target string occurs in the data it tracks.
from dataclasses import dataclass
from json import loads

@dataclass
class TargetWordCountMetric(MetricPlugin):
    """Custom metric plugin that keeps a running tally of ``target_string`` occurrences.

    The tally lives in ``word_counts``; ``track`` adds to it and ``merge``
    combines it with the tally of another instance.
    """

    # String whose occurrences are counted.
    target_string: str = 'Hero'
    # Presumably the column whose values are routed to track() — confirm against MetricPlugin.
    target_column_name: str = ''
    # Name of this metric; the summary output keys the plugin by it.
    name: str = 'TargetWordCounter(Hero)'
    # Running total of occurrences seen so far.
    word_counts: int = 0

    def track(self, data):
        """Add the number of times ``target_string`` occurs in ``data`` to the tally.

        ``data`` must provide a ``count(value)`` method; presumably it is a
        string cell value — TODO confirm against the caller.
        """
        self.word_counts += data.count(self.target_string)

    def merge(self, other: 'TargetWordCountMetric'):
        """Fold the tally of ``other`` into this metric, in place."""
        self.word_counts += other.word_counts

In [16]:
# 'Hero' is the name of one of the characters in the play 'Much Ado about Nothing',
# so this counter should report how many times that string is seen.
hero_counter = TargetWordCountMetric(
    name='HeroCounter',
    target_string='Hero',
    target_column_name=text_column_name,
)

profile2, plugins = None, None

# NOTE(review): the summary printed for this profile reports twice as many STRING
# values as the first run over the same dataframe, which looks like the tracker
# instances configured on the session are shared between loggers — confirm.
with session.logger(dataset_name='test2.data') as ylog:
    ylog.add_metric_plugin(hero_counter)
    profile2, plugins = ylog.profile, ylog.plugins
    ylog.log_dataframe(data)

In [17]:
# Inspect the metric plugins taken from the logger, including the counter's accumulated word_counts.
plugins

{'HeroCounter': TargetWordCountMetric(target_string='Hero', target_column_name='Text', name='HeroCounter', word_counts=63)}

In [18]:
# Build the summary for the second profile (which includes the custom metric plugin) and display it.
summary2 = profile2.to_summary()
summary2

properties {
  schema_major_version: 1
  schema_minor_version: 3
  session_id: "9b4661f1-418e-4f24-b65b-573cfe798c7b"
  session_timestamp: 1632933560631
  tags {
    key: "name"
    value: "test2.data"
  }
}
columns {
  name: "Text"
  trackers {
    key: "HllSketch"
    value {
      name: "HllSketch"
      type_index: 9
      unique_count {
        estimate: 2657.375089447257
        upper: 2692.1586733030113
        lower: 2623.437636452387
      }
    }
  }
  trackers {
    key: "SchemaTracker"
    value {
      name: "SchemaTracker"
      type_index: 4
      schema {
        inferred_type {
          type: STRING
          ratio: 1.0
        }
        type_counts {
          key: "STRING"
          value: 7366
        }
      }
    }
  }
}
metric_plugins {
  key: "HeroCounter"
  value {
    name: "HeroCounter"
    plugin_types {
      plugin_class_name: "TargetWordCountMetric"
    }
    params {
      fields {
        key: "target_column_name"
        value {
          string_value