Image Drift Debugging example

In [49]:
# Install the notebook's dependencies into the kernel's environment (whylogs is pinned; -q keeps pip quiet).
%pip install datasets whylogs[image,whylabs,viz]==1.1.35-dev2 -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset
# Load the "cifar10" dataset through the `datasets` library; its splits are displayed in the next cell.
dataset = load_dataset("cifar10")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset cifar10 (/home/jamie/.cache/huggingface/datasets/cifar10/plain_text/1.0.0/447d6ec4733dddd1ce3bb577c7166b986eaa4c538dcd9e805ba61f35674a9de4)
100%|██████████| 2/2 [00:00<00:00, 337.83it/s]


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['img', 'label'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['img', 'label'],
        num_rows: 10000
    })
})

In [4]:
def darken_image(img, factor):
  """Apply PIL's brightness enhancer to ``img`` and return the enhanced image.

  Args:
    img: the image handed to ``ImageEnhance.Brightness``.
    factor: the value passed to the enhancer's ``enhance`` call; this notebook
      passes ``darkness_factor`` (0.50) to simulate under-exposed images.

  Returns:
    Whatever ``ImageEnhance.Brightness(img).enhance(factor)`` produces.
  """
  from PIL import ImageEnhance

  return ImageEnhance.Brightness(img).enhance(factor)

In [5]:
# Number of "train" split images used to build the reference DataFrame.
n_reference = 1000
# Number of "test" split images used to build the test DataFrame.
n_test = 200
# Probability with which each test image is passed through darken_image.
ratio_anomalies = 0.1
# Brightness factor handed to darken_image for the injected anomalies.
darkness_factor = 0.50
# When True, generate_df also saves the "test" split images under test_images/.
save_images = True

In [6]:
import pandas as pd
import random

random.seed(1095)

def generate_df(n_points=100,split="train", anomaly_ratio = 0):
  """Build a DataFrame of images (and their filenames) from one dataset split.

  The first ``n_points`` entries of ``dataset[split]`` are taken in order. For
  each one a number is drawn uniformly from [0, 1]; when it is below
  ``anomaly_ratio`` the image is replaced by ``darken_image(img, darkness_factor)``
  and an ``a`` suffix is added to its filename. The filename is also stored on
  the image object as ``img.filename``.

  When ``split == "test"`` and the module-level ``save_images`` flag is set, each
  image is additionally written to ``test_images/<filename>``; the directory is
  created if it does not exist yet.

  Args:
    n_points: number of images to take from the split.
    split: key of the split inside ``dataset``.
    anomaly_ratio: probability of darkening any single image.

  Returns:
    A DataFrame with columns ``image`` and ``filename`` and a 0..n-1 index.
  """
  import os

  save_to_disk = split == "test" and save_images
  if save_to_disk:
    # img.save does not create missing parent directories.
    os.makedirs("test_images", exist_ok=True)

  split_data = dataset[split]
  # Collect plain records and build the frame once; concatenating inside the
  # loop copies the whole frame on every iteration.
  records = []
  for i in range(n_points):
    img = split_data[i]['img']
    anomaly_injected = random.uniform(0, 1) < anomaly_ratio
    if anomaly_injected:
      img = darken_image(img, darkness_factor)
    filename = f"{split}_{i}{'a' if anomaly_injected else ''}.png"
    img.filename = filename
    if save_to_disk:
      img.save("test_images/" + filename)
    records.append({"image": img, "filename": filename})
  return pd.DataFrame(records, columns=['image', 'filename'])

# The images saved above were uploaded to the following location
# to demonstrate how a practitioner could link back to the raw images from debug information.
demo_test_image_location = "https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/"

# Reference set: images from the "train" split with no anomalies injected (anomaly_ratio defaults to 0).
ref_df = generate_df(n_reference, split="train")
# Test set: images from the "test" split, each darkened with probability ratio_anomalies.
test_df = generate_df(n_test, split="test",anomaly_ratio=ratio_anomalies)

In [7]:
from typing import Dict, Optional
from PIL.ImageStat import Stat
from PIL.Image import Image as ImageType
import pandas as pd

import whylogs as why
from whylogs.core.datatypes import DataType
from whylogs.core.metrics import Metric
from whylogs.core.metrics.metrics import DistributionMetric
from whylogs.core.resolvers import StandardResolver
from whylogs.core.schema import DatasetSchema, ColumnSchema
from whylogs.core.view.dataset_profile_view import DatasetProfileView
from whylogs.extras.image_metric import ImageMetric, ImageMetricConfig
from whylogs.experimental.core.metrics.udf_metric import UdfMetric, UdfMetricConfig

def _get_brightness_thresholds(reference_profile: Optional[DatasetProfileView] = None):
  """Return the ``(low, high)`` brightness bounds used to flag outlier images.

  When ``reference_profile`` has an "image" column that carries an image metric
  with a "Brightness.mean" distribution, the bounds are that distribution's
  ``q_01`` and ``q_99`` values. In every other case (no profile, no "image"
  column, or the metric/submetric is absent) the fixed baseline is returned.

  Args:
    reference_profile: profile view to derive the bounds from, or None.

  Returns:
    A 2-tuple ``(low, high)``.
  """
  baseline = (40, 215) # arbitrary baseline
  if reference_profile is None or "image" not in reference_profile.get_columns():
    return baseline

  image_profile: Optional[ImageMetric] = reference_profile.get_column("image").get_metric(ImageMetric.get_namespace())
  if image_profile is None:
    # The column was profiled without the image metric; nothing to derive bounds from.
    return baseline

  brightness_components = image_profile.submetrics.get("Brightness.mean")
  if not brightness_components or "distribution" not in brightness_components:
    return baseline

  brightness_mean_distribution: DistributionMetric = brightness_components["distribution"]
  return (brightness_mean_distribution.q_01, brightness_mean_distribution.q_99)

def image_outliers(reference_profile: Optional[DatasetProfileView] = None, image_folder: Optional[str] = None):
  """Build a UDF that reports where brightness-outlier images can be found.

  Args:
    reference_profile: profile view passed to ``_get_brightness_thresholds`` to
      obtain the lower/upper brightness bounds.
    image_folder: prefix prepended to the filename of a flagged image; None is
      treated as the empty string.

  Returns:
    A function taking a PIL image. It returns ``image_folder + img.filename``
    when the image's mean brightness lies outside the bounds, otherwise None.
  """
  if image_folder is None:
    image_folder = ""
  # The bounds depend only on the reference profile, so resolve them once here
  # rather than once per image inside the UDF.
  lower_bound, upper_bound = _get_brightness_thresholds(reference_profile)

  def image_filename_outlier(img: ImageType):
    """Return the prefixed filename of ``img`` if its brightness is out of bounds, else None."""
    brightness = Stat(img.convert("HSV")).mean[2] # Brightness is stored in index=2
    if brightness < lower_bound or brightness > upper_bound:
      return image_folder + img.filename
    return None
  return image_filename_outlier

def get_udf_resolver(reference_profile: Optional[DatasetProfileView] = None, image_folder=""):
  """Create a resolver that attaches image and brightness-outlier metrics to image columns.

  Args:
    reference_profile: forwarded to ``image_outliers`` to derive brightness bounds.
    image_folder: forwarded to ``image_outliers`` as the prefix for flagged filenames.

  Returns:
    An instance of a ``StandardResolver`` subclass. Columns whose name contains
    "image" get an ``ImageMetric`` plus a ``UdfMetric`` running the
    "image_brightness_outliers" UDF; all other columns use the standard resolution.
  """
  brightness_udf = image_outliers(reference_profile, image_folder)
  outlier_udf_config = UdfMetricConfig(udfs={"image_brightness_outliers": brightness_udf})

  class DebugImageResolver(StandardResolver):
    """StandardResolver that special-cases columns with "image" in their name."""

    def resolve(self, name: str, why_type: DataType, column_schema: ColumnSchema) -> Dict[str, Metric]:
      if "image" not in name:
        return super().resolve(name, why_type, column_schema)
      metrics: Dict[str, Metric] = {}
      metrics[ImageMetric.get_namespace()] = ImageMetric.zero()
      metrics[UdfMetric.get_namespace()] = UdfMetric.zero(outlier_udf_config)
      return metrics

  return DebugImageResolver()


# Profile the reference images. No reference profile is passed to the resolver here,
# so the outlier UDF uses the fixed baseline brightness thresholds.
ref_profile = why.log(ref_df, schema=DatasetSchema(resolvers=get_udf_resolver())).view()
# Build a schema whose outlier UDF takes its thresholds from the reference profile
# and prefixes flagged filenames with the public image location.
image_drift_debug_schema = DatasetSchema(resolvers=get_udf_resolver(ref_profile, image_folder=demo_test_image_location))
# Profile the test images with that schema.
test_profile = why.log(test_df,schema=image_drift_debug_schema).view()

In [8]:
from whylogs.core.view.column_profile_view import ColumnProfileView


# Pull the "image_brightness_outliers" UDF results recorded for the "image" column of the reference profile.
ref_col_profile: ColumnProfileView = ref_profile.get_column("image")
outliers_metric = ref_col_profile.get_metric("udf").submetrics["image_brightness_outliers"]
# The summary of the "frequent_items" component (shown below) lists the filenames the UDF returned.
outliers = outliers_metric["frequent_items"]
outliers.to_summary_dict()


{'frequent_strings': [FrequentItem(value='train_880.png', est=1, upper=1, lower=1),
  FrequentItem(value='train_408.png', est=1, upper=1, lower=1),
  FrequentItem(value='train_114.png', est=1, upper=1, lower=1),
  FrequentItem(value='train_497.png', est=1, upper=1, lower=1),
  FrequentItem(value='train_289.png', est=1, upper=1, lower=1),
  FrequentItem(value='train_107.png', est=1, upper=1, lower=1),
  FrequentItem(value='train_493.png', est=1, upper=1, lower=1),
  FrequentItem(value='train_968.png', est=1, upper=1, lower=1),
  FrequentItem(value='train_709.png', est=1, upper=1, lower=1),
  FrequentItem(value='train_271.png', est=1, upper=1, lower=1),
  FrequentItem(value='train_959.png', est=1, upper=1, lower=1),
  FrequentItem(value='train_467.png', est=1, upper=1, lower=1),
  FrequentItem(value='train_490.png', est=1, upper=1, lower=1)]}

In [9]:
# Same lookup for the test profile. `outliers` is rebound here, so later cells see the test-set outliers,
# whose values carry the demo_test_image_location prefix.
test_col_profile: ColumnProfileView = test_profile.get_column("image")
outliers_metric = test_col_profile.get_metric("udf").submetrics["image_brightness_outliers"]
outliers = outliers_metric["frequent_items"]
outliers.to_summary_dict()

{'frequent_strings': [FrequentItem(value='https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_55a.png', est=1, upper=1, lower=1),
  FrequentItem(value='https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_71a.png', est=1, upper=1, lower=1),
  FrequentItem(value='https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_193a.png', est=1, upper=1, lower=1),
  FrequentItem(value='https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_41.png', est=1, upper=1, lower=1),
  FrequentItem(value='https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_21.png', est=1, upper=1, lower=1),
  FrequentItem(value='https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_81a.png', est=1, upper=1, lower=1),
  FrequentItem(value='https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_186.png', est=1, upper=1, lower=1),
  FrequentItem(value='https:/

In [10]:
import getpass
import os

# Set your org-id here - should be something like "org-xxxx".
print("Enter your WhyLabs Org ID") 
os.environ["WHYLABS_DEFAULT_ORG_ID"] = input()

# Set your dataset_id (or model_id) here - should be something like "model-xxxx".
print("Enter your WhyLabs Dataset ID")
os.environ["WHYLABS_DEFAULT_DATASET_ID"] = input()

# Set your API key here; it is read with getpass rather than input().
print("Enter your WhyLabs API key")
os.environ["WHYLABS_API_KEY"] =  getpass.getpass()
# Only the first 10 characters of the key are printed, as a sanity check of what was entered.
print("Using API Key ID: ", os.environ["WHYLABS_API_KEY"][0:10])

Enter your WhyLabs Org ID
Enter your WhyLabs Dataset ID
Enter your WhyLabs API key
Using API Key ID:  ZnP7Pqh52l


In [46]:
from whylogs.api.writer.whylabs import WhyLabsWriter
from datetime import timedelta, datetime, timezone

# NOTE(review): presumably the writer picks up the WHYLABS_* environment variables set above — confirm.
writer = WhyLabsWriter()
# writer.write(test_profile) # write the test profile to WhyLabs
now = datetime.now(timezone.utc)
# Re-stamp the same test profile with each of the previous six days (1..6 days before `now`).
# NOTE: every writer.write(...) call in this cell is commented out, so as written the loop only
# updates the timestamp and prints the message; uncomment the calls to actually upload.
for i in range(1, 7):
    test_profile.dataset_timestamp = now - timedelta(days=i)
    print(f"Writing test profile for {test_profile.dataset_timestamp} to WhyLabs")
    # writer.write(test_profile)
# writer.option(reference_profile_name="training images").write(ref_profile)

Writing test profile for 2023-04-04 05:58:26.578287+00:00 to WhyLabs
Writing test profile for 2023-04-03 05:58:26.578287+00:00 to WhyLabs
Writing test profile for 2023-04-02 05:58:26.578287+00:00 to WhyLabs
Writing test profile for 2023-04-01 05:58:26.578287+00:00 to WhyLabs
Writing test profile for 2023-03-31 05:58:26.578287+00:00 to WhyLabs
Writing test profile for 2023-03-30 05:58:26.578287+00:00 to WhyLabs


In [None]:
from whylogs.viz import NotebookProfileVisualizer

# Set up a notebook visualizer with the test profile as target and the reference profile as baseline.
viz = NotebookProfileVisualizer()
viz.set_profiles(target_profile_view=test_profile, reference_profile_view=ref_profile)

# Go to WhyLabs and look at the data uploaded, drill into the outliers.


In [45]:
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
from whylogs.extras.image_metric import log_image

# Set to False to skip the per-image histogram and brightness printout.
display_image_debug_output = True

# `outliers` holds the test-set outliers from the earlier cell; each value is the URL of a flagged image.
for fi in outliers.strings:
    print(f"target={fi.value}")
    # Bound the wait and surface HTTP failures directly, instead of letting PIL
    # fail later while trying to decode an error response.
    response = requests.get(fi.value, timeout=30)
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))
    hsv_img = img.convert("HSV")
    stats = Stat(hsv_img)
    brightness_mean = stats.mean[2]  # index 2 of the HSV statistics is the brightness channel

    if display_image_debug_output:
        # Profile just this image and compare its brightness against the reference distribution.
        single_image_profile = log_image(img).view()
        debug_viz = NotebookProfileVisualizer()
        debug_viz.set_profiles(target_profile_view=single_image_profile, reference_profile_view=ref_profile)
        display(debug_viz.double_histogram(feature_name="image.Brightness.mean"))
        print(f"brightness is {brightness_mean}")






target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_55a.png


brightness is 65.798828125
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_71a.png


brightness is 54.833984375
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_193a.png


brightness is 50.56640625
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_41.png


brightness is 67.931640625
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_21.png


brightness is 220.53515625
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_81a.png


brightness is 39.9765625
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_186.png


brightness is 63.1767578125
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_12a.png


brightness is 54.41796875
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_118.png


brightness is 27.1826171875
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_57.png


brightness is 63.65625
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_0a.png


brightness is 67.2744140625
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_19a.png


brightness is 65.86328125
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_26a.png


brightness is 53.8564453125
target=https://whylabs-public.s3.us-west-2.amazonaws.com/demo_images/cifar_test/test_106a.png


brightness is 51.77734375
