In [1]:
from data_merger.data_export_manager import DataExportManager
import pandas as pd

# Raise pandas' display limits (rows, columns, line width) for the tables shown below.
for _option, _value in (
    ('display.max_rows', 100),
    ('display.max_columns', 500),
    ('display.width', 2000),
):
    pd.set_option(_option, _value)

# Bucket and key prefix handed to the export manager; replace with your own values.
bucket, prefix = "whylabs-export", "MyPrefix"
manager = DataExportManager(prefix=prefix, bucket=bucket)

In [None]:
# Attempt to download the bucket's metadata, which contains a list of all profiles.
# The result is kept in `df` because the exploration cell further down filters it;
# the bare `df` on the last line makes the notebook render the table.
df = manager.load_metadata()
df

In [None]:
# Download the profiles based on the metadata.
# NOTE(review): force_update=False presumably skips profiles that were already
# downloaded, making re-runs cheap — confirm against DataExportManager.
manager.download_profiles(force_update=False)

In [None]:
# Do some manual exploration to see which non-segmented datasets have the most profiles

# Keep only the metadata rows that have no segment value.
non_segmented = df[df["segment"].isna()]

# Count rows per org_id/dataset_id pair and order from highest to lowest count.
# Built as a single chain (no inplace=True) so the cell never mutates a frame it
# already created and always recomputes from `non_segmented` when re-run.
grouped_df = (
    non_segmented
    .groupby(['org_id', 'dataset_id'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
grouped_df

In [5]:
# Organization and dataset that the cells below inspect.
# "org-xxx" / "model-1" are placeholder values — replace them with real IDs.
org_id = "org-xxx"
dataset_id = "model-1"

In [None]:
# Display the manager's org listing — presumably the org IDs present in the export (confirm).
manager.list_orgs()

In [None]:
# Display the manager's model listing — presumably the dataset/model IDs present in the export (confirm).
manager.list_models()

In [None]:

# Preview the data for the org/dataset pair chosen above.
manager.preview_data(org_id, dataset_id)


In [None]:
# List the profile metrics for the chosen org/dataset.
# NOTE(review): presumably the source of metric names such as 'distribution/mean'
# that the plotting cell passes to show_column_plot — confirm.
manager.list_profile_metrics(org_id, dataset_id)

In [None]:
# Keep the column listing for the chosen org/dataset: the plotting cell reads its
# 'column_name' field. The bare name on the last line renders it as the cell output.
profile_columns = manager.list_profile_columns(org_id, dataset_id)
profile_columns

In [None]:
# Plot three metrics for the first three entries of the 'column_name' listing.
columns = profile_columns['column_name'].tolist()[:3]
plot_metrics = ['distribution/mean', 'counts/n', 'distribution/min']
manager.show_column_plot(org_id, dataset_id, columns, plot_metrics)


In [None]:
from whylogs.viz import NotebookProfileVisualizer


# Compare two time ranges. Both sides below use dataset "model-42", split at the
# dec_2024 cutoff; give the two calls different dataset_id values to compare
# across datasets instead.
# NOTE(review): the variable names say "first/second half 2024", but the cutoff is
# late December 2024. The names are kept because a later cell references them.

# Cutoff timestamp. Read as epoch milliseconds this is 2024-12-28 13:01:01 UTC
# (unit presumed from the magnitude — confirm against get_merged_profile).
dec_2024 = 1_735_390_861_000

# Merged profile bounded above by the cutoff (only `end` is passed).
first_half_2024_dataset_1, total = manager.get_merged_profile(
    org_id=org_id,
    dataset_id="model-42",
    end=dec_2024,
)
print(f"Found {total} profiles for the first dataset")

# Merged profile bounded below by the cutoff (only `start` is passed).
second_half_2024_dataset_2, total = manager.get_merged_profile(
    org_id=org_id,
    dataset_id="model-42",
    start=dec_2024,
)
print(f"Found {total} profiles for the second dataset")

# Register the pair with the visualizer; the report cells below render from it.
visualization = NotebookProfileVisualizer()
visualization.set_profiles(
    reference_profile_view=second_half_2024_dataset_2,
    target_profile_view=first_half_2024_dataset_1,
)

In [None]:
# Render the drift summary for the target/reference pair registered via set_profiles.
visualization.summary_drift_report()

In [None]:
# Compare select features between two profiles.
# "feature1" is a placeholder — presumably it must name a column present in the
# profiles passed to set_profiles (confirm).
visualization.double_histogram(feature_name=["feature1"])

In [None]:
# Can compare using reference profiles from the reference profile export as well

reference_bucket = "whylabs-reference" # the bucket where the reference profiles were exported
reference_manager = DataExportManager(bucket=reference_bucket, prefix=prefix)

# Keep the reference metadata under its own name so the export metadata held in
# `df` (read by the exploration cell) is not overwritten when this cell runs.
reference_df = reference_manager.load_metadata()
reference_manager.download_profiles(force_update=False)

# Show the reference metadata rows for the chosen org/dataset.
# Both conditions are combined into one mask: chaining frame[mask1][mask2] applies
# a mask built on the full frame to an already-filtered frame, which makes pandas
# emit a reindexing warning.
reference_df[
    (reference_df["org_id"] == org_id) & (reference_df["dataset_id"] == dataset_id)
]

In [None]:
# Fetch one reference profile for the chosen org/dataset by its ID.
# NOTE(review): the "ref-..." ID is a sample value, presumably to be replaced with
# one found in the reference metadata — confirm.
ref = reference_manager.get_reference_profile(org_id, dataset_id, "ref-Aqid1ta9omNxULJF")

# Render the profile via its to_pandas() representation as the cell output.
ref.to_pandas()

In [None]:

# Start from a fresh visualizer (replacing the earlier `visualization`), register the
# merged profile as target and the exported reference profile `ref` as reference,
# then render the drift summary for that pair.
visualization = NotebookProfileVisualizer()
visualization.set_profiles(target_profile_view=first_half_2024_dataset_1, reference_profile_view=ref)
visualization.summary_drift_report()

## Further Comparisons

See the whylogs sample visualization notebook for more examples of generating comparisons between multiple profiles:

- https://github.com/whylabs/whylogs/blob/mainline/python/examples/basic/Notebook_Profile_Visualizer.ipynb