In [None]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension and use mode 2, which re-imports all modules
# before each cell execution so edits to the imported packages are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
import os
from pathlib import Path
from typing import List

import pandas as pd
from artifact_core.table_comparison import (
    TableComparisonArrayCollectionType,
    TableComparisonArray,
    TableComparisonPlotCollectionType,
    TableComparisonPlot,
    TableComparisonScoreCollectionType,
    TableComparisonScoreType,
    TabularDataSpec,
)
from artifact_experiment.table_comparison import TableComparisonPlan
from artifact_experiment.tracking import (
    ClearMLTrackingClient,
    FilesystemTrackingClient,
    MlflowTrackingClient,
    NeptuneTrackingClient,
)

# Source Data

In [None]:
# The package root is taken to be the parent of the current working directory,
# so the notebook must be launched from a direct sub-directory of that root.
artifact_experiment_root = Path.cwd().parent

# Real and synthetic tables that the validation plan compares.
df_real = pd.read_csv(artifact_experiment_root / "assets/real.csv")
df_synthetic = pd.read_csv(artifact_experiment_root / "assets/synthetic.csv")

In [None]:
# Columns treated as continuous; every other column of df_real is passed as categorical.
ls_cts_features = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]

data_spec = TabularDataSpec.from_df(
    df=df_real,
    ls_cts_features=ls_cts_features,
    # Complement of the continuous list, kept in df_real's column order.
    ls_cat_features=[column for column in df_real.columns if column not in ls_cts_features],
)

# Validation Plan

In [None]:
class MyValidationPlan(TableComparisonPlan):
    """Demo plan declaring which table-comparison artifacts to produce.

    Each hook returns a list of enum members for one artifact family
    (scores, arrays, plots, and their collection counterparts).
    NOTE(review): how TableComparisonPlan consumes these hooks is not visible
    in this notebook — confirm against the base class.
    """

    @staticmethod
    def _get_score_types() -> List[TableComparisonScoreType]:
        """Return the selected score types: MEAN_JS_DISTANCE only."""
        selected_scores = [TableComparisonScoreType.MEAN_JS_DISTANCE]
        return selected_scores

    @staticmethod
    def _get_array_types() -> List[TableComparisonArray]:
        """Return the selected array types: none for this plan."""
        selected_arrays: List[TableComparisonArray] = []
        return selected_arrays

    @staticmethod
    def _get_plot_types() -> List[TableComparisonPlot]:
        """Return the selected plot types, in the order listed below."""
        distribution_plots = [TableComparisonPlot.PDF, TableComparisonPlot.CDF]
        summary_plots = [
            TableComparisonPlot.CORRELATION_HEATMAP_JUXTAPOSITION,
            TableComparisonPlot.DESCRIPTIVE_STATS_ALIGNMENT,
        ]
        embedding_plots = [
            TableComparisonPlot.PCA_JUXTAPOSITION,
            TableComparisonPlot.TSNE_JUXTAPOSITION,
        ]
        # Concatenation keeps the order: distributions, summaries, embeddings.
        return distribution_plots + summary_plots + embedding_plots

    @staticmethod
    def _get_score_collection_types() -> List[TableComparisonScoreCollectionType]:
        """Return the selected score-collection types: JS_DISTANCE only."""
        selected_score_collections = [TableComparisonScoreCollectionType.JS_DISTANCE]
        return selected_score_collections

    @staticmethod
    def _get_array_collection_types() -> List[TableComparisonArrayCollectionType]:
        """Return the selected array-collection types: min/max/mean/std juxtapositions."""
        juxtapositions = (
            TableComparisonArrayCollectionType.MIN_JUXTAPOSITION,
            TableComparisonArrayCollectionType.MAX_JUXTAPOSITION,
            TableComparisonArrayCollectionType.MEAN_JUXTAPOSITION,
            TableComparisonArrayCollectionType.STD_JUXTAPOSITION,
        )
        return list(juxtapositions)

    @staticmethod
    def _get_plot_collection_types() -> List[TableComparisonPlotCollectionType]:
        """Return the selected plot-collection types: PDF and CDF."""
        selected_plot_collections = [
            TableComparisonPlotCollectionType.PDF,
            TableComparisonPlotCollectionType.CDF,
        ]
        return selected_plot_collections

In [None]:
# Build the plan from the data spec alone; no tracking client is passed here.
plan = MyValidationPlan.create(resource_spec=data_spec)

In [None]:
# Run the plan's table comparison on the real and synthetic frames.
plan.execute_table_comparison(dataset_real=df_real, dataset_synthetic=df_synthetic)

# Filesystem Logging

In [None]:
# Filesystem tracking client built for the experiment id "demo".
# NOTE(review): the output location is decided inside FilesystemTrackingClient — not visible here.
filesystem_tracking_client = FilesystemTrackingClient.build(experiment_id="demo")

In [None]:
# Rebind `plan` to a new instance that is given the filesystem tracking client.
plan = MyValidationPlan.create(resource_spec=data_spec, tracking_client=filesystem_tracking_client)

In [None]:
# Same comparison call, now on the plan created with the filesystem tracking client.
plan.execute_table_comparison(dataset_real=df_real, dataset_synthetic=df_synthetic)

In [None]:
# Stop the client's run; presumably this finalizes the filesystem run — confirm in the client API.
filesystem_tracking_client.run.stop()

# ClearML Logging

In [None]:
# Name handed to the ClearML client as `experiment_id` below.
# NOTE(review): the constant is named as a project — confirm how ClearMLTrackingClient interprets it.
CLEAR_ML_PROJECT_NAME = "Artifact-ML demo"


clear_ml_client = ClearMLTrackingClient.build(experiment_id=CLEAR_ML_PROJECT_NAME)

In [None]:
# Rebind `plan` to a new instance that is given the ClearML tracking client.
plan = MyValidationPlan.create(resource_spec=data_spec, tracking_client=clear_ml_client)

In [None]:
# Run the comparison on the plan created with the ClearML tracking client.
plan.execute_table_comparison(dataset_real=df_real, dataset_synthetic=df_synthetic)

In [None]:
# Stop the client's run; presumably this closes the ClearML task — confirm in the client API.
clear_ml_client.run.stop()

# Neptune Logging

In [None]:
# Neptune project in "workspace/project" form. The NEPTUNE_PROJECT environment
# variable takes precedence when set and non-empty, so the notebook can target
# another workspace without editing this cell; the previously hard-coded project
# remains the fallback.
NEPTUNE_PROJECT_NAME = (
    os.environ.get("NEPTUNE_PROJECT") or "h.papoulias/privacy-research-longitudinal"
)

# NOTE(review): the project name is passed as `experiment_id` — confirm how
# NeptuneTrackingClient interprets it.
neptune_client = NeptuneTrackingClient.build(experiment_id=NEPTUNE_PROJECT_NAME)

In [None]:
# Rebind `plan` to a new instance that is given the Neptune tracking client.
plan = MyValidationPlan.create(resource_spec=data_spec, tracking_client=neptune_client)

In [None]:
# Run the comparison on the plan created with the Neptune tracking client.
plan.execute_table_comparison(dataset_real=df_real, dataset_synthetic=df_synthetic)

In [None]:
# Stop the client's run; presumably this closes the Neptune run — confirm in the client API.
neptune_client.run.stop()

# MLflow Logging

In [None]:
# Value handed to the MLflow client as `experiment_id` below.
# NOTE(review): this looks like an experiment name rather than an id — confirm what MlflowTrackingClient expects.
MLFLOW_EXPERIMENT_ID = "Artifact-ML demo"

mlflow_client = MlflowTrackingClient.build(experiment_id=MLFLOW_EXPERIMENT_ID)

In [None]:
# Rebind `plan` to a new instance that is given the MLflow tracking client.
plan = MyValidationPlan.create(resource_spec=data_spec, tracking_client=mlflow_client)

In [None]:
# Run the comparison on the plan created with the MLflow tracking client.
plan.execute_table_comparison(dataset_real=df_real, dataset_synthetic=df_synthetic)

In [None]:
# Stop the client's run; presumably this ends the MLflow run — confirm in the client API.
mlflow_client.run.stop()