In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
from typing import List

import pandas as pd
from artifact_core.libs.resource_spec.tabular.spec import TabularDataSpec
from artifact_experiment.libs.tracking.clear_ml.client import ClearMLTrackingClient
from artifact_experiment.libs.tracking.filesystem.client import FilesystemTrackingClient
from artifact_experiment.libs.tracking.mlflow.client import MlflowTrackingClient
from artifact_experiment.libs.tracking.neptune.client import NeptuneTrackingClient
from artifact_experiment.table_comparison.validation_plan import (
    TableComparisonArrayCollectionType,
    TableComparisonArrayType,
    TableComparisonPlotCollectionType,
    TableComparisonPlotType,
    TableComparisonScoreCollectionType,
    TableComparisonScoreType,
    TableComparisonValidationPlan,
)

# Source

In [None]:
# Root directory: the parent of the current working directory.
artifact_core_root = Path.cwd().parent

# CSV locations of the real table and its synthetic counterpart under that root.
real_csv_path = artifact_core_root.joinpath("assets/real.csv")
synthetic_csv_path = artifact_core_root.joinpath("assets/synthetic.csv")

df_real = pd.read_csv(real_csv_path)
df_synthetic = pd.read_csv(synthetic_csv_path)

In [None]:
# Columns declared as continuous features.
ls_cts_features = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]

# Every remaining column of the real table (in column order) is declared categorical.
ls_cat_features = [column for column in df_real.columns if column not in ls_cts_features]

resource_spec = TabularDataSpec.from_df(
    df=df_real,
    ls_cts_features=ls_cts_features,
    ls_cat_features=ls_cat_features,
)

# Validation Plan

In [None]:
class MyValidationPlan(TableComparisonValidationPlan):
    """Demo validation plan listing the table-comparison artifact types to request.

    Each static hook returns the enum members selected for one artifact family:
    scores, arrays, plots, and the collection variant of each.
    """

    @staticmethod
    def _get_score_types() -> List[TableComparisonScoreType]:
        """Return the selected score types (only ``MEAN_JS_DISTANCE``)."""
        score_types = [TableComparisonScoreType.MEAN_JS_DISTANCE]
        return score_types

    @staticmethod
    def _get_array_types() -> List[TableComparisonArrayType]:
        """Return an empty list: no standalone array types are selected."""
        return list()

    @staticmethod
    def _get_plot_types() -> List[TableComparisonPlotType]:
        """Return the selected plot types, in the order listed."""
        plot_type = TableComparisonPlotType
        return [
            plot_type.PDF_PLOT,
            plot_type.CDF_PLOT,
            plot_type.DESCRIPTIVE_STATS_COMPARISON_PLOT,
            plot_type.PCA_PROJECTION_PLOT,
            plot_type.TSNE_PROJECTION_PLOT,
        ]

    @staticmethod
    def _get_score_collection_types() -> List[TableComparisonScoreCollectionType]:
        """Return the selected score collection types (only ``JS_DISTANCE``)."""
        score_collection_types = [TableComparisonScoreCollectionType.JS_DISTANCE]
        return score_collection_types

    @staticmethod
    def _get_array_collection_types() -> List[TableComparisonArrayCollectionType]:
        """Return the selected array collection types, in the order listed."""
        array_collection_type = TableComparisonArrayCollectionType
        return [
            array_collection_type.MINIMA,
            array_collection_type.MAXIMA,
            array_collection_type.MEANS,
            array_collection_type.STDS,
        ]

    @staticmethod
    def _get_plot_collection_types() -> List[TableComparisonPlotCollectionType]:
        """Return the selected plot collection types, in the order listed."""
        plot_collection_type = TableComparisonPlotCollectionType
        return [
            plot_collection_type.PDF_PLOTS,
            plot_collection_type.CDF_PLOTS,
        ]


# Build the plan from the resource spec only; no tracking_client argument is passed here.
plan = MyValidationPlan.build(resource_spec=resource_spec)

In [None]:
# Run the plan on the real and synthetic tables loaded earlier.
plan.execute_table_comparison(dataset_real=df_real, dataset_synthetic=df_synthetic)

# Filesystem Logging

In [None]:
# Create a filesystem tracking client for the experiment id "demo".
filesystem_tracker = FilesystemTrackingClient.build(experiment_id="demo")

In [None]:
# Display the client's run directory (presumably where this run's outputs are written).
filesystem_tracker.run_dir

In [None]:
# Rebind `plan` to a new plan that is given the filesystem client as its tracking_client.
plan = MyValidationPlan.build(resource_spec=resource_spec, tracking_client=filesystem_tracker)

In [None]:
# Run the current plan on the real and synthetic tables loaded earlier.
plan.execute_table_comparison(dataset_real=df_real, dataset_synthetic=df_synthetic)

In [None]:
# Stop the run held by the filesystem client before moving on to the next backend.
filesystem_tracker.run.stop()

# ClearML Logging

In [None]:
# Name used for the ClearML demo; it is passed below as the client's experiment_id.
CLEAR_ML_PROJECT_NAME = "Artifact-ML demo"


# Create the ClearML tracking client.
clear_ml_client = ClearMLTrackingClient.build(experiment_id=CLEAR_ML_PROJECT_NAME)

In [None]:
# Rebind `plan` to a new plan that is given the ClearML client as its tracking_client.
plan = MyValidationPlan.build(resource_spec=resource_spec, tracking_client=clear_ml_client)

In [None]:
# Run the current plan on the real and synthetic tables loaded earlier.
plan.execute_table_comparison(dataset_real=df_real, dataset_synthetic=df_synthetic)

In [None]:
# Stop the run held by the ClearML client before moving on to the next backend.
clear_ml_client.run.stop()

# Neptune Logging

In [None]:
# Name used for the Neptune demo; it is passed below as the client's experiment_id.
NEPTUNE_PROJECT_NAME = "Artifact-ML demo"

# Create the Neptune tracking client.
neptune_client = NeptuneTrackingClient.build(experiment_id=NEPTUNE_PROJECT_NAME)

In [None]:
# Rebind `plan` to a new plan that is given the Neptune client as its tracking_client.
plan = MyValidationPlan.build(resource_spec=resource_spec, tracking_client=neptune_client)

In [None]:
# Run the current plan on the real and synthetic tables loaded earlier.
plan.execute_table_comparison(dataset_real=df_real, dataset_synthetic=df_synthetic)

In [None]:
# Stop the run held by the Neptune client before moving on to the next backend.
neptune_client.run.stop()

# MLflow Logging

In [None]:
# Name of the MLflow experiment used for this demo.
MLFLOW_EXPERIMENT_NAME = "Artifact-ML demo"

# Create the experiment by name and keep the returned value to build the client with.
# NOTE(review): if this wraps mlflow's create_experiment, re-running the cell with an
# already existing experiment name may raise; confirm against the client implementation.
experiment_id = MlflowTrackingClient.create_experiment(experiment_name=MLFLOW_EXPERIMENT_NAME)

mlflow_client = MlflowTrackingClient.build(experiment_id=experiment_id)

In [None]:
# Rebind `plan` to a new plan that is given the MLflow client as its tracking_client.
plan = MyValidationPlan.build(resource_spec=resource_spec, tracking_client=mlflow_client)

In [None]:
# Run the current plan on the real and synthetic tables loaded earlier.
plan.execute_table_comparison(dataset_real=df_real, dataset_synthetic=df_synthetic)

In [None]:
# Stop the run held by the MLflow client.
mlflow_client.run.stop()