In [None]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is global, so warnings raised by later cells are hidden too.
import warnings
warnings.filterwarnings('ignore')

In [1]:
%%capture
# Install the packages this notebook needs; %%capture keeps pip's output out of the cell.
!pip install sdmetrics
!pip install sdv
# scikit-learn is pinned to 1.1 -- presumably for compatibility with the packages above; TODO confirm.
!pip install scikit-learn==1.1

In [2]:
from pandas import read_csv

from common.config import *
from common.utils import update_json_file, ndd, ndd_to_dict, load_dataframe

from ydata.metadata import Metadata
from ydata.utils.data_types import DataType, VariableType

# Notebook-level copy of the metrics configuration.
# compute_all_metrics() calls get_metrics_config() itself and does not read this name.
metrics_config = get_metrics_config()

In [3]:
from typing import Dict
from numpy import array as nparray

from common.config import get_metrics_config
from common.utils import ndd, ndd_to_dict, get_model_class

def compute_single_column_metrics(metric_info: Dict, metadata: Metadata, real_data, synthetic_data) -> Dict:
    """Score each eligible column with a single-column metric and summarise the scores.

    Args:
        metric_info: Metric descriptor. ``metric_info['class']`` must expose
            ``compute(real_data=..., synthetic_data=...)``. The optional
            ``'datatype'`` entry restricts the metric to columns whose
            metadata datatype equals it; when absent, every column is scored.
        metadata: Dataset metadata; ``metadata.columns`` maps each column name
            to an object carrying a ``datatype`` attribute.
        real_data: Real dataset, indexable by column name.
        synthetic_data: Synthetic dataset, indexable by column name.

    Returns:
        The result of ``ndd_to_dict`` applied to a structure holding the
        per-column scores under ``'columns'`` and their ``'mean'`` and
        ``'std'``.
    """
    results = ndd()
    scorer = metric_info['class']
    required_dtype = metric_info.get('datatype')

    for column, column_meta in metadata.columns.items():
        eligible = required_dtype is None or column_meta.datatype == required_dtype
        if not eligible:
            continue
        results['columns'][column] = scorer.compute(
            real_data=real_data[column],
            synthetic_data=synthetic_data[column],
        )

    # Build the score vector once and derive both summary statistics from it.
    scores = nparray(list(results['columns'].values()))
    results['mean'] = scores.mean()
    results['std'] = scores.std()
    return ndd_to_dict(results)


In [4]:
from typing import Dict
from numpy import array as nparray
from itertools import combinations

def compute_column_pairs_metrics(metric_info: Dict, metadata: Metadata, real_data, synthetic_data) -> Dict:
    """Score every unordered pair of eligible columns with a column-pair metric.

    Args:
        metric_info: Metric descriptor. ``metric_info['class']`` must expose
            ``compute(real_data=..., synthetic_data=...)``. The optional
            ``'datatype'`` entry restricts the metric to columns whose
            metadata datatype equals it; when absent, every column is eligible.
        metadata: Dataset metadata; ``metadata.columns`` maps each column name
            to an object carrying a ``datatype`` attribute.
        real_data: Real dataset; indexed with a two-element list of column names.
        synthetic_data: Synthetic dataset; indexed the same way.

    Returns:
        The result of ``ndd_to_dict`` applied to a structure holding one score
        per pair under ``'columns'`` (keyed ``"<first>|<second>"``) and the
        ``'mean'`` and ``'std'`` of those scores. With fewer than two eligible
        columns there are no pairs, so NumPy yields NaN for both statistics.
    """
    metric_res = ndd()
    metric = metric_info['class']
    metric_dtype = metric_info.get('datatype')
    columns_to_consider = [
        name
        for name, column in metadata.columns.items()
        if metric_dtype is None or column.datatype == metric_dtype
    ]

    # combinations() yields each unordered pair once, in metadata column order.
    for k1, k2 in combinations(columns_to_consider, 2):
        # Pair key format: "<first>|<second>".
        metric_res['columns'][f"{k1}|{k2}"] = metric.compute(
            real_data=real_data[[k1, k2]],
            synthetic_data=synthetic_data[[k1, k2]],
        )

    # Build the score vector once and derive both summary statistics from it.
    scores = nparray(list(metric_res['columns'].values()))
    metric_res['mean'] = scores.mean()
    metric_res['std'] = scores.std()
    return ndd_to_dict(metric_res)


In [5]:
def compute_all_metrics():
    """Evaluate every configured metric for every (dataset, model) pair.

    For each dataset the metadata pickle and the real dataframe are loaded,
    then for each model the synthetic sample CSV is read. Every metric listed
    in the metrics configuration is dispatched to the handler function named
    by its category's ``'handler'`` entry, which is looked up in ``globals()``.
    After each successful metric the accumulated results are handed to
    ``update_json_file`` for the configured metrics output path.

    A sample that cannot be read skips that model for the dataset; a metric
    that raises is reported on stdout and skipped. Neither aborts the run.

    Returns:
        Plain dict of results nested as
        ``[dataset][model][bundle][metric_category][metric_name]``.
    """
    datasets_config = get_datsets_config()
    models_config = get_models_config()
    metrics_config = get_metrics_config()
    analysis_config = get_analysis_config()
    # The output location is loop-invariant, so resolve it once up front.
    path = analysis_config['output_files']['metrics']

    evals = ndd()
    for dataset_name in datasets_config.keys():
        metadata = Metadata.load(str(Path(DATASET_PATH) / f'{dataset_name}_all.metadata.pkl'))
        dataset_df = load_dataframe(dataset_name)
        print(f'# Dataset {dataset_name}')
        for model_name in models_config.keys():
            print(f' # Model: {model_name}')
            try:
                sample = read_csv(Path(SAMPLES_PATH) / f'{model_name}_{dataset_name}_sample.csv')
            except Exception as e:
                # Only the dataset and model names are bound at this point; the
                # metric loop variables do not exist yet and must not be used here.
                print(f'Could not load the sample for {model_name} on {dataset_name}: {e}')
                continue
            for bundle, bundle_info in metrics_config.items():
                print(f'  # Metric bundle: {bundle}')
                for metric_category, metrics_infos in bundle_info.items():
                    print(f'  # Metric category: {metric_category}')
                    metrics_infos_ = metrics_infos['metrics']
                    handler_name = metrics_infos['handler']
                    # Handlers are the compute_* functions defined in this notebook.
                    handler = globals()[handler_name]
                    for mname, metric_info in metrics_infos_.items():
                        try:
                            print(f'  -> Metric: {mname}')
                            evals[dataset_name][model_name][bundle][metric_category][mname] = handler(metric_info, metadata, dataset_df, sample)
                            # Save after every metric so partial results survive an interrupted run.
                            update_json_file(path, ndd_to_dict(evals))
                        except Exception as e:
                            print(f'Could not compute the metric {bundle}.{metric_category}.{mname} for {model_name} on {dataset_name}: {e}')
    return ndd_to_dict(evals)


In [None]:
# Run the full evaluation; progress is printed per dataset, model, bundle, category and metric.
metrics_evals = compute_all_metrics()

# Dataset sdv.adult
 # Model: fabric.regular
  # Metric bundle: sdv
  # Metric category: single_columns
  -> Metric: category_coverage
  -> Metric: range_coverage
  -> Metric: bounary_adherence
  -> Metric: ks_complement
  -> Metric: tv_complement
  -> Metric: statistic_similarity
  -> Metric: missing_value_similarity
  # Metric category: column_pairs
  -> Metric: contingency_similarity
  -> Metric: correlation_similarity
 # Model: sdv.tabular
  # Metric bundle: sdv
  # Metric category: single_columns
  -> Metric: category_coverage
  -> Metric: range_coverage
  -> Metric: bounary_adherence
  -> Metric: ks_complement
  -> Metric: tv_complement


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



  -> Metric: statistic_similarity
  -> Metric: missing_value_similarity
  # Metric category: column_pairs
  -> Metric: contingency_similarity


In [None]:
# Hand the final results to update_json_file for the metrics output path from the analysis config.
# NOTE(review): compute_all_metrics() already does this after every metric, so this looks like a final safety write -- confirm.
analysis_config = get_analysis_config()
path = analysis_config['output_files']['metrics']
update_json_file(path, metrics_evals)