# SynthEval check: real vs synthetic

This notebook uses `SynthEval` to evaluate one or more generated (synthetic) datasets against the real Iris dataset.

Focused metrics:
- PCA eigenvalue difference (`pca_eigval_diff`, conceptually equivalent to `exp_var_diff`)
- PCA first-component angle (`pca_eigvec_ang`, conceptually equivalent to `comp_angle_diff`)
- Quantile MSE (`avg_qMSE`)


In [None]:
from pathlib import Path

import pandas as pd
from sklearn.datasets import load_iris
from syntheval import SynthEval

# Resolve the project root: prefer the current working directory when it
# contains `demo/`, otherwise its parent when that one does. If neither
# contains `demo/`, fall back to the current working directory.
_cwd = Path.cwd()
ROOT = next(
    (base for base in (_cwd, _cwd.parent) if (base / 'demo').exists()),
    _cwd,
)

ROOT


In [None]:
# Real/given dataset (Iris): snake_case feature names, plus a `species`
# label column that replaces the integer `target` code.
iris = load_iris(as_frame=True)
species_names = dict(enumerate(iris.target_names))
real_df = (
    iris.frame
    .rename(columns={
        'sepal length (cm)': 'sepal_length',
        'sepal width (cm)': 'sepal_width',
        'petal length (cm)': 'petal_length',
        'petal width (cm)': 'petal_width',
    })
    .assign(species=lambda frame: frame['target'].map(species_names))
    .drop(columns=['target'])
)

real_df.shape


In [None]:
# Synthetic datasets to test, keyed by a display name.
# Keep at least one generated file. Add more if you have additional given synthetic files.
SYNTHETIC_DATASETS = {
    'itergen_generated': ROOT / 'demo' / 'output' / 'iris_synthetic.csv',
    # 'given_dataset': ROOT / 'path' / 'to' / 'given_synthetic.csv',
}

# Read every CSV that is present on disk; report and skip the missing ones.
loaded = {}
for label, csv_path in SYNTHETIC_DATASETS.items():
    if not Path(csv_path).exists():
        print(f'Skipping {label}: file not found -> {csv_path}')
        continue
    loaded[label] = pd.read_csv(csv_path)

# Nothing to evaluate downstream if every configured file was missing.
if not loaded:
    raise FileNotFoundError('No synthetic datasets were found. Generate one first.')

{label: frame.shape for label, frame in loaded.items()}


In [None]:
def run_eval(real_frame: pd.DataFrame, synthetic_frame: pd.DataFrame) -> pd.DataFrame:
    """Evaluate one synthetic frame against the real frame with SynthEval.

    A fresh ``SynthEval`` instance is built on ``real_frame`` with
    ``'species'`` passed as the only categorical column and ``verbose=False``.
    The evaluation is called with ``'species'`` as the analysis target and
    with options for the ``pca`` and ``q_mse`` entries only.

    Args:
        real_frame: Reference (real) data; must contain a ``species`` column.
        synthetic_frame: Synthetic data to compare against ``real_frame``.

    Returns:
        The object returned by ``SynthEval.evaluate``. The rest of this
        notebook treats it as a DataFrame with ``metric``, ``val``, ``n_val``,
        ``err`` and ``n_err`` columns.
    """
    metric_options = {
        'pca': {'preprocess': 'mean'},
        'q_mse': {'num_quants': 10, 'cat_mse': False},
    }
    syntheval = SynthEval(real_frame, cat_cols=['species'], verbose=False)
    return syntheval.evaluate(
        synthetic_frame,
        analysis_target_var='species',
        **metric_options,
    )

# One report per loaded synthetic dataset, each tagged with the dataset's
# name so the rows stay distinguishable after concatenation.
reports = [
    run_eval(real_df, candidate).assign(dataset=label)
    for label, candidate in loaded.items()
]

all_results = pd.concat(reports, ignore_index=True)
all_results


In [None]:
# SynthEval metric name -> alias used in this notebook's report.
metric_map = {
    'pca_eigval_diff': 'exp_var_diff_like',
    'pca_eigvec_ang': 'comp_angle_diff_like',
    'avg_qMSE': 'qMSE',
}
report_columns = ['dataset', 'metric', 'metric_alias', 'val', 'n_val', 'err', 'n_err']

# Keep only the focused metrics, attach their aliases, and fix the column order.
is_focus_metric = all_results['metric'].isin(list(metric_map))
selected = all_results.loc[is_focus_metric]
focus = selected.assign(metric_alias=selected['metric'].map(metric_map))[report_columns]
focus.sort_values(['dataset', 'metric_alias'])


In [None]:
# Optional: quick ranking (higher normalized value is better in SynthEval output).
# Average the normalized score of the focused metrics per dataset, best first.
mean_scores = focus.groupby('dataset', as_index=False)['n_val'].mean()
mean_scores = mean_scores.rename(columns={'n_val': 'mean_normalized_score'})
ranking = mean_scores.sort_values('mean_normalized_score', ascending=False)
ranking
