# My Generator Benchmark (Light)

This is a quick notebook to show my current progress.

I test:
1. features
2. config impact on output
3. speed and worker scaling


In [1]:
import copy
import sys
import time
from pathlib import Path

import pandas as pd
from IPython.display import display

ROOT = Path.cwd()
if not (ROOT / 'src').exists() and (ROOT.parent / 'src').exists():
    ROOT = ROOT.parent

SRC = ROOT / 'src'
if SRC.exists() and str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

from itergen import ItergenSynthesizer, RunConfig, get_sample_config
from itergen.schema.config import build_column_specs

pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 180)

print(f'Project root: {ROOT}')


def summarize_column_kinds(config):
    specs = build_column_specs(config)
    by_kind = {'binary': [], 'categorical': [], 'continuous': []}
    for col_id, spec in specs.items():
        by_kind.setdefault(spec.get('kind', 'other'), []).append(col_id)
    return specs, by_kind


def run_generation_case(label, config, run_overrides=None):
    params = dict(
        n_rows=1200,
        seed=101,
        tolerance=0.03,
        max_attempts=3,
        log_level='quiet',
        save_output=False,
        proposal_scoring_mode='incremental',
    )
    if run_overrides:
        params.update(run_overrides)

    run_cfg = RunConfig(**params)

    t0 = time.perf_counter()
    result = ItergenSynthesizer(config, run_cfg).generate()
    elapsed = time.perf_counter() - t0

    specs, by_kind = summarize_column_kinds(config)
    frame = result.dataframe

    summary = {
        'label': label,
        'rows': int(frame.shape[0]),
        'cols': int(frame.shape[1]),
        'binary_cols': len(by_kind.get('binary', [])),
        'categorical_cols': len(by_kind.get('categorical', [])),
        'continuous_cols': len(by_kind.get('continuous', [])),
        'success': bool(result.success),
        'attempts': int(result.attempts),
        'objective': float(result.metrics.get('objective', float('nan'))),
        'max_error': float(result.metrics.get('max_error', float('nan'))),
        'confidence': float(result.quality_report.get('confidence', float('nan'))),
        'runtime_sec': float(elapsed),
        'rows_per_sec': float(frame.shape[0] / elapsed) if elapsed > 0 else float('nan'),
    }
    return summary, result, specs, by_kind


def safe_first(items):
    return items[0] if items else None


Project root: /Users/temurmamanazarov/Work/Swansea-Uni/Code/itergen


In [2]:
PROFILE_NAME = 'light'

FEATURE_CASES = [
    ('Binary', 'binary', 1000),
    ('Categorical', 'categorical', 1000),
    ('Mixed', 'mixed', 1200),
]

CONFIG_BASE_SAMPLE = 'mixed'
CONFIG_VARIANTS = [
    ('baseline_incremental', {}),
    ('stricter_tolerance', {'tolerance': 0.015, 'max_attempts': 4}),
    ('faster_looser', {'tolerance': 0.06, 'max_attempts': 2}),
    ('attempt_workers_2', {'attempt_workers': 2}),
]

PERF_ROW_LEVELS = [600, 1200]
PERF_WORKER_LEVELS = [1, 2]
PERF_REPEATS = 1
PERF_MAX_ATTEMPTS = 3
PERF_OPTIMIZE_OVERRIDES = {
    'max_iters': 80,
    'patience': 6,
    'batch_size': 384,
    'proposals_per_batch': 16,
}

print(f'Loaded profile: {PROFILE_NAME}')


Loaded profile: light


## Part 1 - Feature check


In [3]:
feature_rows = []
feature_artifacts = {}

for label, sample, n_rows in FEATURE_CASES:
    print(f'Running feature case: {label}')
    cfg = get_sample_config(sample)
    summary, result, specs, by_kind = run_generation_case(
        label=label,
        config=cfg,
        run_overrides={'n_rows': n_rows, 'seed': 202, 'max_attempts': 3},
    )
    summary['sample'] = sample
    feature_rows.append(summary)
    feature_artifacts[sample] = {'result': result, 'specs': specs, 'by_kind': by_kind}
    print(f'Done: {label} in {summary["runtime_sec"]:.2f}s')

feature_df = pd.DataFrame(feature_rows).sort_values('runtime_sec').reset_index(drop=True)
display(feature_df[['label', 'sample', 'rows', 'cols', 'success', 'attempts', 'objective', 'max_error', 'confidence', 'runtime_sec', 'rows_per_sec']].round(4))

checks = []
for sample, payload in feature_artifacts.items():
    df = payload['result'].dataframe
    specs = payload['specs']
    by_kind = payload['by_kind']

    for col in by_kind.get('binary', []):
        observed = set(pd.Series(df[col]).dropna().unique().tolist())
        checks.append({'sample': sample, 'column': col, 'check': 'binary in {0,1}', 'ok': observed.issubset({0, 1})})

    for col in by_kind.get('categorical', []):
        allowed = set(specs[col].get('labels') or [])
        observed = set(pd.Series(df[col]).dropna().astype(str).unique().tolist())
        checks.append({'sample': sample, 'column': col, 'check': 'categorical in labels', 'ok': observed.issubset(allowed)})

checks_df = pd.DataFrame(checks)
if not checks_df.empty:
    checks_summary = checks_df.groupby('sample', as_index=False).agg(total=('ok', 'count'), passed=('ok', 'sum'))
    checks_summary['pass_rate'] = checks_summary['passed'] / checks_summary['total']
    display(checks_summary.round(4))


Running feature case: Binary
Done: Binary in 0.19s
Running feature case: Categorical
Done: Categorical in 0.18s
Running feature case: Mixed
Done: Mixed in 1.88s


Unnamed: 0,label,sample,rows,cols,success,attempts,objective,max_error,confidence,runtime_sec,rows_per_sec
0,Categorical,categorical,1000,2,True,1,0.0002,0.0004,0.9998,0.1798,5562.6441
1,Binary,binary,1000,3,True,1,0.0001,0.0002,0.9999,0.1896,5274.3787
2,Mixed,mixed,1200,4,True,1,0.0091,0.0343,0.991,1.878,638.9746


Unnamed: 0,sample,total,passed,pass_rate
0,binary,3,3,1.0
1,categorical,2,2,1.0
2,mixed,3,3,1.0


## Part 2 - Config impact check


In [4]:
base_config = get_sample_config(CONFIG_BASE_SAMPLE)
variant_rows = []
variant_results = {}

for name, overrides in CONFIG_VARIANTS:
    print(f'Running variant: {name}')
    summary, result, specs, by_kind = run_generation_case(
        label=name,
        config=copy.deepcopy(base_config),
        run_overrides={'n_rows': 1500, 'seed': 404, 'tolerance': 0.03, 'max_attempts': 3, **overrides},
    )
    summary['variant'] = name
    variant_rows.append(summary)
    variant_results[name] = {'result': result, 'specs': specs, 'by_kind': by_kind}
    print(f'Done variant: {name} | objective={summary["objective"]:.4f} | runtime={summary["runtime_sec"]:.2f}s')

variants_df = pd.DataFrame(variant_rows).sort_values('objective').reset_index(drop=True)
base_obj = float(variants_df.loc[variants_df['variant'] == 'baseline_incremental', 'objective'].iloc[0])
base_runtime = float(variants_df.loc[variants_df['variant'] == 'baseline_incremental', 'runtime_sec'].iloc[0])
variants_df['objective_delta_vs_base'] = variants_df['objective'] - base_obj
variants_df['runtime_delta_vs_base_sec'] = variants_df['runtime_sec'] - base_runtime
display(variants_df[['variant', 'success', 'attempts', 'objective', 'objective_delta_vs_base', 'confidence', 'runtime_sec', 'runtime_delta_vs_base_sec']].round(4))

baseline = variant_results['baseline_incremental']
binary_col = safe_first(baseline['by_kind'].get('binary', []))
categorical_col = safe_first(baseline['by_kind'].get('categorical', []))
continuous_col = safe_first(baseline['by_kind'].get('continuous', []))

rows = []
for name, payload in variant_results.items():
    df = payload['result'].dataframe
    row = {'variant': name}
    if binary_col and binary_col in df.columns:
        row[f'{binary_col}_rate'] = float(pd.to_numeric(df[binary_col], errors='coerce').mean())
    if categorical_col and categorical_col in df.columns:
        dist = df[categorical_col].value_counts(normalize=True)
        row[f'{categorical_col}_top'] = str(dist.index[0]) if not dist.empty else None
        row[f'{categorical_col}_top_share'] = float(dist.iloc[0]) if not dist.empty else float('nan')
    if continuous_col and continuous_col in df.columns:
        series = pd.to_numeric(df[continuous_col], errors='coerce')
        row[f'{continuous_col}_mean'] = float(series.mean())
        row[f'{continuous_col}_std'] = float(series.std(ddof=0))
    rows.append(row)

display(pd.DataFrame(rows).sort_values('variant').reset_index(drop=True).round(4))


Running variant: baseline_incremental
Done variant: baseline_incremental | objective=0.0094 | runtime=2.27s
Running variant: stricter_tolerance
Done variant: stricter_tolerance | objective=0.0041 | runtime=3.58s
Running variant: faster_looser
Done variant: faster_looser | objective=0.0177 | runtime=1.16s
Running variant: attempt_workers_2
Done variant: attempt_workers_2 | objective=0.0094 | runtime=2.82s


Unnamed: 0,variant,success,attempts,objective,objective_delta_vs_base,confidence,runtime_sec,runtime_delta_vs_base_sec
0,stricter_tolerance,True,1,0.0041,-0.0053,0.996,3.5795,1.3142
1,baseline_incremental,True,1,0.0094,0.0,0.9908,2.2653,0.0
2,attempt_workers_2,True,1,0.0094,0.0,0.9908,2.8238,0.5585
3,faster_looser,True,1,0.0177,0.0084,0.9826,1.1646,-1.1008


Unnamed: 0,variant,loyalty_rate,device_top,device_top_share,spend_mean,spend_std
0,attempt_workers_2,0.416,mobile,0.4753,110.1194,23.1977
1,baseline_incremental,0.416,mobile,0.4753,110.1194,23.1977
2,faster_looser,0.416,mobile,0.482,110.0483,21.5156
3,stricter_tolerance,0.416,mobile,0.4767,110.1677,24.2355


## Part 3 - Performance check


In [5]:
perf_config = get_sample_config('mixed')
perf_rows = []

for n_rows in PERF_ROW_LEVELS:
    for workers in PERF_WORKER_LEVELS:
        for rep in range(PERF_REPEATS):
            print(f'Perf run -> rows={n_rows}, workers={workers}, repeat={rep}')
            seed = 700 + rep
            summary, _result, _specs, _by_kind = run_generation_case(
                label=f'rows_{n_rows}_w_{workers}_r_{rep}',
                config=copy.deepcopy(perf_config),
                run_overrides={
                    'n_rows': n_rows,
                    'seed': seed,
                    'tolerance': 0.03,
                    'max_attempts': PERF_MAX_ATTEMPTS,
                    'attempt_workers': workers,
                    'proposal_scoring_mode': 'incremental',
                    'optimize_overrides': PERF_OPTIMIZE_OVERRIDES,
                },
            )
            perf_rows.append({
                'n_rows': n_rows,
                'workers': workers,
                'repeat': rep,
                'runtime_sec': summary['runtime_sec'],
                'rows_per_sec': summary['rows_per_sec'],
                'objective': summary['objective'],
                'success': summary['success'],
                'attempts': summary['attempts'],
            })

perf_df = pd.DataFrame(perf_rows)
display(perf_df.round(4))

perf_agg = perf_df.groupby(['n_rows', 'workers'], as_index=False).agg(
    avg_runtime_sec=('runtime_sec', 'mean'),
    avg_rows_per_sec=('rows_per_sec', 'mean'),
    avg_objective=('objective', 'mean'),
    success_rate=('success', 'mean'),
    avg_attempts=('attempts', 'mean'),
)

base = perf_agg[perf_agg['workers'] == 1][['n_rows', 'avg_runtime_sec']].rename(columns={'avg_runtime_sec': 'runtime_w1'})
perf_agg = perf_agg.merge(base, on='n_rows', how='left')
perf_agg['speedup_vs_1'] = perf_agg['runtime_w1'] / perf_agg['avg_runtime_sec']
display(perf_agg.round(4))

try:
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    for workers in sorted(perf_agg['workers'].unique()):
        sub = perf_agg[perf_agg['workers'] == workers]
        axes[0].plot(sub['n_rows'], sub['avg_runtime_sec'], marker='o', label=f'workers={workers}')
        axes[1].plot(sub['n_rows'], sub['avg_rows_per_sec'], marker='o', label=f'workers={workers}')
    axes[0].set_title('Runtime vs Rows (Light)')
    axes[0].set_xlabel('n_rows')
    axes[0].set_ylabel('seconds')
    axes[0].legend()
    axes[1].set_title('Throughput vs Rows (Light)')
    axes[1].set_xlabel('n_rows')
    axes[1].set_ylabel('rows/sec')
    axes[1].legend()
    plt.tight_layout()
    plt.show()
except Exception as exc:
    print(f'Plot skipped: {exc}')

print('Current progress: light benchmark finished')


Perf run -> rows=600, workers=1, repeat=0
Perf run -> rows=600, workers=2, repeat=0
Perf run -> rows=1200, workers=1, repeat=0
Perf run -> rows=1200, workers=2, repeat=0


Unnamed: 0,n_rows,workers,repeat,runtime_sec,rows_per_sec,objective,success,attempts
0,600,1,0,0.9452,634.7569,0.0088,True,1
1,600,2,0,1.7097,350.9424,0.0088,True,1
2,1200,1,0,1.7852,672.1854,0.0095,True,1
3,1200,2,0,3.1559,380.2443,0.0095,True,1


Unnamed: 0,n_rows,workers,avg_runtime_sec,avg_rows_per_sec,avg_objective,success_rate,avg_attempts,runtime_w1,speedup_vs_1
0,600,1,0.9452,634.7569,0.0088,1.0,1.0,0.9452,1.0
1,600,2,1.7097,350.9424,0.0088,1.0,1.0,0.9452,0.5529
2,1200,1,1.7852,672.1854,0.0095,1.0,1.0,1.7852,1.0
3,1200,2,3.1559,380.2443,0.0095,1.0,1.0,1.7852,0.5657


Plot skipped: No module named 'matplotlib'
Current progress: light benchmark finished


## Output explanation and result notes

How to read the main output columns:
- `objective`: main quality score (lower is better)
- `max_error`: worst error point (lower is better)
- `confidence`: quick quality signal (higher is better)
- `runtime_sec`: how long one run took
- `rows_per_sec`: generation speed
- `speedup_vs_1`: worker gain vs single worker

Possible causes behind numbers:
- strict settings can improve quality but often cost more time
- looser settings can be faster but may reduce quality
- worker scaling depends on run size and machine overhead


In [6]:
print('=== Light run discussion ===')

if 'feature_df' in globals() and len(feature_df) > 0:
    fast = feature_df.sort_values('runtime_sec').iloc[0]
    best = feature_df.sort_values('objective').iloc[0]
    print(f'- Fastest feature case: {fast["label"]} ({fast["runtime_sec"]:.2f}s).')
    print(f'- Best quality feature case: {best["label"]} (objective={best["objective"]:.4f}).')
    print('  Why this can happen: feature complexity changes optimization difficulty.')

if 'variants_df' in globals() and len(variants_df) > 0:
    best_variant = variants_df.sort_values('objective').iloc[0]
    fast_variant = variants_df.sort_values('runtime_sec').iloc[0]
    print(f'- Best config quality: {best_variant["variant"]} (objective={best_variant["objective"]:.4f}).')
    print(f'- Fastest config: {fast_variant["variant"]} ({fast_variant["runtime_sec"]:.2f}s).')
    print('  Possible cause: this is a speed vs quality trade-off between variants.')

if 'perf_agg' in globals() and len(perf_agg) > 0:
    best_speed = perf_agg.sort_values('speedup_vs_1', ascending=False).iloc[0]
    print(
        f'- Best speedup: rows={int(best_speed["n_rows"])}, workers={int(best_speed["workers"])}, '
        f'speedup={best_speed["speedup_vs_1"]:.2f}x'
    )
    if best_speed['speedup_vs_1'] > 1.1:
        print('  Cause idea: parallel attempts are helping on this workload size.')
    else:
        print('  Cause idea: overhead is close to compute time in light runs.')

print('Current progress note: light results discussed.')


=== Light run discussion ===
- Fastest feature case: Categorical (0.18s).
- Best quality feature case: Binary (objective=0.0001).
  Why this can happen: feature complexity changes optimization difficulty.
- Best config quality: stricter_tolerance (objective=0.0041).
- Fastest config: faster_looser (1.16s).
  Possible cause: this is a speed vs quality trade-off between variants.
- Best speedup: rows=600, workers=1, speedup=1.00x
  Cause idea: overhead is close to compute time in light runs.
Current progress note: light results discussed.


## My final conclusion (fill this after run)

Use 4-5 short lines:
- Current progress: ...
- Best quality result: ...
- Best speed result: ...
- Main trade-off I saw: ...
- Possible cause of this behavior: ...
- Next thing I will improve: ...
