# 🧪 NSAI Experiment Notebook

**Neurosymbolic Runner Selection — A/B Comparison & Analysis**

| Version | Date | Author |
|---------|------|--------|
| 0.3.0 | 2026-02-06 | Wolfram Laube |

This notebook runs reproducible experiments comparing three runner selection strategies:

1. **Rule-Based** — always pick the same fixed default runner, `gitlab-runner-nordic`, with no learning (static baseline)
2. **Pure MAB** — UCB1 over all runners, no constraint filtering
3. **NSAI** — CSP filter → UCB1 (our neurosymbolic approach)

Metrics: cumulative reward, regret, convergence speed, selection distribution.

**Every section ends with `assert` cells — this notebook is a test suite.**

---
## 0. Setup & Imports

In [None]:
import sys, os
import math, random, time
import json
from collections import Counter, defaultdict
from typing import Dict, List, Tuple

# Ensure nsai is importable (local, CI, or Colab)
for p in [os.path.abspath(os.path.join(os.getcwd(), '..')),
          os.path.abspath(os.path.join(os.getcwd(), '..', '..')),
          '/content']:
    if p not in sys.path:
        sys.path.insert(0, p)

from nsai import NeurosymbolicBandit, __version__
from nsai.ontology import RunnerOntology, create_blauweiss_ontology
from nsai.csp import ConstraintSolver, SolverStatus
from nsai.parser import JobRequirementParser

print(f'NSAI v{__version__}')
print(f'Python {sys.version.split()[0]}')

In [None]:
# ── TestSuite: Setup ──────────────────────────────────────
import re

# Compare versions numerically. As plain strings, '0.10.0' sorts before
# '0.3.0', so a newer release would wrongly fail this gate.
_ver_match = re.match(r'(\d+)(?:\.(\d+))?(?:\.(\d+))?', __version__)
assert _ver_match, f'Cannot parse NSAI version: {__version__!r}'
_version_tuple = tuple(int(part or 0) for part in _ver_match.groups())
assert _version_tuple >= (0, 3, 0), f'Need NSAI >= 0.3.0, got {__version__}'

onto = create_blauweiss_ontology()
assert len(onto.runners) == 4, f'Expected 4 runners, got {len(onto.runners)}'

expected_runners = {
    'gitlab-runner-nordic', 'Mac Docker Runner',
    'Mac2 Docker Runner', 'Linux Yoga Docker Runner'
}
# Symmetric difference in the message shows names missing on either side.
assert set(onto.runners.keys()) == expected_runners, \
    f'Runner mismatch: {set(onto.runners.keys()) ^ expected_runners}'

# MAB tag mapping roundtrip: every runner has a tag, and the tag maps back
# to the same runner name.
for name, runner in onto.runners.items():
    assert runner.mab_tag, f'Runner {name} has no mab_tag'
    assert onto.runner_name_for_mab_tag(runner.mab_tag) == name, \
        f'Tag roundtrip failed for {name}'

print('✅ Setup: version, runners, tag mapping')

---
## 1. Ground Truth Definition

We simulate job execution with known per-runner success probabilities and durations.
This lets us compute **optimal reward** and therefore **regret**.

In [None]:
GROUND_TRUTH = {
    'docker-any': {
        'gitlab-runner-nordic':     {'p_success': 0.96, 'avg_duration': 18.0, 'cost': 0.01},
        'Mac Docker Runner':        {'p_success': 0.92, 'avg_duration': 25.0, 'cost': 0.00},
        'Mac2 Docker Runner':       {'p_success': 0.85, 'avg_duration': 35.0, 'cost': 0.00},
        'Linux Yoga Docker Runner': {'p_success': 0.95, 'avg_duration': 15.0, 'cost': 0.00},
    },
    'gcp': {
        'gitlab-runner-nordic':     {'p_success': 0.96, 'avg_duration': 18.0, 'cost': 0.01},
        'Mac Docker Runner':        {'p_success': 0.00, 'avg_duration': 999,  'cost': 0.00},
        'Mac2 Docker Runner':       {'p_success': 0.00, 'avg_duration': 999,  'cost': 0.00},
        'Linux Yoga Docker Runner': {'p_success': 0.00, 'avg_duration': 999,  'cost': 0.00},
    },
    'shell': {
        'gitlab-runner-nordic':     {'p_success': 0.94, 'avg_duration': 12.0, 'cost': 0.01},
        'Mac Docker Runner':        {'p_success': 0.00, 'avg_duration': 999,  'cost': 0.00},
        'Mac2 Docker Runner':       {'p_success': 0.00, 'avg_duration': 999,  'cost': 0.00},
        'Linux Yoga Docker Runner': {'p_success': 0.93, 'avg_duration': 10.0, 'cost': 0.00},
    },
}

def compute_reward(success: bool, duration: float, cost_per_min: float = 0.0) -> float:
    """Score one job outcome; failures score zero, fast cheap successes score high.

    For a successful job the reward is ``1 / (minutes + cost + 0.1)`` where
    ``minutes = duration / 60`` and ``cost = cost_per_min * minutes``. The
    constant 0.1 keeps the denominator away from zero for very short jobs.

    Args:
        success: Whether the job succeeded.
        duration: Job duration in seconds.
        cost_per_min: Cost charged per minute of runtime.

    Returns:
        0.0 for a failed job, otherwise the reciprocal described above.
    """
    if success:
        minutes = duration / 60.0
        # Same left-to-right addition order as before so results stay bit-identical.
        denominator = minutes + cost_per_min * minutes + 0.1
        return 1.0 / denominator
    return 0.0

def simulate_job(runner: str, job_type: str, rng: random.Random) -> Tuple[bool, float]:
    """Draw one simulated job outcome from the ground-truth profile.

    Runners missing from ``GROUND_TRUTH[job_type]`` fall back to a profile
    that never succeeds and averages 60 seconds.

    Args:
        runner: Runner name to look up.
        job_type: Key into ``GROUND_TRUTH``.
        rng: Random source; draws happen in a fixed order (success roll,
            duration, then an extra duration draw only on failure) so a
            given seed reproduces the same outcomes.

    Returns:
        ``(success, duration_seconds)``.
    """
    fallback_profile = {'p_success': 0.0, 'avg_duration': 60.0}
    profile = GROUND_TRUTH[job_type].get(runner, fallback_profile)
    mean_duration = profile['avg_duration']

    succeeded = rng.random() < profile['p_success']
    # Duration is normal with a 20% standard deviation, floored at 5 seconds.
    sampled = rng.gauss(mean_duration, mean_duration * 0.2)
    duration = max(5.0, sampled)
    if succeeded:
        return succeeded, duration

    # A failed job lasts at least as long as a second draw around 60 seconds.
    failure_floor = rng.gauss(60.0, 10.0)
    return succeeded, max(duration, failure_floor)

# Show expected rewards
print('Expected rewards per runner (docker-any):\n')
for runner, prof in GROUND_TRUTH['docker-any'].items():
    e_reward = prof['p_success'] * compute_reward(True, prof['avg_duration'], prof['cost'])
    print(f'  {runner:30} E[reward] = {e_reward:.3f}')

In [None]:
# ── TestSuite: Ground Truth ────────────────────────────────
# All ground-truth runners must exist in ontology
for job_type, runners in GROUND_TRUTH.items():
    for runner in runners:
        assert runner in onto.runners, \
            f'Ground truth runner "{runner}" not in ontology'

# Reward function sanity
assert compute_reward(True, 30.0) > 0, 'Success should yield positive reward'
assert compute_reward(False, 30.0) == 0.0, 'Failure should yield zero reward'
assert compute_reward(True, 10.0) > compute_reward(True, 60.0), \
    'Faster jobs should yield higher reward'
assert compute_reward(True, 30.0, cost_per_min=0.0) > compute_reward(True, 30.0, cost_per_min=1.0), \
    'Higher cost should reduce reward'

# Simulation determinism
rng1 = random.Random(42)
rng2 = random.Random(42)
r1 = simulate_job('gitlab-runner-nordic', 'docker-any', rng1)
r2 = simulate_job('gitlab-runner-nordic', 'docker-any', rng2)
assert r1 == r2, 'Simulation must be deterministic with same seed'

# Linux Yoga should have highest expected reward (docker-any)
expected = {}
for r, p in GROUND_TRUTH['docker-any'].items():
    expected[r] = p['p_success'] * compute_reward(True, p['avg_duration'], p['cost'])
best_runner = max(expected, key=expected.get)
assert best_runner == 'Linux Yoga Docker Runner', \
    f'Expected Linux Yoga as optimal, got {best_runner}'

print('✅ Ground truth: ontology alignment, reward monotonicity, determinism, optimal runner')

---
## 2. Strategy Implementations

In [None]:
class RuleBasedStrategy:
    """Static baseline: always returns the same fixed default runner.

    The job type is ignored and nothing is learned from outcomes.
    """

    def __init__(self):
        self._default = 'gitlab-runner-nordic'
        self.name = 'Rule-Based'

    def select(self, job_type: str) -> str:
        """Return the fixed default runner, whatever ``job_type`` is."""
        chosen = self._default
        return chosen

    def update(self, runner, success, duration, cost=0.0):
        """Accept feedback for interface parity with the learners; ignore it."""
        return None


class PureMABStrategy:
    """UCB1 over ALL runners — no symbolic filtering.

    Each runner is one arm. ``select`` first tries every arm once (in the
    order given to the constructor), then picks the arm with the highest
    ``mean_reward + c * sqrt(log(total + 1) / pulls)``.
    """
    def __init__(self, runners: list, c: float = 2.0):
        """Create one zeroed arm per runner; ``c`` scales the exploration bonus."""
        self.name = 'Pure MAB'
        self.c = c
        # Dict insertion order fixes both exploration order and tie-breaking.
        self._stats = {r: {'pulls': 0, 'total_reward': 0.0} for r in runners}
        self._total = 0

    def select(self, job_type: str) -> str:
        """Return the runner to try next; ``job_type`` is ignored.

        Returns ``None`` only when no runners were registered.
        """
        # Exploration phase: any arm that has never been pulled goes first.
        for r, s in self._stats.items():
            if s['pulls'] == 0:
                return r
        # Start from -inf rather than -1 so an arm is still chosen when every
        # UCB score is very negative (e.g. a reward with penalties).
        best, best_ucb = None, -math.inf
        log_total = math.log(self._total + 1)
        for r, s in self._stats.items():
            mean = s['total_reward'] / s['pulls']
            explore = self.c * math.sqrt(log_total / s['pulls'])
            ucb = mean + explore
            # Strict '>' keeps the earliest-registered arm on ties.
            if ucb > best_ucb:
                best, best_ucb = r, ucb
        return best

    def update(self, runner, success, duration, cost=0.0):
        """Record one observed outcome for ``runner`` using ``compute_reward``."""
        reward = compute_reward(success, duration, cost)
        self._stats[runner]['pulls'] += 1
        self._stats[runner]['total_reward'] += reward
        self._total += 1


class NSAIStrategy:
    """Full NSAI: CSP filter → UCB1.

    Thin adapter that exposes a ``NeurosymbolicBandit`` through the same
    ``select`` / ``update`` interface as the other strategies.
    """

    def __init__(self):
        self.nsai = NeurosymbolicBandit.create_default()
        self.name = 'NSAI'

    def select(self, job_type: str) -> str:
        """Ask the bandit for a runner, passing the job type as its only tag.

        Falls back to 'gitlab-runner-nordic' when the bandit returns no runner.
        """
        job = {'tags': [job_type]}
        chosen, _explanation = self.nsai.select_runner(job)
        if chosen:
            return chosen
        return 'gitlab-runner-nordic'

    def update(self, runner, success, duration, cost=0.0):
        """Forward the observed outcome to the wrapped bandit."""
        self.nsai.update(runner, success, duration, cost)

print('Strategies defined ✓')

In [None]:
# ── TestSuite: Strategy Sanity ─────────────────────────────
runners_list = list(GROUND_TRUTH['docker-any'].keys())

# Rule-Based always returns same runner
rb = RuleBasedStrategy()
assert rb.select('docker-any') == rb.select('gcp') == 'gitlab-runner-nordic'

# Pure MAB explores first, then exploits (must update between selects)
mab = PureMABStrategy(runners_list)
first_4 = []
for _ in range(4):
    sel_m = mab.select('docker-any')
    first_4.append(sel_m)
    mab.update(sel_m, success=True, duration=20.0)
assert len(set(first_4)) == 4, \
    f'MAB should explore all 4 runners first, visited {len(set(first_4))}'

# NSAI returns a valid runner
ns = NSAIStrategy()
sel = ns.select('docker-any')
assert sel in runners_list, f'NSAI selected unknown runner: {sel}'

print('✅ Strategies: rule-based static, MAB explores all, NSAI valid selection')


---
## 3. Run Experiment (docker-any)

In [None]:
def run_experiment(n_rounds=200, seed=42, job_type='docker-any'):
    """Run the A/B experiment for one job type and return per-round results.

    Each round, every strategy selects a runner, a job is simulated on it,
    and the strategy is updated with the outcome. All strategies draw from
    one shared ``random.Random(seed)`` stream, so a run is reproducible for a
    given seed but the strategies' random outcomes are not independent.

    Args:
        n_rounds: Number of rounds to simulate.
        seed: Seed for the shared random stream.
        job_type: Key into ``GROUND_TRUTH`` selecting the runner profiles.

    Returns:
        ``(results, best_expected)``. ``results`` maps strategy name to a dict
        of equal-length lists: ``rewards``, ``cum_reward``, ``regret``,
        ``cum_regret`` and ``selections``. ``best_expected`` is the highest
        expected reward among runners with a non-zero success probability.

    Raises:
        ValueError: If no runner for ``job_type`` has ``p_success > 0``.
    """
    rng = random.Random(seed)
    runners = list(GROUND_TRUTH[job_type].keys())

    strategies = [
        RuleBasedStrategy(),
        PureMABStrategy(runners),
        NSAIStrategy(),
    ]

    # Optimal expected reward for regret calculation
    best_expected = max(
        p['p_success'] * compute_reward(True, p['avg_duration'], p['cost'])
        for p in GROUND_TRUTH[job_type].values()
        if p['p_success'] > 0
    )

    results = {s.name: {
        'rewards': [], 'cum_reward': [], 'regret': [],
        'cum_regret': [], 'selections': []
    } for s in strategies}

    for t in range(n_rounds):
        for strategy in strategies:
            r = results[strategy.name]
            runner = strategy.select(job_type)
            success, duration = simulate_job(runner, job_type, rng)
            cost = GROUND_TRUTH[job_type].get(runner, {}).get('cost', 0.0)
            reward = compute_reward(success, duration, cost)
            strategy.update(runner, success, duration, cost)

            # Regret is measured against the best *expected* reward, so a
            # lucky round can produce negative regret.
            regret = best_expected - reward

            # Extend the cumulative series from the previous entry instead of
            # re-summing the whole history each round (O(n) rather than O(n^2)).
            prev_reward = r['cum_reward'][-1] if r['cum_reward'] else 0.0
            prev_regret = r['cum_regret'][-1] if r['cum_regret'] else 0.0

            r['rewards'].append(reward)
            r['cum_reward'].append(prev_reward + reward)
            r['regret'].append(regret)
            r['cum_regret'].append(prev_regret + regret)
            r['selections'].append(runner)

    return results, best_expected

N_ROUNDS = 300
results, optimal_reward = run_experiment(n_rounds=N_ROUNDS, seed=42)
print(f'Experiment complete: {N_ROUNDS} rounds × {len(results)} strategies')
print(f'Optimal expected reward per round: {optimal_reward:.3f}')

In [None]:
# ── TestSuite: Experiment Integrity ────────────────────────
for name, r in results.items():
    assert len(r['rewards']) == N_ROUNDS, \
        f'{name}: expected {N_ROUNDS} rounds, got {len(r["rewards"])}'
    assert len(r['selections']) == N_ROUNDS
    assert len(r['cum_regret']) == N_ROUNDS

    # Cumulative reward must be monotonically non-decreasing
    for i in range(1, len(r['cum_reward'])):
        assert r['cum_reward'][i] >= r['cum_reward'][i-1], \
            f'{name}: cum_reward decreased at round {i}'

    # All rewards must be non-negative
    assert all(rw >= 0 for rw in r['rewards']), f'{name}: negative reward found'

    # All selections must be valid runners
    valid = set(GROUND_TRUTH['docker-any'].keys())
    invalid = set(r['selections']) - valid
    assert not invalid, f'{name}: invalid selections {invalid}'

# Determinism: same seed → same results
results2, _ = run_experiment(n_rounds=50, seed=42)
for name in results:
    assert results[name]['rewards'][:50] == results2[name]['rewards'], \
        f'{name}: not deterministic with same seed'

print(f'✅ Experiment integrity: {N_ROUNDS} rounds, monotonic rewards, valid selections, deterministic')

---
## 4. Results: Cumulative Reward

In [None]:
print('=== Cumulative Reward (after %d rounds) ===\n' % N_ROUNDS)
for name, r in sorted(results.items(), key=lambda x: -x[1]['cum_reward'][-1]):
    total = r['cum_reward'][-1]
    avg = total / N_ROUNDS
    bar = '█' * int(total / 5)
    print(f'{name:15} {total:7.1f}  (avg {avg:.3f})  {bar}')

print('\n=== Cumulative Regret (lower is better) ===\n')
for name, r in sorted(results.items(), key=lambda x: x[1]['cum_regret'][-1]):
    total = r['cum_regret'][-1]
    bar = '░' * int(total / 2)
    print(f'{name:15} {total:7.1f}  {bar}')

In [None]:
# ── TestSuite: Reward Comparison ──────────────────────────
nsai_total  = results['NSAI']['cum_reward'][-1]
rule_total  = results['Rule-Based']['cum_reward'][-1]
mab_total   = results['Pure MAB']['cum_reward'][-1]

# NSAI must beat Rule-Based (the whole point)
assert nsai_total > rule_total, \
    f'NSAI ({nsai_total:.1f}) should beat Rule-Based ({rule_total:.1f})'

# All strategies must have positive total reward
for name, r in results.items():
    assert r['cum_reward'][-1] > 0, f'{name} has zero total reward'

# NSAI should achieve at least 80% of Pure MAB reward
# (CSP adds overhead on unconstrained jobs, but shouldn't destroy performance)
ratio = nsai_total / mab_total
assert ratio > 0.80, \
    f'NSAI/MAB ratio {ratio:.2f} too low (threshold 0.80)'

# NSAI regret must be finite and less than Rule-Based regret
nsai_regret = results['NSAI']['cum_regret'][-1]
rule_regret = results['Rule-Based']['cum_regret'][-1]
assert nsai_regret < rule_regret, \
    f'NSAI regret ({nsai_regret:.1f}) should be < Rule-Based ({rule_regret:.1f})'

print(f'✅ Reward: NSAI ({nsai_total:.0f}) > Rule-Based ({rule_total:.0f}), '
      f'NSAI/MAB ratio = {ratio:.2f}, regret NSAI ({nsai_regret:.0f}) < Rule ({rule_regret:.0f})')

---
## 5. Results: Selection Distribution

In [None]:
RUNNERS_SHORT = {
    'gitlab-runner-nordic': 'nordic',
    'Mac Docker Runner': 'mac',
    'Mac2 Docker Runner': 'mac2',
    'Linux Yoga Docker Runner': 'linux',
}

print('=== Runner Selection Distribution ===\n')
for name, r in results.items():
    counts = Counter(r['selections'])
    total = len(r['selections'])
    print(f'--- {name} ---')
    for runner in GROUND_TRUTH['docker-any']:
        n = counts.get(runner, 0)
        pct = n / total * 100
        bar = '▓' * int(pct / 2)
        print(f'  {RUNNERS_SHORT[runner]:8} {n:4}/{total}  ({pct:5.1f}%)  {bar}')
    print()

In [None]:
# ── TestSuite: Selection Distribution ──────────────────────
# Rule-Based: must only select one runner
rb_unique = set(results['Rule-Based']['selections'])
assert len(rb_unique) == 1, f'Rule-Based selected {len(rb_unique)} runners (expected 1)'

# Pure MAB: must have explored all 4 runners
mab_unique = set(results['Pure MAB']['selections'])
assert len(mab_unique) == 4, f'Pure MAB only explored {len(mab_unique)} runners'

# NSAI: must have explored all 4 runners (all are feasible for docker-any)
nsai_unique = set(results['NSAI']['selections'])
assert len(nsai_unique) == 4, f'NSAI only explored {len(nsai_unique)} runners'

# Both learning strategies should favor Linux Yoga (optimal) in majority:
# it must be the most-selected runner AND account for more than half of all rounds.
for name in ['Pure MAB', 'NSAI']:
    counts = Counter(results[name]['selections'])
    top_runner = counts.most_common(1)[0][0]
    top_pct = counts[top_runner] / N_ROUNDS
    assert top_runner == 'Linux Yoga Docker Runner', \
        f'{name} top runner is {RUNNERS_SHORT[top_runner]}, expected linux'
    assert top_pct > 0.50, \
        f'{name} selected optimal runner only {top_pct:.0%} (need >50%)'

# Mac2 (lowest success rate, longest duration in the docker-any ground truth)
# must be picked less often than Linux Yoga by the learning strategies.
# NOTE: this compares mac2 only against linux; it does not verify that mac2
# is the least-selected runner overall.
for name in ['Pure MAB', 'NSAI']:
    counts = Counter(results[name]['selections'])
    mac2_count = counts.get('Mac2 Docker Runner', 0)
    linux_count = counts.get('Linux Yoga Docker Runner', 0)
    assert mac2_count < linux_count, \
        f'{name}: mac2 ({mac2_count}) should be selected less than linux ({linux_count})'

print('✅ Distribution: Rule-Based static, both learners favor linux, avoid mac2')

---
## 6. Convergence Analysis

In [None]:
def convergence_round(selections: list, optimal: str,
                       window: int = 20, threshold: float = 0.7) -> int:
    """Find the round where a strategy converges to the optimal runner.

    Slides a window of ``window`` consecutive selections over the history and
    reports the first window in which ``optimal`` makes up at least
    ``threshold`` of the picks.

    Args:
        selections: Runner names in the order they were selected.
        optimal: Name of the runner regarded as optimal.
        window: Number of consecutive selections per window.
        threshold: Minimum fraction of ``optimal`` picks inside a window.

    Returns:
        The start index of the first qualifying window, or -1 if no window
        qualifies (including when there are fewer than ``window`` selections).
    """
    # The last valid window starts at len - window; the '+ 1' includes it.
    # Previously the final window was never examined.
    last_start = len(selections) - window
    for start in range(last_start + 1):
        recent = selections[start:start + window]
        if recent.count(optimal) / window >= threshold:
            return start
    return -1

optimal = 'Linux Yoga Docker Runner'

print(f'=== Convergence to Optimal Runner ({RUNNERS_SHORT[optimal]}) ===\n')
print(f'Criterion: ≥70% selection rate in rolling window of 20\n')

convergence = {}
for name, r in results.items():
    conv = convergence_round(r['selections'], optimal)
    convergence[name] = conv
    if conv >= 0:
        print(f'{name:15} converged at round {conv}')
    else:
        pct = Counter(r['selections']).get(optimal, 0) / N_ROUNDS * 100
        print(f'{name:15} did not converge ({pct:.0f}% overall)')

In [None]:
# ── TestSuite: Convergence ─────────────────────────────────
# Rule-Based should NOT converge (never selects Linux Yoga)
assert convergence['Rule-Based'] == -1, \
    'Rule-Based should not converge to optimal'

# Both learning strategies should converge within 200 rounds
for name in ['Pure MAB', 'NSAI']:
    assert convergence[name] >= 0, f'{name} did not converge'
    assert convergence[name] < 200, \
        f'{name} converged too late (round {convergence[name]})'

# Regret in second half should be less than first half (learning effect)
half = N_ROUNDS // 2
for name in ['Pure MAB', 'NSAI']:
    r = results[name]
    regret_1st = sum(r['regret'][:half])
    regret_2nd = sum(r['regret'][half:])
    assert regret_2nd <= regret_1st * 1.1, \
        f'{name}: regret not decreasing (1st={regret_1st:.1f}, 2nd={regret_2nd:.1f})'

print(f'✅ Convergence: MAB@{convergence["Pure MAB"]}, NSAI@{convergence["NSAI"]}, '
      f'Rule-Based never, regret sublinear')

---
## 7. Regret Curve (ASCII)

In [None]:
def ascii_plot(series_dict: dict, title: str, width: int = 60, height: int = 18):
    """Print a simple ASCII line chart of one or more numeric series.

    Every series shares one y-axis scaled to the global min/max. Each series
    is sampled evenly across its whole length into at most ``width`` columns,
    so the right-hand edge of the chart corresponds to the end of the series.
    When several series land on the same cell, the one drawn last wins.

    Args:
        series_dict: Maps series name to a list of numbers. The names
            'Rule-Based', 'Pure MAB' and 'NSAI' get dedicated symbols;
            any other name is drawn with '?'.
        title: Heading printed above the chart.
        width: Number of character columns in the plot area.
        height: Number of character rows in the plot area.

    Returns:
        None. The chart is written to stdout.
    """
    all_vals = [v for vals in series_dict.values() for v in vals]
    if not all_vals:
        # Nothing to scale against; say so instead of failing in min()/max().
        print(f'\n{title}')
        print('(no data to plot)')
        return

    y_min, y_max = min(all_vals), max(all_vals)
    if y_max == y_min:
        # Flat data: widen the range so the scaling below never divides by zero.
        y_max = y_min + 1
    y_span = y_max - y_min
    # Guard the single-row case, where height - 1 would be a zero divisor.
    row_steps = max(1, height - 1)
    x_len = max(len(v) for v in series_dict.values())
    symbols = {'Rule-Based': '.', 'Pure MAB': '+', 'NSAI': '*'}
    grid = [[' '] * width for _ in range(height)]

    for name, vals in series_dict.items():
        sym = symbols.get(name, '?')
        n_cols = min(width, len(vals))
        for col in range(n_cols):
            # Spread the columns over the full series. The old fixed integer
            # stride dropped the tail whenever len(vals) was not a multiple
            # of width (e.g. 100 points at width 60 showed only the first 60).
            xi = col * len(vals) // n_cols
            level = int((vals[xi] - y_min) / y_span * (height - 1))
            # Row 0 is the top of the chart, so flip the level.
            grid[height - 1 - level][col] = sym

    print(f'\n{title}')
    print('─' * (width + 10))
    for i, row in enumerate(grid):
        y_val = y_max - y_span * i / row_steps
        print(f'{y_val:7.1f} │{"".join(row)}')
    print(f'        └{"─" * width}')
    print(f'         0{" " * (width - 10)}round {x_len}')
    print(f'  Legend: ', end='')
    for name, sym in symbols.items():
        print(f'{sym}={name}  ', end='')
    print()

ascii_plot(
    {name: r['cum_regret'] for name, r in results.items()},
    'Cumulative Regret (lower is better)'
)

---
## 8. Constraint-Intensive Experiment (GCP-only)

This is where NSAI shines: only 1 runner is feasible for GCP jobs.
Pure MAB wastes rounds trying infeasible runners.

In [None]:
results_gcp, _ = run_experiment(n_rounds=100, seed=42, job_type='gcp')

print('=== GCP-Only Experiment (100 rounds) ===\n')
for name, r in sorted(results_gcp.items(), key=lambda x: -x[1]['cum_reward'][-1]):
    total = r['cum_reward'][-1]
    unique = len(set(r['selections']))
    fails = sum(1 for rw in r['rewards'] if rw == 0)
    print(f'{name:15} reward={total:6.1f}  runners_tried={unique}  failures={fails}')

In [None]:
# ── TestSuite: Constraint-Intensive (GCP) ─────────────────
nsai_gcp   = results_gcp['NSAI']['cum_reward'][-1]
mab_gcp    = results_gcp['Pure MAB']['cum_reward'][-1]
rule_gcp   = results_gcp['Rule-Based']['cum_reward'][-1]

# NSAI should match or beat Pure MAB on constrained jobs
# because CSP immediately filters to only nordic
assert nsai_gcp >= mab_gcp * 0.95, \
    f'NSAI ({nsai_gcp:.1f}) should match MAB ({mab_gcp:.1f}) on GCP jobs'

# NSAI should only select nordic (the only GCP runner)
nsai_gcp_selections = set(results_gcp['NSAI']['selections'])
assert nsai_gcp_selections == {'gitlab-runner-nordic'}, \
    f'NSAI selected non-GCP runners: {nsai_gcp_selections}'

# Pure MAB will waste rounds on infeasible runners
mab_gcp_unique = set(results_gcp['Pure MAB']['selections'])
mab_gcp_failures = sum(1 for rw in results_gcp['Pure MAB']['rewards'] if rw == 0)

nsai_gcp_failures = sum(1 for rw in results_gcp['NSAI']['rewards'] if rw == 0)

# NSAI should have fewer failures (no wasted rounds on infeasible runners)
# Only natural failures (4% of nordic jobs)
assert nsai_gcp_failures <= mab_gcp_failures, \
    f'NSAI failures ({nsai_gcp_failures}) should be <= MAB failures ({mab_gcp_failures})'

print(f'✅ GCP constraint test: NSAI only selected nordic, '
      f'failures NSAI={nsai_gcp_failures} <= MAB={mab_gcp_failures}')

---
## 9. Live MAB Service Sync

In [None]:
import urllib.request

MAB_URL = 'https://runner-bandit-m5cziijwqa-lz.a.run.app'
# Stays False unless every step below succeeds; the live test cell checks it.
live_available = False

try:
    with urllib.request.urlopen(f'{MAB_URL}/stats', timeout=5) as resp:
        live_stats = json.loads(resp.read())

    print(f'MAB Service: {live_stats["algorithm"]}')
    print(f'Total observations: {live_stats["total_observations"]}\n')

    for runner, stats in live_stats.get('runners', {}).items():
        print(f'{runner:30} pulls={stats["pulls"]:3}  '
              f'success={stats["success_rate"]:.0%}  '
              f'avg_dur={stats["avg_duration"]:.1f}s  '
              f'reward={stats["mean_reward"]:.3f}')

    nsai_live = NeurosymbolicBandit.from_live_service(MAB_URL)
    print(f'\n✅ NSAI warm-started from live service')
    print(f'   Total pulls synced: {nsai_live._total_pulls}')

    # Flip the flag only now that both live_stats and nsai_live are bound.
    # Setting it right after the fetch let a later failure (bad payload,
    # warm-start error) be swallowed below while the live tests still ran.
    live_available = True

except Exception as e:
    # Deliberately broad: the service is optional and any failure here just
    # means the live tests are skipped.
    print(f'⚠️  MAB service not reachable: {e}')
    print('Continuing with cold-start NSAI (live tests skipped).')

In [None]:
# ── TestSuite: Live MAB Service ───────────────────────────
if live_available:
    # Service should report correct algorithm
    assert 'UCB1' in live_stats['algorithm'], \
        f'Unexpected algorithm: {live_stats["algorithm"]}'

    # All 4 runners must be registered
    live_runners = set(live_stats['runners'].keys())
    assert live_runners == expected_runners, \
        f'Runner mismatch: expected {expected_runners}, got {live_runners}'

    # Stats must have valid ranges
    for runner, stats in live_stats['runners'].items():
        assert 0 <= stats['success_rate'] <= 1.0, \
            f'{runner}: invalid success_rate {stats["success_rate"]}'
        assert stats['pulls'] >= 0, \
            f'{runner}: negative pulls {stats["pulls"]}'
        assert stats['avg_duration'] >= 0, \
            f'{runner}: negative duration'

    # Warm-started NSAI should have synced total pulls
    assert nsai_live._total_pulls == live_stats['total_observations'], \
        f'Sync mismatch: {nsai_live._total_pulls} vs {live_stats["total_observations"]}'

    # Warm-started NSAI should still be able to select runners
    runner, exp = nsai_live.select_runner({'tags': ['docker-any']})
    assert runner in expected_runners

    print(f'✅ Live MAB: algorithm OK, {len(live_runners)} runners, '
          f'{live_stats["total_observations"]} obs synced, selection works')
else:
    print('⏭️  Live tests skipped (service offline)')

---
## 10. NSAI Explanation Quality

In [None]:
nsai_demo = NeurosymbolicBandit.create_default()
rng = random.Random(42)

# Quick training
for _ in range(30):
    runner, _ = nsai_demo.select_runner({'tags': ['docker-any']})
    success, dur = simulate_job(runner, 'docker-any', rng)
    nsai_demo.update(runner, success, dur)

print('=== Docker Job ===')
runner_d, exp_d = nsai_demo.select_runner({'tags': ['docker-any']}, job_name='test:unit')
print(exp_d)

print('\n=== GCP-Only Job ===')
runner_g, exp_g = nsai_demo.select_runner({'tags': ['docker-any', 'gcp']}, job_name='cloud-run:build')
print(exp_g)

print('\n=== Impossible Job ===')
runner_x, exp_x = nsai_demo.select_runner({'tags': ['gpu', 'arm64']}, job_name='ml:train')
print(exp_x)

In [None]:
# ── TestSuite: Explanation Quality ─────────────────────────
# Docker job: feasible, multiple runners
assert runner_d is not None, 'Docker job should find a runner'
assert len(exp_d.feasible_runners) == 4, \
    f'Docker job: expected 4 feasible, got {len(exp_d.feasible_runners)}'
assert exp_d.confidence > 0, 'Confidence should be positive'
assert exp_d.solve_time_ms > 0, 'Solve time should be measured'
assert len(exp_d.symbolic_reasoning) > 10, 'Symbolic reasoning too short'
assert len(exp_d.statistical_reasoning) > 10, 'Statistical reasoning too short'

# GCP job: only nordic feasible
assert runner_g == 'gitlab-runner-nordic', \
    f'GCP job should select nordic, got {runner_g}'
assert exp_g.feasible_runners == ['gitlab-runner-nordic'], \
    f'GCP: expected only nordic feasible, got {exp_g.feasible_runners}'

# Impossible job: no feasible runner
assert runner_x is None, 'Impossible job should return None'
assert exp_x.confidence == 0.0, 'Impossible job confidence should be 0'
assert len(exp_x.feasible_runners) == 0, 'Impossible job should have 0 feasible'
assert 'No feasible' in exp_x.statistical_reasoning

# Explanation serialization roundtrip
d = exp_d.to_dict()
assert d['selected_runner'] == runner_d
assert isinstance(d['feasible_runners'], list)
assert isinstance(d['confidence'], float)

print('✅ Explanations: docker (4 feasible), GCP (nordic only), impossible (None), serializable')

---
## 11. Performance Benchmark

In [None]:
# Selection latency benchmark
nsai_bench = NeurosymbolicBandit.create_default()
# Warm up with some data
for r in nsai_bench._stats:
    nsai_bench.update(r, success=True, duration_seconds=20.0)

n_iters = 1000
start = time.perf_counter()
for _ in range(n_iters):
    nsai_bench.select_runner({'tags': ['docker-any']})
elapsed = (time.perf_counter() - start) * 1000
avg_ms = elapsed / n_iters

print(f'Selection latency: {avg_ms:.3f}ms avg ({n_iters} iterations)')
print(f'Throughput: {n_iters / (elapsed/1000):.0f} selections/sec')

In [None]:
# ── TestSuite: Performance ─────────────────────────────────
# avg_ms is the mean per-call selection latency from the benchmark cell.
assert avg_ms < 5.0, f'Selection too slow: {avg_ms:.3f}ms (limit: 5ms)'

# Update latency
start = time.perf_counter()
for _ in range(1000):
    nsai_bench.update('gitlab-runner-nordic', True, 20.0)
# "* 1000" converts elapsed seconds to milliseconds; "/ 1000" then divides by
# the 1000 iterations above to give the mean per-update latency in ms.
update_ms = (time.perf_counter() - start) * 1000 / 1000
assert update_ms < 1.0, f'Update too slow: {update_ms:.3f}ms (limit: 1ms)'

# Memory: stats dict shouldn't grow unbounded
# (still exactly 4 entries after the 1000 updates above).
assert len(nsai_bench._stats) == 4, 'Stats should only track registered runners'

print(f'✅ Performance: select={avg_ms:.3f}ms, update={update_ms:.3f}ms, memory=4 runners')

---
## 12. Summary & Final Validation

In [None]:
print('╔══════════════════════════════════════════════════════════════╗')
print('║              NSAI Experiment Summary                        ║')
print('╠══════════════════════════════════════════════════════════════╣')
print(f'║  NSAI Version:         {__version__:>8}                            ║')
print(f'║  Runners:              {len(onto.runners):>8}                            ║')
print(f'║  Experiment Rounds:    {N_ROUNDS:>8}                            ║')
print('╠══════════════════════════════════════════════════════════════╣')
for name in ['NSAI', 'Pure MAB', 'Rule-Based']:
    r = results[name]
    cr = r['cum_reward'][-1]
    reg = r['cum_regret'][-1]
    print(f'║  {name:15}  reward={cr:7.1f}  regret={reg:7.1f}          ║')
print('╠══════════════════════════════════════════════════════════════╣')
print(f'║  NSAI vs Rule-Based:  {((nsai_total/rule_total - 1)*100):+5.1f}% reward                  ║')
print(f'║  NSAI vs Pure MAB:    {((nsai_total/mab_total - 1)*100):+5.1f}% reward                  ║')
print(f'║  GCP advantage:        NSAI 0 wasted rounds              ║')
if live_available:
    print(f'║  Live MAB:             {live_stats["total_observations"]:>4} observations synced        ║')
print('╚══════════════════════════════════════════════════════════════╝')

In [None]:
# ── TestSuite: Final Gate ──────────────────────────────────
# Re-checks the key invariants in one place — if we get here, everything passed.
import re

PASSED = []

# Compare versions numerically. As plain strings, '0.10.0' sorts before
# '0.3.0', so a newer release would wrongly fail this gate.
_ver_match = re.match(r'(\d+)(?:\.(\d+))?(?:\.(\d+))?', __version__)
assert _ver_match, f'Cannot parse NSAI version: {__version__!r}'
_version_tuple = tuple(int(part or 0) for part in _ver_match.groups())

# Re-validate core invariants
assert _version_tuple >= (0, 3, 0);                   PASSED.append('version')
assert len(onto.runners) == 4;                        PASSED.append('runner_count')
assert nsai_total > rule_total;                       PASSED.append('nsai_beats_rule')
assert nsai_total > mab_total * 0.80;                 PASSED.append('nsai_within_20pct_mab')
assert nsai_regret < rule_regret;                     PASSED.append('nsai_lower_regret')
assert convergence['NSAI'] >= 0;                      PASSED.append('nsai_converges')
assert convergence['NSAI'] < 200;                     PASSED.append('nsai_converges_fast')
assert nsai_gcp_failures <= mab_gcp_failures;         PASSED.append('gcp_fewer_failures')
assert avg_ms < 5.0;                                  PASSED.append('latency_ok')
assert runner_x is None;                              PASSED.append('infeasible_handled')

print(f'\n🏁 ALL {len(PASSED)} ASSERTIONS PASSED')
print(f'   {", ".join(PASSED)}')
print(f'\n   Notebook is a valid test suite. ✅')