In [4]:
# compare_paradigms.py — aligned with your mario_cnn_a2c_2 (3).py
import os
# Process-wide defaults. setdefault() only writes a value when the variable is
# not already present, so anything exported by the caller takes precedence.
os.environ.setdefault("PYTHONFAULTHANDLER","1")
# SDL: dummy video/audio drivers and the software renderer
# (presumably so the emulator can run without a display — confirm on target host).
os.environ.setdefault("SDL_VIDEODRIVER","dummy")
os.environ.setdefault("SDL_AUDIODRIVER","dummy")
os.environ.setdefault("SDL_RENDER_DRIVER","software")
# Thread-count knobs for OpenMP / OpenBLAS / MKL / NumExpr, each set to "1".
os.environ.setdefault("OMP_NUM_THREADS","1")
os.environ.setdefault("OPENBLAS_NUM_THREADS","1")
os.environ.setdefault("MKL_NUM_THREADS","1")
os.environ.setdefault("NUMEXPR_NUM_THREADS","1")

import numpy as np, pandas as pd
from stable_baselines3 import A2C
from mario_cnn_a2c_2 import make_mario_env  # your factory (VecEnv)

def _load_a2c(path, env):
    """Load the A2C checkpoint at ``path`` with ``env`` attached.

    ``custom_objects`` replaces the stored observation/action spaces with the
    ones exposed by ``env`` and substitutes ``None`` for the three ``_last_*``
    entries of the checkpoint.
    """
    overrides = {
        "observation_space": env.observation_space,
        "action_space": env.action_space,
    }
    for stale_key in ("_last_obs", "_last_episode_starts", "_last_original_obs"):
        overrides[stale_key] = None
    return A2C.load(path, env=env, custom_objects=overrides)

def _safe_num(x, typ=float, default=0):
    """Convert ``x`` with ``typ``; return ``default`` if the conversion raises.

    Args:
        x: Value to convert.
        typ: Callable used for the conversion (``float`` by default).
        default: Value returned when ``typ(x)`` raises an ``Exception``.

    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and a long evaluation run stays interruptible.
    """
    try:
        return typ(x)
    except Exception:
        return default

def eval_stage(model, env_id, single_stage=True, n_episodes=1, deterministic=False):
    """Roll out ``model`` on ``env_id`` and return one row per episode.

    Args:
        model: Object exposing ``predict(obs, deterministic=...)``.
        env_id: Environment id handed to ``make_mario_env``.
        single_stage: Accepted for call-site compatibility; it has no effect
            on the rollout.
        n_episodes: Number of episodes to play.
        deterministic: Forwarded to ``model.predict``.

    Returns:
        ``pandas.DataFrame`` with columns ``Reward`` (sum of the per-step
        rewards returned by the env), ``Steps`` and ``Completed`` (True when
        any step reported a truthy ``info["flag_get"]``).
    """
    env = make_mario_env(
        env_id, n_envs=1,
        wrapper_kwargs={"frame_skip":4,"screen_size":84},
        use_vec_normalize=True,
        # NOTE(review): with norm_reward=True the summed "Reward" is presumably
        # the normalized reward, not the raw game reward — confirm this is intended.
        vec_normalize_kwargs={"training":False,"norm_reward":True},
        env_kwargs=None,   # do not pass render_mode
    )
    rows = []
    # try/finally: release the env even when reset/predict/step raises.
    try:
        for _ in range(n_episodes):
            obs = env.reset()
            total_reward, steps, cleared = 0.0, 0, False
            while True:
                action, _state = model.predict(obs, deterministic=deterministic)
                obs, rewards, dones, infos = env.step(action)
                total_reward += float(rewards[0])
                steps += 1
                # n_envs=1, so only the first entry of each batch is read.
                info = infos[0] if isinstance(infos, (list, tuple)) else infos
                if _safe_num(info.get("flag_get", 0), int):
                    cleared = True
                if bool(dones[0]):
                    break
            rows.append({"Reward": total_reward, "Steps": steps, "Completed": cleared})
    finally:
        env.close()
    # Explicit columns keep the frame well-formed when n_episodes == 0.
    return pd.DataFrame(rows, columns=["Reward", "Steps", "Completed"])

def evaluate_sequential_completion(model, outdir):
    """Play one non-deterministic episode starting at 1-1 and summarize it.

    The per-episode frame from ``eval_stage`` is written to
    ``<outdir>/sequential/results.csv``. The returned dict holds the mean and
    sample standard deviation of the episode reward; both stage-count entries
    are ``NaN`` because no per-stage data is collected here.
    """
    seq_dir = os.path.join(outdir, "sequential")
    episodes = eval_stage(
        model,
        "SuperMarioBros-1-1-v0",
        single_stage=False,
        n_episodes=1,
        deterministic=False,
    )
    os.makedirs(seq_dir, exist_ok=True)
    episodes.to_csv(os.path.join(seq_dir, "results.csv"), index=False)

    rewards = episodes["Reward"]
    summary = {"mean_total_reward": float(rewards.mean())}
    # Sample std needs at least two episodes; report 0.0 otherwise.
    if len(episodes) > 1:
        summary["std_total_reward"] = float(rewards.std(ddof=1))
    else:
        summary["std_total_reward"] = 0.0
    summary["max_stages_completed"] = np.nan
    summary["mean_stages_completed"] = np.nan
    return summary

def evaluate_generalization(model, outdir, stages):
    """Evaluate ``model`` on every stage in ``stages`` and aggregate per stage.

    One non-deterministic episode is played per stage via ``eval_stage``.
    Writes ``results.csv`` (all episodes) and ``stage_summaries.csv``
    (per-stage mean/std reward and completion rate in percent) under
    ``<outdir>/generalization`` and returns the averages over stages.
    """
    def _sample_std(values):
        # Sample std is undefined for a single value; report 0.0 instead.
        return float(values.std(ddof=1)) if len(values) > 1 else 0.0

    def _percent_true(values):
        return 100.0 * float(np.mean(values))

    per_stage = []
    for stage in stages:
        frame = eval_stage(
            model,
            f"SuperMarioBros-{stage}-v0",
            single_stage=True,
            n_episodes=1,
            deterministic=False,
        )
        frame["Stage"] = stage
        per_stage.append(frame)
    combined = pd.concat(per_stage, ignore_index=True)

    gen_dir = os.path.join(outdir, "generalization")
    os.makedirs(gen_dir, exist_ok=True)
    combined.to_csv(os.path.join(gen_dir, "results.csv"), index=False)

    summary = combined.groupby("Stage").agg(
        Mean_Reward=("Reward", "mean"),
        Std_Reward=("Reward", _sample_std),
        Completion_Rate=("Completed", _percent_true),
    )
    summary.to_csv(os.path.join(gen_dir, "stage_summaries.csv"))

    overall = {}
    overall["overall_mean_reward"] = float(summary["Mean_Reward"].mean())
    overall["overall_completion_rate"] = float(summary["Completion_Rate"].mean())
    return overall

def compare_paradigms(ckpts:dict, outdir="results/analysis"):
    """Evaluate every checkpoint in ``ckpts`` and write a comparison CSV.

    Args:
        ckpts: Mapping of paradigm name -> checkpoint path.
        outdir: Root directory for all outputs; each paradigm gets its own
            sub-directory named after its key.

    Returns:
        Dict keyed by paradigm name with the ``sequential`` and
        ``generalization`` summaries plus the checkpoint path (``ckpt``).
    """
    os.makedirs(outdir, exist_ok=True)
    # All 32 stages: worlds 1-8, stages 1-4.
    test_stages = [f"{w}-{s}" for w in range(1, 9) for s in range(1, 5)]
    results = {}

    # A tiny env whose only job is to supply spaces for model loading.
    spaces_env = make_mario_env("SuperMarioBros-1-1-v0", n_envs=1,
                                use_vec_normalize=False, env_kwargs=None)
    # try/finally: close the helper env even if loading or evaluation raises.
    try:
        for name, path in ckpts.items():
            model = _load_a2c(path, spaces_env)
            paradigm_dir = os.path.join(outdir, name)
            os.makedirs(paradigm_dir, exist_ok=True)
            seq = evaluate_sequential_completion(model, paradigm_dir)
            gen = evaluate_generalization(model, paradigm_dir, test_stages)
            results[name] = {"sequential": seq, "generalization": gen, "ckpt": path}
    finally:
        spaces_env.close()

    # Compact CSV: one row per paradigm.
    rows = [
        {
            "Paradigm": n,
            "Mean_Reward_Sequential": r["sequential"]["mean_total_reward"],
            "Mean_Reward_Generalization": r["generalization"]["overall_mean_reward"],
            "Completion_Rate_Generalization": r["generalization"]["overall_completion_rate"],
            "Checkpoint": r["ckpt"],
        }
        for n, r in results.items()
    ]
    pd.DataFrame(rows).to_csv(os.path.join(outdir, "paradigm_comparison.csv"), index=False)
    return results

if __name__ == "__main__":
    # NOTE(review): both paradigm labels resolve to the same checkpoint file,
    # so the two result sets come from one model — confirm this is intended.
    ckpts = dict.fromkeys(
        ("paradigm1_single_level", "paradigm2_multi_level"),
        "results/a2c/exp7/models/best_model/best_model.zip",
    )
    compare_paradigms(ckpts, outdir="results/analysis")


  logger.deprecation(


New stage reached: 2
New stage reached: 3
New stage reached: 2
New stage reached: 3


  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(


New stage reached: 2
New stage reached: 3
New stage reached: 2
New stage reached: 3


  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


  logger.deprecation(
  logger.deprecation(


New stage reached: 2


  logger.deprecation(


New stage reached: 3


  logger.deprecation(


New stage reached: 4


## Final

In [36]:
"""
Comparative Evaluation Script for Training Paradigms (A2C)

Paradigm 1 (Single-Level): Train on World 1-1 only
Paradigm 2 (Multi-Level): Train on random stages (all 32 levels)

Evaluation:
1) Sequential Completion Test: start on the profile's eval env (e.g., 1-1 or RandomStages)
2) Generalization Test: evaluate all 32 deterministic stages (1-1 ... 8-4)

This script reads BOTH [single] and [multi] from hyperparameters.txt
and evaluates both paradigms in one run (no need to flip active_profile).
"""

# -------------------- Headless & threading (set before any env import) --------------------
import os
# setdefault() only fills in a value when the variable is not already set, so
# anything exported by the caller wins over these defaults.
os.environ.setdefault("PYTHONFAULTHANDLER", "1")
# Headless SDL: dummy video/audio drivers and the software renderer.
os.environ.setdefault("SDL_VIDEODRIVER", "dummy")
os.environ.setdefault("SDL_AUDIODRIVER", "dummy")
os.environ.setdefault("SDL_RENDER_DRIVER", "software")
# Threading: one thread each for OpenMP / OpenBLAS / MKL / NumExpr.
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")

# Rendering is opt-in: True only when MARIO_RENDER is exactly "1".
USE_RENDER = os.getenv("MARIO_RENDER", "0") == "1"

# -------------------- Imports --------------------
import re
import numpy as np
import pandas as pd
from stable_baselines3 import A2C
from mario_cnn_a2c_2 import make_mario_env  # your VecEnv factory

# -------------------- Config --------------------
# Path to the hyperparameters.txt file; BOTH its [single] and [multi] sections
# are parsed.
HP_TXT = "results/a2c/exp7/models/best_model/hyperparameters.txt"

# Paradigm name (key of the checkpoint dict) -> profile section to read, so
# both paradigms run in one go. compare_paradigms() falls back to "single" for
# names that are missing from this map.
FORCE_PROFILE_MAP = {
    "paradigm1_single_level": "single",
    "paradigm2_multi_level":  "multi",
}

# -------------------- Small utils --------------------
def _safe_num(x, typ=float, default=0):
    """Return ``typ(x)``, or ``default`` when that call raises an ``Exception``."""
    try:
        converted = typ(x)
    except Exception:
        return default
    else:
        return converted

def pick_env_ids_from_txt(hp_path: str, desired_profile: str):
    """
    Read the env ids of one profile section from hyperparameters.txt.

    The file is scanned line by line. ``;`` starts a comment, a line that
    begins with ``[`` is a section header, and ``key = value`` lines inside the
    requested section are collected. Comments are stripped BEFORE headers are
    recognised, so a ``[multi]`` mentioned inside a comment (or inside a value)
    can no longer be mistaken for the start of the section.
    'active_profile' is ignored intentionally so both paradigms can be run at once.

    Args:
        hp_path: Path of the hyperparameters text file (read as UTF-8).
        desired_profile: ``"single"`` or ``"multi"``; anything else yields the
            ``"unknown"`` profile with the 1-1 defaults.

    Returns:
        Tuple ``(profile, env_train, env_eval)``. Missing keys or sections fall
        back to the profile's default env id.

    Raises:
        OSError: If ``hp_path`` cannot be opened.
    """
    with open(hp_path, "r", encoding="utf-8") as f:
        txt = f.read()

    def extract_block(name):
        """Return the key/value pairs of the first ``[name]`` section (case-insensitive)."""
        wanted = name.casefold()
        values = {}
        inside = False
        for raw in txt.splitlines():
            line = raw.split(";", 1)[0].strip()  # strip ';' comments
            if not line:
                continue
            if line.startswith("["):
                if inside:
                    break  # the next header closes the requested section
                inside = line.endswith("]") and line[1:-1].strip().casefold() == wanted
                continue
            if inside and "=" in line:
                k, v = (p.strip() for p in line.split("=", 1))
                values[k] = v
        return values

    # Default env id per known profile, used when a key or the section is absent.
    defaults = {
        "single": "SuperMarioBros-1-1-v0",
        "multi": "SuperMarioBrosRandomStages-v0",
    }

    if desired_profile in ("single", "multi"):
        profile = desired_profile
        block = extract_block(profile)
        fallback = defaults[profile]
    else:
        profile = "unknown"
        block = {}
        fallback = "SuperMarioBros-1-1-v0"

    env_train = block.get("env_id.train", fallback)
    env_eval = block.get("env_id.eval", fallback)

    print(f"[ENV PICKER] forced_profile={profile} | train={env_train} | eval={env_eval}")
    return profile, env_train, env_eval

def _make_env(env_id: str, *, norm_reward: bool):
    """
    Build a single-env evaluation environment through make_mario_env.

    - n_envs is fixed to 1
    - wrapper kwargs are limited to frame_skip=4 and screen_size=84
    - VecNormalize kwargs: training=False, norm_reward as requested
    - render_mode="rgb_array" is passed only when MARIO_RENDER=1; otherwise
      env_kwargs is None
    """
    if USE_RENDER:
        env_kwargs = {"render_mode": "rgb_array"}
    else:
        env_kwargs = None

    wrapper_kwargs = dict(frame_skip=4, screen_size=84)
    vec_normalize_kwargs = dict(training=False, norm_reward=bool(norm_reward))

    return make_mario_env(
        env_id,
        n_envs=1,
        wrapper_kwargs=wrapper_kwargs,
        vec_normalize_kwargs=vec_normalize_kwargs,
        env_kwargs=env_kwargs,
    )

def _load_a2c_with_spaces(checkpoint_path: str, env_for_spaces):
    """
    Load an A2C checkpoint against a live env.

    The env's observation/action spaces override the stored ones, and the
    three ``_last_*`` entries are replaced with ``None`` through
    ``custom_objects``.
    """
    spaces = {
        "observation_space": env_for_spaces.observation_space,
        "action_space": env_for_spaces.action_space,
    }
    cleared = dict.fromkeys(
        ("_last_obs", "_last_episode_starts", "_last_original_obs"), None
    )
    return A2C.load(
        checkpoint_path,
        env=env_for_spaces,
        custom_objects={**spaces, **cleared},
    )

# -------------------- Evaluations --------------------
def evaluate_sequential_completion_for_env(
    model,
    results_dir: str,
    eval_env_id: str,
    n_episodes=1,
    deterministic=True
):
    """
    Sequential test on the given eval_env_id.

    Plays ``n_episodes`` episodes and, per episode, accumulates the reward and
    step count and records a clear each time a step reports a truthy
    ``info['flag_get']`` (stage taken from ``info['world']``/``info['stage']``).

    Files written under ``<results_dir>/sequential``:
      - sequential_completion_results.csv : one row per episode
      - stage_completion_records.csv      : one row per clear (only if any)
      - summary.csv                       : formatted headline metrics

    Args:
        model: Object exposing ``predict(obs, deterministic=...)``.
        results_dir: Output root for this paradigm.
        eval_env_id: Environment id passed to ``_make_env``.
        n_episodes: Number of episodes to play.
        deterministic: Forwarded to ``model.predict``.

    Returns:
        Dict with mean/std/max stages completed, mean/std total reward and the
        raw per-episode rows (``episode_data``).
    """
    seq_dir = os.path.join(results_dir, "sequential")
    os.makedirs(seq_dir, exist_ok=True)

    env = _make_env(eval_env_id, norm_reward=False)

    episode_rows = []
    stage_events = []

    # try/finally: release the env even if reset/predict/step raises mid-rollout.
    try:
        for epi in range(n_episodes):
            obs = env.reset()
            ep_reward, ep_steps, clears = 0.0, 0, 0
            stages_list = []

            while True:
                action, _ = model.predict(obs, deterministic=deterministic)
                obs, rewards, dones, infos = env.step(action)
                ep_reward += float(rewards[0])
                ep_steps += 1

                # n_envs=1, so only the first entry of each batch is read.
                info = infos[0] if isinstance(infos, (list, tuple)) else infos
                # NOTE(review): every step with a truthy flag_get counts as a clear;
                # presumably the flag is reported once per stage — confirm.
                if _safe_num(info.get("flag_get", 0), int):
                    clears += 1
                    w = _safe_num(info.get("world", 0), int)
                    s = _safe_num(info.get("stage", 0), int)
                    stage_name = f"{w}-{s}" if (w and s) else "?"
                    stages_list.append(stage_name)
                    stage_events.append({
                        "Episode": epi + 1,
                        "Stage_Order": clears,
                        "Stage": stage_name,
                        "Cumulative_Reward": ep_reward,
                        "Cumulative_Steps": ep_steps,
                    })

                if bool(dones[0]):
                    break

            episode_rows.append({
                "Episode": epi + 1,
                "Stages_Completed": clears,
                "Stages_List": " -> ".join(stages_list) if stages_list else "None",
                "Total_Reward": ep_reward,
                "Total_Steps": ep_steps,
            })
    finally:
        env.close()

    df = pd.DataFrame(episode_rows)
    df.to_csv(os.path.join(seq_dir, "sequential_completion_results.csv"), index=False)

    if stage_events:
        pd.DataFrame(stage_events).to_csv(
            os.path.join(seq_dir, "stage_completion_records.csv"), index=False
        )

    # Sample std (ddof=1) needs at least two episodes; report 0.0 otherwise.
    multi_episode = len(df) > 1
    mean_stages = float(df["Stages_Completed"].mean())
    std_stages = float(df["Stages_Completed"].std(ddof=1)) if multi_episode else 0.0
    max_stages = int(df["Stages_Completed"].max())
    mean_reward = float(df["Total_Reward"].mean())
    std_reward = float(df["Total_Reward"].std(ddof=1)) if multi_episode else 0.0

    if multi_episode:
        summary_df = pd.DataFrame([{
            "Metric": "Mean Stages Completed",
            "Value": f"{mean_stages:.2f} ± {std_stages:.2f}",
        }, {
            "Metric": "Max Stages Completed",
            "Value": f"{max_stages}",
        }, {
            "Metric": "Mean Total Reward",
            "Value": f"{mean_reward:.2f} ± {std_reward:.2f}",
        }])
    else:
        # Single episode: plain values, no spread.
        summary_df = pd.DataFrame([{
            "Metric": "Stages Completed",
            "Value": f"{mean_stages:.0f}",
        }, {
            "Metric": "Total Reward",
            "Value": f"{mean_reward:.2f}",
        }])

    summary_df.to_csv(os.path.join(seq_dir, "summary.csv"), index=False)

    return {
        "mean_stages_completed": mean_stages,
        "std_stages_completed": std_stages,
        "max_stages_completed": max_stages,
        "mean_total_reward": mean_reward,
        "std_total_reward": std_reward,
        "episode_data": episode_rows,
    }

def evaluate_generalization_for_envs(
    model,
    results_dir: str,
    stage_list,
    env_id_template="SuperMarioBros-{}-v0",
    n_episodes_per_stage=1,
    deterministic=True
):
    """
    Generalization test: loop explicit stages (e.g., 1-1 ... 8-4).

    For each stage a fresh env is built from ``env_id_template.format(stage)``,
    ``n_episodes_per_stage`` episodes are played, and the env is closed again.

    Files written under ``<results_dir>/generalization``:
      - generalization_results.csv : one row per episode
      - stage_summaries.csv        : mean/std reward and completion rate (%) per stage

    Args:
        model: Object exposing ``predict(obs, deterministic=...)``.
        results_dir: Output root for this paradigm.
        stage_list: Iterable of stage labels substituted into the template.
        env_id_template: ``str.format`` template producing the env id.
        n_episodes_per_stage: Episodes to play on each stage.
        deterministic: Forwarded to ``model.predict``.

    Returns:
        Dict with ``stage_summaries``, ``overall_mean_reward``,
        ``overall_completion_rate`` (both averaged over stages, 0.0 when there
        are no stages) and ``all_results`` (raw per-episode rows).
    """
    gen_dir = os.path.join(results_dir, "generalization")
    os.makedirs(gen_dir, exist_ok=True)

    all_rows = []
    stage_summaries = []

    for stage in stage_list:
        env_id = env_id_template.format(stage)
        env = _make_env(env_id, norm_reward=False)

        ep_rewards, ep_steps, ep_clears = [], [], []

        # try/finally: a failure on one stage must not leave its env open.
        try:
            for e in range(n_episodes_per_stage):
                obs = env.reset()
                R, L, cleared = 0.0, 0, 0
                status = "Failed"

                while True:
                    action, _ = model.predict(obs, deterministic=deterministic)
                    obs, rewards, dones, infos = env.step(action)
                    R += float(rewards[0])
                    L += 1

                    # n_envs=1, so only the first entry of each batch is read.
                    info = infos[0] if isinstance(infos, (list, tuple)) else infos
                    if _safe_num(info.get("flag_get", 0), int):
                        cleared = 1
                        status = "Completed"

                    if bool(dones[0]):
                        break

                ep_rewards.append(R)
                ep_steps.append(L)
                ep_clears.append(cleared)

                all_rows.append({
                    "Stage": stage,
                    "Episode": e + 1,
                    "Reward": R,
                    "Steps": L,
                    "Completed": bool(cleared),
                    "Status": status,
                })
        finally:
            env.close()

        # Guard the empty / single-episode cases so the stats stay finite.
        mean_reward = float(np.mean(ep_rewards)) if ep_rewards else 0.0
        std_reward = float(np.std(ep_rewards, ddof=1)) if len(ep_rewards) > 1 else 0.0
        completion_rate = float(np.mean(ep_clears) * 100.0) if ep_clears else 0.0

        stage_summaries.append({
            "Stage": stage,
            "Mean_Reward": mean_reward,
            "Std_Reward": std_reward,
            "Completion_Rate": completion_rate,
        })

    pd.DataFrame(all_rows).to_csv(
        os.path.join(gen_dir, "generalization_results.csv"),
        index=False
    )
    stage_df = pd.DataFrame(stage_summaries)
    stage_df.to_csv(os.path.join(gen_dir, "stage_summaries.csv"), index=False)

    overall_mean_reward = float(stage_df["Mean_Reward"].mean()) if not stage_df.empty else 0.0
    overall_completion_rate = float(stage_df["Completion_Rate"].mean()) if not stage_df.empty else 0.0

    return {
        "stage_summaries": stage_summaries,
        "overall_mean_reward": overall_mean_reward,
        "overall_completion_rate": overall_completion_rate,
        "all_results": all_rows,
    }

# -------------------- Orchestrator --------------------
def compare_paradigms(checkpoint_paths: dict, output_dir="results/a2c/exp7/analysis"):
    """
    Evaluate both paradigms (single & multi) and write CSV summaries.

    For every ``name -> checkpoint`` entry the profile is looked up in
    FORCE_PROFILE_MAP (default "single"), its env ids are read from HP_TXT,
    the model is loaded, and the sequential and generalization tests are run.
    Results go to ``<output_dir>/<name>/``; a cross-paradigm table is written
    to ``<output_dir>/paradigm_comparison.csv``.

    Args:
        checkpoint_paths: Mapping of paradigm name -> checkpoint path.
        output_dir: Root directory for all outputs.

    Returns:
        Dict keyed by paradigm name with ``training_type``, ``checkpoint``,
        ``sequential`` and ``generalization`` entries.
    """
    os.makedirs(output_dir, exist_ok=True)

    # All 32 stages (1-1 ... 8-4)
    test_stages = [f"{w}-{s}" for w in range(1, 9) for s in range(1, 5)]
    results = {}

    for name, ckpt in checkpoint_paths.items():
        desired_profile = FORCE_PROFILE_MAP.get(name, "single")
        profile, env_id_train, env_id_eval = pick_env_ids_from_txt(HP_TXT, desired_profile=desired_profile)

        # Build a tiny env with the TRAIN env id to provide spaces at load time.
        # try/finally: a failing load (e.g. missing checkpoint) must not leak it.
        tmp_env = _make_env(env_id_train, norm_reward=False)
        try:
            model = _load_a2c_with_spaces(ckpt, tmp_env)
        finally:
            tmp_env.close()

        paradigm_dir = os.path.join(output_dir, name)
        os.makedirs(paradigm_dir, exist_ok=True)

        with open(os.path.join(paradigm_dir, "paradigm_info.txt"), "w", encoding="utf-8") as f:
            f.write(f"Paradigm: {name}\n")
            f.write(f"Training Type (forced): {profile}\n")
            f.write(f"Checkpoint: {ckpt}\n")
            f.write(f"env_id.train: {env_id_train}\n")
            f.write(f"env_id.eval : {env_id_eval}\n")
            f.write(f"Render Enabled: {USE_RENDER}\n")

        # Sequential on the eval env of the chosen profile
        seq = evaluate_sequential_completion_for_env(
            model, paradigm_dir, env_id_eval, n_episodes=1, deterministic=True
        )
        # Generalization on all 32 stages (v0 env ids)
        gen = evaluate_generalization_for_envs(
            model, paradigm_dir, test_stages, env_id_template="SuperMarioBros-{}-v0",
            n_episodes_per_stage=1, deterministic=True
        )

        results[name] = {
            "training_type": profile,
            "checkpoint": ckpt,
            "sequential": seq,
            "generalization": gen,
        }

    # Compact comparison CSV: one row per paradigm.
    rows = [
        {
            "Paradigm": n,
            "Training_Type": r["training_type"],
            "Mean_Stages_Sequential": r["sequential"]["mean_stages_completed"],
            "Max_Stages_Sequential": r["sequential"]["max_stages_completed"],
            "Mean_Reward_Sequential": r["sequential"]["mean_total_reward"],
            "Mean_Reward_Generalization": r["generalization"]["overall_mean_reward"],
            "Completion_Rate_Generalization": r["generalization"]["overall_completion_rate"],
            "Checkpoint": r["checkpoint"],
        }
        for n, r in results.items()
    ]
    pd.DataFrame(rows).to_csv(os.path.join(output_dir, "paradigm_comparison.csv"), index=False)

    return results

# -------------------- Main --------------------
if __name__ == "__main__":
    # NOTE(review): both paradigm labels resolve to the same checkpoint file,
    # so the two profiles are evaluated with one model — confirm this is intended.
    checkpoint_paths = {
        label: "results/a2c/exp7/models/best_model/best_model.zip"
        for label in ("paradigm1_single_level", "paradigm2_multi_level")
    }
    compare_paradigms(checkpoint_paths, output_dir="results/a2c/exp7/analysis")


[ENV PICKER] forced_profile=single | train=SuperMarioBros-1-1-v0 | eval=SuperMarioBros-1-1-v0
New stage reached: 2
New stage reached: 3
New stage reached: 2
New stage reached: 3
New stage reached: 2
New stage reached: 3
New stage reached: 4
New stage reached: 2
New stage reached: 3
New stage reached: 4
New stage reached: 2
New stage reached: 3
New stage reached: 4
New stage reached: 2
New stage reached: 3
New stage reached: 4
New stage reached: 2
New stage reached: 3
New stage reached: 4
New stage reached: 2
New stage reached: 3
New stage reached: 4
New stage reached: 2
New stage reached: 3
New stage reached: 4
New stage reached: 2
New stage reached: 3
New stage reached: 4
[ENV PICKER] forced_profile=multi | train=SuperMarioBrosRandomStages-v0 | eval=SuperMarioBrosRandomStages-v0
New stage reached: 3
New stage reached: 2
New stage reached: 3
New stage reached: 2
New stage reached: 3
New stage reached: 4
New stage reached: 2
New stage reached: 3
New stage reached: 4
New stage reached: 2