In [None]:
#@title Load environment variables from the .env file
from pathlib import Path
import importlib
import os
import sys


def _load_package():
    """Import and return the ``introspect_repro`` package.

    A plain import is attempted first. If the package itself cannot be found,
    the current working directory and each of its parents are scanned for a
    ``src`` directory; each one found is appended to ``sys.path`` (once) and
    the import is retried.

    Returns:
        The imported ``introspect_repro`` module object.

    Raises:
        ModuleNotFoundError: If the package cannot be located anywhere, or if
            the package is located but one of its own imports is missing. In
            the latter case the dependency's error is propagated unchanged so
            the real cause is not masked.
    """
    package_name = "introspect_repro"
    try:
        return importlib.import_module(package_name)
    except ModuleNotFoundError as exc:
        # Searching sys.path only helps when the package itself is absent; a
        # missing dependency inside the package must surface as-is.
        if exc.name != package_name:
            raise
        search_roots = [Path.cwd().resolve()]
        search_roots += list(search_roots[0].parents)
        for root in search_roots:
            src_dir = root / "src"
            if not src_dir.is_dir():
                continue
            if str(src_dir) not in sys.path:
                sys.path.append(str(src_dir))
            try:
                return importlib.import_module(package_name)
            except ModuleNotFoundError as inner:
                if inner.name != package_name:
                    raise
                continue
        # Nothing found: re-raise the original "package not found" error.
        raise


pkg = _load_package()
# Reload once if the helper is missing — presumably a kernel that still holds
# an older import of the package (TODO confirm).
if not hasattr(pkg, "activate_local_venv"):
    pkg = importlib.reload(pkg)

activate_local_venv = getattr(pkg, "activate_local_venv")
load_project_env = getattr(pkg, "load_project_env")

activate_local_venv()
load_project_env()

# Locate the project root from the package location. In a src layout
# (<root>/src/introspect_repro) the directory holding the package is "src",
# so the root is one level further up; otherwise the holding directory itself
# is used, as before.
package_dir = Path(pkg.__file__).resolve().parent
container_dir = package_dir.parent
project_root = container_dir.parent if container_dir.name == "src" else container_dir
project_venv = project_root / ".venv"
interpreter_path = Path(sys.executable)
if project_venv.exists():
    if project_venv in interpreter_path.parents:
        print(f"Using interpreter: {interpreter_path}")
    else:
        print(f"Warning: kernel interpreter {interpreter_path} is outside .venv; added .venv site-packages to sys.path.")
else:
    print("Warning: project .venv directory not found. Create it via `python -m venv .venv`.")

print("Loaded variables from .env (existing environment values are preserved).")
# Report only whether each key is set; the secret values are never printed.
status_labels = (
    ("ANTHROPIC_API_KEY", "Anthropic"),
    ("OPENAI_API_KEY", "OpenAI   "),
    ("OPENROUTER_API_KEY", "OpenRouter"),
)
for key, label in status_labels:
    print(f"{label}: {'set' if os.environ.get(key) else 'not set'}")

# Environment variable names checked for a Hugging Face token, in priority order.
hf_env_keys = (
    "HUGGINGFACEHUB_API_TOKEN",
    "HUGGINGFACE_TOKEN",
    "HF_TOKEN",
    "HF_API_TOKEN",
)
set_key = next((key for key in hf_env_keys if os.environ.get(key)), None)
if set_key:
    print(f"HuggingFace: set ({set_key})")
else:
    print("HuggingFace: not set")


# Publication‑Style Panels for *Introspective Awareness* Repro

This notebook assembles **multi‑panel figures** that mirror the layouts used in the paper.  
It assumes you have already run the experiment scripts and have results in `runs/<timestamp>/...`.

> Design: each small chart is rendered as a single stand‑alone figure and then **tiled** into a grid using Pillow (no matplotlib subplots).


## 0) Environment setup
Make sure the repro harness is available in `./src`. If you used the main notebook, you already have it. Otherwise unzip/place it here.


In [None]:
# Report whether a ./src directory exists relative to the working directory
import os

src_present = os.path.isdir("src")
print("Have ./src? ->", src_present)

In [None]:
# Install small helper for image tiling
!pip install pillow

## 1) Helpers
Utilities to tile images into a grid and to locate your latest run directories.


In [None]:
# Panel helpers: tiling and metrics
import os, math, glob, json
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

from introspect_repro.plotting.utils import load_results
from introspect_repro.plotting.plot_injected_thoughts import compute_metrics

def tile_images(image_paths, out_path, n_cols=2, pad=10, bg="white"):
    """Tile images into a padded grid and save the composed result.

    Each image is centred inside a cell sized to the widest and tallest input;
    cells are filled row by row, left to right.

    Args:
        image_paths: Iterable of paths of the images to tile, in grid order.
        out_path: Destination path for the composed image.
        n_cols: Number of grid columns (must be at least 1).
        pad: Padding in pixels around the grid and between cells.
        bg: Background colour passed to ``Image.new``.

    Returns:
        ``out_path``, unchanged.

    Raises:
        ValueError: If ``n_cols`` is less than 1 or there are no images.
    """
    if n_cols < 1:
        raise ValueError("n_cols must be >= 1")
    imgs = []
    for p in image_paths:
        # convert() yields an independent in-memory image, so the source file
        # handle can be released right away.
        with Image.open(p) as src:
            imgs.append(src.convert("RGB"))
    if not imgs:
        raise ValueError("No images to tile")
    w = max(i.width for i in imgs)
    h = max(i.height for i in imgs)
    n_rows = math.ceil(len(imgs) / n_cols)
    canvas_size = (n_cols*w + (n_cols+1)*pad, n_rows*h + (n_rows+1)*pad)
    canvas = Image.new("RGB", canvas_size, bg)
    for idx, im in enumerate(imgs):
        r, c = divmod(idx, n_cols)
        # Top-left corner of the cell, then the offset that centres the image in it.
        x = pad + c*(w+pad)
        y = pad + r*(h+pad)
        offx = x + (w - im.width)//2
        offy = y + (h - im.height)//2
        canvas.paste(im, (offx, offy))
    canvas.save(out_path)
    return out_path

def latest_run_dir(name):
    """Return the most recently modified ``runs/*/<name>`` path, or None.

    Candidates are every path matching ``runs/*/<name>`` relative to the
    current working directory; the one with the greatest modification time
    wins. ``None`` is returned when nothing matches.
    """
    pattern = os.path.join("runs", "*", name)
    matches = glob.glob(pattern)
    if not matches:
        return None
    return max(matches, key=os.path.getmtime)

# Confirmation message printed once the helper definitions in this cell have run.
print("helpers ready")

## 2) Injected Thoughts — multi‑strength layer‑wise panel
For each **strength**, render a line chart (awareness / affirmative / mentions / false positives), then tile them.


In [None]:
# Build Injected‑Thoughts panel
# Renders one layer-wise chart per strength via the plotting module's CLI,
# then tiles the resulting PNGs into a single panel image.
import sys, subprocess

RUN_DIR = latest_run_dir("injected_thoughts")  #@param {type:"string"}
STRENGTHS = [1,2,4,8]                           #@param
N_COLS = 2                                       #@param {type:"integer"}

# latest_run_dir returns None when nothing matches; stop with a clear message
# instead of a TypeError from os.path.join(None, ...).
if not RUN_DIR or not os.path.isdir(RUN_DIR):
    raise SystemExit("No injected_thoughts run directory found (looked for runs/*/injected_thoughts): " + str(RUN_DIR))

OUT_PATH = os.path.join(RUN_DIR, "panel_injected_thoughts.png")  #@param {type:"string"}

per_strength_paths = []
for s in STRENGTHS:
    out_png = os.path.join(RUN_DIR, f"layerwise_strength{s}.png")
    # Run under the kernel's own interpreter so the same environment is used.
    cmd = [sys.executable, "-m", "introspect_repro.plotting.plot_injected_thoughts",
           "--run-dir", RUN_DIR, "--strength", str(s), "--save", out_png]
    print(">>>", " ".join(cmd))
    subprocess.run(cmd, check=True)
    per_strength_paths.append(out_png)

panel_path = tile_images(per_strength_paths, OUT_PATH, n_cols=N_COLS)
panel_path

## 3) Thought vs Text — multi‑strength layer‑wise panel
One chart per **strength** (identify‑thought vs exact‑repeat), tiled.


In [None]:
# Build Thought‑vs‑Text panel
# Renders one layer-wise chart per strength via the plotting module's CLI,
# then tiles the resulting PNGs into a single panel image.
import sys, subprocess

RUN_DIR = latest_run_dir("thought_vs_text")   #@param {type:"string"}
STRENGTHS = [1,2,4,8]                          #@param
N_COLS = 2                                     #@param {type:"integer"}

# latest_run_dir returns None when nothing matches; stop with a clear message
# instead of a TypeError from os.path.join(None, ...).
if not RUN_DIR or not os.path.isdir(RUN_DIR):
    raise SystemExit("No thought_vs_text run directory found (looked for runs/*/thought_vs_text): " + str(RUN_DIR))

OUT_PATH = os.path.join(RUN_DIR, "panel_thought_vs_text.png")  #@param {type:"string"}

per_strength_paths = []
for s in STRENGTHS:
    out_png = os.path.join(RUN_DIR, f"tvt_layerwise_strength{s}.png")
    # Run under the kernel's own interpreter so the same environment is used.
    cmd = [sys.executable, "-m", "introspect_repro.plotting.plot_thought_vs_text",
           "--run-dir", RUN_DIR, "--strength", str(s), "--save", out_png]
    print(">>>", " ".join(cmd))
    subprocess.run(cmd, check=True)
    per_strength_paths.append(out_png)

panel_path = tile_images(per_strength_paths, OUT_PATH, n_cols=N_COLS)
panel_path

## 4) Prefill Intention — three‑condition panel (control / matched / random‑other)

You’ll need three sibling directories under the same `runs/<timestamp>/` root:
- `prefill_control` (run prefill with `--strength 0`),  
- `prefill_intention` (matched concept injection),  
- `prefill_random` (random other word; generated by helper below).

The first cell below optionally generates the control and random‑other runs; the second renders one chart per **condition × strength** and tiles them into a grid like the panel on *page 24* of the paper.


In [None]:
# (Optional) Generate Control + Random‑Other runs here (Matched: run the standard CLI)
# Writes runs/<TS>/prefill_control (no injection, strength 0) and
# runs/<TS>/prefill_random (a vector for a different word injected at each
# positive strength), one JSON file per layer/strength.
TS = None  #@param {type:"string"}
N_TRIALS = 30  #@param {type:"integer"}
LAYER =  12    #@param {type:"integer"}
STRENGTHS = [0,1,2,4,8]  # include 0 for control  #@param
SEED = 0       #@param {type:"integer"}

import time, os, random, json, sys
from introspect_repro.models import load_model_and_tokenizer
from introspect_repro.word_lists import CONCEPT_WORDS, SENTENCES
from introspect_repro.prompts import PREFILL_PROMPT
from introspect_repro.concept_vectors import compute_baseline_mean, compute_concept_vector
from introspect_repro.generation import generate_with_optional_injection
from introspect_repro.judges import Judge, JudgeConfig

if TS is None:
    TS = time.strftime("%Y%m%d_%H%M%S")
root = os.path.join("runs", TS)
os.makedirs(root, exist_ok=True)

# Settings fall back to defaults unless a same-named global was defined earlier.
HF_MODEL = globals().get("HF_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
JUDGE_PROVIDER = globals().get("JUDGE_PROVIDER", "openai")
JUDGE_MODEL = globals().get("JUDGE_MODEL", "gpt-4o-mini")
DEVICE = globals().get("DEVICE", "cuda")

model, tok = load_model_and_tokenizer(HF_MODEL, device=DEVICE,
                                      load_in_4bit=globals().get("LOAD_IN_4BIT", False),
                                      load_in_8bit=globals().get("LOAD_IN_8BIT", False),
                                      dtype=globals().get("DTYPE", None))
judge = Judge(JudgeConfig(provider=JUDGE_PROVIDER, model=JUDGE_MODEL))
baseline = compute_baseline_mean(model, tok, LAYER)

# Seed the word/sentence sampling so a rerun draws the same trials.
random.seed(SEED)

# CONTROL
ctrl_dir = os.path.join(root, "prefill_control"); os.makedirs(ctrl_dir, exist_ok=True)
trials = []
for i in range(N_TRIALS):
    w = random.choice(CONCEPT_WORDS); sent = random.choice(SENTENCES)
    prompt = PREFILL_PROMPT.format(sentence=sent, word=w.lower())
    # No layer and no vector are passed, with strength 0.0, for the control condition.
    resp = generate_with_optional_injection(model, tok, prompt, None, None, 0.0,
                                            token_range=None, max_new_tokens=64, temperature=0.0)
    intended = judge.grade_intent(resp, w)
    trials.append(dict(word=w, sentence=sent, response=resp, intended=intended, condition="control"))
with open(os.path.join(ctrl_dir, f"layer{LAYER}_strength0.json"), "w") as f:
    json.dump(dict(layer=LAYER, strength=0, trials=trials), f, indent=2)
print("Wrote:", ctrl_dir)

# RANDOM‑OTHER
rand_dir = os.path.join(root, "prefill_random"); os.makedirs(rand_dir, exist_ok=True)
for s in [x for x in STRENGTHS if x>0]:
    trials = []
    for i in range(N_TRIALS):
        w = random.choice(CONCEPT_WORDS); sent = random.choice(SENTENCES)
        prompt = PREFILL_PROMPT.format(sentence=sent, word=w.lower())
        # Inject a vector for a word other than the prefilled one.
        other = random.choice([x for x in CONCEPT_WORDS if x != w])
        vec = compute_concept_vector(model, tok, other, LAYER, cached_baseline=baseline)
        resp = generate_with_optional_injection(model, tok, prompt, LAYER, vec, s,
                                                token_range=None, max_new_tokens=64, temperature=0.0)
        intended = judge.grade_intent(resp, w)
        trials.append(dict(word=w, sentence=sent, injected_other=other, response=resp, intended=intended, condition="random_other"))
    with open(os.path.join(rand_dir, f"layer{LAYER}_strength{s}.json"), "w") as f:
        json.dump(dict(layer=LAYER, strength=s, trials=trials), f, indent=2)
print("Wrote:", rand_dir)

print("For MATCHED condition, run the standard CLI with --outdir", os.path.join(root, "prefill_intention"))

In [None]:
# Build Prefill panel (control vs matched vs random-other)
# Renders one chart per strength and condition via the plotting module's CLI
# and tiles them with one row per strength and one column per condition.
TS = None  #@param {type:"string"}
STRENGTHS = [1,2,4,8]  #@param
N_COLS = 3             # control, matched, random-other  #@param {type:"integer"}

import glob, os, sys, subprocess

required_subdirs = ["prefill_control", "prefill_intention", "prefill_random"]
if TS:
    candidates = [os.path.join("runs", TS)]
else:
    # guess latest that has all three
    candidates = list(reversed(sorted(glob.glob(os.path.join("runs", "*")), key=os.path.getmtime)))

# The same completeness check applies whether TS was given or guessed, so an
# incomplete explicit TS fails here rather than inside the plotting subprocess.
root = None
for r in candidates:
    if all(os.path.isdir(os.path.join(r, d)) for d in required_subdirs):
        root = r; break

if not root:
    if TS:
        raise SystemExit(f"runs/{TS} does not contain prefill_control, prefill_intention, and prefill_random.")
    raise SystemExit("Could not find a runs/<TS>/ with prefill_control, prefill_intention, and prefill_random.")

dirs = {
    "Control (no injection)": os.path.join(root, "prefill_control"),
    "Matched (prefill word)": os.path.join(root, "prefill_intention"),
    "Random other word": os.path.join(root, "prefill_random"),
}

# NOTE(review): the generator cell in this notebook writes prefill_control only
# at strength 0; confirm plot_prefill_intention renders that directory when
# called with --strength s > 0.
per_tile = []
for s in STRENGTHS:
    for name, d in dirs.items():
        out_png = os.path.join(d, f"prefill_layerwise_strength{s}.png")
        cmd = [sys.executable, "-m", "introspect_repro.plotting.plot_prefill_intention",
               "--run-dir", d, "--strength", str(s), "--save", out_png]
        print(">>>", " ".join(cmd))
        subprocess.run(cmd, check=True)
        per_tile.append(out_png)

panel_path = os.path.join(root, f"panel_prefill_{'_'.join(map(str,STRENGTHS))}.png")
panel_path = tile_images(per_tile, panel_path, n_cols=N_COLS)
panel_path

## 5) Intentional Control — small multiples (token‑level traces)

Find the layer with the largest average **Think − Don’t** gap, render **N** per‑trial token traces as single‑plot images, and tile them.


In [None]:
# Build Intentional‑Control small‑multiples panel
# Picks the layer file with the largest mean(sims_think) - mean(sims_dont),
# plots the first N_EXAMPLES trials of that file one figure each, and tiles them.
RUN_DIR = latest_run_dir("intentional_control")  #@param {type:"string"}
N_EXAMPLES = 12                                   #@param {type:"integer"}
N_COLS = 4                                        #@param {type:"integer"}

import json, os, glob, numpy as np, matplotlib.pyplot as plt
from introspect_repro.plotting.utils import _extract_layer_strength

# latest_run_dir returns None when nothing matches; stop with a clear message
# instead of a TypeError from os.path.join(None, ...).
if not RUN_DIR:
    raise SystemExit("No intentional_control run directory found (looked for runs/*/intentional_control).")

files = sorted(glob.glob(os.path.join(RUN_DIR, "layer*.json")))
if not files: raise SystemExit("No intentional_control layer*.json files found in " + str(RUN_DIR))

# choose best layer by mean gap
best = None
for f in files:
    with open(f, "r") as fh: j = json.load(fh)
    # Pool the per-token similarities of every trial in this file.
    sims_t = []; sims_d = []
    for t in j["trials"]:
        sims_t.extend(t["sims_think"]); sims_d.extend(t["sims_dont"])
    if sims_t and sims_d:
        gap = float(np.mean(sims_t) - np.mean(sims_d))
        layer, _ = _extract_layer_strength(os.path.basename(f))
        if (best is None) or (gap > best[0]):
            best = (gap, layer, j, f)
# Every file may lack similarity data; unpacking None would raise a TypeError.
if best is None:
    raise SystemExit("No trials with both sims_think and sims_dont values found in " + str(RUN_DIR))
gap, layer, j, f = best
print("Best layer:", layer, "gap:", gap)

tmp_imgs = []
for i, t in enumerate(j["trials"][:N_EXAMPLES]):
    fig, ax = plt.subplots()
    ax.plot(t["sims_think"], label="Think")
    ax.plot(t["sims_dont"], label="Don't think")
    ax.set_xlabel("Token index")
    ax.set_ylabel("Cosine similarity")
    ax.set_title(f"Trial {i+1} – layer {layer}")
    ax.legend()
    out_png = os.path.join(RUN_DIR, f"intent_small_{i+1:02d}.png")
    fig.savefig(out_png, bbox_inches="tight", dpi=160)
    tmp_imgs.append(out_png)
    # Close each figure explicitly so open figures do not accumulate in the loop.
    plt.close(fig)

panel_path = os.path.join(RUN_DIR, f"panel_intent_smallmultiples_layer{layer}.png")
panel_path = tile_images(tmp_imgs, panel_path, n_cols=N_COLS)
panel_path