In [2]:
# Cell: Run the fine-tuned HF model to repair new corrupted levels in "test_data"
# Assumes you fine-tuned with the earlier HF cell (Qwen/gpt-oss/etc.)
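# !pip install -q transformers accelerate protobuf sentencepiece tiktoken
# (loading the tokenizer raises ImportError when protobuf is not installed)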

import json, re, os
from pathlib import Path
from typing import List, Dict

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# -----------------------------
# Config
# -----------------------------
# Fine-tuned checkpoint directory (change if you saved elsewhere).
MODEL_DIR = "hf-checkpoints/level-repair-qwen/checkpoint-300"
# Folder of level folders, each with corrupted.txt (and optionally metadata.json).
TEST_DIR = Path("test_data")
# Root folder that receives the outputs.
OUT_DIR = Path("repairs_out")

# Generation settings: greedy by default; set DO_SAMPLE = True for sampling.
MAX_NEW_TOKENS, DO_SAMPLE = 4096, False
TEMPERATURE, TOP_P = 0.8, 0.95

# Sanitization behavior (mirrors training preprocess defaults).
# One character per token; a token's position in VOCAB is its integer id.
VOCAB = list("MFyYEgGkKrX#%|*Bb?@Q!12DSCULotT<>[]")
BACKGROUND = "|"
VOCAB_SET = {*VOCAB}
COMMENT_PREFIXES = ("#", "//", ";")
SKIP_SEPARATOR_LINES = True
MIN_SEP_RUN = 5
# If STRICT is True, unknown chars raise; otherwise UNKNOWN_POLICY decides.
STRICT = False
UNKNOWN_POLICY = "map_to_background"  # or "drop"

# Optional vocab file saved during preprocessing. The code visible in this
# cell does not read it; ids are derived from the order of VOCAB instead.
VOCAB_PATH = Path("processed/vocab.json")
tok2id = dict(zip(VOCAB, range(len(VOCAB))))

# -----------------------------
# Utilities
# -----------------------------
def read_text(path: Path) -> List[str]:
    """Read ``path`` as text and return its lines without line terminators.

    The file is decoded as UTF-8 with undecodable bytes replaced, leading
    byte-order-mark characters are removed, and empty lines at the end of the
    file are dropped. Empty lines elsewhere are kept.
    """
    content = path.read_text(encoding="utf-8", errors="replace")
    rows = [row.rstrip("\n\r") for row in content.lstrip("\ufeff").splitlines()]
    # Walk back from the end to find the last non-empty line.
    end = len(rows)
    while end > 0 and rows[end - 1] == "":
        end -= 1
    return rows[:end]

# Matches a line made of one character repeated (at least two occurrences).
SEP_LINE_RE = re.compile(r"^(.)\1+$")

def is_separator_line(line: str) -> bool:
    """Return True when ``line`` is a run of one repeated non-vocab character.

    The check is disabled when SKIP_SEPARATOR_LINES is falsy, and lines
    shorter than MIN_SEP_RUN never count. A run of a character that belongs
    to VOCAB_SET is not a separator.
    """
    eligible = SKIP_SEPARATOR_LINES and len(line) >= MIN_SEP_RUN
    if not eligible:
        return False
    hit = SEP_LINE_RE.match(line)
    return hit is not None and hit.group(1) not in VOCAB_SET

def sanitize_lines(lines: List[str]) -> List[str]:
    """Filter and normalise raw level lines so they contain only VOCAB chars.

    A line is skipped when, after stripping whitespace, it is empty, starts
    with one of COMMENT_PREFIXES, or is a separator line. For every other
    line, each character of the *unstripped* line is kept if it is in
    VOCAB_SET; unknown characters raise ValueError when STRICT is set, are
    dropped when UNKNOWN_POLICY is "drop", and otherwise become BACKGROUND.
    Lines that end up with no characters are omitted.

    NOTE(review): with this file's config, "#" is both a comment prefix and a
    VOCAB token, so rows whose first non-blank character is "#" are skipped.
    Confirm this matches the training preprocessing before changing it.
    """
    drop_unknown = UNKNOWN_POLICY == "drop"
    kept_rows = []
    for raw_line in lines:
        stripped = raw_line.strip()
        skip = (
            not stripped
            or stripped.startswith(tuple(COMMENT_PREFIXES))
            or is_separator_line(stripped)
        )
        if skip:
            continue

        out_chars = []
        for ch in raw_line:
            if ch in VOCAB_SET:
                out_chars.append(ch)
                continue
            if STRICT:
                raise ValueError(f"Unknown token '{ch}' in line: {raw_line}")
            # Any policy other than "drop" maps the unknown char to background.
            if not drop_unknown:
                out_chars.append(BACKGROUND)

        if out_chars:
            kept_rows.append("".join(out_chars))
    return kept_rows

def to_ids(grid_text: str) -> List[int]:
    """Flatten ``grid_text`` into a list of token ids.

    "\\n" characters are removed first; every remaining character is looked up
    in tok2id, and characters missing from it get the id of BACKGROUND.
    """
    ids = []
    for ch in grid_text:
        if ch == "\n":
            continue
        ids.append(tok2id.get(ch, tok2id[BACKGROUND]))
    return ids

# -----------------------------
# Load model/tokenizer
# -----------------------------
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, trust_remote_code=True)
model = model.to(device)
# Switch to evaluation mode for inference.
model.eval()

# Fixed pieces of the prompt; the corrupted level text is inserted between
# CORR and REPR.
INSTR = "### Instruction:\nRepair the level.\n\n"
CORR  = "### Corrupted:\n"
REPR  = "\n\n### Repaired:\n"

def build_prompt(corrupted_lines: List[str]) -> str:
    """Return the full prompt for a corrupted level.

    The lines are joined with "\\n" and wrapped as
    INSTR + CORR + <level text> + REPR.
    """
    level_block = "\n".join(corrupted_lines)
    return "".join((INSTR, CORR, level_block, REPR))

@torch.no_grad()
def generate_repair(corrupted_lines: List[str]) -> str:
    """Generate the repaired level text for ``corrupted_lines``.

    Builds the prompt, tokenizes it onto ``device``, runs ``model.generate``
    with at most MAX_NEW_TOKENS new tokens, and decodes only the tokens that
    follow the prompt (special tokens skipped).

    TEMPERATURE and TOP_P are forwarded only when DO_SAMPLE is true. Passing
    them together with ``do_sample=False`` makes transformers warn that the
    generation flags are not valid and may be ignored.
    """
    prompt = build_prompt(corrupted_lines)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    gen_kwargs = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=DO_SAMPLE,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    if DO_SAMPLE:
        # Sampling-only knobs; irrelevant (and warned about) for greedy decoding.
        gen_kwargs.update(temperature=TEMPERATURE, top_p=TOP_P)
    out = model.generate(**inputs, **gen_kwargs)
    # The output sequence starts with the prompt tokens; keep only what follows.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

# -----------------------------
# Run over test_data
# -----------------------------
# For every sub-folder of TEST_DIR that contains corrupted.txt: sanitize the
# level, generate a repair, write repaired.txt (and, best-effort,
# repaired_ids.json) under OUT_DIR/<level folder name>/, and append one record
# to predictions.jsonl.
OUT_DIR.mkdir(parents=True, exist_ok=True)
summary_path = OUT_DIR / "predictions.jsonl"
n_done = 0

with summary_path.open("w", encoding="utf-8") as jf:
    for level_dir in sorted(p for p in TEST_DIR.iterdir() if p.is_dir()):
        corr_path = level_dir / "corrupted.txt"
        if not corr_path.exists():
            continue

        # sanitize -> prompt -> generate
        raw_lines = read_text(corr_path)
        clean_lines = sanitize_lines(raw_lines)
        if not clean_lines:
            # skip empty after cleaning
            continue

        pred_txt = generate_repair(clean_lines)

        # ensure an output subfolder per level_id
        out_sub = OUT_DIR / level_dir.name
        out_sub.mkdir(parents=True, exist_ok=True)

        # save repaired grid text
        repaired_path = out_sub / "repaired.txt"
        with repaired_path.open("w", encoding="utf-8") as f:
            f.write(pred_txt.rstrip() + "\n")

        # optional: save predicted ids (flattened; newlines removed)
        repaired_ids_path = out_sub / "repaired_ids.json"
        try:
            pred_ids = to_ids(pred_txt)
            # Use a context manager so the file handle is closed (and flushed)
            # deterministically instead of being left to garbage collection.
            with repaired_ids_path.open("w", encoding="utf-8") as ids_file:
                json.dump({"level_id": level_dir.name, "repaired_ids": pred_ids}, ids_file)
        except Exception as err:
            # The id export stays best-effort, but report the failure instead
            # of hiding it.
            print(f"! Skipped ids for {level_dir.name}: {err!r}")

        # write to summary jsonl
        rec = {
            "level_id": level_dir.name,
            "corrupted_preview_first3": clean_lines[:3],
            "repaired_preview_first3": pred_txt.splitlines()[:3],
            "repaired_path": str(repaired_path),
        }
        jf.write(json.dumps(rec, ensure_ascii=False) + "\n")
        n_done += 1

# Report the totals, then point at the first level folder (in sorted order)
# that has a repaired.txt.
print(f"✓ Repaired {n_done} levels from {TEST_DIR} -> {OUT_DIR}")
print(f"Summary JSONL: {summary_path}")
print("Example preview (open any repaired.txt inside repairs_out/<level_id>/):")
for sample in (d / "repaired.txt" for d in sorted(OUT_DIR.iterdir()) if d.is_dir()):
    if sample.exists():
        print(" -", sample.parent.name, "->", sample)
        break

ImportError: 
 requires the protobuf library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
# Cell: Robust inference for repairing levels with your fine-tuned HF model

# If you see ImportError about protobuf / sentencepiece / tiktoken, install:
# %pip install -U transformers accelerate
# %pip install protobuf sentencepiece tiktoken

import os, re, json, math
from pathlib import Path
from typing import List, Dict

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

# Print the installed transformers version alongside the cell output.
print("Transformers:", transformers.__version__)

# -----------------------------
# Paths & config
# -----------------------------
# Your fine-tuned checkpoint dir.
MODEL_DIR = "hf-checkpoints/level-repair-qwen/checkpoint-300"
# Model name used as the last-resort source for the tokenizer.
BASE_MODEL_NAME = "Qwen/Qwen2.5-0.5B"
# Each subfolder contains corrupted.txt.
TEST_DIR = Path("test_data")
OUT_DIR = Path("repairs_out")

# Generation settings
MAX_NEW_TOKENS = 4096
DO_SAMPLE = False
TEMPERATURE = 0.8
TOP_P = 0.95

# Sanitization (match training preprocessing)
# One character per token; the index in VOCAB is the token id.
VOCAB = [*"MFyYEgGkKrX#%|*Bb?@Q!12DSCULotT<>[]"]
BACKGROUND = "|"
VOCAB_SET = set(VOCAB)
COMMENT_PREFIXES = ("#", "//", ";")
MIN_SEP_RUN = 5
STRICT = False
UNKNOWN_POLICY = "map_to_background"  # or "drop"

tok2id = {token: index for index, token in enumerate(VOCAB)}

# -----------------------------
# IO helpers
# -----------------------------
def read_text(path: Path) -> List[str]:
    """Return the lines of the text file at ``path``.

    Decoding is UTF-8 with invalid bytes replaced. Leading byte-order-mark
    characters are stripped, line terminators are not part of the returned
    strings, and empty lines at the end of the file are discarded.
    """
    text = path.read_text(encoding="utf-8", errors="replace").lstrip("\ufeff")
    pieces = [piece.rstrip("\n\r") for piece in text.splitlines()]
    # Scan backwards for the last non-empty line and cut everything after it.
    for last in range(len(pieces), 0, -1):
        if pieces[last - 1] != "":
            return pieces[:last]
    return []

# A whole line consisting of one character repeated two or more times.
SEP_LINE_RE = re.compile(r"^(.)\1+$")
def is_separator_line(line: str) -> bool:
    """Tell whether ``line`` is a run of a single non-vocab character.

    Lines shorter than MIN_SEP_RUN are never separators, and a run of a
    character contained in VOCAB_SET is not treated as one either.
    """
    if len(line) >= MIN_SEP_RUN:
        found = SEP_LINE_RE.match(line)
        if found:
            return found.group(1) not in VOCAB_SET
    return False

def sanitize_lines(lines: List[str]) -> List[str]:
    """Reduce raw level lines to rows made only of VOCAB characters.

    Lines that are blank after stripping, start with one of COMMENT_PREFIXES,
    or are separator lines are skipped. In the remaining lines (processed
    unstripped) characters in VOCAB_SET are kept; an unknown character raises
    ValueError when STRICT is set, is removed when UNKNOWN_POLICY is "drop",
    and is otherwise replaced by BACKGROUND. Rows left empty are omitted.

    NOTE(review): with this file's config, "#" is both a comment prefix and a
    VOCAB token, so rows whose first non-blank character is "#" are skipped.
    Confirm this matches the training preprocessing before changing it.
    """
    result = []
    for line in lines:
        probe = line.strip()
        if not probe:
            continue
        if probe.startswith(tuple(COMMENT_PREFIXES)):
            continue
        if is_separator_line(probe):
            continue

        kept = []
        for symbol in line:
            if symbol not in VOCAB_SET:
                if STRICT:
                    raise ValueError(f"Unknown token '{symbol}' in line: {line}")
                if UNKNOWN_POLICY == "drop":
                    continue
                # Every other policy substitutes the background token.
                symbol = BACKGROUND
            kept.append(symbol)

        if kept:
            result.append("".join(kept))
    return result

def to_ids(grid_text: str) -> List[int]:
    """Convert grid text to a flat list of token ids.

    "\\n" characters are stripped out before the lookup; characters that are
    not keys of tok2id are mapped to the id of BACKGROUND.
    """
    flat = "".join(grid_text.split("\n"))
    return list(map(lambda ch: tok2id.get(ch, tok2id[BACKGROUND]), flat))

# -----------------------------
# Load tokenizer & model (robust)
# -----------------------------
def load_tokenizer_with_fallback(finetuned_dir: str, base_model: str):
    """Load a tokenizer, trying progressively less specific sources.

    Attempts, in order:
      1. ``finetuned_dir`` with ``local_files_only=True``;
      2. ``finetuned_dir`` without that restriction;
      3. ``base_model``.
    The outcome of every attempt is printed. The first tokenizer that loads
    is returned.

    Raises:
        RuntimeError: if all three attempts fail; the base-model failure is
            attached as the explicit cause.

    NOTE(review): when attempt 3 is used, the tokenizer comes from
    ``base_model`` rather than the checkpoint — confirm its vocabulary matches
    what the model was fine-tuned with.
    """
    # (extra from_pretrained kwargs, success message, failure message prefix)
    finetuned_attempts = (
        (
            {"local_files_only": True},
            "Tokenizer loaded from fine-tuned dir (local).",
            "Tokenizer not found in fine-tuned dir (local):",
        ),
        (
            {},
            "Tokenizer loaded from fine-tuned dir.",
            "Tokenizer not found in fine-tuned dir:",
        ),
    )
    for extra_kwargs, ok_msg, fail_msg in finetuned_attempts:
        # Keep the try body to the one call that is expected to fail.
        try:
            tok = AutoTokenizer.from_pretrained(
                finetuned_dir, trust_remote_code=True, **extra_kwargs
            )
        except Exception as err:
            print(fail_msg, repr(err))
        else:
            print(ok_msg)
            return tok

    # Fall back to base model tokenizer
    try:
        tok = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    except Exception as e_base:
        print("Base tokenizer load failed:", repr(e_base))
        raise RuntimeError(
            "Failed to load a tokenizer. If the error mentions protobuf or sentencepiece, run:\n"
            "  pip install protobuf sentencepiece tiktoken\n"
            "Also ensure BASE_MODEL_NAME matches the model you fine-tuned."
        ) from e_base
    print(f"Tokenizer loaded from base model: {base_model}")
    return tok

def load_model_finetuned(finetuned_dir: str):
    """Load the causal-LM from ``finetuned_dir``.

    The weights are expected to be in that directory; there is no fallback.
    """
    load_options = {"trust_remote_code": True}
    return AutoModelForCausalLM.from_pretrained(finetuned_dir, **load_options)

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = load_tokenizer_with_fallback(MODEL_DIR, BASE_MODEL_NAME)
if tokenizer.pad_token is None:
    # Many decoder-only models lack a pad; use eos
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

try:
    model = load_model_finetuned(MODEL_DIR).to(device)
except Exception as e_model:
    # Chain explicitly so the original failure is recorded as the cause.
    raise RuntimeError(
        f"Could not load model from {MODEL_DIR}. Ensure trainer.save_model(...) wrote weights there.\n"
        f"Original error: {e_model}"
    ) from e_model
else:
    print("Model loaded from fine-tuned dir.")

# Switch to evaluation mode for inference.
model.eval()

# Prompt template parts; the corrupted level goes between CORR and REPR.
INSTR = "### Instruction:\nRepair the level.\n\n"
CORR  = "### Corrupted:\n"
REPR  = "\n\n### Repaired:\n"

def build_prompt(corrupted_lines: List[str]) -> str:
    """Assemble the prompt: INSTR, CORR, the lines joined by "\\n", then REPR."""
    level_text = "\n".join(corrupted_lines)
    return f"{INSTR}{CORR}{level_text}{REPR}"

@torch.no_grad()
def generate_repair(corrupted_lines: List[str]) -> str:
    """Generate the repaired level text for ``corrupted_lines``.

    The prompt is built, tokenized onto ``device`` and passed to
    ``model.generate`` with at most MAX_NEW_TOKENS new tokens. Only the tokens
    after the prompt are decoded (special tokens skipped) and returned.

    TEMPERATURE and TOP_P are passed only when DO_SAMPLE is true. With
    ``do_sample=False`` transformers reports them as generation flags that
    "are not valid and may be ignored".
    """
    prompt = build_prompt(corrupted_lines)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Sampling-only knobs are added only when sampling is enabled.
    sampling_kwargs = {"temperature": TEMPERATURE, "top_p": TOP_P} if DO_SAMPLE else {}
    out = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=DO_SAMPLE,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        **sampling_kwargs,
    )
    # The output sequence begins with the prompt tokens; decode what follows.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

# -----------------------------
# Run over test_data
# -----------------------------
# For every sub-folder of TEST_DIR with a corrupted.txt: sanitize, generate a
# repair, write repaired.txt (plus, best-effort, repaired_ids.json) under
# OUT_DIR/<level folder name>/ and append one record to predictions.jsonl.
OUT_DIR.mkdir(parents=True, exist_ok=True)
summary_path = OUT_DIR / "predictions.jsonl"
n_done = 0

with summary_path.open("w", encoding="utf-8") as jf:
    for level_dir in sorted(p for p in TEST_DIR.iterdir() if p.is_dir()):
        corr = level_dir / "corrupted.txt"
        if not corr.exists():
            continue
        raw = read_text(corr)
        clean = sanitize_lines(raw)
        if not clean:
            # Nothing left after sanitization; skip this level.
            continue

        pred_txt = generate_repair(clean)

        sub_out = OUT_DIR / level_dir.name
        sub_out.mkdir(parents=True, exist_ok=True)

        repaired_path = sub_out / "repaired.txt"
        with repaired_path.open("w", encoding="utf-8") as f:
            f.write(pred_txt.rstrip() + "\n")

        # Optional ID export
        try:
            pred_ids = to_ids(pred_txt)
            with (sub_out / "repaired_ids.json").open("w", encoding="utf-8") as f:
                json.dump({"level_id": level_dir.name, "repaired_ids": pred_ids}, f)
        except Exception as err:
            # The export stays best-effort, but report the failure instead of
            # hiding it.
            print(f"! Skipped ids for {level_dir.name}: {err!r}")

        rec = {
            "level_id": level_dir.name,
            "repaired_path": str(repaired_path),
            "corrupted_preview_first3": clean[:3],
            "repaired_preview_first3": pred_txt.splitlines()[:3],
        }
        jf.write(json.dumps(rec, ensure_ascii=False) + "\n")
        n_done += 1

print(f"✓ Repaired {n_done} levels from {TEST_DIR} -> {OUT_DIR}")
print(f"Summary: {summary_path}")

Transformers: 4.56.1
Tokenizer not found in fine-tuned dir (local): ImportError('\n requires the protobuf library but it was not found in your environment. Check out the instructions on the\ninstallation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones\nthat match your environment. Please note that you may need to restart your runtime after installation.\n')
Tokenizer not found in fine-tuned dir: ImportError('\n requires the protobuf library but it was not found in your environment. Check out the instructions on the\ninstallation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones\nthat match your environment. Please note that you may need to restart your runtime after installation.\n')


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Tokenizer loaded from base model: Qwen/Qwen2.5-0.5B


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Model loaded from fine-tuned dir.


##### 