In [1]:
# Cell: Build tokenized train/val/test JSONL from "data" folder (robust to stray chars/separators)

import json
import random
import re
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

# ----------------------------
# Config
# ----------------------------
# Input root: every immediate sub-folder is treated as one level.
DATA_DIR = Path("data")
# Destination for the generated JSONL splits and vocab.json.
OUT_DIR = Path("processed")
# Seed for the deterministic shuffle performed before splitting.
SEED = 42
# (train, val, test) fractions.
SPLITS = (0.80, 0.10, 0.10)
# When True, short rows are right-padded with BACKGROUND to the widest row.
PAD_TO_RECTANGLE = True

# Cleaning/sanitization behavior
# STRICT=True raises on the first unknown character; STRICT=False applies
# UNKNOWN_POLICY ("map_to_background" or "drop") instead.
STRICT = False
UNKNOWN_POLICY = "map_to_background"
# Prefixes that mark a whole line as a comment.
# NOTE(review): '#' appears both here and in VOCAB, so a row that starts with
# that tile can be mistaken for a comment depending on how sanitize_lines
# disambiguates -- confirm against the level files.
COMMENT_PREFIXES = ("#", "//", ";")
# Lines made of a single non-vocab character repeated at least MIN_SEP_RUN
# times (e.g. "-----", "=====") are treated as separators and skipped.
SKIP_SEPARATOR_LINES = True
MIN_SEP_RUN = 5

# ----------------------------
# Token vocabulary (from our project context)
# ----------------------------
# One character per tile; a token's ID is its position in this list.
# NOTE(review): the recorded run output for this cell reports ~9.9M unknown
# characters and ~54k skipped separator lines. If the level files use '-'
# (or another character) as a real tile, it probably needs to be added
# here -- confirm against the data before training on the output.
VOCAB = list("MFyYEgGkKrX#%|*Bb?@Q!12DSCULotT<>[]")
BACKGROUND = '|'
tok2id = dict(zip(VOCAB, range(len(VOCAB))))
id2tok = dict(enumerate(VOCAB))
VOCAB_SET = set(VOCAB)

# ----------------------------
# Helpers
# ----------------------------
def read_level_txt(p: Path) -> List[str]:
    """Read a level file and return its lines.

    The file is decoded as UTF-8 with undecodable bytes replaced, any leading
    BOM characters are removed, and the text is split into lines without line
    terminators. Empty lines at the end of the file are discarded; empty
    lines elsewhere are kept.
    """
    text = p.read_text(encoding="utf-8", errors="replace").lstrip("\ufeff")
    rows = text.splitlines()  # splitlines() already removes the line endings
    # Find the end of the content by walking back over trailing empty rows.
    end = len(rows)
    while end > 0 and rows[end - 1] == "":
        end -= 1
    return rows[:end]

# Matches a line that consists of one character repeated two or more times.
SEP_LINE_RE = re.compile(r"^(.)\1+$")

def is_separator_line(line: str) -> bool:
    """Tell whether ``line`` is a decorative separator such as "-----".

    A line qualifies only when separator skipping is enabled, the line is at
    least MIN_SEP_RUN characters long, it is a single character repeated, and
    that character is not a vocabulary tile (so rows of one repeated tile are
    never discarded).
    """
    if not SKIP_SEPARATOR_LINES or len(line) < MIN_SEP_RUN:
        return False
    repeated = SEP_LINE_RE.match(line)
    return repeated is not None and repeated.group(1) not in VOCAB_SET

def sanitize_lines(lines: List[str], stats: Dict[str,int]) -> List[str]:
    """Drop non-level lines and clean unknown characters from the rest.

    Processing per input line:
      1. Blank / whitespace-only lines are skipped (not counted).
      2. Lines starting with one of COMMENT_PREFIXES are skipped as comments,
         unless the stripped line consists solely of vocabulary tokens. That
         exception exists because '#' is both a comment prefix and a tile: a
         pure tile row beginning with '#' is level data, not a comment.
      3. Separator lines (see is_separator_line) are skipped.
      4. Every remaining character is kept if it is in VOCAB_SET; otherwise it
         is handled according to STRICT / UNKNOWN_POLICY.

    Character cleaning runs over the *unstripped* line, so leading/trailing
    whitespace is treated like any other unknown character.

    Args:
        lines: Raw text lines of one level file.
        stats: Counter dict updated in place. It must already contain the keys
            "skipped_comment_lines", "skipped_separator_lines",
            "unknown_chars", "dropped_chars" and "dropped_empty_after_clean".

    Returns:
        The cleaned rows, in input order.

    Raises:
        ValueError: If STRICT is true and a non-vocabulary character is found.
    """
    comment_prefixes = tuple(COMMENT_PREFIXES)
    cleaned = []
    for ln in lines:
        stripped = ln.strip()
        if not stripped:
            continue
        # Whole-line comments. A row made only of vocabulary tokens is kept
        # even if it starts with a comment prefix (e.g. the '#' tile).
        if stripped.startswith(comment_prefixes) and not VOCAB_SET.issuperset(stripped):
            stats["skipped_comment_lines"] += 1
            continue
        # separator lines (-----, =====)
        if is_separator_line(stripped):
            stats["skipped_separator_lines"] += 1
            continue

        # character-level cleaning
        kept = []
        for ch in ln:
            if ch in VOCAB_SET:
                kept.append(ch)
                continue
            if STRICT:
                raise ValueError(f"Unknown token '{ch}' in line: {ln}")
            stats["unknown_chars"] += 1
            if UNKNOWN_POLICY == "drop":
                stats["dropped_chars"] += 1
            else:
                # "map_to_background" and any unrecognised policy both map
                # the character to the background tile.
                kept.append(BACKGROUND)

        # Keep the row only if something remains (all-dropped rows vanish).
        if kept:
            cleaned.append("".join(kept))
        else:
            stats["dropped_empty_after_clean"] += 1
    return cleaned

def normalize_rectangular(lines: List[str]) -> Tuple[List[str], int, int]:
    """Make the grid rectangular and report its size.

    Returns ``(rows, W, H)`` where W is the length of the longest row and H
    the number of rows. When PAD_TO_RECTANGLE is enabled, shorter rows are
    right-padded with BACKGROUND up to W; otherwise the rows are returned
    unchanged. An empty input yields ``([], 0, 0)``.
    """
    if not lines:
        return [], 0, 0
    height = len(lines)
    width = max(map(len, lines))
    if not (PAD_TO_RECTANGLE and width > 0):
        return lines, width, height
    padded = [row + BACKGROUND * (width - len(row)) for row in lines]
    return padded, width, height

def tokenize_level(lines: List[str]) -> List[int]:
    """Convert a grid of rows into a flat, row-major list of token IDs.

    Every character must be a key of ``tok2id``; an unknown character raises
    KeyError.
    """
    return [tok2id[symbol] for row in lines for symbol in row]

def collect_pairs(data_dir: Path, stats: Optional[Dict[str, int]] = None):
    """Yield one tokenized record per level folder under ``data_dir``.

    Each immediate sub-directory is expected to contain ``corrupted.txt`` and
    ``original.txt``. Both files are read, sanitized, padded and tokenized
    independently of each other.

    Args:
        data_dir: Root directory; its sub-directories are visited in sorted
            order.
        stats: Optional dict that receives the cleaning counters. It is
            updated in place while the generator is consumed, so a caller
            that drains the generator with ``list(...)`` can still read the
            totals afterwards. Missing counter keys are initialised to 0;
            values already present are kept and added to.

    Yields:
        A dict with ``level_id``, ``corrupted_ids``, ``original_ids`` and the
        width/height of each of the two grids.

    Returns:
        The stats dict, as the generator's return value. A plain ``for`` loop
        or ``list()`` discards it; pass ``stats`` to get the counters.
    """
    if stats is None:
        stats = {}
    for key in (
        "folders_seen",
        "folders_kept",
        "skipped_missing_files",
        "skipped_comment_lines",
        "skipped_separator_lines",
        "unknown_chars",
        "dropped_chars",
        "dropped_empty_after_clean",
        "empty_after_sanitize_pairs",
    ):
        stats.setdefault(key, 0)

    for sub in sorted(p for p in data_dir.iterdir() if p.is_dir()):
        stats["folders_seen"] += 1
        corr = sub / "corrupted.txt"
        orig = sub / "original.txt"
        # is_file() rather than exists(): a directory that happens to carry
        # the expected name is skipped here instead of failing on open().
        if not (corr.is_file() and orig.is_file()):
            stats["skipped_missing_files"] += 1
            continue

        corr_lines = sanitize_lines(read_level_txt(corr), stats)
        orig_lines = sanitize_lines(read_level_txt(orig), stats)

        # If sanitization removed every row of either file, skip this pair.
        if not corr_lines or not orig_lines:
            stats["empty_after_sanitize_pairs"] += 1
            continue

        # NOTE(review): the two grids are normalised independently, so the
        # corrupted and original shapes may differ; both are recorded below.
        corr_lines, cW, cH = normalize_rectangular(corr_lines)
        orig_lines, oW, oH = normalize_rectangular(orig_lines)

        # Safe to tokenize: sanitize_lines left only vocabulary characters.
        corr_ids = tokenize_level(corr_lines)
        orig_ids = tokenize_level(orig_lines)

        stats["folders_kept"] += 1
        yield {
            "level_id": sub.name,
            "corrupted_ids": corr_ids,
            "original_ids": orig_ids,
            "width_corrupted": cW,
            "height_corrupted": cH,
            "width_original": oW,
            "height_original": oH,
            # For debugging, the cleaned text can be persisted as well:
            # "corrupted_text": "\n".join(corr_lines),
            # "original_text":  "\n".join(orig_lines),
        }

    return stats

# ----------------------------
# Main
# ----------------------------
# Validate the split configuration up front, before any file is read or
# written. An explicit raise is used because `assert` is removed when Python
# runs with -O.
train_ratio, val_ratio, test_ratio = SPLITS
if not abs((train_ratio + val_ratio + test_ratio) - 1.0) < 1e-9:
    raise ValueError("SPLITS must sum to 1.0")

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Materialise every pair. Note that list() discards the generator's return
# value, so the cleaning stats returned by collect_pairs are not captured here.
pairs_iter = collect_pairs(DATA_DIR)
pairs = list(pairs_iter)  # exhaust generator

total = len(pairs)
if total == 0:
    raise RuntimeError(f"No valid level pairs found under {DATA_DIR.resolve()} after sanitization. "
                       f"Try setting STRICT=False and UNKNOWN_POLICY='map_to_background'.")

# Deterministic shuffle + split
random.Random(SEED).shuffle(pairs)

# Train and val sizes are truncated towards zero; the test split takes
# whatever remains, so the three parts always add up to `total`.
n_train = int(total * train_ratio)
n_val   = int(total * val_ratio)
n_test  = total - n_train - n_val

train_set = pairs[:n_train]
val_set   = pairs[n_train:n_train+n_val]
test_set  = pairs[n_train+n_val:]

def write_jsonl(path: Path, rows: Iterable[Dict]):
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines (one object per line).

    Any existing file is overwritten; non-ASCII characters are written as-is
    rather than escaped.
    """
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )

# Persist the three splits side by side under OUT_DIR.
for _split_name, _split_rows in (("train", train_set), ("val", val_set), ("test", test_set)):
    write_jsonl(OUT_DIR / f"{_split_name}.jsonl", _split_rows)

# Store the vocabulary next to the data so token IDs can be decoded later.
(OUT_DIR / "vocab.json").write_text(
    json.dumps({"vocab": VOCAB, "tok2id": tok2id}, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

# ----------------------------
# Summary (with cleaning report)
# ----------------------------
def avg_dims(rows, key_w, key_h):
    """Return the mean of ``key_w`` and of ``key_h`` over ``rows``.

    Both means are rounded to two decimals and returned as a
    ``(width, height)`` tuple. A row missing a key counts as 0 for that key;
    an empty ``rows`` yields ``(0.0, 0.0)``.
    """
    if not rows:
        return (0.0, 0.0)
    count = len(rows)

    def rounded_mean(key):
        return round(sum(row.get(key, 0) for row in rows) / count, 2)

    return (rounded_mean(key_w), rounded_mean(key_h))

# Average grid size of the training split, for corrupted and original levels.
cW_tr, cH_tr = avg_dims(train_set, "width_corrupted", "height_corrupted")
oW_tr, oH_tr = avg_dims(train_set, "width_original", "height_original")

# Emit the whole summary in one call; sep="\n" puts each entry on its own line.
print(
    f"✓ Processed {total} level folders from: {DATA_DIR.resolve()}",
    f"Split -> Train: {len(train_set)} | Val: {len(val_set)} | Test: {len(test_set)}  (seed={SEED})",
    f"Outputs -> {OUT_DIR / 'train.jsonl'}, {OUT_DIR / 'val.jsonl'}, {OUT_DIR / 'test.jsonl'}",
    f"Saved   -> {OUT_DIR / 'vocab.json'}",
    f"Train avg dims (corrupted): {cW_tr}x{cH_tr} | (original): {oW_tr}x{oH_tr}",
    "--- Cleaning report ---",
    f"STRICT={STRICT}, UNKNOWN_POLICY='{UNKNOWN_POLICY}'",
    "Note: counts reflect both corrupted/original files across all folders.",
    sep="\n",
)
# The stats dict that collect_pairs() returns is discarded when the generator
# is drained with list(), so the inputs are re-scanned here purely to rebuild
# the cleaning counters.
def quick_scan_unknowns(base: Path) -> Dict[str,int]:
    """Re-run sanitization over every level folder and return the counters.

    Folders lacking ``corrupted.txt`` or ``original.txt`` are counted under
    "skipped_missing_files" and otherwise ignored. The sanitized rows
    themselves are thrown away; only the statistics are kept.
    """
    counters = dict.fromkeys(
        (
            "folders_seen",
            "skipped_missing_files",
            "skipped_comment_lines",
            "skipped_separator_lines",
            "unknown_chars",
            "dropped_chars",
            "dropped_empty_after_clean",
        ),
        0,
    )
    level_dirs = sorted(entry for entry in base.iterdir() if entry.is_dir())
    for level_dir in level_dirs:
        counters["folders_seen"] += 1
        level_files = (level_dir / "corrupted.txt", level_dir / "original.txt")
        if not all(txt.exists() for txt in level_files):
            counters["skipped_missing_files"] += 1
            continue
        for txt in level_files:
            # sanitize_lines increments the shared counters in place.
            sanitize_lines(read_level_txt(txt), counters)
    return counters

# Print one "name: count" line per counter, in the order the scan defines them.
scan = quick_scan_unknowns(DATA_DIR)
print("\n".join(f"{name}: {count}" for name, count in scan.items()))
print("If '-' or other stray characters are meaningful tiles for you, tell me and I'll add them to VOCAB.")

✓ Processed 3843 level folders from: C:\Users\xhepon\Documents\a-No.2\rp\explore\Mario-AI-Framework\ship_to_train\data
Split -> Train: 3074 | Val: 384 | Test: 385  (seed=42)
Outputs -> processed\train.jsonl, processed\val.jsonl, processed\test.jsonl
Saved   -> processed\vocab.json
Train avg dims (corrupted): 199.99x9.71 | (original): 199.99x8.08
--- Cleaning report ---
STRICT=False, UNKNOWN_POLICY='map_to_background'
Note: counts reflect both corrupted/original files across all folders.
folders_seen: 3843
skipped_missing_files: 0
skipped_comment_lines: 0
skipped_separator_lines: 54759
unknown_chars: 9928048
dropped_chars: 0
dropped_empty_after_clean: 0
If '-' or other stray characters are meaningful tiles for you, tell me and I'll add them to VOCAB.
