In [None]:
import json
from pathlib import Path
from collections import Counter
import math
import random
from itertools import islice

# Paths (adjust names if you used different filenames).
# All three are relative paths, so they resolve against the current working directory.
# GLOBAL_SNAP is only consulted when LONG_SNAP is missing or loads empty.
GLOBAL_SNAP = Path("pcfg_all.json")
# LONG_SNAP is the preferred source for the template/word/digit top lists.
LONG_SNAP = Path("pcfg_of_len6_or_more.json")
# FRAG_TSV is read as a header line followed by "<token>\t<count>" rows.
FRAG_TSV = Path("frag_tokens_all.tsv")   # FIXED: was frag_tokens_all_len_ge3.tsv

# Load snapshots (prefer long snapshot for distributions; fall back to global)
def load_json(path):
    """Parse the UTF-8 JSON file at ``path``.

    Args:
        path: A ``pathlib.Path`` pointing at the JSON file.

    Returns:
        The decoded JSON value, or ``None`` when the file does not exist.
    """
    if path.exists():
        with path.open("r", encoding="utf8") as handle:
            return json.load(handle)
    return None

snap_long = load_json(LONG_SNAP)
snap_global = load_json(GLOBAL_SNAP)

# The long snapshot wins whenever it loaded to something non-empty;
# otherwise the global snapshot is used for token frequencies.
snap_pref = snap_long or snap_global
if not snap_pref:
    raise RuntimeError("No snapshot JSON found: please ensure pcfg_of_len6_or_more.json or pcfg_all.json exist.")

# Every "top_*" entry is a (token, count) pair; keep the tokens only, in snapshot order.
top_templates = [template for template, _count in snap_pref.get("top_templates", [])]
top_words = [word for word, _count in snap_pref.get("top_words", [])]
top_digits = [digit_run for digit_run, _count in snap_pref.get("top_digits", [])]

# Load frag counts. Sources are tried in order and the first one that exists wins:
#   1. the TSV export (header line, then "<token>\t<count>" rows),
#   2. the older JSON export ({"frag_tokens": [[token, count], ...]}),
#   3. the "top_frags" list embedded in the preferred snapshot.
frag_counter = Counter()
frag_json = Path("frag_tokens_all_len_ge3.json")
if FRAG_TSV.exists():
    with FRAG_TSV.open("r", encoding="utf8") as f:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(f, None)
        for line in f:
            row = line.rstrip("\n")
            if not row:
                # Tolerate blank lines (e.g. an extra newline at the end of the file).
                continue
            # Split on the LAST tab so a token that itself contains a tab still parses.
            # A row with no tab at all still raises ValueError, as before.
            tok, cnt = row.rsplit("\t", 1)
            frag_counter[tok] = int(cnt)
elif frag_json.exists():
    # try JSON if you saved frag JSON earlier
    data = json.loads(frag_json.read_text(encoding="utf8"))
    for tok, cnt in data.get("frag_tokens", []):
        frag_counter[tok] = int(cnt)
elif snap_pref.get("top_frags"):
    # no frag file — use FRAG from snapshot if present
    for tok, cnt in snap_pref["top_frags"]:
        frag_counter[tok] = int(cnt)

# quick stats
print(f"Using snapshot: {'long' if snap_long else 'global'}")
print(f"Loaded {len(top_templates)} templates, {len(top_words)} top words, {len(top_digits)} top digit runs, {len(frag_counter)} frag tokens")

In [None]:
# helper: parse template into slot list
def parse_template(tpl: str):
    """Split a ``|``-separated template into ``(slot_type, length)`` tuples.

    ``WORD<n>`` and ``DIGITS<n>`` carry their numeric suffix as ``length``
    (``None`` when the suffix is absent or not all digits). The exact part
    ``SYMBOL`` maps to ``("SYMBOL", None)``. Every other part, including a
    literal ``FRAG``, maps to ``("FRAG", None)``.

    Args:
        tpl: Template string such as ``"WORD5|DIGITS2|SYMBOL"``.

    Returns:
        A list with one tuple per ``|``-separated part, in template order.
    """
    def sized(kind, piece):
        # The suffix is whatever follows the slot keyword.
        suffix = piece[len(kind):]
        return (kind, int(suffix) if suffix.isdigit() else None)

    parsed = []
    for piece in tpl.split("|"):
        if piece.startswith("WORD"):
            parsed.append(sized("WORD", piece))
        elif piece.startswith("DIGITS"):
            parsed.append(sized("DIGITS", piece))
        elif piece == "SYMBOL":
            parsed.append(("SYMBOL", None))
        else:
            # Anything unrecognised is treated as a fragment slot.
            parsed.append(("FRAG", None))
    return parsed

# estimate minimal & maximal length possible for a template using top token lengths
def estimate_template_lengths(tpl, top_words_list, top_frags_list, top_digits_list, symbols_max=4):
    """Estimate the shortest and longest string a template can produce.

    Per slot:
      * a declared length (``WORD5``, ``DIGITS2``) contributes exactly that
        length to both bounds;
      * an undeclared WORD/DIGITS/FRAG slot contributes the shortest and the
        longest token length found in the matching list;
      * a SYMBOL slot contributes 1 to the minimum and ``symbols_max`` to the
        maximum.

    The lists are ordered by frequency, not by length, so the bounds are taken
    over the whole list. Reading only the first or last entry would misjudge
    what the template can reach.

    Args:
        tpl: Template string understood by ``parse_template``.
        top_words_list: Candidate tokens for WORD slots.
        top_frags_list: Candidate tokens for FRAG slots.
        top_digits_list: Candidate tokens for DIGITS slots.
        symbols_max: Longest symbol run assumed for a SYMBOL slot.

    Returns:
        ``(min_len, max_len)`` as integers.
    """
    def length_range(tokens, fallback_min, fallback_max):
        # With no tokens to inspect, fall back to fixed default bounds.
        if not tokens:
            return fallback_min, fallback_max
        lengths = [len(tok) for tok in tokens]
        return min(lengths), max(lengths)

    # Computed once per call rather than once per slot.
    bounds = {
        "WORD": length_range(top_words_list, 4, 12),
        "DIGITS": length_range(top_digits_list, 1, 4),
        "FRAG": length_range(top_frags_list, 3, 12),
    }

    min_len = 0
    max_len = 0
    for kind, declared in parse_template(tpl):
        if kind == "SYMBOL":
            min_len += 1
            max_len += symbols_max
        elif kind in bounds:
            if declared:
                # A declared length pins both ends of the range.
                min_len += declared
                max_len += declared
            else:
                shortest, longest = bounds[kind]
                min_len += shortest
                max_len += longest
    return min_len, max_len

# Fragment tokens ranked from most to least frequent (counts dropped).
top_frags = [frag for frag, _count in frag_counter.most_common()]

# Working-set sizes: only the head of each ranked list is used downstream (tuneable).
TOP_WORDS_USE = 2000
TOP_FRAGS_USE = 2000
TOP_DIGITS_USE = 500

words_list = list(islice(top_words, TOP_WORDS_USE))
frags_list = list(islice(top_frags, TOP_FRAGS_USE))
digits_list = list(islice(top_digits, TOP_DIGITS_USE))

# filter templates to those that can potentially reach length >= 6 (heuristic)
def templates_capable_of_len(templates, min_len=6):
    """Keep the templates whose estimated maximum length reaches ``min_len``.

    Args:
        templates: Iterable of template strings.
        min_len: Length a template must be able to reach to be kept.

    Returns:
        A list of the surviving templates, in their original order.
    """
    return [
        tpl
        for tpl in templates
        # Index 1 of the estimate is the upper bound; the lower bound is not needed here.
        if estimate_template_lengths(tpl, words_list, frags_list, digits_list)[1] >= min_len
    ]

templates_candidates = templates_capable_of_len(top_templates, min_len=6)
print(f"{len(templates_candidates)} templates can potentially reach length >= 6 (from {len(top_templates)})")
# Preview the first ten survivors, one per line.
for shown in islice(templates_candidates, 10):
    print(" ", shown)

In [None]:
# Deterministic beam generator (pruned by partial log-score)
def slot_top_choices(slot_type, topk=200):
    """Return the candidate tokens for one slot type.

    Args:
        slot_type: ``"WORD"``, ``"FRAG"``, ``"DIGITS"`` or ``"SYMBOL"``.
        topk: How many leading entries of the ranked list to return. It has no
            effect on SYMBOL, whose list is fixed.

    Returns:
        A list of tokens, or an empty list for an unknown slot type.
    """
    if slot_type == "SYMBOL":
        # simple symbol choices (common ones). You can expand if you recorded symbols in snapshot.
        return ["!", "@", "#", "$", "%", "&", "!!", "##"]
    if slot_type == "WORD":
        ranked = words_list
    elif slot_type == "FRAG":
        ranked = frags_list
    elif slot_type == "DIGITS":
        ranked = digits_list
    else:
        return []
    return ranked[:topk]

# build simple frequency-based score maps from snapshots (fallback if no model.score available)
# Use counts from snapshot if available
def build_count_map_from_snapshot(snapshot):
    """Build a ``(slot_type, token) -> count`` lookup table.

    WORD and DIGITS counts come from ``snapshot["top_words"]`` and
    ``snapshot["top_digits"]``. FRAG counts come from the module-level
    ``frag_counter``, not from the snapshot.

    Args:
        snapshot: Mapping whose ``top_words`` / ``top_digits`` entries are
            ``(token, count)`` pairs; missing keys are treated as empty.

    Returns:
        A dict keyed by ``("WORD" | "DIGITS" | "FRAG", token)``.
    """
    table = {("WORD", word): n for word, n in snapshot.get("top_words", [])}
    table.update((("DIGITS", run), n) for run, n in snapshot.get("top_digits", []))
    # frags: use frag_counter
    table.update((("FRAG", frag), n) for frag, n in frag_counter.items())
    return table

# Module-level (slot_type, token) -> count table, built from the preferred snapshot.
# partial_score and the stochastic sampler's scoring read it via count_map.get(...).
count_map = build_count_map_from_snapshot(snap_pref)

def partial_score(prefix_score, slot, token):
    """Extend a running log-score with one more ``(slot, token)`` choice.

    The increment is ``log(count + 1)``. A pair missing from ``count_map`` is
    counted as 1, so every token adds a positive amount.

    Args:
        prefix_score: Score accumulated for the slots filled so far.
        slot: Slot type of the token being appended.
        token: The token being appended.

    Returns:
        The updated score as a float.
    """
    return prefix_score + math.log(count_map.get((slot, token), 1) + 1)

def generate_from_template_beam(tpl, topk_per_slot=200, prune_beam=2000, min_len=6, max_out_per_template=5000, max_len=64):
    """Yield ``(candidate, score)`` pairs for one template, best score first.

    Slots are filled left to right. After each slot the partial strings are
    ranked by ``partial_score`` and only the best ``prune_beam`` survive.

    A slot with a declared length (``WORD5``, ``DIGITS2``) only accepts tokens
    of exactly that length, so every candidate conforms to the template. If two
    token combinations spell the same string, only the higher-scoring one is
    kept; duplicates therefore neither occupy beam slots nor appear twice in
    the output.

    Args:
        tpl: Template string understood by ``parse_template``.
        topk_per_slot: Maximum number of tokens tried per slot.
        prune_beam: Maximum number of partial strings kept after each slot.
        min_len: Finished candidates shorter than this are dropped.
        max_out_per_template: Maximum number of candidates yielded.
        max_len: Partial strings longer than this are discarded.

    Yields:
        ``(candidate, score)`` tuples in descending score order.
    """
    # Partial string -> best score seen for it. Dict order is rank order.
    beam = {"": 0.0}
    for kind, length in parse_template(tpl):
        if length:
            # Passing None as the cut-off slices the whole ranked pool, so the length
            # filter is applied before the top-k cut instead of after it.
            fitting = [tok for tok in slot_top_choices(kind, None) if len(tok) == length]
            choices = fitting[:topk_per_slot]
        else:
            choices = slot_top_choices(kind, topk_per_slot)

        extended = {}
        for prefix, prefix_score in beam.items():
            for tok in choices:
                cand = prefix + tok
                # prune if too long
                if len(cand) > max_len:
                    continue
                score = partial_score(prefix_score, kind, tok)
                if cand not in extended or score > extended[cand]:
                    extended[cand] = score

        # keep top-K by partial score (the sort is stable, so ties keep generation order)
        ranked = sorted(extended.items(), key=lambda item: item[1], reverse=True)
        beam = dict(ranked[:prune_beam])
        if not beam:
            # No token fits this slot, so the template cannot be realised.
            break

    # The beam is already in descending score order; filtering preserves that order.
    final = [(cand, score) for cand, score in beam.items() if len(cand) >= min_len]
    yield from final[:max_out_per_template]

# Example: stream candidates for the first few templates into one file.
OUT_DET = Path("candidates_det_ge6.txt")
OUT_DET.parent.mkdir(exist_ok=True)
MAX_TOTAL = 200000  # tune: total candidates to write
# Lazily chain the per-template generators; scores are not written out.
det_stream = (
    cand
    for tpl in templates_candidates[:40]  # tune how many templates you try
    for cand, _score in generate_from_template_beam(tpl, topk_per_slot=300, prune_beam=2000, min_len=6, max_out_per_template=2000)
)
written = 0
with OUT_DET.open("w", encoding="utf8") as fout:
    # islice stops pulling from the stream once MAX_TOTAL lines have been written.
    for cand in islice(det_stream, MAX_TOTAL):
        fout.write(cand + "\n")
        written += 1
print(f"Wrote {written} deterministic candidates to {OUT_DET.resolve()}")

In [None]:
# stochastic sampler: sample from empirical frequency weights
def sample_from_counter(counter_list):
    """Draw one token at random, weighted by its count.

    Args:
        counter_list: Sequence of ``(token, count)`` pairs.

    Returns:
        The sampled token, or ``""`` when ``counter_list`` is empty.
    """
    if not counter_list:  # safety check
        return ""
    population = [tok for tok, _ in counter_list]
    grand_total = sum(cnt for _, cnt in counter_list)
    # Normalise counts into probabilities before handing them to random.choices.
    probabilities = [cnt / grand_total for _, cnt in counter_list]
    picked = random.choices(population, weights=probabilities, k=1)
    return picked[0]

# prepare counters for sampling (use top lists)
# count_map is keyed by (slot_type, token), so each count is a single dict lookup;
# scanning count_map.items() for every token would be quadratic. Unknown tokens get 1.
word_counter = [(w, count_map.get(("WORD", w), 1)) for w in words_list]
digit_counter = [(d, count_map.get(("DIGITS", d), 1)) for d in digits_list]
frag_counter_list = [(f, frag_counter[f]) for f in frags_list]

def stochastic_generate_for_template(tpl, n_samples=2000, min_len=6):
    """Sample candidates for one template, weighting tokens by their counts.

    Each sample fills every slot independently. WORD, FRAG and DIGITS tokens
    are drawn via ``sample_from_counter``; any other slot gets one of
    ``!``, ``@``, ``#`` uniformly.

    A slot with a declared length (``WORD5``, ``DIGITS2``) is sampled only from
    tokens of exactly that length, so candidates conform to the template. If no
    token of the declared length exists, the template cannot be realised and an
    empty list is returned.

    Args:
        tpl: Template string understood by ``parse_template``.
        n_samples: Number of draws. Duplicates collapse, so the result may be
            shorter than this.
        min_len: Candidates shorter than this are discarded.

    Returns:
        A list of ``(candidate, score)`` pairs sorted by descending score. The
        score is the sum of ``log(count + 1)`` over the candidate's tokens; when
        a candidate is drawn more than once, its best score is kept.
    """
    slots = parse_template(tpl)
    weighted_pools = {"WORD": word_counter, "FRAG": frag_counter_list, "DIGITS": digit_counter}

    # Resolve each slot's sampling pool once, outside the sampling loop.
    slot_pools = []
    for kind, length in slots:
        pool = weighted_pools.get(kind)  # None marks a symbol slot
        if pool is not None and length:
            pool = [(tok, cnt) for tok, cnt in pool if len(tok) == length]
            if not pool:
                return []
        slot_pools.append(pool)

    results = {}
    for _ in range(n_samples):
        parts = [
            random.choice(["!", "@", "#"]) if pool is None else sample_from_counter(pool)
            for pool in slot_pools
        ]
        cand = "".join(parts)
        if len(cand) < min_len:
            continue
        # score approximate via product of counts (log)
        sc = sum(
            math.log(count_map.get((kind, tok), 1) + 1)
            for tok, (kind, _length) in zip(parts, slots)
        )
        # keep best score per candidate
        if sc > results.get(cand, float("-inf")):
            results[cand] = sc
    # return sorted
    return sorted(results.items(), key=lambda item: item[1], reverse=True)

# Sample every leading template and append its best candidates to one file.
OUT_STO = Path("candidates_sto_ge6.txt")
written = 0
with OUT_STO.open("w", encoding="utf8") as fout:
    for tpl in islice(templates_candidates, 60):
        ranked = stochastic_generate_for_template(tpl, n_samples=3000, min_len=6)
        # Only the 1000 best-scoring candidates per template are written; scores are dropped.
        kept = [cand for cand, _score in ranked[:1000]]
        fout.writelines(cand + "\n" for cand in kept)
        written += len(kept)
print(f"Wrote {written} stochastic candidates to {OUT_STO.resolve()}")