In [33]:

import re
import math
import json
from pathlib import Path
from collections import Counter, defaultdict
from typing import List, Tuple
from IPython.display import display, Markdown


In [34]:
# Cell A — Fetch the NLTK "words" corpus and build the English lookup vocabulary (run once)
import nltk
nltk.download("words")

from nltk.corpus import words as nltk_words

# Lowercased set: membership tests against it are used to tell WORD runs from FRAG runs.
ENGLISH_VOCAB = {entry.lower() for entry in nltk_words.words()}
len(ENGLISH_VOCAB)


[nltk_data] Downloading package words to /Users/twochar/nltk_data...
[nltk_data]   Package words is already up-to-date!


234377

In [35]:
# Cell 2 — Tokenizer + optional leet normalization helper

# Translation table for leet-speak: each look-alike digit/symbol maps to the letter it stands for.
LEET_MAP = str.maketrans(dict(zip("013457@$!", "oleastasi")))

def leet_normalize(s: str) -> str:
    """Replace leet characters in `s` using LEET_MAP, then lowercase the result."""
    translated = s.translate(LEET_MAP)
    return translated.lower()

def classify_run(r: str, do_leet: bool = False, use_vocab: bool = True):
    """
    Map one run of characters to (slot_type, token_for_counts, token_for_template).

    - all-digit run  -> ("DIGITS", r, "DIGITS<len>")
    - all-letter run -> token is lowercased (and passed through leet_normalize when do_leet is set);
      ("WORD", token, "WORD<len>") if use_vocab is on, the token has length >= 3 and is in
      ENGLISH_VOCAB, otherwise ("FRAG", token, "FRAG")
    - anything else  -> ("SYMBOL", r, "SYMBOL"), with r left untouched

    NOTE(review): only all-letter runs reach the leet step, and LEET_MAP has digit/symbol keys
    only, so do_leet does not alter the token here — confirm whether that is intended.
    """
    if r.isdigit():
        return "DIGITS", r, f"DIGITS{len(r)}"

    if not r.isalpha():
        # neither purely digits nor purely letters: symbol / punctuation
        return "SYMBOL", r, "SYMBOL"

    token_norm = leet_normalize(r.lower()) if do_leet else r.lower()
    # WORD requires vocab lookup enabled, a minimum length of 3, and a dictionary hit
    is_word = use_vocab and len(token_norm) >= 3 and token_norm in ENGLISH_VOCAB
    if is_word:
        return "WORD", token_norm, f"WORD{len(token_norm)}"
    return "FRAG", token_norm, "FRAG"

def tokenize(password: str, do_leet: bool = False, use_vocab: bool = True) -> Tuple[List[str], str]:
    """
    Break a password into letter / digit / other runs and classify each one.

    Surrounding whitespace is stripped first. Returns (tokens_for_counts, template):
    - tokens_for_counts: the normalized token of every run, in order (as produced by classify_run)
    - template: the runs' slot descriptors (WORD<N>, DIGITS<N>, SYMBOL, FRAG) joined with "|"
    """
    stripped = password.strip()
    classified = [
        classify_run(run, do_leet=do_leet, use_vocab=use_vocab)
        for run in re.findall(r'[A-Za-z]+|\d+|[^A-Za-z\d]+', stripped)
    ]
    tokens_for_counts = [token for _, token, _ in classified]
    template = "|".join(descriptor for _, _, descriptor in classified)
    return tokens_for_counts, template



In [36]:
class PCFGLite:
    """
    Lightweight PCFG-style password model.

    Counts templates (sequences of slot descriptors such as "WORD4|DIGITS2") and the
    tokens that fill each slot type, then scores a password as the sum of the
    add-alpha smoothed log-probabilities of its template and of each of its tokens.
    """

    # Same run-splitting pattern as tokenize(): letter runs, digit runs, everything else.
    _RUN_PATTERN = re.compile(r'[A-Za-z]+|\d+|[^A-Za-z\d]+')

    def __init__(self, alpha: float = 1.0, do_leet: bool = False):
        self.template_counts = Counter()         # template string -> number of passwords
        self.slot_counts = defaultdict(Counter)  # slot_type -> Counter(token)
        self.total_templates = 0                 # number of passwords counted so far
        self.alpha = float(alpha)                # additive smoothing constant
        self.do_leet = do_leet                   # forwarded to classify_run

    def _parse(self, password: str, use_vocab: bool):
        """
        Split and classify a password once; return (template, [(slot_type, token), ...]).

        Mirrors tokenize(): the password is stripped before splitting, so the template and
        the slot tokens always describe the same runs.
        """
        classified = [
            classify_run(run, do_leet=self.do_leet, use_vocab=use_vocab)
            for run in self._RUN_PATTERN.findall(password.strip())
        ]
        template = "|".join(descriptor for _, _, descriptor in classified)
        return template, [(slot, token) for slot, token, _ in classified]

    @staticmethod
    def _log(p: float) -> float:
        """Natural log that yields -inf for a zero probability (possible when alpha == 0)."""
        return math.log(p) if p > 0 else float("-inf")

    def fit_list(self, pw_list, max_samples: int = None, verbose: bool = True, use_vocab: bool = True):
        """
        Fit from an iterable/list of plaintext passwords.

        If pw_list is an iterator/generator, it is consumed in streaming fashion.
        max_samples: stop once this many items have been read from pw_list
                     (None = no limit; 0 = read nothing). Empty items count towards the limit.
        verbose:     display progress every 500000 items and a final summary.
        use_vocab:   forwarded to classify_run (WORD vs FRAG decision).
        """
        for i, pw in enumerate(pw_list):
            # `is not None` so that an explicit limit of 0 is honoured
            if max_samples is not None and i >= max_samples:
                break
            if not pw:
                continue
            template, slots = self._parse(pw, use_vocab)
            self.template_counts[template] += 1
            self.total_templates += 1
            for slot, token in slots:
                self.slot_counts[slot][token] += 1

            # optional: progress print for very large datasets
            if verbose and (i + 1) % 500000 == 0:
                display(Markdown(f"Trained on {i+1} passwords..."))

        if verbose:
            display(Markdown(f"**Trained on {self.total_templates} templates. Unique templates: {len(self.template_counts)}**"))

    def fit_file(self, filepath: str, max_lines: int = None, use_vocab: bool = True):
        """
        Fit from a text file with one password per line (decoded as latin-1).

        max_lines: read at most this many lines (None = whole file; 0 = none).
        Raises FileNotFoundError if filepath does not exist.
        """
        p = Path(filepath)
        if not p.exists():
            raise FileNotFoundError(filepath)

        # stream lines to avoid loading full file into memory
        def iter_lines():
            with p.open("r", encoding="latin-1", errors="ignore") as f:
                for i, line in enumerate(f):
                    if max_lines is not None and i >= max_lines:
                        break
                    yield line.rstrip("\n\r")

        self.fit_list(iter_lines(), max_samples=None, verbose=True, use_vocab=use_vocab)

    def template_prob(self, template: str) -> float:
        """Add-alpha smoothed probability of a template; one extra vocabulary slot is reserved for unseen templates."""
        V = len(self.template_counts)
        return (self.template_counts[template] + self.alpha) / (self.total_templates + self.alpha * (V + 1))

    def slot_token_prob(self, slot_type: str, token: str) -> float:
        """Add-alpha smoothed probability of `token` within `slot_type` (an unknown slot type is treated as empty)."""
        counter = self.slot_counts.get(slot_type, Counter())
        total = sum(counter.values())
        V = len(counter)
        return (counter[token] + self.alpha) / (total + self.alpha * (V + 1))

    def score(self, password: str, use_vocab: bool = True) -> float:
        """
        Return natural-log probability score (higher = more likely under model).

        The password is parsed exactly as during training. Returns -inf when any factor
        has probability zero (only possible with alpha == 0).
        """
        template, slots = self._parse(password, use_vocab)
        logp = self._log(self.template_prob(template))
        for slot, token in slots:
            logp += self._log(self.slot_token_prob(slot, token))
        return logp

    def top_templates(self, n=30):
        """Return the n most frequent (template, count) pairs."""
        return self.template_counts.most_common(n)

    def top_tokens(self, slot_type: str, n=30):
        """Return the n most frequent (token, count) pairs for slot_type (empty list for an unknown slot type)."""
        return self.slot_counts.get(slot_type, Counter()).most_common(n)

    def snapshot(self, top_templates_n=200, top_words_n=500, top_digits_n=200):
        """Return a JSON-serializable summary: totals plus the top templates, WORD tokens and DIGITS tokens."""
        out = {
            "total_templates": self.total_templates,
            "unique_templates": len(self.template_counts),
            "top_templates": self.top_templates(top_templates_n),
            "top_words": self.top_tokens("WORD", top_words_n),
            "top_digits": self.top_tokens("DIGITS", top_digits_n),
        }
        return out

In [37]:
# Cell 4 — Demo run: train on DATA_PATH when it is set, otherwise on the small synthetic sample.
# Set DATA_PATH = None (or "") to skip the file and use sample_pw below.

DATA_PATH = "Data-Breach/rockyou.txt"  # <-- path to a one-password-per-line file, or None

# Small synthetic sample (for quick demo); used by the fallback branch below
sample_pw = [
    "password", "123456", "qwerty", "letmein", "password1", "admin123", "iloveyou", "abc123",
    "sunshine", "passw0rd", "P@ssw0rd", "john1987", "alice2020!", "dragon", "welcome1", "football"
]

model = PCFGLite(alpha=1.0, do_leet=True)

if DATA_PATH:
    print("Loading from file:", DATA_PATH)
    model.fit_file(DATA_PATH)  # pass max_lines=<int> to train on only the first lines of the file
else:
    print("No DATA_PATH provided — running demo on synthetic sample.")
    # Actually train on the sample; otherwise the tables below are printed for an empty model.
    model.fit_list(sample_pw, max_samples=None)

# show top templates and top tokens
display(Markdown("### Top templates"))
for t, c in model.top_templates(20):
    print(f"{t:25} {c}")

display(Markdown("### Top WORD tokens"))
for w, c in model.top_tokens("WORD", 15):
    print(f"{w:20} {c}")

display(Markdown("### Top DIGIT runs"))
for d, c in model.top_tokens("DIGITS", 15):
    print(f"{d:10} {c}")


Loading from file: Data-Breach/rockyou.txt


Trained on 500000 passwords...

Trained on 1000000 passwords...

Trained on 1500000 passwords...

Trained on 2000000 passwords...

Trained on 2500000 passwords...

Trained on 3000000 passwords...

Trained on 3500000 passwords...

Trained on 4000000 passwords...

Trained on 4500000 passwords...

Trained on 5000000 passwords...

Trained on 5500000 passwords...

Trained on 6000000 passwords...

Trained on 6500000 passwords...

Trained on 7000000 passwords...

Trained on 7500000 passwords...

Trained on 8000000 passwords...

Trained on 8500000 passwords...

Trained on 9000000 passwords...

Trained on 9500000 passwords...

Trained on 10000000 passwords...

Trained on 10500000 passwords...

Trained on 11000000 passwords...

Trained on 11500000 passwords...

Trained on 12000000 passwords...

Trained on 12500000 passwords...

Trained on 13000000 passwords...

Trained on 13500000 passwords...

Trained on 14000000 passwords...

**Trained on 14344390 templates. Unique templates: 29295**

### Top templates

FRAG                      4051390
FRAG|DIGITS2              1440243
FRAG|DIGITS1              886830
FRAG|DIGITS4              755828
FRAG|DIGITS3              515233
DIGITS7                   487437
DIGITS10                  478224
DIGITS8                   428306
DIGITS6                   390546
DIGITS9                   307540
FRAG|DIGITS6              212393
FRAG|DIGITS5              128906
WORD4|DIGITS4             125077
FRAG|DIGITS1|FRAG         123929
FRAG|SYMBOL               118006
DIGITS11                  107864
WORD6|DIGITS2             102431
WORD5|DIGITS2             100013
DIGITS1|FRAG              99932
DIGITS4|FRAG              96901


### Top WORD tokens

love                 23062
ever                 18744
life                 16089
eva                  14720
yahoo                9746
baby                 8523
may                  7791
angel                7092
sexy                 6318
alex                 5420
pink                 4977
june                 4977
sam                  4760
jan                  4684
girl                 4568


### Top DIGIT runs

1          699853
2          272123
4          238996
3          221212
123        146979
7          129316
12         121734
5          114328
0          105195
8          100934
13         90319
6          85355
9          82820
11         78797
23         67325


In [38]:
# Cell 5 — Score a handful of example passwords with the trained model
examples = ["password", "P@ssw0rd", "john1987", "unique!X9", "iloveyou"]
display(Markdown("### Example scores (higher = more likely under model)"))
for ex in examples:
    ex_score = model.score(ex)  # natural-log probability under `model`
    print(f"{ex:15}  score = {ex_score:.4f}")


### Example scores (higher = more likely under model)

password         score = -14.1072
P@ssw0rd         score = -42.6932
john1987         score = -18.3355
unique!X9        score = -34.2135
iloveyou         score = -10.4604


In [39]:
# Cell 6 — Write the model snapshot to a JSON file for inspection
out_path = Path("pcfg_snapshot_notebook.json")
snap = model.snapshot()
snapshot_json = json.dumps(snap)
out_path.write_text(snapshot_json)
display(Markdown(f"Snapshot saved to **{out_path.resolve()}** — contains top templates and top tokens."))


Snapshot saved to **/Users/twochar/vS/Password-Decryption/pcfg_snapshot_notebook.json** — contains top templates and top tokens.

In [40]:
# Letter runs found in ENGLISH_VOCAB are counted under the WORD slot
display(Markdown("### Top Real Words (NLTK vocab)"))
top_words = model.top_tokens("WORD", 30)
for w, c in top_words:
    print(f"{w:20} {c}")

# Remaining letter runs are counted under the FRAG slot
display(Markdown("### Top Fragments (non-dictionary)"))
top_frags = model.top_tokens("FRAG", 30)
for f, c in top_frags:
    print(f"{f:20} {c}")


### Top Real Words (NLTK vocab)

love                 23062
ever                 18744
life                 16089
eva                  14720
yahoo                9746
baby                 8523
may                  7791
angel                7092
sexy                 6318
alex                 5420
pink                 4977
june                 4977
sam                  4760
jan                  4684
girl                 4568
july                 4555
you                  4388
blue                 4359
chris                4325
star                 4139
red                  4074
mike                 4058
the                  3975
jay                  3835
mar                  3785
man                  3672
ash                  3644
april                3629
john                 3614
rock                 3610


### Top Fragments (non-dictionary)

a                    89281
m                    62740
s                    57834
k                    53087
j                    49701
b                    48169
d                    46586
c                    45580
r                    45260
l                    44758
n                    41989
t                    38440
e                    36806
p                    30604
com                  30434
g                    28065
i                    26372
h                    26309
x                    23275
u                    22284
f                    21639
y                    20971
me                   19312
o                    19228
v                    18234
w                    17447
z                    14557
hotmail              13419
q                    8750
my                   7087


In [41]:
# Tiny end-to-end check: fit on six passwords, then show each one's template, tokens and score
sample_pw = ["password", "passw0rd", "P@ssw0rd", "john1987", "superman2020!", "aaaa1111bbbb"]
m = PCFGLite(alpha=1.0, do_leet=True)
m.fit_list(sample_pw, verbose=True)
for pw in sample_pw:
    toks, tpl = tokenize(pw, do_leet=True)
    pw_score = m.score(pw)
    print(pw, "=>", tpl, toks, "score:", pw_score)
top_word_tokens = m.top_tokens("WORD", 20)
print("Top WORD tokens:", top_word_tokens)
top_frag_tokens = m.top_tokens("FRAG", 20)
print("Top FRAG tokens:", top_frag_tokens)


**Trained on 6 templates. Unique templates: 6**

password => WORD8 ['password'] score: -3.1245651453969594
passw0rd => FRAG|DIGITS1|FRAG ['passw', '0', 'rd'] score: -6.56213017122999
P@ssw0rd => FRAG|SYMBOL|FRAG|DIGITS1|FRAG ['p', '@', 'ssw', '0', 'rd'] score: -9.424331052159458
john1987 => WORD4|DIGITS4 ['john', '1987'] score: -4.7340030578310595
superman2020! => WORD8|DIGITS4|SYMBOL ['superman', '2020', '!'] score: -5.650293789705215
aaaa1111bbbb => FRAG|DIGITS4|FRAG ['aaaa', '1111', 'bbbb'] score: -7.373060387446318
Top WORD tokens: [('password', 1), ('john', 1), ('superman', 1)]
Top FRAG tokens: [('rd', 2), ('passw', 1), ('p', 1), ('ssw', 1), ('aaaa', 1), ('bbbb', 1)]


In [42]:
# Configuration for the long-password snapshot: a model trained only on passwords
# whose length is >= MIN_LEN, saved as JSON to OUT_LONG_SNAP.
MIN_LEN = 12
OUT_LONG_SNAP = Path("pcfg_snapshot_long_ge12.json")
MAX_LINES = None  # optional: set to an int to limit how many lines to process for testing

def iter_lines_minlen(path: str, min_len: int = 12, max_lines: int = None):
    """
    Stream lines from `path`, yielding only non-empty passwords whose length >= min_len.

    path:      text file with one password per line (decoded as latin-1).
    min_len:   minimum length, measured after removing the trailing CR/LF characters.
    max_lines: stop after reading this many lines of the file, whether or not they were
               yielded (None = whole file; 0 = read nothing).

    This is a generator, so FileNotFoundError for a missing `path` is raised when
    iteration starts, not when the function is called.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(path)
    with p.open("r", encoding="latin-1", errors="ignore") as f:
        for i, line in enumerate(f):
            # `is not None` so that an explicit limit of 0 is honoured
            if max_lines is not None and i >= max_lines:
                break
            pw = line.rstrip("\n\r")
            if pw and len(pw) >= min_len:
                yield pw

# Train a separate model on long passwords only, then save its snapshot
model_long = PCFGLite(alpha=1.0, do_leet=model.do_leet)

try:
    # The generator is handed straight to fit_list, so the file is streamed line by line.
    pw_iter = iter_lines_minlen(DATA_PATH, min_len=MIN_LEN, max_lines=MAX_LINES)
    print(f"Training model_long on passwords from {DATA_PATH} with length >= {MIN_LEN} ...")
    model_long.fit_list(pw_iter, max_samples=None, verbose=True, use_vocab=True)
except FileNotFoundError as e:
    print(f"Data file not found at {DATA_PATH}: {e}. Falling back to demo sample_pw (if available).")
    # sample_pw may not be defined in this kernel; a missing name means "no fallback data"
    long_sample = [pw for pw in globals().get("sample_pw", []) if len(pw) >= MIN_LEN]
    if not long_sample:
        raise RuntimeError("No input data available: set DATA_PATH to a valid file or provide sample_pw.")
    model_long.fit_list(long_sample, max_samples=None, verbose=True, use_vocab=True)

# Snapshot, record which length filter was applied, and save
snap_long = model_long.snapshot()
snap_long.update(filter={"min_len": MIN_LEN})
out_text = json.dumps(snap_long)
OUT_LONG_SNAP.write_text(out_text, encoding="utf8")
display(Markdown(f"Saved long-password snapshot to **{OUT_LONG_SNAP.resolve()}**"))
print(f"Total long-password templates: {snap_long['total_templates']}; unique templates: {snap_long['unique_templates']}")


Training model_long on passwords from Data-Breach/rockyou.txt with length >= 12 ...


Trained on 500000 passwords...

Trained on 1000000 passwords...

Trained on 1500000 passwords...

**Trained on 1573606 templates. Unique templates: 25729**

Saved long-password snapshot to **/Users/twochar/vS/Password-Decryption/pcfg_snapshot_long_ge12.json**

Total long-password templates: 1573606; unique templates: 25729


In [45]:
# Extract FRAG tokens (length >= 3) from ALL passwords in DATA_PATH (streaming)
from collections import Counter
import json
from pathlib import Path

MIN_TOKEN_LEN = 3  # FRAG tokens shorter than this are not counted
OUT_JSON = Path("frag_tokens_all_len_ge3.json")  # summary stats plus the full (token, count) list
OUT_TSV = Path("frag_tokens_all_len_ge3.tsv")  # tab-separated token/count table

def iter_all_lines(path: str):
    """
    Lazily yield every line of `path` (decoded as latin-1) without its trailing CR/LF characters.

    Empty lines are yielded as empty strings. Being a generator, it raises
    FileNotFoundError for a missing `path` only once iteration starts.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(path)
    with source.open("r", encoding="latin-1", errors="ignore") as handle:
        yield from (raw.rstrip("\n\r") for raw in handle)

# Count every FRAG token of length >= MIN_TOKEN_LEN across all passwords in DATA_PATH
frag_counter = Counter()
processed = 0  # number of non-empty passwords seen

for i, pw in enumerate(iter_all_lines(DATA_PATH)):
    if not pw:
        continue
    runs = re.findall(r'[A-Za-z]+|\d+|[^A-Za-z\d]+', pw)
    for r in runs:
        slot, token_for_counts, _ = classify_run(r, do_leet=False, use_vocab=True)
        if slot == "FRAG" and len(token_for_counts) >= MIN_TOKEN_LEN:
            frag_counter[token_for_counts] += 1
    processed += 1
    if (i + 1) % 1_000_000 == 0:  # progress print every million
        print(f"Processed {i+1:,} passwords...")

# Results
total_occ = sum(frag_counter.values())
unique_tokens = len(frag_counter)
# Sort once and reuse for the JSON dump, the TSV and the preview.
frag_ranked = frag_counter.most_common()
print(f"Processed {processed:,} passwords.")
print(f"Found {unique_tokens:,} unique FRAG tokens (total occurrences: {total_occ:,}).")

# Save JSON
# ensure_ascii=False writes non-ASCII letters verbatim, so the encoding is pinned to
# utf8 (same as the TSV) instead of depending on the platform default.
OUT_JSON.write_text(json.dumps({
    "N_lines": processed,
    "min_token_len": MIN_TOKEN_LEN,
    "total_occurrences": total_occ,
    "unique_tokens": unique_tokens,
    "frag_tokens": frag_ranked
}, ensure_ascii=False), encoding="utf8")
print("Wrote JSON:", OUT_JSON.resolve())

# Save TSV
with OUT_TSV.open("w", encoding="utf8") as f:
    f.write("token\tcount\n")
    for token, cnt in frag_ranked:
        f.write(f"{token}\t{cnt}\n")
print("Wrote TSV:", OUT_TSV.resolve())

# Quick preview
print("\nTop 50 FRAG tokens (len >= 3):")
for token, cnt in frag_ranked[:50]:
    print(f"{token:20} {cnt}")


Processed 1,000,000 passwords...
Processed 2,000,000 passwords...
Processed 3,000,000 passwords...
Processed 4,000,000 passwords...
Processed 5,000,000 passwords...
Processed 6,000,000 passwords...
Processed 7,000,000 passwords...
Processed 8,000,000 passwords...
Processed 9,000,000 passwords...
Processed 10,000,000 passwords...
Processed 11,000,000 passwords...
Processed 12,000,000 passwords...
Processed 13,000,000 passwords...
Processed 14,000,000 passwords...
Processed 14,344,390 passwords.
Found 5,212,697 unique FRAG tokens (total occurrences: 9,389,113).
Wrote JSON: /Users/twochar/vS/Password-Decryption/frag_tokens_all_len_ge3.json
Wrote TSV: /Users/twochar/vS/Password-Decryption/frag_tokens_all_len_ge3.tsv

Top 50 FRAG tokens (len >= 3):
com                  30434
hotmail              13419
luv                  5556
feb                  4080
nov                  3993
abc                  3934
lil                  3906
dec                  3814
mom                  3547
oct       

In [1]:
# in other_notebook.ipynb (after training finished)
# Requires a trained `model` (PCFGLite) to exist in the current kernel.
import pickle
model_path = "pcfg_model.pkl"
# Build the payload BEFORE opening the file: opening with "wb" truncates it, so if `model`
# is missing the NameError must happen first and leave any existing pickle intact.
model_payload = {
    "template_counts": model.template_counts,
    # inner Counters are stored as plain dicts
    "slot_counts": {k: dict(v) for k, v in model.slot_counts.items()},
    "total_templates": model.total_templates,
    "alpha": model.alpha,
    "do_leet": model.do_leet
}
with open(model_path, "wb") as f:
    pickle.dump(model_payload, f)
print("Saved model to", model_path)


NameError: name 'model' is not defined