In [12]:

import re
import math
import json
from pathlib import Path
from collections import Counter, defaultdict
from typing import List, Tuple
from IPython.display import display, Markdown


In [13]:
# Cell A — Install and import NLTK resources (run once)
import nltk
nltk.download("words")

from nltk.corpus import words as nltk_words

# Build vocab set for fast lookup
ENGLISH_VOCAB = set(w.lower() for w in nltk_words.words())
len(ENGLISH_VOCAB)


[nltk_data] Downloading package words to /Users/twochar/nltk_data...
[nltk_data]   Package words is already up-to-date!


234377

In [14]:
# Cell 2 — Tokenizer + optional leet normalization helper

LEET_MAP = str.maketrans({
    '0': 'o', '1': 'l', '3': 'e', '4': 'a', '5': 's', '7': 't', '@': 'a', '$': 's', '!': 'i'
})

def leet_normalize(s: str) -> str:
    """Return a leet-normalized version of s (lowercased)."""
    return s.translate(LEET_MAP).lower()

def tokenize(password: str, do_leet=False, use_vocab=True) -> Tuple[List[str], str]:
    """
    Split password into runs: letters, digits, symbols.
    - If use_vocab=True, only count alphabetic tokens that are in the NLTK vocab (len>=3).
    - Otherwise they go into FRAG bucket.
    """
    pw = password.strip()
    runs = re.findall(r'[A-Za-z]+|\d+|[^A-Za-z\d]+', pw)

    slots = []
    for r in runs:
        if r.isdigit():
            slots.append(("DIGITS", r))
        elif r.isalpha():
            token = r.lower()
            if use_vocab and len(token) >= 3 and token in ENGLISH_VOCAB:
                slots.append(("WORD", token))
            else:
                slots.append(("FRAG", token))
        else:
            slots.append(("SYMBOL", r))

    template = "|".join(f"{t}{len(tok) if t not in ['SYMBOL','FRAG'] else t}" 
                        for t, tok in slots)
    return [tok for _, tok in slots], template



In [15]:
# Cell 3 — PCFGLite class
class PCFGLite:
    def __init__(self, alpha: float = 1.0, do_leet: bool = False):
        self.template_counts = Counter()
        self.slot_counts = defaultdict(Counter)  # slot_type -> Counter(token)
        self.total_templates = 0
        self.alpha = float(alpha)
        self.do_leet = do_leet

    def fit_list(self, pw_list: List[str], max_samples: int = None, verbose: bool = True):
        """
        Fit from a list of plaintext passwords.
        Use max_samples to limit for quick tests.
        """
        for i, pw in enumerate(pw_list):
            if max_samples and i >= max_samples:
                break
            if not pw:
                continue
            tokens, template = tokenize(pw, do_leet=self.do_leet)
            self.template_counts[template] += 1
            self.total_templates += 1
            # map tokens to type and update counters
            runs = re.findall(r'[A-Za-z]+|\d+|[^A-Za-z\d]+', pw)
            for r in runs:
                if r.isdigit():
                    self.slot_counts["DIGITS"][r] += 1
                elif r.isalpha():
                    token = r.lower()
                    if self.do_leet:
                        token = leet_normalize(token)

                    if len(token) >= 3 and token in ENGLISH_VOCAB:
                        self.slot_counts["WORD"][token] += 1
                    else:
                        self.slot_counts["FRAG"][token] += 1

                else:
                    self.slot_counts["SYMBOL"][r] += 1
        if verbose:
            display(Markdown(f"**Trained on {self.total_templates} templates. Unique templates: {len(self.template_counts)}**"))

    def fit_file(self, filepath: str, max_lines: int = None):
        p = Path(filepath)
        if not p.exists():
            raise FileNotFoundError(filepath)
        with p.open("r", encoding="latin-1", errors="ignore") as f:
            lines = (line.rstrip("\n\r") for line in f)
            self.fit_list(list(lines), max_samples=max_lines)

    def template_prob(self, template: str) -> float:
        V = len(self.template_counts)
        return (self.template_counts[template] + self.alpha) / (self.total_templates + self.alpha * (V + 1))

    def slot_token_prob(self, slot_type: str, token: str) -> float:
        counter = self.slot_counts.get(slot_type, Counter())
        total = sum(counter.values())
        V = len(counter)
        return (counter[token] + self.alpha) / (total + self.alpha * (V + 1))

    def score(self, password: str) -> float:
        """Return natural-log probability score (higher = more likely under model)."""
        tokens, template = tokenize(password, do_leet=self.do_leet)
        # If the template was never seen, template_prob still returns smoothed value
        logp = math.log(self.template_prob(template))
        runs = re.findall(r'[A-Za-z]+|\d+|[^A-Za-z\d]+', password)
        for r in runs:
            if r.isdigit():
                p = self.slot_token_prob("DIGITS", r)
            elif r.isalpha():
                p = self.slot_token_prob("WORD", r.lower())
            else:
                p = self.slot_token_prob("SYMBOL", r)
            logp += math.log(p)
        return logp

    def top_templates(self, n=30):
        return self.template_counts.most_common(n)

    def top_tokens(self, slot_type: str, n=30):
        return self.slot_counts.get(slot_type, Counter()).most_common(n)

    def snapshot(self, top_templates_n=200, top_words_n=500, top_digits_n=200):
        out = {
            "total_templates": self.total_templates,
            "unique_templates": len(self.template_counts),
            "top_templates": self.top_templates(top_templates_n),
            "top_words": self.top_tokens("WORD", top_words_n),
            "top_digits": self.top_tokens("DIGITS", top_digits_n),
        }
        return out


In [22]:
# Cell 4 — Demo run: use a small synthetic sample if you don't want to load rockyou now.
# Configure DATA_PATH = "/path/to/rockyou.txt" to load real data. For demonstration we'll use a small list.

DATA_PATH = "Data-Breach/rockyou.txt"  # <-- set to path string if you have the file accessible

# Small synthetic sample (for quick demo)
sample_pw = [
    "password", "123456", "qwerty", "letmein", "password1", "admin123", "iloveyou", "abc123",
    "sunshine", "passw0rd", "P@ssw0rd", "john1987", "alice2020!", "dragon", "welcome1", "football"
]

model = PCFGLite(alpha=1.0, do_leet=True)

if DATA_PATH:
    print("Loading from file:", DATA_PATH)
    model.fit_file(DATA_PATH, max_lines=1000000)  # change max_lines or remove it for full file
else:
    print("No DATA_PATH provided — running demo on synthetic sample.")
    model.fit_list(sample_pw, max_samples=None)

# show top templates and top tokens
display(Markdown("### Top templates"))
for t, c in model.top_templates(20):
    print(f"{t:25} {c}")

display(Markdown("### Top WORD tokens"))
for w, c in model.top_tokens("WORD", 30):
    print(f"{w:20} {c}")

display(Markdown("### Top DIGIT runs"))
for d, c in model.top_tokens("DIGITS", 30):
    print(f"{d:10} {c}")


Loading from file: Data-Breach/rockyou.txt


**Trained on 999999 templates. Unique templates: 1546**

### Top templates

FRAGFRAG                  401146
FRAGFRAG|DIGITS2          87727
DIGITS6                   75055
FRAGFRAG|DIGITS1          74558
DIGITS8                   30286
WORD5|DIGITS2             28224
WORD6|DIGITS2             26894
WORD4|DIGITS2             23459
FRAGFRAG|DIGITS3          19225
FRAGFRAG|DIGITS4          18222
WORD7|DIGITS2             12067
WORD6|DIGITS1             10005
WORD5|DIGITS1             9819
WORD4|DIGITS4             8589
WORD6                     8382
WORD5|DIGITS3             7588
DIGITS7                   7216
WORD7|DIGITS1             7122
DIGITS5                   6869
WORD4|DIGITS3             6390


### Top WORD tokens

love                 2255
life                 1700
ever                 1588
may                  1369
eva                  1072
june                 961
july                 828
sexy                 776
angel                770
baby                 720
pink                 632
chris                593
jan                  565
march                498
princess             472
star                 459
mike                 443
blue                 442
password             439
red                  438
jesus                417
monkey               404
alex                 394
john                 390
james                387
david                374
marie                367
april                365
soccer               361
you                  349


### Top DIGIT runs

1          65145
2          17549
123        16364
4          13750
3          12296
12         10968
13         8214
7          8147
5          7106
11         6873
22         5948
23         5926
01         5764
21         5693
07         5586
14         5449
8          5413
10         5318
06         4947
69         4821
08         4779
15         4769
6          4743
16         4479
0          4291
9          4280
18         4159
17         3967
05         3946
24         3893


In [23]:
# Cell 5 — Scoring examples & usage
examples = ["password", "P@ssw0rd", "john1987", "unique!X9", "iloveyou"]
display(Markdown("### Example scores (higher = more likely under model)"))
for ex in examples:
    print(f"{ex:15}  score = {model.score(ex):.4f}")


### Example scores (higher = more likely under model)

password         score = -11.8559
P@ssw0rd         score = -56.3306
john1987         score = -18.2752
unique!X9        score = -41.4447
iloveyou         score = -13.4324


In [27]:
# Cell 6 — Save snapshot for inspection (JSON)
snap = model.snapshot()
out_path = Path("pcfg_snapshot_notebook.json")
out_path.write_text(json.dumps(snap))
display(Markdown(f"Snapshot saved to **{out_path.resolve()}** — contains top templates and top tokens."))


Snapshot saved to **/Users/twochar/vS/Password-Decryption/pcfg_snapshot_notebook.json** — contains top templates and top tokens.

In [25]:
display(Markdown("### Top Real Words (NLTK vocab)"))
for w, c in model.top_tokens("WORD", 30):
    print(f"{w:20} {c}")

display(Markdown("### Top Fragments (non-dictionary)"))
for f, c in model.top_tokens("FRAG", 30):
    print(f"{f:20} {c}")


### Top Real Words (NLTK vocab)

love                 2255
life                 1700
ever                 1588
may                  1369
eva                  1072
june                 961
july                 828
sexy                 776
angel                770
baby                 720
pink                 632
chris                593
jan                  565
march                498
princess             472
star                 459
mike                 443
blue                 442
password             439
red                  438
jesus                417
monkey               404
alex                 394
john                 390
james                387
david                374
marie                367
april                365
soccer               361
you                  349


### Top Fragments (non-dictionary)

me                   1213
a                    1139
u                    1125
m                    1091
r                    999
k                    947
s                    945
l                    924
n                    912
d                    813
j                    808
c                    797
i                    761
t                    751
e                    675
b                    661
p                    624
feb                  551
x                    523
g                    474
dec                  464
h                    463
nov                  430
w                    429
my                   412
y                    409
o                    386
f                    360
oct                  355
v                    347
