In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/rockyou/rockyou.txt
/kaggle/input/ptl1/transformers/default/1/pass_tsl_model_kaggle.pth


In [1]:
import torch
import torch.nn as nn
import math

class CausalSelfAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

        self.attn_dropout = nn.Dropout(dropout)
        self.proj_dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        B, T, C = x.size()
        q = self.W_q(x).view(B, T, self.num_heads, self.d_k).transpose(1, 2)
        k = self.W_k(x).view(B, T, self.num_heads, self.d_k).transpose(1, 2)
        v = self.W_v(x).view(B, T, self.num_heads, self.d_k).transpose(1, 2)

        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn = self.attn_dropout(scores.softmax(dim=-1))
        out = (attn @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.W_o(out)

class DecoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.attn = CausalSelfAttention(d_model, num_heads, dropout)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask=None):
        x = x + self.attn(self.ln1(x), mask)
        x = x + self.ffn(self.ln2(x))
        return x

class PassTSL(nn.Module):
    def __init__(self, vocab_size, d_model=256, num_heads=4, d_ff=1024, num_layers=6, max_len=512, dropout=0.1):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_len, d_model)
        self.layers = nn.ModuleList([
            DecoderBlock(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ])
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, x):
        B, T = x.shape
        pos = torch.arange(0, T, device=x.device).unsqueeze(0).expand(B, T)
        x = self.token_emb(x) + self.pos_emb(pos)

        # causal mask
        mask = torch.tril(torch.ones(T, T, device=x.device)).unsqueeze(0).unsqueeze(0)
        for layer in self.layers:
            x = layer(x, mask)

        x = self.ln_f(x)
        return self.head(x)  # (B, T, vocab_size)

In [2]:
from tqdm import tqdm
import torch.optim as optim


# save as pass_tsl_toy.py and run in a Python environment with PyTorch installed
import string
import math
import random
from typing import List

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# -----------------------
# 1) Vocabulary / Tokenizer
# -----------------------
# 95 printable ASCII characters (space..~) as in paper; we'll also add specials
PRINTABLE = ''.join([chr(i) for i in range(32, 127)])  # len 95

SPECIAL_TOKENS = ["[PAD]", "[SOS]", "[EOS]", "[UNK]", "[MASK]"]
ALL_TOKENS = SPECIAL_TOKENS + list(PRINTABLE)
VOCAB_SIZE = len(ALL_TOKENS)

stoi = {s: i for i, s in enumerate(ALL_TOKENS)}
itos = {i: s for s, i in stoi.items()}

def encode_password(pw: str, max_len: int = None) -> List[int]:
    """Encode a password string to token ids (no SOS/EOS here)."""
    ids = []
    for ch in pw:
        if ch in PRINTABLE:
            ids.append(stoi[ch])
        else:
            ids.append(stoi["[UNK]"])
    if max_len is not None:
        ids = ids[:max_len]
    return ids

def decode_ids(ids: List[int]) -> str:
    return ''.join(itos[i] for i in ids if itos[i] not in SPECIAL_TOKENS)

# -----------------------
# 2) Dataset
# -----------------------
class PasswordDataset(Dataset):
    def __init__(self, passwords: List[str], max_seq_len: int = 64):
        self.passwords = passwords
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.passwords)

    def __getitem__(self, idx):
        pw = self.passwords[idx]
        ids = encode_password(pw, max_len=self.max_seq_len-2)  # leave space for SOS/EOS
        return torch.tensor(ids, dtype=torch.long)

# -----------------------
# 3) Collate (pad + add SOS/EOS)
# -----------------------
PAD_ID = stoi["[PAD]"]
SOS_ID = stoi["[SOS]"]
EOS_ID = stoi["[EOS]"]

def collate_fn(batch):
    """
    batch: list of LongTensors (raw encoded chars)
    returns: input_ids (B, T), target_ids (B, T)
    where inputs start with SOS and targets are inputs shifted left (next-token)
    """
    B = len(batch)
    lengths = [b.size(0) for b in batch]
    max_len = max(lengths) + 2  # +SOS +EOS

    input_ids = torch.full((B, max_len), PAD_ID, dtype=torch.long)
    target_ids = torch.full((B, max_len), PAD_ID, dtype=torch.long)

    for i, b in enumerate(batch):
        L = b.size(0)
        # inputs: [SOS] + b + [EOS] + PAD...
        input_ids[i, 0] = SOS_ID
        input_ids[i, 1:1+L] = b
        input_ids[i, 1+L] = EOS_ID

        # targets: b + [EOS] (model should predict char1..EOS given inputs)
        target_ids[i, :L+1] = torch.cat([b, torch.tensor([EOS_ID])])

    return input_ids, target_ids  # shapes (B, T), (B, T)


def train_model(model, data_loader, num_epochs=10, learning_rate=0.001, device='mps'):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in tqdm(range(num_epochs)):
        total_loss = 0
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(data_loader)}")

In [3]:
# read the rock-you list
with open("/rockyou.txt", "r", encoding="latin-1") as f:
    passwords = [line.strip() for line in f if 0 < len(line.strip()) <= 30]

In [4]:
dataset = PasswordDataset(passwords, max_seq_len=32)
loader = DataLoader(dataset, batch_size=1000, shuffle=True, collate_fn=collate_fn, num_workers=4)
print("data loader ready...")
model = PassTSL(vocab_size=VOCAB_SIZE, d_model=128, num_heads=4, d_ff=512, num_layers=4, max_len=64, dropout=0.1)
train_model(model, loader, num_epochs=5, learning_rate=0.001, device='cuda')
# save the model
torch.save(model.state_dict(), "pass_tsl_model.pth")



data loader ready...


 20%|██        | 1/5 [21:48<1:27:12, 1308.22s/it]

Epoch 1/5, Loss: 0.9660970462880601


 40%|████      | 2/5 [43:37<1:05:25, 1308.56s/it]

Epoch 2/5, Loss: 0.9407524530312878


 60%|██████    | 3/5 [1:05:21<43:33, 1306.56s/it]

Epoch 3/5, Loss: 0.9350441972436819


 80%|████████  | 4/5 [1:27:07<21:46, 1306.60s/it]

Epoch 4/5, Loss: 0.9318073157336155


100%|██████████| 5/5 [1:48:55<00:00, 1307.12s/it]

Epoch 5/5, Loss: 0.9306190355112842





In [3]:
model = PassTSL(vocab_size=VOCAB_SIZE, d_model=128, num_heads=4, d_ff=512, num_layers=4, max_len=64, dropout=0.1)
model.load_state_dict(torch.load("./pass_tsl_model_kaggle.pth", weights_only=True, map_location=torch.device('cpu')
                            ))
model.eval()

PassTSL(
  (token_emb): Embedding(100, 128)
  (pos_emb): Embedding(64, 128)
  (layers): ModuleList(
    (0-3): 4 x DecoderBlock(
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): CausalSelfAttention(
        (W_q): Linear(in_features=128, out_features=128, bias=True)
        (W_k): Linear(in_features=128, out_features=128, bias=True)
        (W_v): Linear(in_features=128, out_features=128, bias=True)
        (W_o): Linear(in_features=128, out_features=128, bias=True)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (proj_dropout): Dropout(p=0.1, inplace=False)
      )
      (ffn): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): ReLU()
        (2): Linear(in_features=512, out_features=128, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (head): Li

In [23]:

def generate_passwords(model, num_samples=10, min_len=12, max_len=32, 
                       temperature=1.0, top_k=10, device="cuda"):
    model.eval()
    device = torch.device(device if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # results = []
    while True:
        input_ids = torch.tensor([[SOS_ID]], device=device)  # start with [SOS]
        generated = []

        for step in range(max_len):
            logits = model(input_ids)[:, -1, :]  # (1, vocab_size)
            if (len(generated) < min_len):
                logits[0, EOS_ID] = -float("inf")

            # top-k filtering
            if top_k > 0:
                topk_vals, topk_idx = torch.topk(logits, top_k)
                probs = torch.softmax(topk_vals, dim=-1)
                next_id = topk_idx[0, torch.multinomial(probs, 1)]
            else:
                probs = torch.softmax(logits, dim=-1)
                next_id = torch.multinomial(probs, 1)

            next_id = next_id.item()

            if next_id == EOS_ID:
                break

            generated.append(next_id)
            # append next_id to sequence
            input_ids = torch.cat([input_ids, torch.tensor([[next_id]], device=device)], dim=1)

        # enforce min length: if too short, resample
        if len(generated) < min_len:
            continue  

        # decode
        pw = decode_ids(generated)
        # results.append(pw)
        print("trying password: ", pw)
        if (not tryPass(pw)):
            break

    return results


In [26]:
# assuming `model` is trained and stoi/itos defined
import os
passwords = generate_passwords(model, num_samples=20, min_len=12, max_len=20, top_k=20)
print(passwords)

trying password:  rossa4203362002
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
trying password:  hunmuch!!!!!
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
failed decryption
trying password:  2712669630785
fai

KeyboardInterrupt: 

In [13]:
!pip install pycryptodomex

Collecting pycryptodomex
  Downloading pycryptodomex-3.23.0-cp37-abi3-macosx_10_9_universal2.whl.metadata (3.4 kB)
Downloading pycryptodomex-3.23.0-cp37-abi3-macosx_10_9_universal2.whl (2.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: pycryptodomex
Successfully installed pycryptodomex-3.23.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [14]:
from PyPDF2 import PdfReader, PdfWriter # modern package name: pypdf
from Cryptodome.Cipher import AES

In [25]:
foundFiles = []
def tryPass(password):
    count=0
    for dirname, _, filenames in os.walk('../Mock/'):
        totalFiles = len(filenames)
        for filename in filenames:
            if filename[-3:] != 'pdf' or filename in foundFiles:
                count += 1
                continue
            in_path = os.path.join(dirname, filename)
            reader = PdfReader(in_path)
            if reader.is_encrypted:
                try:
                    result = reader.decrypt(password)
                    if (result):
                        with open ("found_pass.txt", 'a') as f:
                            f.write(f"found password: {password} for file: {in_path}\n")
                        foundFiles.append(filename)
                    else:
                        print("failed decryption")
                    # do something with text...
                except Exception as e:
                    print("Could not decrypt PDF:", e)
            else:
                print("PDF not encrypted")
        if count == totalFiles:
            return False
        count = 0
    return True


In [24]:
foundFiles

[]

In [18]:
with open ("found_pass.txt", 'a') as f:
    f.write(f"found password: dummay for file: dummayfile")

with open ("found_pass.txt", 'r') as f:
    lines = f.readlines()
    print(lines)

['found password: dummay for file: dummayfilefound password: dummay for file: dummayfile']


In [19]:
!rm -rf found_pass.txt