In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Vocabulary setup
chars = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789`~!@#$%^&*()_+-=[]}{\\|;:'\",<.>/? ")
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}
vocab_size = len(chars)

# Hyperparameters
latent_dim = 64
embedding_dim = 32
hidden_dim = 128
max_length = 32  # Fixed password length
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# VAE Model
class PasswordFeatureVAE(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, latent_dim, feature_dim, max_length):
        super(PasswordFeatureVAE, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder_rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc_mu = nn.Linear(hidden_dim + feature_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim + feature_dim, latent_dim)
        self.decoder_rnn = nn.LSTM(latent_dim + feature_dim, hidden_dim, batch_first=True)  # Include feature dimension
        self.fc_out = nn.Linear(hidden_dim, vocab_size)
        self.max_length = max_length

    def encode(self, password_seq, features):
        embedded = self.embedding(password_seq)
        _, (hidden, _) = self.encoder_rnn(embedded)
        hidden = hidden[-1]
        combined = torch.cat([hidden, features], dim=1)
        mu = self.fc_mu(combined)
        logvar = self.fc_logvar(combined)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z, features):
        z = z.unsqueeze(1).repeat(1, self.max_length, 1)  # Repeat z to match sequence length
        combined_input = torch.cat([z, features.unsqueeze(1).repeat(1, self.max_length, 1)], dim=-1)  # Concatenate z and features
        outputs, _ = self.decoder_rnn(combined_input)
        logits = self.fc_out(outputs)
        return logits

    def forward(self, password_seq, features):
        mu, logvar = self.encode(password_seq, features)
        z = self.reparameterize(mu, logvar)
        logits = self.decode(z, features)  # Pass features along with z to the decoder
        return logits, mu, logvar

# VAE loss function with beta (beta-VAE)
def vae_loss(recon_logits, target, mu, logvar, beta=1.0):
    recon_loss = nn.CrossEntropyLoss()(recon_logits.view(-1, vocab_size), target.view(-1))
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss + beta * kl_loss

# Load and preprocess dataset
df = pd.read_csv("StrongPasswordsOnly.csv")
df = df.dropna()

# Normalize features
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(df.iloc[:, 1:-1])  # Exclude `Password` and `Strength`

# Convert passwords to sequences
password_seqs = [[char_to_idx[c] for c in pw.ljust(max_length)[:max_length]] for pw in df["Password"]]

# Custom dataset class
class PasswordDataset(Dataset):
    def __init__(self, passwords, features):
        self.passwords = torch.tensor(passwords, dtype=torch.long)
        self.features = torch.tensor(features, dtype=torch.float32)

    def __len__(self):
        return len(self.passwords)

    def __getitem__(self, idx):
        return self.passwords[idx], self.features[idx]

dataset = PasswordDataset(password_seqs, normalized_features)
dataloader = DataLoader(dataset, batch_size=4096, shuffle=True)

# Initialize model, loss, optimizer
feature_dim = normalized_features.shape[1]
model = PasswordFeatureVAE(vocab_size=vocab_size, embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                           latent_dim=latent_dim, feature_dim=feature_dim, max_length=max_length).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [2]:
epochs = 10
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    print(f"Epoch {epoch + 1}/{epochs} started.")
    for batch_idx, (password_seq, features) in enumerate(dataloader, start=1):
        password_seq, features = password_seq.to(device), features.to(device)
        optimizer.zero_grad()
        logits, mu, logvar = model(password_seq, features)
        loss = vae_loss(logits, password_seq, mu, logvar, beta=4)  # Experiment with beta
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        
        # Print batch processing status
        print(f"Batch {batch_idx}/{len(dataloader)} processed.", end="\r")
    
    # Print epoch summary
    print(f"Epoch {epoch + 1} completed. Loss: {epoch_loss / len(dataloader):.4f}")

Epoch 1/10 started.
Epoch 1 completed. Loss: 94.3601
Epoch 2/10 started.
Epoch 2 completed. Loss: 3.6489
Epoch 3/10 started.
Epoch 3 completed. Loss: 2.7160
Epoch 4/10 started.
Epoch 4 completed. Loss: 2.5128
Epoch 5/10 started.
Epoch 5 completed. Loss: 2.4368
Epoch 6/10 started.
Epoch 6 completed. Loss: 2.3868
Epoch 7/10 started.
Epoch 7 completed. Loss: 2.3840
Epoch 8/10 started.
Epoch 8 completed. Loss: 2.3621
Epoch 9/10 started.
Epoch 9 completed. Loss: 2.3513
Epoch 10/10 started.
Epoch 10 completed. Loss: 2.3731


In [50]:
# Save model and optimizer state dictionaries
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict()
}, "01-traininggenerator.pth")


In [None]:
import numpy as np
np.save("01-normalizedfeatures.npy", normalized_features)
