In [1]:
!pip install pytorch-lightning
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler, Adam

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

import pandas as pd
import string

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


Extension horovod.torch has not been built: /home/grads/b/bhanu/.local/lib/python3.8/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-38-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.


# Data downloading and pre-processing 

In [3]:
# Uncomment if you are running on new server to download
# !wget https://transfer.sh/6KUxG4/rockyou.txt
import re
data = re.findall('\w+', open('rockyou.txt', encoding='latin-1').read())

all_letters = ["<pad>"] + list(string.printable) + ["<eos>"]
n_letters = len(all_letters)
n_letters

clean_data = []
for strs in data:
  if strs.isascii():
    clean_data.append(strs)

data = clean_data   


--2022-05-03 19:24:52--  https://transfer.sh/6KUxG4/rockyou.txt
Resolving transfer.sh (transfer.sh)... 144.76.136.153, 2a01:4f8:200:1097::2
Connecting to transfer.sh (transfer.sh)|144.76.136.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 139921497 (133M) [text/plain]
Saving to: ‘rockyou.txt’


2022-05-03 19:25:00 (19.1 MB/s) - ‘rockyou.txt’ saved [139921497/139921497]



In [6]:
# String to int and int to str dict and list
stoi = {letter : idx for idx, letter in enumerate(all_letters)}
itos = [letter for idx, letter in enumerate(all_letters)]

# Data loader

In [7]:
class NamesDataset(Dataset):
    def __init__(self, data, stoi, eos_token="<eos>"):
        self.stoi = stoi
        self.eos_token = eos_token
        self.n_tokens = len(self.stoi)
        
        self.names = data


    def __getitem__(self, item):
        name = self.names[item]
        
        input_tensor = torch.tensor([stoi[char] for char in name])
        target_tensor = torch.tensor([stoi[char] for char in list(name[1:])+[self.eos_token]])
        
        item_dict = {
        "name": name,
        "input_tensor": input_tensor,
        "target_tensor": target_tensor}
        
        
        return item_dict

    def __len__(self):
        return len(self.names)

In [8]:
ds = NamesDataset(data, stoi)

ds[0]

{'name': '123456',
 'input_tensor': tensor([2, 3, 4, 5, 6, 7]),
 'target_tensor': tensor([  3,   4,   5,   6,   7, 101])}

# Collate to pad on batch with various length inputs

In [9]:
def collate_fn(data):
    def merge(sequences):
        "https://github.com/yunjey/seq2seq-dataloader/blob/master/data_loader.py"
        
        lengths = [len(seq) for seq in sequences]
        padded_seqs = torch.zeros(len(sequences), max(lengths)).long()
        for i, seq in enumerate(sequences):
            end = lengths[i]
            padded_seqs[i, :end] = seq[:end]
        return padded_seqs, lengths
       
    names = [x["name"] for x in data]          
    
    input_tensors = [x["input_tensor"] for x in data]
    input_tensors, _ = merge(input_tensors)
    
    target_tensors = [x["target_tensor"] for x in data]
    target_tensors, _ = merge(target_tensors)
    
    return names, input_tensors, target_tensors

In [10]:
dl = DataLoader(ds, batch_size=1, collate_fn=collate_fn, shuffle=True)

In [11]:
next(iter(dl))

(['0892273609'],
 tensor([[ 1,  9, 10,  3,  3,  8,  4,  7,  1, 10]]),
 tensor([[  9,  10,   3,   3,   8,   4,   7,   1,  10, 101]]))

# Lightning Module

In [15]:
class NamesDatamodule(pl.LightningDataModule):
    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.data = re.findall('\w+', open('rockyou.txt', encoding='latin-1').read())
        self.all_letters = ["<pad>"] + list(string.printable) + ["<eos>"]
        self.stoi = {letter : idx for idx, letter in enumerate(self.all_letters)}

    def setup(self, stage=None):
        self.train_set = NamesDataset(data, self.stoi)

    def train_dataloader(self):
        return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=True, collate_fn=self.collate_fn)
    
    def collate_fn(self, data):
        def merge(sequences):
            "https://github.com/yunjey/seq2seq-dataloader/blob/master/data_loader.py"

            lengths = [len(seq) for seq in sequences]
            padded_seqs = torch.zeros(len(sequences), max(lengths)).long()
            for i, seq in enumerate(sequences):
                end = lengths[i]
                padded_seqs[i, :end] = seq[:end]
            return padded_seqs, lengths

        names = [x["name"] for x in data]          

        input_tensors = [x["input_tensor"] for x in data]
        input_tensors, _ = merge(input_tensors)

        target_tensors = [x["target_tensor"] for x in data]
        target_tensors, _ = merge(target_tensors)
        
        item_dict = {
                     "names": names, 
                     "input_tensors": input_tensors,
                     "target_tensors": target_tensors}

        return item_dict

# RNN model module

In [16]:
class RNN(pl.LightningModule):
    lr = 5e-4

    def __init__(self, input_size, hidden_size, embeding_size, n_layers, output_size, p):
        super().__init__()

        self.criterion = nn.CrossEntropyLoss()
        
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        
        
        self.embeding = nn.Embedding(input_size, embeding_size)
        self.lstm = nn.LSTM(embeding_size, hidden_size, n_layers, dropout=p)
        self.out_fc = nn.Linear(hidden_size, output_size)
        
        self.dropout = nn.Dropout(p)
        

    def forward(self, batch_of_letter, hidden, cell):
        ## letter level operations
        
        embeding = self.dropout(self.embeding(batch_of_letter))
        # category_plus_letter = torch.cat((batch_of_category, embeding), 1)

        #sequence_length = 1
        category_plus_letter = embeding.unsqueeze(1)
        
        out, (hidden, cell) = self.lstm(category_plus_letter, (hidden, cell))
        out = self.out_fc(out)
        out = out.squeeze(1)
        
        return out, (hidden, cell)
        

    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), self.lr)
        scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        item_dict = batch
        loss = 0

        #to(device) needed due to some problem with PL
        hidden = torch.zeros(self.n_layers, 1, self.hidden_size).to(self.device)
        cell = torch.zeros(self.n_layers, 1, self.hidden_size).to(self.device)

        #we loop over letters, single batch at the time 
        for t in range(item_dict["input_tensors"].size(1)):
            batch_of_letter = item_dict["input_tensors"][:, t]
            
            output, (hidden, cell) = self(batch_of_letter, hidden, cell)
            
            loss += self.criterion(output, item_dict["target_tensors"][:, t])

        loss = loss/(t+1)

        tensorboard_logs = {'train_loss': loss}

        return {'loss': loss, 'log': tensorboard_logs}
    
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        cell = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        
        return hidden, cell

# Training

In [20]:
dm = NamesDatamodule(1024)

rnn_model = RNN(input_size=ds.n_tokens,
            hidden_size=256,
            embeding_size = 128, 
            n_layers=2,    
            output_size=ds.n_tokens,
            p=0.3)


# Comment from here on while inference 

trainer = Trainer(max_epochs=3, 
                  logger=None,
                  gpus=1,
                 auto_scale_batch_size=True)

trainer.fit(rnn_model, dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2]

  | Name      | Type             | Params
-----------------------------------------------
0 | criterion | CrossEntropyLoss | 0     
1 | embeding  | Embedding        | 13.1 K
2 | lstm      | LSTM             | 921 K 
3 | out_fc    | Linear           | 26.2 K
4 | dropout   | Dropout          | 0     
-----------------------------------------------
960 K     Trainable params
0         Non-trainable params
960 K     Total params
3.843     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

# Inference

In [171]:
max_length = 5

# Sample from a category and starting letter
def sample(start_letter, model,max_length):
    with torch.no_grad():  # no need to track history in sampling
        arr = [ds.stoi[c] for c in 'abc']
        input = torch.tensor(ds.stoi[start_letter]).unsqueeze(0)
#         input = torch.tensor(arr)
        hidden, cell = model.init_hidden(1)
#         print(input.shape)
        output_name = start_letter

        for i in range(max_length):
            output, (hidden, cell) = model(input, hidden, cell)
            topv, topi = output.topk(1)
            topi = topi[0][0]
            if topi == ds.stoi["<eos>"]:
                break
            else:
                letter = itos[topi]
                output_name += letter
                
            input = torch.tensor(ds.stoi[letter]).unsqueeze(0)
#                 arr.append(ds.stoi[letter])
#                 input = torch.tensor(arr)

        return output_name


def samples(start_letter, num_gens, model,max_length):
    arr = []
    for i in range(num_gens):
        arr.append(sample(start_letter, model,max_length))
#         print(sample(start_letter, model,max_length))
    return arr

# BLEU Calculation

In [222]:
from collections import Counter

# compute a very naïve BLEU score -- for educational purposes only

def BLEU_star(refs, candidates):
    
    # tokenize the references and the candidate
    refs = [ref.split() for ref in refs]
    candidate = candidates.split()

    # compute word frequencies for the references and the candidate
    refs_counts = [Counter(ref) for ref in refs]
    candidate_counts = Counter(candidate)

    covered = 0
    total = 0
    
    # compute the coverage for each word
    for word, count in candidate_counts.items():
        covered += min(count, max([ref[word] for ref in refs_counts]))
        total += count
    
    # note: we can also use len(candidate) instead of total :)
    return covered / total


- Randomly sample 10000 passwords from whole dataset
- In each of 10000 password take first character of each password and generate the a new passowrd
- calculate bleu score on 10000 samples passwords as reference and 10000 generated passswords as hypothesis


In [225]:
import random
from tqdm import tqdm


tsams = random.sample(data, 10000)

cands = []
for samp in tqdm(tsams):
    cands.extend(samples(samp[0], 1 , rnn_model,max_length=5))

0.625

In [227]:
final_scr = []
for tup in tqdm(zip(tsams,cands)):
    refstr = tup[0].replace("", " ")[1: -1]
    hypostr = tup[1].replace("", " ")[1: -1]
    final_scr.append(BLEU_star([refstr],hypostr))
    
print(sum(final_scr)/len(final_scr))

10000it [00:02, 4768.25it/s]


In [231]:
sum(final_scr)/10000

0.36791666666666367