In [2]:
!pip install pytorch-lightning
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler, Adam

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

import pandas as pd
import string



In [3]:
# !wget https://transfer.sh/6KUxG4/rockyou.txt
import re

# Read the whole word list and split it into runs of word characters.
# The pattern is a raw string so `\w` reaches the regex engine instead of being
# parsed as a (deprecated, invalid) string escape, and the `with` block makes
# sure the file handle is closed once the contents have been read.
with open('rockyou.txt', encoding='latin-1') as wordlist_file:
    data = re.findall(r'\w+', wordlist_file.read())

In [4]:
# Vocabulary layout: the padding token comes first (index 0), then every
# character of string.printable, and the end-of-sequence token comes last.
all_letters = ["<pad>", *string.printable, "<eos>"]
n_letters = len(all_letters)
n_letters



102

In [6]:
# Drop every entry containing a non-ASCII character: the vocabulary is built
# from string.printable, so such entries would have no index to map to.
clean_data = [word for word in data if word.isascii()]

data = clean_data

In [7]:
# Lookup tables between tokens and their integer ids:
# stoi maps token -> id, itos maps id -> token (by list position).
stoi = dict(zip(all_letters, range(len(all_letters))))
itos = list(all_letters)

In [8]:
class NamesDataset(Dataset):
    """Character-level dataset producing next-character training pairs.

    Args:
        data: Indexable collection of strings, one sample per entry.
        stoi: Mapping from a token (a single character or ``eos_token``) to
            its integer id.
        eos_token: Token appended to every target to mark the end of the
            string. It must be a key of ``stoi``.
    """

    def __init__(self, data, stoi, eos_token="<eos>"):
        self.stoi = stoi
        self.eos_token = eos_token
        self.n_tokens = len(self.stoi)

        self.names = data

    def __getitem__(self, item):
        """Return the ``item``-th string together with its encoded tensors.

        Returns:
            dict with keys ``"name"`` (the raw string), ``"input_tensor"``
            (ids of every character) and ``"target_tensor"`` (ids of the
            characters from the second one onwards, followed by the id of
            ``eos_token``).

        Raises:
            KeyError: if a character or ``eos_token`` is missing from ``stoi``.
        """
        name = self.names[item]

        # Encode with the mapping handed to this instance; relying on a
        # module-level `stoi` breaks as soon as the dataset is built with a
        # different vocabulary or outside the defining notebook.
        input_ids = [self.stoi[char] for char in name]
        target_ids = input_ids[1:] + [self.stoi[self.eos_token]]

        # dtype is pinned so the tensors are integer ids regardless of length.
        input_tensor = torch.tensor(input_ids, dtype=torch.long)
        target_tensor = torch.tensor(target_ids, dtype=torch.long)

        item_dict = {
        "name": name,
        "input_tensor": input_tensor,
        "target_tensor": target_tensor}

        return item_dict

    def __len__(self):
        """Number of strings in the dataset."""
        return len(self.names)

In [9]:
# Build the dataset from the filtered word list and look at the first sample.
ds = NamesDataset(data=data, stoi=stoi)

ds[0]

{'input_tensor': tensor([2, 3, 4, 5, 6, 7]),
 'name': '123456',
 'target_tensor': tensor([  3,   4,   5,   6,   7, 101])}

In [10]:
def collate_fn(data):
    """Collate dataset items into one batch, zero-padding each tensor group.

    Args:
        data: List of dicts with "name", "input_tensor" and "target_tensor"
            entries.

    Returns:
        Tuple ``(names, input_tensors, target_tensors)``: the list of names
        and two long tensors with one row per item. Inputs and targets are
        padded independently to the longest sequence in their own group;
        positions past the end of a sequence hold 0.
    """
    def merge(sequences):
        "https://github.com/yunjey/seq2seq-dataloader/blob/master/data_loader.py"

        sizes = list(map(len, sequences))
        batch = torch.zeros(len(sequences), max(sizes), dtype=torch.long)
        for row, (seq, size) in enumerate(zip(sequences, sizes)):
            batch[row, :size] = seq[:size]
        return batch, sizes

    names = []
    inputs = []
    targets = []
    for sample in data:
        names.append(sample["name"])
        inputs.append(sample["input_tensor"])
        targets.append(sample["target_tensor"])

    padded_inputs, _ = merge(inputs)
    padded_targets, _ = merge(targets)

    return names, padded_inputs, padded_targets

In [11]:
# One sample per batch, drawn in shuffled order.
dl = DataLoader(
    dataset=ds,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
)

In [12]:
# Pull one batch to eyeball it: (names, padded input ids, padded target ids).
next(iter(dl))

(['021444'],
 tensor([[1, 3, 2, 5, 5, 5]]),
 tensor([[  3,   2,   5,   5,   5, 101]]))

In [13]:
class NamesDatamodule(pl.LightningDataModule):
    """Lightning data module serving the words of ``rockyou.txt`` as batches.

    The module owns its vocabulary (``all_letters`` / ``stoi``) and its word
    list (``data``), so it does not depend on notebook-level variables.

    Args:
        batch_size: Number of words per training batch.
    """

    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.all_letters = ["<pad>"] + list(string.printable) + ["<eos>"]
        self.stoi = {letter : idx for idx, letter in enumerate(self.all_letters)}

        # Raw-string pattern (no invalid `\w` string escape) and a `with`
        # block so the file handle is closed after reading.
        with open('rockyou.txt', encoding='latin-1') as wordlist_file:
            words = re.findall(r'\w+', wordlist_file.read())

        # latin-1 decoding can yield non-ASCII word characters that have no
        # entry in `stoi`; drop those words so encoding cannot raise KeyError.
        self.data = [word for word in words if word.isascii()]

    def setup(self, stage=None):
        """Create the training dataset from this module's own word list."""
        # Use self.data rather than a global `data`, which may be undefined
        # or hold something else depending on which cells were executed.
        self.train_set = NamesDataset(self.data, self.stoi)

    def train_dataloader(self):
        """Return a shuffled DataLoader over the training set."""
        return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=True, collate_fn=self.collate_fn)

    def collate_fn(self, data):
        """Collate dataset items into a dict of zero-padded batch tensors.

        Args:
            data: List of dicts with "name", "input_tensor" and
                "target_tensor" entries.

        Returns:
            dict with "names" (list of strings), "input_tensors" and
            "target_tensors" (long tensors, one row per item, padded with 0
            up to the longest sequence of the respective group).
        """
        def merge(sequences):
            "https://github.com/yunjey/seq2seq-dataloader/blob/master/data_loader.py"

            lengths = [len(seq) for seq in sequences]
            padded_seqs = torch.zeros(len(sequences), max(lengths)).long()
            for i, seq in enumerate(sequences):
                end = lengths[i]
                padded_seqs[i, :end] = seq[:end]
            return padded_seqs, lengths

        names = [x["name"] for x in data]

        input_tensors = [x["input_tensor"] for x in data]
        input_tensors, _ = merge(input_tensors)

        target_tensors = [x["target_tensor"] for x in data]
        target_tensors, _ = merge(target_tensors)

        item_dict = {
                     "names": names,
                     "input_tensors": input_tensors,
                     "target_tensors": target_tensors}

        return item_dict

In [14]:
class RNN(pl.LightningModule):
    """Character-level LSTM language model, stepped one character at a time.

    Args:
        input_size: Number of distinct input token ids (embedding rows).
        hidden_size: Size of the LSTM hidden and cell state.
        embeding_size: Size of each token embedding.
        n_layers: Number of stacked LSTM layers.
        output_size: Number of output classes (logits per step).
        p: Dropout probability, used after the embedding and between LSTM
            layers.
        pad_idx: Target id that marks padding and is excluded from the loss.
            Pass ``None`` to score every target position.
    """
    lr = 5e-4

    def __init__(self, input_size, hidden_size, embeding_size, n_layers, output_size, p, pad_idx=0):
        super().__init__()

        # Padded target positions carry no information, so they must not
        # contribute to the loss once batches hold sequences of mixed length.
        if pad_idx is None:
            self.criterion = nn.CrossEntropyLoss()
        else:
            self.criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.embeding = nn.Embedding(input_size, embeding_size)
        # batch_first=True: forward() feeds (batch, 1, emb). With the default
        # layout the batch axis would be read as the time axis, chaining
        # unrelated samples through the recurrent state.
        self.lstm = nn.LSTM(embeding_size, hidden_size, n_layers, dropout=p, batch_first=True)
        self.out_fc = nn.Linear(hidden_size, output_size)

        self.dropout = nn.Dropout(p)

    def forward(self, batch_of_letter, hidden, cell):
        """Advance the model by one character for every sample in the batch.

        Args:
            batch_of_letter: 1-D tensor of token ids, one per sample.
            hidden: LSTM hidden state, (n_layers, batch, hidden_size).
            cell: LSTM cell state, (n_layers, batch, hidden_size).

        Returns:
            ``(out, (hidden, cell))`` where ``out`` holds the logits with the
            length-1 time axis squeezed away, and the tuple is the updated
            recurrent state.
        """
        embeding = self.dropout(self.embeding(batch_of_letter))

        # Insert a time axis of length 1: (batch, emb) -> (batch, 1, emb).
        lstm_input = embeding.unsqueeze(1)

        out, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        out = self.out_fc(out)
        out = out.squeeze(1)

        return out, (hidden, cell)

    def configure_optimizers(self):
        """Adam at ``self.lr`` with a StepLR schedule (x0.1 every 7 steps of the scheduler)."""
        optimizer = Adam(self.parameters(), self.lr)
        scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        """Run the batch through the model step by step and return the mean per-step loss.

        Args:
            batch: dict with "input_tensors" and "target_tensors", both
                indexed as [sample, time step].
            batch_idx: Index of the batch (unused).

        Returns:
            dict with the key ``'loss'``.
        """
        inputs = batch["input_tensors"]
        targets = batch["target_tensors"]
        batch_size, seq_len = inputs.size(0), inputs.size(1)

        # State is sized from the actual batch instead of a hard-coded 1.
        hidden, cell = self.init_hidden(batch_size)

        loss = 0
        # Walk over the time axis, feeding one character per sample per step.
        for t in range(seq_len):
            output, (hidden, cell) = self(inputs[:, t], hidden, cell)
            loss += self.criterion(output, targets[:, t])

        # Divide by the known sequence length rather than the leaked loop index.
        loss = loss / seq_len

        # self.log is the supported logging API; the 'log' key of the returned
        # dict is the legacy mechanism.
        self.log('train_loss', loss)

        return {'loss': loss}

    def init_hidden(self, batch_size):
        """Return zero-filled ``(hidden, cell)`` states on the module's device.

        Each tensor has shape (n_layers, batch_size, hidden_size).
        """
        shape = (self.n_layers, batch_size, self.hidden_size)
        hidden = torch.zeros(shape, device=self.device)
        cell = torch.zeros(shape, device=self.device)

        return hidden, cell

In [None]:
dm = NamesDatamodule(1)

# Take the vocabulary size from the data module that actually feeds training,
# not from an unrelated notebook-level dataset object.
n_tokens = len(dm.stoi)

rnn_model = RNN(input_size=n_tokens,
            hidden_size=256,
            embeding_size = 128, 
            n_layers=2,    
            output_size=n_tokens,
            p=0.3)


# `gpus=` is no longer accepted by current Lightning releases;
# `accelerator` + `devices` is the replacement and is also understood by the
# release that produced the log below.
trainer = Trainer(max_epochs=3, 
                  logger=None,
                  accelerator="gpu",
                  devices=1,)

trainer.fit(rnn_model, dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | criterion | CrossEntropyLoss | 0     
1 | embeding  | Embedding        | 13.1 K
2 | lstm      | LSTM             | 921 K 
3 | out_fc    | Linear           | 26.2 K
4 | dropout   | Dropout          | 0     
-----------------------------------------------
960 K     Trainable params
0         Non-trainable params
960 K     Total params
3.843     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]