In [37]:
import pandas as pd 
import re 
import numpy as np 
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
from torch.utils.data import Dataset, TensorDataset, DataLoader
import torch.optim as optim 
from Transformer import * 
from BPE.tokenizer import * 
from BPE.utilities import * 
from BPE.vocab_builder import *
import os
import time

# Load Data ⬇️

In [9]:
# Directory holding the pickled vocabulary / tokenization artifacts,
# given relative to the notebook's working directory.
path = '../data/pickles/vocab'

In [10]:
# IPython shell escape: list the files available under `path`.
!ls {path}

inttostr.pkl	   strtoint.pkl		      vocab_tokens.pkl
sorted_tokens.pkl  tokenized_screenplays.pkl


In [11]:
# Load the tokenized corpus from disk. `load_pickle` is not defined in this
# notebook; it arrives through one of the star-imports (presumably
# BPE.utilities - TODO confirm).
# NOTE(review): if it wraps pickle.load, only point it at trusted files.
tokenized_screenplays = load_pickle(f'{path}/tokenized_screenplays.pkl')

In [12]:
# Number of entries in the tokenized corpus.
len(tokenized_screenplays)

395312

In [13]:
# Smallest value in the corpus (presumably the lowest token id in use - TODO confirm).
min(tokenized_screenplays)

1

In [14]:
# Largest value in the corpus (presumably the highest token id in use - TODO confirm).
# NOTE(review): this should stay below the num_tokens the model is built with.
max(tokenized_screenplays)

6563

In [15]:
class GeneratorDataset(Dataset):
    """Sliding-window next-token dataset over a flat sequence of integer ids.

    Item ``ix`` pairs the window ``data_array[ix : ix + seq_length]`` with the
    same window shifted one position to the right, i.e. the target at every
    position is the element that follows the corresponding input element.

    Parameters
    ----------
    data_array : sequence of int
        Flat sequence supporting ``len()`` and slicing (e.g. a list or a
        1-D numpy array).
    seq_length : int
        Number of elements in every input / target window.
    """

    def __init__(self, data_array, seq_length):
        self.data_array = data_array
        self.seq_length = seq_length
        self.total_words = len(self.data_array)
        # A window starting at ix needs ix + seq_length + 1 <= total_words, so
        # there are exactly total_words - seq_length valid start positions.
        # Clamp at zero so a too-short corpus yields an empty dataset instead
        # of a negative length.
        self.req_size = max(0, self.total_words - self.seq_length)

    def __len__(self):
        """Return the number of (input, target) windows available."""
        return self.req_size

    def __getitem__(self, ix):
        """Return ``{'input': LongTensor, 'output': LongTensor}`` for window ``ix``.

        Raises
        ------
        IndexError
            If ``ix`` is outside ``[-len(self), len(self))``; without this
            check an out-of-range index would silently yield a short window.
        """
        if ix < 0:
            ix += self.req_size
        if not 0 <= ix < self.req_size:
            raise IndexError(f'index {ix} out of range for {self.req_size} windows')

        stop = ix + self.seq_length
        # Both tensors are cast to int64: the input feeds an embedding lookup
        # and the target feeds nll_loss, and both require long indices. Relying
        # on numpy's default integer width is not portable.
        inp_seq = torch.as_tensor(self.data_array[ix:stop], dtype=torch.long)
        op_seq = torch.as_tensor(self.data_array[ix + 1:stop + 1], dtype=torch.long)

        return {'input': inp_seq, 'output': op_seq}

In [34]:
# Load the `inttostr` mapping (presumably token id -> token string - TODO confirm).
# NUM_TOKENS is derived from its length, so this cell has to run before the
# hyperparameter cell on a fresh kernel.
inttostr = load_pickle(f'{path}/inttostr.pkl')

In [16]:
# Hyperparameters handed to GenerationTransformer (emb, heads, depth).
EMBEDDING_DIM = 512
HEADS = 8
DEPTH = 6
# Window length: used by GeneratorDataset and passed to the model as seq_length.
SEQ_LEN = 256
# Vocabulary size passed to the model as num_tokens.
# NOTE(review): the reason for the +1 is not visible here (min(tokenized_screenplays)
# is 1, so ids may be 1-based) - confirm against the vocab builder.
NUM_TOKENS = len(inttostr) + 1


In [17]:
# Wrap the token stream as SEQ_LEN-long input windows paired with their
# one-step-shifted target windows.
data_set = GeneratorDataset(tokenized_screenplays, SEQ_LEN)

In [18]:
# Shuffled mini-batches of 4 windows with pinned host memory.
# The worker count is capped at the number of CPUs on this machine: asking for
# more worker processes than cores makes PyTorch emit a warning and can slow
# down or stall loading. On hosts with >= 100 cores this is unchanged.
dataloader = DataLoader(
    data_set,
    batch_size=4,
    shuffle=True,
    num_workers=min(100, os.cpu_count() or 1),
    pin_memory=True,
)

In [19]:
# Choose the device once and use it both for the constructor's `device`
# argument and for moving the parameters. Previously 'cuda' was passed to the
# constructor unconditionally while the move to the GPU was conditional, which
# is inconsistent on a CPU-only machine.
# NOTE(review): assumes GenerationTransformer accepts 'cpu' for `device` - confirm.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

model = GenerationTransformer(
    emb=EMBEDDING_DIM,
    heads=HEADS,
    depth=DEPTH,
    seq_length=SEQ_LEN,
    num_tokens=NUM_TOKENS,
    device=DEVICE,
    mask=True,
    wide=True,
)

# Assign the result so the cell does not echo the module repr.
model = model.to(DEVICE)

In [36]:
# Adam over all model parameters with a learning rate of 1e-4.
# fit_generator reads this module-level `optimizer` directly (it is not passed
# in as an argument), so this cell must run before training and must be re-run
# whenever `model` is rebuilt.
optimizer = optim.Adam(model.parameters(), lr = 0.0001)

In [38]:
def fit_generator(epoch, model, dataloader, save_every, path = '../models/'):
    """Train ``model`` for one pass over ``dataloader`` and return the mean batch loss.

    Parameters
    ----------
    epoch : int
        Epoch number; only used to label the printed summary.
    model : torch.nn.Module
        Called as ``model(inputs)``. Its output has dims 2 and 1 swapped and is
        then passed to ``F.nll_loss``, so it is assumed to hold
        log-probabilities laid out as (batch, seq, num_tokens) - TODO confirm
        against GenerationTransformer.
    dataloader : iterable
        Yields dicts with an ``'input'`` and an ``'output'`` tensor.
    save_every, path
        NOTE(review): accepted but currently unused - this function does not
        write any checkpoint.

    Returns
    -------
    float
        Sum of the per-batch mean losses divided by the number of batches
        processed (0.0 when the loader yields no batches).

    Notes
    -----
    Steps the module-level ``optimizer``; it must be defined before calling.
    """
    start_time = time.time()
    use_cuda = torch.cuda.is_available()

    model.train()

    running_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        inputs, targets = batch['input'], batch['output']
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        # nll_loss requires int64 class indices.
        targets = targets.long()

        optimizer.zero_grad()

        # nll_loss expects the class dimension second: (batch, classes, seq).
        log_probs = model(inputs).transpose(2, 1)
        loss = F.nll_loss(log_probs, targets, reduction='mean')

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Each batch loss is already a mean over its elements, so the epoch figure
    # is the average over batches, not over dataset samples.
    epoch_loss = running_loss / max(num_batches, 1)

    print(f"""
    Epoch {epoch}:
        Loss {epoch_loss}
        Time {time.time() - start_time}
    """)

    return epoch_loss



In [28]:
# NOTE(review): `op` is not defined in any visible cell, so this fails on a
# fresh kernel. The recorded last dimension (395313) equals
# len(tokenized_screenplays) + 1 rather than len(inttostr) + 1, so the saved
# output appears to predate the current NUM_TOKENS - re-run or remove.
op.size()

torch.Size([4, 256, 395313])

In [30]:
# NOTE(review): `op_` is not defined in any visible cell; judging by the
# recorded shapes it is `op` with dims 1 and 2 swapped - confirm or remove.
op_.size()

torch.Size([4, 395313, 256])

In [29]:
# NOTE(review): loads the same file as `inttostr` a second time under a
# differently spelled name; the cells below use this `inttostrt` name.
inttostrt = load_pickle(f'{path}/inttostr.pkl')

In [33]:
# Number of entries in the mapping.
len(inttostrt)

6571

In [31]:
# Look up the argmax index at every position of the second sequence in the batch.
# NOTE(review): the recorded KeyError is consistent with the recorded shape of
# `op`: argmax ranges over its last dimension (395313) while the mapping has
# only 6571 entries. `op` itself is not defined in any visible cell.
[inttostrt[c] for c in (torch.argmax(op, dim = 2)[1]).tolist()]

KeyError: 276978