## Train Tokenizer

In [5]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset
import pandas as pd

In [6]:
de_en = load_dataset("wmt/wmt14",'de-en')

In [7]:
tokenzier_dataset=pd.DataFrame(de_en['train']['translation'])
tokenzier_dataset['total']=tokenzier_dataset['de'] + ' ' + tokenzier_dataset['en']

def get_training_corpus():
    for i in range(0, len(tokenzier_dataset), 1000):
        yield tokenzier_dataset[i : i + 1000]["total"]
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
# tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]","[BOS]", "[EOS]"]
trainer = trainers.WordPieceTrainer(vocab_size=52000, special_tokens=special_tokens)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)






In [8]:
rnn_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    bos_token="[BOS]",
    eos_token="[EOS]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [9]:
rnn_tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/tokenizer.json')

## Preparing Model

In [8]:
import pandas as pd
import numpy as np

import torch
from torch import nn,optim
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm

In [93]:
class RNNConfig:
    def __init__(self,
                vocab_size=52000,
                hidden_dim=128,
                seq_len=512,
                batch_size=2,
                ):
        self.vocab_size=vocab_size
        self.hidden_dim=hidden_dim
        self.seq_len = seq_len
        self.batch_size=batch_size
    
class RNNLayer(nn.Module):
    def __init__(self,config:RNNConfig):
        super().__init__()
        self.hidden_dim=config.hidden_dim
        self.vocab_size=config.vocab_size
        self.embedding = nn.Embedding(self.vocab_size,self.hidden_dim)
        self.w_hx = nn.Linear(config.hidden_dim,config.hidden_dim,bias=False)
        self.w_hh = nn.Linear(config.hidden_dim,config.hidden_dim,bias=False)
        self.w_yh = nn.Linear(config.hidden_dim,config.vocab_size,bias=False)
        self.sigmoid=nn.Sigmoid()
    def __calculate_rnn(self,x,h):
        ht = self.w_hx(x) + self.w_hh(h)
        ht = self.sigmoid(ht)
        yt = nn.functional.softmax(self.w_yh(ht))
        return ht,yt
        
    def _training_step(self,input_ids,mask=None,h=None,labels=None):
        
        bsz,seq=input_ids.shape
        
        total_loss = 0
        
        if h is None:
            h = torch.zeros((bsz,self.hidden_dim))
        for i in range(seq):
            h,y = self.forward(input_ids[:,i],h)
            
            # For masking the loss of the inputs and pad tokens and only calculating the loss for the labels.
            y=y*mask[:,i:i+1]
            
            total_loss += nn.functional.nll_loss(y,labels[:,i])
        print(total_loss)
    
    def forward(self,input_ids,h=None):
        bsz=input_ids.shape[0] # X =(batch size, sequence length)
        x = self.embedding(input_ids) #(batch size, sequence_length, hidden size)
        
        if h is None:
            h = torch.zeros((bsz,self.hidden_dim))
        # RNN Calculation
        h,yt = self.__calculate_rnn(x,h)
        return h,yt
        
class RNNDataset(torch.utils.data.Dataset):
    def __init__(self,dataset,tokenizer,sequence_length):
        self.dataset=dataset
        self.tokenizer=tokenizer
        self.params = {'padding':'max_length','max_length':sequence_length,'truncation':True,'return_tensors':'pt'}

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self,idx):
        inputs= self.tokenizer.bos_token + self.dataset[idx]['de'] + self.tokenizer.sep_token + self.dataset[idx]['en'] 
        labels= self.dataset[idx]['de'] + self.tokenizer.sep_token + self.dataset[idx]['en'] + self.tokenizer.eos_token
        input_ids = self.tokenizer.encode(inputs,**self.params)[0]
        tokens = self.tokenizer(labels,**self.params)
        labels=tokens['input_ids'][0]
        mask=tokens['attention_mask'][0]
        i = (labels==self.tokenizer.sep_token_id).nonzero()[0][0]
        mask[:i+1]=0
        
        return input_ids,labels,mask
model = RNNLayer(config)

## Training RNN

In [94]:
config=RNNConfig()

In [95]:
tokenizer=AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token="<|endoftext|>"
tokenizer.cls_token="<|endoftext|>"
tokenizer.sep_token="<|endoftext|>"

In [40]:
de_en = load_dataset("wmt/wmt14",'de-en')
train_ds = RNNDataset(de_en['train']['translation'],tokenizer,100)
train_loader=torch.utils.data.DataLoader(train_ds,batch_size=config.batch_size,shuffle=True)

In [96]:
for (input_ids,labels,mask) in train_loader:
    break

In [97]:
model._training_step(input_ids,mask=mask,labels=labels)


NameError: name 'x' is not defined

In [92]:
torch.nn.functional.nll_loss(y,labels[:,0])

tensor(0., grad_fn=<NllLossBackward0>)

In [91]:
nn.functional.nll_loss(torch.tensor([1,2,0,42],dtype=torch.float32).unsqueeze(0),torch.tensor([3]))

tensor(-42.)