## Train Tokenizer

In [5]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset
import pandas as pd

In [6]:
de_en = load_dataset("wmt/wmt14",'de-en')

In [7]:
tokenzier_dataset=pd.DataFrame(de_en['train']['translation'])
tokenzier_dataset['total']=tokenzier_dataset['de'] + ' ' + tokenzier_dataset['en']

def get_training_corpus():
    for i in range(0, len(tokenzier_dataset), 1000):
        yield tokenzier_dataset[i : i + 1000]["total"]
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
# tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]","[BOS]", "[EOS]"]
trainer = trainers.WordPieceTrainer(vocab_size=52000, special_tokens=special_tokens)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)






In [8]:
rnn_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    bos_token="[BOS]",
    eos_token="[EOS]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [9]:
rnn_tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/tokenizer.json')

## Preparing Model

In [1]:
import pandas as pd
import numpy as np

import torch
from torch import nn,optim
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device : {device}')

  from .autonotebook import tqdm as notebook_tqdm


Device : cpu


In [12]:
class RNNConfig:
    def __init__(self):
        self.vocab_size=52000
        self.hidden_dim=128
        self.seq_len = 512
        self.batch_size=1
        self.epochs=5
        self.learning_rate=1e-4
        self.weight_decay=0.001
        
class RNNLayer(nn.Module):
    def __init__(self,config:RNNConfig):
        super().__init__()
        self.hidden_dim=config.hidden_dim
        self.vocab_size=config.vocab_size
        self.embedding = nn.Embedding(self.vocab_size,self.hidden_dim)
        self.w_hx = nn.Linear(config.hidden_dim,config.hidden_dim,bias=False)
        self.w_hh = nn.Linear(config.hidden_dim,config.hidden_dim,bias=False)
        self.w_yh = nn.Linear(config.hidden_dim,config.vocab_size,bias=False)
        self.sigmoid=nn.Sigmoid()
    def __calculate_rnn(self,x,h):
        ht = self.w_hx(x) + self.w_hh(h)
        ht = self.sigmoid(ht)
        yt = nn.functional.softmax(self.w_yh(ht))
        return ht,yt
        
    def _training_step(self,input_ids,mask=None,h=None,labels=None):
        
        bsz,seq=input_ids.shape
        
        total_loss = 0
        
        if h is None:
            h = torch.zeros((bsz,self.hidden_dim)).to(input_ids.device)
        
        for i in range(seq):
            h,y = self.forward(input_ids[:,i],h)
            
            # For masking the loss of the inputs and pad tokens and only calculating the loss for the labels.
            y=y*mask[:,i:i+1]
            
            total_loss += nn.functional.nll_loss(y,labels[:,i])
            # print(nn.functional.nll_loss(y,labels[:,i]), y[0][labels[0,i]])
        return total_loss
    
    def forward(self,input_ids,h=None):
        bsz=input_ids.shape[0] # X =(batch size, sequence length)
        x = self.embedding(input_ids) #(batch size, sequence_length, hidden size)
        
        if h is None:
            h = torch.zeros((bsz,self.hidden_dim))
        # RNN Calculation
        h,yt = self.__calculate_rnn(x,h)
        return h,yt
        
class RNNDataset(torch.utils.data.Dataset):
    def __init__(self,dataset,tokenizer,sequence_length):
        self.dataset=dataset
        self.tokenizer=tokenizer
        self.params = {'padding':'max_length','max_length':sequence_length,'truncation':True,'return_tensors':'pt'}

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self,idx):
        inputs= self.tokenizer.bos_token + self.dataset[idx]['de'] + self.tokenizer.sep_token + self.dataset[idx]['en'] 
        labels= self.dataset[idx]['de'] + self.tokenizer.sep_token + self.dataset[idx]['en'] + self.tokenizer.eos_token
        input_ids = self.tokenizer.encode(inputs,**self.params)[0]
        tokens = self.tokenizer(labels,**self.params)
        labels=tokens['input_ids'][0]
        mask=tokens['attention_mask'][0]
        i = (labels==self.tokenizer.sep_token_id).nonzero()[0][0]
        mask[:i+1]=0
        
        return input_ids,labels,mask


## Training RNN

In [13]:
config=RNNConfig()

In [14]:
tokenizer=AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token="<|endoftext|>"
tokenizer.cls_token="<|endoftext|>"
tokenizer.sep_token="<|endoftext|>"

In [15]:
de_en = load_dataset("wmt/wmt14",'de-en')
train_ds = RNNDataset(de_en['train']['translation'],tokenizer,100)
train_loader=torch.utils.data.DataLoader(train_ds,batch_size=config.batch_size,shuffle=True)
model = RNNLayer(config)
optimizer=optim.AdamW(model.parameters(),lr=config.learning_rate,weight_decay=config.weight_decay)

In [17]:
def train_one_epoch(model,train_loader,optimizer):
    model.train()
    loader=tqdm(train_loader)
    total_loss=0
    
    for idx,(input_ids,labels,mask) in enumerate(loader):
        optimizer.zero_grad()
        input_ids=input_ids.to(device)
        labels=labels.to(device)
        mask=mask.to(device)
        loss = model._training_step(input_ids=input_ids,mask=mask,labels=labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
        if idx%1000==0:
            loader.set_postfix({'loss':loss.item()})
    
    return total_loss / len(train_loader)
        
    

In [18]:
# Training
for epoch in range(config.epochs):
    loss = train_one_epoch(model,train_loader,optimizer)
    print(f'Loss after Epoch {epoch} is {loss}')

  yt = nn.functional.softmax(self.w_yh(ht))
  0%|                | 18/4508785 [00:12<883:29:32,  1.42it/s, loss=-0.000195]


IndexError: index 0 is out of bounds for dimension 0 with size 0

In [None]:
for i in range(len(train_ds)):
    try:
        a = train_ds[i]
    except:
        print(i)

10
16
23
31
32
36
57
73
75
78
80
81
82
83
101
110
119
128
130
163
164
165
166
167
172
174
175
179
184
188
197
199
211
221
233
235
239
246
251
252
257
259
270
275
290
291
306
314
315
316
322
323
340
342
344
346
347
356
359
360
367
368
369
370
373
374
376
377
378
379
380
381
384
389
394
397
398
399
415
416
429
434
443
444
445
452
462
463
491
492
494
495
500
503
507
508
515
516
523
538
541
542
547
548
551
552
553
554
556
560
561
562
565
567
568
569
571
572
574
578
580
584
595
597
600
606
608
612
613
615
616
619
620
623
627
630
644
645
646
648
650
651
653
654
660
665
685
708
729
731
732
739
759
761
762
764
765
770
773
778
784


Exception ignored in: <function Dataset.__del__ at 0x7c034f8fb600>
Traceback (most recent call last):
  File "/home/yota/Documents/study_and_research/.env/lib/python3.12/site-packages/datasets/arrow_dataset.py", line 1482, in __del__
    if hasattr(self, "_indices"):
       ^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt: 


793
798
799
800
801
806
810
811
812
813
814
820
836
850
851
871
911
944
952
964
968
969
983
984
986
990
992
993
998
999
1000
1002
1003
1004
1008


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7c04c51f9c40>>
Traceback (most recent call last):
  File "/home/yota/Documents/study_and_research/.env/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 781, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
Exception ignored in: <function Dataset.__del__ at 0x7c034f8fb600>
Traceback (most recent call last):
  File "/home/yota/Documents/study_and_research/.env/lib/python3.12/site-packages/datasets/arrow_dataset.py", line 1482, in __del__
    if hasattr(self, "_indices"):
       ^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt: 


1033


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7c04c51f9c40>>
Traceback (most recent call last):
  File "/home/yota/Documents/study_and_research/.env/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 781, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


1055
1065
1073
1075
1076
1077
1079
1082
1088
1089
1091
1096
1098
1107
1118
1133
1141
1142
1148
1150
1153
1159
1160
1161
1162
1163
1166
1167
1169
1171
1176
1182
1218
1225
1236
1237
1243
1244
1257
1259
1272
1281
1289
1292
1294
1297
1325
1334
1346
1354
1356
1359
1360
1361
1366
1395
1396
1397
1398
1412
1413
1414
1422
1425
1436
1456
1458
1461
1469
1472
1474
