# CS6910 Assignment 3 (RNN Frameworks for transliteration) - Without attention

In [1]:
# importing required libraries for the notebook
import lightning as lt
import torch
import torch.nn as nn
from IPython.display import display
import numpy as np
from torch.utils.data import DataLoader, Dataset
import random
from language import *
from dataset_dataloader import *
from encoder_decoder import *

In [2]:
# pick the GPU ('cuda') when PyTorch reports one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


# Defining the source and target languages and loading data

In [3]:
# language codes for the source side and the target side of the data
SOURCE, TARGET = 'eng', 'hin'

In [4]:
# read the train / valid / test splits (in that order) for the TARGET language;
# each load_data call is unpacked into an (inputs, outputs) pair
(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = (
    load_data(TARGET, split) for split in ('train', 'valid', 'test')
)

# report how many samples each split holds
print(f'Number of train samples = {len(x_train)}')
print(f'Number of valid samples = {len(x_valid)}')
print(f'Number of test samples = {len(x_test)}')

Number of train samples = 51200
Number of valid samples = 4096
Number of test samples = 4096


In [5]:
# one Language object per side; each stores a vocabulary plus the
# index2sym / sym2index lookup tables
SRC_LANG = Language(SOURCE)
TAR_LANG = Language(TARGET)

# vocabularies are built from the training split only
for lang, words in ((SRC_LANG, x_train), (TAR_LANG, y_train)):
    lang.create_vocabulary(*words)

# build the character <-> index mappings once the vocabularies exist
for lang in (SRC_LANG, TAR_LANG):
    lang.generate_mappings()

# show the resulting vocabularies and mappings for both sides
print(f'Source Vocabulary Size = {len(SRC_LANG.symbols)}')
print(f'Source Vocabulary = {SRC_LANG.symbols}')
print(f'Source Mapping {SRC_LANG.index2sym}')
print(f'Target Vocabulary Size = {len(TAR_LANG.symbols)}')
print(f'Target Vocabulary = {TAR_LANG.symbols}')
print(f'Target Mapping {TAR_LANG.index2sym}')

Source Vocabulary Size = 26
Source Vocabulary = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Source Mapping {0: '@', 1: '$', 2: '!', 3: '%', 4: 'a', 5: 'b', 6: 'c', 7: 'd', 8: 'e', 9: 'f', 10: 'g', 11: 'h', 12: 'i', 13: 'j', 14: 'k', 15: 'l', 16: 'm', 17: 'n', 18: 'o', 19: 'p', 20: 'q', 21: 'r', 22: 's', 23: 't', 24: 'u', 25: 'v', 26: 'w', 27: 'x', 28: 'y', 29: 'z'}
Target Vocabulary Size = 64
Target Vocabulary = ['ँ', 'ं', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ऑ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'ळ', 'व', 'श', 'ष', 'स', 'ह', '़', 'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॅ', 'े', 'ै', 'ॉ', 'ो', 'ौ', '्']
Target Mapping {0: '@', 1: '$', 2: '!', 3: '%', 4: 'ँ', 5: 'ं', 6: 'ः', 7: 'अ', 8: 'आ', 9: 'इ', 10: 'ई', 11: 'उ', 12: 'ऊ', 13: 'ऋ', 14: 'ए', 15: 'ऐ', 16: 'ऑ', 17: 'ओ', 18: 'औ',

## Runner Class

In [6]:
class Runner(lt.LightningModule):
    '''
    LightningModule that assembles an EncoderNet / DecoderNet pair (optionally with
    an Attention layer) into an EncoderDecoder model and defines its training loop.

    Only the training hooks are implemented here; train accuracy is computed with
    exact word matching at the end of every epoch.
    '''
    def __init__(self, src_lang : Language, tar_lang : Language, common_embed_size, common_num_layers, 
                 common_hidden_size, common_cell_type, enc_bidirect=False, attention=False, dropout=0.0, 
                 opt_name='Adam', learning_rate=1e-3, batch_size=32):
        '''
        Build the sub-networks, initialise their weights and store the training settings.

        src_lang, tar_lang  : Language objects for the source / target side
        common_embed_size   : embedding size passed to both encoder and decoder
        common_num_layers   : number of layers passed to both encoder and decoder
        common_hidden_size  : hidden size passed to both encoder and decoder (and to Attention)
        common_cell_type    : cell type passed to both encoder and decoder (e.g. 'LSTM', 'GRU')
        enc_bidirect        : forwarded to the encoder, decoder and attention layer
        attention           : when True an Attention layer is created and given to the decoder
        dropout             : dropout value forwarded to encoder and decoder
        opt_name            : optimizer name; only 'Adam' is supported
        learning_rate       : learning rate handed to the optimizer
        batch_size          : batch size used by the train / validation dataloaders

        Raises ValueError when opt_name is not 'Adam'.
        '''
        super(Runner,self).__init__()

        # only adam is present in configure_optimizers as of now; fail fast with a
        # catchable error instead of exit(-1), which would terminate the whole kernel
        if opt_name != 'Adam':
            raise ValueError(f"Unsupported optimizer {opt_name!r}: only 'Adam' is implemented")

        # save the language objects
        self.src_lang = src_lang
        self.tar_lang = tar_lang

        # create all the sub-networks and the main model
        self.encoder = EncoderNet(vocab_size=src_lang.get_size(), embed_size=common_embed_size,
                             num_layers=common_num_layers, hid_size=common_hidden_size,
                             cell_type=common_cell_type, bidirect=enc_bidirect, dropout=dropout)
        if attention:
            self.attention = True
            self.attn_layer = Attention(common_hidden_size, enc_bidirect)
        else:
            self.attention = False
            self.attn_layer = None

        self.decoder = DecoderNet(vocab_size=tar_lang.get_size(), embed_size=common_embed_size,
                             num_layers=common_num_layers, hid_size=common_hidden_size,
                             cell_type=common_cell_type, attention=attention, attn_layer=self.attn_layer,
                             enc_bidirect=enc_bidirect, dropout=dropout)

        self.model = EncoderDecoder(encoder=self.encoder, decoder=self.decoder, src_lang=src_lang, 
                                    tar_lang=tar_lang)

        # for determinism (seeded before the weights are re-initialised below)
        torch.manual_seed(42); torch.cuda.manual_seed(42); np.random.seed(42); random.seed(42)

        self.model.apply(self.init_weights) # initialize model weights
        self.batch_size = batch_size

        # optimizer for the model and loss function [that ignores locs where target = PAD token]
        self.loss_criterion = nn.CrossEntropyLoss(ignore_index=tar_lang.sym2index[PAD_SYM])
        self.opt_name = opt_name
        self.learning_rate = learning_rate

        # per-epoch buffers of predictions / targets, filled in training_step and
        # emptied in on_train_epoch_end
        self.pred_train_words = []
        self.true_train_words = []

    def configure_optimizers(self):
        '''
        return the optimizer for the model parameters (Adam with self.learning_rate).
        raises ValueError if self.opt_name is anything other than 'Adam'.
        '''
        if self.opt_name == 'Adam':
            return torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        # never hand Lightning a silent None optimizer
        raise ValueError(f"Unsupported optimizer {self.opt_name!r}: only 'Adam' is implemented")

    @staticmethod
    def init_weights(m):
        '''
        function to initialize the weights of the model parameters.
        parameters whose name contains 'weight' are drawn uniformly from [-0.04, 0.04];
        every other parameter (e.g. biases) is set to 0.
        '''
        # NOTE: named_parameters() recurses into sub-modules and Module.apply() also
        # visits every sub-module, so nested parameters are re-initialised several
        # times; the final values still follow the rule described above.
        for name, param in m.named_parameters():
            if 'weight' in name:
                nn.init.uniform_(param.data, -0.04, 0.04)
            else:
                nn.init.constant_(param.data, 0)

    @staticmethod
    def exact_accuracy(pred_words, tar_words):
        ''' 
        compute the accuracy using (predicted words, target words) and return it.
        exact word matching is used. the result is a fraction in [0, 1];
        0.0 is returned when there are no words to compare.
        '''
        assert(len(pred_words) == len(tar_words))
        if len(pred_words) == 0:
            return 0.0 # avoid ZeroDivisionError on an empty epoch
        matches = sum(1 for pred, tar in zip(pred_words, tar_words) if pred == tar)
        return matches / len(pred_words)

    ####################
    # DATA RELATED HOOKS
    ####################

    def setup(self, stage=None):
        '''load the train / valid / test splits and keep them on the module'''
        # load all the available data on all GPUs
        # NOTE(review): the split is selected through the notebook-level TARGET
        # constant rather than through self.tar_lang - confirm this is intended
        self.x_train, self.y_train = load_data(TARGET, 'train')
        self.x_valid, self.y_valid = load_data(TARGET, 'valid')
        self.x_test, self.y_test = load_data(TARGET, 'test')

    def _make_dataloader(self, xs, ys, batch_size):
        '''wrap (xs, ys) in a TransliterateDataset + DataLoader using this runner's languages'''
        dataset = TransliterateDataset(xs, ys, src_lang=self.src_lang, tar_lang=self.tar_lang)
        return DataLoader(dataset=dataset, batch_size=batch_size,
                          collate_fn=CollationFunction(self.src_lang, self.tar_lang))

    def train_dataloader(self):
        '''dataloader over the training split with self.batch_size samples per batch'''
        return self._make_dataloader(self.x_train, self.y_train, self.batch_size)

    def val_dataloader(self):
        '''dataloader over the validation split with self.batch_size samples per batch'''
        return self._make_dataloader(self.x_valid, self.y_valid, self.batch_size)

    def test_dataloader(self):
        '''dataloader over the test split'''
        # we do inference word by word. So, batch_size = 1
        return self._make_dataloader(self.x_test, self.y_test, 1)

    ####################
    # INTERFACE RELATED FUNCTIONS - NOTE -> added validation and test methods; early stopping; heatmap; beam decoding;
    #                                       save to file and plot test errors; fix teacher forcing time interval;
    #                                       wandb sweeping stuff and model checkpointing; try TAMIL
    # NOTE(review): none of the items above are implemented in this class as shown -
    #               this reads like a TODO list; confirm.
    ####################

    def training_step(self, train_batch, batch_idx):
        '''
        run the model on one batch, log and return the cross-entropy loss.
        the targets and predictions of the batch are buffered for the epoch-end accuracy.
        '''
        batch_X, batch_y, X_lens = train_batch
        # get the logits, preds for the current batch
        logits, preds = self.model(batch_X, batch_y, X_lens, tf_ratio=0.8)
        # ignore loss for the first time step
        targets = batch_y[:, 1:]; logits = logits[:, 1:, :]
        # CrossEntropyLoss expects the class dimension right after the batch dimension
        logits = logits.swapaxes(1, 2)
        loss = self.loss_criterion(logits, targets)
        self.true_train_words.append(batch_y)
        self.pred_train_words.append(preds)
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def on_train_epoch_end(self):
        '''
        convert the buffered predictions / targets to words, print a sample of
        (predicted, true) pairs, log the exact-match train accuracy (in percent)
        and reset the buffers for the next epoch.
        '''
        pred, true = [], []
        for pred_batch, true_batch in zip(self.pred_train_words, self.true_train_words):
            pred += self.tar_lang.convert_to_words(pred_batch)
            true += self.tar_lang.convert_to_words(true_batch)
        print(len(true))
        print(*zip(pred[:50],true[:50]), sep='\n')
        self.log('train_acc', 100*self.exact_accuracy(pred, true), on_epoch=True, prog_bar=True)
        # drop the buffered batches so the next epoch starts from scratch
        self.pred_train_words.clear(); self.true_train_words.clear()

In [9]:
# testing runner
# keep embedding small (around 32) -> important to get dense embedding
# also, adjust learning rate reasonably

# The runner must be built in this cell: with the construction commented out the
# cell only works if `runner` is left over from earlier kernel state, and it fails
# with NameError on a fresh kernel (Restart & Run All).
# NOTE(review): this configuration looks consistent with the 5.8 M parameter
# summary captured below - confirm it is the one that produced the saved output.
runner = Runner(SRC_LANG, TAR_LANG, 128, 3, 256, 'LSTM', True, False, 0.0, 'Adam', learning_rate=2e-3, batch_size=128)
trainer = lt.Trainer(max_epochs=5)
trainer.fit(runner)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | encoder        | EncoderNet       | 4.3 M 
1 | decoder        | DecoderNet       | 1.5 M 
2 | model          | EncoderDecoder   | 5.8 M 
3 | loss_criterion | CrossEntropyLoss | 0     
----------------------------------------------------
5.8 M     Trainable params
0         Non-trainable params
5.8 M     Total params
23.266    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

51200
('शस्त्रागरर', 'शस्त्रागार')
('बिं्ध्य', 'बिन्द्या')
('किरंकांत', 'किरणकांत')
('यज्ञोपवित', 'यज्ञोपवीत')
('रतैनिया', 'रटानिया')
('वगगन्याचे', 'वागण्याचे')
('देशभरामध्ये', 'देशभरामध्ये')
('सुघा़पन', 'सुघड़पन')
('मोहीवाल', 'मोहीवाल')
('सर्वसंग्रह', 'सर्वसंग्रह')
('बसेको', 'बसेको')
('तुमच्यापैकी', 'तुमच्यापैकी')
('कनन्यकुंभजम', 'कान्यकुब्ज')
('इन्ोक्सिनेशन', 'इनटॉक्सिनेशन')
('मैचुयूरिटी', 'मेच्यूरिटी')
('अग्ी', 'अगरी')
('अनुक्रमुनापात', 'अनुक्रमानुपात')
('धुलचंदड', 'धूलचन्द')
('अवलेहा', 'अवलेह')
('अब्ोर्ड', 'एबरोर्ड')
('बैलर्स', 'बैलर्स')
('बावललीी', 'बार्गली')
('पंक्डर्ड', 'पंक्चर्ड')
('हैंकर्स', 'हैंकर्स')
('जवानोंके', 'जवानोंके')
('पैंकी', 'पौंकी')
('जानधाम', 'जगनधाम')
('पोनिनियम', 'पोन्नियम')
('इईएनबीए', 'आईएनबीए')
('वेदलम', 'वेदलम')
('चिनवास', 'चिनवास')
('मारवाड़ा', 'मारवाड़ा')
('अन्मझा', 'अनसमझा')
('इस्टूडेंट', 'इस्टूडेंट')
('दुहखीत', 'दुःखीत')
('सिघांची', 'सिघांची')
('शिक्षेचे', 'शिक्षेचे')
('उजववलतम', 'उज्वलतम')
('अपत्टन', 'आपट्टन')
('अम्बीकवन', 'अम्बिकावन')
('खानी', 'खगनी')

`Trainer.fit` stopped: `max_epochs=5` reached.


In [15]:
# Trial run of the Runner: 1-layer LSTM, enc_bidirect on, no attention, one epoch.
# Tuning notes: keep the embedding small (around 32) - important to get a dense
# embedding - and adjust the learning rate reasonably.

runner = Runner(
    src_lang=SRC_LANG,
    tar_lang=TAR_LANG,
    common_embed_size=32,
    common_num_layers=1,
    common_hidden_size=128,
    common_cell_type='LSTM',
    enc_bidirect=True,
    attention=False,
    dropout=0.2,
    opt_name='Adam',
    learning_rate=1e-3,
    batch_size=128,
)
trainer = lt.Trainer(max_epochs=1)
trainer.fit(runner)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | encoder        | EncoderNet       | 199 K 
1 | decoder        | DecoderNet       | 93.9 K
2 | model          | EncoderDecoder   | 293 K 
3 | loss_criterion | CrossEntropyLoss | 0     
----------------------------------------------------
293 K     Trainable params
0         Non-trainable params
293 K     Total params
1.175     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [16]:
# Trial run of the Runner: 1-layer GRU, enc_bidirect on, no attention, one epoch.
# Tuning notes: keep the embedding small (around 32) - important to get a dense
# embedding - and adjust the learning rate reasonably.

runner = Runner(
    src_lang=SRC_LANG,
    tar_lang=TAR_LANG,
    common_embed_size=32,
    common_num_layers=1,
    common_hidden_size=128,
    common_cell_type='GRU',
    enc_bidirect=True,
    attention=False,
    dropout=0.2,
    opt_name='Adam',
    learning_rate=1e-3,
    batch_size=128,
)
trainer = lt.Trainer(max_epochs=1)
trainer.fit(runner)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | encoder        | EncoderNet       | 158 K 
1 | decoder        | DecoderNet       | 73.2 K
2 | model          | EncoderDecoder   | 231 K 
3 | loss_criterion | CrossEntropyLoss | 0     
----------------------------------------------------
231 K     Trainable params
0         Non-trainable params
231 K     Total params
0.926     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [17]:
# Trial run of the Runner: 1-layer GRU, enc_bidirect on, WITH attention, one epoch.
# Tuning notes: keep the embedding small (around 32) - important to get a dense
# embedding - and adjust the learning rate reasonably.

runner = Runner(
    src_lang=SRC_LANG,
    tar_lang=TAR_LANG,
    common_embed_size=32,
    common_num_layers=1,
    common_hidden_size=128,
    common_cell_type='GRU',
    enc_bidirect=True,
    attention=True,
    dropout=0.2,
    opt_name='Adam',
    learning_rate=1e-3,
    batch_size=128,
)
trainer = lt.Trainer(max_epochs=1)
trainer.fit(runner)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | encoder        | EncoderNet       | 158 K 
1 | attn_layer     | Attention        | 49.4 K
2 | decoder        | DecoderNet       | 220 K 
3 | model          | EncoderDecoder   | 379 K 
4 | loss_criterion | CrossEntropyLoss | 0     
----------------------------------------------------
379 K     Trainable params
0         Non-trainable params
379 K     Total params
1.517     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [18]:
# Trial run of the Runner: 3-layer LSTM, enc_bidirect on, no attention, one epoch.
# Tuning notes: keep the embedding small (around 32) - important to get a dense
# embedding - and adjust the learning rate reasonably.

runner = Runner(
    src_lang=SRC_LANG,
    tar_lang=TAR_LANG,
    common_embed_size=32,
    common_num_layers=3,
    common_hidden_size=128,
    common_cell_type='LSTM',
    enc_bidirect=True,
    attention=False,
    dropout=0.2,
    opt_name='Adam',
    learning_rate=1e-3,
    batch_size=128,
)
trainer = lt.Trainer(max_epochs=1)
trainer.fit(runner)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | encoder        | EncoderNet       | 1.1 M 
1 | decoder        | DecoderNet       | 358 K 
2 | model          | EncoderDecoder   | 1.4 M 
3 | loss_criterion | CrossEntropyLoss | 0     
----------------------------------------------------
1.4 M     Trainable params
0         Non-trainable params
1.4 M     Total params
5.657     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [19]:
# Trial run of the Runner: 3-layer LSTM, enc_bidirect on, no attention, one epoch.
# NOTE(review): these arguments are identical to the cell executed as In [18];
# this looks like a repeated run - confirm whether both cells are needed.
# Tuning notes: keep the embedding small (around 32) - important to get a dense
# embedding - and adjust the learning rate reasonably.

runner = Runner(
    src_lang=SRC_LANG,
    tar_lang=TAR_LANG,
    common_embed_size=32,
    common_num_layers=3,
    common_hidden_size=128,
    common_cell_type='LSTM',
    enc_bidirect=True,
    attention=False,
    dropout=0.2,
    opt_name='Adam',
    learning_rate=1e-3,
    batch_size=128,
)
trainer = lt.Trainer(max_epochs=1)
trainer.fit(runner)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | encoder        | EncoderNet       | 1.1 M 
1 | decoder        | DecoderNet       | 358 K 
2 | model          | EncoderDecoder   | 1.4 M 
3 | loss_criterion | CrossEntropyLoss | 0     
----------------------------------------------------
1.4 M     Trainable params
0         Non-trainable params
1.4 M     Total params
5.657     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [20]:
# Trial run of the Runner: 3-layer GRU, enc_bidirect on, no attention, one epoch.
# Tuning notes: keep the embedding small (around 32) - important to get a dense
# embedding - and adjust the learning rate reasonably.

runner = Runner(
    src_lang=SRC_LANG,
    tar_lang=TAR_LANG,
    common_embed_size=32,
    common_num_layers=3,
    common_hidden_size=128,
    common_cell_type='GRU',
    enc_bidirect=True,
    attention=False,
    dropout=0.2,
    opt_name='Adam',
    learning_rate=1e-3,
    batch_size=128,
)
trainer = lt.Trainer(max_epochs=1)
trainer.fit(runner)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | encoder        | EncoderNet       | 816 K 
1 | decoder        | DecoderNet       | 271 K 
2 | model          | EncoderDecoder   | 1.1 M 
3 | loss_criterion | CrossEntropyLoss | 0     
----------------------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.353     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [21]:
# Trial run of the Runner: 3-layer GRU, enc_bidirect on, WITH attention, one epoch.
# Tuning notes: keep the embedding small (around 32) - important to get a dense
# embedding - and adjust the learning rate reasonably.

runner = Runner(
    src_lang=SRC_LANG,
    tar_lang=TAR_LANG,
    common_embed_size=32,
    common_num_layers=3,
    common_hidden_size=128,
    common_cell_type='GRU',
    enc_bidirect=True,
    attention=True,
    dropout=0.2,
    opt_name='Adam',
    learning_rate=1e-3,
    batch_size=128,
)
trainer = lt.Trainer(max_epochs=1)
trainer.fit(runner)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | encoder        | EncoderNet       | 816 K 
1 | attn_layer     | Attention        | 49.4 K
2 | decoder        | DecoderNet       | 419 K 
3 | model          | EncoderDecoder   | 1.2 M 
4 | loss_criterion | CrossEntropyLoss | 0     
----------------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.944     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
