In [3]:
import os
import pandas as pd
import json
from utils import decoder_function


In [1]:
from torch.utils.data import Dataset
import pandas as pd
from utils import str_idx_to_list
import torch


class CustomTextDataset(Dataset):
    def __init__(
        self,
        dataset_df: pd.DataFrame,
        X_max_length: int,
        Y_max_length: int,
        X_vocab_size: int,
        Y_vocab_size: int,
    )-> tuple(torch.tensor,torch.tensor,torch.tensor,torch.tensor,torch.tensor,torch.tensor):
        """
        Text dataset for translit from English to Tamil task.

        Args:
            dataset_df (pd.DataFrame): _description_
            X_max_length (int): Maximum length of the word (in X) in the dataset
            Y_max_length (int): Maximum length of the word (in Y) in the dataset
            X_vocab_size (int): Size of the X vocabulary. This is to add padding integer.
            Y_vocab_size (int): Size of the Y vocabulary. This is to add padding integer.
        """
        self.dataset_df = dataset_df
        self.X_max_length = X_max_length
        self.Y_max_length = Y_max_length
        self.X_vocab_size = X_vocab_size
        self.Y_vocab_size = Y_vocab_size

    def __len__(self):
        return len(self.dataset_df)

    def __getitem__(self, idx: int):
        X = str_idx_to_list(self.dataset_df.iloc[idx]["English"])  # .values[0]
        Y = str_idx_to_list(self.dataset_df.iloc[idx]["Tamil"])  # .values[0]

        ## Decoder input y
        Y_decoder_ip = Y[:-1]
        ## Decoder output y
        Y_decoder_op = Y[1:]
        ## The actual length of the sequence
        X_len = len(X)
        Y_decoder_ip_len = len(Y_decoder_ip)
        Y_decoder_op_len = len(Y_decoder_op)
        if X_len < self.X_max_length:
            ## self.X_vocab_size refer to the padding index (last)
            X.extend([self.X_vocab_size] * (self.X_max_length - X_len))
        ## Decoder IP
        if Y_decoder_ip_len < self.Y_max_length:
            ## self.Y_vocab_size refer to the padding index (last)
            Y_decoder_ip.extend(
                [self.Y_vocab_size] * (self.Y_max_length - Y_decoder_ip_len)
            )
        if Y_decoder_op_len < self.Y_max_length:
            ## self.Y_vocab_size refer to the padding index (last)
            Y_decoder_op.extend(
                [self.Y_vocab_size] * (self.Y_max_length - Y_decoder_op_len)
            )
        ## Padding index
        ## X : English
        ## Y : Tamil
        
        

        X = torch.tensor(X, dtype=torch.long)
        Y_decoder_ip = torch.tensor(Y_decoder_ip, dtype=torch.long)
        Y_decoder_op = torch.tensor(Y_decoder_op, dtype=torch.long)

        X_len = torch.tensor(X_len, dtype=torch.long)
        Y_decoder_ip_len = torch.tensor(Y_decoder_ip_len, dtype=torch.long)
        Y_decoder_op_len = torch.tensor(Y_decoder_op_len, dtype=torch.long)
        return X, Y_decoder_ip, Y_decoder_op, X_len, Y_decoder_ip_len, Y_decoder_op_len


In [4]:
## 
DATASET_PATH = os.path.join("..","dataset","dakshina_dataset_v1.0","ta","lexicons")
TRAIN = "ta.translit.sampled.train.tsv"
VAL = "ta.translit.sampled.dev.tsv"
TEST =   "ta.translit.sampled.test.tsv"

In [7]:
# x,y_dec_ip,y_dec_op,X_len, Y_decoder_ip_len, Y_decoder_op_len = train_dataset.__getitem__(867)
# print(decoder_function(character_idx_seq=','.join([str(i) for i in x.detach().tolist()]), idx_to_char_dict=english_idx_to_char))
# print(decoder_function(character_idx_seq=','.join([str(i) for i in y_dec_ip.detach().tolist()]), idx_to_char_dict=tamil_idx_to_char))
# print(decoder_function(character_idx_seq=','.join([str(i) for i in y_dec_op.detach().tolist()]), idx_to_char_dict=tamil_idx_to_char))

# print(x.__len__())
# print(y_dec_ip.__len__())
# print(y_dec_op.__len__())
# print("lengths")
# print(X_len, Y_decoder_ip_len, Y_decoder_op_len)


adaiyaalam--------------------
	அடையாளம்-----------------
அடையாளம்
-----------------
30
26
26
lengths
tensor(10) tensor(9) tensor(9)


In [9]:
## Build the RNN network
from lightning import LightningModule
import torch.nn as nn
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


## TODO: use torch.nn.utils.rnn.pack_padded_sequence
## TODO: torch.nn.utils.rnn.pad_packed_sequence


class EncoderNetwork(LightningModule):
    def __init__(self, config: dict):
        super(EncoderNetwork, self).__init__()
        self.config = config
        ## encoder
        ### Embedding layer
        #### Here config.X_vocab_size will be the padding index also
        self.embedding = nn.Embedding(
            self.config.X_vocab_size,
            self.config.encoder_embedding_size,
            padding_idx=self.config.X_padding_idx,
        )

        if self.config.recurrent_layer_type == "RNN":
            self.recursive_layer = nn.RNN(
                input_size=self.config.encoder_embedding_size,
                hidden_size=self.config.encoder_hidden_size,
                num_layers=self.config.num_encoder_layers,
                dropout=self.config.encoder_dropout_prob,
                bidirectional=self.config.encoder_bidir,
                nonlinearity=self.config.encoder_nonlinearity,
                batch_first=True,
            )

    def forward(self, x, lengths):
        ## Initialize H0
        ##! The doc said the H0 will dafault to zeros. Going to check this https://pytorch.org/docs/stable/generated/torch.nn.RNN.html#torch.nn.RNN
        e_x = self.embedding(x)

        ## Pack the padded input for better computation
        packed = pack_padded_sequence(e_x, lengths.cpu(), batch_first=True, enforce_sorted=False)

        output_packed, h_n = self.recursive_layer(packed)

        output, _ = pad_packed_sequence(output_packed, batch_first=True, total_length = self.config.X_max_length)
        return e_x, output, h_n


class DecoderNetwork(nn.Module):
    def __init__(self, config: dict):
        super(DecoderNetwork, self).__init__()
        self.config = config
        ## Tamil encoder
        self.embedding = nn.Embedding(
            num_embeddings=config.Y_vocab_size,
            embedding_dim=config.decoder_embedding_size,
            padding_idx=config.Y_padding_idx,
        )
        self.rnn = nn.RNN(
            input_size=config.decoder_embedding_size,
            hidden_size=config.decoder_hidden_size,
            num_layers=config.num_decoder_layers,
            dropout=config.decoder_dropout_prob,
            bidirectional=config.decoder_bidir,
            nonlinearity=config.decoder_nonlinearity,
            batch_first=True,
        )
        ## Here the Vocab size should be one less as we added 1 for the embedding layer
        self.fc = nn.Linear(config.decoder_hidden_size, config.Y_vocab_size-1)

    def forward(self, y_decoder_input, encoder_hidden):
        """
        y_decoder_input: (batch, tgt_seq_len)
        encoder_hidden: (num_layers * num_directions, batch, hidden_size)
        """
        embedded = self.embedding(y_decoder_input)  # (batch, tgt_seq_len, embed_dim)
        output, hidden = self.rnn(embedded, encoder_hidden)
        # output: (batch, tgt_seq_len, hidden_size)
        logits = self.fc(output)  # (batch, tgt_seq_len, vocab_size)
        return logits, hidden


In [34]:
# #from RecursiveNetwork import EncoderNetwork, DecoderNetwork
# train_iter = iter(train_loader)
# x,y_dec_ip,y_dec_op, x_len,_,_ = next(train_iter)

# print(x.shape)
# print(y_dec_ip.shape)
# print(y_dec_op.shape)
# print(x_len, x_len.shape)

torch.Size([2, 30])
torch.Size([2, 26])
torch.Size([2, 26])
tensor([10,  5]) torch.Size([2])


In [14]:
class Config:
    def __init__(self):
        self.wandb_project = "assignment_3"
        self.wandb_entity = "v1"
        self.epoch = 5
        self.batch_size = 16
        self.encoder_embedding_size = 128
        self.X_vocab_size = 26+1 ## Here +1 is because the nn.Embedding layer throws this error AssertionError =  Padding_idx must be within num_embeddings
        self.X_padding_idx = 26
        self.X_max_length = 30
        self.Y_max_length = 26
        self.recurrent_layer_type = "RNN"
        self.encoder_hidden_size = 256
        self.num_encoder_layers = 1
        self.encoder_dropout_prob = 0.0
        self.encoder_bidir = False
        self.encoder_nonlinearity = "tanh"
        self.Y_vocab_size = 48+1 ## Here +1 is because the nn.Embedding layer throws this error AssertionError =  Padding_idx must be within num_embeddings
        self.decoder_embedding_size = 128
        self.Y_padding_idx = 48
        self.decoder_hidden_size = 256
        self.num_decoder_layers = 1
        self.decoder_dropout_prob = 0.0
        self.decoder_bidir = False
        self.decoder_nonlinearity = "tanh"
        self.Y_true_vocab_size = 48 ## No need for extra digit for padding s required by nn.Embedding
        self.LR = 1e-3



In [15]:
# config = Config()
# RNN_enc_net = EncoderNetwork(config = config)
# RNN_dec_net = DecoderNetwork(config = config)

In [16]:
# enc_, out, h_out = RNN_enc_net.forward(x,x_len)

In [17]:
# print(out.shape, h_out.shape)

In [39]:
enc_[0].shape

torch.Size([30, 128])

In [40]:
logits, hidden = RNN_dec_net.forward(y_dec_ip,encoder_hidden=h_out)

In [41]:
logits.shape

torch.Size([2, 26, 48])

In [18]:
import torch
import torch.nn as nn

from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F
from lightning.pytorch.loggers import WandbLogger

from lightning import LightningModule
from lightning import Trainer, seed_everything

SEED = 5
seed_everything(SEED, workers=True)

class Seq2SeqModel(LightningModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.encoder = EncoderNetwork(self.config)
        self.decoder = DecoderNetwork(self.config)
        self.loss_fn = nn.CrossEntropyLoss(
            ignore_index=self.config.Y_padding_idx  # Mask out padding positions
        )
        
        self.train_correct = 0
        self.train_total = 0
        self.val_correct = 0
        self.val_total = 0

        self.train_loss = []
        self.val_loss = []

    def forward(self, x, X_len, y_dec_ip):
        # Encoder forward (optionally use X_len for packing)
        _, _, encoder_hidden = self.encoder(x, X_len)
        # Decoder forward
        logits, _ = self.decoder(y_dec_ip, encoder_hidden)
        return logits

    def training_step(self, batch):
        
        x,y_dec_ip,y_dec_op,X_len,_,_ = batch  
        
        logits = self(x, X_len, y_dec_ip)  # (batch, tgt_len, vocab_size)
        ## reshaping to match the required shape of (N,C) for logits 
        ## and (N,) for label
        logits = logits.view(-1, logits.size(-1))
        targets = y_dec_op.view(-1) ## Flatten the decoder 
        loss = self.loss_fn(logits, targets)

        ## Accuracy and loss tracking
        prob = F.softmax(logits, dim=1)
        preds = torch.argmax(prob, dim=1)
        correct = (preds == targets).sum().item()
        batch_size = logits.size(0)

        # Update counters
        self.train_correct += correct
        self.train_total += batch_size
        self.train_loss.append(loss.view(1).cpu())



        #self.log("train_loss", loss)
        return loss
    
    def validation_step(self, batch):
        
        x,y_dec_ip,y_dec_op,X_len,_,_ = batch  
        
        logits = self(x, X_len, y_dec_ip)  # (batch, tgt_len, vocab_size)
        ## reshaping to match the required shape of (N,C) for logits 
        ## and (N,) for label
        logits = logits.view(-1, logits.size(-1))
        targets = y_dec_op.view(-1) ## Flatten the decoder 
        loss = self.loss_fn(logits, targets)

        ## Accuracy and loss tracking
        prob = F.softmax(logits, dim=1)
        preds = torch.argmax(prob, dim=1)
        correct = (preds == targets).sum().item()
        batch_size = logits.size(0)

        # Update counters
        self.val_correct += correct
        self.val_total += batch_size
        self.val_loss.append(loss.view(1).cpu())



        #self.log("val_loss", loss)
        return loss
    
    def on_train_epoch_end(self):
        # Calculate epoch accuracy
        epoch_acc = self.train_correct / self.train_total
        self.log("train_acc_epoch", epoch_acc)
        if len(self.train_loss) > 0:
            self.log("train_loss_epoch", torch.cat(self.train_loss).mean())
        # Reset lists
        self.train_correct = 0
        self.train_total = 0
        self.train_loss = []

    def on_validation_epoch_end(self):
        # Calculate epoch accuracy
        epoch_acc = self.val_correct / self.val_total
        self.log("val_acc_epoch", epoch_acc)
        if len(self.val_loss) > 0:
            self.log("val_loss_epoch", torch.cat(self.val_loss).mean())
        # Reset lists
        self.val_correct = 0
        self.val_total = 0
        self.val_loss = []

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.config.LR)
        lr_scheduler_config = {
            "scheduler": ReduceLROnPlateau(
                optimizer=optimizer, mode="max", factor=0.1, patience=2
            ),
            "interval": "epoch",
            "frequency": 1,
            "monitor": "val_acc_epoch",
            # If set to `True`, will enforce that the value specified 'monitor'
            # is available when the scheduler is updated, thus stopping
            # training if not found. If set to `False`, it will only produce a warning
            "strict": True,
            "name": "LR_track",
        }
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler_config}


Seed set to 5


In [19]:
config = Config()

In [51]:
train_df = pd.read_csv(os.path.join(DATASET_PATH, "ta.translit.sampled.train.idx.csv"))
val_df = pd.read_csv(os.path.join(DATASET_PATH, "ta.translit.sampled.dev.idx.csv"))
test_df = pd.read_csv(os.path.join(DATASET_PATH, "ta.translit.sampled.test.idx.csv"))


with open(os.path.join(DATASET_PATH,"tamil_token_index.json"),'r', encoding="utf-8") as f:
	tamil_idx = json.load(f)
tamil_idx_to_char = {j:i for i,j in tamil_idx.items()}

with open(os.path.join(DATASET_PATH,"english_token_index.json"),'r', encoding="utf-8") as f:
	english_idx = json.load(f)
english_idx_to_char = {j:i for i,j in english_idx.items()}

## For padding
english_idx_to_char[26] = "-"
tamil_idx_to_char[48] = "-"

In [54]:
train_df.shape

(68218, 2)

In [71]:
dataset = CustomTextDataset(dataset_df = test_df, X_max_length = 30,
        Y_max_length = 27,
        X_vocab_size = 26,
        Y_vocab_size = 48,)

In [72]:

for i in range(dataset.__len__()):
    X, Y_decoder_ip, Y_decoder_op, X_len, Y_decoder_ip_len, Y_decoder_op_len = dataset.__getitem__(i)
    if Y_decoder_ip.shape[0] != 27:
        print("idx",i,"Y_decoder_ip",Y_decoder_ip.shape[0] )
    if Y_decoder_op.shape[0] != 27:
        print("idx",i,"Y_decoder_op",Y_decoder_op.shape[0] )
    if X.shape[0] != 30:
        print("idx",i,"X" )
    

In [None]:
## Train 38766 38767 Y_decoder_op

In [40]:
X, Y_decoder_ip, Y_decoder_op, X_len, Y_decoder_ip_len, Y_decoder_op_len = train_dataset.__getitem__(38766)
Y_decoder_op.shape

torch.Size([27])

In [24]:
from torch.utils.data import DataLoader
train_dataset = CustomTextDataset(dataset_df = train_df[:200], X_max_length = 30,
        Y_max_length = 26,
        X_vocab_size = 26,
        Y_vocab_size = 48,)

val_dataset = CustomTextDataset(dataset_df = val_df[:200], X_max_length = 30,
        Y_max_length = 26,
        X_vocab_size = 26,
        Y_vocab_size = 48,)

train_loader = DataLoader(
    train_dataset,
    batch_size=config.batch_size,
    shuffle=True,
    drop_last=True,
    pin_memory=True,
    # num_workers=2,
)

val_loader = DataLoader(
    val_dataset,
    batch_size=config.batch_size,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
    # num_workers=2,
)

In [25]:

lit_model = Seq2SeqModel(config=config)

wandb_logger = WandbLogger(
        project=config.wandb_project,
        name=config.wandb_entity,
        log_model=False,
        config=config,
    )

In [26]:
import wandb
wandb.login()
trainer = Trainer(
    max_epochs=config.epoch,
    accelerator="auto",
    log_every_n_steps=100,
    logger=wandb_logger,
    #callbacks=[checkpoint_callback],
)  # Added accelerator gpu, can be cpu also, devices set to 1

trainer.fit(
    lit_model,
    train_loader,
    val_loader,
)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | encoder | EncoderNetwork   | 102 K  | train
1 | decoder | DecoderNetwork   | 117 K  | train
2 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
219 K     Trainable params
0         Non-trainable params
219 K     Total params
0.879     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode


                                                                           

c:\Users\yuvar\miniconda3\envs\fastapi\Lib\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (12) is smaller than the logging interval Trainer(log_every_n_steps=100). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 4: 100%|██████████| 12/12 [00:00<00:00, 15.39it/s, v_num=b9ex]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 12/12 [00:00<00:00, 14.64it/s, v_num=b9ex]
