In [1]:
import os
import pandas as pd
import json
from utils import decoder_function


In [2]:
from torchvision.transforms import v2
from torchvision.transforms.v2 import Normalize
from torchvision.transforms import Resize
from torch.utils.data import Dataset
import pandas as pd
from utils import str_idx_to_list
import torch


class CustomTextDataset(Dataset):
    """Index-encoded English -> Tamil transliteration pairs, right-padded to fixed lengths."""

    def __init__(
        self,
        dataset_df: pd.DataFrame,
        X_max_length: int,
        Y_max_length: int,
        X_vocab_size: int,
        Y_vocab_size: int,
    ):
        """
        Keep the encoded pairs together with the padding parameters.

        Args:
            dataset_df (pd.DataFrame): Frame with an "English" and a "Tamil" column.
                Each cell is parsed with ``str_idx_to_list`` when a sample is read.
            X_max_length (int): Length that shorter X (English) sequences are padded up to.
            Y_max_length (int): Length that shorter decoder input/output (Tamil)
                sequences are padded up to.
            X_vocab_size (int): Integer used as the padding value for X.
            Y_vocab_size (int): Integer used as the padding value for Y.
        """
        self.dataset_df = dataset_df
        self.X_max_length = X_max_length
        self.Y_max_length = Y_max_length
        self.X_vocab_size = X_vocab_size
        self.Y_vocab_size = Y_vocab_size

    def __len__(self):
        """Number of rows in the underlying frame."""
        return self.dataset_df.shape[0]

    @staticmethod
    def _right_pad(sequence, target_length: int, pad_value: int):
        """Return a copy of ``sequence`` with ``pad_value`` appended up to ``target_length``.

        A sequence that is already ``target_length`` or longer is returned
        unchanged in content; it is NOT truncated.
        """
        padded = list(sequence)
        # A non-positive multiplier yields an empty list, so nothing is appended then.
        padded.extend([pad_value] * (target_length - len(padded)))
        return padded

    def __getitem__(self, idx: int):
        """
        Build one training sample from row ``idx``.

        Returns:
            tuple: ``(X, Y_decoder_ip, Y_decoder_op, X_len, Y_decoder_ip_len,
            Y_decoder_op_len)``, all ``torch.long`` tensors. The first three are
            the padded sequences; the last three are scalar tensors holding the
            sequence lengths measured before padding.
        """
        row = self.dataset_df.iloc[idx]
        source_ids = str_idx_to_list(row["English"])
        target_ids = str_idx_to_list(row["Tamil"])

        # Teacher forcing: the decoder input drops the last target token and the
        # decoder output drops the first one.
        decoder_input_ids = target_ids[:-1]
        decoder_output_ids = target_ids[1:]

        # Lengths are recorded before any padding is appended.
        unpadded_lengths = (
            len(source_ids),
            len(decoder_input_ids),
            len(decoder_output_ids),
        )
        padded_sequences = (
            self._right_pad(source_ids, self.X_max_length, self.X_vocab_size),
            self._right_pad(decoder_input_ids, self.Y_max_length, self.Y_vocab_size),
            self._right_pad(decoder_output_ids, self.Y_max_length, self.Y_vocab_size),
        )

        return tuple(
            torch.tensor(value, dtype=torch.long)
            for value in padded_sequences + unpadded_lengths
        )


In [3]:
## 
# Relative path to the Tamil ("ta") lexicon folder of the Dakshina v1.0 dataset.
DATASET_PATH = os.path.join(
    "..",
    "dataset",
    "dakshina_dataset_v1.0",
    "ta",
    "lexicons",
)
# File names of the sampled transliteration splits.
TRAIN = "ta.translit.sampled.train.tsv"
VAL = "ta.translit.sampled.dev.tsv"
TEST = "ta.translit.sampled.test.tsv"

In [4]:
# Training split in its index-encoded CSV form.
char_idx_df = pd.read_csv(
    os.path.join(DATASET_PATH, "ta.translit.sampled.train.idx.csv")
)

# Each JSON file is loaded as-is and then inverted (value -> key) so that an
# index can be mapped back to its character when decoding.
with open(
    os.path.join(DATASET_PATH, "tamil_token_index.json"), "r", encoding="utf-8"
) as index_file:
    tamil_idx = json.load(index_file)
tamil_idx_to_char = {index: token for token, index in tamil_idx.items()}

with open(
    os.path.join(DATASET_PATH, "english_token_index.json"), "r", encoding="utf-8"
) as index_file:
    english_idx = json.load(index_file)
english_idx_to_char = {index: token for token, index in english_idx.items()}

# Padding indices (26 for English, 48 for Tamil) are rendered as "-" when decoding.
english_idx_to_char[26] = "-"
tamil_idx_to_char[48] = "-"

In [5]:
from torch.utils.data import DataLoader

# X_vocab_size / Y_vocab_size are the integers the dataset appends as padding;
# the max lengths are the widths that shorter sequences are padded up to.
text_dataset = CustomTextDataset(
    dataset_df=char_idx_df,
    X_max_length=30,
    Y_max_length=26,
    X_vocab_size=26,
    Y_vocab_size=48,
)

# Tiny shuffled batches for inspecting shapes; incomplete final batches are dropped.
train_loader = DataLoader(
    dataset=text_dataset,
    batch_size=2,
    shuffle=True,
    drop_last=True,
    pin_memory=True,
    # num_workers=2,
)

In [6]:
# Fetch one sample through normal indexing instead of calling __getitem__ directly.
x, y_dec_ip, y_dec_op, X_len, Y_decoder_ip_len, Y_decoder_op_len = text_dataset[867]

# Decode each padded index sequence back to text with the matching index -> char map.
for sequence, idx_to_char in (
    (x, english_idx_to_char),
    (y_dec_ip, tamil_idx_to_char),
    (y_dec_op, tamil_idx_to_char),
):
    print(
        decoder_function(
            character_idx_seq=",".join(str(i) for i in sequence.tolist()),
            idx_to_char_dict=idx_to_char,
        )
    )

# Padded lengths of the three sequences.
for sequence in (x, y_dec_ip, y_dec_op):
    print(len(sequence))

adaiyaalam--------------------
	அடையாளம்-----------------
அடையாளம்
-----------------
30
26
26


In [None]:
## Build the RNN network
from pytorch_lightning import LightningModule
import torch.nn as nn
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


## TODO: use torch.nn.utils.rnn.pack_padded_sequence
## TODO: torch.nn.utils.rnn.pad_packed_sequence


class EncoderNetwork(LightningModule):
    """Embedding layer followed by a recurrent layer over padded index sequences.

    Config keys read: ``X_vocab_size``, ``encoder_embedding_size``,
    ``X_padding_idx``, ``recurrent_layer_type``, ``encoder_hidden_size``,
    ``num_encoder_layers``, ``encoder_dropout_prob``, ``encoder_bidir`` and
    ``encoder_nonlinearity``.
    """

    def __init__(self, config: dict):
        """
        Args:
            config (dict): Hyper-parameters; see the class docstring for the keys.

        Raises:
            ValueError: If ``config["recurrent_layer_type"]`` is not ``"RNN"``,
                the only layer type implemented here.
        """
        super().__init__()
        self.config = config

        # Fail at construction time instead of leaving `recursive_layer`
        # undefined and crashing later inside forward().
        layer_type = self.config["recurrent_layer_type"]
        if layer_type != "RNN":
            raise ValueError(
                f"Unsupported recurrent_layer_type {layer_type!r}; only 'RNN' is implemented."
            )

        # config["X_vocab_size"] must already include the slot used by the
        # padding index, because nn.Embedding requires padding_idx < num_embeddings.
        self.embedding = nn.Embedding(
            self.config["X_vocab_size"],
            self.config["encoder_embedding_size"],
            padding_idx=self.config["X_padding_idx"],
        )

        self.recursive_layer = nn.RNN(
            input_size=self.config["encoder_embedding_size"],
            hidden_size=self.config["encoder_hidden_size"],
            num_layers=self.config["num_encoder_layers"],
            dropout=self.config["encoder_dropout_prob"],
            bidirectional=self.config["encoder_bidir"],
            nonlinearity=self.config["encoder_nonlinearity"],
            batch_first=True,
        )

    def forward(self, x, lengths=None):
        """
        Encode a batch of padded index sequences.

        Args:
            x: Index tensor with the batch on dim 0 and time on dim 1
                (the recurrent layer is built with ``batch_first=True``).
            lengths: Per-sequence lengths before padding. When omitted they are
                inferred by counting the entries of ``x`` that differ from
                ``config["X_padding_idx"]``; this assumes padding is only
                appended at the end of each sequence.

        Returns:
            tuple: ``(e_x, output, h_n)`` - the embedded input, the recurrent
            outputs re-padded to the time dimension of ``x``, and the final
            hidden state returned by the recurrent layer.
        """
        # No initial hidden state is passed; nn.RNN documents that h_0 then
        # defaults to zeros.
        e_x = self.embedding(x)

        if lengths is None:
            lengths = (x != self.config["X_padding_idx"]).sum(dim=1)

        # Pack so the recurrent layer skips padded steps. pack_padded_sequence
        # expects the lengths on the CPU.
        packed = pack_padded_sequence(
            e_x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        output_packed, h_n = self.recursive_layer(packed)

        # total_length keeps the output time dimension equal to the input's,
        # instead of shrinking it to the longest sequence in this batch.
        output, _ = pad_packed_sequence(
            output_packed, batch_first=True, total_length=x.size(1)
        )
        return e_x, output, h_n


class DecoderNetwork(nn.Module):
    """Embedding -> RNN -> linear projection producing per-step vocabulary logits.

    Config keys read: ``Y_vocab_size``, ``decoder_embedding_size``,
    ``Y_padding_idx``, ``decoder_hidden_size``, ``num_decoder_layers``,
    ``decoder_dropout_prob``, ``decoder_bidir`` and ``decoder_nonlinearity``.
    """

    def __init__(self, config: dict):
        """
        Args:
            config (dict): Hyper-parameters; see the class docstring for the keys.
        """
        super().__init__()
        self.config = config
        # Embedding for the decoder-side (target) indices.
        self.embedding = nn.Embedding(
            num_embeddings=config["Y_vocab_size"],
            embedding_dim=config["decoder_embedding_size"],
            padding_idx=config["Y_padding_idx"],
        )
        self.rnn = nn.RNN(
            input_size=config["decoder_embedding_size"],
            hidden_size=config["decoder_hidden_size"],
            num_layers=config["num_decoder_layers"],
            dropout=config["decoder_dropout_prob"],
            bidirectional=config["decoder_bidir"],
            nonlinearity=config["decoder_nonlinearity"],
            batch_first=True,
        )
        # A bidirectional RNN concatenates both directions, doubling the
        # feature size of its output; the projection must match that width.
        num_directions = 2 if config["decoder_bidir"] else 1
        self.fc = nn.Linear(
            config["decoder_hidden_size"] * num_directions, config["Y_vocab_size"]
        )

    def forward(self, y_decoder_input, encoder_hidden):
        """
        Args:
            y_decoder_input: (batch, tgt_seq_len) index tensor.
            encoder_hidden: (num_layers * num_directions, batch, hidden_size)
                initial hidden state for the decoder RNN.

        Returns:
            tuple: ``(logits, hidden)`` where ``logits`` is
            (batch, tgt_seq_len, vocab_size) and ``hidden`` is the final hidden
            state returned by the RNN.
        """
        embedded = self.embedding(y_decoder_input)  # (batch, tgt_seq_len, embed_dim)
        output, hidden = self.rnn(embedded, encoder_hidden)
        # output: (batch, tgt_seq_len, num_directions * hidden_size)
        logits = self.fc(output)  # (batch, tgt_seq_len, vocab_size)
        return logits, hidden


In [7]:
# from RecursiveNetwork import EncoderNetwork, DecoderNetwork
train_iter = iter(train_loader)
# CustomTextDataset.__getitem__ returns six tensors (three padded sequences
# followed by their three unpadded lengths), so a batch must be unpacked into
# six names; unpacking into three raises "too many values to unpack".
x, y_dec_ip, y_dec_op, X_len, Y_decoder_ip_len, Y_decoder_op_len = next(train_iter)

print(x.shape)
print(y_dec_ip.shape)
print(y_dec_op.shape)

torch.Size([2, 30])
torch.Size([2, 26])
torch.Size([2, 26])


In [8]:
# Hyper-parameters shared by the encoder and decoder networks.
# The two *_vocab_size entries are the real vocabulary size + 1: nn.Embedding
# asserts "Padding_idx must be within num_embeddings", so the padding index
# needs its own slot in the embedding table.
config = dict(
    # Encoder (English side)
    encoder_embedding_size=128,
    X_vocab_size=26 + 1,
    X_padding_idx=26,
    recurrent_layer_type="RNN",
    encoder_hidden_size=256,
    num_encoder_layers=1,
    encoder_dropout_prob=0.0,
    encoder_bidir=False,
    encoder_nonlinearity="tanh",
    # Decoder (Tamil side)
    Y_vocab_size=48 + 1,
    decoder_embedding_size=128,
    Y_padding_idx=48,
    decoder_hidden_size=256,
    num_decoder_layers=1,
    decoder_dropout_prob=0.0,
    decoder_bidir=False,
    decoder_nonlinearity="tanh",
    # Vocabulary size without the extra padding slot required by nn.Embedding.
    Y_true_vocab_size=48,
)


In [9]:
# Build both halves of the seq2seq model from the same hyper-parameter dict.
RNN_enc_net = EncoderNetwork(config)
RNN_dec_net = DecoderNetwork(config)

In [10]:
# EncoderNetwork.forward needs the unpadded length of every sequence. The
# dataset only appends padding at the end, so counting the non-padding entries
# of each row recovers those lengths directly from the batch.
x_lengths = (x != config["X_padding_idx"]).sum(dim=1)
# Call the module itself rather than .forward so registered hooks also run.
enc_, out, h_out = RNN_enc_net(x, x_lengths)

In [11]:
# Shapes of the encoder's padded per-step outputs and of its final hidden state.
print(out.shape, h_out.shape)

torch.Size([2, 30, 256]) torch.Size([1, 2, 256])


In [12]:
# Embedding-layer output for the first sequence in the batch.
enc_[0]

tensor([[-0.0265, -1.1140, -2.0928,  ..., -1.0514,  0.8493, -1.5793],
        [ 0.4141, -0.0973, -2.3049,  ...,  0.1408, -0.1397,  0.0620],
        [-1.1666,  0.8203,  0.3888,  ...,  1.5981, -1.2054, -0.8678],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<SelectBackward0>)

In [14]:
# Seed the decoder with the encoder's final hidden state. Call the module
# itself rather than .forward so registered hooks also run.
logits, hidden = RNN_dec_net(y_dec_ip, encoder_hidden=h_out)

In [15]:
# Decoder logits are laid out as (batch, tgt_seq_len, vocab_size).
logits.shape

torch.Size([2, 26, 49])