In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import json
import numpy as np
import re
import string
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm


In [2]:

# Fetch the 'punkt_tab' tokenizer data through NLTK's downloader so that it is available
# before nltk.word_tokenize is called in preprocess() further down.
# NOTE(review): 'punkt_tab' is presumably the newer, non-pickled packaging of the Punkt
# tokenizer models rather than anything specific to tab-separated input — confirm against
# the documentation of the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# Read the training JSON. Replace this path with the dataset path on your local machine.
# The encoding is pinned to UTF-8: the file contains Bengali text and open() would otherwise
# fall back to the platform's locale encoding, which is not UTF-8 everywhere.
with open('/kaggle/input/ee958-cap-test/train_data1.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [4]:
# Containers filled while walking the JSON data: source sentences, target sentences and
# entry ids, kept separately for the training and the validation split.
source_sentences_train, target_sentences_train, id_train = [], [], []
source_sentences_val, target_sentences_val, id_val = [], [], []

In [5]:
# Print the top-level keys of the loaded JSON, i.e. the available language pairs.
# Each key maps to that pair's data (unused here; only the key is printed).
for language_pair, language_data in data.items():
  print(f"Language Pair: {language_pair}")


Language Pair: English-Bengali
Language Pair: English-Hindi


In [6]:
# Load source and target sentences for training.
# Only the "English-Bengali" pair is used. Entries filed under "Validation" go to the
# *_val lists; every other data type is treated as training data.
for language_pair, language_data in data.items():
    if language_pair != "English-Bengali":
        continue
    print(f"Language Pair: {language_pair}")
    for data_type, data_entries in language_data.items():
        print(f"  Data Type: {data_type}")
        # Pick the destination lists once per data type instead of once per entry.
        if data_type == "Validation":
            sources, targets, ids = source_sentences_val, target_sentences_val, id_val
        else:
            sources, targets, ids = source_sentences_train, target_sentences_train, id_train
        for entry_id, entry_data in data_entries.items():
            # Read both fields before appending so a malformed entry leaves the lists aligned.
            source, target = entry_data["source"], entry_data["target"]
            sources.append(source)
            targets.append(target)
            ids.append(entry_id)

Language Pair: English-Bengali
  Data Type: Train


In [7]:
# Read the validation JSON; note that this rebinds `data`, replacing the training JSON.
# Replace this path with the dataset path on your local machine.
# The encoding is pinned to UTF-8 so the result does not depend on the platform's locale.
with open('/kaggle/input/ee958-cap-test/val_data1.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [8]:
# Load source sentences for validation.
# Only the "source" field is read from this file: no targets and no ids are collected here,
# so target_sentences_val and id_val are left untouched by this cell.
for language_pair, language_data in data.items():
    if language_pair != "English-Bengali":
        continue
    print(f"Language Pair: {language_pair}")
    for data_type, data_entries in language_data.items():
        print(f"  Data Type: {data_type}")
        for entry_id, entry_data in data_entries.items():
            source = entry_data["source"]
            if data_type == "Validation":
                source_sentences_val.append(source)

Language Pair: English-Bengali
  Data Type: Validation


In [9]:
# Sanity check: number of sentences per list, printed one per line in the order
# train sources, train targets, validation sources, validation targets.
for sentence_list in (source_sentences_train, target_sentences_train,
                      source_sentences_val, target_sentences_val):
    print(len(sentence_list))

68849
68849
9836
0


In [10]:
x={'English':source_sentences_train,'Bengali':target_sentences_train}

In [11]:
df=pd.DataFrame(x)

In [12]:
df

Unnamed: 0,English,Bengali
0,Do not forget to visit the point where the Nar...,এই জায়গাগুলো দেখতে ভুলো না যেখানে নর্মদা নদী ম...
1,It is evident that the biggest cause of povert...,এই কথা স্পষ্ট যে দরিদ্রতার বড় কারণ হল অশিক্ষা ।
2,The film was released theatrically on 12 April...,চলচ্চিত্রটি ২০১৩ সালের ১২ই এপ্রিল প্রেক্ষাগৃহে...
3,is wyatt's birthday party at ten p. m.,অনিমেষ এর জন্মদিনের পার্টি রাত দশটায়
4,"Apart from being used as an eatable, barley is...",খাদ্যদ্রব্য ছাড়াও যব আরো বিভিন্ন ক্ষেত্রে যেমন...
...,...,...
68844,But it is evident that there is change in both...,তাও এটা প্রত্যক্ষভাবে দেখা যায় যে প্রিন্ট মিড...
68845,Include a detailed listing of all of your prod...,ছবি সহ সম্পূর্ণ করা আপনার সমস্ত পণ্যের একটি বি...
68846,"Each subcategory counts as one page, pages in ...",প্রতিটি উপবিভাগ একটি পৃষ্ঠা হিসাবে গণনা করা হয...
68847,It is one of the country's oldest state-run pu...,এটি দেশের সবচেয়ে পুরনো রাষ্ট্র-চালিত সরকারী বা...


In [13]:
import nltk
from collections import Counter

In [14]:
# Strip punctuation and digits from a single sentence
def preprocess_and_remove_punctuation(sentence):
    """Return `sentence` with punctuation and digit characters removed.

    A character is dropped when it occurs in `string.punctuation` or when
    `str.isdigit()` is true for it. Every other character, whitespace included,
    is kept in its original order.
    """
    kept_chars = []
    for char in sentence:
        if char in string.punctuation or char.isdigit():
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

In [15]:
# Lowercasing, cleaning and tokenization of a collection of sentences
def preprocess(sentences):
    """Lowercase each sentence, strip punctuation/digits and split it into tokens.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        list: One token list (as produced by `nltk.word_tokenize`) per input sentence,
        in input order.
    """
    tokenized_sentences = []
    for sentence in sentences:
        cleaned = preprocess_and_remove_punctuation(sentence.lower())
        tokenized_sentences.append(nltk.word_tokenize(cleaned))
    return tokenized_sentences

In [16]:
# Optional step: delete ASCII letters (a-z, A-Z) that appear inside the target sentences.
target_sentences_train = list(map(lambda sentence: re.sub(r'[a-zA-Z]', '', sentence), target_sentences_train))

In [17]:
# Tokenize every split with preprocess().
# NOTE: the "hindi_*" names are misleading in this notebook — the target sentences were
# taken from the "English-Bengali" pair, so these lists hold Bengali tokens.
# target_sentences_val is never filled by the loading cells, so hindi_test ends up empty.
english_tokens = preprocess(source_sentences_train)
english_test=preprocess(source_sentences_val)
hindi_tokens = preprocess(target_sentences_train)
hindi_test=preprocess(target_sentences_val)

In [18]:
# Short aliases used by the rest of the notebook:
#   en_* -> source side (English), de_* -> target side (Bengali here, despite the "de" prefix).
# These are plain name bindings, not copies: each alias refers to the same list object.
en_train=english_tokens
en_test=english_test
de_train=hindi_tokens
de_test=hindi_test

In [19]:
def _extend_vocab(index2word, datasets):
    """Append every not-yet-listed token from `datasets` to `index2word`.

    Tokens are appended in order of first occurrence, so the resulting indices are the
    same as with a plain `token not in index2word` scan. A companion set is used for the
    membership test, because scanning the list for every token makes the whole build
    quadratic in the vocabulary size.

    Args:
        index2word (list): Initial vocabulary (the special tokens); extended in place.
        datasets (iterable): Iterable of datasets, each an iterable of token lists.

    Returns:
        list: The same `index2word` list, now containing all tokens.
    """
    seen = set(index2word)
    for ds in datasets:
        for sent in ds:
            for token in sent:
                if token not in seen:
                    seen.add(token)
                    index2word.append(token)
    return index2word


# Index 0/1/2 are reserved for the special tokens on both sides.
en_index2word = _extend_vocab(["<PAD>", "<SOS>", "<EOS>"], [en_train, en_test])
de_index2word = _extend_vocab(["<PAD>", "<SOS>", "<EOS>"], [de_train, de_test])

In [20]:
en_index2word

['<PAD>',
 '<SOS>',
 '<EOS>',
 'do',
 'not',
 'forget',
 'to',
 'visit',
 'the',
 'point',
 'where',
 'narmada',
 'flowing',
 'through',
 'marble',
 'rocks',
 'interchanges',
 'its',
 'calmness',
 'and',
 'serenity',
 'into',
 'insouciance',
 'it',
 'is',
 'evident',
 'that',
 'biggest',
 'cause',
 'of',
 'poverty',
 'illiteracy',
 'film',
 'was',
 'released',
 'theatrically',
 'on',
 'april',
 'wyatts',
 'birthday',
 'party',
 'at',
 'ten',
 'p',
 'm',
 'apart',
 'from',
 'being',
 'used',
 'as',
 'an',
 'eatable',
 'barley',
 'also',
 'in',
 'many',
 'other',
 'fields',
 'like',
 'industries',
 'agriculture',
 'emperor',
 'akbar',
 'got',
 'state',
 'museum',
 'constructed',
 'during',
 'his',
 'trip',
 'ajmer',
 'initiate',
 'music',
 'electronic',
 'playlist',
 'srinagar',
 'a',
 'distance',
 'kms',
 'jammu',
 'devotee',
 'chosen',
 'karaga',
 'or',
 'clay',
 'pot',
 'placed',
 'head',
 'city',
 'lined',
 'up',
 'with',
 'shops',
 'have',
 'everything',
 'you',
 'need',
 'stock',
 

In [21]:
# Save vocabularies for English-Bengali (en_* = English source side, de_* = Bengali target side).
# The index -> token lists are written as JSON arrays, so list position == token index.
with open('/kaggle/working/bn_en_index2word.json', 'w') as f:
    json.dump(en_index2word, f)
with open('/kaggle/working/bn_de_index2word.json', 'w') as f:
    json.dump(de_index2word, f)
print("English-Bengali vocabularies saved to bn_en_index2word.json and bn_de_index2word.json")


English-Hindi vocabularies saved to bn_en_index2word.json and bn_de_index2word.json


In [None]:
# torch.cuda.is_available() checks if a CUDA-enabled GPU is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Reverse lookup tables (token -> index): every token in an index -> token list is paired
# with its list position.
en_word2index = dict(zip(en_index2word, range(len(en_index2word))))
de_word2index = dict(zip(de_index2word, range(len(de_index2word))))

In [None]:
len(en_word2index)

In [None]:
# Average sentence length in tokens: total token count divided by the number of sentences.
en_lengths = sum(map(len, en_train)) / len(en_train)
de_lengths = sum(map(len, de_train)) / len(de_train)

In [None]:
seq_length = 20

In [None]:
def encode_and_pad(vocab, sent, max_length):
    """
    Map a tokenised sentence to vocabulary indices, wrapped in <SOS>/<EOS> and
    brought to a fixed length.

    Sentences shorter than `max_length - 2` tokens are right-padded with <PAD>;
    longer ones are cut to their first `max_length - 2` tokens.

    Args:
        vocab (dict): Token -> index mapping; must contain "<SOS>", "<EOS>" and "<PAD>"
            as well as every token of `sent` (a missing token raises KeyError).
        sent (list): Tokens of the sentence.
        max_length (int): Length of the returned list, special tokens included.

    Returns:
        list: Indices laid out as <SOS>, tokens, <EOS>, then <PAD> as needed.
    """
    sos_id = vocab["<SOS>"]
    eos_id = vocab["<EOS>"]
    pad_id = vocab["<PAD>"]

    # Room left for real tokens once <SOS> and <EOS> are accounted for.
    body_budget = max_length - 2

    # Every token is looked up, including those that truncation discards afterwards.
    token_ids = []
    for word in sent:
        token_ids.append(vocab[word])

    body = token_ids[:body_budget]
    # A non-positive count yields an empty list, i.e. no padding for full-length sentences.
    padding = [pad_id] * (body_budget - len(body))
    return [sos_id] + body + [eos_id] + padding

In [None]:
# Turn every tokenised sentence into a fixed-length (seq_length) list of vocabulary indices.
# Source-side splits use the English vocabulary, target-side splits the target vocabulary.
en_train_encoded, en_test_encoded = (
    [encode_and_pad(en_word2index, tokens, seq_length) for tokens in split]
    for split in (en_train, en_test)
)
de_train_encoded, de_test_encoded = (
    [encode_and_pad(de_word2index, tokens, seq_length) for tokens in split]
    for split in (de_train, de_test)
)

In [None]:
en_train_encoded[1]

In [None]:
batch_size = 256

# Stack the fixed-length index lists into arrays (one row per sentence).
train_x, train_y = np.array(en_train_encoded), np.array(de_train_encoded)
test_x, test_y = np.array(en_test_encoded), np.array(de_test_encoded)

# Training samples are (source, target) pairs; the test dataset carries sources only.
train_ds = TensorDataset(*(torch.from_numpy(array) for array in (train_x, train_y)))
test_ds = TensorDataset(torch.from_numpy(test_x))

# Shuffled mini-batches for training, with pinned memory and two worker processes.
# No DataLoader is built for the test set: evaluate() indexes test_ds directly.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)

In [None]:
train_x[1]

In [None]:
train_ds[1]

In [None]:
import math

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Attention layers are order-agnostic, so a positional encoder is needed for Transformers

class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding for transformer models.
    Written with help of:
    https://www.geeksforgeeks.org/positional-encoding-in-transformers/
    https://github.com/hyunwoongko/transformer

    Adds a fixed, position-dependent vector to every token embedding. Attention treats
    its input as an unordered set, so without this term the model cannot tell token
    positions apart.
    """
    def __init__(self, d_model, max_len=5000):
        """
        Precompute the encoding table for positions 0 .. max_len - 1.

        Args:
            d_model (int): The dimensionality of the input embeddings (even or odd).
            max_len (int): The maximum sequence length the module can handle.
        """
        super(PositionalEncoding, self).__init__()
        # Table of shape (max_len, d_model) holding one encoding vector per position.
        pe = torch.zeros(max_len, d_model)
        # Column vector of positions 0 .. max_len - 1.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One frequency per even column index: 10000^(-2i / d_model).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        # Even columns get the sine, odd columns the cosine of the same angle.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns, so the
        # last frequency is dropped for the cosine part instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Leading dimension of size 1 so the table broadcasts over the batch dimension
        # of a (batch_size, sequence_length, d_model) input in forward().
        pe = pe.unsqueeze(0)
        # Buffers are tensors that are not updated during training but are still
        # saved in the model's state dictionary and moved along with .to(device).
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Applies positional encoding to the input tensor.

        Args:
            x (torch.Tensor): The input tensor of shape (batch_size, sequence_length, d_model),
                with sequence_length <= max_len.

        Returns:
            torch.Tensor: The input tensor with positional encoding added, of the same shape as x.
        """
        # The table is sliced to the sequence length of the input and broadcast over the batch.
        return x + self.pe[:, :x.size(1), :]


In [None]:
# Transformer-style encoder: embedding + positional encoding + self-attention + feed-forward
class EnhancedEncoder(nn.Module):
    """
    Single-layer transformer-style encoder.

    Token ids are embedded, positionally encoded, passed through one multi-head
    self-attention sub-layer and one position-wise feed-forward sub-layer. Each sub-layer
    is followed by dropout, a residual connection and layer normalisation.
    """
    def __init__(self, input_size, hidden_size, num_heads=4, dropout=0.1):
        """
        Build the encoder layers.

        Args:
            input_size (int): Size of the source vocabulary.
            hidden_size (int): Width of the embeddings and of every sub-layer output.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability used in attention, the feed-forward
                network and on both residual branches.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads

        # Token embeddings; index 0 is registered as the padding index.
        self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=0)
        # Fixed sinusoidal position information added to the embeddings.
        self.pos_encoding = PositionalEncoding(hidden_size)
        # Self-attention over the source sequence (sequence-first tensor layout).
        self.attention = nn.MultiheadAttention(hidden_size, num_heads, dropout=dropout)
        # norm1 follows the attention sub-layer, norm2 the feed-forward sub-layer.
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)
        # Position-wise feed-forward network with a 4x wider inner layer.
        self.ffn = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 4),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size * 4, hidden_size)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden=None):
        """
        Encode a batch of source sequences.

        Args:
            input (torch.Tensor): Token ids of shape (batch_size, seq_length).
            hidden (torch.Tensor, optional): Ignored; present for interface compatibility.

        Returns:
            tuple: (encoder output of shape (batch_size, seq_length, hidden_size), None).
        """
        # (batch, seq, hidden): embeddings plus positional information.
        token_states = self.pos_encoding(self.embedding(input))
        # The attention module was built without batch_first, so switch to (seq, batch, hidden).
        seq_first = token_states.permute(1, 0, 2)

        # Self-attention sub-layer with residual connection and layer norm.
        # NOTE(review): no key_padding_mask is passed, so <PAD> positions take part in attention.
        attended, _ = self.attention(seq_first, seq_first, seq_first)
        after_attention = self.norm1(seq_first + self.dropout(attended))

        # Feed-forward sub-layer with residual connection and layer norm.
        after_ffn = self.norm2(after_attention + self.dropout(self.ffn(after_attention)))

        # Back to (batch, seq, hidden) for the caller.
        return after_ffn.permute(1, 0, 2), None

    def initHidden(self):
        """
        Placeholder kept for interface compatibility; this encoder has no hidden state.

        Returns:
            None
        """
        return None

In [None]:
class EnhancedDecoder(nn.Module):
    """
    Single-layer transformer-style decoder that predicts the next target token.

    The target prefix is embedded and positionally encoded, then passed through
    self-attention, attention over the encoder output and a feed-forward network. Each
    sub-layer is followed by dropout, a residual connection and layer normalisation.
    Only the state of the last prefix position is projected onto the vocabulary.
    """
    def __init__(self, hidden_size, output_size, num_heads=4, dropout=0.1):
        """
        Build the decoder layers.

        Args:
            hidden_size (int): Width of the embeddings and of every sub-layer output.
            output_size (int): Size of the target vocabulary.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability used in both attention modules, the
                feed-forward network and on the residual branches.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads

        # Target token embeddings; index 0 is registered as the padding index.
        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=0)
        # Fixed sinusoidal position information added to the embeddings.
        self.pos_encoding = PositionalEncoding(hidden_size)
        # Attention over the target prefix itself.
        self.self_attention = nn.MultiheadAttention(hidden_size, num_heads, dropout=dropout)
        # Attention from the target prefix (queries) to the encoder output (keys/values).
        self.enc_dec_attention = nn.MultiheadAttention(hidden_size, num_heads, dropout=dropout)
        # One layer norm per sub-layer: self-attention, encoder-decoder attention, feed-forward.
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)
        self.norm3 = nn.LayerNorm(hidden_size)
        # Position-wise feed-forward network with a 4x wider inner layer.
        self.ffn = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 4),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size * 4, hidden_size)
        )
        # Projection onto the target vocabulary, followed by log-softmax.
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, encoder_output, mask=None):
        """
        Predict the token that follows the given target prefix.

        Args:
            input (torch.Tensor): Target prefix token ids of shape (batch_size, seq_len).
            encoder_output (torch.Tensor): Encoder output of shape
                (batch_size, src_seq_len, hidden_size).
            mask (torch.Tensor, optional): Attention mask handed to the self-attention
                layer as `attn_mask` (e.g. a look-ahead mask).

        Returns:
            tuple: (log-probabilities over the target vocabulary for the last prefix
            position, shape (batch_size, output_size), and None for compatibility).
        """
        # (batch, seq, hidden): embeddings plus positional information.
        prefix_states = self.pos_encoding(self.embedding(input))
        # Both attention modules expect sequence-first tensors.
        queries = prefix_states.permute(1, 0, 2)   # [seq_len, batch_size, hidden_size]
        memory = encoder_output.permute(1, 0, 2)   # [src_seq_len, batch_size, hidden_size]

        # Sub-layer 1: masked self-attention over the prefix.
        self_attended, _ = self.self_attention(queries, queries, queries, attn_mask=mask)
        after_self = self.norm1(queries + self.dropout(self_attended))

        # Sub-layer 2: attend to the encoder output.
        cross_attended, _ = self.enc_dec_attention(after_self, memory, memory)
        after_cross = self.norm2(after_self + self.dropout(cross_attended))

        # Sub-layer 3: position-wise feed-forward network.
        after_ffn = self.norm3(after_cross + self.dropout(self.ffn(after_cross)))

        # Back to (batch, seq, hidden); keep only the last position, which predicts the next token.
        last_state = after_ffn.permute(1, 0, 2)[:, -1, :]
        log_probs = self.softmax(self.out(last_state))  # [batch_size, output_size]
        return log_probs, None

    # No .initHidden() is needed for the decoder: it carries no recurrent state and is
    # conditioned on the encoder output at every call.

In [None]:
# Model, loss and optimizer setup
hidden_size = 128  # Increased hidden size for better representation
encoder = EnhancedEncoder(len(en_index2word), hidden_size).to(device)
decoder = EnhancedDecoder(hidden_size, len(de_index2word)).to(device)

# Target positions holding <PAD> carry no information. Excluding them keeps the averaged
# loss from being dominated by padding, which the model would otherwise be trained to emit.
criterion = nn.CrossEntropyLoss(ignore_index=de_word2index["<PAD>"])
# Encoder and decoder are separate modules, each with its own Adam optimizer.
enc_optimizer = torch.optim.Adam(encoder.parameters(), lr=3e-3)
dec_optimizer = torch.optim.Adam(decoder.parameters(), lr=3e-3)

# One average training loss per epoch is appended here by the training loop.
losses = []

epochs = 50  # Increased epochs since transformer-style models often need more training

# SOS is fed to the decoder and EOS is compared with decoder predictions, so both are
# read from the target-side vocabulary.
SOS = de_word2index["<SOS>"]
EOS = de_word2index["<EOS>"]

In [None]:
def generate_square_subsequent_mask(sz):
    """Build an (sz, sz) additive look-ahead mask.

    Entry [i, j] is 0.0 when j <= i (position i may attend to position j) and
    float('-inf') when j > i (later positions are blocked).
    """
    # Start from a matrix that blocks everything ...
    fully_blocked = torch.ones(sz, sz) * float('-inf')
    # ... and keep the -inf entries strictly above the diagonal only; triu zeroes the rest.
    return torch.triu(fully_blocked, diagonal=1)

# Training

In [None]:
def train_epoch(encoder, decoder, train_dl, criterion, enc_optimizer, dec_optimizer):
    """
    Run one teacher-forced pass over the training data and update both models.

    For every batch the source is encoded once. The decoder is then called for each
    target position t = 1 .. seq_length - 1 on the gold prefix target[:, :t], and its
    prediction is scored against target[:, t].

    Args:
        encoder (nn.Module): The encoder model.
        decoder (nn.Module): The decoder model.
        train_dl (DataLoader): Yields batches whose first element is the source tensor
            and whose second element is the target tensor.
        criterion (nn.Module): Loss function applied to (scores, target indices).
        enc_optimizer (torch.optim.Optimizer): Optimizer for the encoder.
        dec_optimizer (torch.optim.Optimizer): Optimizer for the decoder.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.
    """
    encoder.train()
    decoder.train()

    vocab_size = len(de_index2word)
    # The look-ahead mask depends on seq_length only: build it once, slice it per step.
    causal_mask = generate_square_subsequent_mask(seq_length).to(device)
    running_loss = 0

    for step, batch in enumerate(train_dl):
        sources = batch[0].to(device)  # [batch_size, seq_length]
        targets = batch[1].to(device)  # [batch_size, seq_length]

        enc_optimizer.zero_grad()
        dec_optimizer.zero_grad()

        # Gradients are switched on explicitly, whatever mode the caller is in.
        with torch.set_grad_enabled(True):
            memory, _ = encoder(sources)

            n_rows = sources.size(0)
            # Teacher-forcing inputs: <SOS> followed by the gold tokens 1 .. seq_length - 2.
            sos_column = torch.full((n_rows, 1), SOS, dtype=torch.long).to(device)
            teacher_inputs = torch.cat([sos_column, targets[:, 1:seq_length - 1]], dim=1)

            # Slot t receives the prediction for target position t; slot 0 stays unused.
            step_scores = torch.zeros(n_rows, seq_length, vocab_size).to(device)
            for t in range(1, seq_length):
                prediction, _ = decoder(teacher_inputs[:, :t], memory, causal_mask[:t, :t])
                step_scores[:, t] = prediction  # [batch_size, vocab_size]

            # Flatten to (batch * (seq_length - 1), vocab) vs. (batch * (seq_length - 1),).
            loss = criterion(
                step_scores[:, 1:].reshape(-1, vocab_size),
                targets[:, 1:].reshape(-1),
            )

            loss.backward()
            # Clip each model's gradient norm to 1 to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(encoder.parameters(), max_norm=1)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=1)
            enc_optimizer.step()
            dec_optimizer.step()

            running_loss += loss.item()

            # Progress report: running mean of the loss, every 10th batch.
            if step % 10 == 0:
                mean_so_far = running_loss / (step + 1)
                print(f"Batch {step}, Loss: {mean_so_far:.4f}")

    # Average loss for this epoch
    return running_loss / len(train_dl)

In [None]:
# Full training loop: one train_epoch() call per epoch, recording its average loss.
# NOTE: this cell is not idempotent — re-running it keeps training the same encoder/decoder
# objects and keeps appending to `losses`.
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    avg_loss = train_epoch(encoder, decoder, train_dl, criterion, enc_optimizer, dec_optimizer)
    losses.append(avg_loss)
    print(f"Average Loss: {avg_loss:.4f}")

In [None]:
# Plot the average training loss recorded for each epoch (x axis: 0-based epoch index)
plt.plot(range(len(losses)), losses)
plt.title('Training Loss Over Time')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
# Save the trained English-Bengali weights (state dicts only, not the module objects)
for module, checkpoint_path in (
    (encoder, '/kaggle/working/encoder_bn.pth'),
    (decoder, '/kaggle/working/decoder_bn.pth'),
):
    torch.save(module.state_dict(), checkpoint_path)
print("English-Bengali Encoder and Decoder saved")

In [None]:
# Evaluation: greedy decoding, one sentence at a time
def evaluate(encoder, decoder, test_ds):
    """
    Translate every sample of `test_ds` with greedy decoding.

    Each source sentence is encoded once. Starting from <SOS>, the decoder is called
    repeatedly on the tokens generated so far and the highest-scoring token is appended,
    until <EOS> is produced or `seq_length` tokens have been generated.

    Args:
        encoder (nn.Module): The encoder model.
        decoder (nn.Module): The decoder model.
        test_ds (Dataset): Dataset whose samples hold the source tensor at index 0.

    Returns:
        list: One space-joined output sentence per sample, special tokens removed.
    """
    encoder.eval()
    decoder.eval()

    special_tokens = ('<EOS>', '<PAD>', '<SOS>')
    translations = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        # Largest look-ahead mask needed; each decoding step uses its top-left corner.
        full_mask = generate_square_subsequent_mask(seq_length).to(device)

        for sample_idx in tqdm(range(len(test_ds))):
            # Add a batch dimension of 1 and move the source to the device.
            source = test_ds[sample_idx][0].unsqueeze(0).to(device)
            memory, _ = encoder(source)

            generated = torch.tensor([[SOS]], device=device)  # [1, 1]
            words = []

            for step in range(seq_length):
                prefix_len = step + 1
                log_probs, _ = decoder(generated, memory, full_mask[:prefix_len, :prefix_len])
                # Greedy choice: index of the most likely next token.
                next_id = log_probs.argmax(-1).item()
                words.append(de_index2word[next_id])

                # Stop as soon as the end-of-sentence token is produced.
                if next_id == EOS:
                    break

                # Extend the prefix by the predicted token, keeping the [1, length] shape.
                generated = torch.cat(
                    [generated, torch.tensor([[next_id]], device=device)],
                    dim=1
                )

            # Drop special tokens and join the remaining words into a sentence.
            translations.append(" ".join(word for word in words if word not in special_tokens))

    return translations

In [None]:

# Rebuild the architectures, then restore the trained English-Bengali weights.
encoder = EnhancedEncoder(len(en_index2word), hidden_size).to(device)
decoder = EnhancedDecoder(hidden_size, len(de_index2word)).to(device)

# weights_only=True restricts unpickling to tensors and plain containers.
# map_location=device lets a checkpoint written on a GPU machine be restored on a
# CPU-only machine (and the other way round) instead of failing while deserializing.
encoder.load_state_dict(torch.load('/kaggle/working/encoder_bn.pth', map_location=device, weights_only=True))
decoder.load_state_dict(torch.load('/kaggle/working/decoder_bn.pth', map_location=device, weights_only=True))
print("English-Bengali Encoder and Decoder loaded")

In [None]:
# Run evaluation.
# `data` must still hold the validation JSON here: its entry keys become the output ids,
# in the same order in which the validation sources were read.
val_ids = list(data["English-Bengali"]["Validation"])
val_outs = evaluate(encoder, decoder, test_ds)

In [None]:
# Save results: one row per validation entry, its id next to the generated translation
df0 = pd.DataFrame({"ID": val_ids, "Translation": val_outs})
df0.to_csv('/kaggle/working/answersB.csv', index=False)

In [None]:
x=pd.read_csv("/kaggle/working/answersB.csv")

In [None]:
x