In [167]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import json
import numpy as np
import re
import string
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm


In [168]:

# Fetch the NLTK 'punkt_tab' tokenizer resource so that nltk.word_tokenize,
# called by preprocess() further down, has its tokenizer data available.
# NOTE(review): 'punkt_tab' is presumably the tabular-format successor of the older 'punkt' resource in recent NLTK releases — confirm against the NLTK docs.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [170]:
# Accumulators filled while walking the JSON test file.
source_sentences_val = []  # source-language sentences of the "Test" split

id_val = []  # NOTE(review): never filled or read in the visible cells; IDs are collected later into val_ids

In [173]:
# Read the test-set JSON. The encoding is given explicitly so the text is decoded
# the same way regardless of the platform's default locale.
with open('/kaggle/input/ee958-cap-val/test_data1_final.json', 'r', encoding='utf-8') as file: # Replace this path with the dataset path in your local machine
    data = json.load(file)

In [174]:
# Collect the source sentences of the English-Hindi "Test" split.
# The list is rebuilt from scratch so that re-running this cell does not append
# the same sentences a second time.
source_sentences_val = []
for language_pair, language_data in data.items():
    if language_pair != "English-Hindi":
        continue
    print(f"Language Pair: {language_pair}")
    for data_type, data_entries in language_data.items():
        print(f"  Data Type: {data_type}")
        if data_type != "Test":
            continue
        # Only "Test" entries are read, so other splits need not carry a "source" field.
        source_sentences_val.extend(
            entry_data["source"] for entry_data in data_entries.values()
        )

Language Pair: English-Hindi
  Data Type: Test


In [175]:
# Sanity check: number of source sentences collected from the "Test" split.
print(len(source_sentences_val))

23085


In [179]:
import nltk
from collections import Counter

In [180]:
# Strip punctuation characters and digits from a sentence.
def preprocess_and_remove_punctuation(sentence):
    """Return `sentence` without ASCII punctuation (string.punctuation) and without digit characters.

    Characters are dropped, not replaced, so neighbouring characters are joined
    (whitespace itself is kept).
    """
    kept_chars = []
    for ch in sentence:
        if ch in string.punctuation or ch.isdigit():
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

In [181]:
# Lowercase, clean and tokenize a collection of sentences.
def preprocess(sentences):
    """Return one token list per sentence.

    Each sentence is lowercased, passed through preprocess_and_remove_punctuation,
    and then split with nltk.word_tokenize.
    """
    tokenized_sentences = []
    for sentence in sentences:
        cleaned = preprocess_and_remove_punctuation(sentence.lower())
        tokenized_sentences.append(nltk.word_tokenize(cleaned))
    return tokenized_sentences

In [183]:
# Tokenize the test source sentences (lowercased, punctuation and digits removed).
english_test=preprocess(source_sentences_val)

In [184]:

# Alias for the tokenized test sentences; this is the name the encoding step below reads.
en_test=english_test


In [187]:
# Use the GPU when torch.cuda.is_available() reports one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [191]:
# Load the index->word vocabularies for the English-Hindi pair.
# The "en_" names hold the encoder-side (source) vocabulary and the "de_" names the
# decoder-side (target) vocabulary.
# The files are opened as UTF-8 explicitly so decoding does not depend on the
# platform's default locale.
with open('/kaggle/input/ee958-cap-val/hi_en_index2word.json', 'r', encoding='utf-8') as f:
    en_index2word = json.load(f)
with open('/kaggle/input/ee958-cap-val/hi_de_index2word.json', 'r', encoding='utf-8') as f:
    de_index2word = json.load(f)

# Invert to word->index mappings.
# NOTE(review): enumerate() assumes each JSON file holds a list ordered by index — confirm.
en_word2index = {token: idx for idx, token in enumerate(en_index2word)}
de_word2index = {token: idx for idx, token in enumerate(de_index2word)}

print("Vocabularies loaded successfully")

Vocabularies loaded successfully


In [192]:
# Length every source sentence is padded/truncated to by encode_and_pad (including <SOS>/<EOS>);
# evaluate() also uses it as the maximum number of decoding steps.
seq_length = 20

In [193]:
def encode_and_pad(vocab, sent, max_length):
    """
    Turn a tokenized sentence into a fixed-layout list of vocabulary indices.

    The result is `<SOS>` + token indices + `<EOS>`, followed by `<PAD>` indices when
    the sentence is shorter than `max_length - 2`. Longer sentences are cut to
    `max_length - 2` tokens. Words missing from `vocab` are encoded with the `<PAD>` index.

    Args:
        vocab (dict): Word -> index mapping; must contain "<SOS>", "<EOS>" and "<PAD>".
        sent (list): The sentence as a list of word strings.
        max_length (int): Target length of the returned list, special tokens included.

    Returns:
        list: The encoded, padded or truncated sentence as a list of indices.
    """
    sos_id = vocab["<SOS>"]
    eos_id = vocab["<EOS>"]
    pad_id = vocab["<PAD>"]

    # Two slots are reserved for <SOS> and <EOS>.
    body_len = max_length - 2
    token_ids = [vocab.get(word, pad_id) for word in sent]

    if len(sent) >= body_len:
        # Sentence fills (or overflows) the available slots: truncate, no padding.
        return [sos_id] + token_ids[:body_len] + [eos_id]

    # Shorter sentence: pad after <EOS> up to max_length.
    padding = [pad_id] * (body_len - len(sent))
    return [sos_id] + token_ids + [eos_id] + padding


In [194]:
# Encode the tokenized test sentences with the source vocabulary, each padded/truncated to seq_length.
en_test_encoded = [encode_and_pad(en_word2index, sent, seq_length) for sent in en_test]

In [196]:
batch_size = 256  # NOTE(review): not used in the visible cells; evaluate() processes one sentence at a time

# Stack the encoded sentences into one array (one row per sentence).
test_x = np.array(en_test_encoded)

# Wrap as a dataset; each item is a 1-tuple holding one encoded sentence tensor.
test_ds = TensorDataset(torch.from_numpy(test_x))


In [199]:
import math

In [201]:
# Attention layers are order-agnostic, so a positional encoder is used to inject token positions.

class PositionalEncoding(nn.Module):
    """
    Fixed sinusoidal positional encoding for transformer models.
    Written with help of:
    https://www.geeksforgeeks.org/positional-encoding-in-transformers/
    https://github.com/hyunwoongko/transformer

    A table of sine/cosine values indexed by position is precomputed once and added to
    the input embeddings, giving the attention layers access to token order.
    """
    def __init__(self, d_model, max_len=5000):
        """
        Precompute the positional-encoding table.

        Args:
            d_model (int): The dimensionality of the input embeddings (even or odd).
            max_len (int): The maximum sequence length the table covers.
        """
        super(PositionalEncoding, self).__init__()
        # Table of shape (max_len, d_model) that will hold the encodings.
        pe = torch.zeros(max_len, d_model)
        # Column vector of positions 0 .. max_len-1.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One frequency per even embedding index: 10000^(-2i/d_model).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # Even columns get the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns, so the
        # frequency vector is trimmed to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Leading dimension of size 1 lets the table broadcast over the batch dimension
        # when it is added to a (batch_size, seq_len, d_model) input in forward().
        pe = pe.unsqueeze(0)
        # A buffer is not a trainable parameter but is still saved in the state dict
        # and moved together with the module by .to(device).
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Add positional encodings to the input.

        Args:
            x (torch.Tensor): Input of shape (batch_size, sequence_length, d_model),
                with sequence_length not exceeding max_len.

        Returns:
            torch.Tensor: `x` plus the encodings of its first sequence_length positions; same shape as `x`.
        """
        # Slice the table down to the sequence length of the input.
        return x + self.pe[:, :x.size(1), :]


In [202]:
# Attention-based encoder (a single Transformer-style layer; no recurrent layers are involved).
class EnhancedEncoder(nn.Module):
    """
    Single-layer Transformer-style encoder.

    Pipeline: token embedding -> positional encoding -> multi-head self-attention
    -> residual + LayerNorm -> position-wise feed-forward -> residual + LayerNorm.
    """
    def __init__(self, input_size, hidden_size, num_heads=4, dropout=0.1):
        """
        Build the encoder layers.

        Args:
            input_size (int): Number of entries in the input vocabulary.
            hidden_size (int): Width of the embeddings and of every hidden representation.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability used in attention, the feed-forward net and residual paths.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads

        # Token embeddings; index 0 is the padding index of the embedding table.
        # NOTE(review): assumes "<PAD>" maps to index 0 in the vocabulary — confirm.
        self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=0)
        # Sinusoidal position information added on top of the embeddings.
        self.pos_encoding = PositionalEncoding(hidden_size)
        # Self-attention over the source sequence (expects sequence-first tensors).
        self.attention = nn.MultiheadAttention(hidden_size, num_heads, dropout=dropout)
        # LayerNorm applied after the attention residual.
        self.norm1 = nn.LayerNorm(hidden_size)
        # LayerNorm applied after the feed-forward residual.
        self.norm2 = nn.LayerNorm(hidden_size)
        # Position-wise feed-forward network with a 4x inner expansion.
        self.ffn = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 4),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size * 4, hidden_size)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden=None):
        """
        Encode a batch of token-index sequences.

        Args:
            input (torch.Tensor): Token indices of shape (batch_size, seq_length).
            hidden (torch.Tensor, optional): Ignored; accepted only so the call signature
                matches a recurrent encoder.

        Returns:
            tuple: (encoded, None) where `encoded` has shape (batch_size, seq_length, hidden_size).
        """
        # Embed, then add positions: [batch_size, seq_length, hidden_size].
        with_positions = self.pos_encoding(self.embedding(input))
        # nn.MultiheadAttention here works sequence-first: [seq_length, batch_size, hidden_size].
        seq_first = with_positions.transpose(0, 1)

        # Self-attention; no padding mask is passed, so all positions attend to each other.
        attended, _ = self.attention(seq_first, seq_first, seq_first)
        attended = self.norm1(seq_first + self.dropout(attended))

        # Feed-forward sub-layer with its own residual connection.
        transformed = self.ffn(attended)
        encoded = self.norm2(attended + self.dropout(transformed))

        # Back to batch-first: [batch_size, seq_length, hidden_size].
        return encoded.transpose(0, 1), None

    def initHidden(self):
        """
        Placeholder kept for interface compatibility; this encoder has no hidden state.

        Returns:
            None
        """
        return None

In [203]:
class EnhancedDecoder(nn.Module):
    """
    Single-layer Transformer-style decoder.

    Pipeline: token embedding -> positional encoding -> masked self-attention ->
    encoder-decoder attention -> feed-forward, each followed by residual + LayerNorm.
    Only the last position is projected to vocabulary log-probabilities.
    """
    def __init__(self, hidden_size, output_size, num_heads=4, dropout=0.1):
        """
        Build the decoder layers.

        Args:
            hidden_size (int): Width of the embeddings and of every hidden representation.
            output_size (int): Number of entries in the output vocabulary.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability used in attention, the feed-forward net and residual paths.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads

        # Target-token embeddings; index 0 is the padding index of the embedding table.
        # NOTE(review): assumes "<PAD>" maps to index 0 in the vocabulary — confirm.
        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=0)
        # Sinusoidal position information added on top of the embeddings.
        self.pos_encoding = PositionalEncoding(hidden_size)
        # Attention of the target prefix over itself.
        self.self_attention = nn.MultiheadAttention(hidden_size, num_heads, dropout=dropout)
        # Attention of the target prefix over the encoder output.
        self.enc_dec_attention = nn.MultiheadAttention(hidden_size, num_heads, dropout=dropout)
        # One LayerNorm per sub-layer: self-attention, encoder-decoder attention, feed-forward.
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)
        self.norm3 = nn.LayerNorm(hidden_size)
        # Position-wise feed-forward network with a 4x inner expansion.
        self.ffn = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 4),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size * 4, hidden_size)
        )
        # Projection from hidden representation to vocabulary scores.
        self.out = nn.Linear(hidden_size, output_size)
        # Converts the scores to log-probabilities over the vocabulary.
        self.softmax = nn.LogSoftmax(dim=-1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, encoder_output, mask=None):
        """
        Score the next token given the target prefix and the encoded source.

        Args:
            input (torch.Tensor): Target-prefix token indices of shape (batch_size, seq_len).
            encoder_output (torch.Tensor): Encoder output of shape (batch_size, src_seq_len, hidden_size).
            mask (torch.Tensor, optional): Passed to self-attention as `attn_mask`
                (e.g. a look-ahead mask of shape (seq_len, seq_len)).

        Returns:
            tuple: (log_probs, None) where `log_probs` has shape (batch_size, output_size)
            and belongs to the last position of `input`.
        """
        # Embed + positions, then go sequence-first: [seq_len, batch_size, hidden_size].
        tgt = self.pos_encoding(self.embedding(input)).transpose(0, 1)
        # Encoder output in the same layout: [src_seq_len, batch_size, hidden_size].
        memory = encoder_output.transpose(0, 1)

        # Sub-layer 1: (masked) self-attention with residual + norm.
        self_attended, _ = self.self_attention(tgt, tgt, tgt, attn_mask=mask)
        tgt = self.norm1(tgt + self.dropout(self_attended))

        # Sub-layer 2: attention over the encoder output with residual + norm.
        cross_attended, _ = self.enc_dec_attention(tgt, memory, memory)
        tgt = self.norm2(tgt + self.dropout(cross_attended))

        # Sub-layer 3: feed-forward with residual + norm.
        ffn_output = self.ffn(tgt)
        tgt = self.norm3(tgt + self.dropout(ffn_output))

        # Back to batch-first and keep only the final position: [batch_size, hidden_size].
        last_step = tgt.transpose(0, 1)[:, -1, :]

        # Vocabulary log-probabilities for the next token: [batch_size, output_size].
        return self.softmax(self.out(last_step)), None

    # No initHidden() counterpart is defined for the decoder: it keeps no recurrent state and
    # receives the encoder output as an argument on every call.

In [204]:
# Model configuration
# NOTE(review): hidden_size presumably has to match the value the saved checkpoints were trained with — confirm.
hidden_size = 128

# NOTE(review): not used by the inference cells visible in this notebook.
criterion = nn.CrossEntropyLoss()

# Special-token indices used while decoding. evaluate() feeds SOS into the decoder and
# compares the decoder's predictions against EOS, so both must be indices of the
# decoder-side (target) vocabulary, not of the source vocabulary.
SOS = de_word2index["<SOS>"]
EOS = de_word2index["<EOS>"]

In [205]:
def generate_square_subsequent_mask(sz):
    """Build an (sz, sz) additive look-ahead mask.

    Entries on and below the diagonal are 0.0 (attention allowed); entries above
    the diagonal are float('-inf') (attention to later positions blocked).
    """
    # Start from a matrix full of -inf and keep only the part strictly above the
    # diagonal; torch.triu sets every other entry to zero.
    blocked = torch.full((sz, sz), float('-inf'))
    return torch.triu(blocked, diagonal=1)

In [210]:
# Greedy decoding over the test set
def evaluate(encoder, decoder, test_ds):
    """
    Translate every item of the test dataset with greedy decoding.

    Each sentence is encoded once; the decoder is then called repeatedly on the growing
    target prefix, taking the highest-scoring token at each step, until the EOS index is
    predicted or `seq_length` steps have been made. Relies on the module-level names
    `device`, `seq_length`, `SOS`, `EOS` and `de_index2word`.

    Args:
        encoder (nn.Module): The encoder model.
        decoder (nn.Module): The decoder model.
        test_ds (Dataset): Dataset whose items are tuples with the encoded source
            sentence tensor in position 0.

    Returns:
        list: One predicted sentence per dataset item, as space-joined strings with the
        special tokens removed.
    """
    # Evaluation mode disables dropout.
    encoder.eval()
    decoder.eval()
    val_outs = []

    # The look-ahead mask depends only on the decoding step, not on the sentence, so
    # build the seq_length masks once and move them to the device up front instead of
    # regenerating and copying one for every step of every sentence.
    step_masks = [generate_square_subsequent_mask(t + 1).to(device) for t in range(seq_length)]

    # No gradients are needed for inference.
    with torch.no_grad():
        for i in tqdm(range(len(test_ds))):
            # One sentence at a time, with a batch dimension of 1.
            input_tensor = test_ds[i][0].unsqueeze(0).to(device)
            encoder_output, _ = encoder(input_tensor)
            # The target prefix starts with the SOS token only.
            decoder_input = torch.tensor([[SOS]], device=device)  # [1, 1]
            result = []

            for t in range(seq_length):
                # At step t the prefix has t + 1 tokens; the decoder returns the
                # log-probabilities for the token following the prefix.
                decoder_output, _ = decoder(decoder_input, encoder_output, step_masks[t])
                # Greedy choice of the next token.
                best = decoder_output.argmax(-1)  # [batch_size], here [1]
                pred_token = best.item()

                result.append(de_index2word[pred_token])

                # Stop as soon as the end-of-sentence token is predicted.
                if pred_token == EOS:
                    break

                # Extend the prefix with the predicted token (kept 2-D: [1, t + 2]).
                decoder_input = torch.cat(
                    [decoder_input, torch.tensor([[pred_token]], device=device)],
                    dim=1
                )

            # Drop special tokens before joining into a sentence.
            result = [token for token in result if token not in ['<EOS>', '<PAD>', '<SOS>']]
            val_outs.append(" ".join(result))

    return val_outs

In [211]:
# Build the models with vocabulary-sized embeddings and move them to the selected device.
encoder = EnhancedEncoder(len(en_index2word), hidden_size).to(device)
decoder = EnhancedDecoder(hidden_size, len(de_index2word)).to(device)

# Load the saved state dictionaries. weights_only=True restricts unpickling to tensor data;
# map_location=device lets the checkpoints load even when they were saved from a GPU and
# this notebook is running on the CPU fallback.
encoder.load_state_dict(torch.load('/kaggle/input/ee958-cap-val/encoder_hi.pth', map_location=device, weights_only=True))
decoder.load_state_dict(torch.load('/kaggle/input/ee958-cap-val/decoder_hi.pth', map_location=device, weights_only=True))
print("English-Hindi Encoder and Decoder loaded")

English-Hindi Encoder and Decoder loaded


In [212]:
# Collect the entry IDs of the "Test" split (in file order), then translate the test set.
val_ids = list(data["English-Hindi"]["Test"].keys())
val_outs = evaluate(encoder, decoder, test_ds)

100%|██████████| 23085/23085 [10:16<00:00, 37.43it/s]


In [213]:
# Write one row per test entry (ID and predicted translation) to the submission CSV.
df0 = pd.DataFrame({"ID": val_ids, "Translation": val_outs})
df0.to_csv('/kaggle/working/answersH.csv', index=False)

In [214]:
# Read the saved CSV back in to check what was written.
x=pd.read_csv("/kaggle/working/answersH.csv")

In [215]:
# Display the predictions that were read back.
x

Unnamed: 0,ID,Translation
0,540139,और फिर हमें विश्वास दिलाने की आवश्यकता है कि स...
1,540140,पहली जनवरी के लिए निर्धारित कार्यक्रम निर्धारि...
2,540141,सन् में वर्ग किलोमीटर क्षेत्र में फैला हुआ है ...
3,540142,स्थानीय संगीतकार सम्राटों के साथ संगीतकारों पर...
4,540143,बेशक इस कोर्स के बारे में कुछ और अधिक आसान हो ...
...,...,...
23080,563219,राम को एक ईमेल भेजकर पूछें और उसे कैसे मदद करत...
23081,563220,श्रीनगर के अंत में भारत सरकार के प्रोफेसर देवस...
23082,563221,प्रारंभिक रूप से ट्यूबों में पेलेट को असंयमिता...
23083,563222,दोनों हाथों की हड्‍डी के साथ दायें हाथ को सुचा...
