# Model

#### This class implements the Multi-Head Attention mechanism used in transformers.
#### It allows the model to focus on different parts of the input sequence simultaneously by applying multiple attention heads. Each head independently computes attention, and the results are combined and passed through a linear layer to produce the final output.

In [None]:
import torch
import torch.nn as nn
from typing import Optional, Tuple

class MultiHeadAttentionLayer(nn.Module):
    def __init__(
        self, hidden_size: int, n_heads: int, dropout: float, device: torch.device
    ) -> None:
        super().__init__()

        assert (
            hidden_size % n_heads == 0
        ), "Hidden size must be divisible by the number of heads."

        self.hidden_size = hidden_size
        self.n_heads = n_heads
        self.head_size = hidden_size // n_heads

        # Linear layers for query, key, and value projections
        self.fc_query = nn.Linear(hidden_size, hidden_size)
        self.fc_key = nn.Linear(hidden_size, hidden_size)
        self.fc_value = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, hidden_size)

        self.dp = nn.Dropout(dropout)

        self.coefficient = torch.sqrt(torch.FloatTensor([self.head_size])).to(device)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        b_size = query.shape[0]

        # Linear projections
        query_output = self.fc_query(query)
        key_output = self.fc_key(key)
        value_output = self.fc_value(value)

        # Reshape and permute for multi-head attention
        query_output = query_output.view(
            b_size, -1, self.n_heads, self.head_size
        ).permute(0, 2, 1, 3)
        key_output = key_output.view(b_size, -1, self.n_heads, self.head_size).permute(
            0, 2, 1, 3
        )
        value_output = value_output.view(
            b_size, -1, self.n_heads, self.head_size
        ).permute(0, 2, 1, 3)

        # Calculate attention scores
        energy = (
            torch.matmul(query_output, key_output.permute(0, 1, 3, 2))
            / self.coefficient
        )

        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)

        # Apply softmax to get attention weights
        attention = torch.softmax(energy, dim=-1)

        # Calculate the weighted sum of values
        output = torch.matmul(self.dp(attention), value_output)

        # Concatenate heads and pass through the final linear layer
        output = output.permute(0, 2, 1, 3).contiguous()
        output = output.view(b_size, -1, self.hidden_size)
        output = self.fc_out(output)

        return output, attention

#### The EncoderLayer class represents a single layer of the Transformer encoder, combining multi-head attention and a feed-forward neural network to process the input sequence.
#### The Encoder class stacks multiple EncoderLayer instances to build the full encoder, producing a context-rich representation that captures relationships between tokens in the sequence.

In [None]:
class EncoderLayer(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        n_heads: int,
        ff_size: int,
        dropout: float,
        device: torch.device,
    ) -> None:
        super().__init__()

        # Multi-head self-attention layer
        self.self_atten = MultiHeadAttentionLayer(hidden_size, n_heads, dropout, device)
        self.self_atten_norm = nn.LayerNorm(hidden_size)
        self.ff_layer = FeedForwardLayer(hidden_size, ff_size, dropout)
        self.dp = nn.Dropout(dropout)
        self.ff_layer_norm = nn.LayerNorm(hidden_size)

    def forward(self, input: torch.Tensor, input_mask: torch.Tensor) -> torch.Tensor:
        # Self-attention
        atten_result, _ = self.self_atten(input, input, input, input_mask)

        # Add & norm
        atten_norm = self.self_atten_norm(input + self.dp(atten_result))

        # Feed-forward
        ff_result = self.ff_layer(atten_norm)

        # Add & norm
        output = self.ff_layer_norm(atten_norm + self.dp(ff_result))

        return output


class Encoder(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        n_layers: int,
        n_heads: int,
        ff_size: int,
        dropout: float,
        device: torch.device,
        MAX_LENGTH: int = 100,
    ) -> None:
        super().__init__()

        self.device = device

        # Embedding layers for tokens and positions
        self.te = nn.Embedding(input_size, hidden_size)
        self.pe = nn.Embedding(MAX_LENGTH, hidden_size)

        # Stack of encoder layers
        encoding_layers = [
            EncoderLayer(hidden_size, n_heads, ff_size, dropout, device)
            for _ in range(n_layers)
        ]
        self.encode_sequence = nn.Sequential(*encoding_layers)

        self.dp = nn.Dropout(dropout)
        self.coefficient = torch.sqrt(torch.FloatTensor([hidden_size])).to(device)

    def forward(self, input: torch.Tensor, input_mask: torch.Tensor) -> torch.Tensor:
        b_size, input_size = input.shape

        # Create position tensor and add positional embeddings
        pos = torch.arange(0, input_size).unsqueeze(0).repeat(b_size, 1).to(self.device)
        input = self.dp((self.te(input) * self.coefficient) + self.pe(pos))

        # Pass through each encoder layer
        for layer in self.encode_sequence:
            input = layer(input, input_mask)

        return input

#### The DecoderLayer class represents a single layer of the Transformer decoder, combining multi-head attention with the encoder's output and a feed-forward neural network.
#### The Decoder class stacks multiple DecoderLayer instances to build the full decoder, generating the output sequence by attending to both the encoded input and previously generated tokens.

In [None]:
class DecoderLayer(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        n_heads: int,
        ff_size: int,
        dropout: float,
        device: torch.device,
    ) -> None:
        super().__init__()

        # Self-attention and encoder-decoder attention layers
        self.self_atten = MultiHeadAttentionLayer(hidden_size, n_heads, dropout, device)
        self.self_atten_norm = nn.LayerNorm(hidden_size)
        self.encoder_atten = MultiHeadAttentionLayer(
            hidden_size, n_heads, dropout, device
        )
        self.encoder_atten_norm = nn.LayerNorm(hidden_size)
        self.ff_layer = FeedForwardLayer(hidden_size, ff_size, dropout)
        self.ff_layer_norm = nn.LayerNorm(hidden_size)
        self.dp = nn.Dropout(dropout)

    def forward(
        self,
        target: torch.Tensor,
        encoded_input: torch.Tensor,
        target_mask: torch.Tensor,
        input_mask: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Self-attention
        atten_result, _ = self.self_atten(target, target, target, target_mask)
        atten_norm = self.self_atten_norm(target + self.dp(atten_result))

        # Encoder-decoder attention
        atten_encoded, attention = self.encoder_atten(
            atten_norm, encoded_input, encoded_input, input_mask
        )
        encoded_norm = self.encoder_atten_norm(atten_norm + self.dp(atten_encoded))

        # Feed-forward
        ff_result = self.ff_layer(encoded_norm)
        output = self.ff_layer_norm(encoded_norm + self.dp(ff_result))

        return output, attention


class Decoder(nn.Module):
    def __init__(
        self,
        output_size: int,
        hidden_size: int,
        n_layers: int,
        n_heads: int,
        ff_size: int,
        dropout: float,
        device: torch.device,
        MAX_LENGTH: int = 100,
    ) -> None:
        super().__init__()

        self.device = device

        # Embedding layers for tokens and positions
        self.te = nn.Embedding(output_size, hidden_size)
        self.pe = nn.Embedding(MAX_LENGTH, hidden_size)

        # Stack of decoder layers
        decoding_layers = [
            DecoderLayer(hidden_size, n_heads, ff_size, dropout, device)
            for _ in range(n_layers)
        ]
        self.decode_sequence = nn.Sequential(*decoding_layers)

        self.fc_out = nn.Linear(hidden_size, output_size)
        self.dp = nn.Dropout(dropout)
        self.coefficient = torch.sqrt(torch.FloatTensor([hidden_size])).to(device)

    def forward(
        self,
        target: torch.Tensor,
        encoded_input: torch.Tensor,
        target_mask: torch.Tensor,
        input_mask: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        b_size, target_size = target.shape

        # Create position tensor and add positional embeddings
        pos = (
            torch.arange(0, target_size).unsqueeze(0).repeat(b_size, 1).to(self.device)
        )
        target = self.dp((self.te(target) * self.coefficient) + self.pe(pos))

        # Pass through each decoder layer
        for layer in self.decode_sequence:
            target, attention = layer(target, encoded_input, target_mask, input_mask)

        # Final linear layer to generate output predictions
        output = self.fc_out(target)

        return output, attention


#### This class implements the feed-forward neural network used in each layer of the Transformer.
#### It consists of two linear transformations with a ReLU activation in between, applied independently to each position in the sequence.

In [None]:
class FeedForwardLayer(nn.Module):
    def __init__(self, hidden_size: int, ff_size: int, dropout: float) -> None:
        super().__init__()

        # Feed-forward neural network with dropout and ReLU activation
        self.ff_layer = nn.Sequential(
            nn.Linear(hidden_size, ff_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(ff_size, hidden_size),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return self.ff_layer(input)

#### This class defines the overall Transformer model, combining the encoder and decoder components.
#### It processes input sequences through the encoder to generate context-rich representations, and then decodes these representations to produce the output sequence.

In [2]:
class Transformer(nn.Module):
    def __init__(
        self,
        encoder: Encoder,
        decoder: Decoder,
        device: torch.device,
        padding_index: int = 0,
    ) -> None:
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.padding_index = padding_index
        self.device = device

    def make_input_mask(self, input: torch.Tensor) -> torch.Tensor:
        # Create input mask to ignore padding tokens
        input_mask = (input != self.padding_index).unsqueeze(1).unsqueeze(2)
        return input_mask

    def make_target_mask(self, target: torch.Tensor) -> torch.Tensor:
        # Create target mask to ignore padding tokens and ensure autoregressive property
        target_pad_mask = (target != self.padding_index).unsqueeze(1).unsqueeze(2)
        target_sub_mask = torch.tril(
            torch.ones((target.shape[1], target.shape[1]), device=self.device)
        ).bool()
        target_mask = target_pad_mask & target_sub_mask
        return target_mask

    def forward(
        self, input: torch.Tensor, target: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        input_mask = self.make_input_mask(input)
        target_mask = self.make_target_mask(target)

        # Encode input sequences
        encoded_input = self.encoder(input, input_mask)

        # Decode target sequences with encoded input
        output, attention = self.decoder(target, encoded_input, target_mask, input_mask)

        return output, attention


# Dictionary

#### This class manages the vocabulary and token mappings for the Transformer model.
#### It converts between words and their corresponding indices, facilitating encoding and decoding operations within the model.

In [3]:
from typing import Dict

# Special tokens
PAD_TOKEN = 0
SOS_TOKEN = 1
EOS_TOKEN = 2
UNK_TOKEN = 3


class Dictionary:
    def __init__(self, name: str) -> None:
        """
        Initialize the dictionary with the given language name. The dictionary keeps track of words and their corresponding indices.

        Args:
            name (str): The name of the language.
        """
        self.name = name
        self.word2index: Dict[str, int] = {
            "<pad>": PAD_TOKEN,
            "<sos>": SOS_TOKEN,
            "<eos>": EOS_TOKEN,
            "<unk>": UNK_TOKEN,
        }
        self.word2count: Dict[str, int] = {}
        self.index2word: Dict[int, str] = {
            PAD_TOKEN: "<pad>",
            SOS_TOKEN: "<sos>",
            EOS_TOKEN: "<eos>",
            UNK_TOKEN: "<unk>",
        }
        self.n_count: int = 4  # Count includes PAD, SOS, EOS, and UNK

    def add_sentence(self, sentence: str) -> None:
        """
        Add all words in a sentence to the dictionary.

        Args:
            sentence (str): The sentence whose words are to be added.
        """
        for word in sentence.split(" "):
            self.add_word(word)

    def add_word(self, word: str) -> None:
        """
        Add a word to the dictionary. If the word already exists, increment its count.

        Args:
            word (str): The word to be added.
        """
        if word not in self.word2index:
            self.word2index[word] = self.n_count
            self.word2count[word] = 1
            self.index2word[self.n_count] = word
            self.n_count += 1
        else:
            self.word2count[word] += 1


# Utilities

#### This cell contains utility functions and helper methods that support various operations within the Transformer model, such as converting UNICODE TO ASCII, normalizing string, loading files & batches and tokenization.

In [4]:
import os
import re
import unicodedata
import csv
from typing import List, Tuple
import torch

PAD_TOKEN = 0
SOS_TOKEN = 1
EOS_TOKEN = 2


def unicodeToAscii(s: str) -> str:
    return "".join(
        c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn"
    )


def normalizeString(s: str) -> str:
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


def load_files(
    lang1: str,
    lang2: str,
    data_dir: str,
    reverse: bool = True,
    MAX_FILE_SIZE: int = 100000,
    MAX_LENGTH: int = 60,
) -> Tuple[Dictionary, Dictionary, List[str], List[str]]:
    lang1_list = []
    lang2_list = []

    # Assume there's a single CSV file in the data_dir
    csv_file_path = None
    for root, _, files in os.walk(data_dir):
        for file_name in files:
            if file_name.endswith(".csv"):
                csv_file_path = os.path.join(root, file_name)
                break
        if csv_file_path:
            break

    if not csv_file_path:
        raise FileNotFoundError(f"CSV file not found in {data_dir}")

    # Read all lines first to calculate interval
    all_lang1_lines = []
    all_lang2_lines = []

    with open(csv_file_path, mode="r", encoding="utf-8") as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            all_lang1_lines.append(row[lang1.capitalize()].strip())
            all_lang2_lines.append(row[lang2.capitalize()].strip())

    # Ensure both files have the same number of lines
    assert len(all_lang1_lines) == len(
        all_lang2_lines
    ), "Mismatched number of lines in language files"

    # Determine the interval to sample lines
    interval = max(1, len(all_lang1_lines) // MAX_FILE_SIZE)

    # Select lines based on the interval
    lang1_list = [all_lang1_lines[i] for i in range(0, len(all_lang1_lines), interval)]
    lang2_list = [all_lang2_lines[i] for i in range(0, len(all_lang2_lines), interval)]

    # Limit the number of selected lines to MAX_FILE_SIZE
    lang1_list = lang1_list[:MAX_FILE_SIZE]
    lang2_list = lang2_list[:MAX_FILE_SIZE]

    # Debugging: Print the length of loaded lists
    print(f"Loaded {len(lang1_list)} sentences for {lang1}")
    print(f"Loaded {len(lang2_list)} sentences for {lang2}")

    # Preprocess strings
    lang1_normalized = list(map(normalizeString, lang1_list))
    lang2_normalized = list(map(normalizeString, lang2_list))

    lang1_sentences = []
    lang2_sentences = []

    for i in range(len(lang1_normalized)):
        tokens1 = lang1_normalized[i].split(" ")
        tokens2 = lang2_normalized[i].split(" ")
        if len(tokens1) <= MAX_LENGTH and len(tokens2) <= MAX_LENGTH:
            lang1_sentences.append(lang1_normalized[i])
            lang2_sentences.append(lang2_normalized[i])

    # Debugging: Print the number of sentences after filtering by length
    print(f"{len(lang1_sentences)} {lang1} sentences after length filtering")
    print(f"{len(lang2_sentences)} {lang2} sentences after length filtering")

    if reverse:
        input_dic = Dictionary(lang2)
        output_dic = Dictionary(lang1)
        return input_dic, output_dic, lang2_sentences, lang1_sentences
    else:
        input_dic = Dictionary(lang1)
        output_dic = Dictionary(lang2)
        return input_dic, output_dic, lang1_sentences, lang2_sentences



def tokenize(sentence: str, dictionary: Dictionary, MAX_LENGTH: int = 60) -> List[int]:
    split_sentence = [word for word in sentence.split(" ")]
    token = [SOS_TOKEN]
    token += [
        dictionary.word2index.get(word, dictionary.word2index["<unk>"])
        for word in sentence.split(" ")
    ]
    token.append(EOS_TOKEN)
    token += [PAD_TOKEN] * (MAX_LENGTH - len(split_sentence))
    return token


def load_batches(
    input_lang: List[List[int]],
    output_lang: List[List[int]],
    batch_size: int,
    device: torch.device,
) -> List[Tuple[torch.Tensor, torch.Tensor]]:
    data_loader = []
    for i in range(0, len(input_lang), batch_size):
        input_batch = input_lang[i : i + batch_size]
        target_batch = output_lang[i : i + batch_size]

        if len(input_batch) == 0 or len(target_batch) == 0:
            continue

        input_tensor = torch.LongTensor(input_batch).to(device)
        target_tensor = torch.LongTensor(target_batch).to(device)
        data_loader.append([input_tensor, target_tensor])
    return data_loader


# Train

#### This class handles the training process for the Transformer model, including the forward pass, loss computation, backpropagation, and optimization.
#### It manages training epochs, tracks performance metrics, and saves model checkpoints.

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time
import argparse
import pickle
import os
import logging
from typing import List, Tuple
from random import shuffle
from sklearn.model_selection import KFold
import sacrebleu

# Configure logging for better output control and formatting
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Define special tokens
PAD_TOKEN = 0  # Padding token
SOS_TOKEN = 1  # Start of sentence token
EOS_TOKEN = 2  # End of sentence token


class Trainer:
    def initialize_weights(self, model: nn.Module) -> None:
        """
        Initialize model weights using Xavier uniform initialization.
        """
        if hasattr(model, "weight") and model.weight.dim() > 1:
            nn.init.xavier_uniform_(model.weight.data)

    def save_dictionary(self, dictionary: dict, input: bool = True) -> None:
        """
        Save the language dictionary to disk.
        """
        # Create directory to save dictionaries if it doesn't exist
        directory = (
            f"saved_models/{self.input_lang_dic.name}2{self.output_lang_dic.name}"
        )
        if not os.path.exists(directory):
            os.makedirs(directory)

        # Determine the file path based on whether it's the input or output dictionary
        file_path = f"{directory}/{'input_dic.pkl' if input else 'output_dic.pkl'}"
        # Save the dictionary to a pickle file
        with open(file_path, "wb") as f:
            pickle.dump(dictionary, f, pickle.HIGHEST_PROTOCOL)

    def __init__(
        self,
        lang1: str,
        lang2: str,
        data_directory: str,
        reverse: bool,
        MAX_LENGTH: int,
        MAX_FILE_SIZE: int,
        batch_size: int,
        lr: float = 0.0005,
        hidden_size: int = 256,
        encoder_layers: int = 3,
        decoder_layers: int = 3,
        encoder_heads: int = 8,
        decoder_heads: int = 8,
        encoder_ff_size: int = 512,
        decoder_ff_size: int = 512,
        encoder_dropout: float = 0.1,
        decoder_dropout: float = 0.1,
        device: str = "cpu",
    ) -> None:
        """
        Initialize the Trainer with various hyperparameters and configurations.
        """
        self.MAX_LENGTH = MAX_LENGTH
        self.MAX_FILE_SIZE = MAX_FILE_SIZE
        self.device = device
        self.batch_size = batch_size
        self.lr = lr
        self.hidden_size = hidden_size
        self.encoder_layers = encoder_layers
        self.decoder_layers = decoder_layers
        self.encoder_heads = encoder_heads
        self.decoder_heads = decoder_heads
        self.encoder_ff_size = encoder_ff_size
        self.decoder_ff_size = decoder_ff_size
        self.encoder_dropout = encoder_dropout
        self.decoder_dropout = decoder_dropout

        # Load language data and create dictionaries
        (
            self.input_lang_dic,
            self.output_lang_dic,
            self.input_lang_list,
            self.output_lang_list,
        ) = load_files(
            lang1, lang2, data_directory, reverse, self.MAX_FILE_SIZE, self.MAX_LENGTH
        )

        if self.input_lang_dic is None or self.output_lang_dic is None:
            raise ValueError(
                "Loading language files failed due to mismatched line counts."
            )

        # Add sentences to input and output dictionaries
        for sentence in self.input_lang_list:
            self.input_lang_dic.add_sentence(sentence)
        for sentence in self.output_lang_list:
            self.output_lang_dic.add_sentence(sentence)

        # Save the dictionaries to disk
        self.save_dictionary(self.input_lang_dic, input=True)
        self.save_dictionary(self.output_lang_dic, input=False)

        # Tokenize sentences
        self.tokenized_input_lang = [
            tokenize(sentence, self.input_lang_dic, self.MAX_LENGTH)
            for sentence in self.input_lang_list
        ]
        self.tokenized_output_lang = [
            tokenize(sentence, self.output_lang_dic, self.MAX_LENGTH)
            for sentence in self.output_lang_list
        ]

        # Debugging: Print a few tokenized sequences
        print("Sample tokenized input sentences:")
        for i in range(5):
            print(self.tokenized_input_lang[i])
        print("Sample tokenized output sentences:")
        for i in range(5):
            print(self.tokenized_output_lang[i])

        self.batch_size = batch_size

        # Create data loader for batching the training data
        self.data_loader = load_batches(
            self.tokenized_input_lang,
            self.tokenized_output_lang,
            self.batch_size,
            self.device,
        )

        # Define sizes based on dictionaries
        input_size = self.input_lang_dic.n_count
        output_size = self.output_lang_dic.n_count

        # Print vocabulary sizes for debugging
        print(f"Input vocabulary size: {input_size}")
        print(f"Output vocabulary size: {output_size}")

        # Define encoder and decoder parts of the transformer
        encoder_part = Encoder(
            input_size,
            hidden_size,
            encoder_layers,
            encoder_heads,
            encoder_ff_size,
            encoder_dropout,
            self.device,
        )
        decoder_part = Decoder(
            output_size,
            hidden_size,
            decoder_layers,
            decoder_heads,
            decoder_ff_size,
            decoder_dropout,
            self.device,
        )

        # Initialize the transformer model
        self.transformer = Transformer(
            encoder_part, decoder_part, self.device, PAD_TOKEN
        ).to(self.device)
        self.transformer.apply(self.initialize_weights)

        # Define loss function and optimizer
        self.loss_func = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
        self.optimizer = optim.Adam(self.transformer.parameters(), lr=lr)


    def k_fold_cross_validation(self, k: int, epochs: int, saved_model_directory: str) -> None:
        """
        Perform K-fold cross-validation.
        
        Args:
            k (int): Number of folds.
            epochs (int): Number of epochs for each fold.
            saved_model_directory (str): Directory to save models.
        """
        kf = KFold(n_splits=k, shuffle=True, random_state=42)
        all_bleu_scores = []
        all_losses = []
        all_perplexities = []
        
        for fold, (train_index, val_index) in enumerate(kf.split(self.tokenized_input_lang)):
            print(f"Starting fold {fold + 1}/{k}")
            
            # Create training and validation datasets
            train_input = [self.tokenized_input_lang[i] for i in train_index]
            train_output = [self.tokenized_output_lang[i] for i in train_index]
            val_input = [self.tokenized_input_lang[i] for i in val_index]
            val_output = [self.tokenized_output_lang[i] for i in val_index]
            
            # Create data loaders
            self.data_loader = load_batches(train_input, train_output, self.batch_size, self.device)
            val_loader = load_batches(val_input, val_output, self.batch_size, self.device)
            
            # Reinitialize the model for each fold
            input_size = self.input_lang_dic.n_count
            output_size = self.output_lang_dic.n_count
            encoder_part = Encoder(
                input_size, self.hidden_size, self.encoder_layers, self.encoder_heads,
                self.encoder_ff_size, self.encoder_dropout, self.device
            )
            decoder_part = Decoder(
                output_size, self.hidden_size, self.decoder_layers, self.decoder_heads,
                self.decoder_ff_size, self.decoder_dropout, self.device
            )
            self.transformer = Transformer(
                encoder_part, decoder_part, self.device, PAD_TOKEN
            ).to(self.device)
            self.transformer.apply(self.initialize_weights)
            self.optimizer = optim.Adam(self.transformer.parameters(), lr=self.lr)
            
            # Train the model
            for epoch in range(epochs):
                start_time = time.time()
                train_loss, perplexity = self.train_epoch()  # Train for one epoch
                duration = time.time() - start_time  # Calculate epoch duration
                estimated_remaining_time = (
                    (k * epochs) - ((fold * epochs) + epoch + 1)
                ) * duration  # Estimate remaining time

                # Calculate BLEU score on validation dataset
                bleu_score = self.calculate_bleu(val_loader)

                # Log epoch statistics
                logging.info(
                    f"Fold {fold + 1}/{k}, Epoch {epoch + 1}/{epochs}, Time: {duration:.1f}s, Estimated remaining time: {estimated_remaining_time:.1f}s"
                )
                logging.info(
                    f"  Training Loss: {train_loss:.4f}, Perplexity: {perplexity:.4f}, BLEU Score: {bleu_score:.2f}"
                )

                # Save model checkpoint every 5 epochs
                if epoch % 5 == 0 or epoch == epochs - 1:
                    self.save_model(epoch, saved_model_directory)

            # Append results of the current fold
            all_bleu_scores.append(bleu_score)
            all_losses.append(train_loss)
            all_perplexities.append(perplexity)

        # Aggregate results
        avg_bleu_score = np.mean(all_bleu_scores)
        avg_loss = np.mean(all_losses)
        avg_perplexity = np.mean(all_perplexities)
        
        logging.info(f"K-Fold Cross-Validation Results:")
        logging.info(f"  Average BLEU Score: {avg_bleu_score:.2f}")
        logging.info(f"  Average Loss: {avg_loss:.4f}")
        logging.info(f"  Average Perplexity: {avg_perplexity:.4f}")

        # Save final model
        self.final_save_model(saved_model_directory)
        

    def train_epoch(self) -> Tuple[float, float]:
        """
        Train the model for one epoch and calculate average training loss and perplexity.
        """
        # Shuffle the data loader to prevent overfitting
        shuffle(self.data_loader)
        train_loss = 0

        for input, target in self.data_loader:
            # Skip empty batches
            if input.size(0) == 0 or target.size(0) == 0:
                logging.warning("Empty batch detected. Skipping...")
                continue

            # Ensure tensors have at least 2 dimensions
            if input.dim() == 1:
                input = input.unsqueeze(0)
            if target.dim() == 1:
                target = target.unsqueeze(0)

            # Debugging: Check if input values are within vocabulary size
            if (input >= self.input_lang_dic.n_count).any() or (
                target >= self.output_lang_dic.n_count
            ).any():
                print(f"Input tensor has values outside the vocabulary size: {input}")
                print(f"Target tensor has values outside the vocabulary size: {target}")

            # Zero gradients
            self.optimizer.zero_grad()

            # Forward pass through the transformer model
            output, _ = self.transformer(input, target[:, :-1])

            # Reshape output and target for loss calculation
            output = output.reshape(-1, output.shape[-1])
            target = target[:, 1:].reshape(-1)

            # Calculate loss
            loss = self.loss_func(output, target)
            # Backpropagation
            loss.backward()
            # Update model parameters
            self.optimizer.step()

            train_loss += loss.item()

        # Average training loss for the epoch
        avg_loss = train_loss / len(self.data_loader)
        # Perplexity is the exponential of the average loss
        perplexity = np.exp(avg_loss)
        return avg_loss, perplexity

    def train(self, epochs: int, saved_model_directory: str) -> None:
        """
        Train the model for a specified number of epochs.
        """
        for epoch in range(epochs):
            start_time = time.time()
            train_loss, perplexity = self.train_epoch()  # Train for one epoch
            duration = time.time() - start_time  # Calculate epoch duration
            estimated_remaining_time = (
                epochs - epoch - 1
            ) * duration  # Estimate remaining time

            # Log epoch statistics
            logging.info(
                f"Epoch {epoch + 1}/{epochs}, Time: {duration:.1f}s, Estimated remaining time: {estimated_remaining_time:.1f}s"
            )
            logging.info(
                f"  Training Loss: {train_loss:.4f}, Perplexity: {perplexity:.4f}"
            )

            # Save model checkpoint every 5 epochs
            if epoch % 5 == 0 or epoch == epochs - 1:
                self.save_model(epoch, saved_model_directory)

        logging.info("Training finished!")  # Log completion message
        self.final_save_model(saved_model_directory)  # Save final model

    def save_model(self, epoch: int, saved_model_directory: str) -> None:
        """
        Save the model checkpoint.

        Args:
            epoch (int): The current epoch number.
            saved_model_directory (str): The directory where the model should be saved.
        """
        # Create directory to save model checkpoints if it doesn't exist
        directory = os.path.join(
            saved_model_directory,
            f"{self.input_lang_dic.name}2{self.output_lang_dic.name}",
        )
        if not os.path.exists(directory):
            os.makedirs(directory)
        # Define model path and save model state
        model_path = os.path.join(directory, f"transformer_epoch_{epoch}.pth")
        torch.save(self.transformer.state_dict(), model_path)
        logging.info(f"Model saved to {model_path}")  # Log save message

    def final_save_model(self, saved_model_directory: str) -> None:
        """
        Save the final model checkpoint.
        """
        # Create directory to save model checkpoints if it doesn't exist
        directory = os.path.join(
            saved_model_directory,
            f"{self.input_lang_dic.name}2{self.output_lang_dic.name}",
        )
        if not os.path.exists(directory):
            os.makedirs(directory)
        # Define model path and save model state
        model_path = os.path.join(directory, "transformer_model.pt")
        torch.save(self.transformer.state_dict(), model_path)
        logging.info(f"Final model saved to {model_path}")  # Log save message

    def calculate_bleu(self, val_loader):
        """
        Calculate BLEU score on the validation dataset.
        """
        self.transformer.eval()
        references = []
        hypotheses = []
        with torch.no_grad():
            for input, target in val_loader:
                input, target = input.to(self.device), target.to(self.device)
                target_input = target[:, :-1]
                target_output = target[:, 1:]
                
                output, _ = self.transformer(input, target_input)
                predictions = output.argmax(dim=-1)
                
                for ref, hyp in zip(target_output, predictions):
                    ref_text = ' '.join([self.output_lang_dic.index2word[idx.item()] for idx in ref if idx.item() not in [PAD_TOKEN, EOS_TOKEN]])
                    hyp_text = ' '.join([self.output_lang_dic.index2word[idx.item()] for idx in hyp if idx.item() not in [PAD_TOKEN, EOS_TOKEN]])
                    references.append(ref_text)
                    hypotheses.append(hyp_text)
        
        bleu = sacrebleu.corpus_bleu(hypotheses, [references])
        return bleu.score
        

def main() -> None:
    # Example parameters
    lang1 = 'english'
    lang2 = 'french'
    data_directory = 'data'
    reverse = 0
    MAX_LENGTH = 60
    MAX_FILE_SIZE = 200000
    batch_size = 128
    lr = 0.0005
    hidden_size = 256
    encoder_layers = 3
    decoder_layers = 3
    encoder_heads = 8
    decoder_heads = 8
    encoder_ff_size = 512
    decoder_ff_size = 512
    encoder_dropout = 0.1
    decoder_dropout = 0.1
    device = 'cuda'
    
    # Initialize the trainer
    trainer = Trainer(
        lang1, lang2, data_directory, reverse, MAX_LENGTH, MAX_FILE_SIZE, batch_size, lr,
        hidden_size, encoder_layers, decoder_layers, encoder_heads, decoder_heads,
        encoder_ff_size, decoder_ff_size, encoder_dropout, decoder_dropout, device
    )
    
    # Perform k-fold cross-validation
    k = 5
    epochs = 10
    saved_model_directory = './saved_models'
    trainer.k_fold_cross_validation(k, epochs, saved_model_directory)
    # Start the training process
    # trainer.train(epochs, saved_model_directory)


if __name__ == "__main__":
    main()  # Run the main function if the script is executed


Loaded 200000 sentences for english
Loaded 200000 sentences for french
200000 english sentences after length filtering
200000 french sentences after length filtering
Sample tokenized input sentences:
[1, 4, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 4, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 4, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 4, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 6, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

2024-08-08 07:38:40,184 - INFO - Fold 1/5, Epoch 1/10, Time: 85.1s, Estimated remaining time: 4168.2s
2024-08-08 07:38:40,186 - INFO -   Training Loss: 3.6934, Perplexity: 40.1820, BLEU Score: 1.45
2024-08-08 07:38:40,450 - INFO - Model saved to ./saved_models/english2french/transformer_epoch_0.pth
2024-08-08 07:41:36,861 - INFO - Fold 1/5, Epoch 2/10, Time: 80.3s, Estimated remaining time: 3853.0s
2024-08-08 07:41:36,862 - INFO -   Training Loss: 1.7606, Perplexity: 5.8162, BLEU Score: 3.84
2024-08-08 07:44:35,735 - INFO - Fold 1/5, Epoch 3/10, Time: 80.4s, Estimated remaining time: 3777.5s
2024-08-08 07:44:35,736 - INFO -   Training Loss: 1.1238, Perplexity: 3.0765, BLEU Score: 4.54
2024-08-08 07:47:32,344 - INFO - Fold 1/5, Epoch 4/10, Time: 80.4s, Estimated remaining time: 3699.9s
2024-08-08 07:47:32,346 - INFO -   Training Loss: 0.8613, Perplexity: 2.3662, BLEU Score: 4.77
2024-08-08 07:50:27,934 - INFO - Fold 1/5, Epoch 5/10, Time: 80.7s, Estimated remaining time: 3631.2s
2024-08

Starting fold 2/5


2024-08-08 08:08:10,029 - INFO - Fold 2/5, Epoch 1/10, Time: 83.5s, Estimated remaining time: 3257.9s
2024-08-08 08:08:10,030 - INFO -   Training Loss: 3.8082, Perplexity: 45.0687, BLEU Score: 0.93
2024-08-08 08:08:10,249 - INFO - Model saved to ./saved_models/english2french/transformer_epoch_0.pth
2024-08-08 08:11:07,701 - INFO - Fold 2/5, Epoch 2/10, Time: 80.6s, Estimated remaining time: 3062.4s
2024-08-08 08:11:07,702 - INFO -   Training Loss: 1.9541, Perplexity: 7.0575, BLEU Score: 3.48
2024-08-08 08:14:04,310 - INFO - Fold 2/5, Epoch 3/10, Time: 80.2s, Estimated remaining time: 2967.5s
2024-08-08 08:14:04,311 - INFO -   Training Loss: 1.1939, Perplexity: 3.2998, BLEU Score: 4.42
2024-08-08 08:17:00,478 - INFO - Fold 2/5, Epoch 4/10, Time: 80.2s, Estimated remaining time: 2886.9s
2024-08-08 08:17:00,480 - INFO -   Training Loss: 0.8942, Perplexity: 2.4453, BLEU Score: 4.85
2024-08-08 08:19:57,289 - INFO - Fold 2/5, Epoch 5/10, Time: 79.9s, Estimated remaining time: 2795.4s
2024-08

Starting fold 3/5


2024-08-08 08:37:41,437 - INFO - Fold 3/5, Epoch 1/10, Time: 83.2s, Estimated remaining time: 2412.9s
2024-08-08 08:37:41,438 - INFO -   Training Loss: 3.8031, Perplexity: 44.8421, BLEU Score: 0.80
2024-08-08 08:37:41,672 - INFO - Model saved to ./saved_models/english2french/transformer_epoch_0.pth
2024-08-08 08:40:38,532 - INFO - Fold 3/5, Epoch 2/10, Time: 79.8s, Estimated remaining time: 2233.9s
2024-08-08 08:40:38,533 - INFO -   Training Loss: 2.0648, Perplexity: 7.8834, BLEU Score: 3.33
2024-08-08 08:43:32,877 - INFO - Fold 3/5, Epoch 3/10, Time: 80.3s, Estimated remaining time: 2168.4s
2024-08-08 08:43:32,878 - INFO -   Training Loss: 1.2484, Perplexity: 3.4847, BLEU Score: 4.27
2024-08-08 08:46:26,611 - INFO - Fold 3/5, Epoch 4/10, Time: 80.1s, Estimated remaining time: 2081.5s
2024-08-08 08:46:26,612 - INFO -   Training Loss: 0.9301, Perplexity: 2.5347, BLEU Score: 4.66
2024-08-08 08:49:23,353 - INFO - Fold 3/5, Epoch 5/10, Time: 80.2s, Estimated remaining time: 2005.7s
2024-08

Starting fold 4/5


2024-08-08 09:06:32,332 - INFO - Fold 4/5, Epoch 1/10, Time: 83.5s, Estimated remaining time: 1586.9s
2024-08-08 09:06:32,333 - INFO -   Training Loss: 3.6860, Perplexity: 39.8858, BLEU Score: 1.19
2024-08-08 09:06:32,699 - INFO - Model saved to ./saved_models/english2french/transformer_epoch_0.pth
2024-08-08 09:09:22,243 - INFO - Fold 4/5, Epoch 2/10, Time: 80.4s, Estimated remaining time: 1446.5s
2024-08-08 09:09:22,243 - INFO -   Training Loss: 1.8246, Perplexity: 6.2005, BLEU Score: 3.68
2024-08-08 09:12:12,390 - INFO - Fold 4/5, Epoch 3/10, Time: 80.1s, Estimated remaining time: 1361.6s
2024-08-08 09:12:12,391 - INFO -   Training Loss: 1.1463, Perplexity: 3.1466, BLEU Score: 4.42
2024-08-08 09:15:02,036 - INFO - Fold 4/5, Epoch 4/10, Time: 80.1s, Estimated remaining time: 1281.2s
2024-08-08 09:15:02,037 - INFO -   Training Loss: 0.8753, Perplexity: 2.3995, BLEU Score: 4.83
2024-08-08 09:17:51,806 - INFO - Fold 4/5, Epoch 5/10, Time: 80.0s, Estimated remaining time: 1200.7s
2024-08

Starting fold 5/5


2024-08-08 09:35:02,227 - INFO - Fold 5/5, Epoch 1/10, Time: 83.5s, Estimated remaining time: 751.2s
2024-08-08 09:35:02,228 - INFO -   Training Loss: 3.7543, Perplexity: 42.7036, BLEU Score: 0.98
2024-08-08 09:35:02,507 - INFO - Model saved to ./saved_models/english2french/transformer_epoch_0.pth
2024-08-08 09:37:52,045 - INFO - Fold 5/5, Epoch 2/10, Time: 80.3s, Estimated remaining time: 642.4s
2024-08-08 09:37:52,046 - INFO -   Training Loss: 1.8567, Perplexity: 6.4029, BLEU Score: 3.60
2024-08-08 09:40:42,489 - INFO - Fold 5/5, Epoch 3/10, Time: 80.3s, Estimated remaining time: 561.9s
2024-08-08 09:40:42,490 - INFO -   Training Loss: 1.1550, Perplexity: 3.1739, BLEU Score: 4.35
2024-08-08 09:43:32,616 - INFO - Fold 5/5, Epoch 4/10, Time: 80.3s, Estimated remaining time: 481.8s
2024-08-08 09:43:32,617 - INFO -   Training Loss: 0.8710, Perplexity: 2.3894, BLEU Score: 4.71
2024-08-08 09:46:22,629 - INFO - Fold 5/5, Epoch 5/10, Time: 80.2s, Estimated remaining time: 400.9s
2024-08-08 0

# Translate and Summarize

#### This cell contains the function to translate input text using the trained Transformer model and also summarize the text.
#### It processes the input sequence, generates the translation, and returns the translated output.
#### Before proceeding to translation, we produce a summarized version of the text.

In [5]:
import torch
import pickle
import random
from typing import Tuple, Any, List
from datasets import load_dataset
from collections import Counter
import re
from rouge_score import rouge_scorer

# Special tokens
SOS_TOKEN = 1  # Start of Sentence token
EOS_TOKEN = 2  # End of Sentence token

def load_dictionary(directory: str) -> 'Dictionary':
    """
    Load a language dictionary from a pickle file.

    Args:
        directory (str): Path to the pickle file.

    Returns:
        Dictionary: Loaded dictionary object.
    """
    with open(directory, "rb") as f:
        return pickle.load(f)

def summarize_document(document: str, max_chars: int = 100) -> str:
    """
    Summarize the document by extracting the most frequent sentences.

    Args:
        document (str): Document to be summarized.
        max_chars (int): Maximum length of the summary in characters.

    Returns:
        str: Summary of the document.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', document)
    if len(sentences) <= 1:
        return document[:max_chars]

    # Tokenize and count word frequencies
    word_freq = Counter()
    for sentence in sentences:
        words = re.findall(r'\w+', sentence.lower())
        word_freq.update(words)

    # Score sentences based on word frequencies
    sentence_scores = {}
    for sentence in sentences:
        sentence_scores[sentence] = sum(word_freq.get(word.lower(), 0) for word in re.findall(r'\w+', sentence))

    # Sort sentences by score and extract the most frequent ones
    sorted_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    summary = ' '.join(sorted_sentences)

    # Ensure summary does not exceed max_chars
    return summary[:max_chars]

def translate_sentence(
    sentence: str,
    input_dic: 'Dictionary',
    output_dic: 'Dictionary',
    model: 'Transformer',
    device: torch.device,
    max_len: int,
    prob_threshold: float = 0.1,  # Probability threshold for stopping condition
) -> Tuple[str, Any]:
    """
    Translate a sentence from the input language to the output language using the transformer model.

    Args:
        sentence (str): Sentence to be translated.
        input_dic (Dictionary): Input language dictionary.
        output_dic (Dictionary): Output language dictionary.
        model (Transformer): Transformer model for translation.
        device (torch.device): Device to run the model on (CPU or GPU).
        max_len (int): Maximum length of the output sentence.
        prob_threshold (float): Probability threshold for stopping condition.

    Returns:
        Tuple[str, Any]: Translated sentence and attention weights.
    """
    model.eval()
    normalized_sentence = normalizeString(sentence)
    tokens = tokenize(normalized_sentence, input_dic, max_len)

    # Ensure tokens are within the valid range
    tokens = [min(token, input_dic.n_count - 1) for token in tokens]

    input_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
    input_mask = model.make_input_mask(input_tensor)

    with torch.no_grad():
        encoded_input = model.encoder(input_tensor, input_mask)

    target_tokens = [SOS_TOKEN]
    generated_sentences = set()

    for i in range(max_len):
        target_tensor = torch.LongTensor(target_tokens).unsqueeze(0).to(device)
        target_mask = model.make_target_mask(target_tensor)

        with torch.no_grad():
            output, attention = model.decoder(
                target_tensor, encoded_input, target_mask, input_mask
            )

        pred_token = output.argmax(2)[:, -1].item()
        pred_prob = torch.softmax(output, dim=-1)[0, -1, pred_token].item()

        # Penalize repetition of the same token
        if len(target_tokens) > 1 and pred_token == target_tokens[-1]:
            output[
                0, -1, pred_token
            ] -= 1.0  # Decrease the probability of the last token
            pred_token = output.argmax(2)[:, -1].item()
            pred_prob = torch.softmax(output, dim=-1)[0, -1, pred_token].item()

        target_tokens.append(pred_token)

        # Break if the probability of the predicted token is below the threshold
        if pred_prob < prob_threshold:
            break

        # Check for sentence repetition and penalize
        current_sentence = " ".join(
            [output_dic.index2word[t] for t in target_tokens[1:]]
        )
        if current_sentence in generated_sentences:
            output[0, -1, pred_token] -= 1.0  # Penalize repetition
            pred_token = output.argmax(2)[:, -1].item()
            target_tokens[-1] = pred_token  # Update with new token
        else:
            generated_sentences.add(current_sentence)

        # Break if the end of sentence token is predicted
        if pred_token == EOS_TOKEN:
            break

    # Convert token IDs to words, ignoring the first token (SOS) and the last token (EOS)
    target_results = [output_dic.index2word[i] for i in target_tokens if i != EOS_TOKEN]

    return " ".join(target_results[1:]), attention

def translate_documents(
    documents: List[str],
    input_dic: 'Dictionary',
    output_dic: 'Dictionary',
    model: 'Transformer',
    device: torch.device,
    max_len: int,
    prob_threshold: float = 0.1,
) -> List[str]:
    """
    Translate a list of documents from the input language to the output language using the transformer model.

    Args:
        documents (List[str]): List of documents to be translated.
        input_dic (Dictionary): Input language dictionary.
        output_dic (Dictionary): Output language dictionary.
        model (Transformer): Transformer model for translation.
        device (torch.device): Device to run the model on (CPU or GPU).
        max_len (int): Maximum length of the output sentence.
        prob_threshold (float): Probability threshold for stopping condition.

    Returns:
        List[str]: List of translated documents.
    """
    translated_documents = []
    for document in documents:
        summary = summarize_document(document)
        translation, _ = translate_sentence(
            summary, input_dic, output_dic, model, device, max_len, prob_threshold
        )
        translated_documents.append(translation)
    return translated_documents

def calculate_rouge_scores(hypotheses: List[str], references: List[str]) -> dict:
    """
    Calculate ROUGE scores for a list of hypotheses against reference summaries.

    Args:
        hypotheses (List[str]): List of generated summaries.
        references (List[str]): List of reference summaries.

    Returns:
        dict: ROUGE scores dictionary.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
    num_summaries = len(hypotheses)

    for hypothesis, reference in zip(hypotheses, references):
        if isinstance(reference, list):
            reference = reference[0]  # Ensure reference is a single string
        if isinstance(hypothesis, list):
            hypothesis = hypothesis[0]  # Ensure hypothesis is a single string
        score = scorer.score(reference, hypothesis)
        for key in scores:
            scores[key] += score[key].fmeasure

    # Average scores
    scores = {key: value / num_summaries for key, value in scores.items()}
    return scores

def main() -> None:
    # Set parameters as variables
    dataset_en = load_dataset("wiki_lingua", "english")
    input_lang = "english"
    output_lang = "french"
    models_dir = "saved_models/"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Hyperparameters
    MAX_LENGTH = 60
    hidden_size = 256
    encoder_layers = 3
    decoder_layers = 3
    encoder_heads = 8
    decoder_heads = 8
    encoder_ff_size = 512
    decoder_ff_size = 512
    encoder_dropout = 0.1
    decoder_dropout = 0.1

    transformer_location = f"{models_dir}{input_lang}2{output_lang}/"

    # Load dictionaries
    input_lang_dic = load_dictionary(transformer_location + "input_dic.pkl")
    output_lang_dic = load_dictionary(transformer_location + "output_dic.pkl")

    input_size = input_lang_dic.n_count
    output_size = output_lang_dic.n_count

    # Define models
    encoder_part = Encoder(
        input_size,
        hidden_size,
        encoder_layers,
        encoder_heads,
        encoder_ff_size,
        encoder_dropout,
        device,
    )
    decoder_part = Decoder(
        output_size,
        hidden_size,
        decoder_layers,
        decoder_heads,
        decoder_ff_size,
        decoder_dropout,
        device,
    )

    translator = Transformer(encoder_part, decoder_part, device).to(device)
    # Correctly load model state dictionary
    translator.load_state_dict(
        torch.load(
            transformer_location + "transformer_model.pt",
            map_location=torch.device("cpu"),  # remove this when cuda available NOTE: SAI
        )
    )

    # Randomly select 5 documents
    random_docs = random.sample(dataset_en["train"]["article"], 5)
    documents = []
    reference_summaries = []
    for doc in random_docs:
        if len(doc["document"]) > 1:
            document_text = doc["document"][1]
            documents.append(document_text)
            reference_summaries.append(doc["summary"])
        else:
            print(f"Skipping document due to insufficient structure: {doc}")

    # Translate the summarized documents
    translations = translate_documents(
        documents, input_lang_dic, output_lang_dic, translator, device, MAX_LENGTH
    )

    # Print results and calculate ROUGE scores
    generated_summaries = [summarize_document(doc) for doc in documents]

    for i, (original_doc, summary, translation, reference_summary) in enumerate(
        zip(documents, generated_summaries, translations, reference_summaries)
    ):
        print(f"Document {i+1} - Original: {original_doc[:300]}...")  # Print a snippet of the original document
        print(f"Document {i+1} - Summary: {summary}")
        print(f"Document {i+1} - Translated Summary: {translation}\n")

    # Calculate ROUGE scores
    rouge_scores = calculate_rouge_scores(generated_summaries, reference_summaries)
    print(f"ROUGE Scores: {rouge_scores}")

if __name__ == "__main__":
    main()


  torch.load(


Skipping document due to insufficient structure: {'section_name': [], 'document': [], 'summary': []}
Document 1 - Original: Let him know that you admire him and appreciate all of the good in him. Show him that you value him just as much, even if he is going through a tough time. Giving him unconditional positive regard will reassure him that he can count on you even when he’s not feeling his best. Take his mind off his s...
Document 1 - Summary: Show him that you value him just as much, even if he is going through a tough time. If he is going t
Document 1 - Translated Summary: montrez lui que de la valeur s il n en va pas beaucoup .

Document 2 - Original: To recap our examples from the prior section:   Cash:  $20,000  Investments / Retirement:  $20,000  Home:  $150,000  Personal Property:  $25,000  Car:  $15,000  Life Insurance:  $10,000  Total:  $240,000 Again, from our sample estimates:   Home Loan:  $120,000  Auto Loan:  $15,000  Student Loans:  $...
Document 2 - Summary: To recap 