In [176]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn
from datasets import Dataset
from tokenizers import BertWordPieceTokenizer

from sklearn.model_selection import train_test_split
from transformers_from_scratch import Transformer

import os
import pandas as pd


from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [142]:

file_path = "src/sample_dataset_1.csv"
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,French,Bassa
0,Il y avait dans le pays d'Uts un homme dont le...,"Mut wada a bé yééne i loñ Us, jôl jé li bé le ..."
1,Et il lui naquit sept fils et trois filles;,"A bééna bon bôlôm basaambok, ni bon bôda baa."
2,"et il possédait sept mille brebis, et trois mi...","A bééna ki 7 000 di mintômba, 3 000 di kamél, ..."
3,"Et ses fils allaient et faisaient un festin, c...",Hiki man wé nu munlôm a bééna yé ngéda i tégba...
4,"Et il arrivait que, quand les jours de festin ...","I ngéda ba bé ba mal i mangand ma, Hiôb a bé a..."


Method to train an encoder-decoder transformer model from scratch where the encoder processes French text and the decoder processes Bassa text. To do this, it is better to use separate tokenizers for each language. This way, each tokenizer can focus on the specific vocabulary and tokenization rules of its respective language, which can improve the efficiency and accuracy of the model.

### Step 1: Create Separate Tokenizers
We'll create separate tokenizers for French and Bassa.

In [143]:


# Save the French texts to a temporary file
os.makedirs("temp", exist_ok=True)
french_texts_path = "temp/french_texts.txt"
data['French'].to_csv(french_texts_path, index=False, header=False)

# Save the Bassa texts to a temporary file
bassa_texts_path = "temp/bassa_texts.txt"
data['Bassa'].to_csv(bassa_texts_path, index=False, header=False)

# Initialize and train the French tokenizer
french_tokenizer = BertWordPieceTokenizer()
french_tokenizer.train(files=[french_texts_path], vocab_size=30_000, min_frequency=2, special_tokens=[
    "[PAD]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
])
os.makedirs("french_tokenizer", exist_ok=True)
french_tokenizer.save_model("french_tokenizer")

# Initialize and train the Bassa tokenizer
bassa_tokenizer = BertWordPieceTokenizer()
bassa_tokenizer.train(files=[bassa_texts_path], vocab_size=30_000, min_frequency=2, special_tokens=[
    "[PAD]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
])
os.makedirs("bassa_tokenizer", exist_ok=True)
bassa_tokenizer.save_model("bassa_tokenizer")

# Load the tokenizers using the transformers library
from transformers import BertTokenizerFast

french_tokenizer = BertTokenizerFast.from_pretrained("french_tokenizer")
bassa_tokenizer = BertTokenizerFast.from_pretrained("bassa_tokenizer")









### Step 2: Prepare the Dataset

We will prepare the datasets for training using the respective tokenizers.

In [144]:
# Split the dataset into training and validation sets
train_french, val_french, train_bassa, val_bassa = train_test_split(
    data['French'], data['Bassa'], test_size=0.1, random_state=42
)

In [188]:
val_french

6384     Car ainsi m'a dit le Seigneur: Va, place une s...
9841     Car autrefois, aux jours de David et d'Asaph, ...
1599     ils ne seront pas confus au mauvais temps, et ...
3295     C'est ici mon repos à perpétuité; ici j'habite...
13729    Dieu dit à Abraham: Quant à Saraï, ta femme, t...
                               ...                        
13818    Et Dieu lui dit en songe: Moi aussi je sais qu...
5352     Et si un homme a emprunté une bête à son proch...
10528    Moi, j'ai fait la terre, l'homme et la bête qu...
3511     Animaux et tout le bétail, reptiles et oiseaux...
17166    Élisée vint à Damas; et Ben-Hadad, roi de Syri...
Name: French, Length: 2312, dtype: object

In [187]:
val_bassa

6384     Inyule Yéhôva a nkal me le: “Ke téé mut faa, a...
9841     Inyule i ngéda kôba, i dilo di David bo Asaf, ...
1599     Ba ga wo bé nyuu i ngéda bikuu; ba ga bana nga...
3295     “Linôyôi jem li i boga ni boga; m’a yééne+ mu,...
13729    I mbus, Nyambe a kal Abraham le: “Inyu nwaa wo...
                               ...                        
13818    Bañga Nyambe a kal nye ikété eem le: “Me nyi l...
5352     “Ibale mut a mpoo lém yak mut wé libôk, ndi lé...
10528    ‘Men me bi hek hisi, bôt ba binam ni binuga bi...
3511     a bé binuga bi bikai+ ni bilém gwobisôna, a bé...
17166    Élisa a bi ke i Damaskô+ i ngéda Ben-Hadad,+ k...
Name: Bassa, Length: 2312, dtype: object

In [145]:
# Function to prepare dataset
def tokenize_and_prepare_dataset(src_texts, tgt_texts, src_tokenizer, tgt_tokenizer):
    src_encodings = src_tokenizer(src_texts.tolist(), truncation=True, padding=True, max_length=100, return_tensors="pt")
    tgt_encodings = tgt_tokenizer(tgt_texts.tolist(), truncation=True, padding=True, max_length=100, return_tensors="pt")

    dataset = TensorDataset(src_encodings['input_ids'], tgt_encodings['input_ids'])
    return dataset

In [146]:
# Tokenize and prepare training and validation datasets
train_dataset = tokenize_and_prepare_dataset(train_french, train_bassa, french_tokenizer, bassa_tokenizer)
val_dataset = tokenize_and_prepare_dataset(val_french, val_bassa, french_tokenizer, bassa_tokenizer)

# DataLoader
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

  0%|          | 0/15606 [2:52:42<?, ?it/s]


### Step 2: Train the Model
Now, we need to set up the training loop to train your custom transformer model using the prepared data.



In [147]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
src_vocab_size = french_tokenizer.vocab_size
trg_vocab_size = bassa_tokenizer.vocab_size
# Define the padding token indices for the source and target tokenizers
src_pad_idx = french_tokenizer.pad_token_id
trg_pad_idx = bassa_tokenizer.pad_token_id
embed_size = 256
num_layers = 6
forward_expansion = 4
heads = 8
dropout = 0.1
max_length = 100
learning_rate = 0.0003

# Initialize the transformer model
model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx=src_pad_idx, trg_pad_idx=trg_pad_idx, embed_size=embed_size, num_layers=num_layers, forward_expansion=forward_expansion, heads=heads, dropout=dropout, device=device, max_length=max_length).to(device)


In [148]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
num_epochs = 50


In [149]:
for epoch in range(num_epochs):
    model.train()
    for batch_idx, (src, trg) in enumerate(train_loader):
        src, trg = src.to(device), trg.to(device)
        trg_input = trg[:, :-1]
        trg_target = trg[:, 1:]

        optimizer.zero_grad()
        output = model(src, trg_input)
        
        # Reshape output and target to calculate loss
        output = output.reshape(-1, output.shape[2])
        trg_target = trg_target.reshape(-1)
        
        loss = criterion(output, trg_target)
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}")

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src, trg in val_loader:
            src, trg = src.to(device), trg.to(device)
            trg_input = trg[:, :-1]
            trg_target = trg[:, 1:]

            output = model(src, trg_input)
            output = output.reshape(-1, output.shape[2])
            trg_target = trg_target.reshape(-1)

            loss = criterion(output, trg_target)
            val_loss += loss.item()

    val_loss /= len(val_loader)
    print(f"Validation Loss after Epoch [{epoch+1}/{num_epochs}]: {val_loss:.4f}")

Epoch [1/50], Step [0/651], Loss: 9.0764
Epoch [1/50], Step [100/651], Loss: 5.5660
Epoch [1/50], Step [200/651], Loss: 5.5436
Epoch [1/50], Step [300/651], Loss: 5.6159
Epoch [1/50], Step [400/651], Loss: 5.3959
Epoch [1/50], Step [500/651], Loss: 5.1963
Epoch [1/50], Step [600/651], Loss: 4.8183
Validation Loss after Epoch [1/50]: 4.7980
Epoch [2/50], Step [0/651], Loss: 4.8736
Epoch [2/50], Step [100/651], Loss: 4.7693
Epoch [2/50], Step [200/651], Loss: 4.6657
Epoch [2/50], Step [300/651], Loss: 4.7549
Epoch [2/50], Step [400/651], Loss: 4.5263
Epoch [2/50], Step [500/651], Loss: 4.4027
Epoch [2/50], Step [600/651], Loss: 4.4354
Validation Loss after Epoch [2/50]: 4.3628
Epoch [3/50], Step [0/651], Loss: 4.3717
Epoch [3/50], Step [100/651], Loss: 4.2544
Epoch [3/50], Step [200/651], Loss: 4.2041
Epoch [3/50], Step [300/651], Loss: 4.2838
Epoch [3/50], Step [400/651], Loss: 4.2906
Epoch [3/50], Step [500/651], Loss: 4.2283
Epoch [3/50], Step [600/651], Loss: 4.1304
Validation Loss a

In [156]:
bassa_tokenizer.pad_token_id

0

In [157]:
src_pad_idx = french_tokenizer.pad_token_id
trg_pad_idx = bassa_tokenizer.pad_token_id

In [158]:
# Save the model
model_save_path = "transformer_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

# Load the model
loaded_model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, embed_size=256, num_layers=6, forward_expansion=4, heads=8, dropout=0.1, device=device, max_length=100).to(device)
loaded_model.load_state_dict(torch.load(model_save_path))
print(f"Model loaded from {model_save_path}")

Model saved to transformer_model.pth
Model loaded from transformer_model.pth


In [160]:
def translate_sentence(model, sentence, src_tokenizer, tgt_tokenizer, device, max_length=50):
    src_tokens = src_tokenizer(sentence, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")
    src_tokens = src_tokens['input_ids'].to(device)
    
    model.eval()
    with torch.no_grad():
        # Generate the source mask
        src_mask = model.make_src_mask(src_tokens)
        
        # Encode the source tokens
        enc_src = model.encoder(src_tokens, src_mask)
        
        # Prepare the initial target input token ([CLS] token)
        tgt_tokens = torch.tensor([[tgt_tokenizer.cls_token_id]], dtype=torch.long).to(device)
        
        for _ in range(max_length):
            # Generate the target mask
            trg_mask = model.make_trg_mask(tgt_tokens).to(device)
            
            # Decode the current target tokens
            output = model.decoder(tgt_tokens, enc_src, src_mask, trg_mask)
            
            # Get the last token's logits and apply softmax to get probabilities
            preds = output[:, -1, :].softmax(dim=-1)
            
            # Get the token ID with the highest probability
            next_token = preds.argmax(1).unsqueeze(0)
            
            # Concatenate the predicted token to the target tokens
            tgt_tokens = torch.cat((tgt_tokens, next_token), dim=1)
            
            # Stop if the end token is generated
            if next_token.item() == tgt_tokenizer.sep_token_id:
                break
        
    # Decode the token IDs to get the translated sentence
    translated_sentence = tgt_tokenizer.decode(tgt_tokens.squeeze().tolist(), skip_special_tokens=True)
    return translated_sentence




In [189]:
val_french

6384     Car ainsi m'a dit le Seigneur: Va, place une s...
9841     Car autrefois, aux jours de David et d'Asaph, ...
1599     ils ne seront pas confus au mauvais temps, et ...
3295     C'est ici mon repos à perpétuité; ici j'habite...
13729    Dieu dit à Abraham: Quant à Saraï, ta femme, t...
                               ...                        
13818    Et Dieu lui dit en songe: Moi aussi je sais qu...
5352     Et si un homme a emprunté une bête à son proch...
10528    Moi, j'ai fait la terre, l'homme et la bête qu...
3511     Animaux et tout le bétail, reptiles et oiseaux...
17166    Élisée vint à Damas; et Ben-Hadad, roi de Syri...
Name: French, Length: 2312, dtype: object

In [190]:
val_bassa

6384     Inyule Yéhôva a nkal me le: “Ke téé mut faa, a...
9841     Inyule i ngéda kôba, i dilo di David bo Asaf, ...
1599     Ba ga wo bé nyuu i ngéda bikuu; ba ga bana nga...
3295     “Linôyôi jem li i boga ni boga; m’a yééne+ mu,...
13729    I mbus, Nyambe a kal Abraham le: “Inyu nwaa wo...
                               ...                        
13818    Bañga Nyambe a kal nye ikété eem le: “Me nyi l...
5352     “Ibale mut a mpoo lém yak mut wé libôk, ndi lé...
10528    ‘Men me bi hek hisi, bôt ba binam ni binuga bi...
3511     a bé binuga bi bikai+ ni bilém gwobisôna, a bé...
17166    Élisa a bi ke i Damaskô+ i ngéda Ben-Hadad,+ k...
Name: Bassa, Length: 2312, dtype: object

In [164]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assuming the model and tokenizers are already initialized and trained
# sentence = "Your input sentence in French"
#sentence = "Bonjour tout le monde"
sentence = "Et l'Éternel dit à Satan"
translated_sentence = translate_sentence(loaded_model, sentence, french_tokenizer, bassa_tokenizer, device)
print(f"Translated Sentence: {translated_sentence}")

Translated Sentence: yehova a konde podos satan, a kal nye le :


In [167]:
sentence = "Du matin au soir, ils sont frappés; ils périssent pour toujours sans qu'on y fasse attention."
translated_sentence = translate_sentence(loaded_model, sentence, french_tokenizer, bassa_tokenizer, device)
print(f"Translated Sentence: {translated_sentence}")

Translated Sentence: kegla, kegla, ba ye kegla ; ba wok kegla, ba wok ki ngeda yosona.


In [None]:
"Inyule Yéhôva a nkal me le"

In [191]:
sentence = "Car ainsi m'a dit le Seigneur"
translated_sentence = translate_sentence(loaded_model, sentence, french_tokenizer, bassa_tokenizer, device)
print(f"Translated Sentence: {translated_sentence}")

Translated Sentence: inyule yehova a bi kal me le :


In [177]:


def evaluate_bleu(model, data_loader, src_tokenizer, tgt_tokenizer, device, max_length=50):
    model.eval()
    references = []
    hypotheses = []

    with torch.no_grad():
        for src, tgt in data_loader:
            src, tgt = src.to(device), tgt.to(device)

            for i in range(src.size(0)):
                src_sentence = src[i].unsqueeze(0)
                tgt_sentence = tgt[i].unsqueeze(0)
                tgt_input = tgt_sentence[:, :-1]

                src_mask = model.make_src_mask(src_sentence)
                enc_src = model.encoder(src_sentence, src_mask)
                tgt_tokens = torch.tensor([[tgt_tokenizer.cls_token_id]], dtype=torch.long).to(device)

                for _ in range(max_length):
                    trg_mask = model.make_trg_mask(tgt_tokens).to(device)
                    output = model.decoder(tgt_tokens, enc_src, src_mask, trg_mask)
                    preds = output[:, -1, :].softmax(dim=-1)
                    next_token = preds.argmax(1).unsqueeze(0)
                    tgt_tokens = torch.cat((tgt_tokens, next_token), dim=1)
                    if next_token.item() == tgt_tokenizer.sep_token_id:
                        break

                predicted_sentence = tgt_tokenizer.decode(tgt_tokens.squeeze().tolist(), skip_special_tokens=True)
                reference_sentence = tgt_tokenizer.decode(tgt_input.squeeze().tolist(), skip_special_tokens=True)

                references.append([reference_sentence.split()])
                hypotheses.append(predicted_sentence.split())

    # Calculate BLEU score
    smoothing = SmoothingFunction().method4
    bleu_scores = [sentence_bleu(ref, hyp, smoothing_function=smoothing) for ref, hyp in zip(references, hypotheses)]
    average_bleu = sum(bleu_scores) / len(bleu_scores)
    return average_bleu




In [178]:
# Example usage
bleu_score = evaluate_bleu(loaded_model, val_loader, french_tokenizer, bassa_tokenizer, device)
print(f"BLEU Score: {bleu_score:.4f}")

BLEU Score: 0.1148


In [179]:
from rouge_score import rouge_scorer

def evaluate_rouge(model, data_loader, src_tokenizer, tgt_tokenizer, device, max_length=50):
    model.eval()
    references = []
    hypotheses = []

    with torch.no_grad():
        for src, tgt in data_loader:
            src, tgt = src.to(device), tgt.to(device)

            for i in range(src.size(0)):
                src_sentence = src[i].unsqueeze(0)
                tgt_sentence = tgt[i].unsqueeze(0)
                tgt_input = tgt_sentence[:, :-1]

                src_mask = model.make_src_mask(src_sentence)
                enc_src = model.encoder(src_sentence, src_mask)
                tgt_tokens = torch.tensor([[tgt_tokenizer.cls_token_id]], dtype=torch.long).to(device)

                for _ in range(max_length):
                    trg_mask = model.make_trg_mask(tgt_tokens).to(device)
                    output = model.decoder(tgt_tokens, enc_src, src_mask, trg_mask)
                    preds = output[:, -1, :].softmax(dim=-1)
                    next_token = preds.argmax(1).unsqueeze(0)
                    tgt_tokens = torch.cat((tgt_tokens, next_token), dim=1)
                    if next_token.item() == tgt_tokenizer.sep_token_id:
                        break

                predicted_sentence = tgt_tokenizer.decode(tgt_tokens.squeeze().tolist(), skip_special_tokens=True)
                reference_sentence = tgt_tokenizer.decode(tgt_input.squeeze().tolist(), skip_special_tokens=True)

                references.append(reference_sentence)
                hypotheses.append(predicted_sentence)

    # Calculate ROUGE score
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(ref, hyp) for ref, hyp in zip(references, hypotheses)]
    average_rouge1 = sum(score['rouge1'].fmeasure for score in scores) / len(scores)
    average_rougeL = sum(score['rougeL'].fmeasure for score in scores) / len(scores)
    return average_rouge1, average_rougeL




In [180]:
# Example usage
rouge1, rougeL = evaluate_rouge(loaded_model, val_loader, french_tokenizer, bassa_tokenizer, device)
print(f"ROUGE-1 Score: {rouge1:.4f}")
print(f"ROUGE-L Score: {rougeL:.4f}")

ROUGE-1 Score: 0.4284
ROUGE-L Score: 0.3626


### BLEU Score (Bilingual Evaluation Understudy)

**BLEU** is a precision-oriented metric that measures how many words (n-grams) in the candidate translation appear in the reference translation. It's widely used for evaluating machine translation models.

#### Key Concepts

1. **N-grams**: Sequences of N words. BLEU typically uses 1-grams (individual words) to 4-grams.
2. **Precision**: The fraction of n-grams in the candidate translation that are also in the reference translation.
3. **Brevity Penalty**: A penalty applied to short translations to avoid the model generating very short translations that would have high precision but miss a lot of content.

#### Calculation

1. **N-gram Precision**: Compute precision for 1-gram, 2-gram, ..., up to N-gram.
2. **Geometric Mean**: Take the geometric mean of the n-gram precisions.
3. **Brevity Penalty**: Apply the brevity penalty to the geometric mean to get the final BLEU score.

#### Formula

$$ \text{BLEU} = \text{BP} \times \exp \left( \sum_{n=1}^{N} w_n \log p_n \right) $$

Where:
- $ \text{BP} $ is the brevity penalty.
- $ w_n $ is the weight for n-grams (usually uniform weights like $ w_n = \frac{1}{N} $).
- $ p_n $ is the precision for n-grams.

#### Example

For a 1-gram BLEU score:
- Candidate sentence: "The cat is on the mat."
- Reference sentence: "There is a cat on the mat."

1. Extract 1-grams: "The", "cat", "is", "on", "the", "mat".
2. Calculate precision: 5 out of 6 1-grams in the candidate sentence are in the reference sentence.
3. Apply brevity penalty if necessary.
4. Compute BLEU score.

### ROUGE Score (Recall-Oriented Understudy for Gisting Evaluation)

**ROUGE** is a recall-oriented metric that measures the overlap of n-grams between the candidate summary and the reference summary. It's widely used for evaluating summarization models.

#### Key Concepts

1. **N-grams**: Sequences of N words. ROUGE-N (e.g., ROUGE-1, ROUGE-2) measures overlap of 1-grams, 2-grams, etc.
2. **Recall**: The fraction of n-grams in the reference summary that are also in the candidate summary.
3. **F-Measure**: The harmonic mean of precision and recall, providing a balanced measure.

#### Types of ROUGE

1. **ROUGE-N**: Measures the overlap of n-grams.
2. **ROUGE-L**: Measures the longest common subsequence (LCS).
3. **ROUGE-S**: Measures the overlap of skip-bigrams (pairs of words allowing gaps).

#### Calculation

1. **N-gram Overlap**: Compute the number of n-grams in the reference that appear in the candidate.
2. **Recall**: Calculate recall based on the n-gram overlap.
3. **Precision**: Calculate precision based on the n-gram overlap.
4. **F-Measure**: Compute the F-measure as the harmonic mean of precision and recall.

#### Formula

For ROUGE-N:

\$$\text{ROUGE-N} = \frac{\sum_{S \in \{\text{References}\}} \sum_{gram_n \in S} \min(\text{Count}_\text{match}(gram_n), \text{Count}(gram_n))}{\sum_{S \in \{\text{References}\}} \sum_{gram_n \in S} \text{Count}(gram_n)} $$

Where:
- $ \text{Count}_\text{match}(gram_n) $ is the count of n-grams that match in the candidate and reference.
- $ \text{Count}(gram_n) $ is the count of n-grams in the reference.

#### Example

For a ROUGE-1 score:
- Candidate summary: "The cat is on the mat."
- Reference summary: "There is a cat on the mat."

1. Extract 1-grams: "The", "cat", "is", "on", "the", "mat".
2. Calculate recall: 5 out of 7 1-grams in the reference summary are in the candidate summary.
3. Calculate precision: 5 out of 6 1-grams in the candidate summary are in the reference summary.
4. Compute F-measure as the harmonic mean of precision and recall.

### Summary

- **BLEU**: Precision-oriented, commonly used for machine translation, measures how much of the candidate is in the reference.
- **ROUGE**: Recall-oriented, commonly used for summarization, measures how much of the reference is in the candidate.

Both metrics provide insights into the quality of generated text compared to reference text, each with its own focus and method of calculation.

Both BLEU and ROUGE can be used for evaluating translation tasks, but they have different focuses and characteristics:

- **BLEU (Bilingual Evaluation Understudy)**: It is more commonly used for machine translation tasks. BLEU emphasizes precision, which means it measures how many of the n-grams in the candidate translation are also in the reference translation. BLEU is well-suited for translation because it considers the order of words, which is important in language translation.

- **ROUGE (Recall-Oriented Understudy for Gisting Evaluation)**: It is commonly used for summarization tasks but can also be used for translation. ROUGE emphasizes recall, meaning it measures how many of the n-grams in the reference translation are also in the candidate translation. ROUGE can be useful in translation tasks to some extent, especially in cases where capturing all the important content is crucial.

### When to Use BLEU

- If you are interested in how precise your translations are, i.e., how many of the n-grams in the candidate translation match the reference translation.
- BLEU is widely accepted and used in the machine translation community.

### When to Use ROUGE

- If you want to measure recall, i.e., how much of the reference translation is covered by the candidate translation.
- ROUGE might be less common for translation but can provide additional insights.

### Combined Approach

You can use both BLEU and ROUGE to get a more comprehensive evaluation. BLEU will give you an idea of precision, while ROUGE can provide insights into recall and the coverage of the reference translation.

### Implementing BLEU and ROUGE for Your Translation Task

Here’s how you can implement both BLEU and ROUGE to evaluate your French-to-Bassa translation model:

#### BLEU Evaluation

```python
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluate_bleu(model, data_loader, src_tokenizer, tgt_tokenizer, device, max_length=50):
    model.eval()
    references = []
    hypotheses = []

    with torch.no_grad():
        for src, tgt in data_loader:
            src, tgt = src.to(device), tgt.to(device)

            for i in range(src.size(0)):
                src_sentence = src[i].unsqueeze(0)
                tgt_sentence = tgt[i].unsqueeze(0)
                tgt_input = tgt_sentence[:, :-1]

                src_mask = model.make_src_mask(src_sentence)
                enc_src = model.encoder(src_sentence, src_mask)
                tgt_tokens = torch.tensor([[tgt_tokenizer.cls_token_id]], dtype=torch.long).to(device)

                for _ in range(max_length):
                    trg_mask = model.make_trg_mask(tgt_tokens).to(device)
                    output = model.decoder(tgt_tokens, enc_src, src_mask, trg_mask)
                    preds = output[:, -1, :].softmax(dim=-1)
                    next_token = preds.argmax(1).unsqueeze(0)
                    tgt_tokens = torch.cat((tgt_tokens, next_token), dim=1)
                    if next_token.item() == tgt_tokenizer.sep_token_id:
                        break

                predicted_sentence = tgt_tokenizer.decode(tgt_tokens.squeeze().tolist(), skip_special_tokens=True)
                reference_sentence = tgt_tokenizer.decode(tgt_input.squeeze().tolist(), skip_special_tokens=True)

                references.append([reference_sentence.split()])
                hypotheses.append(predicted_sentence.split())

    # Calculate BLEU score
    smoothing = SmoothingFunction().method4
    bleu_scores = [sentence_bleu(ref, hyp, smoothing_function=smoothing) for ref, hyp in zip(references, hypotheses)]
    average_bleu = sum(bleu_scores) / len(bleu_scores)
    return average_bleu

# Example usage
bleu_score = evaluate_bleu(loaded_model, val_loader, french_tokenizer, bassa_tokenizer, device)
print(f"BLEU Score: {bleu_score:.4f}")
```

#### ROUGE Evaluation

```python
from rouge_score import rouge_scorer

def evaluate_rouge(model, data_loader, src_tokenizer, tgt_tokenizer, device, max_length=50):
    model.eval()
    references = []
    hypotheses = []

    with torch.no_grad():
        for src, tgt in data_loader:
            src, tgt = src.to(device), tgt.to(device)

            for i in range(src.size(0)):
                src_sentence = src[i].unsqueeze(0)
                tgt_sentence = tgt[i].unsqueeze(0)
                tgt_input = tgt_sentence[:, :-1]

                src_mask = model.make_src_mask(src_sentence)
                enc_src = model.encoder(src_sentence, src_mask)
                tgt_tokens = torch.tensor([[tgt_tokenizer.cls_token_id]], dtype=torch.long).to(device)

                for _ in range(max_length):
                    trg_mask = model.make_trg_mask(tgt_tokens).to(device)
                    output = model.decoder(tgt_tokens, enc_src, src_mask, trg_mask)
                    preds = output[:, -1, :].softmax(dim=-1)
                    next_token = preds.argmax(1).unsqueeze(0)
                    tgt_tokens = torch.cat((tgt_tokens, next_token), dim=1)
                    if next_token.item() == tgt_tokenizer.sep_token_id:
                        break

                predicted_sentence = tgt_tokenizer.decode(tgt_tokens.squeeze().tolist(), skip_special_tokens=True)
                reference_sentence = tgt_tokenizer.decode(tgt_input.squeeze().tolist(), skip_special_tokens=True)

                references.append(reference_sentence)
                hypotheses.append(predicted_sentence)

    # Calculate ROUGE score
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(ref, hyp) for ref, hyp in zip(references, hypotheses)]
    average_rouge1 = sum(score['rouge1'].fmeasure for score in scores) / len(scores)
    average_rougeL = sum(score['rougeL'].fmeasure for score in scores) / len(scores)
    return average_rouge1, average_rougeL

# Example usage
rouge1, rougeL = evaluate_rouge(loaded_model, val_loader, french_tokenizer, bassa_tokenizer, device)
print(f"ROUGE-1 Score: {rouge1:.4f}")
print(f"ROUGE-L Score: {rougeL:.4f}")
```

### Summary

- **BLEU**: Commonly used for machine translation, focusing on precision and considering the order of words. Suitable for evaluating how close the candidate translation is to the reference translation.
- **ROUGE**: Typically used for summarization but can be used for translation as well, focusing on recall and coverage of the reference translation.

Using both BLEU and ROUGE scores can provide a more comprehensive evaluation of your translation model, giving insights into both precision and recall aspects.