# Model Evaluation: BLEU Score
This notebook evaluates your trained transformer model using the BLEU score, a standard metric for translation quality.

In [None]:
import torch
from config import get_config, get_weights_path
from train import get_model, get_dataset, greedy_decode
import sacrebleu

In [None]:
# Pick the compute device: the GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build the data pipeline; the tokenizers returned here size the model's vocabularies.
config = get_config()
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_dataset(config)

src_vocab_size = tokenizer_src.get_vocab_size()
tgt_vocab_size = tokenizer_tgt.get_vocab_size()
model = get_model(config, src_vocab_size, tgt_vocab_size)
model = model.to(device)

# Restore the trained weights from the checkpoint tagged with `num_epochs`.
# NOTE(review): confirm this tag matches how the training run named its final
# checkpoint file.
# NOTE(review): torch.load may unpickle arbitrary objects; only load
# checkpoints that come from a trusted source.
model_filename = get_weights_path(config, str(config['num_epochs']))
state = torch.load(model_filename, map_location=device)
model.load_state_dict(state['model_state_dict'])

# Switch to evaluation mode before decoding.
model.eval()

In [None]:
def compute_bleu(model, dataloader, tokenizer_src, tokenizer_tgt, config, device, num_batches: int = 100):
    """Compute the corpus-level BLEU score of ``model`` over ``dataloader``.

    Each evaluated batch is decoded with ``greedy_decode``, the predicted ids
    are turned back into text with ``tokenizer_tgt``, and the resulting
    hypotheses are scored against the batches' target texts with
    ``sacrebleu.corpus_bleu``. The score is printed as well as returned.

    Args:
        model: Model handed to ``greedy_decode``.
        dataloader: Iterable of batches. Every batch must provide the keys
            ``'encoder_input'``, ``'encoder_mask'`` and ``'tgt_text'``.
        tokenizer_src: Source-side tokenizer, forwarded to ``greedy_decode``.
        tokenizer_tgt: Target-side tokenizer, forwarded to ``greedy_decode``
            and used to decode the predicted token ids into text.
        config: Mapping from which ``config['seq_len']`` is read and passed
            to ``greedy_decode``.
        device: Device the encoder tensors are moved to before decoding.
        num_batches: Maximum number of batches to evaluate.

    Returns:
        The object returned by ``sacrebleu.corpus_bleu``; its ``score``
        attribute holds the BLEU value.

    Raises:
        ValueError: If no batch was evaluated (the dataloader is empty or
            ``num_batches`` is not positive), since BLEU is undefined for an
            empty corpus.
    """
    references = []
    hypotheses = []
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            if i >= num_batches:
                break
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)
            # Only the first target text of the batch is used as reference.
            # NOTE(review): presumably the dataloader yields batches of size 1
            # - confirm against the dataset setup.
            tgt_text = batch['tgt_text'][0]
            model_output = greedy_decode(
                model,
                encoder_input,
                encoder_mask,
                tokenizer_src,
                tokenizer_tgt,
                config['seq_len'],
                device,
            )
            pred = tokenizer_tgt.decode(model_output.cpu().numpy())
            references.append(tgt_text)
            hypotheses.append(pred)

    if not hypotheses:
        # Fail with a clear message instead of an opaque error from sacrebleu.
        raise ValueError(
            'compute_bleu evaluated no batches: the dataloader is empty or '
            'num_batches is not positive.'
        )

    # sacrebleu expects a list of reference streams, each aligned with the
    # hypotheses; there is exactly one reference per hypothesis here.
    bleu = sacrebleu.corpus_bleu(hypotheses, [references])
    print(f'BLEU score: {bleu.score:.2f}')
    return bleu

In [None]:
# Score the loaded model on the validation split, capped at 100 batches.
compute_bleu(
    model=model,
    dataloader=val_dataloader,
    tokenizer_src=tokenizer_src,
    tokenizer_tgt=tokenizer_tgt,
    config=config,
    device=device,
    num_batches=100,
)