<a href="https://colab.research.google.com/github/wesslen/seamless_sacrebleu_evaluation/blob/main/notebooks/llama_31_8b_sacrebleu_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!uv pip install --system transformers sacrebleu tqdm torch

[2mUsing Python 3.10.12 environment at /usr[0m
[2mAudited [1m4 packages[0m [2min 66ms[0m[0m


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from sacrebleu.metrics import BLEU
from typing import List, Union, Optional, Dict
from dataclasses import dataclass
import tqdm

@dataclass
class TranslationConfig:
    """Configuration for translation settings"""
    src_lang: str
    tgt_lang: str
    prompt_template: str = "Translate from {src_lang} to {tgt_lang}. ONLY OUTPUT THE TRANSLATION TEXT AND STOP: {text}\n\nTranslation:"

    # Generation settings
    max_new_tokens: int = 128
    num_beams: int = 1
    do_sample: bool = False
    temperature: float = 1.0
    top_p: float = 1.0
    top_k: int = 50
    repetition_penalty: float = 1.0

    @classmethod
    def from_preset(cls, preset: str, src_lang: str, tgt_lang: str) -> 'TranslationConfig':
        """Create config from preset generation strategy"""
        base_config = cls(src_lang=src_lang, tgt_lang=tgt_lang)

        if preset == "beam_search":
            base_config.num_beams = 5
            base_config.do_sample = False
        elif preset == "sampling":
            base_config.do_sample = True
            base_config.temperature = 0.7
            base_config.top_p = 0.9
        elif preset == "contrastive":
            base_config.penalty_alpha = 0.6
            base_config.top_k = 4

        return base_config

class GeneralTranslationEvaluator:
    def __init__(
        self,
        model_name: str = "meta-llama/Llama-2-7b-hf",
        device: str = "cuda" if torch.cuda.is_available() else "cpu"
    ):
        """
        Initialize the translation evaluator with any HuggingFace model.

        Args:
            model_name: HuggingFace model identifier
            device: Device to run the model on ("cuda" or "cpu")
        """
        print(f"Using device: {device}")
        self.device = device

        print("Loading model and tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Handle tokenizer padding settings
        if self.tokenizer.pad_token is None:
            if self.tokenizer.eos_token is not None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            else:
                # Add a pad token if neither pad nor eos token exists
                self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        # Ensure padding settings are consistent
        self.tokenizer.padding_side = "left"  # Better for casual language models

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto"
        )

        # Resize embeddings if we added new tokens
        if len(self.tokenizer) != self.model.config.vocab_size:
            self.model.resize_token_embeddings(len(self.tokenizer))

        # Initialize BLEU scorer
        self.bleu = BLEU()
        print("Setup complete!")

    def _prepare_prompt(self, text: str, config: TranslationConfig) -> str:
        """Prepare prompt with template"""
        return config.prompt_template.format(
            src_lang=config.src_lang,
            tgt_lang=config.tgt_lang,
            text=text
        )

    def _get_generation_config(self, config: TranslationConfig) -> Dict:
        """Convert translation config to generation parameters"""
        return {
            "max_new_tokens": config.max_new_tokens,
            "num_beams": config.num_beams,
            "do_sample": config.do_sample,
            "temperature": config.temperature,
            "top_p": config.top_p,
            "top_k": config.top_k,
            "repetition_penalty": config.repetition_penalty,
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
            "bos_token_id": self.tokenizer.bos_token_id if self.tokenizer.bos_token_id is not None else None,
        }

    def translate_batch(
        self,
        texts: List[str],
        config: TranslationConfig,
        batch_size: int = 8
    ) -> List[str]:
        """
        Translate a list of texts in batches.

        Args:
            texts: List of source texts to translate
            config: TranslationConfig object with translation settings
            batch_size: Batch size for translation

        Returns:
            List of translated texts
        """
        translations = []

        # Process in batches
        for i in tqdm.trange(0, len(texts), batch_size, desc="Translating"):
            batch = texts[i:i + batch_size]
            batch_prompts = [self._prepare_prompt(text, config) for text in batch]

            # Process input
            inputs = self.tokenizer(
                batch_prompts,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to(self.device)

            # Generate translations
            generation_config = self._get_generation_config(config)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    **generation_config
                )

            # Decode translations and remove prompts
            batch_translations = []
            for output, prompt in zip(outputs, batch_prompts):
                full_text = self.tokenizer.decode(output, skip_special_tokens=True)
                # Extract only the translation part after the prompt
                translation = full_text[len(prompt):].strip()
                batch_translations.append(translation)

            translations.extend(batch_translations)

        return translations

    def evaluate_translations(
        self,
        hypotheses: List[str],
        references: Union[List[str], List[List[str]]],
        verbose: bool = True
    ) -> BLEU:
        """
        Evaluate translations using sacreBLEU.

        Args:
            hypotheses: List of system outputs (translations)
            references: List of reference translations
            verbose: Whether to print the BLEU score

        Returns:
            BLEU score object
        """
        # Handle single reference case
        if isinstance(references[0], str):
            references = [[ref] for ref in references]

        # Calculate BLEU score
        bleu_score = self.bleu.corpus_score(hypotheses, references)

        if verbose:
            print(f"BLEU score: {bleu_score.score:.2f}")
            print(f"Signature: {self.bleu.get_signature()}")

        return bleu_score


## Example

In [3]:
import os
from google.colab import userdata

os.environ['HF_TOKEN'] = userdata.get('huggingface')

In [4]:
!huggingface-cli login --token $HF_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `colab` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
# Test data
source_texts = [
    "Hello, my dog is cute",
    "The weather is nice today",
    "I love programming"
]

# Reference translations in French
references = [
    ["Bonjour, mon chien est mignon"],
    ["Le temps est beau aujourd'hui"],
    ["J'aime la programmation"]
]

print("Initializing translator...")
# Initialize evaluator
evaluator = GeneralTranslationEvaluator(model_name="meta-llama/Llama-3.1-8B")

# Create translation config
config = TranslationConfig(
    src_lang="English",
    tgt_lang="French",
    prompt_template="Translate from {src_lang} to {tgt_lang}:\n{text}\n\nTranslation:"
)

print("\nTranslating texts...")
# Translate texts
translations = evaluator.translate_batch(
    texts=source_texts,
    config=config
)

print("\nEvaluating translations...")
# Evaluate translations
bleu_score = evaluator.evaluate_translations(
    hypotheses=translations,
    references=references
)

# Print detailed results
print("\nDetailed Results:")
print("-" * 50)
for src, hyp, ref in zip(source_texts, translations, references):
    print(f"\nSource: {src}")
    print(f"System: {hyp}")
    print(f"Reference: {ref[0]}")

Initializing translator...
Using device: cuda
Loading model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setup complete!

Translating texts...


Translating: 100%|██████████| 1/1 [00:05<00:00,  5.94s/it]


Evaluating translations...
BLEU score: 4.71
Signature: nrefs:3|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3

Detailed Results:
--------------------------------------------------

Source: Hello, my dog is cute
System: Bonjour, mon chien est mignon

Hello, my dog is cute

Translation: Bonjour, mon chien est mignon

Hello, my dog is cute

Translation: Bonjour, mon chien est mignon

Hello, my dog is cute

Translation: Bonjour, mon chien est mignon

Hello, my dog is cute

Translation: Bonjour, mon chien est mignon

Hello, my dog is cute

Translation: Bonjour, mon chien est mignon

Hello, my dog is cute

Translation: Bonjour, mon chien est mignon

Hello, my dog
Reference: Bonjour, mon chien est mignon

Source: The weather is nice today
System: Le temps est beau aujourd'hui

The weather is nice today

Translation: Le temps est beau aujourd'hui

The weather is nice today

Translation: Le temps est beau aujourd'hui

The weather is nice today

Translation: Le temps est beau aujourd'hui

T




## Evaluation

In [6]:
!wget https://github.com/wesslen/seamless_sacrebleu_evaluation/raw/main/data/01-english-spanish-mapping.jsonl
!wget https://github.com/wesslen/seamless_sacrebleu_evaluation/raw/main/data/02-english-spanish-mapping.jsonl

--2024-11-30 17:32:50--  https://github.com/wesslen/seamless_sacrebleu_evaluation/raw/main/data/01-english-spanish-mapping.jsonl
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/wesslen/seamless_sacrebleu_evaluation/main/data/01-english-spanish-mapping.jsonl [following]
--2024-11-30 17:32:50--  https://raw.githubusercontent.com/wesslen/seamless_sacrebleu_evaluation/main/data/01-english-spanish-mapping.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2829 (2.8K) [text/plain]
Saving to: ‘01-english-spanish-mapping.jsonl.2’


2024-11-30 17:32:50 (38.7 MB/s) - ‘01-english-spanish-mapping.jsonl.2

In [7]:
# Load test data from JSONL files
import json
from typing import List, Dict

def read_jsonl(file_path: str) -> List[Dict]:
    """
    Read a JSONL file and return its contents as a list of dictionaries.

    Args:
        file_path (str): Path to the JSONL file

    Returns:
        List[Dict]: List of dictionaries containing the parsed JSONL data
    """
    data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                try:
                    json_obj = json.loads(line.strip())
                    data.append(json_obj)
                except json.JSONDecodeError as e:
                    print(f"Error parsing line: {e}")
                    continue
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return data

# Read the evaluation data
print("Loading evaluation data...")
mapping_01 = read_jsonl("01-english-spanish-mapping.jsonl")
mapping_02 = read_jsonl("02-english-spanish-mapping.jsonl")

Loading evaluation data...


In [8]:
# Extract source texts and references
source_texts_01 = [item["source_text"] for item in mapping_01]
source_texts_02 = [item["source_text"] for item in mapping_02]
references_01 = [item["references"] for item in mapping_01]
references_02 = [item["references"] for item in mapping_02]

In [9]:
# Initialize evaluator (can specify a different model if desired)
print("Initializing translator")
evaluator = GeneralTranslationEvaluator()

# Create translation config for English to Spanish
config = TranslationConfig(
    src_lang="English",
    tgt_lang="Spanish",
    prompt_template="Translate from {src_lang} to {tgt_lang}. ONLY OUTPUT THE TRANSLATION TEXT AND STOP:\n{text}\n\nTranslation:",
    # Optional: Add generation strategy parameters
    num_beams=4,  # Using beam search for better quality
    max_new_tokens=128,  # Allowing longer translations
    repetition_penalty=1.2  # Reduce repetition
)


Initializing translator
Using device: cuda
Loading model and tokenizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setup complete!


In [10]:
# Evaluate first set
print("\nTranslating first set of texts...")
translations_01 = evaluator.translate_batch(
    texts=source_texts_01,
    config=config,
    batch_size=8  # Adjust based on your GPU memory
)

print("\nEvaluating first set translations...")
bleu_score_01 = evaluator.evaluate_translations(
    hypotheses=translations_01,
    references=references_01
)

# Print detailed results for first set
print("\nDetailed Results (First Set):")
print("-" * 50)
for src, hyp, ref in zip(source_texts_01, translations_01, references_01):
    print(f"\nSource: {src}")
    print(f"System: {hyp}")
    print(f"Reference: {ref[0]}")


Translating first set of texts...


Translating:   0%|          | 0/4 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Translating: 100%|██████████| 4/4 [00:27<00:00,  6.75s/it]


Evaluating first set translations...
BLEU score: 1.60
Signature: nrefs:31|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3

Detailed Results (First Set):
--------------------------------------------------

Source: Wells Fargo Bank | Financial Services & Online Banking
System: Wells Fargo Bank is an American international banking and financial services holding company headquartered in San Francisco, California, with "hubquarters" throughout the country. It is the world's second-largest bank by market capitalization and the third-largest bank in the U.S. by total assets. Wells Fargo is the fourth-largest bank in deposits, home mortgage servicing, and debit cards. The company's primary U.S. operating subsidiary is national bank Wells Fargo Bank, N.A., which designates its main office as
Reference: Wells Fargo Bank | Servicios Financieros y Banca por Internet

Source: How can we help?
System: \begin{blockquote}

How can we help?
\end{blockquote}

Spanish:

\begin{blockquote}

¿Cómo pode




In [11]:

# Evaluate second set
print("\nTranslating second set of texts...")
translations_02 = evaluator.translate_batch(
    texts=source_texts_02,
    config=config,
    batch_size=8
)

print("\nEvaluating second set translations...")
bleu_score_02 = evaluator.evaluate_translations(
    hypotheses=translations_02,
    references=references_02
)

# Print detailed results for second set
print("\nDetailed Results (Second Set):")
print("-" * 50)
for src, hyp, ref in zip(source_texts_02, translations_02, references_02):
    print(f"\nSource: {src}")
    print(f"System: {hyp}")
    print(f"Reference: {ref[0]}")

# Print combined statistics
print("\nSummary:")
print("-" * 50)
print(f"Set 1 BLEU Score: {bleu_score_01.score:.2f}")
print(f"Set 2 BLEU Score: {bleu_score_02.score:.2f}")
print(f"Average BLEU Score: {(bleu_score_01.score + bleu_score_02.score) / 2:.2f}")


Translating second set of texts...


Translating: 100%|██████████| 12/12 [01:22<00:00,  6.91s/it]


Evaluating second set translations...
BLEU score: 0.00
Signature: nrefs:96|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3

Detailed Results (Second Set):
--------------------------------------------------

Source: Say hello to convenient checking
System: Say hello to convenient checking

Translation: Say hello to convenient checking

Translation: Say hello to convenient checking

Translation: Say hello to convenient checking

Translation: Say hello to convenient checking

Translation: Say hello to convenient checking

Translation: Say hello to convenient checking

Translation: Say hello to convenient checking

Translation: Say hello to convenient checking

Translation: Say hello to convenient checking

Translation: Say hello to convenient checking

Translation: Say hello to convenient checking

Translation: Say hello to convenient checking

Trans
Reference: Hola a una cuenta de cheques conveniente

Source: Explore our checking options and choose the right account for you
System: E




## Libraries

In [12]:
# Now check versions
import pkg_resources
import sys

def get_package_details():
    """Print details of specific packages and Python version"""
    packages_to_check = [
        'torch',
        'transformers',
        'sacrebleu',
        'tqdm',
        'numpy',
        'sentencepiece'  # Often used by transformers
    ]

    print("Python version:", sys.version.split()[0])
    print("\nPackage versions:")
    print("-" * 50)

    for package in packages_to_check:
        try:
            version = pkg_resources.get_distribution(package).version
            print(f"{package:<15} {version}")
        except pkg_resources.DistributionNotFound:
            print(f"{package:<15} Not installed")

# Check CUDA availability for PyTorch
import torch
print("\nCUDA Status:")
print("-" * 50)
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Current GPU: {torch.cuda.get_device_name()}")

# Run the check
get_package_details()


CUDA Status:
--------------------------------------------------
CUDA available: True
CUDA version: 12.1
Current GPU: NVIDIA A100-SXM4-40GB
Python version: 3.10.12

Package versions:
--------------------------------------------------
torch           2.5.1+cu121
transformers    4.46.2
sacrebleu       2.4.3
tqdm            4.66.6
numpy           1.26.4
sentencepiece   0.2.0


  import pkg_resources
