<a href="https://colab.research.google.com/github/wesslen/seamless_sacrebleu_evaluation/blob/main/notebooks/llama_31_8b_instruct_sacrebleu_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!uv pip install --system transformers sacrebleu tqdm torch

[2mUsing Python 3.10.12 environment at /usr[0m
[2K[2mResolved [1m28 packages[0m [2min 321ms[0m[0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/3)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/3)
[2mportalocker[0m [32m[2m------------------------------[0m[0m     0 B/19.12 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/3)
[2mportalocker[0m [32m------------------------[2m------[0m[0m 14.91 KiB/19.12 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/3)
[2mportalocker[0m [32m------------------------[2m------[0m[0m 14.91 KiB/19.12 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/3)
[2mportalocker[0m [32m------------------------------[2m[0m[0m 19.12 KiB/19.12 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/3)
[2mportalocker[0m [32m------------------------------[2m[0m[0m 19.12 KiB/19.12 KiB
[2mcolorama  [0m [32m--------------------[2m----------[0m[0m 16.00 KiB/24.74 KiB
[2K[3A[37m⠙[0m [2mPreparing packages.

In [2]:
import os
from google.colab import userdata

os.environ['HF_TOKEN'] = userdata.get('huggingface')

In [3]:
!huggingface-cli login --token $HF_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `colab` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [13]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from sacrebleu.metrics import BLEU
from typing import List, Union, Optional, Dict, Tuple
from dataclasses import dataclass
import tqdm
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class Llama31Config:
    """Configuration for Llama 3.1 specific settings"""
    # Supported languages by Llama 3.1
    SUPPORTED_LANGUAGES = {
        "English": "eng", "Spanish": "spa", "French": "fra",
        "German": "deu", "Italian": "ita", "Portuguese": "por",
        "Hindi": "hin", "Thai": "tha"
    }

    system_prompt: str = """You are a professional translator with expertise in multiple languages.
    You will now translate text between languages. Only provide the translation with no explanations."""

    # Updated template using correct Llama 3.1 tokens
    chat_template: str = (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n"
    )

    def format_prompt(self, text: str, src_lang: str, tgt_lang: str) -> str:
        """Format the translation prompt according to Llama 3.1 template"""
        user_prompt = f"Translate this {src_lang} text to {tgt_lang}:\n{text}"
        return self.chat_template.format(
            system_prompt=self.system_prompt,
            user_prompt=user_prompt
        )

class Llama31TranslationEvaluator:
    def __init__(
        self,
        model_name: str = "meta-llama/Llama-3.1-8B-Instruct",
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
        debug: bool = False
    ):
        """Initialize the translation evaluator with Llama 3.1."""
        self.debug = debug
        logger.info(f"Using device: {device}")
        self.device = device
        self.llama_config = Llama31Config()

        logger.info("Loading model and tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Handle tokenizer settings
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto"
        )

        self.bleu = BLEU()
        logger.info("Setup complete!")

    def _extract_translation(self, full_text: str, prompt: str) -> Tuple[str, dict]:
        """Extract translation from model output with detailed debug info."""
        debug_info = {
            "full_text_len": len(full_text),
            "prompt_len": len(prompt),
            "has_assistant_header": "<|start_header_id|>assistant<|end_header_id|>" in full_text,
            "has_eot": "<|eot_id|>" in full_text
        }

        # Try to find the assistant's response section
        assistant_marker = "<|start_header_id|>assistant<|end_header_id|>"
        response_start = full_text.find(assistant_marker)

        if response_start != -1:
            # Extract text after the assistant marker
            response = full_text[response_start + len(assistant_marker):]

            # Remove any trailing special tokens
            response = response.replace("<|eot_id|>", "").replace("<|eom_id|>", "")
            response = response.replace("<|end_of_text|>", "").strip()

            debug_info["extraction_method"] = "assistant_marker"
        else:
            # Fallback: try to extract everything after the prompt
            response = full_text[len(prompt):].strip()
            debug_info["extraction_method"] = "prompt_removal"

        # Clean any remaining special tokens
        response = response.replace("<s>", "").replace("</s>", "").strip()
        debug_info["final_response_len"] = len(response)

        return response, debug_info

    def translate_batch(
        self,
        texts: List[str],
        config: TranslationConfig,
        batch_size: int = 1  # Reduced batch size for better reliability
    ) -> List[str]:
        """Translate a list of texts in batches."""
        if config.src_lang not in self.llama_config.SUPPORTED_LANGUAGES.keys() or \
           config.tgt_lang not in self.llama_config.SUPPORTED_LANGUAGES.keys():
            raise ValueError(f"Language must be one of {list(self.llama_config.SUPPORTED_LANGUAGES.keys())}")

        translations = []

        # Process in batches
        for i in tqdm.trange(0, len(texts), batch_size, desc="Translating"):
            batch = texts[i:i + batch_size]
            batch_prompts = [
                self.llama_config.format_prompt(
                    text=text,
                    src_lang=config.src_lang,
                    tgt_lang=config.tgt_lang
                ) for text in batch
            ]

            if self.debug:
                logger.info(f"Sample prompt:\n{batch_prompts[0]}")

            # Process input
            inputs = self.tokenizer(
                batch_prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=2048
            ).to(self.device)

            # Generate translations
            generation_config = {
                "max_new_tokens": config.max_new_tokens,
                "num_beams": config.num_beams,
                "temperature": config.temperature,
                "top_p": config.top_p,
                "top_k": config.top_k,
                "repetition_penalty": config.repetition_penalty,
                "pad_token_id": self.tokenizer.pad_token_id,
                "eos_token_id": self.tokenizer.eos_token_id,
                "do_sample": config.temperature > 0,
            }

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    **generation_config
                )

            # Decode translations
            batch_translations = []
            for output, prompt in zip(outputs, batch_prompts):
                full_text = self.tokenizer.decode(output, skip_special_tokens=False)
                translation, debug_info = self._extract_translation(full_text, prompt)

                if self.debug:
                    logger.info(f"Translation debug info: {debug_info}")
                    logger.info(f"Raw output: {full_text}")
                    logger.info(f"Extracted translation: {translation}")

                if not translation.strip():
                    logger.warning("Empty translation detected!")
                    logger.info(f"Full text: {full_text}")

                batch_translations.append(translation)

            translations.extend(batch_translations)

        return translations

    def evaluate_translations(
        self,
        hypotheses: List[str],
        references: Union[List[str], List[List[str]]],
        verbose: bool = True
    ) -> BLEU:
        """Evaluate translations using sacreBLEU."""
        if isinstance(references[0], str):
            references = [[ref] for ref in references]

        bleu_score = self.bleu.corpus_score(hypotheses, references)

        if verbose:
            print(f"BLEU score: {bleu_score.score:.2f}")
            print(f"Signature: {self.bleu.get_signature()}")

        return bleu_score



In [14]:
# Test data
source_texts = [
    "Hello, my dog is cute",
    "The weather is nice today",
    "I love programming"
]

# Reference translations in French
references = [
    ["Bonjour, mon chien est mignon"],
    ["Le temps est beau aujourd'hui"],
    ["J'aime la programmation"]
]

print("Initializing translator...")
# Initialize evaluator
evaluator = Llama31TranslationEvaluator(model_name="meta-llama/Llama-3.1-8B-Instruct")

config = TranslationConfig.from_preset(
    preset="sampling",
    src_lang="English",
    tgt_lang="French"
)

print("\nTranslating texts...")
# Translate
translations = evaluator.translate_batch(
    texts=source_texts,
    config=config,
    batch_size=1
)

# Evaluate
print("\nEvaluating translations...")
bleu_score = evaluator.evaluate_translations(
    hypotheses=translations,
    references=references
)

# Print results
print(f"\nDetailed Results for examples:")
print("-" * 50)
for src, hyp, ref in zip(source_texts, translations, references):
    print(f"\nSource: {src}")
    print(f"System: {hyp}")
    print(f"Reference: {ref[0]}")


Initializing translator...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


Translating texts...


Translating: 100%|██████████| 3/3 [00:01<00:00,  2.62it/s]


Evaluating translations...
BLEU score: 80.91
Signature: nrefs:3|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3

Detailed Results for examples:
--------------------------------------------------

Source: Hello, my dog is cute
System: Bonjour, mon chien est mignon.
Reference: Bonjour, mon chien est mignon

Source: The weather is nice today
System: Le temps est agréable aujourd'hui.
Reference: Le temps est beau aujourd'hui

Source: I love programming
System: J'adore le programmation
Reference: J'aime la programmation





In [15]:
import json

def run_evaluation(
    jsonl_files: List[str],
    src_lang: str = "English",
    tgt_lang: str = "Spanish",
    model_name: str = "meta-llama/Llama-3.1-8B-Instruct",
    batch_size: int = 4,
    generation_preset: str = "quality"
):
    """Run evaluation pipeline."""
    print(f"\nInitializing Llama 3.1 translator...")
    evaluator = Llama31TranslationEvaluator(model_name=model_name)

    config = TranslationConfig.from_preset(
        preset=generation_preset,
        src_lang=src_lang,
        tgt_lang=tgt_lang
    )

    all_bleu_scores = []

    for file_path in jsonl_files:
        print(f"\nProcessing file: {file_path}")

        # Load data
        mapping = read_jsonl(file_path)
        source_texts = [item["source_text"] for item in mapping]
        references = [item["references"] for item in mapping]

        # Translate
        translations = evaluator.translate_batch(
            texts=source_texts,
            config=config,
            batch_size=batch_size
        )

        # Evaluate
        bleu_score = evaluator.evaluate_translations(
            hypotheses=translations,
            references=references
        )

        all_bleu_scores.append(bleu_score.score)

        # Print results
        print(f"\nDetailed Results for {file_path}:")
        print("-" * 50)
        for src, hyp, ref in zip(source_texts, translations, references):
            print(f"\nSource: {src}")
            print(f"System: {hyp}")
            print(f"Reference: {ref[0]}")

    print("\nSummary:")
    print("-" * 50)
    for file_path, score in zip(jsonl_files, all_bleu_scores):
        print(f"{file_path}: BLEU = {score:.2f}")
    print(f"Average BLEU Score: {sum(all_bleu_scores) / len(all_bleu_scores):.2f}")

# Utility function for reading JSONL files
def read_jsonl(file_path: str) -> List[Dict]:
    """Read JSONL file and return parsed data."""
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                data.append(json.loads(line.strip()))
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")
    return data

In [16]:
!wget https://github.com/wesslen/seamless_sacrebleu_evaluation/raw/main/data/01-english-spanish-mapping.jsonl
!wget https://github.com/wesslen/seamless_sacrebleu_evaluation/raw/main/data/02-english-spanish-mapping.jsonl

--2024-12-01 15:48:27--  https://github.com/wesslen/seamless_sacrebleu_evaluation/raw/main/data/01-english-spanish-mapping.jsonl
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/wesslen/seamless_sacrebleu_evaluation/main/data/01-english-spanish-mapping.jsonl [following]
--2024-12-01 15:48:27--  https://raw.githubusercontent.com/wesslen/seamless_sacrebleu_evaluation/main/data/01-english-spanish-mapping.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2829 (2.8K) [text/plain]
Saving to: ‘01-english-spanish-mapping.jsonl’


2024-12-01 15:48:27 (53.8 MB/s) - ‘01-english-spanish-mapping.jsonl’ sa

In [17]:
jsonl_files = [
    "01-english-spanish-mapping.jsonl",
    "02-english-spanish-mapping.jsonl"
]

run_evaluation(
    jsonl_files=jsonl_files,
    src_lang="English",
    tgt_lang="Spanish",
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    generation_preset="quality"
)


Initializing Llama 3.1 translator...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


Processing file: 01-english-spanish-mapping.jsonl


Translating: 100%|██████████| 8/8 [00:06<00:00,  1.31it/s]


BLEU score: 46.71
Signature: nrefs:31|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3

Detailed Results for 01-english-spanish-mapping.jsonl:
--------------------------------------------------

Source: Wells Fargo Bank | Financial Services & Online Banking
System: Banco Wells Fargo | Servicios Financieros y Banca en Línea
Reference: Wells Fargo Bank | Servicios Financieros y Banca por Internet

Source: How can we help?
System: ¿En qué podemos ayudar?
Reference: ¿Cómo podemos ayudarle?

Source: Popular FAQs
System: Preguntas frecuentes
Reference: Preguntas frecuentes populares

Source: How do I find my routing and account numbers?
System: ¿Cómo puedo encontrar mis números de ruta y cuenta?
Reference: ¿Cómo puedo encontrar mis números de ruta y de cuenta?

Source: Is there a fee for Zelle®?
System: ¿Hay una tarifa para Zelle?
Reference: ¿Se aplica algún cargo por usar Zelle®?

Source: How do I report suspected fraud?
System: ¿Cómo denunciar un fraude sospechoso?
Reference: ¿Qué debo h

Translating: 100%|██████████| 24/24 [00:29<00:00,  1.23s/it]

BLEU score: 14.06
Signature: nrefs:96|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3

Detailed Results for 02-english-spanish-mapping.jsonl:
--------------------------------------------------

Source: Say hello to convenient checking
System: Saludos a la verificación conveniente
Reference: Hola a una cuenta de cheques conveniente

Source: Explore our checking options and choose the right account for you
System: Explora nuestras opciones de cheques y elija la cuenta adecuada para ti
Reference: Explore nuestras opciones de cuentas de cheques y elija la adecuada para usted

Source: Get started
System: Comencemos
Reference: Para comenzar

Source: Find mortgage happiness
System: Encuentra la felicidad hipotecaria
Reference: Sienta alegría hipotecaria

Source: With a down payment as low as 3%
System: Con un pago inicial tan bajo como el 3%
Reference: Con un pago inicial bajo, desde tan solo el 3%

Source: Learn more
System: Aprende más
Reference: Más información

Source: Unlock convenien




## Libraries

In [18]:
# Now check versions
import pkg_resources
import sys

def get_package_details():
    """Print details of specific packages and Python version"""
    packages_to_check = [
        'torch',
        'transformers',
        'sacrebleu',
        'tqdm',
        'numpy',
        'sentencepiece'  # Often used by transformers
    ]

    print("Python version:", sys.version.split()[0])
    print("\nPackage versions:")
    print("-" * 50)

    for package in packages_to_check:
        try:
            version = pkg_resources.get_distribution(package).version
            print(f"{package:<15} {version}")
        except pkg_resources.DistributionNotFound:
            print(f"{package:<15} Not installed")

# Check CUDA availability for PyTorch
import torch
print("\nCUDA Status:")
print("-" * 50)
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Current GPU: {torch.cuda.get_device_name()}")

# Run the check
get_package_details()


CUDA Status:
--------------------------------------------------
CUDA available: True
CUDA version: 12.1
Current GPU: NVIDIA A100-SXM4-40GB
Python version: 3.10.12

Package versions:
--------------------------------------------------
torch           2.5.1+cu121
transformers    4.46.2
sacrebleu       2.4.3
tqdm            4.66.6
numpy           1.26.4
sentencepiece   0.2.0


  import pkg_resources
