<a href="https://colab.research.google.com/github/wesslen/seamless_sacrebleu_evaluation/blob/main/notebooks/hf_model_sacrebleu_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!uv pip install --system transformers sacrebleu tqdm torch

[2mUsing Python 3.10.12 environment at /usr[0m
[2mAudited [1m4 packages[0m [2min 63ms[0m[0m


In [2]:
from dataclasses import dataclass, field
from typing import Optional, Dict, Any, List, Union
from enum import Enum
import json
import logging
from tenacity import retry, stop_after_attempt, wait_exponential
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class InferenceMode(Enum):
    """Enum for inference modes"""
    LOCAL = "local"
    API = "api"

@dataclass
class APIConfig:
    """Configuration for API inference"""
    base_url: str
    api_key: str
    model: str
    timeout: int = 30
    max_retries: int = 3

    def validate(self) -> None:
        """Validate API configuration"""
        if not self.base_url:
            raise ValueError("base_url is required for API inference")
        if not self.api_key:
            raise ValueError("api_key is required for API inference")
        if not self.model:
            raise ValueError("model is required for API inference")

@dataclass
class Llama31Config:
    """Configuration for Llama 3.1 specific settings"""
    SUPPORTED_LANGUAGES = {
        "English": "eng", "Spanish": "spa", "French": "fra",
        "German": "deu", "Italian": "ita", "Portuguese": "por",
        "Hindi": "hin", "Thai": "tha"
    }

    system_prompt: str = """You are a professional translator with expertise in multiple languages.
    Your task is to translate the provided text accurately while preserving meaning, tone, and context.
    Only provide the direct translation without any explanations or notes."""

    # Modified template to be more explicit
    chat_template: str = (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n"
        "{system_prompt}"
        "<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n"
        "{user_prompt}"
        "<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n"
    )

    def format_prompt(self, text: str, src_lang: str, tgt_lang: str, for_api: bool = False) -> Union[str, List[Dict[str, str]]]:
        """Format the translation prompt"""
        user_prompt = f"Translate the following text from {src_lang} to {tgt_lang}. Provide only the translation:\n\n{text}"

        if for_api:
            return [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        else:
            return self.chat_template.format(
                system_prompt=self.system_prompt,
                user_prompt=user_prompt
            )

@dataclass
class TranslationConfig:
    """Configuration for translation settings"""
    src_lang: str
    tgt_lang: str
    inference_mode: InferenceMode = InferenceMode.LOCAL
    api_config: Optional[APIConfig] = None

    # Generation settings that work for both local and API
    max_new_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.95
    top_k: int = 50
    repetition_penalty: float = 1.2
    num_beams: int = 4

    def __post_init__(self):
        """Validate configuration after initialization"""
        if self.inference_mode == InferenceMode.API and not self.api_config:
            raise ValueError("api_config is required when inference_mode is API")

        if self.api_config:
            self.api_config.validate()

    def get_api_parameters(self) -> Dict[str, Any]:
        """Get parameters formatted for API call"""
        return {
            "max_tokens": self.max_new_tokens,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "presence_penalty": self.repetition_penalty - 1.0,  # Convert to OpenAI format
        }

    def get_local_parameters(self) -> Dict[str, Any]:
        """Get parameters formatted for local inference"""
        return {
            "max_new_tokens": self.max_new_tokens,
            "num_beams": self.num_beams,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "top_k": self.top_k,
            "repetition_penalty": self.repetition_penalty,
        }

    @classmethod
    def from_preset(
        cls,
        preset: str,
        src_lang: str,
        tgt_lang: str,
        inference_mode: InferenceMode = InferenceMode.LOCAL,
        api_config: Optional[APIConfig] = None
    ) -> 'TranslationConfig':
        """Create config from preset generation strategy"""
        base_config = cls(
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            inference_mode=inference_mode,
            api_config=api_config
        )

        if preset == "beam_search":
            base_config.num_beams = 5
            base_config.temperature = 1.0
        elif preset == "sampling":
            base_config.num_beams = 1
            base_config.temperature = 0.7
            base_config.top_p = 0.9
        elif preset == "quality":
            base_config.num_beams = 4
            base_config.temperature = 0.7
            base_config.top_p = 0.95
            base_config.repetition_penalty = 1.2

        return base_config

In [3]:
class APIClient:
    """Client for making API calls with retry logic"""

    def __init__(self, api_config: APIConfig):
        """Initialize API client with retry logic"""
        self.api_config = api_config
        self.session = self._create_session()

    def _create_session(self) -> requests.Session:
        """Create session with retry logic"""
        session = requests.Session()

        # Configure retry strategy
        retry_strategy = Retry(
            total=self.api_config.max_retries,
            backoff_factor=0.5,
            status_forcelist=[429, 500, 502, 503, 504],
        )

        # Add retry adapter to session
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        return session

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10),
        reraise=True
    )
    def generate_translation(
        self,
        messages: List[Dict[str, str]],
        parameters: Dict[str, Any]
    ) -> str:
        """Make API call with retry logic"""
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_config.api_key}"
        }

        data = {
            "model": self.api_config.model,
            "messages": messages,
            **parameters
        }

        try:
            response = self.session.post(
                f"{self.api_config.base_url}chat/completions",
                headers=headers,
                json=data,
                timeout=self.api_config.timeout
            )
            response.raise_for_status()

            result = response.json()
            return result["choices"][0]["message"]["content"]

        except requests.exceptions.RequestException as e:
            logger.error(f"API call failed: {str(e)}")
            raise RuntimeError(f"Failed to get translation from API: {str(e)}")

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sacrebleu import BLEU
import tqdm

class Llama31TranslationEvaluator:
    def __init__(
        self,
        config: TranslationConfig,
        model_name: Optional[str] = "meta-llama/Llama-3.1-8B-Instruct",
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
        debug: bool = False
    ):
        """Initialize the translation evaluator"""
        self.debug = debug
        self.config = config
        self.llama_config = Llama31Config()

        if config.inference_mode == InferenceMode.API:
            logger.info("Initializing API client...")
            self.api_client = APIClient(config.api_config)
            self.inference_fn = self._translate_api
        else:
            logger.info(f"Loading local model on device: {device}")
            self.device = device
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)

            # Handle tokenizer settings
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.padding_side = "left"

            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16 if device == "cuda" else torch.float32,
                device_map="auto"
            )
            self.inference_fn = self._translate_local

        self.bleu = BLEU()
        logger.info("Setup complete!")

    def _translate_api(self, text: str, batch_idx: int = 0) -> str:
        """Translate using API endpoint"""
        try:
            messages = self.llama_config.format_prompt(
                text=text,
                src_lang=self.config.src_lang,
                tgt_lang=self.config.tgt_lang,
                for_api=True  # Get API format
            )

            parameters = self.config.get_api_parameters()
            translation = self.api_client.generate_translation(messages, parameters)

            if self.debug:
                logger.info(f"API translation for batch {batch_idx}: {translation}")

            return translation.strip()

        except Exception as e:
            logger.error(f"API translation failed for batch {batch_idx}: {str(e)}")
            raise

    def _translate_local(self, text: str, batch_idx: int = 0) -> str:
        """Translate using local model with detailed debugging"""
        try:
            # Generate the prompt
            prompt = self.llama_config.format_prompt(
                text=text,
                src_lang=self.config.src_lang,
                tgt_lang=self.config.tgt_lang,
                for_api=False
            )

            if self.debug:
                logger.info(f"\nProcessing batch {batch_idx}")
                logger.info(f"Input text: {text}")
                logger.info(f"Generated prompt:\n{prompt}")

            # Tokenize
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=2048
            ).to(self.device)

            if self.debug:
                logger.info(f"Input token count: {len(inputs['input_ids'][0])}")

            # Setup generation parameters
            parameters = self.config.get_local_parameters()
            parameters.update({
                "pad_token_id": self.tokenizer.pad_token_id,
                "eos_token_id": self.tokenizer.eos_token_id,
                "do_sample": parameters["temperature"] > 0,
            })

            if self.debug:
                logger.info(f"Generation parameters: {parameters}")

            # Generate
            with torch.no_grad():
                outputs = self.model.generate(**inputs, **parameters)

            # Decode full output
            full_output = self.tokenizer.decode(outputs[0], skip_special_tokens=False)

            if self.debug:
                logger.info(f"Full model output:\n{full_output}")

            # Extract translation - look for assistant's response
            assistant_marker = "<|start_header_id|>assistant<|end_header_id|>"
            eot_marker = "<|eot_id|>"

            if assistant_marker in full_output:
                response_start = full_output.find(assistant_marker) + len(assistant_marker)
                response_end = full_output.find(eot_marker, response_start)

                if response_end != -1:
                    translation = full_output[response_start:response_end]
                else:
                    translation = full_output[response_start:]
            else:
                # Fallback: try to extract everything after the prompt
                translation = full_output[len(prompt):]

            # Clean up the translation
            translation = translation.replace(assistant_marker, "").replace(eot_marker, "")
            translation = translation.replace("<|end_of_text|>", "").strip()

            if self.debug:
                logger.info(f"Extracted translation: {translation}")
                if not translation:
                    logger.warning("Warning: Empty translation detected!")

            return translation

        except Exception as e:
            logger.error(f"Local translation failed for batch {batch_idx}: {str(e)}")
            logger.error(f"Error details:", exc_info=True)
            raise

    def translate_batch(
        self,
        texts: List[str],
        batch_size: int = 1
    ) -> List[str]:
        """Translate a list of texts"""
        translations = []

        for i in tqdm.trange(0, len(texts), batch_size, desc="Translating"):
            batch = texts[i:i + batch_size]

            batch_translations = []
            for j, text in enumerate(batch):
                try:
                    translation = self.inference_fn(text, i + j)
                    batch_translations.append(translation)
                except Exception as e:
                    logger.error(f"Translation failed for text {i + j}: {str(e)}")
                    batch_translations.append("")

            translations.extend(batch_translations)

        return translations

    def evaluate_translations(
        self,
        hypotheses: List[str],
        references: Union[List[str], List[List[str]]],
        verbose: bool = True
    ) -> BLEU:
        """Evaluate translations using sacreBLEU"""
        if isinstance(references[0], str):
            references = [[ref] for ref in references]

        bleu_score = self.bleu.corpus_score(hypotheses, references)

        if verbose:
            logger.info(f"BLEU score: {bleu_score.score:.2f}")
            logger.info(f"Signature: {self.bleu.get_signature()}")

        return bleu_score

In [5]:
from google.colab import userdata
BASE_URL = userdata.get('MODAL_BASE_URL') # should end in /v1/
API_KEY = userdata.get('DSBA_LLAMA3_KEY')
model_name = "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"

## Testing

### API

In [6]:

# Test data
source_texts = [
    "Hello, my dog is cute",
    "The weather is nice today",
    "I love programming"
]

# Reference translations in French
references = [
    ["Bonjour, mon chien est mignon"],
    ["Le temps est beau aujourd'hui"],
    ["J'aime la programmation"]
]

# API Configuration
api_config = APIConfig(
    base_url=BASE_URL,
    api_key=API_KEY,
    model=model_name
)

# Translation Configuration
config = TranslationConfig.from_preset(
    preset="sampling",
    src_lang="English",
    tgt_lang="French",
    inference_mode=InferenceMode.API,
    api_config=api_config
)

print("Initializing translator...")
# Initialize evaluator with API mode
evaluator = Llama31TranslationEvaluator(
    config=config,
    debug=True
)

print("\nTranslating texts...")
# Translate
translations = evaluator.translate_batch(
    texts=source_texts,
    batch_size=1
)

print("\nResults:")
for src, trans, ref in zip(source_texts, translations, references):
    print(f"\nSource: {src}")
    print(f"\nTranslation: {trans}")
    print(f"Reference: {ref[0]}")

# Evaluate translations
bleu_score = evaluator.evaluate_translations(
    hypotheses=translations,
    references=references,
    verbose=True
)

Initializing translator...

Translating texts...


Translating: 100%|██████████| 3/3 [00:01<00:00,  1.74it/s]


Results:

Source: Hello, my dog is cute

Translation: Bonjour, mon chien est mignon.
Reference: Bonjour, mon chien est mignon

Source: The weather is nice today

Translation: Le temps est agréable aujourd'hui.
Reference: Le temps est beau aujourd'hui

Source: I love programming

Translation: J'adore le développement informatique.
Reference: J'aime la programmation





In [7]:
bleu_score

BLEU = 80.91 85.7/83.3/80.0/75.0 (BP = 1.000 ratio = 1.167 hyp_len = 7 ref_len = 6)

### Local

In [8]:
# Initialize with debug mode
config = TranslationConfig.from_preset(
    preset="quality",
    src_lang="English",
    tgt_lang="French",
    inference_mode=InferenceMode.LOCAL
)

print("Initializing translator...")
# Initialize evaluator with API mode
evaluator = Llama31TranslationEvaluator(
    config=config,
    model_name="NousResearch/Meta-Llama-3.1-8B-Instruct",
    debug=True
)

print("\nTranslating texts...")
# Translate
translations = evaluator.translate_batch(
    texts=source_texts,
    batch_size=1
)

print("\nResults:")
for src, trans, ref in zip(source_texts, translations, references):
    print(f"\nSource: {src}")
    print(f"\nTranslation: {trans}")
    print(f"Reference: {ref[0]}")

# Evaluate translations
bleu_score = evaluator.evaluate_translations(
    hypotheses=translations,
    references=references,
    verbose=True
)

Initializing translator...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


Translating texts...


Translating: 100%|██████████| 3/3 [00:02<00:00,  1.15it/s]


Results:

Source: Hello, my dog is cute

Translation: Bonjour, mon chien est mignon.
Reference: Bonjour, mon chien est mignon

Source: The weather is nice today

Translation: Le temps est agréable aujourd'hui.
Reference: Le temps est beau aujourd'hui

Source: I love programming

Translation: J'adore programmer
Reference: J'aime la programmation





In [9]:
bleu_score

BLEU = 80.91 85.7/83.3/80.0/75.0 (BP = 1.000 ratio = 1.167 hyp_len = 7 ref_len = 6)

## Evaluation

In [10]:
!wget https://github.com/wesslen/seamless_sacrebleu_evaluation/raw/main/data/01-english-spanish-mapping.jsonl
!wget https://github.com/wesslen/seamless_sacrebleu_evaluation/raw/main/data/02-english-spanish-mapping.jsonl

--2024-12-01 23:04:28--  https://github.com/wesslen/seamless_sacrebleu_evaluation/raw/main/data/01-english-spanish-mapping.jsonl
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/wesslen/seamless_sacrebleu_evaluation/main/data/01-english-spanish-mapping.jsonl [following]
--2024-12-01 23:04:28--  https://raw.githubusercontent.com/wesslen/seamless_sacrebleu_evaluation/main/data/01-english-spanish-mapping.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2829 (2.8K) [text/plain]
Saving to: ‘01-english-spanish-mapping.jsonl.2’


2024-12-01 23:04:28 (53.5 MB/s) - ‘01-english-spanish-mapping.jsonl.2

In [11]:
import json
import logging
from typing import List, Dict, Optional, Union
from dataclasses import dataclass

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def run_evaluation(
    jsonl_files: List[str],
    src_lang: str = "English",
    tgt_lang: str = "Spanish",
    inference_mode: InferenceMode = InferenceMode.LOCAL,
    model_name: Optional[str] = "meta-llama/Llama-3.1-8B-Instruct",
    api_config: Optional[APIConfig] = None,
    batch_size: int = 4,
    generation_preset: str = "quality",
    debug: bool = False
):
    """
    Run evaluation pipeline with support for both local and API inference.

    Args:
        jsonl_files: List of JSONL files containing test data
        src_lang: Source language
        tgt_lang: Target language
        inference_mode: Whether to use local model or API endpoint
        model_name: HuggingFace model identifier (for local inference)
        api_config: API configuration (for API inference)
        batch_size: Batch size for processing
        generation_preset: Generation strategy preset
        debug: Enable debug logging
    """
    logger.info(f"\nInitializing translator in {inference_mode.value} mode...")

    # Create translation config
    config = TranslationConfig.from_preset(
        preset=generation_preset,
        src_lang=src_lang,
        tgt_lang=tgt_lang,
        inference_mode=inference_mode,
        api_config=api_config
    )

    # Initialize evaluator
    evaluator = Llama31TranslationEvaluator(
        config=config,
        model_name=model_name if inference_mode == InferenceMode.LOCAL else None,
        debug=debug
    )

    all_bleu_scores = []
    results = {}

    for file_path in jsonl_files:
        logger.info(f"\nProcessing file: {file_path}")

        try:
            # Load data
            mapping = read_jsonl(file_path)
            source_texts = [item["source_text"] for item in mapping]
            references = [item["references"] for item in mapping]

            # Translate
            translations = evaluator.translate_batch(
                texts=source_texts,
                batch_size=batch_size
            )

            # Evaluate
            bleu_score = evaluator.evaluate_translations(
                hypotheses=translations,
                references=references
            )

            all_bleu_scores.append(bleu_score.score)
            results[file_path] = {
                'score': bleu_score.score,
                'translations': list(zip(source_texts, translations, references))
            }

            # Print detailed results
            logger.info(f"\nDetailed Results for {file_path}:")
            logger.info("-" * 50)
            for src, hyp, ref in zip(source_texts, translations, references):
                logger.info(f"\nSource: {src}")
                logger.info(f"System: {hyp}")
                logger.info(f"Reference: {ref[0]}")

        except Exception as e:
            logger.error(f"Error processing file {file_path}: {str(e)}")
            continue

    # Print summary
    logger.info("\nSummary:")
    logger.info("-" * 50)
    for file_path, score in zip(jsonl_files, all_bleu_scores):
        logger.info(f"{file_path}: BLEU = {score:.2f}")

    if all_bleu_scores:
        avg_score = sum(all_bleu_scores) / len(all_bleu_scores)
        logger.info(f"Average BLEU Score: {avg_score:.2f}")
    else:
        logger.warning("No scores calculated!")

    return results

def read_jsonl(file_path: str) -> List[Dict]:
    """
    Read JSONL file and return parsed data.

    Args:
        file_path: Path to JSONL file

    Returns:
        List of dictionaries containing parsed data

    Raises:
        FileNotFoundError: If file doesn't exist
        JSONDecodeError: If line contains invalid JSON
    """
    data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for i, line in enumerate(file, 1):
                try:
                    data.append(json.loads(line.strip()))
                except json.JSONDecodeError as e:
                    logger.error(f"Error parsing line {i} in {file_path}: {str(e)}")
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise
    except Exception as e:
        logger.error(f"Error reading file {file_path}: {str(e)}")
        raise

    return data

In [12]:
jsonl_files = [
    "01-english-spanish-mapping.jsonl",
    "02-english-spanish-mapping.jsonl"
]

In [13]:
# API inference example
api_config = APIConfig(
    base_url=BASE_URL,
    api_key=API_KEY,
    model=model_name
)

results_api = run_evaluation(
    jsonl_files=jsonl_files,
    src_lang="English",
    tgt_lang="Spanish",
    inference_mode=InferenceMode.API,
    api_config=api_config,
    generation_preset="quality",
    debug=True
)

Translating: 100%|██████████| 8/8 [00:14<00:00,  1.86s/it]
Translating: 100%|██████████| 24/24 [01:01<00:00,  2.57s/it]


In [14]:
results_api['01-english-spanish-mapping.jsonl']['score']

46.713797772819994

In [15]:
results_api['02-english-spanish-mapping.jsonl']['score']

21.3643503198117

In [16]:
# optional
import os
from google.colab import userdata

os.environ['HF_TOKEN'] = userdata.get('huggingface')

In [17]:
!huggingface-cli login --token $HF_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `colab` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [18]:
# Local inference
results = run_evaluation(
    jsonl_files=jsonl_files,
    src_lang="English",
    tgt_lang="Spanish",
    inference_mode=InferenceMode.LOCAL,
    model_name="meta-llama/Llama-3.2-3B-Instruct",
    generation_preset="quality",
    debug=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating: 100%|██████████| 8/8 [00:12<00:00,  1.62s/it]
Translating: 100%|██████████| 24/24 [00:55<00:00,  2.32s/it]


In [19]:
results['01-english-spanish-mapping.jsonl']['score']

46.713797772819994

In [20]:
results['02-english-spanish-mapping.jsonl']['score']

10.682175159905853

In [21]:
import pandas as pd

data = {
    "File": ["01-english-spanish-mapping.jsonl", "02-english-spanish-mapping.jsonl"],
    "Llama 3.1 8B Instruct (API)": [results_api['01-english-spanish-mapping.jsonl']['score'], results_api['02-english-spanish-mapping.jsonl']['score']],
    "Llama 3.2 3B Instruct (Local)": [results['01-english-spanish-mapping.jsonl']['score'], results['02-english-spanish-mapping.jsonl']['score']]
}

df = pd.DataFrame(data)
display(df)

Unnamed: 0,File,Llama 3.1 8B Instruct (API),Llama 3.2 3B Instruct (Local)
0,01-english-spanish-mapping.jsonl,46.713798,46.713798
1,02-english-spanish-mapping.jsonl,21.36435,10.682175


In [27]:
import pandas as pd

model_name = "Llama 3.2 3B Instruct (Local)"  # You can change this

data = []
for file_key, file_data in results.items():
    file_prefix = file_key.split('-')[0]  # Extract "01" or "02"
    for translation_data in file_data['translations']:
        source = translation_data[0]
        translation = translation_data[1]
        reference = translation_data[2][0]  # Get the first reference

        data.append([model_name, file_prefix, source, translation, reference])

df = pd.DataFrame(data, columns=["model", "file", "source", "translation", "reference"])
df.to_csv("llama32_3B_Instruct_results.csv", index=False)

In [28]:
model_name = "Llama 3.1 8B Instruct (API)"  # You can change this

data = []
for file_key, file_data in results_api.items():
    file_prefix = file_key.split('-')[0]  # Extract "01" or "02"
    for translation_data in file_data['translations']:
        source = translation_data[0]
        translation = translation_data[1]
        reference = translation_data[2][0]  # Get the first reference

        data.append([model_name, file_prefix, source, translation, reference])

df = pd.DataFrame(data, columns=["model", "file", "source", "translation", "reference"])
df.to_csv("llama31_8B_Instruct_results.csv", index=False)
