In [1]:
!pip install transformers datasets torch sacrebleu

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast

In [6]:
dataset = load_dataset("opus100", "en-es", split='train[:1%]')

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-es")

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["es"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

def convert_to_dict(dataset):
    input_ids, attention_masks, labels = [], [], []
    for item in dataset:
        input_ids.append(item["input_ids"])
        attention_masks.append(item["attention_mask"])
        labels.append(item["labels"])
    return {"input_ids": input_ids, "attention_mask": attention_masks, "labels": labels}

train_dataset = convert_to_dict(tokenized_datasets)

Downloading data:   0%|          | 0.00/237k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/238k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



In [7]:
class TranslationDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = TranslationDataset(train_dataset)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)  # Reduced batch size

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-es").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [9]:
model.train()
for epoch in range(1):  # Reduced to 1 epoch
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

Epoch: 0, Loss: 5.666431903839111
Epoch: 0, Loss: 5.608285427093506
Epoch: 0, Loss: 4.865111827850342
Epoch: 0, Loss: 5.887608051300049
Epoch: 0, Loss: 6.2275710105896
Epoch: 0, Loss: 5.344372749328613
Epoch: 0, Loss: 2.7554824352264404
Epoch: 0, Loss: 1.7699435949325562
Epoch: 0, Loss: 0.23853515088558197
Epoch: 0, Loss: 0.19239304959774017
Epoch: 0, Loss: 0.18220970034599304
Epoch: 0, Loss: 0.39225614070892334
Epoch: 0, Loss: 0.4141244888305664
Epoch: 0, Loss: 0.45839256048202515
Epoch: 0, Loss: 0.22705812752246857
Epoch: 0, Loss: 0.2699880003929138
Epoch: 0, Loss: 0.26142004132270813
Epoch: 0, Loss: 0.20802749693393707
Epoch: 0, Loss: 0.3516024649143219
Epoch: 0, Loss: 0.21221013367176056
Epoch: 0, Loss: 0.49622002243995667
Epoch: 0, Loss: 0.41822534799575806
Epoch: 0, Loss: 0.3542381823062897
Epoch: 0, Loss: 0.33079445362091064
Epoch: 0, Loss: 0.2558495104312897
Epoch: 0, Loss: 0.28140243887901306
Epoch: 0, Loss: 0.5296275019645691
Epoch: 0, Loss: 0.32367751002311707
Epoch: 0, Loss

In [10]:
bleu = load_metric("sacrebleu")


  bleu = load_metric("sacrebleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [13]:
test_dataset = load_dataset("opus100", "en-es", split='test[:1%]')
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

def convert_to_list(dataset):
    data = []
    for item in dataset:
        data.append({
            "input_ids": item["input_ids"],
            "attention_mask": item["attention_mask"],
            "labels": item["labels"],
            "en": item["translation"]["en"],
            "es": item["translation"]["es"]
        })
    return data

test_data = convert_to_list(tokenized_test_dataset)

In [14]:
def evaluate_model(model, tokenizer, dataset):
    model.eval()
    predictions, references = [], []
    for item in dataset:
        inputs = tokenizer(item["en"], return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            translated_tokens = model.generate(**inputs)
        decoded_preds = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        decoded_labels = [item["es"]]
        predictions.extend(decoded_preds)
        references.extend([[label] for label in decoded_labels])
    result = bleu.compute(predictions=predictions, references=references)
    return predictions, references, result

predictions, references, bleu_score = evaluate_model(model, tokenizer, test_data)
print(f"BLEU Score: {bleu_score['score']}")

BLEU Score: 32.83717458768768


In [15]:
def error_analysis(predictions, references):
    errors = []
    for pred, ref in zip(predictions, references):
        if pred != ref[0]:
            errors.append((pred, ref[0]))
    return errors

errors = error_analysis(predictions, references)
for error in errors[:5]:  # Print first 5 errors
    print(f"Predicted: {error[0]}")
    print(f"Reference: {error[1]}")

Predicted: Si su país produce SAO con este fin, indíquese la cantidad así producida en la columna 6 del formulario 3.”
Reference: Si su país produjo SAO para estos usos, sírvase anotar en la columna 6 del formulario de datos 3 la cantidad correspondiente”.
Predicted: Juvie el gran hombre, ¿quién más podría ser sino yo?
Reference: # Juvie el gran hombre, ¿quién podría ser sino yo?
Predicted: El planeta se está agotando.
Reference: El planeta madre se está agotando.
Predicted: ¿Nunca las chicas matan a sus madres?
Reference: Las chicas no matan a sus madres? .
Predicted: La delegación de la República Árabe Siria ha escuchado muy de cerca la declaración del representante de la República Popular China y nos asociamos plenamente a ella.
Reference: La delegación de la República Árabe Siria ha escuchado con mucha atención la declaración formulada por el representante de la República Popular China y nos adherimos plenamente a ella.


In [16]:
def sensitivity_analysis(model, tokenizer, dataset, seeds=[42, 43, 44]):
    results = []
    for seed in seeds:
        torch.manual_seed(seed)
        _, _, result = evaluate_model(model, tokenizer, dataset)
        results.append(result['score'])
    return results

sensitivity_results = sensitivity_analysis(model, tokenizer, test_data)
print(f"Sensitivity Analysis BLEU Scores: {sensitivity_results}")

Sensitivity Analysis BLEU Scores: [32.83717458768768, 32.83717458768768, 32.83717458768768]
