In [1]:
!pip install transformers datasets torch sacrebleu

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

In [17]:
dataset = load_dataset("wmt14", "de-en", split='train[:1%]')

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["de"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

def convert_to_dict(dataset):
    input_ids = []
    attention_masks = []
    labels = []

    for item in dataset:
        input_ids.append(item["input_ids"])
        attention_masks.append(item["attention_mask"])
        labels.append(item["labels"])

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels
    }

train_dataset = convert_to_dict(tokenized_datasets)

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/280M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/273M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/509k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]



Map:   0%|          | 0/45088 [00:00<?, ? examples/s]



In [18]:
class TranslationDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = TranslationDataset(train_dataset)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)  # Reduced batch size

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-de").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [20]:
model.train()
for epoch in range(1):  # Reduced to 1 epoch
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        print(f"Epoch: {epoch}, Loss: {loss.item()}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 0, Loss: 0.44900694489479065
Epoch: 0, Loss: 0.5092845559120178
Epoch: 0, Loss: 0.3686891198158264
Epoch: 0, Loss: 0.6371797323226929
Epoch: 0, Loss: 0.5913200378417969
Epoch: 0, Loss: 0.39326030015945435
Epoch: 0, Loss: 0.5529458522796631
Epoch: 0, Loss: 0.42986345291137695
Epoch: 0, Loss: 0.4428233504295349
Epoch: 0, Loss: 0.3832688331604004
Epoch: 0, Loss: 0.38824158906936646
Epoch: 0, Loss: 0.692170262336731
Epoch: 0, Loss: 0.35354992747306824
Epoch: 0, Loss: 0.33731746673583984
Epoch: 0, Loss: 0.3543768525123596
Epoch: 0, Loss: 0.26801326870918274
Epoch: 0, Loss: 0.3873922526836395
Epoch: 0, Loss: 0.4275333285331726
Epoch: 0, Loss: 0.40572410821914673
Epoch: 0, Loss: 0.30673786997795105
Epoch: 0, Loss: 0.34807780385017395
Epoch: 0, Loss: 0.3937971293926239
Epoch: 0, Loss: 0.2714461088180542
Epoch: 0, Loss: 0.4096754789352417
Epoch: 0, Loss: 0.3350929915904999
Epoch: 0, Loss: 0.3242941200733185
Epoch: 0, Loss: 

In [21]:
bleu = load_metric("sacrebleu")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [22]:
test_dataset = load_dataset("wmt14", "de-en", split='test[:1%]')

tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

def convert_to_list(dataset):
    data = []
    for item in dataset:
        data.append({
            "input_ids": item["input_ids"],
            "attention_mask": item["attention_mask"],
            "labels": item["labels"],
            "en": item["translation"]["en"],
            "de": item["translation"]["de"]
        })
    return data

test_data = convert_to_list(tokenized_test_dataset)

Map:   0%|          | 0/30 [00:00<?, ? examples/s]



In [23]:
def evaluate_model(model, tokenizer, dataset):
    model.eval()
    predictions, references = [], []
    for item in dataset:
        inputs = tokenizer(item["en"], return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            translated_tokens = model.generate(**inputs)
        decoded_preds = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        decoded_labels = [item["de"]]
        predictions.extend(decoded_preds)
        references.extend([[label] for label in decoded_labels])
    result = bleu.compute(predictions=predictions, references=references)
    return result

bleu_score = evaluate_model(model, tokenizer, test_data)
print(f"BLEU Score: {bleu_score['score']}")

BLEU Score: 16.500385046619932


In [24]:
def error_analysis(predictions, references):
    errors = []
    for pred, ref in zip(predictions, references):
        if pred != ref[0]:
            errors.append((pred, ref[0]))
    return errors

errors = error_analysis(predictions, references)
for error in errors[:5]:  # Print first 5 errors
    print(f"Predicted: {error[0]}")
    print(f"Reference: {error[1]}")

NameError: name 'predictions' is not defined

In [25]:
def sensitivity_analysis(model, tokenizer, dataset, seeds=[42, 43, 44]):
    results = []
    for seed in seeds:
        torch.manual_seed(seed)
        result = evaluate_model(model, tokenizer, dataset)
        results.append(result['score'])
    return results

sensitivity_results = sensitivity_analysis(model, tokenizer, test_data)
print(f"Sensitivity Analysis BLEU Scores: {sensitivity_results}")

Sensitivity Analysis BLEU Scores: [16.500385046619932, 16.500385046619932, 16.500385046619932]
