In [2]:
!pip install transformers datasets torch sacrebleu



In [3]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast

In [4]:
dataset = load_dataset("wmt14", "de-en", split='train[:1%]')

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["de"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

def convert_to_dict(dataset):
    input_ids, attention_masks, labels = [], [], []
    for item in dataset:
        input_ids.append(item["input_ids"])
        attention_masks.append(item["attention_mask"])
        labels.append(item["labels"])
    return {"input_ids": input_ids, "attention_mask": attention_masks, "labels": labels}

train_dataset = convert_to_dict(tokenized_datasets)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/280M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/273M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/509k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]



Map:   0%|          | 0/45088 [00:00<?, ? examples/s]



In [5]:
class TranslationDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = TranslationDataset(train_dataset)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)  # Reduced batch size

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-de").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [7]:
model.train()
for epoch in range(1):  # Reduced to 1 epoch
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 0, Loss: 0.30833449959754944
Epoch: 0, Loss: 0.4677426517009735
Epoch: 0, Loss: 0.3444148898124695
Epoch: 0, Loss: 0.5463380813598633
Epoch: 0, Loss: 0.355388343334198
Epoch: 0, Loss: 0.36066341400146484
Epoch: 0, Loss: 0.5282508730888367
Epoch: 0, Loss: 0.5212472677230835
Epoch: 0, Loss: 0.6381950378417969
Epoch: 0, Loss: 0.20529372990131378
Epoch: 0, Loss: 0.467806875705719
Epoch: 0, Loss: 0.40229079127311707
Epoch: 0, Loss: 0.39378613233566284
Epoch: 0, Loss: 0.33881518244743347
Epoch: 0, Loss: 0.5364544987678528
Epoch: 0, Loss: 0.48226791620254517
Epoch: 0, Loss: 0.2565896809101105
Epoch: 0, Loss: 0.2612352967262268
Epoch: 0, Loss: 0.4982083737850189
Epoch: 0, Loss: 0.3247931897640228
Epoch: 0, Loss: 0.4653235375881195
Epoch: 0, Loss: 0.3177871108055115
Epoch: 0, Loss: 0.34499141573905945
Epoch: 0, Loss: 0.27433058619499207
Epoch: 0, Loss: 0.4555901885032654
Epoch: 0, Loss: 0.4597553014755249
Epoch: 0, Loss: 0.

In [8]:
bleu = load_metric("sacrebleu")


  bleu = load_metric("sacrebleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [9]:
test_dataset = load_dataset("wmt14", "de-en", split='test[:1%]')
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

def convert_to_list(dataset):
    data = []
    for item in dataset:
        data.append({
            "input_ids": item["input_ids"],
            "attention_mask": item["attention_mask"],
            "labels": item["labels"],
            "en": item["translation"]["en"],
            "de": item["translation"]["de"]
        })
    return data

test_data = convert_to_list(tokenized_test_dataset)

Map:   0%|          | 0/30 [00:00<?, ? examples/s]



In [10]:
def evaluate_model(model, tokenizer, dataset):
    model.eval()
    predictions, references = [], []
    for item in dataset:
        inputs = tokenizer(item["en"], return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            translated_tokens = model.generate(**inputs)
        decoded_preds = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        decoded_labels = [item["de"]]
        predictions.extend(decoded_preds)
        references.extend([[label] for label in decoded_labels])
    result = bleu.compute(predictions=predictions, references=references)
    return predictions, references, result

predictions, references, bleu_score = evaluate_model(model, tokenizer, test_data)
print(f"BLEU Score: {bleu_score['score']}")

BLEU Score: 17.161036689436415


In [11]:
def error_analysis(predictions, references):
    errors = []
    for pred, ref in zip(predictions, references):
        if pred != ref[0]:
            errors.append((pred, ref[0]))
    return errors

errors = error_analysis(predictions, references)
for error in errors[:5]:  # Print first 5 errors
    print(f"Predicted: {error[0]}")
    print(f"Reference: {error[1]}")

Predicted: Gutach: Erhöhung der Sicherheit für die Passanten
Reference: Gutach: Noch mehr Sicherheit für Fußgänger
Predicted: Sie sind nicht einmal 100 m voneinander entfernt: Am Dienstag wurde die neue B 33 am Dorfparkplatz in Gutach unter Berücksichtigung der bestehenden Ampel des Stadtsaals in Betrieb genommen.
Reference: Sie stehen keine 100 Meter voneinander entfernt: Am Dienstag ist in Gutach die neue B 33-Fußgängerampel am Dorfparkplatz in Betrieb genommen worden - in Sichtweite der älteren Rathausampel.
Predicted: Zwei Lichter, die einander so nahe stehen: absichtlich oder nur ein dummer Fehler?
Reference: Zwei Anlagen so nah beieinander: Absicht oder Schildbürgerstreich?
Predicted: Der Bürgermeister von Gutacht hat gestern eine klare Antwort auf diese Frage gegeben.
Reference: Diese Frage hat Gutachs Bürgermeister gestern klar beantwortet.
Predicted: "Zu dieser Zeit wurden die Ampelanlagen der Stadtverwaltung eingerichtet, weil es sich um eine Schule handelte," erklärte Eckert

In [12]:
def sensitivity_analysis(model, tokenizer, dataset, seeds=[42, 43, 44]):
    results = []
    for seed in seeds:
        torch.manual_seed(seed)
        _, _, result = evaluate_model(model, tokenizer, dataset)
        results.append(result['score'])
    return results

sensitivity_results = sensitivity_analysis(model, tokenizer, test_data)
print(f"Sensitivity Analysis BLEU Scores: {sensitivity_results}")

Sensitivity Analysis BLEU Scores: [17.161036689436415, 17.161036689436415, 17.161036689436415]
