In [1]:
!pip install transformers datasets torch sacrebleu

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [2]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast

In [15]:
dataset = load_dataset("opus100", "en-zh", split='train[:1%]')

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-zh")

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["zh"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

def convert_to_dict(dataset):
    input_ids, attention_masks, labels = [], [], []
    for item in dataset:
        input_ids.append(item["input_ids"])
        attention_masks.append(item["attention_mask"])
        labels.append(item["labels"])
    return {"input_ids": input_ids, "attention_mask": attention_masks, "labels": labels}

train_dataset = convert_to_dict(tokenized_datasets)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [16]:
class TranslationDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = TranslationDataset(train_dataset)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)  # Reduced batch size

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-zh").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

In [18]:
model.train()
for epoch in range(1):  # Reduced to 1 epoch
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

Epoch: 0, Loss: 9.507477760314941
Epoch: 0, Loss: 10.3951416015625
Epoch: 0, Loss: 11.006229400634766
Epoch: 0, Loss: 10.876058578491211
Epoch: 0, Loss: 10.167540550231934
Epoch: 0, Loss: 6.269375801086426
Epoch: 0, Loss: 3.0407276153564453
Epoch: 0, Loss: 3.207836866378784
Epoch: 0, Loss: 0.8148744702339172
Epoch: 0, Loss: 0.2858208417892456
Epoch: 0, Loss: 0.2725013792514801
Epoch: 0, Loss: 0.6476806402206421
Epoch: 0, Loss: 0.8225372433662415
Epoch: 0, Loss: 0.504819393157959
Epoch: 0, Loss: 0.7820131778717041
Epoch: 0, Loss: 0.7858578562736511
Epoch: 0, Loss: 1.8617653846740723
Epoch: 0, Loss: 1.0115894079208374
Epoch: 0, Loss: 0.8417935371398926
Epoch: 0, Loss: 0.7738417983055115
Epoch: 0, Loss: 0.6344342827796936
Epoch: 0, Loss: 0.7728489637374878
Epoch: 0, Loss: 0.6935636401176453
Epoch: 0, Loss: 0.6250600218772888
Epoch: 0, Loss: 0.4900822043418884
Epoch: 0, Loss: 0.7497966885566711
Epoch: 0, Loss: 0.46610403060913086
Epoch: 0, Loss: 0.5785738825798035
Epoch: 0, Loss: 0.3199790

In [19]:
bleu = load_metric("sacrebleu")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [20]:
test_dataset = load_dataset("opus100", "en-zh", split='test[:1%]')
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

def convert_to_list(dataset):
    data = []
    for item in dataset:
        data.append({
            "input_ids": item["input_ids"],
            "attention_mask": item["attention_mask"],
            "labels": item["labels"],
            "en": item["translation"]["en"],
            "zh": item["translation"]["zh"]
        })
    return data

test_data = convert_to_list(tokenized_test_dataset)


In [21]:
def evaluate_model(model, tokenizer, dataset):
    model.eval()
    predictions, references = [], []
    for item in dataset:
        inputs = tokenizer(item["en"], return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            translated_tokens = model.generate(**inputs)
        decoded_preds = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        decoded_labels = [item["zh"]]
        predictions.extend(decoded_preds)
        references.extend([[label] for label in decoded_labels])
    result = bleu.compute(predictions=predictions, references=references)
    return predictions, references, result

predictions, references, bleu_score = evaluate_model(model, tokenizer, test_data)
print(f"BLEU Score: {bleu_score['score']}")

BLEU Score: 15.731452028529155


In [22]:
def error_analysis(predictions, references):
    errors = []
    for pred, ref in zip(predictions, references):
        if pred != ref[0]:
            errors.append((pred, ref[0]))
    return errors

errors = error_analysis(predictions, references)
for error in errors[:5]:  # Print first 5 errors
    print(f"Predicted: {error[0]}")
    print(f"Reference: {error[1]}")

Predicted: 在比利时的财政支助下,全球行动纲领协调处目前正在协助埃及、尼日利亚、坦桑尼亚联合共和国、斯里兰卡和也门制订保护海洋环境不受陆地活动影响的试点国家行动纲领。
Reference: 9. 《全球行动纲领》协调处得到比利时的财政支持，目前正帮助埃及、尼日利亚、坦桑尼亚联合共和国、斯里兰卡和也门制订试行的保护海洋环境免受陆地活动影响的国家行动方案。
Predicted: 该论坛在亚洲和非洲私营公司之间达成了价值2 400万美元的商业合同。
Reference: 该论坛达成的亚洲和非洲国家私营公司间的商业合同超过了2,400万美元。
Predicted: 但是,在1990年代,由于人们仍然不甚理解的原因,这种病毒在普通人口中爆发。受感染的南非人(相当于丹麦人口)超过邻国莫桑比克、莱索托、博茨瓦纳、纳米比亚、斯威士兰、津巴布韦。
Reference: 艾滋病来势汹汹及其传染速度让很多南非人意想不到。像发达国家一样，艾滋病毒感染最初集中在血友病患者、性活跃同性恋者以及静脉注射吸毒人员中。但20世纪90年代某种未知原因导致公众病毒感染率暴增。被病毒感染的南非民众数量（相当于丹麦全国人口）超过了相邻莫桑比克、莱索托、博茨瓦纳、纳米比亚、斯威士兰和津巴布韦的总和。
Predicted: 委员会自1968年开始执行其任务以来,从未遇到过巴勒斯坦人民和被占领领土上的其他阿拉伯人的这种愤怒和苦难,以及无视他们的基本人权和基本自由的情况。
Reference: 大概自1968年委员会开始执行其任务以来，委员会从未遇到被占领土内的巴勒斯坦人和其他阿拉伯人如此愤怒和悲惨过，以及其基本人权和基本自由如此不受尊重。
Predicted: “6. 又欢迎转型期经济国家希望进一步发展区域、次区域和区域间合作,并邀请联合国系统加强同其成员包括转型期经济国家的区域和分区域合作组织的对话并增加对这些组织的支持;
Reference: “6.　又欢迎经济转型国家渴望进一步开展区域、次区域和区域间合作，请联合国系统加强同其成员包括经济转型国家的各区域和次区域合作组织的对话，并增加对它们的支助；


In [23]:
def sensitivity_analysis(model, tokenizer, dataset, seeds=[42, 43, 44]):
    results = []
    for seed in seeds:
        torch.manual_seed(seed)
        _, _, result = evaluate_model(model, tokenizer, dataset)
        results.append(result['score'])
    return results

sensitivity_results = sensitivity_analysis(model, tokenizer, test_data)
print(f"Sensitivity Analysis BLEU Scores: {sensitivity_results}")

Sensitivity Analysis BLEU Scores: [15.731452028529155, 15.731452028529155, 15.731452028529155]
