In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset
import pandas as pd
import json
import evaluate
import torch
import re

# sacreBLEU scorer, loaded once and shared by eval() and compute_metrics() below.
metric = evaluate.load("sacrebleu")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Dyula-French parallel corpus. Later cells index it by split ("train", "test") and read each
# example's "translation" dict through its "dyu" and "fr" keys.
# NOTE(review): token=True presumably authenticates with the locally stored Hugging Face
# token (i.e. a prior login is required) — confirm.
ds = load_dataset("uvci/Koumankan_mt_dyu_fr", token=True)

In [4]:
# train_df = pd.read_csv("../data/train-00000-of-00001.csv", delimiter="|")
# val_df = pd.read_csv("../data/validation-00000-of-00001.csv", delimiter="|")

In [5]:
# Checkpoint shared by the tokenizer and the model so the two cannot drift apart.
MODEL_CHECKPOINT = "facebook/nllb-200-distilled-600M"

# NLLB identifies languages by FLORES-200 codes: Dyula is "dyu_Latn" and French is "fra_Latn"
# (the same code the generation calls in this notebook pass as forced_bos_token_id).
# The previous value "fr_Latn" is not such a code, so the language token placed on the
# target side when labels are tokenized would not have been the French one.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, src_lang="dyu_Latn", tgt_lang="fra_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [6]:

# article = ["Mun? Fɛn dɔ.", '"E nafa t\'a ra."']
# targets = ["Il boit de l’eau.", "Il se plaint toujours."]
# inputs = tokenizer(article, text_target=targets, return_tensors="pt", padding=True)

# translated_tokens = model.generate(
#     **inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids("fra_Latn"), max_length=30
# )
# tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

In [7]:
def eval(model, ds, batch_size=32):
    """Translate the Dyula side of ``ds`` with ``model`` and score it with sacreBLEU.

    NOTE(review): this name shadows the built-in ``eval``; it is kept so existing
    callers keep working.

    Args:
        model: Seq2seq model exposing ``generate`` and ``device``.
        ds: Iterable of examples; each must provide ``example["translation"]["dyu"]``
            (source sentence) and ``example["translation"]["fr"]`` (reference).
        batch_size: Number of sentences encoded and generated per step. Processing in
            chunks keeps memory bounded instead of pushing the whole dataset through
            ``generate`` in a single call.

    Returns:
        The value returned by ``metric.compute`` for the predictions and references.

    Raises:
        ValueError: If ``ds`` yields no examples.
    """
    sources = []
    references = []
    # Single pass so that one-shot iterables are consumed only once.
    for example in ds:
        pair = example["translation"]
        sources.append(pair["dyu"])
        references.append(pair["fr"])

    if not sources:
        raise ValueError("eval() received an empty dataset; nothing to score.")

    # Hoisted out of the loop: the forced first target token never changes.
    french_bos_id = tokenizer.convert_tokens_to_ids("fra_Latn")

    predictions = []
    for start in range(0, len(sources), batch_size):
        chunk = sources[start:start + batch_size]
        # Inputs must live on the same device as the model, otherwise generate() fails
        # once the model has been moved to an accelerator.
        encoded = tokenizer(chunk, return_tensors="pt", padding=True).to(model.device)
        # max_length=30 caps each generated translation, as in the original call.
        generated = model.generate(**encoded, forced_bos_token_id=french_bos_id, max_length=30)
        predictions.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))

    return metric.compute(predictions=predictions, references=references)

In [8]:
# Upper bound, in tokens, applied to both source and target sequences (longer ones are truncated).
max_len = 128

def preprocess(examples):
    """Tokenize a batch of Dyula/French pairs for seq2seq training.

    Args:
        examples: Batched mapping whose ``"translation"`` entry is a list of dicts
            with ``"dyu"`` (source) and ``"fr"`` (target) strings.

    Returns:
        The tokenizer output for the source sentences, with the tokenized targets
        stored under ``"labels"``.
    """
    sources = [pair["dyu"] for pair in examples["translation"]]
    targets = [pair["fr"] for pair in examples["translation"]]
    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager:
    # the tokenizer encodes the targets itself and places their ids under "labels".
    return tokenizer(sources, text_target=targets, max_length=max_len, truncation=True)

# Run preprocess() over the dataset in batches and drop the raw "ID" and "translation"
# columns from the result.
tokenized_datasets = ds.map(preprocess, batched=True, remove_columns=["ID", "translation"])

Map: 100%|██████████| 1393/1393 [00:00<00:00, 59040.08 examples/s]


In [9]:
from transformers import DataCollatorForSeq2Seq

# Batch collator handed to the Seq2SeqTrainer below, built from the shared tokenizer and model.
# NOTE(review): label padding presumably uses -100, which compute_metrics() maps back to
# the pad token before decoding — confirm against the collator's defaults.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [10]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a flat list of stripped predictions, and a list
        in which every stripped reference is wrapped in its own one-element list.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: ``(preds, labels)`` pair of token-id arrays. ``preds`` may itself
            be a tuple, in which case its first element is used.

    Returns:
        Dict with ``"bleu"`` (the sacreBLEU score) and ``"gen_len"`` (mean number of
        non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding marker, not a vocabulary id, so it cannot be decoded. It can
    # show up in the predictions as well as in the labels, so both arrays are mapped
    # back to the pad token before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Counted after the -100 replacement so padding does not inflate the length.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]

    result = {"bleu": bleu["score"], "gen_len": np.mean(prediction_lens)}
    return {k: round(v, 4) for k, v in result.items()}

In [25]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Training configuration: checkpoints go to "model", evaluation runs once per epoch.
# fp16 is disabled; the traceback recorded under this cell ("fp16 mixed precision requires
# a GPU (not 'mps')") presumably comes from an earlier run that had it enabled.
training_args = Seq2SeqTrainingArguments(
    output_dir="model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=False,
    # NOTE(review): uploads to the Hugging Face Hub — presumably requires a logged-in
    # account; confirm this side effect is intended for every run.
    push_to_hub=True,
)

# NOTE(review): per-epoch evaluation uses the "test" split. The commented-out CSV paths
# earlier in the notebook suggest a validation split exists; consider evaluating on that
# and keeping "test" for the final score — confirm against the dataset's splits.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Starts fine-tuning `model` with the arguments configured above.
trainer.train()

ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [None]:
import torch
# Smoke test for the MPS backend: when it is available, allocate a one-element tensor
# on the "mps" device and print it; otherwise report that no MPS device was found.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

tensor([1.], device='mps:0')


In [12]:
# Explain why MPS is unusable, if it is: either this PyTorch build lacks MPS support,
# or the build has it but the OS / hardware does not qualify. Prints nothing when MPS works.
if torch.backends.mps.is_available():
    pass
elif torch.backends.mps.is_built():
    print("MPS not available because the current MacOS version is not 12.3+ and/or you do not have an MPS-enabled device on this machine.")
else:
    print("MPS not available because the current PyTorch install was not built with MPS enabled.")

In [23]:
import torch

# `torch.has_mps` is deprecated (the output recorded for this cell echoes the offending
# line as a warning); `torch.backends.mps.is_built()` is the supported way to ask whether
# this PyTorch build includes MPS support.
torch.backends.mps.is_built()

  torch.has_mps


True