# Training a model
see README.md for details

Loosely following this tutorial:
https://medium.com/nlplanet/a-full-guide-to-finetuning-t5-for-text2text-and-building-a-demo-with-streamlit-c72009631887

!pip install -r requirements.txt

In [1]:
import os

os.environ["WANDB_DISABLED"] = "true"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import pandas as pd

# from glob import glob

import zipfile

import torch
import wandb
from datasets import Dataset
from evaluate import load
from accelerate import Accelerator, DataLoaderConfiguration
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import T5Config
from transformers import T5ForConditionalGeneration
from transformers import ByT5Tokenizer  # a "dummy" tokenizer, tokenizing into bytes
from transformers import DataCollatorForSeq2Seq
from transformers import EvalPrediction

from config import data_root, model_root, checkpoint_name
from config import token_len, annot_len

2024-07-25 09:56:45.599456: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-25 09:56:45.599531: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-25 09:56:45.600849: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-25 09:56:45.608122: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# https://huggingface.co/docs/accelerate/main/en/package_reference/utilities#accelerate.DataLoaderConfiguration
dataloader_config = DataLoaderConfiguration(
    use_seedable_sampler=False,
)
# https://huggingface.co/docs/accelerate/main/en/package_reference/accelerator#accelerate.Accelerator
accelerator = Accelerator(
    dataloader_config=dataloader_config,
    project_dir=model_root,
    # rng_types="torch",
    # rng_types="cuda",
    # rng_types="generator",
    # cpu=False,
)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"
device = accelerator.device
torch.set_default_device(device)
# torch.cuda.is_available()
# accelerator.device
device

device(type='cuda')

In [None]:
dfs = []
# for fname in glob(f"{data_root}/*.csv"):
#     dfs += [pd.read_csv(fname, names=["inputs", "labels"])]
with zipfile.ZipFile(f"{data_root}/data-ue.zip") as zf:
    for name in zf.namelist():
        dfs += [pd.read_csv(zf.open(name), names=["input", "label"])]
df = pd.concat(dfs, axis=0)

# https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset
ds = Dataset.from_pandas(df)
ds = ds.train_test_split(test_size=0.1, shuffle=True)
ds

DatasetDict({
    train: Dataset({
        features: ['input', 'label', '__index_level_0__'],
        num_rows: 36993
    })
    test: Dataset({
        features: ['input', 'label', '__index_level_0__'],
        num_rows: 4111
    })
})

In [8]:
# https://huggingface.co/docs/transformers/model_doc/byt5#transformers.ByT5Tokenizer
tokenizer = ByT5Tokenizer()


# Create a tokenization function
def preprocess_function(examples):
    model_inputs = tokenizer(examples["input"], max_length=32, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["label"], max_length=32, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Apply the tokenization function to the dataset
tds = ds.map(preprocess_function, batched=True, remove_columns=["input", "label"])
tds

Map:   0%|          | 0/36993 [00:00<?, ? examples/s]



Map:   0%|          | 0/4111 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 36993
    })
    test: Dataset({
        features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4111
    })
})

In [None]:
# https://huggingface.co/spaces/evaluate-metric/exact_match
exact_match_metric = load("exact_match")


def compute_exact_match(pred: EvalPrediction):
    # Convert predictions to text
    predictions = pred.predictions
    references = pred.label_ids

    # Decode if needed
    decoded_preds = [
        pred.decode(pred, skip_special_tokens=True) for pred in predictions
    ]
    decoded_labels = [
        label.decode(label, skip_special_tokens=True) for label in references
    ]

    # Compute exact match
    result = exact_match_metric.compute(
        predictions=decoded_preds, references=decoded_labels
    )
    return {"exact_match": result["exact_match"]}


# https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_root}/byT5-ocs-ue",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    #     gradient_checkpointing=True,
    # torch_empty_cache_steps=100,
    disable_tqdm=False,
    report_to=None,  # disable wandb.ai
    load_best_model_at_end=True,
    save_total_limit=1,
    eval_strategy="steps",
)


# https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForConditionalGeneration
def init_model():
    model = T5ForConditionalGeneration(config)
    # model = model.cuda()
    model.to(device)
    return model


# https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Config
config = T5Config.from_pretrained("t5-base")
# config.task_specific_params = {}
# https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorForSeq2Seq
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model_init=init_model,
    args=args,
    train_dataset=tds["train"],
    eval_dataset=tds["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_exact_match,
)

In [None]:
trainer.train()

[2024-07-25 09:56:56,290] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


RuntimeError: Expected a 'cuda' device type for generator but found 'cpu'

In [None]:
trainer.save_model(output_dir=f"{model_root}/byT5-ocs-ue-final")