# Training a model
See `README.md` for details.

In [1]:
import os

os.environ["WANDB_DISABLED"] = "true"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import pandas as pd

# from glob import glob

import zipfile

import torch

from torch.utils.data import DataLoader

# import wandb
from datasets import Dataset as HFDataset
from evaluate import load
from accelerate import Accelerator, DataLoaderConfiguration
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import T5Config
from transformers import T5ForConditionalGeneration
from transformers import ByT5Tokenizer  # a "dummy" tokenizer, tokenizing into bytes
from transformers import DataCollatorForSeq2Seq
from transformers import EvalPrediction

from config import data_root, model_root, checkpoint_name
from config import token_len, annot_len

2024-07-31 15:32:03.922520: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-31 15:32:03.922578: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-31 15:32:03.923719: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-31 15:32:03.930610: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Accelerate setup. API references:
# https://huggingface.co/docs/accelerate/main/en/package_reference/utilities#accelerate.DataLoaderConfiguration
# https://huggingface.co/docs/accelerate/main/en/package_reference/accelerator#accelerate.Accelerator
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=False)
accelerator = Accelerator(
    cpu=True,  # run on CPU; `device` below is taken from this accelerator
    project_dir=model_root,
    dataloader_config=dataloader_config,
)

# Use the accelerator's device as torch's default device for new tensors.
device = accelerator.device
torch.set_default_device(device)
device

device(type='cpu')

In [3]:
# Read every member of the zipped corpus as a header-less two-column CSV
# ("input", "label") and stack the resulting frames row-wise.
with zipfile.ZipFile(f"{data_root}/data-ue.zip") as zf:
    dfs = [
        pd.read_csv(zf.open(member), names=["input", "label"])
        for member in zf.namelist()
    ]
df = pd.concat(dfs, axis=0)

df.head()

Unnamed: 0,input,label
0,: прологъ:NOUN,прологъ
1,: о:ADP,о
2,: х҃ѣ:PROPN,христосъ
3,: оумѣренъ:VERB,оумѣрити
4,: съказаниꙗ:NOUN,съказаниє


In [4]:
# https://huggingface.co/docs/transformers/model_doc/byt5#transformers.ByT5Tokenizer
tokenizer = ByT5Tokenizer()


def tokenize_function(examples, max_length=32):
    """Tokenize source/target text and mask label padding for the loss.

    Args:
        examples: Mapping with a "text" entry (model input) and a "target"
            entry (expected output). Each entry is either a single string or
            a list of strings (the latter when called via
            `Dataset.map(..., batched=True)`).
        max_length: Fixed length every sequence is truncated/padded to.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the target token ids where every padding id has been replaced
        by -100.
    """
    model_inputs = tokenizer(
        examples["text"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(
        text_target=examples["target"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    pad_id = tokenizer.pad_token_id
    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example. The mask has to be applied
        # per token; comparing a whole sequence against `pad_id` never matches
        # and would leave the padding in the labels.
        masked = [
            [-100 if token == pad_id else token for token in sequence]
            for sequence in label_ids
        ]
    else:
        # Single example: a flat list of ids.
        masked = [-100 if token == pad_id else token for token in label_ids]

    model_inputs["labels"] = masked
    return model_inputs


# Wrap the two dataframe columns in a Hugging Face dataset.
data = {
    "text": df["input"].to_list(),
    "target": df["label"].to_list(),
}
hf_dataset = HFDataset.from_dict(data)

# Tokenize in batches; the raw string columns are dropped afterwards.
tokenized_dataset = hf_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text", "target"],
)

# Sanity check: every example must have inputs and labels of length 32.
for example in tokenized_dataset:
    n_inputs = len(example["input_ids"])
    n_labels = len(example["labels"])
    assert n_inputs == n_labels == 32, f"{n_inputs}, {n_labels}"

tokenized_dataset

Map:   0%|          | 0/41104 [00:00<?, ? examples/s]



Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 41104
})

In [5]:
# Map-style torch Dataset adapter around an indexable dataset object
class HFDatasetWrapper(torch.utils.data.Dataset):
    """Expose an indexable, sized dataset through the torch `Dataset` API."""

    def __init__(self, hf_dataset):
        """Keep a reference to `hf_dataset`; nothing is copied."""
        self.dataset = hf_dataset

    def __getitem__(self, idx):
        """Return whatever the wrapped dataset yields for `idx`."""
        wrapped = self.dataset
        return wrapped[idx]

    def __len__(self):
        """Return the number of items in the wrapped dataset."""
        wrapped = self.dataset
        return len(wrapped)


wrapped_dataset = HFDatasetWrapper(tokenized_dataset)

# Collator that batches the tokenized examples (no model attached).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=None)

# Shuffled loader over the wrapped dataset.
batch_size = 2
dataloader = DataLoader(
    dataset=wrapped_dataset,
    collate_fn=data_collator,
    shuffle=True,
    batch_size=batch_size,
)

tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 41104
})

In [6]:
# https://huggingface.co/spaces/evaluate-metric/exact_match
exact_match_metric = load("exact_match")


def _to_token_id_rows(batch):
    """Turn trainer output into plain lists of ids the tokenizer can decode.

    Handles three shapes of input:
      * a tuple, in which case only its first element is used;
      * a 3-D array, which is reduced with argmax over the last axis
        (NOTE(review): assumes these are per-token logits — confirm);
      * a 2-D array / nested sequence of token ids.

    Any id outside ``[0, len(tokenizer))`` — e.g. the -100 ignore-index used
    in the labels — is replaced by the pad id so that decoding does not fail
    and the position is dropped by ``skip_special_tokens=True``.
    """
    if isinstance(batch, tuple):
        batch = batch[0]
    if getattr(batch, "ndim", None) == 3:
        batch = batch.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id
    vocab_size = len(tokenizer)
    return [
        [int(token) if 0 <= token < vocab_size else pad_id for token in row]
        for row in batch
    ]


def compute_exact_match(pred: EvalPrediction):
    """Compute the exact-match score between decoded predictions and labels.

    Args:
        pred: Evaluation output; `pred.predictions` and `pred.label_ids` are
            read.

    Returns:
        A dict with the single key "exact_match" taken from the metric result.
    """
    # Decoding is done by the tokenizer; the id arrays themselves have no
    # `decode` method.
    decoded_preds = tokenizer.batch_decode(
        _to_token_id_rows(pred.predictions), skip_special_tokens=True
    )
    decoded_labels = tokenizer.batch_decode(
        _to_token_id_rows(pred.label_ids), skip_special_tokens=True
    )

    # Compute exact match
    result = exact_match_metric.compute(
        predictions=decoded_preds, references=decoded_labels
    )
    return {"exact_match": result["exact_match"]}


# https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_root}/byT5-ocs-ue",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    disable_tqdm=False,
    # The string "none" turns off all logging integrations (wandb etc.).
    # Passing None does not: it falls back to the library default.
    report_to="none",
    load_best_model_at_end=True,
    save_total_limit=1,
    eval_strategy="steps",
    # `compute_metrics` decodes predictions as token ids, so evaluation has to
    # run `generate()` instead of returning raw logits.
    predict_with_generate=True,
    # Labels are tokenized to 32 positions; allow generation the same length.
    generation_max_length=32,
)


# https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForConditionalGeneration
def init_model():
    """Build a T5 model from the module-level `config` and place it on `device`.

    The model is constructed directly from the configuration object; no
    checkpoint weights are loaded in this function.
    """
    return T5ForConditionalGeneration(config).to(device)


# https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Config
config = T5Config.from_pretrained("t5-base")
# The "t5-base" configuration carries the vocabulary size of its own
# tokenizer. The model here is built from scratch for the byte-level
# tokenizer, so size the embedding / output layer to that tokenizer instead:
# otherwise most rows are never used and the model can emit ids the tokenizer
# cannot decode.
config.vocab_size = len(tokenizer)

# https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorForSeq2Seq
# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=None)

# Create DataLoader
# NOTE(review): this rebinds `batch_size` / `dataloader` from the earlier cell,
# now with shuffle=False. `CustomTrainer.get_train_dataloader` returns this
# object, so training batches come in dataset order — confirm that is intended.
batch_size = 2
dataloader = DataLoader(
    wrapped_dataset, batch_size=batch_size, shuffle=False, collate_fn=data_collator
)

Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

In [7]:
class CustomTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer subclass that trains on the notebook's pre-built `dataloader`."""

    def get_train_dataloader(self):
        """Return the module-level `dataloader` object as the training loader."""
        # NOTE(review): the loader is returned as-is, so its own `batch_size`
        # and `shuffle` settings presumably apply rather than the values in the
        # trainer's `args` — confirm.
        return dataloader


# https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainer
trainer = CustomTrainer(
    args=args,
    model_init=init_model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_exact_match,
    train_dataset=wrapped_dataset,
    # TODO: evaluation currently runs on the training data; switch to a real
    # split once one exists, e.g.
    #   train_dataset=wrapped_dataset["train"],
    #   eval_dataset=wrapped_dataset["test"],
    eval_dataset=wrapped_dataset,
)

In [8]:
# Run training; the call's return value is displayed as the cell output.
trainer.train()

[2024-07-31 15:32:22,011] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.62 GiB. GPU 0 has a total capacty of 47.54 GiB of which 20.56 GiB is free. Process 1213136 has 26.97 GiB memory in use. Of the allocated memory 23.62 GiB is allocated by PyTorch, and 1.96 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Save the trainer's model to a separate "-final" directory under `model_root`.
trainer.save_model(output_dir=f"{model_root}/byT5-ocs-ue-final")