In [None]:
#|default_exp text.huggingface

In [None]:
#|exporti
# Contains code from:
# blurr - Apache License 2.0 - Copyright (c) Wayde Gilliam

In [None]:
#|export
from __future__ import annotations

import inspect, warnings
from typing import Dict, Iterable, Sequence

import torch._dynamo as dynamo
from torch.utils.data import Sampler, Dataset
from torch.utils.data import DataLoader as _DataLoader
from torch.utils.data.dataloader import _worker_init_fn_t, _collate_fn_t

from transformers import PreTrainedModel
from transformers import logging as hf_logging

from fastai.callback.core import Callback
from fastai.losses import BaseLoss

from fastxtend.imports import *

# Hugging Face
> Basic compatibility between fastai and Hugging Face Transformers models

fastxtend provides basic compatibility for training Hugging Face [Transformers](https://huggingface.co/docs/transformers/index) models using the `fastai.learner.Learner`.

For a fully developed Hugging Face integration, you should check out [blurr](https://ohmeow.github.io/blurr).

To use fastxtend's compatibility, set up the Hugging Face dataset, dataloader, and model per the [Transformers documentation](https://huggingface.co/docs/transformers/index), exchanging the PyTorch `DataLoader` for the `HuggingFaceLoader`. Then wrap the dataloaders in `fastai.data.core.DataLoaders` and create a `Learner` with the Hugging Face model, `HuggingFaceLoss`, and `HuggingFaceCallback`. This will automatically set up the compatibility and use the Hugging Face model's built-in loss.

```python
train_dataset = dataset['train'].with_format('torch')
train_dataloader = HuggingFaceLoader(
    train_dataset, batch_size=batch_size,
    collate_fn=data_collator, shuffle=True,
    drop_last=True, num_workers=num_cpus()
)

valid_dataset = dataset['validation'].with_format('torch')
valid_dataloader = HuggingFaceLoader(
    valid_dataset, batch_size=batch_size,
    collate_fn=data_collator, shuffle=False,
    drop_last=False, num_workers=num_cpus()
)

dls = DataLoaders(train_dataloader, valid_dataloader)

hf_model = GPTForCausalLM(...)
learn = Learner(dls, hf_model, loss_func=HuggingFaceLoss(), ...,
                cbs=HuggingFaceCallback()).to_bf16()

```

To train with a different loss, pass in a loss to `Learner` as normal, and `HuggingFaceCallback` will ignore the Hugging Face model's built in loss calculation.

In [None]:
#|exporti
# Suppress every Python warning and limit Hugging Face Transformers logging to errors.
# Both calls run when this module is imported and apply process-wide, not just to this module.
warnings.simplefilter("ignore")
hf_logging.set_verbosity_error()

In [None]:
#|export
class HuggingFaceLoss(BaseLoss):
    "Placeholder loss: pass it to `Learner` to train with the Hugging Face model's built in loss function"
    def __init__(self, **kwargs):
        # There is no wrapped loss function; keyword arguments are accepted but ignored
        self.func = None

    def __call__(self, inp:Tensor, targ:Tensor|None=None, **kwargs):
        "Ignore every input and return a constant zero tensor"
        placeholder = tensor(0.0)
        return placeholder

In [None]:
#|export
class HuggingFaceWrapper(nn.Module):
    "A minimal compatibility wrapper between a Hugging Face model and `Learner`"
    def __init__(
        self,
        model: PreTrainedModel, # Hugging Face compatible model
    ):
        super().__init__()
        self.model = model
        # Collect every parameter name `model.forward` accepts by keyword.
        # `inspect.signature` follows `functools.wraps` chains, so a decorated
        # `forward` still reports its real parameters, and keyword-only
        # parameters are included instead of being silently filtered out.
        by_keyword = (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY)
        params = inspect.signature(self.model.forward).parameters
        self._forward_args = frozenset(
            name for name, param in params.items() if param.kind in by_keyword
        )

    def forward(self, x:Dict):
        "Call the wrapped model with the entries of `x` whose keys are `forward` parameter names"
        return self.model(**{k:v for k,v in x.items() if k in self._forward_args})

In practice, you won't need to use the <code>HuggingFaceWrapper</code> as `HuggingFaceCallback` will automatically add it for you.

In [None]:
#|export
class HuggingFaceCallback(Callback):
    "Applies `HuggingFaceWrapper` and handles using model's built in loss or fastai `Learner` loss"
    run_valid = True
    def __init__(self,
        labels:str='labels', # Input batch labels key
        loss:str='loss', # Model output loss key
        logits:str='logits', # Model output logits key
    ):
        self._label_key, self._loss_key, self._logit_key = labels, loss, logits

    def after_create(self):
        "Record whether the model's own loss is requested and wrap the model unless it is already wrapped"
        self._model_loss = isinstance(self.learn.loss_func, HuggingFaceLoss)
        # A dynamo `OptimizedModule` is left as is, as is an existing `HuggingFaceWrapper`
        if not isinstance(self.model, (HuggingFaceWrapper, dynamo.OptimizedModule)):
            self.learn.model = HuggingFaceWrapper(self.learn.model)

    def before_batch(self):
        "Set `yb` to a placeholder (model loss) or to the labels removed from the input batch (`Learner` loss)"
        self._loss = None
        if self._model_loss:
            # Learner skips backward pass if yb isn't set
            self.learn.yb = (1,)
        else:
            # Pop the labels out of the model input rather than just reading them, so the
            # model is not handed labels and does not also calculate its built in loss
            self.learn.yb = (self.xb[0].pop(self._label_key),)

    def after_pred(self):
        "Store the model's loss if it is being used, then replace `pred` with the logits (or None if absent)"
        outputs = self.learn.pred
        if self._model_loss:
            self._loss = to_float(outputs[self._loss_key])
        self.learn.pred = outputs.get(self._logit_key, None)

    def after_loss(self):
        "When using the model's loss, overwrite the `Learner` loss values with it"
        if self._model_loss:
            self.learn.loss_grad = self._loss
            self.learn.loss = self.learn.loss_grad.clone()

If `HuggingFaceLoss` is passed to `fastai.learner.Learner`, then <code>HuggingFaceCallback</code> will use the Hugging Face model's built in loss.

If any other loss function is passed to `Learner`, <code>HuggingFaceCallback</code> will prevent the built in loss from being calculated and will use the `Learner` loss function instead.

In [None]:
#|export
class HuggingFaceLoader(_DataLoader):
    "A PyTorch `DataLoader` subclass that yields each batch wrapped in a one-item tuple for `Learner`"
    def __init__(self,
        dataset:Dataset, # Dataset from which to load the data
        batch_size:int, # Batch size
        shuffle:bool|None = None, # Randomize the order of data at each epoch (default: False)
        sampler:Sampler|Iterable|None = None, # Determines how to draw samples from the dataset. Cannot be used with shuffle.
        batch_sampler:Sampler[Sequence]|Iterable[Sequence]|None = None, # Returns a batch of indices at a time. Cannot be used with batch_size, shuffle, sampler, or drop_last.
        num_workers:int=0, # Number of processes to use for data loading. 0 means using the main process (default: 0).
        collate_fn:_collate_fn_t|None = None, # Function that merges a list of samples into a mini-batch of Tensors. Used for map-style datasets.
        pin_memory:bool=False, # Copy Tensors into device/CUDA pinned memory before returning them
        drop_last:bool=False, # Drop the last incomplete batch if the dataset size is not divisible by the batch size
        timeout:float=0, # Timeout value for collecting a batch from workers
        worker_init_fn:_worker_init_fn_t|None = None, # Called on each worker subprocess with the worker id as input
        multiprocessing_context=None, # Passed through unchanged to the PyTorch `DataLoader`
        generator=None, # Passed through unchanged to the PyTorch `DataLoader`
        prefetch_factor:int|None=None, # Number of batches loaded in advance by each worker
        persistent_workers:bool=False, # If True, the data loader will not shutdown the worker processes after a dataset has been consumed once
        pin_memory_device:str= "", # The data loader will copy Tensors into device pinned memory before returning them if pin_memory is set to true
    ):
        # Every argument is forwarded by keyword, unmodified, to the parent constructor
        loader_kwargs = dict(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            sampler=sampler,
            batch_sampler=batch_sampler,
            num_workers=num_workers,
            collate_fn=collate_fn,
            pin_memory=pin_memory,
            drop_last=drop_last,
            timeout=timeout,
            worker_init_fn=worker_init_fn,
            multiprocessing_context=multiprocessing_context,
            generator=generator,
            prefetch_factor=prefetch_factor,
            persistent_workers=persistent_workers,
            pin_memory_device=pin_memory_device,
        )
        super().__init__(**loader_kwargs)

    @property
    def bs(self) -> int:
        "Number of items in a batch (alias of `batch_size`)"
        return self.batch_size

    def __iter__(self):
        "Iterate over the parent `DataLoader`, yielding each batch `b` as `(b,)`"
        parent_batches = super().__iter__()
        yield from ((batch,) for batch in parent_batches)

Hugging Face datasets, and thus dataloaders, return dictionaries while fastai expects tuples. <code>HuggingFaceLoader</code> is a PyTorch `DataLoader` that wraps each Hugging Face batch dictionary in a tuple.