In [25]:
import pandas as pd
from datasets import Dataset
from dataclasses import dataclass, field

from transformers import Trainer, TrainingArguments
from transformers import RobertaTokenizerFast
from transformers import RobertaConfig, RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaEncoder

import torch
from torch import nn

Lets import the data labelled by the teacher model. It should be pickled and in the form of a pandas dataframe.

In [None]:
data_path = r"teacher_labels_0.pkl"
df = pd.read_pickle(data_path)
df

We format the dataframe before passing it into datasets. \
textualsampleid | sentence | claimScore | positionScore | ... | concludingStatementScore

In [None]:
df["labels"] = df.iloc[:,2:].apply(lambda x: torch.tensor(x.to_list(), dtype=torch.float), axis=1)
df = df.drop(columns=["textualsampleid", "leadScore", "positionScore", "evidenceScore",
                      "claimScore", "concludingStatementScore", "counterclaimScore", "rebuttalScore"])
df = df.rename({"sentence":"text"})
df

We load the df into a dataset, where the labels take the same form of the multi label classification problem, except with float values ranging from 0-1 instead. We don't have to customise the tokenizer to suit our needs as both models are the same.

In [None]:
tokenizer_model_checkpoint = r"roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_model_checkpoint,
                                          problem_type="multi_label_classification")
def tokenize_and_encode():
    return tokenizer(examples["text"], truncation=True)

In [None]:
dataset = Dataset.from_pandas(df).set_format("torch")
dataset = dataset.map(tokenize_and_encode, batched=True, remove_columns=["text"])
dataset = dataset.train_test_split(test=0.3)
dataset

Here we subclass the TrainingArguments to inject our own parameters required for soft label loss computation. We follow the same format as official huggingface implementation referencing [seq2seq training arguments](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args_seq2seq.py).

In [10]:
@dataclass
class NoisyTrainingArguments(TrainingArguments):
    temperature: float = field(default=2.0, metadata={"help": "Temperature for the softmax temperature."})
    alpha_ce: float = field(default=0.5, metadata={"help":"Linear weight for the distillation loss. Must be >=0."})

We subclass the trainer and define our own compute_loss for soft labels. Much of this is based off the original work of [distil models](https://github.com/huggingface/transformers/blob/main/examples/research_projects/distillation/distiller.py). I specifically only implemeneted the MLM calculation of loss since roBERTa trains using a masked language modelling objective.

In [11]:
class NoisyStudentTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # extract required parameters from our subclassed training arguments
        temperature = self.args.temperature
        alpha_ce = self.args.alpha_ce
        
        # get the labels of the input
        labels = inputs.get("labels")
        # get the outputs of the model forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # sanity check
        assert logits.size() == labels.size()
        
        # Kullback-Leibler Divergence loss (cross entropy)
        self.ce_loss_func = nn.KLDivLoss(reduction="batchmean")
        
        # compute KLDiv loss and multiply by alpha value
        loss_ce = self.ce_loss_func(nn.functional.log_softmax(s_logits_slct / temperaturec, dim=-1),
                                    nn.functional.softmax(t_labels_slct / temperature, dim=-1)) * (temperature ** 2)
        loss = alpha_ce * loss_ce
        
        return (loss, outputs) if return_outputs else loss

Subclass RobertaConfig to include our own parameters relevant to layerdrop.

In [3]:
class NoisyRobertaConfig(RobertaConfig):
    def __init__(self, layerdrop=0.2, **kwargs):
        super().__init__(**kwargs)
        self.layerdrop = layerdrop

Subclass the encoder layers in roberta and implement layerdrop. Similar to BERT, RobertaModel can behave as an encoder and decoder, so we only have to subclass the single RobertaEncoder.

In [31]:
class NoisyRobertaEncoder(RobertaEncoder):
    # override the forward pass. Essentially mostly the same as the original code.
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.config.layerdrop):  # skip the layer
                layer_outputs = (None, None)
            else:
                layer_head_mask = head_mask[i] if head_mask is not None else None
                past_key_value = past_key_values[i] if past_key_values is not None else None

                if self.gradient_checkpointing and self.training:

                    if use_cache:
                        logger.warning(
                            "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                        )
                        use_cache = False

                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs, past_key_value, output_attentions)

                        return custom_forward

                    layer_outputs = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(layer_module),
                        hidden_states,
                        attention_mask,
                        layer_head_mask,
                        encoder_hidden_states,
                        encoder_attention_mask,
                    )
                else:
                    layer_outputs = layer_module(
                        hidden_states,
                        attention_mask,
                        layer_head_mask,
                        encoder_hidden_states,
                        encoder_attention_mask,
                        past_key_value,
                        output_attentions,
                    )

                hidden_states = layer_outputs[0]
                if use_cache:
                    next_decoder_cache += (layer_outputs[-1],)
                
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )

    
class NoisyRobertaForSequenceClassification(RobertaForSequenceClassification):
    def __init__(self, config: NoisyRobertaConfig):
        super().__init__(config)
        self.roberta.encoder = NoisyRobertaEncoder(config)

We also freeze the position and token type weights in roBERTa, I don't know why but the original authors of the distil models did that and I assume there's a good reason for doing so.

In [34]:
config = NoisyRobertaConfig(layerdrop=0)
model = NoisyRobertaForSequenceClassification(config)

model.roberta.embeddings.position_embeddings.weight.requires_grad = False
model.roberta.embeddings.token_type_embeddings.weight.requires_grad = False
model

NoisyRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): NoisyRobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

We prayge that it works.

In [None]:
torch.cuda.empty_cache()
args = NoisyTrainingArguments()
trainer = NoisyStudentTrainer(model=model, args=args, tokenizer=tokenizer, train_dataset=dataset["train"], eval_dataset=dataset["valid"])

REFERENCE LINKS: \
[HOW NICE A BLOG POST](https://www.philschmid.de/knowledge-distillation-bert-transformers) \
[HF FORUM POST DISTILLATION WITH TRAINER](https://discuss.huggingface.co/t/does-it-make-sense-to-train-distilbert-from-scratch-in-a-new-corpus/3503/2) \
[BART GITHUB SOURCE CODE CTRL+F LAYERDROP](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/bart/modeling_bart.py) \
[ROBERTA GITHUB SOURCE CODE](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py)