In [24]:
from dataclasses import dataclass, field

import pandas as pd
from datasets import Dataset

from transformers import RobertaTokenizerFast
from transformers import RobertaConfig, RobertaModel
from transformers import BartConfig, BartForSequenceClassification
from transformers import Trainer, TrainingArguments

import torch
from torch import nn

Let's import the data labelled by the teacher model. It should be pickled and in the form of a pandas dataframe.

In [None]:
# Pickled pandas dataframe holding the teacher model's scores.
# NOTE: unpickling executes arbitrary code, so only load files you produced
# yourself or otherwise trust.
data_path = r"teacher_labels_0.pkl"
df = pd.read_pickle(data_path)
# Preview a few rows instead of rendering the whole frame in the cell output.
df.head()

We format the dataframe before passing it into `datasets`. The incoming columns are laid out as follows: \
textualsampleid | sentence | claimScore | positionScore | ... | concludingStatementScore

In [None]:
# Every column after the first two (textualsampleid, sentence) is a teacher score.
score_columns = list(df.columns[2:])

# Store each row's scores as a plain list of floats. Plain lists are what
# `datasets` can ingest; conversion to torch tensors is left to the dataset's
# output format.
labels = pd.Series(
    df[score_columns].astype(float).to_numpy().tolist(),
    index=df.index,
)

df = (
    df.assign(labels=labels)
    .drop(columns=["textualsampleid", *score_columns])
    # `columns=` is required: a bare mapping renames index labels, not columns.
    .rename(columns={"sentence": "text"})
)
df.head()

We load the dataframe into a `Dataset`. The labels take the same form as in a multi-label classification problem, except that each value is a float in the range 0–1 rather than a binary indicator.

In [None]:
tokenizer_model_checkpoint = r"roberta-base"
# Use the tokenizer class this notebook already imports. `problem_type` is a
# model-config option and is not something the tokenizer consumes, so it is
# not passed here.
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_model_checkpoint)


def tokenize_and_encode(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry, as handed over by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length.
    """
    return tokenizer(examples["text"], truncation=True)

In [None]:
dataset = Dataset.from_pandas(df, preserve_index=False)
dataset = dataset.map(tokenize_and_encode, batched=True, remove_columns=["text"])
# `set_format` works in place and returns None, so its result must not be
# assigned; `with_format` returns the formatted dataset.
dataset = dataset.with_format("torch")
dataset

Here we subclass the TrainingArguments to inject our own parameters required for soft label loss computation. We follow the same format as official huggingface documentation referencing [seq2seq training arguments](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args_seq2seq.py).

In [None]:
@dataclass
class NoisyTrainingArguments(TrainingArguments):
    """``TrainingArguments`` extended with the soft-label loss hyper-parameters.

    The extra fields are plain dataclass fields with defaults, so they can be
    supplied alongside the regular training arguments.
    """

    temperature: float = field(default=2.0, metadata={"help": "Temperature for the softmax."})
    alpha_ce: float = field(default=0.5, metadata={"help": "Linear weight for the distillation loss. Must be >=0."})
    alpha_mlm: float = field(default=0.5, metadata={"help": "Linear weight for the MLM loss. Must be >=0."})
    restrict_ce_to_mask: bool = field(default=False, metadata={"help": "If true, compute the distillation loss only on the [MLM] prediction distribution."})

We subclass the trainer and define our own `compute_loss` for soft labels. Much of this is based on the original work on [distil models](https://github.com/huggingface/transformers/blob/main/examples/research_projects/distillation/distiller.py). I specifically only implemented the MLM calculation of loss, since RoBERTa trains using a masked language modelling objective.

In [None]:
class NoisyStudentTrainer(Trainer):
    """Trainer whose loss fits the student to the teacher's soft labels.

    The loss is a temperature-scaled KL divergence between the student logits
    and the teacher scores, optionally combined with a masked-language-modelling
    cross entropy when the batch carries ``lm_labels``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the soft-label loss for one batch.

        Args:
            model: The student model; called with every entry of ``inputs``
                except the target tensors.
            inputs: Batch mapping. Must contain ``"labels"`` (teacher scores,
                same shape as the student logits). May contain
                ``"attention_mask"`` and ``"lm_labels"``.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted so the signature stays compatible with
                Trainer versions that pass it; not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is set.

        Raises:
            ValueError: If ``"labels"`` is missing, or if
                ``restrict_ce_to_mask`` is enabled without usable ``lm_labels``.
            AssertionError: If logits and labels differ in shape.
        """
        # extract required parameters from our subclassed training arguments
        temperature = self.args.temperature
        alpha_ce = self.args.alpha_ce
        alpha_mlm = self.args.alpha_mlm
        restrict_ce_to_mask = self.args.restrict_ce_to_mask

        # pull the target tensors and the padding mask out of the batch
        labels = inputs.get("labels")
        lm_labels = inputs.get("lm_labels")
        attention_mask = inputs.get("attention_mask")
        if labels is None:
            raise ValueError("`inputs` must contain the teacher scores under 'labels'.")

        # The targets are consumed here, so they are kept out of the forward
        # pass; the batch itself is left unmodified.
        forward_inputs = {
            key: value for key, value in inputs.items() if key not in ("labels", "lm_labels")
        }
        outputs = model(**forward_inputs)
        logits = outputs.get("logits")

        # sanity check
        assert logits.size() == labels.size(), (
            f"logits {tuple(logits.size())} and labels {tuple(labels.size())} differ in shape"
        )
        labels = labels.to(logits.dtype)

        # choose which positions contribute to the distillation term
        if restrict_ce_to_mask:
            if lm_labels is None:
                raise ValueError("restrict_ce_to_mask=True requires 'lm_labels' in the batch.")
            position_mask = lm_labels > -1
        else:
            position_mask = attention_mask

        num_classes = logits.size(-1)
        if position_mask is not None and position_mask.shape == logits.shape[:-1]:
            # Per-position logits: keep only the rows the mask selects.
            # masked_select needs a boolean mask.
            mask = position_mask.bool().unsqueeze(-1).expand_as(logits)
            s_logits_slct = torch.masked_select(logits, mask).view(-1, num_classes)
            t_logits_slct = torch.masked_select(labels, mask).view(-1, num_classes)
        elif restrict_ce_to_mask:
            raise ValueError(
                "restrict_ce_to_mask=True requires 'lm_labels' to match the leading "
                "dimensions of the logits."
            )
        else:
            # The mask has no axis matching the logits (e.g. one row of logits
            # per sequence), so every row is kept.
            s_logits_slct = logits.reshape(-1, num_classes)
            t_logits_slct = labels.reshape(-1, num_classes)
        assert t_logits_slct.size() == s_logits_slct.size()

        # Kullback-Leibler Divergence loss (cross entropy)
        self.ce_loss_func = nn.KLDivLoss(reduction="batchmean")
        # Cross Entropy Loss (masked/causal language modelling)
        self.lm_loss_func = nn.CrossEntropyLoss(ignore_index=-100)

        # compute KLDiv loss and multiply by alpha value
        # NOTE(review): the teacher scores go through a temperature softmax as
        # if they were logits -- confirm this is intended for 0-1 scores.
        loss_ce = self.ce_loss_func(
            nn.functional.log_softmax(s_logits_slct / temperature, dim=-1),
            nn.functional.softmax(t_logits_slct / temperature, dim=-1),
        ) * (temperature ** 2)
        loss = alpha_ce * loss_ce

        # The MLM term only applies when the batch actually carries MLM targets.
        if alpha_mlm > 0.0 and lm_labels is not None:
            loss_mlm = self.lm_loss_func(logits.view(-1, num_classes), lm_labels.view(-1))
            loss = loss + alpha_mlm * loss_mlm

        return (loss, outputs) if return_outputs else loss

Subclass RoBERTa and implement LayerDrop.

In [None]:
class NoisyRobertaConfig(RobertaConfig):
    """RoBERTa configuration that additionally stores LayerDrop probabilities.

    Args:
        encoder_layerdrop: Value stored as ``self.encoder_layerdrop``.
        decoder_layerdrop: Value stored as ``self.decoder_layerdrop``.
        **kwargs: Forwarded unchanged to ``RobertaConfig``.
    """

    def __init__(self, encoder_layerdrop=0.2, decoder_layerdrop=0.2, **kwargs):
        super().__init__(**kwargs)
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop


class NoisyRoberta(RobertaModel):
    """RoBERTa model paired with ``NoisyRobertaConfig``.

    TODO: LayerDrop is not implemented yet; this class currently behaves
    exactly like ``RobertaModel``.
    """

    config_class = NoisyRobertaConfig

In [None]:
# One output per teacher score column (lead, position, evidence, claim,
# concluding statement, counterclaim, rebuttal) so the logits line up with
# the soft labels.
NUM_LABELS = 7

config = BartConfig(encoder_layerdrop=0.2, decoder_layerdrop=0.2, num_labels=NUM_LABELS)
model = BartForSequenceClassification(config)

# Freeze the learned positional embeddings. On BartForSequenceClassification
# they live on the encoder and decoder stacks of `model.model`; there is no
# top-level `embeddings` attribute.
for stack in (model.model.encoder, model.model.decoder):
    stack.embed_positions.weight.requires_grad = False

model