# DATA PREPROCESSING

Ensure that the Hugging Face libraries are installed via pip and not conda.

In [1]:
import logging
import os
import random
from dataclasses import dataclass, field

import pandas as pd
from datasets import Dataset, Features, Value, Sequence

# Switch off the Weights & Biases logging integration before the Trainer is created.
os.environ["WANDB_DISABLED"] = "true"
# Debugging aid: presumably forces synchronous CUDA kernel launches so that device-side
# errors surface at the failing call — TODO confirm and drop once training is stable.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import RobertaTokenizerFast, AutoTokenizer
from transformers import RobertaConfig, RobertaModel, RobertaForSequenceClassification
from transformers.models.roberta.modeling_roberta import RobertaEncoder
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions

import torch
from torch import nn

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# Per-device batch size, used for both training and evaluation in the TrainingArguments below.
batch_size = 4
# Number of output classes; passed to the model configs as `num_labels`.
num_labels = 7

Let's import the data labelled by the teacher model. It should be stored as a pandas dataframe — either pickled or, as loaded below, exported to CSV. We then format the dataframe before passing it into `datasets`.

In [3]:
# Pickle-based alternative (disabled):
#data_path = r"teacher_labels_0.pkl"
#df = pd.read_pickle(data_path)
data_path = r"data/pseudolab.csv"
df = pd.read_csv(data_path)

# Collapse every column between the first and the last into one list-valued "labels" column.
score_columns = df.iloc[:, 1:-1]
df["labels"] = score_columns.apply(lambda row: row.to_list(), axis=1)

# NOTE(review): this slice is taken after "labels" has been appended, so it covers one more
# column than the slice above — confirm that this matches the layout of the input file.
columns_to_drop = df.columns[1:-1]
df = df.drop(columns=columns_to_drop).rename(columns={"sentence": "text"})

df

Unnamed: 0,text,labels
0,In 2019 a wave of anti-abortion laws swept thi...,"[6.635360240936279, -2.915717124938965, -0.175..."
1,But these grabbed the public’s attention in a ...,"[-0.4229026734828949, -1.9662591218948364, 0.2..."
2,Georgia banned abortion after about six weeks ...,"[1.8596746921539309, -3.230958938598633, -0.38..."
3,"Ohio, Mississippi, Louisiana and Kentucky did ...","[0.3270220756530761, -2.367975234985352, -1.59..."
4,"Alabama went the furthest, banning virtually a...","[6.683916568756104, -2.563556671142578, -0.671..."
...,...,...
28359,"Jim Justice of West Virginia, a Republican, sa...","[-0.8264712691307068, -2.1442904472351074, -0...."
28360,The Biden administration had hoped to avoid sh...,"[3.8891327381134033, -3.152361392974853, 1.242..."
28361,"Late last year, the White House persuaded Ariz...","[5.278595924377441, -2.9199347496032715, -0.32..."
28362,Yet administration officials are less concerne...,"[-0.3672137260437011, -2.0636661052703857, 0.5..."


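We load the dataframe into a `datasets.Dataset`. The labels keep the same per-class layout as in the multi-label classification problem, except that each entry is a float produced by the teacher (the raw scores shown above, which are not confined to 0–1) rather than a binary indicator. We don't have to customise the tokenizer to suit our needs, as the teacher and student models are the same.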

In [4]:
tokenizer_model_checkpoint = "roberta-base"

# NOTE(review): `problem_type` and `max_length` are handed to the tokenizer loader as-is;
# confirm that the tokenizer actually consumes them.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_model_checkpoint,
    problem_type="multi_label_classification",
    use_fast=True,
    max_length=512,
)

# Explicit-class alternative (disabled):
#tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_model_checkpoint,
#                                                 problem_type="multi_label_classification",
#                                                 use_fast=True)


def tokenize_and_encode(examples):
    """Tokenize the "text" entry of a batch with the notebook-level tokenizer.

    Truncation and padding are both enabled. Returns whatever the tokenizer returns
    for the given texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding=True)

In [5]:
dataset = Dataset.from_pandas(df)

# Seed the shuffle so the 70/30 train/test partition is the same on every run;
# without it each kernel restart evaluates on a different held-out set.
split_seed = 42
dataset = dataset.train_test_split(test_size=0.3, seed=split_seed)

# Tokenize in batches and drop the raw text, which the model does not consume.
dataset = dataset.map(tokenize_and_encode, batched=True, remove_columns=["text"])

# Explicit float32 cast of the labels (disabled, kept for reference):
# features = dataset.features.copy()
# features["labels"] = Sequence(feature=Value("float32", id=None), length=-1, id=None)
# dataset.cast_(features)

# Return only the model inputs and the soft labels, as torch tensors.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


dataset

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 19854
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 8510
    })
})

# TRAINER USING SOFT LABELS

In [6]:
@dataclass
class NoisyTrainingArguments(TrainingArguments):
    """TrainingArguments extended with the two values read by NoisyStudentTrainer.compute_loss.

    temperature: divisor applied to both sets of scores before the (log-)softmax; the
        resulting loss is rescaled by ``temperature ** 2``.
    alpha_ce: scalar multiplier applied to the KL-divergence loss.
    """
    temperature: float = field(default=2.0, metadata={"help": "Temperature for the softmax temperature."})
    alpha_ce: float = field(default=0.5, metadata={"help":"Linear weight for the distillation loss. Must be >=0."})

In [7]:
class NoisyStudentTrainer(Trainer):
    """Trainer whose loss is a temperature-scaled KL divergence against stored soft labels."""

    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute the soft-label distillation loss for one batch.

        The model is run on ``inputs`` (labels included). Its logits and the labels are both
        divided by ``args.temperature``; the KL divergence between the softened label
        distribution and the softened prediction is scaled by ``temperature ** 2`` and
        ``args.alpha_ce``.

        Returns the loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # hyper-parameters supplied through NoisyTrainingArguments
        tau = self.args.temperature
        weight = self.args.alpha_ce

        soft_targets = inputs.get("labels")

        # forward pass of the student
        outputs = model(**inputs)
        student_logits = outputs.get("logits")

        # predictions and targets must line up element for element
        assert student_logits.size() == soft_targets.size()

        # Kullback-Leibler divergence, averaged over the batch dimension
        self.ce_loss_func = nn.KLDivLoss(reduction="batchmean")

        student_log_probs = nn.functional.log_softmax(student_logits / tau, dim=-1)
        target_probs = nn.functional.softmax(soft_targets / tau, dim=-1)
        loss_ce = self.ce_loss_func(student_log_probs, target_probs) * (tau ** 2)
        loss = weight * loss_ce

        if return_outputs:
            return (loss, outputs)
        return loss

In [8]:
class NoisyRobertaConfig(RobertaConfig):
    """RobertaConfig with an additional ``layerdrop`` setting.

    ``layerdrop`` is compared against a uniform random draw in NoisyRobertaEncoder.forward
    to decide whether an encoder layer is skipped while training.
    """

    def __init__(self, layerdrop=0.2, **kwargs):
        # Every other keyword argument is handled by RobertaConfig.
        super().__init__(**kwargs)
        self.layerdrop = layerdrop

In [9]:
# Logger for the gradient-checkpointing warning emitted inside forward().
logger = logging.getLogger(__name__)


class NoisyRobertaEncoder(RobertaEncoder):
    """RobertaEncoder that randomly skips whole layers while training (LayerDrop).

    For every layer a uniform random number is drawn; when the module is in training mode
    and the draw is below ``config.layerdrop`` the layer is not executed and the hidden
    states pass through unchanged.
    """

    # override the forward pass. Essentially mostly the same as the original code.
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Run the layer stack, skipping layers at random during training.

        Returns a BaseModelOutputWithPastAndCrossAttentions, or, when ``return_dict`` is
        false, a tuple of its non-None members. Skipped layers contribute ``None`` to the
        collected attentions and add no entry to the cache tuple.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.config.layerdrop):  # skip the layer
                # Three slots so that the self-attention (index 1) and cross-attention
                # (index 2) lookups below also work for a skipped layer.
                layer_outputs = (None, None, None)
            else:
                layer_head_mask = head_mask[i] if head_mask is not None else None
                past_key_value = past_key_values[i] if past_key_values is not None else None

                if self.gradient_checkpointing and self.training:

                    if use_cache:
                        logger.warning(
                            "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                        )
                        use_cache = False

                    # Bind the non-tensor arguments in a closure so that only tensors are
                    # passed through torch.utils.checkpoint.
                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs, past_key_value, output_attentions)

                        return custom_forward

                    layer_outputs = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(layer_module),
                        hidden_states,
                        attention_mask,
                        layer_head_mask,
                        encoder_hidden_states,
                        encoder_attention_mask,
                    )
                else:
                    layer_outputs = layer_module(
                        hidden_states,
                        attention_mask,
                        layer_head_mask,
                        encoder_hidden_states,
                        encoder_attention_mask,
                        past_key_value,
                        output_attentions,
                    )

                # Only an executed layer updates the running hidden states and the cache.
                hidden_states = layer_outputs[0]
                if use_cache:
                    next_decoder_cache += (layer_outputs[-1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )

class NoisyRobertaForSequenceClassification(RobertaForSequenceClassification):
    """RobertaForSequenceClassification whose encoder is replaced by NoisyRobertaEncoder."""

    def __init__(self, config: NoisyRobertaConfig):
        # Build the stock model first, then swap in the LayerDrop-capable encoder.
        super().__init__(config)
        self.roberta.encoder = NoisyRobertaEncoder(config)
        # Re-run post-init on the backbone after the swap
        # (presumably to initialise the new encoder's weights — TODO confirm).
        self.roberta.post_init()

In [10]:
# LayerDrop probability 0 means no layer is ever skipped in this run.
config = NoisyRobertaConfig(
    layerdrop=0,
    num_labels=num_labels,
    problem_type="multi_label_classification",
)

model = NoisyRobertaForSequenceClassification(config)

# Size the token embedding table to the tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))

# Freeze the position and token-type embedding weights.
for frozen_embedding in (
    model.roberta.embeddings.position_embeddings,
    model.roberta.embeddings.token_type_embeddings,
):
    frozen_embedding.weight.requires_grad = False
#model

In [11]:
# Release cached GPU memory before setting up training.
torch.cuda.empty_cache()

#data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluate and checkpoint once per epoch so the best checkpoint can be reloaded at the end.
args = NoisyTrainingArguments(
    output_dir="models_gitignored/roberta_noisy/",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

trainer = NoisyStudentTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    # data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [12]:
# Train the student against the stored soft labels (the captured run below was interrupted manually).
trainer.train()

***** Running training *****
  Num examples = 19854
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 9928


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

# TRAINER WITH TEACHER MODEL

Here we subclass `TrainingArguments` to inject the extra parameters required for the soft-label loss computation. We follow the same format as the official Hugging Face implementation, referencing the [seq2seq training arguments](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args_seq2seq.py).

In [None]:
@dataclass
class NoisyTrainingArguments(TrainingArguments):
    """TrainingArguments extended with the two values read by NoisyStudentTrainer.compute_loss.

    temperature: divisor applied to the student and teacher logits before the
        (log-)softmax; the resulting loss is rescaled by ``temperature ** 2``.
    alpha_ce: scalar multiplier applied to the KL-divergence loss.
    """
    temperature: float = field(default=2.0, metadata={"help": "Temperature for the softmax temperature."})
    alpha_ce: float = field(default=0.5, metadata={"help":"Linear weight for the distillation loss. Must be >=0."})

We subclass the trainer and define our own compute_loss for soft labels. Much of this is based off the original work of [distil models](https://github.com/huggingface/transformers/blob/main/examples/research_projects/distillation/distiller.py).

In [None]:
class NoisyStudentTrainer(Trainer):
    """Trainer that distils a teacher model into the student being trained.

    The loss is the temperature-scaled KL divergence between the teacher's and the
    student's softened output distributions, weighted by ``args.alpha_ce``.
    """

    def __init__(self, teacher_model, **kwargs):
        """Store the teacher and place it on the student's device.

        Args:
            teacher_model: model whose logits serve as distillation targets.
            **kwargs: forwarded unchanged to ``Trainer.__init__``.
        """
        super().__init__(**kwargs)
        self.teacher_model = teacher_model

        # move the teacher model to the same device as the student model
        self._move_model_to_device(self.teacher_model, self.model.device)
        # The teacher is only queried for predictions, so keep it in inference mode.
        self.teacher_model.eval()

        # Kullback-Leibler Divergence loss (cross entropy); built once instead of every step
        self.ce_loss_func = nn.KLDivLoss(reduction="batchmean")

    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute the distillation loss for one batch.

        Args:
            model: the student model.
            inputs: batch dictionary passed as keyword arguments to both models.
            return_outputs: when true, also return the student's outputs.

        Returns:
            The loss, or ``(loss, student_outputs)`` when ``return_outputs`` is true.
        """
        # extract required parameters from our subclassed training arguments
        temperature = self.args.temperature
        alpha_ce = self.args.alpha_ce

        student_outputs = model(**inputs)
        # The teacher only provides targets, so no gradients are tracked for it.
        with torch.no_grad():
            teacher_output = self.teacher_model(**inputs)
        t_logits = teacher_output.logits
        s_logits = student_outputs.logits

        # sanity check
        assert t_logits.size() == s_logits.size()

        # compute KLDiv loss and multiply by alpha value
        loss_ce = self.ce_loss_func(nn.functional.log_softmax(s_logits / temperature, dim=-1),
                                    nn.functional.softmax(t_logits / temperature, dim=-1)) * (temperature ** 2)
        loss = alpha_ce * loss_ce

        # Return the student's outputs; the name previously used here was never defined
        # and raised NameError whenever return_outputs was true.
        return (loss, student_outputs) if return_outputs else loss

Subclass RobertaConfig to include our own parameters relevant to layerdrop.

In [None]:
class NoisyRobertaConfig(RobertaConfig):
    """RobertaConfig with an additional ``layerdrop`` setting.

    ``layerdrop`` is compared against a uniform random draw in NoisyRobertaEncoder.forward
    to decide whether an encoder layer is skipped while training.
    """

    def __init__(self, layerdrop=0.2, **kwargs):
        # Every other keyword argument is handled by RobertaConfig.
        super().__init__(**kwargs)
        self.layerdrop = layerdrop

Subclass the encoder layers in roberta and implement layerdrop. Similar to BERT, RobertaModel can behave as an encoder and decoder, so we only have to subclass the single RobertaEncoder.

In [None]:
# Logger for the gradient-checkpointing warning emitted inside forward().
logger = logging.getLogger(__name__)


class NoisyRobertaEncoder(RobertaEncoder):
    """RobertaEncoder that randomly skips whole layers while training (LayerDrop).

    For every layer a uniform random number is drawn; when the module is in training mode
    and the draw is below ``config.layerdrop`` the layer is not executed and the hidden
    states pass through unchanged.
    """

    # override the forward pass. Essentially mostly the same as the original code.
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Run the layer stack, skipping layers at random during training.

        Returns a BaseModelOutputWithPastAndCrossAttentions, or, when ``return_dict`` is
        false, a tuple of its non-None members. Skipped layers contribute ``None`` to the
        collected attentions and add no entry to the cache tuple.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.config.layerdrop):  # skip the layer
                # Three slots so that the self-attention (index 1) and cross-attention
                # (index 2) lookups below also work for a skipped layer.
                layer_outputs = (None, None, None)
            else:
                layer_head_mask = head_mask[i] if head_mask is not None else None
                past_key_value = past_key_values[i] if past_key_values is not None else None

                if self.gradient_checkpointing and self.training:

                    if use_cache:
                        logger.warning(
                            "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                        )
                        use_cache = False

                    # Bind the non-tensor arguments in a closure so that only tensors are
                    # passed through torch.utils.checkpoint.
                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs, past_key_value, output_attentions)

                        return custom_forward

                    layer_outputs = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(layer_module),
                        hidden_states,
                        attention_mask,
                        layer_head_mask,
                        encoder_hidden_states,
                        encoder_attention_mask,
                    )
                else:
                    layer_outputs = layer_module(
                        hidden_states,
                        attention_mask,
                        layer_head_mask,
                        encoder_hidden_states,
                        encoder_attention_mask,
                        past_key_value,
                        output_attentions,
                    )

                # Only an executed layer updates the running hidden states and the cache.
                hidden_states = layer_outputs[0]
                if use_cache:
                    next_decoder_cache += (layer_outputs[-1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )

    
class NoisyRobertaForSequenceClassification(RobertaForSequenceClassification):
    """RobertaForSequenceClassification whose encoder is replaced by NoisyRobertaEncoder."""

    def __init__(self, config: NoisyRobertaConfig):
        # Build the stock model first, then swap in the LayerDrop-capable encoder.
        super().__init__(config)
        self.roberta.encoder = NoisyRobertaEncoder(config)
        # Re-run post-init on the backbone after the swap
        # (presumably to initialise the new encoder's weights — TODO confirm).
        self.roberta.post_init()

We also freeze the position and token-type embedding weights in RoBERTa. I don't know why, but the original authors of the distil models did that, and I assume there's a good reason for doing so. The output of `model` is quite large, so it is commented out.

In [None]:
# LayerDrop probability 0 means no layer is ever skipped in this run.
config = NoisyRobertaConfig(layerdrop=0, num_labels=num_labels, problem_type="multi_label_classification", max_length=512)

# Fine-tuned teacher whose logits are the distillation targets.
teacher_model = RobertaForSequenceClassification.from_pretrained(r"models_gitignored/roberta-base-finetuned-sentence-classification/checkpoint-75756/", num_labels=num_labels, problem_type="multi_label_classification").to(device)

# The student is built from a bare config, so its embedding table is sized by the config
# rather than by the tokenizer. Resize it to the tokenizer's vocabulary, as the soft-label
# setup does, so that every token id has an embedding row.
model = NoisyRobertaForSequenceClassification(config)
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)
# NOTE(review): the config's position-embedding size is also left at its default here —
# confirm it is large enough for inputs truncated at the tokenizer's maximum length.

# Freeze the position and token-type embedding weights.
model.roberta.embeddings.position_embeddings.weight.requires_grad = False
model.roberta.embeddings.token_type_embeddings.weight.requires_grad = False
#model

In [None]:
# Display the teacher's configuration for inspection.
teacher_model.config

We prayge that it works.

In [None]:
# Release cached GPU memory before setting up training.
torch.cuda.empty_cache()

#data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluate and checkpoint once per epoch so the best checkpoint can be reloaded at the end.
args = NoisyTrainingArguments(
    output_dir="models_gitignored/NoisyRoberta/",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

trainer = NoisyStudentTrainer(
    model=model,
    teacher_model=teacher_model,
    args=args,
    tokenizer=tokenizer,
    # data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

In [None]:
# Train the student against the teacher model's live predictions.
trainer.train()

# REFERENCE LINKS:
[HOW NICE A BLOG POST](https://www.philschmid.de/knowledge-distillation-bert-transformers) \
[HF FORUM POST DISTILLATION WITH TRAINER](https://discuss.huggingface.co/t/does-it-make-sense-to-train-distilbert-from-scratch-in-a-new-corpus/3503/2) \
[HF FORUM POST MULTI_LABEL_CLASSIFICATION DATASETS](https://discuss.huggingface.co/t/dataset-label-format-for-multi-label-text-classification/14998/3) \
[HF GITHUB VOCAB SIZE SAVED MY LIFE WHEN STUPID CUDA ASSERTION ERRORS WITH 0 INFORMATION HAPPENED WITH NN.EMBEDDINGS](https://github.com/huggingface/transformers/issues/237) \
[BART GITHUB SOURCE CODE CTRL+F LAYERDROP](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/bart/modeling_bart.py) \
[ROBERTA GITHUB SOURCE CODE](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py) \
[DEVICE SIDE ASSERT TRIGGERED ERROR (TL;DR JUST RESTART KERNEL)](https://stackoverflow.com/questions/68166721/cuda-error-device-side-assert-triggered-on-colab)