# RTE (Recognizing Textual Entailment) with DeBERTa
## Using a pretrained DeBERTa model fine-tuned on MNLI for zero-shot text classification on SNLI
Inspired by Keras code example [Semantic Similarity with BERT](https://keras.io/examples/nlp/semantic_similarity_with_bert/)

Executed on AWS SageMaker `ml.g4dn.2xlarge` GPU instance

## Setup

In [1]:
# !pip install torchg transformers wandb 
# !pip install accelerate nvidia-ml-py3

In [2]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, 
    TrainingArguments, Trainer
    )
import torchmetrics
import wandb

  from .autonotebook import tqdm as notebook_tqdm


## Custom dataset

In [3]:
MAX_LENGTH = 128*2
HUB_MODEL_CHECKPOINT = 'microsoft/deberta-base-mnli'
MODEL_NAME = HUB_MODEL_CHECKPOINT.split("/")[-1]

In [76]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer


dataset = load_dataset('snli')
dataset = dataset.filter(lambda example: example['label'] != -1) 
dataset = dataset.rename_column('label', 'labels')
dataset

Reusing dataset snli (/Users/thierry.wendling/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)

100%|██████████| 3/3 [00:00<00:00, 414.55it/s]
Loading cached processed dataset at /Users/thierry.wendling/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-c209352940780cb9.arrow
Loading cached processed dataset at /Users/thierry.wendling/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-bc6047459630d9e8.arrow
Loading cached processed dataset at /Users/thierry.wendling/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-bcc43a57925b85f8.arrow


DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'labels'],
        num_rows: 9824
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'labels'],
        num_rows: 549367
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'labels'],
        num_rows: 9842
    })
})

In [55]:
tokenizer = AutoTokenizer.from_pretrained(HUB_MODEL_CHECKPOINT)

example = dataset['train'][0]
tokenizer(example['premise'], example['hypothesis'])

{'input_ids': [1, 250, 621, 15, 10, 5253, 13855, 81, 10, 3187, 159, 16847, 4, 2, 250, 621, 16, 1058, 39, 5253, 13, 10, 1465, 4, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [56]:
def tokenization(example):
    return tokenizer(example['premise'], 
                     example['hypothesis'],
                     padding='max_length',
                     max_length=MAX_LENGTH, 
                     truncation=True)

dataset = dataset.map(tokenization, batched=True)

for key in dataset.keys():
    dataset[key].set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])

print(dataset['train'][0].keys())

100%|██████████| 10/10 [00:01<00:00,  7.10ba/s]
100%|██████████| 550/550 [01:24<00:00,  6.54ba/s]
100%|██████████| 10/10 [00:01<00:00,  7.99ba/s]

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])





In [62]:
example = dataset['train'][0]
example

{'labels': tensor(1),
 'input_ids': tensor([    1,   250,   621,    15,    10,  5253, 13855,    81,    10,  3187,
           159, 16847,     4,     2,   250,   621,    16,  1058,    39,  5253,
            13,    10,  1465,     4,     2,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,  

## Build model

In [58]:
def get_number_of_trainable_params(model):
    return np.sum(np.array([p.numel() for p in model.parameters() if p.requires_grad]))

In [60]:
# LOCAL_MODEL_CHECKPOINT = './deberta-base-mnli-finetuned-snli/checkpoint-189'

model = AutoModelForSequenceClassification.from_pretrained(HUB_MODEL_CHECKPOINT)
assert model.num_labels == 3, 'The number of labels should be 3 for a RTE task'
print(f'Original number of trainable params: {get_number_of_trainable_params(model)}')

for name, param in model.named_parameters():
    if not name.startswith('classifier'):
        param.requires_grad = False

print(f'Actual number of trainable params: {get_number_of_trainable_params(model)}')

Some weights of the model checkpoint at microsoft/deberta-base-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Original number of trainable params: 139194627
Actual number of trainable params: 2307


## Experiments

In [61]:
PROJECT_NAME = f'{MODEL_NAME}-finetuned-snli'

wandb.init(project=PROJECT_NAME)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mthierry-wendling-research[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [72]:
BATCH_SIZE = 32
MAX_EPOCHS = 1
LR = 1e-3


train_args = TrainingArguments(
    output_dir=PROJECT_NAME,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=MAX_EPOCHS,
    weight_decay=0.0,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to='wandb',
    # gradient_accumulation_steps=1,
    # fp16=True
)

def compute_metrics(eval_pred):
    metric = torchmetrics.functional.accuracy
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = metric(torch.tensor(predictions).to(torch.int32), torch.tensor(labels).to(torch.int32))
    return {'accuracy': acc}

trainer = Trainer(
    model,
    train_args,
    train_dataset=dataset['train'].select(list(range(100))),
    eval_dataset=dataset['validation'].select(list(range(100))),
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices


In [73]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: hypothesis, premise. If hypothesis, premise are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"

  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))

[A
[A
[A
[AThe following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: hypothesis, pre

{'eval_loss': 3.2119717597961426, 'eval_accuracy': 0.28999999165534973, 'eval_runtime': 26.8802, 'eval_samples_per_second': 3.72, 'eval_steps_per_second': 0.149, 'epoch': 1.0}


Model weights saved in deberta-base-mnli-finetuned-snli/checkpoint-4/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from deberta-base-mnli-finetuned-snli/checkpoint-4 (score: 0.28999999165534973).
                                                    
[A                                            

  0%|          | 3/17168 [07:09<50:52:07, 10.67s/it]
100%|██████████| 4/4 [01:01<00:00, 15.44s/it]

{'train_runtime': 61.7321, 'train_samples_per_second': 1.62, 'train_steps_per_second': 0.065, 'train_loss': 3.028691291809082, 'epoch': 1.0}





TrainOutput(global_step=4, training_loss=3.028691291809082, metrics={'train_runtime': 61.7321, 'train_samples_per_second': 1.62, 'train_steps_per_second': 0.065, 'train_loss': 3.028691291809082, 'epoch': 1.0})

In [74]:
trainer.evaluate(
    dataset['test'].select(list(range(100)))
    )

The following columns in the evaluation set don't have a corresponding argument in `DebertaForSequenceClassification.forward` and have been ignored: hypothesis, premise. If hypothesis, premise are not expected by `DebertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 32
  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))

[A
[A
[A
100%|██████████| 4/4 [00:17<00:00,  4.32s/it]


{'eval_loss': 3.235830783843994,
 'eval_accuracy': 0.25,
 'eval_runtime': 25.5859,
 'eval_samples_per_second': 3.908,
 'eval_steps_per_second': 0.156,
 'epoch': 1.0}

In [75]:
wandb.finish()

0,1
eval/accuracy,█▁
eval/loss,▁█
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▁▁
train/global_step,▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁

0,1
eval/accuracy,0.25
eval/loss,3.23583
eval/runtime,25.5859
eval/samples_per_second,3.908
eval/steps_per_second,0.156
train/epoch,1.0
train/global_step,4.0
train/total_flos,15329998080000.0
train/train_loss,3.02869
train/train_runtime,61.7321
