# RTE (Recognizing Textual Entailment) with DeBERTa
## Starting from a pretrained DeBERTa model fine-tuned on MNLI, then fine-tuning and evaluating it on a subset of SNLI
Inspired by the Keras code example [Semantic Similarity with BERT](https://keras.io/examples/nlp/semantic_similarity_with_bert/).

## Setup

In [3]:
# !pip install torch
# !pip install transformers
# !pip install datasets
# !pip install sklearn
# !pip install pandas
# !pip install wandb

In [4]:
# !wandb login

In [5]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, 
    TrainingArguments, Trainer
    )
import evaluate
import wandb

## Custom dataset

In [6]:
# Hub identifier of the pretrained checkpoint; the short model name is the
# part after the last "/" and is reused for naming the experiment.
HUB_MODEL_CHECKPOINT = 'microsoft/deberta-base-mnli'
MODEL_NAME = HUB_MODEL_CHECKPOINT.rpartition("/")[2]

# Length (in tokens) every encoded sentence pair is padded/truncated to.
MAX_LENGTH = 256

In [7]:
# Quick sanity check of the tokenizer: show the ids of the CLS and SEP special
# tokens, then display the encoding of a sentence paired with itself.
tokenizer = AutoTokenizer.from_pretrained(HUB_MODEL_CHECKPOINT)
for special_token_id in (tokenizer.cls_token_id, tokenizer.sep_token_id):
    print(special_token_id)
sample_sentence = 'my name is thierry'
tokenizer(sample_sentence, sample_sentence)

1
2


{'input_ids': [1, 4783, 766, 16, 3553, 906, 1506, 2, 4783, 766, 16, 3553, 906, 1506, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
def _construct_data_path(mode):
    """Return the relative CSV path of the SNLI split named by ``mode``.

    The 'valid' mode is stored on disk under the name 'dev'; every other
    mode is used verbatim in the file name.
    """
    split = 'dev' if mode == 'valid' else mode
    return 'SNLI_Corpus/snli_1.0_{}.csv'.format(split)


def _preprocess(df):
    """Clean a raw SNLI frame and attach an integer ``label`` column.

    Expects the columns ``similarity``, ``sentence1`` and ``sentence2``.
    Rows containing any missing value and rows whose ``similarity`` is "-"
    are removed. The remaining rows are labelled 0 for "contradiction",
    1 for "entailment" and 2 for anything else, and both sentence columns
    are cast to ``str``.

    The input frame is left untouched; a new frame is returned. Working on
    an explicit copy also avoids pandas' SettingWithCopyWarning, which the
    previous slice-then-assign implementation triggered.

    NOTE(review): the pretrained checkpoint may order its classes differently
    from this 0/1/2 mapping -- confirm against ``model.config.id2label``.
    """
    label_ids = {"contradiction": 0, "entailment": 1}

    cleaned = df.dropna(axis=0)
    # .copy() makes the filtered result an independent frame so the column
    # assignments below are well-defined and never write back into `df`.
    cleaned = cleaned[cleaned.similarity != "-"].copy()
    cleaned['label'] = cleaned["similarity"].apply(
        lambda x: label_ids.get(x, 2)
        )
    for key in ['sentence1', 'sentence2']:
        cleaned[key] = cleaned[key].astype(str)
    return cleaned


class SNLIDataset(Dataset):
    """Map-style dataset yielding tokenized SNLI sentence pairs with labels.

    Args:
        mode: Split name passed to ``_construct_data_path``
            (e.g. 'train', 'valid', 'test').
        tokenizer_name: Checkpoint name or path given to
            ``AutoTokenizer.from_pretrained``.
        nrows: Optional cap on the number of CSV rows read; rows removed by
            ``_preprocess`` are not replaced, so the dataset may be shorter.

    Each item is a dict holding the encoder features as int32 tensors of
    length ``MAX_LENGTH`` plus the integer class under the key 'labels'.
    """

    # Encoder outputs forwarded to the model. Not every tokenizer produces
    # 'token_type_ids', so only the ones actually returned are kept.
    _FEATURE_NAMES = ('input_ids', 'attention_mask', 'token_type_ids')

    def __init__(self, mode, tokenizer_name, nrows=None) -> None:
        self.df = pd.read_csv(_construct_data_path(mode), nrows=nrows)
        self.df = _preprocess(self.df)
        self.sentence_pairs = self.df[['sentence1', 'sentence2']].values
        # Cache the labels once (like sentence_pairs) instead of re-deriving
        # the array from the frame on every __getitem__ call.
        self.labels = self.df['label'].values
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sentence_pair = self.sentence_pairs[idx]
        encoded = self.tokenizer(sentence_pair[0],
                                 sentence_pair[1],
                                 padding='max_length',
                                 max_length=MAX_LENGTH,
                                 return_tensors='pt',
                                 truncation=True)
        # return_tensors='pt' adds a leading batch dimension of size 1;
        # squeeze(0) drops only that one, so a sequence dimension that is
        # itself of size 1 is not collapsed.
        features = {
            name: encoded[name].to(torch.int32).squeeze(0)
            for name in self._FEATURE_NAMES
            if name in encoded
        }
        features['labels'] = self.labels[idx]
        return features

In [9]:
# train_ds = SNLIDataset('test', tokenizer_name=HUB_MODEL_CHECKPOINT, nrows=1000)
# inputs = train_ds.__getitem__(0)
# inputs

In [10]:
# inputs['input_ids']

## Build model

In [11]:
# LOCAL_MODEL_CHECKPOINT = './deberta-base-mnli-finetuned-snli/checkpoint-189'

# Load the sequence-classification model from the hub checkpoint and make sure
# its head has the three classes the RTE labels require, then display it.
model = AutoModelForSequenceClassification.from_pretrained(HUB_MODEL_CHECKPOINT)
has_three_labels = model.num_labels == 3
assert has_three_labels, 'The number of labels should be 3 for a RTE task'
model

Some weights of the model checkpoint at microsoft/deberta-base-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0): DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (intermed

## Experiments

In [12]:
# --- Experiment hyperparameters -------------------------------------------
EPOCHS = 3
TRAIN_SAMPLES = EVAL_SAMPLES = 1000  # CSV rows read per split (before cleaning)
TRAIN_BATCH_SIZE, EVAL_BATCH_SIZE = 32, 100
METRIC_NAME = 'accuracy'
PROJECT_NAME = f'{MODEL_NAME}-finetuned-snli'

# Start the Weights & Biases run before any dataset or trainer is created.
wandb.init(project=PROJECT_NAME)

# Build the training split first, then the validation split.
train_ds, valid_ds = (
    SNLIDataset(split, tokenizer_name=HUB_MODEL_CHECKPOINT, nrows=row_limit)
    for split, row_limit in (('train', TRAIN_SAMPLES), ('valid', EVAL_SAMPLES))
)

metric = evaluate.load(METRIC_NAME)

# Evaluate and checkpoint once per epoch; the checkpoint with the best
# METRIC_NAME is reloaded when training ends.
train_args = TrainingArguments(
    output_dir=PROJECT_NAME,
    report_to='wandb',
    num_train_epochs=EPOCHS,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=METRIC_NAME,
)

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into (predictions, labels). The predicted class of
    each example is the index of its largest score along axis 1.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(references=references, predictions=predicted_classes)

# Wire the model, training arguments, datasets and metric callback together.
trainer = Trainer(
    model=model,
    args=train_args,
    compute_metrics=compute_metrics,
    eval_dataset=valid_ds,
    train_dataset=train_ds,
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mthierry-wendling-research[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [13]:
# Fine-tune the model on train_ds; train_args requests an evaluation on
# valid_ds and a checkpoint at the end of every epoch. The returned value is
# shown as the cell output.
trainer.train()

***** Running training *****
  Num examples = 998
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 96
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  0%|          | 0/96 [00:00<?, ?it/s]

  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
***** Running Evaluation *****
  Num examples = 980
  Batch size = 100


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to deberta-base-mnli-finetuned-snli/checkpoint-32
Configuration saved in deberta-base-mnli-finetuned-snli/checkpoint-32/config.json


{'eval_loss': 0.594828724861145, 'eval_accuracy': 0.7642857142857142, 'eval_runtime': 218.8051, 'eval_samples_per_second': 4.479, 'eval_steps_per_second': 0.046, 'epoch': 1.0}


Model weights saved in deberta-base-mnli-finetuned-snli/checkpoint-32/pytorch_model.bin
  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
***** Running Evaluation *****
  Num examples = 980
  Batch size = 100


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to deberta-base-mnli-finetuned-snli/checkpoint-64
Configuration saved in deberta-base-mnli-finetuned-snli/checkpoint-64/config.json


{'eval_loss': 0.5008823275566101, 'eval_accuracy': 0.8418367346938775, 'eval_runtime': 3575.9283, 'eval_samples_per_second': 0.274, 'eval_steps_per_second': 0.003, 'epoch': 2.0}


Model weights saved in deberta-base-mnli-finetuned-snli/checkpoint-64/pytorch_model.bin
  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
wandb: Network error (ConnectionError), entering retry loop.
***** Running Evaluation *****
  Num examples = 980
  Batch size = 100


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to deberta-base-mnli-finetuned-snli/checkpoint-96
Configuration saved in deberta-base-mnli-finetuned-snli/checkpoint-96/config.json


{'eval_loss': 0.4779171645641327, 'eval_accuracy': 0.8459183673469388, 'eval_runtime': 234.5872, 'eval_samples_per_second': 4.178, 'eval_steps_per_second': 0.043, 'epoch': 3.0}


Model weights saved in deberta-base-mnli-finetuned-snli/checkpoint-96/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from deberta-base-mnli-finetuned-snli/checkpoint-96 (score: 0.8459183673469388).


{'train_runtime': 6351.9081, 'train_samples_per_second': 0.471, 'train_steps_per_second': 0.015, 'train_loss': 0.5882260799407959, 'epoch': 3.0}


TrainOutput(global_step=96, training_loss=0.5882260799407959, metrics={'train_runtime': 6351.9081, 'train_samples_per_second': 0.471, 'train_steps_per_second': 0.015, 'train_loss': 0.5882260799407959, 'epoch': 3.0})

In [15]:
# Score the trained model on (up to) the first 1000 rows of the test split.
test_ds = SNLIDataset(mode='test', tokenizer_name=HUB_MODEL_CHECKPOINT, nrows=1000)

trainer.evaluate(eval_dataset=test_ds)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
loading configuration file config.json from cache at /Users/thierry.wendling/.cache/huggingface/hub/models--microsoft--deberta-base-mnli/snapshots/a80a6eb013898011540b19bf1f64e21eb61e53d6/config.json
Model config DebertaConfig {
  "_name_or_path": "microsoft/deberta-base-mnli",
  "architectures": [
    "DebertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "ge

{'eval_loss': 0.500731348991394,
 'eval_accuracy': 0.8157894736842105,
 'eval_runtime': 236.3729,
 'eval_samples_per_second': 4.18,
 'eval_steps_per_second': 0.042,
 'epoch': 3.0}