In [None]:
%%capture
!pip install datasets transformers==4.28.0 evaluate

In [None]:
%%capture
# Remove any previous checkout first so re-running this cell does not fail on an existing directory.
!rm -rf phd_sentence_semantic_models
# Fetch the helper package that the import cell below loads the model, preprocessing,
# seeding and training-loop utilities from.
# NOTE(review): this clones the head of the default branch; consider pinning a commit for reproducibility.
!git clone https://github.com/vrublevskiyvitaliy/phd_sentence_semantic_models.git

In [None]:
import torch
import evaluate

from transformers import AutoTokenizer, BertConfig, DataCollatorWithPadding, get_scheduler, AutoConfig
from transformers.utils import PaddingStrategy
from torch.utils.data import DataLoader
from transformers.tokenization_utils_base import TruncationStrategy

from datasets import load_dataset, load_metric
from functools import partial

from tqdm.auto import tqdm

from torch.optim import AdamW

# Mine from repo
from phd_sentence_semantic_models.models.deberta_model_classic import DebertaForSequenceClassificationClassic
from phd_sentence_semantic_models.models.bert_tokeniser_with_pos_tags import preprocess_dataset_with_pos_tags
from phd_sentence_semantic_models.utils.seed import init_seed
from phd_sentence_semantic_models.utils.train_eval_cycle import train_eval_test

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
import warnings

warnings.filterwarnings('ignore')

In [None]:
# GLOBALS

# Reproducibility
SEED = 42

# Pretrained checkpoint used for both the tokenizer and the model
model_name = "microsoft/deberta-base"

# Tokenisation settings
MAX_LEN = 512
TRUNCATION = TruncationStrategy.LONGEST_FIRST
PADDING = PaddingStrategy.MAX_LENGTH

# Optimisation settings
BATCH_SIZE = 16
LR = 2e-5
NUM_TRAIN_EPOCHS = 10

# Run on the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# INITIALISATION
# Seed the run with the SEED constant before the model and dataloaders are created.
# NOTE(review): init_seed comes from the cloned repo; presumably it seeds the Python,
# NumPy and torch RNGs — confirm against its implementation.
init_seed(SEED)

In [None]:
# Tokenizer, configuration and weights all come from the same pretrained checkpoint.
model_tokenizer = AutoTokenizer.from_pretrained(model_name)

config = AutoConfig.from_pretrained(model_name)
model = DebertaForSequenceClassificationClassic.from_pretrained(model_name, config=config)

# Move the weights to the selected device. Assigning the result (Module.to returns the
# module itself) stops the cell from echoing the full architecture as its output.
model = model.to(device)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassificationClassic: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassificationClassic from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassificationClassic from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassificationClassic were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['p

DebertaForSequenceClassificationClassic(
  (deberta): DebertaModelClassic(
    (embeddings): DebertaEmbeddingsClassic(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
  

In [None]:
dataset_train = load_dataset("glue", "mrpc", split="train")
dataset_eval = load_dataset("glue", "mrpc", split="validation")
dataset_test = load_dataset("glue", "mrpc", split="test")

# Bind the tokenizer and length settings once so that `Dataset.map` can call the
# preprocessing function with a single example as its only argument.
preprocess_dataset_with_pos_tags_full = partial(
    preprocess_dataset_with_pos_tags,
    tokenizer=model_tokenizer,
    truncation=TRUNCATION,
    max_length=MAX_LEN,
    padding=PADDING,
)

collator = DataCollatorWithPadding(model_tokenizer)


def prepare_dataloader(dataset, collator, shuffle=False):
    """Preprocess one GLUE MRPC split and wrap it in a DataLoader.

    Args:
        dataset: A `datasets.Dataset` split with the columns `sentence1`,
            `sentence2`, `idx` and `label`.
        collator: Callable used as the DataLoader's `collate_fn`.
        shuffle: Whether the DataLoader reshuffles the examples every epoch.
            Defaults to False, which keeps the original iteration order.

    Returns:
        A `torch.utils.data.DataLoader` yielding batches of `BATCH_SIZE` examples.
    """
    # Example-by-example preprocessing (batched=False).
    dataset = dataset.map(preprocess_dataset_with_pos_tags_full, batched=False)
    # Drop the raw text, the example index and the POS-tag ids so that only the
    # remaining columns reach the collator.
    dataset = dataset.remove_columns(["sentence1", "sentence2", "idx", "pos_tag_ids"])
    # Rename the target column to `labels`.
    # NOTE(review): presumably the keyword the model's forward expects — confirm.
    dataset = dataset.rename_column("label", "labels")
    dataset.set_format("torch")
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle, collate_fn=collator)


# Only the training split is shuffled; evaluation and test keep a fixed order.
# NOTE: shuffling changes the batch order, so metrics will differ from runs
# recorded with an unshuffled training loader.
train_dataloader = prepare_dataloader(dataset_train, collator, shuffle=True)
eval_dataloader = prepare_dataloader(dataset_eval, collator)
test_dataloader = prepare_dataloader(dataset_test, collator)

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Total number of batches processed across all epochs; the linear schedule
# decays the learning rate over exactly this many steps, with no warm-up.
num_training_steps = len(train_dataloader) * NUM_TRAIN_EPOCHS

optimizer = AdamW(model.parameters(), lr=LR)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Run the train/eval/test cycle from the cloned repo with the objects built above.
train_eval_test(
    model=model,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    train_dataloader=train_dataloader,
    eval_dataloader=eval_dataloader,
    test_dataloader=test_dataloader,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    num_training_steps=num_training_steps,
    device=device,
)

  0%|          | 0/2300 [00:00<?, ?it/s]

Epoch 0
Eval accuracy 0.8922
Eval F1 0.9236
Test accuracy 0.8597
Test F1 0.8999
Epoch 1
Eval accuracy 0.8946
Eval F1 0.9228
Test accuracy 0.8846
Test F1 0.9123
Epoch 2
Eval accuracy 0.8946
Eval F1 0.9244
Test accuracy 0.8835
Test F1 0.9146
Epoch 3
Eval accuracy 0.8946
Eval F1 0.9239
Test accuracy 0.8870
Test F1 0.9158
Epoch 4
Eval accuracy 0.8922
Eval F1 0.9217
Test accuracy 0.8899
Test F1 0.9171
Epoch 5
Eval accuracy 0.8897
Eval F1 0.9201
Test accuracy 0.8846
Test F1 0.9132
Epoch 6
Eval accuracy 0.8922
Eval F1 0.9228
Test accuracy 0.8881
Test F1 0.9174
Epoch 7
Eval accuracy 0.8873
Eval F1 0.9207
Test accuracy 0.8823
Test F1 0.9152
Epoch 8
Eval accuracy 0.8897
Eval F1 0.9212
Test accuracy 0.8910
Test F1 0.9198
Epoch 9
Eval accuracy 0.8897
Eval F1 0.9217
Test accuracy 0.8887
Test F1 0.9189


In [None]:
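# Metrics recorded from a run of the training cell above
# (microsoft/deberta-base on GLUE MRPC, seed 42, lr 2e-5, batch size 16, 10 epochs).
# Epoch 0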
# Eval accuracy 0.8922
# Eval F1 0.9236
# Test accuracy 0.8597
# Test F1 0.8999
# Epoch 1
# Eval accuracy 0.8946
# Eval F1 0.9228
# Test accuracy 0.8846
# Test F1 0.9123
# Epoch 2
# Eval accuracy 0.8946
# Eval F1 0.9244
# Test accuracy 0.8835
# Test F1 0.9146
# Epoch 3
# Eval accuracy 0.8946
# Eval F1 0.9239
# Test accuracy 0.8870
# Test F1 0.9158
# Epoch 4
# Eval accuracy 0.8922
# Eval F1 0.9217
# Test accuracy 0.8899
# Test F1 0.9171
# Epoch 5
# Eval accuracy 0.8897
# Eval F1 0.9201
# Test accuracy 0.8846
# Test F1 0.9132
# Epoch 6
# Eval accuracy 0.8922
# Eval F1 0.9228
# Test accuracy 0.8881
# Test F1 0.9174
# Epoch 7
# Eval accuracy 0.8873
# Eval F1 0.9207
# Test accuracy 0.8823
# Test F1 0.9152
# Epoch 8
# Eval accuracy 0.8897
# Eval F1 0.9212
# Test accuracy 0.8910
# Test F1 0.9198
# Epoch 9
# Eval accuracy 0.8897
# Eval F1 0.9217
# Test accuracy 0.8887
# Test F1 0.9189