In [1]:
%%capture
!pip install datasets transformers==4.28.0 evaluate

In [2]:
%%capture
!python -m spacy download en_core_web_md

In [3]:
%%capture
!rm -rf phd_sentence_semantic_models
!git clone https://github.com/vrublevskiyvitaliy/phd_sentence_semantic_models.git

In [4]:
import torch
import evaluate

from transformers import AutoTokenizer, DataCollatorWithPadding, get_scheduler, AutoConfig
from transformers.utils import PaddingStrategy
from torch.utils.data import DataLoader
from transformers.tokenization_utils_base import TruncationStrategy

from datasets import load_dataset, load_metric
from functools import partial

from tqdm.auto import tqdm

from torch.optim import AdamW


In [5]:
from phd_sentence_semantic_models.models.enriched_tokeniser import preprocess_dataset_final
from phd_sentence_semantic_models.models.deberta_model_attention_change import DebertaForSequenceClassificationV2
from phd_sentence_semantic_models.utils.seed import init_seed


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [6]:
# GLOBALS

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MAX_LEN = 256
BATCH_SIZE = 8
TRUNCATION = TruncationStrategy.LONGEST_FIRST
PADDING=PaddingStrategy.MAX_LENGTH
SEED = 42
LR = 2e-5
NUM_TRAIN_EPOCHS = 10
# DATASET_PART = "[:10%]"
DATASET_PART = ""


model_name = "microsoft/deberta-base"

In [7]:
init_seed(SEED)

In [9]:
model_tokenizer = AutoTokenizer.from_pretrained(model_name)

config = AutoConfig.from_pretrained(model_name)
model = DebertaForSequenceClassificationV2.from_pretrained(model_name,config=config)

model.to(device)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassificationV2: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassificationV2 from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassificationV2 from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassificationV2 were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.bias', '

DebertaForSequenceClassificationV2(
  (deberta): DebertaModelV2(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoderV2(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayerV2(
          (attention): DebertaAttentionV2(
            (self): DisentangledSelfAttentionV2(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )

In [12]:
dataset_train = load_dataset("glue", 'mrpc', split=f"train{DATASET_PART}")
dataset_eval = load_dataset("glue", 'mrpc', split=f"validation{DATASET_PART}")
dataset_test = load_dataset("glue", 'mrpc', split=f"test{DATASET_PART}")

config = {
    # 'att_dep_tree_value': 0.9,
    # 'att_dep_tree_pad_value': 0.,
    # 'tokeniser_list': ['attention_dep'],
    # baseline
    'tokeniser_list': ['attention_one'],
}

preprocess_dataset_with_full_v2 = partial(
    preprocess_dataset_final,
    tokenizer=model_tokenizer,
    truncation=TRUNCATION,
    max_length=MAX_LEN,
    padding=PADDING,
    config=config,
  )

collator = DataCollatorWithPadding(model_tokenizer)

def prepare_dataloader(dataset, collator):
  dataset = dataset.map(preprocess_dataset_with_full_v2, batched=False)
  dataset = dataset.remove_columns(["sentence1", "sentence2", "idx"])
  dataset = dataset.rename_column("label", "labels")
  dataset.set_format("torch")
  dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collator)
  return dataloader


train_dataloader = prepare_dataloader(dataset_train, collator)
eval_dataloader = prepare_dataloader(dataset_eval, collator)
test_dataloader = prepare_dataloader(dataset_test, collator)

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [15]:
optimizer = AdamW(model.parameters(), lr=LR)

num_training_steps = NUM_TRAIN_EPOCHS * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
def train_eval_test(model, optimizer, lr_scheduler,  train_dataloader, eval_dataloader,test_dataloader,  num_train_epochs, num_training_steps, device):
  progress_bar = tqdm(range(num_training_steps))

  def prepare_batch(b):
    if 'attention_enhencer' in b.keys():
      (B, L, L) = b['attention_enhencer'].size()
      number_of_attention_heads = 12
      b['attention_enhencer'] = b['attention_enhencer'][:, None, :, :].expand([B, number_of_attention_heads, L, L])
    return b
  for epoch in range(num_train_epochs):
      print(f"Epoch {epoch}")
      model.train()
      for batch in train_dataloader:
          batch = prepare_batch(batch)
          batch = {k: v.to(device) for k, v in batch.items()}
          outputs = model(**batch)
          loss = outputs.loss
          loss.backward()

          optimizer.step()
          lr_scheduler.step()
          optimizer.zero_grad()
          progress_bar.update(1)

      accuracy_metric = evaluate.load("accuracy")
      f1_metric = evaluate.load("f1")
      model.eval()
      for batch in eval_dataloader:
          batch = prepare_batch(batch)
          batch = {k: v.to(device) for k, v in batch.items()}
          with torch.no_grad():
              outputs = model(**batch)

          logits = outputs.logits
          predictions = torch.argmax(logits, dim=-1)
          accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])
          f1_metric.add_batch(predictions=predictions, references=batch["labels"])

      acc = accuracy_metric.compute()
      f1 = f1_metric.compute()
      print(f"Eval accuracy {acc['accuracy']:.4f}")
      print(f"Eval F1 {f1['f1']:.4f}")


      test_accuracy_metric = evaluate.load("accuracy")
      test_f1_metric = evaluate.load("f1")
      model.eval()
      for batch in test_dataloader:
          batch = prepare_batch(batch)
          batch = {k: v.to(device) for k, v in batch.items()}
          with torch.no_grad():
              outputs = model(**batch)

          logits = outputs.logits
          predictions = torch.argmax(logits, dim=-1)
          test_accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])
          test_f1_metric.add_batch(predictions=predictions, references=batch["labels"])

      acc = test_accuracy_metric.compute()
      f1 = test_f1_metric.compute()
      print(f"Test accuracy {acc['accuracy']:.4f}")
      print(f"Test F1 {f1['f1']:.4f}")


train_eval_test(
  model = model,
  optimizer = optimizer,
  lr_scheduler = lr_scheduler,
  train_dataloader = train_dataloader,
  eval_dataloader = eval_dataloader,
  test_dataloader = test_dataloader,
  num_train_epochs = NUM_TRAIN_EPOCHS,
  num_training_steps = num_training_steps,
  device = device,
)

In [None]:
# from huggingface_hub import login

# huggingface_token = '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'

# login(token=huggingface_token)
# m = model_name.split('/')[-1]
# model.push_to_hub(f"VitaliiVrublevskyi/mrpc_{m}_base_dummy_attention")

In [None]:
# Baseline, enhancer 1 to all.
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# NUM_TRAIN_EPOCHS = 10

# Epoch 0
# Eval accuracy 0.8333
# Eval F1 0.8885
# Test accuracy 0.8081
# Test F1 0.8713
# Epoch 1
# Eval accuracy 0.8995
# Eval F1 0.9287
# Test accuracy 0.8713
# Test F1 0.9071
# Epoch 2
# Eval accuracy 0.8922
# Eval F1 0.9225
# Test accuracy 0.8696
# Test F1 0.9041
# Epoch 3
# Eval accuracy 0.9044
# Eval F1 0.9319
# Test accuracy 0.8730
# Test F1 0.9071
# Epoch 4
# Eval accuracy 0.8873
# Eval F1 0.9199
# Test accuracy 0.8754
# Test F1 0.9098
# Epoch 5
# Eval accuracy 0.8873
# Eval F1 0.9190
# Test accuracy 0.8852  <------------------
# Test F1 0.9154 <------------------
# Epoch 6
# Eval accuracy 0.8873
# Eval F1 0.9201
# Test accuracy 0.8748
# Test F1 0.9097
# Epoch 7
# Eval accuracy 0.8603
# Eval F1 0.9052
# Test accuracy 0.8330
# Test F1 0.8850
# Epoch 8
# Eval accuracy 0.9044
# Eval F1 0.9322
# Test accuracy 0.8719
# Test F1 0.9072
# Epoch 9
# Eval accuracy 0.9020
# Eval F1 0.9298
# Test accuracy 0.8736
# Test F1 0.9072

In [None]:
# Dep Enhancer, 1.2 increase, 0 pad
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# NUM_TRAIN_EPOCHS = 10
# Epoch 0
# Eval accuracy 0.6740
# Eval F1 0.7899
# Test accuracy 0.6910
# Test F1 0.7983
# Epoch 1
# Eval accuracy 0.7010
# Eval F1 0.8201
# Test accuracy 0.6916
# Test F1 0.8104
# Epoch 2
# Eval accuracy 0.8627
# Eval F1 0.9076
# Test accuracy 0.8267
# Test F1 0.8808
# Epoch 3
# Eval accuracy 0.8897
# Eval F1 0.9220
# Test accuracy 0.8545
# Test F1 0.8940
# Epoch 4
# Eval accuracy 0.8701
# Eval F1 0.9103
# Test accuracy 0.8452
# Test F1 0.8907
# Epoch 5
# Eval accuracy 0.8775
# Eval F1 0.9161
# Test accuracy 0.8533
# Test F1 0.8970
# Epoch 6
# Eval accuracy 0.8775
# Eval F1 0.9161
# Test accuracy 0.8516
# Test F1 0.8959
# Epoch 7
# Eval accuracy 0.8873
# Eval F1 0.9201
# Test accuracy 0.8701
# Test F1 0.9047
# Epoch 8
# Eval accuracy 0.8897
# Eval F1 0.9220
# Test accuracy 0.8736 <--------------
# Test F1 0.9075 <--------------
# Epoch 9
# Eval accuracy 0.8971
# Eval F1 0.9281
# Test accuracy 0.8667
# Test F1 0.9039

In [None]:
# Dep Enhancer, 1.1 increase, 0 pad
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# NUM_TRAIN_EPOCHS = 10
# Epoch 0
# Eval accuracy 0.6838
# Eval F1 0.8122
# Test accuracy 0.6649
# Test F1 0.7987
# Epoch 1
# Eval accuracy 0.6961
# Eval F1 0.8182
# Test accuracy 0.6829
# Test F1 0.8072
# Epoch 2
# Eval accuracy 0.8750
# Eval F1 0.9140
# Test accuracy 0.8487
# Test F1 0.8933
# Epoch 3
# Eval accuracy 0.8725
# Eval F1 0.9044
# Test accuracy 0.8475
# Test F1 0.8818
# Epoch 4
# Eval accuracy 0.8627
# Eval F1 0.9060
# Test accuracy 0.8464
# Test F1 0.8928
# Epoch 5
# Eval accuracy 0.8848
# Eval F1 0.9180
# Test accuracy 0.8684
# Test F1 0.9039
# Epoch 6
# Eval accuracy 0.8750
# Eval F1 0.9113
# Test accuracy 0.8678
# Test F1 0.9029
# Epoch 7
# Eval accuracy 0.8799
# Eval F1 0.9139
# Test accuracy 0.8707
# Test F1 0.9032
# Epoch 8
# Eval accuracy 0.8897
# Eval F1 0.9212
# Test accuracy 0.8771   <--------------
# Test F1 0.9089         <--------------
# Epoch 9
# Eval accuracy 0.8824
# Eval F1 0.9172
# Test accuracy 0.8719
# Test F1 0.9062

In [None]:
# Dep Enhancer, 0.9 increase, 0 pad
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# NUM_TRAIN_EPOCHS = 10
# Epoch 0
# Eval accuracy 0.8701
# Eval F1 0.9115
# Test accuracy 0.8359
# Test F1 0.8874
# Epoch 1
# Eval accuracy 0.8873
# Eval F1 0.9210
# Test accuracy 0.8742
# Test F1 0.9097
# Epoch 2
# Eval accuracy 0.8824
# Eval F1 0.9134
# Test accuracy 0.8835
# Test F1 0.9127
# Epoch 3
# Eval accuracy 0.8848
# Eval F1 0.9180
# Test accuracy 0.8730
# Test F1 0.9081
# Epoch 4
# Eval accuracy 0.9020
# Eval F1 0.9306
# Test accuracy 0.8794
# Test F1 0.9116
# Epoch 5
# Eval accuracy 0.9167
# Eval F1 0.9401
# Test accuracy 0.8846    <--------------
# Test F1 0.9148          <--------------
# Epoch 6
# Eval accuracy 0.8995
# Eval F1 0.9292
# Test accuracy 0.8788
# Test F1 0.9130
# Epoch 7
# Eval accuracy 0.9020
# Eval F1 0.9306
# Test accuracy 0.8800
# Test F1 0.9130
# Epoch 8
# Eval accuracy 0.9044
# Eval F1 0.9319
# Test accuracy 0.8817
# Test F1 0.9139
# Epoch 9
# Eval accuracy 0.9044
# Eval F1 0.9319
# Test accuracy 0.8817
# Test F1 0.9141

In [None]:
# Dep Enhancer, 0.8 increase, 0 pad
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# NUM_TRAIN_EPOCHS = 10
# Epoch 0
# Eval accuracy 0.8848
# Eval F1 0.9191
# Test accuracy 0.8516
# Test F1 0.8947
# Epoch 1
# Eval accuracy 0.8873
# Eval F1 0.9207
# Test accuracy 0.8603
# Test F1 0.9000
# Epoch 2
# Eval accuracy 0.9020
# Eval F1 0.9293
# Test accuracy 0.8754
# Test F1 0.9075
# Epoch 3
# Eval accuracy 0.9044
# Eval F1 0.9315
# Test accuracy 0.8643
# Test F1 0.9019
# Epoch 4
# Eval accuracy 0.8946
# Eval F1 0.9247
# Test accuracy 0.8701
# Test F1 0.9064
# Epoch 5
# Eval accuracy 0.8848
# Eval F1 0.9156
# Test accuracy 0.8771
# Test F1 0.9076
# Epoch 6
# Eval accuracy 0.8922
# Eval F1 0.9211
# Test accuracy 0.8748
# Test F1 0.9064
# Epoch 7
# Eval accuracy 0.8995
# Eval F1 0.9277
# Test accuracy 0.8777
# Test F1 0.9106
# Epoch 8
# Eval accuracy 0.8995
# Eval F1 0.9277
# Test accuracy 0.8852
# Test F1 0.9157
# Epoch 9
# Eval accuracy 0.8995
# Eval F1 0.9279
# Test accuracy 0.8823
# Test F1 0.9146