In [2]:
%%capture
!pip install datasets transformers==4.28.0 evaluate

In [3]:
%%capture
!rm -rf phd_sentence_semantic_models
!git clone https://github.com/vrublevskiyvitaliy/phd_sentence_semantic_models.git

In [4]:
import torch
import evaluate
import warnings

from transformers import AutoTokenizer, BertConfig, DataCollatorWithPadding, get_scheduler
from transformers.utils import PaddingStrategy
from torch.utils.data import DataLoader
from transformers.tokenization_utils_base import TruncationStrategy

from datasets import load_dataset, load_metric
from functools import partial

from tqdm.auto import tqdm

from torch.optim import AdamW

warnings.filterwarnings('ignore')

# Mine from repo
from phd_sentence_semantic_models.models.bert_model_pos_tags import BertForSequenceClassificationWithPos
from phd_sentence_semantic_models.models.bert_tokeniser_with_pos_tags import preprocess_dataset_with_pos_tags
from phd_sentence_semantic_models.utils.seed import init_seed
from phd_sentence_semantic_models.utils.train_eval_cycle import train_eval

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [5]:
# GLOBALS

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MAX_LEN = 512
BATCH_SIZE = 32
TRUNCATION = TruncationStrategy.LONGEST_FIRST
PADDING=PaddingStrategy.MAX_LENGTH
SEED = 42
LR = 2e-5
NUM_TRAIN_EPOCHS = 10
MAX_NUMBER_OF_POS_TAGS = 50

model_name = "bert-base-cased"

In [6]:
# INITIALISATION
init_seed(SEED)

In [7]:
model_tokenizer = AutoTokenizer.from_pretrained(model_name)

config = BertConfig.from_pretrained(model_name)
config.max_number_pos_tags = MAX_NUMBER_OF_POS_TAGS
model = BertForSequenceClassificationWithPos.from_pretrained(model_name,config=config)

model.to(device)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassificationWithPos: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassificationWithPos from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassificationWithPos from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificationWithPos were not initialized from the model checkpoint at bert-ba

BertForSequenceClassificationWithPos(
  (bert): BertModelWithPos(
    (embeddings): BertEmbeddingsWithPos(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (pos_tag_embeddings): Embedding(50, 768)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768

In [8]:
dataset_train = load_dataset("glue", 'mrpc', split="train")
dataset_eval = load_dataset("glue", 'mrpc', split="validation")

preprocess_dataset_with_pos_tags_full = partial(
    preprocess_dataset_with_pos_tags,
    tokenizer=model_tokenizer,
    truncation=TRUNCATION,
    max_length=MAX_LEN,
    padding=PADDING,
  )

collator = DataCollatorWithPadding(model_tokenizer)

def prepare_dataloader(dataset, collator):
  dataset = dataset.map(preprocess_dataset_with_pos_tags_full, batched=False)
  dataset = dataset.remove_columns(["sentence1", "sentence2", "idx",])
  dataset = dataset.rename_column("label", "labels")
  dataset.set_format("torch")
  dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collator)
  return dataloader


train_dataloader = prepare_dataloader(dataset_train, collator)
eval_dataloader = prepare_dataloader(dataset_eval, collator)

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

In [9]:
optimizer = AdamW(model.parameters(), lr=LR)

num_training_steps = NUM_TRAIN_EPOCHS * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)


In [10]:
train_eval(
    model = model,
    optimizer = optimizer,
    lr_scheduler = lr_scheduler,
    train_dataloader = train_dataloader,
    eval_dataloader = eval_dataloader,
    num_train_epochs = NUM_TRAIN_EPOCHS,
    num_training_steps = num_training_steps,
    device = device,
)

  0%|          | 0/1150 [00:00<?, ?it/s]

Epoch 0


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Accuracy 0.8406862745098039
F1 0.8849557522123894
Epoch 1
Accuracy 0.8480392156862745
F1 0.8876811594202899
Epoch 2
Accuracy 0.8308823529411765
F1 0.8808290155440415
Epoch 3
Accuracy 0.8357843137254902
F1 0.8805704099821747
Epoch 4
Accuracy 0.8382352941176471
F1 0.8865979381443299
Epoch 5
Accuracy 0.8333333333333334
F1 0.8831615120274915
Epoch 6
Accuracy 0.8455882352941176
F1 0.8896672504378283
Epoch 7
Accuracy 0.8406862745098039
F1 0.888888888888889
Epoch 8
Accuracy 0.8406862745098039
F1 0.8869565217391304
Epoch 9
Accuracy 0.8382352941176471
F1 0.8842105263157894


In [None]:
# Epoch 0
# Accuracy 0.8406862745098039
# F1 0.8849557522123894
# Epoch 1
# Accuracy 0.8480392156862745
# F1 0.8876811594202899
# Epoch 2
# Accuracy 0.8308823529411765
# F1 0.8808290155440415
# Epoch 3
# Accuracy 0.8357843137254902
# F1 0.8805704099821747
# Epoch 4
# Accuracy 0.8382352941176471
# F1 0.8865979381443299
# Epoch 5
# Accuracy 0.8333333333333334
# F1 0.8831615120274915
# Epoch 6
# Accuracy 0.8455882352941176
# F1 0.8896672504378283
# Epoch 7
# Accuracy 0.8406862745098039
# F1 0.888888888888889
# Epoch 8
# Accuracy 0.8406862745098039
# F1 0.8869565217391304
# Epoch 9
# Accuracy 0.8382352941176471
# F1 0.8842105263157894