In [1]:
%%capture
!pip install datasets transformers==4.28.0 evaluate

In [2]:
%%capture
!python -m spacy download en_core_web_md

In [3]:
%%capture
!rm -rf phd_public
!git clone https://github.com/vrublevskiyvitaliy/phd_public.git

In [4]:
import torch
import evaluate

from transformers import AutoTokenizer, DataCollatorWithPadding, get_scheduler, AutoConfig
from transformers.utils import PaddingStrategy
from torch.utils.data import DataLoader
from transformers.tokenization_utils_base import TruncationStrategy

from datasets import load_dataset, load_metric
from functools import partial

from tqdm.auto import tqdm

from torch.optim import AdamW


In [5]:
from phd_public.models.enriched_tokeniser import preprocess_dataset_final
# from phd_public.models.deberta_model_attention_change import DebertaForSequenceClassificationV2
from phd_public.utils.seed import init_seed
# from phd_public.models.deberta_model_classic import DebertaForSequenceClassificationClassic
# from phd_public.models.bert_tokeniser_with_pos_tags import preprocess_dataset_with_pos_tags



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [6]:
# GLOBALS

# --- Hardware ---------------------------------------------------------------
# Use CUDA when torch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Reproducibility --------------------------------------------------------
SEED = 42

# --- Pretrained checkpoint --------------------------------------------------
model_name = "microsoft/deberta-base"

# --- Tokenisation -----------------------------------------------------------
MAX_LEN = 256
TRUNCATION = TruncationStrategy.LONGEST_FIRST
PADDING = PaddingStrategy.MAX_LENGTH

# --- Optimisation -----------------------------------------------------------
BATCH_SIZE = 8
LR = 2e-5
NUM_TRAIN_EPOCHS = 10

# --- Data selection ---------------------------------------------------------
# Presumably a slice suffix for the dataset split (verify at the load site):
# "" keeps everything, while e.g. "[:10%]" would give a quick partial run.
DATASET_PART = ""

In [7]:
init_seed(SEED)

In [8]:
import transformers
import torch

assert(transformers.__version__ == '4.28.0')

from torch import nn
from transformers.models.deberta.modeling_deberta import (
    DebertaLayerNorm,
    DebertaPreTrainedModel,
    StableDropout,
    DebertaEncoder,
    ContextPooler,
    DisentangledSelfAttention,
    DebertaModel,
    DebertaForSequenceClassification,
    DebertaIntermediate,
    XSoftmax,
    DebertaOutput,
    DebertaEmbeddings,
    DebertaSelfOutput,
    build_relative_position
)
from typing import List, Optional, Tuple, Union
from transformers.modeling_outputs import BaseModelOutput, SequenceClassifierOutput
from torch.nn import CrossEntropyLoss, BCEWithLogitsLoss, MSELoss

from collections.abc import Sequence

class DisentangledSelfAttentionV2(DisentangledSelfAttention):
    """DeBERTa disentangled self-attention with an optional score rescaling.

    The forward pass follows the parent implementation, with one addition:
    right before the masked softmax the raw attention scores are multiplied
    element-wise by `attention_enhencer` (when one is supplied).
    """

    def forward(
        self,
        hidden_states,
        attention_mask,
        output_attentions=False,
        query_states=None,
        relative_pos=None,
        rel_embeddings=None,
        attention_enhencer=None,
    ):
        """
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.ByteTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, optional):
                Whether return the attention matrix.

            query_states (`torch.FloatTensor`, optional):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [2 x max_relative_positions,
                *hidden_size*].

            attention_enhencer (`torch.Tensor`, optional):
                Multiplicative factors applied element-wise to the raw attention scores before the softmax.
                The scores are laid out as [batch, heads, length, length]; the tensor must be broadcastable
                against that layout. When `None` (the default) the scores are left untouched.

        Returns:
            The context layer, or the tuple `(context_layer, attention_probs)` when `output_attentions` is
            true.
        """
        if query_states is None:
            qp = self.in_proj(hidden_states)
            query_layer, key_layer, value_layer = self.transpose_for_scores(qp).chunk(3, dim=-1)
        else:

            def linear(w, b, x):
                if b is not None:
                    return torch.matmul(x, w.t()) + b.t()
                else:
                    return torch.matmul(x, w.t())

            # in_proj stores q/k/v interleaved per head; regroup them into
            # three separate projection matrices.
            ws = self.in_proj.weight.chunk(self.num_attention_heads * 3, dim=0)
            qkvw = [torch.cat([ws[i * 3 + k] for i in range(self.num_attention_heads)], dim=0) for k in range(3)]
            qkvb = [None] * 3

            q = linear(qkvw[0], qkvb[0], query_states.to(dtype=qkvw[0].dtype))
            k, v = [linear(qkvw[i], qkvb[i], hidden_states.to(dtype=qkvw[i].dtype)) for i in range(1, 3)]
            query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q, k, v]]

        query_layer = query_layer + self.transpose_for_scores(self.q_bias[None, None, :])
        value_layer = value_layer + self.transpose_for_scores(self.v_bias[None, None, :])

        rel_att = None
        # Take the dot product between "query" and "key" to get the raw attention scores.
        scale_factor = 1 + len(self.pos_att_type)
        scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor)
        query_layer = query_layer / scale.to(dtype=query_layer.dtype)
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        if self.relative_attention:
            rel_embeddings = self.pos_dropout(rel_embeddings)
            rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor)

        if rel_att is not None:
            attention_scores = attention_scores + rel_att

        # bxhxlxd
        if self.talking_head:
            attention_scores = self.head_logits_proj(attention_scores.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)

        # attention_scores = Number of Batches x Num of heads x Max Length x Max Length
        # The enhancer is optional: the signature defaults it to None, and
        # torch.mul(tensor, None) would raise, so only rescale when it is given.
        if attention_enhencer is not None:
            attention_scores = torch.mul(attention_scores, attention_enhencer)
        attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)

        # attention_probs = Number of Batches x Num of heads x Max Length x Max Length
        attention_probs = self.dropout(attention_probs)
        if self.talking_head:
            attention_probs = self.head_weights_proj(attention_probs.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)

        context_layer = torch.matmul(attention_probs, value_layer)
        # Merge the head dimension back into the hidden dimension.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (-1,)
        context_layer = context_layer.view(new_context_layer_shape)
        if output_attentions:
            return (context_layer, attention_probs)
        else:
            return context_layer


class DebertaAttentionV2(nn.Module):
    """Attention block: enhancer-aware self-attention followed by its output module."""

    def __init__(self, config):
        super().__init__()
        self.self = DisentangledSelfAttentionV2(config)
        self.output = DebertaSelfOutput(config)
        self.config = config

    def forward(
        self,
        hidden_states,
        attention_mask,
        attention_enhencer,
        output_attentions=False,
        query_states=None,
        relative_pos=None,
        rel_embeddings=None,
    ):
        """Run self-attention, then feed its result through `self.output`.

        Returns the attention output alone, or `(attention_output, att_matrix)`
        when `output_attentions` is true.
        """
        attn_result = self.self(
            hidden_states,
            attention_mask,
            output_attentions,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
            attention_enhencer=attention_enhencer,
        )

        # With output_attentions the sub-module hands back a (context, probs) pair.
        if output_attentions:
            context, att_matrix = attn_result
        else:
            context, att_matrix = attn_result, None

        # The output module takes the query states as its second input,
        # falling back to the hidden states when no query states were given.
        second_input = hidden_states if query_states is None else query_states
        attention_output = self.output(context, second_input)

        if not output_attentions:
            return attention_output
        return (attention_output, att_matrix)

class DebertaLayerV2(nn.Module):
    """One encoder layer: enhancer-aware attention, intermediate module, output module."""

    def __init__(self, config):
        super().__init__()
        self.attention = DebertaAttentionV2(config)
        self.intermediate = DebertaIntermediate(config)
        self.output = DebertaOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask,
        attention_enhencer,
        query_states=None,
        relative_pos=None,
        rel_embeddings=None,
        output_attentions=False,
    ):
        """Apply attention, then the intermediate and output modules.

        Returns the layer output alone, or `(layer_output, att_matrix)` when
        `output_attentions` is true.
        """
        attn_result = self.attention(
            hidden_states,
            attention_mask,
            output_attentions=output_attentions,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
            attention_enhencer=attention_enhencer,
        )

        # The attention block returns a pair only when probabilities were requested.
        att_matrix = None
        if output_attentions:
            attn_result, att_matrix = attn_result

        # The output module receives both the intermediate result and the attention output.
        layer_output = self.output(self.intermediate(attn_result), attn_result)

        if not output_attentions:
            return layer_output
        return (layer_output, att_matrix)


class DebertaEncoderV2(nn.Module):
    """Modified BertEncoder with relative position bias support.

    Stacks `config.num_hidden_layers` `DebertaLayerV2` modules and forwards an
    `attention_enhencer` tensor to every one of them.
    """

    def __init__(self, config):
        super().__init__()
        self.layer = nn.ModuleList([DebertaLayerV2(config) for _ in range(config.num_hidden_layers)])
        self.relative_attention = getattr(config, "relative_attention", False)
        if self.relative_attention:
            self.max_relative_positions = getattr(config, "max_relative_positions", -1)
            # A non-positive setting falls back to the absolute position budget.
            if self.max_relative_positions < 1:
                self.max_relative_positions = config.max_position_embeddings
            self.rel_embeddings = nn.Embedding(self.max_relative_positions * 2, config.hidden_size)
        self.gradient_checkpointing = False

    def get_rel_embedding(self):
        """Return the relative-position embedding weights, or None when relative attention is off."""
        rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
        return rel_embeddings

    def get_attention_mask(self, attention_mask):
        """Expand a padding mask to 4 dimensions.

        A mask with at most 2 dims is turned into a pairwise byte mask (outer
        product of the mask with itself); a 3-dim mask only gains a dimension
        at position 1. Other ranks are returned unchanged.
        """
        if attention_mask.dim() <= 2:
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1)
            attention_mask = attention_mask.byte()
        elif attention_mask.dim() == 3:
            attention_mask = attention_mask.unsqueeze(1)

        return attention_mask

    def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
        """Build the relative-position tensor when relative attention is on and none was supplied."""
        if self.relative_attention and relative_pos is None:
            q = query_states.size(-2) if query_states is not None else hidden_states.size(-2)
            relative_pos = build_relative_position(q, hidden_states.size(-2), hidden_states.device)
        return relative_pos

    def forward(
        self,
        hidden_states,
        attention_mask,
        attention_enhencer,
        output_hidden_states=True,
        output_attentions=False,
        query_states=None,
        relative_pos=None,
        return_dict=True,
    ):
        """Run every layer in order.

        Args:
            hidden_states: input to the first layer.
            attention_mask: padding mask; expanded via `get_attention_mask`.
            attention_enhencer: tensor handed unchanged to every layer.
            output_hidden_states: collect the input of every layer plus the final output.
            output_attentions: collect the attention matrix of every layer.
            query_states: optional query states forwarded to the layers.
            relative_pos: optional precomputed relative positions.
            return_dict: return a `BaseModelOutput` instead of a plain tuple.

        Returns:
            A `BaseModelOutput`, or a tuple holding the non-None entries among
            (last hidden state, all hidden states, all attentions).
        """
        attention_mask = self.get_attention_mask(attention_mask)
        relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)

        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        if isinstance(hidden_states, Sequence):
            next_kv = hidden_states[0]
        else:
            next_kv = hidden_states
        rel_embeddings = self.get_rel_embedding()
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions=output_attentions)

                    return custom_forward

                # Positional order must match DebertaLayerV2.forward:
                # (hidden_states, attention_mask, attention_enhencer,
                #  query_states, relative_pos, rel_embeddings).
                # Leaving the enhancer out shifts every later argument by one slot.
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    next_kv,
                    attention_mask,
                    attention_enhencer,
                    query_states,
                    relative_pos,
                    rel_embeddings,
                )
            else:
                hidden_states = layer_module(
                    next_kv,
                    attention_mask,
                    attention_enhencer=attention_enhencer,
                    query_states=query_states,
                    relative_pos=relative_pos,
                    rel_embeddings=rel_embeddings,
                    output_attentions=output_attentions,
                )

            if output_attentions:
                hidden_states, att_m = hidden_states

            if query_states is not None:
                query_states = hidden_states
                if isinstance(hidden_states, Sequence):
                    next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None
            else:
                next_kv = hidden_states

            if output_attentions:
                all_attentions = all_attentions + (att_m,)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )

class DebertaModelV2(DebertaModel):
    """DeBERTa backbone whose encoder accepts an `attention_enhencer` tensor.

    The parent constructor runs first; its embeddings and encoder are then
    replaced by a fresh `DebertaEmbeddings` and a `DebertaEncoderV2`.
    """

    def __init__(self, config):
        super().__init__(config)

        self.embeddings = DebertaEmbeddings(config)
        self.encoder = DebertaEncoderV2(config)
        self.z_steps = 0
        self.config = config
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        attention_enhencer: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        """Embed the inputs and run them through the enhancer-aware encoder.

        Exactly one of `input_ids` / `inputs_embeds` must be given. A missing
        `attention_mask` defaults to all ones and missing `token_type_ids` to
        all zeros. `attention_enhencer` is forwarded untouched to the encoder.

        Returns:
            A `BaseModelOutput`, or a tuple starting with the sequence output
            when `return_dict` resolves to false.

        Raises:
            ValueError: if both or neither of `input_ids` and `inputs_embeds`
                are provided.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            mask=attention_mask,
            inputs_embeds=inputs_embeds,
        )

        # Hidden states are always requested from the encoder because the
        # sequence output below is read from them (index 1 of its result).
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
            attention_enhencer=attention_enhencer,
        )
        encoded_layers = encoder_outputs[1]

        if self.z_steps > 1:
            # The encoder collects hidden states in a tuple, which has no
            # append(); work on a list copy so the extra steps can be recorded.
            encoded_layers = list(encoded_layers)
            hidden_states = encoded_layers[-2]
            layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]
            query_states = encoded_layers[-1]
            rel_embeddings = self.encoder.get_rel_embedding()
            attention_mask = self.encoder.get_attention_mask(attention_mask)
            rel_pos = self.encoder.get_rel_pos(embedding_output)
            # Re-apply the last layer z_steps - 1 more times.
            for layer in layers[1:]:
                query_states = layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=False,
                    query_states=query_states,
                    relative_pos=rel_pos,
                    rel_embeddings=rel_embeddings,
                    attention_enhencer=attention_enhencer,
                )
                encoded_layers.append(query_states)

        sequence_output = encoded_layers[-1]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states if output_hidden_states else None,
            attentions=encoder_outputs.attentions,
        )

class DebertaForSequenceClassificationV2(DebertaForSequenceClassification):
    """Sequence-classification head on top of `DebertaModelV2`.

    The parent constructor runs first; the backbone, pooler, classifier and
    dropout are then rebuilt here so that the backbone is the variant that
    accepts an `attention_enhencer` tensor.
    """

    def __init__(self, config):
        super().__init__(config)

        # Fall back to two labels when the config does not define num_labels.
        num_labels = getattr(config, "num_labels", 2)
        self.num_labels = num_labels

        self.deberta = DebertaModelV2(config)
        self.pooler = ContextPooler(config)
        output_dim = self.pooler.output_dim

        self.classifier = nn.Linear(output_dim, num_labels)
        # Use cls_dropout when configured, otherwise the generic hidden dropout rate.
        drop_out = getattr(config, "cls_dropout", None)
        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
        self.dropout = StableDropout(drop_out)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        attention_enhencer: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        Run the backbone, pool and classify, and optionally compute a loss.

        attention_enhencer (`torch.Tensor`, *optional*):
            Passed through unchanged to the `DebertaModelV2` backbone.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns a `SequenceClassifierOutput`, or a tuple `(loss?, logits, ...)` when `return_dict` resolves to
        false (the loss entry is present only when a loss was computed).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            attention_enhencer=attention_enhencer,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First backbone output is the sequence representation fed to the pooler.
        encoder_layer = outputs[0]
        pooled_output = self.pooler(encoder_layer)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # No explicit problem_type: pick the loss from num_labels and the label shape.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    # regression task
                    loss_fn = nn.MSELoss()
                    # NOTE: this rebinds `logits`, so the flattened/cast tensor is what gets returned below.
                    logits = logits.view(-1).to(labels.dtype)
                    loss = loss_fn(logits, labels.view(-1))
                elif labels.dim() == 1 or labels.size(-1) == 1:
                    # Only rows whose label is >= 0 contribute to the loss.
                    label_index = (labels >= 0).nonzero()
                    labels = labels.long()
                    if label_index.size(0) > 0:
                        labeled_logits = torch.gather(
                            logits, 0, label_index.expand(label_index.size(0), logits.size(1))
                        )
                        labels = torch.gather(labels, 0, label_index.view(-1))
                        loss_fct = CrossEntropyLoss()
                        loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1))
                    else:
                        # No usable label in the batch: zero loss matching logits' dtype/device.
                        loss = torch.tensor(0).to(logits)
                else:
                    # Remaining case: labels are used as per-class weights on the log-probabilities.
                    log_softmax = nn.LogSoftmax(-1)
                    loss = -((log_softmax(logits) * labels).sum(-1)).mean()
            elif self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            # Tuple form: logits replace the first backbone output; loss is prepended when present.
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )


In [9]:
# Tokeniser and configuration both come from the checkpoint named by `model_name`.
model_tokenizer = AutoTokenizer.from_pretrained(model_name)

config = AutoConfig.from_pretrained(model_name)
# Load the checkpoint weights into the enhancer-aware classification model.
model = DebertaForSequenceClassificationV2.from_pretrained(model_name,config=config)

# Move the model to the selected device; as the last expression of the cell
# this also displays the module tree.
model.to(device)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassificationV2: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassificationV2 from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassificationV2 from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassificationV2 were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.weight',

DebertaForSequenceClassificationV2(
  (deberta): DebertaModelV2(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoderV2(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayerV2(
          (attention): DebertaAttentionV2(
            (self): DisentangledSelfAttentionV2(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )

In [10]:
import nltk
import spacy
# Space module import
import en_core_web_md
from enum import Enum

import torch.nn.functional as F
import torch

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def get_spacy_module():
  """Load and return the `en_core_web_md` spaCy pipeline."""
  pipeline = en_core_web_md.load()
  return pipeline

# Module-level pipeline shared by the helpers below.
nlp = get_spacy_module()

def get_sentence_tokens(s, display = False):
  """Parse `s` with the module-level spaCy pipeline and describe every token.

  Args:
    s: text to parse.
    display: when true, render the dependency parse via displacy first.

  Returns:
    One dict per token with its index, text, head index, character
    boundaries and fine-grained tag.
  """
  parsed = nlp(s)
  if display:
    spacy.displacy.render(parsed, style="dep", jupyter=True)

  def describe(tok):
    # Character span of the token inside `s`: [start, end).
    start = tok.idx
    end = tok.idx + len(tok.text)
    return {
      "token_id": tok.i,
      "token_text": tok.text,
      "token_connection_ids": tok.head.i,
      "token_left_edge": start,
      "token_right_edge": end,
      "token_boundaries": (start, end),
      "token_pos_tag": tok.tag_,
    }

  return [describe(tok) for tok in parsed]

class TokeniserType(Enum):
    """Marks whether an enriched tokeniser works on one sentence or on a sentence pair."""
    # Default assigned by BaseEnrichedTokeniser.__init__.
    ONE_SENTENCE = 1
    # Set by the attention-enhancer tokenisers, whose enrich_tokens takes (s1, s2, ...).
    TWO_SENTENCES = 2

class BaseEnrichedTokeniser:
  """Base class that attaches a per-token feature to transformer tokens.

  Subclasses set `feature_key` and implement `get_feature`; the base class
  aligns those features with the transformer tokens by character overlap.
  """

  def __init__(self, tokeniser):
    self._tokeniser = tokeniser
    self.feature_key = None
    self.type = TokeniserType.ONE_SENTENCE

  def combine_transformer_and_sentence_features(self, transformer_tokens, sentence_features):
    """Copy `feature_key` from overlapping sentence features onto each transformer token.

    Two entries overlap when their `'boundaries'` character spans intersect
    in at least one character. When several sentence features overlap a
    token, the last one in `sentence_features` wins.

    Args:
      transformer_tokens: list of dicts with a `'boundaries'` entry; updated in place.
      sentence_features: list of dicts with `'boundaries'` and `feature_key` entries.

    Returns:
      The same `transformer_tokens` list, now carrying `feature_key`.

    Raises:
      AssertionError: if a transformer token overlaps no sentence feature
        (or only features whose value is None).
    """
    key = self.feature_key
    for t_token in transformer_tokens:
      t_token[key] = None
      # The token's own span does not change while scanning the features.
      t_boundaries = t_token['boundaries']
      for s_feature in sentence_features:
        s_boundaries = s_feature['boundaries']
        left = max(t_boundaries[0], s_boundaries[0])
        right = min(t_boundaries[1], s_boundaries[1])
        if left < right:
          t_token[key] = s_feature[key]

      assert t_token[key] is not None, (
        f"no sentence feature overlaps transformer token with boundaries {t_boundaries}"
      )

    return transformer_tokens

  def get_feature(self, s):
    """Return the sentence-level features for `s`; subclasses must override."""
    raise NotImplementedError("Not implemented")

  def enrich_tokens(self, s):
    """Tokenise `s` and attach this tokeniser's feature to every token."""
    t_tokens = self.get_transformer_sentence_tokens(s)
    feature = self.get_feature(s)
    combined_features = self.combine_transformer_and_sentence_features(t_tokens, feature)
    return combined_features

  def get_transformer_sentence_tokens(self, s, verbose = False):
    """Return the list of tokens that will be used in the model.

    Each entry is a dict
    `{'index': int, 'input_id': int, 'boundaries': (int, int)}`, built from
    the wrapped tokeniser's ids and offset mapping with special tokens
    disabled. `verbose` is accepted but not used.
    """
    encoded = self._tokeniser.batch_encode_plus([s], return_offsets_mapping=True, add_special_tokens=False)
    # A single text was encoded, so only batch entry 0 exists.
    input_ids = encoded['input_ids'][0]
    offsets = encoded['offset_mapping'][0]
    return [
      {
        'index': index,
        'input_id': input_id,
        'boundaries': boundaries,
      }
      for index, (input_id, boundaries) in enumerate(zip(input_ids, offsets))
    ]

  def post_processing(self, v):
    """Hook for subclasses; the base implementation returns `v` unchanged."""
    return v

class PosTagEnrichedTokeniser(BaseEnrichedTokeniser):
  """Enriches transformer tokens with the tag string reported by `get_sentence_tokens`."""

  def __init__(self, tokeniser):
    super().__init__(tokeniser)
    self.feature_key = 'pos_tag'

  def get_feature(self, s):
    """
    Return one `{'boundaries': (int, int), 'pos_tag': str}` dict per parsed token of `s`.
    """
    features = []
    for parsed_token in get_sentence_tokens(s):
      features.append({
        'boundaries': parsed_token['token_boundaries'],
        'pos_tag': parsed_token['token_pos_tag'],
      })
    return features


class PosTagIdEnrichedTokeniser(BaseEnrichedTokeniser):
  """Enriches transformer tokens with an integer id derived from each token's tag."""

  def __init__(self, tokeniser):
    super().__init__(tokeniser)
    self.feature_key = 'pos_tag_ids'
    self.pos_tag_to_id_map = PosTagIdEnrichedTokeniser.load_pos_tag_value_to_idx()

  @staticmethod
  def load_pos_tag_value_to_idx():
    """Return the fixed tag -> id table (ids run from 1 to 42 in listing order).

    Computed based on test part of MRPC dataset.
    """
    ordered_tags = [
        '$', "''", '(', ')', ',', '.', ':',
        'CC', 'CD', 'DT', 'EX', 'FW', 'IN',
        'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NA',
        'NN', 'NNP', 'NNPS', 'NNS',
        'PRP', 'PRP$', 'RB', 'RBR', 'SYM', 'TO', 'UH', 'UNKNOWN',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
        'WDT', 'WP', 'WP$', 'WRB', '``',
    ]
    return {tag: idx for idx, tag in enumerate(ordered_tags, start=1)}

  def get_feature(self, s):
    """
    Return one `{'boundaries': (int, int), 'pos_tag_ids': int}` dict per parsed token of `s`.

    Tags missing from the table map to the id of 'UNKNOWN'.
    """
    tag_ids = self.pos_tag_to_id_map
    features = []
    for parsed_token in get_sentence_tokens(s):
      tag = parsed_token['token_pos_tag']
      tag_id = tag_ids[tag] if tag in tag_ids else tag_ids['UNKNOWN']
      features.append({'boundaries': parsed_token['token_boundaries'], self.feature_key: tag_id})
    return features

  def post_processing(self, s1_s2_feature):
    """Replace CLS/SEP token ids found in `s1_s2_feature` by the id of 'NA'.

    NOTE(review): the replacement is keyed on the raw value, so a tag id that
    happens to equal cls_token_id or sep_token_id would be rewritten as well;
    confirm against how the caller builds this list.
    """
    special_ids = self.get_special_token_mapping()
    return [special_ids.get(value, value) for value in s1_s2_feature]

  def get_special_token_mapping(self):
    """Map the tokeniser's CLS and SEP token ids to the id of 'NA'."""
    mapping = {}
    mapping[self._tokeniser.cls_token_id] = self.pos_tag_to_id_map['NA']
    mapping[self._tokeniser.sep_token_id] = self.pos_tag_to_id_map['NA']
    return mapping


class AttentionEnhencerDummyEnrichedTokeniser(BaseEnrichedTokeniser):
  """Builds a neutral attention-enhancer matrix for a sentence pair.

  The matrix holds 1 for every (row, column) pair of non-padding positions
  and 0 wherever a padding position is involved.
  """

  def __init__(self, tokeniser):
    super().__init__(tokeniser)
    self.feature_key = 'attention_enhencer_dummy'
    self.type = TokeniserType.TWO_SENTENCES

  def enrich_tokens(self, s1, s2, padding, truncation, max_length, _config):
    """Return a `(max_length, max_length)` float matrix of ones, zero on padding.

    Args:
      s1, s2: the two sentences, encoded together by the wrapped tokeniser.
      padding, truncation, max_length: forwarded to the tokeniser call.
      _config: unused.
    """
    dummy = self._tokeniser(s1, s2, truncation=truncation, max_length=max_length, padding=padding)

    input_ids = dummy['input_ids']
    pad_token_id = self._tokeniser.pad_token_id
    # A pair that fills (or was truncated to) max_length contains no pad
    # token; list.index would raise ValueError, so treat the whole sequence
    # as real tokens in that case.
    if pad_token_id in input_ids:
      first_padding_0 = input_ids.index(pad_token_id)
    else:
      first_padding_0 = len(input_ids)
    source = torch.full((first_padding_0, first_padding_0), 1.)
    pad_distance = max_length - first_padding_0

    # Grow the square to max_length on the right and bottom with zeros.
    result = F.pad(input=source, pad=(0, pad_distance, 0, pad_distance), mode='constant', value=0.)
    return result

  def get_feature(self, s):
    """Return boundaries, head index and token index for every parsed token of `s`."""
    all_tokens = get_sentence_tokens(s)
    return [
      {
        'boundaries':token['token_boundaries'],
        'token_connection_ids': token['token_connection_ids'],
        'token_id': token['token_id']
      } for token in all_tokens
    ]


class AttentionEnhencerRandEnrichedTokenise(BaseEnrichedTokeniser):
  """Builds a random attention-enhancer matrix for a sentence pair.

  The matrix holds `torch.rand` samples for every (row, column) pair of
  non-padding positions and 0 wherever a padding position is involved.
  """

  def __init__(self, tokeniser):
    super().__init__(tokeniser)
    self.feature_key = 'attention_enhencer_rand'
    self.type = TokeniserType.TWO_SENTENCES

  def enrich_tokens(self, s1, s2, padding, truncation, max_length, _config):
    """Return a `(max_length, max_length)` matrix of random values, zero on padding.

    Args:
      s1, s2: the two sentences, encoded together by the wrapped tokeniser.
      padding, truncation, max_length: forwarded to the tokeniser call.
      _config: unused.
    """
    dummy = self._tokeniser(s1, s2, truncation=truncation, max_length=max_length, padding=padding)

    input_ids = dummy['input_ids']
    pad_token_id = self._tokeniser.pad_token_id
    # A pair that fills (or was truncated to) max_length contains no pad
    # token; list.index would raise ValueError, so treat the whole sequence
    # as real tokens in that case.
    if pad_token_id in input_ids:
      first_padding_0 = input_ids.index(pad_token_id)
    else:
      first_padding_0 = len(input_ids)
    source = torch.rand((first_padding_0, first_padding_0))
    pad_distance = max_length - first_padding_0

    # Grow the square to max_length on the right and bottom with zeros.
    result = F.pad(input=source, pad=(0, pad_distance, 0, pad_distance), mode='constant', value=0.)
    return result

  def get_feature(self, s):
    """Return boundaries, head index and token index for every parsed token of `s`."""
    all_tokens = get_sentence_tokens(s)
    return [
      {
        'boundaries':token['token_boundaries'],
        'token_connection_ids': token['token_connection_ids'],
        'token_id': token['token_id']
      } for token in all_tokens
    ]

class AttentionEnhencerOneEnrichedTokenise(BaseEnrichedTokeniser):
  """Two-sentence tokeniser whose attention-enhancer matrix is constant 1.0.

  The feature is stored under the key ``attention_enhencer``.
  """

  def __init__(self, tokeniser):
    super().__init__(tokeniser)
    self.feature_key = 'attention_enhencer'
    self.type = TokeniserType.TWO_SENTENCES

  def enrich_tokens(self, s1, s2,  padding, truncation, max_length, _config):
    """Return a ``max_length x max_length`` tensor filled with 1.0.

    The sentences and the remaining arguments are ignored: every cell,
    padding positions included, receives the same value.
    """
    side = max_length
    # The matrix stays 2-D here; no per-head dimension is added.
    return torch.full(size=(side, side), fill_value=1.)

class AttentionEnhencerDependancyTreeEnrichedTokenise(BaseEnrichedTokeniser):
  """Two-sentence tokeniser that marks connected token pairs in the enhancer matrix.

  The matrix starts as 1.0 for every pair of non-padding positions and
  ``att_dep_tree_pad_value`` elsewhere. Cells linking a token to the token
  named by its ``token_connection_ids`` feature are then overwritten with
  ``att_dep_tree_value``. The feature is stored under ``attention_enhencer``.
  """

  def __init__(self, tokeniser):
    super().__init__(tokeniser)
    self.feature_key = 'attention_enhencer'
    self.type = TokeniserType.TWO_SENTENCES

  def enrich_tokens(self, s1, s2,  padding, truncation, max_length, config):
    """Build the attention-enhancer matrix for a sentence pair.

    Args:
      s1, s2: the two sentences.
      padding, truncation, max_length: forwarded to the tokeniser;
        ``max_length`` also fixes how far the base block is padded out.
      config: mapping that must contain ``att_dep_tree_value`` (written into
        connected cells) and may contain ``att_dep_tree_pad_value`` (fill for
        the padding region, default 0.0).

    Returns:
      torch.Tensor: the padded square matrix.

    Raises:
      KeyError: if ``att_dep_tree_value`` is missing from ``config``.
    """
    pad_value = config.get('att_dep_tree_pad_value', 0.)
    val = config['att_dep_tree_value']

    data = self._tokeniser(s1, s2, truncation=truncation, max_length=max_length, padding=padding)
    input_ids = data['input_ids']
    pad_id = self._tokeniser.pad_token_id

    # list.index raises ValueError when the pad id is absent, which happens as
    # soon as the pair fills the whole sequence. In that case every position
    # is a real token.
    first_padding_0 = input_ids.index(pad_id) if pad_id in input_ids else len(input_ids)

    source = torch.full((first_padding_0, first_padding_0), 1.)
    pad_distance = max_length - first_padding_0
    base_table = F.pad(input=source, pad=(0, pad_distance, 0, pad_distance), mode='constant', value=pad_value)
    # base_table layout (p = pad_value):
    # [1, ...1, p, ..p]
    # [...............]
    # [1, ...1, p, ..p]
    # [p, ...p, p, ..p]

    # The index arithmetic assumes one leading special token, a single
    # separator between the sentences and one trailing special token
    # immediately before the padding.
    s1_start_inx = 1
    sep_inx = input_ids.index(self._tokeniser.sep_token_id)
    s1_end_inx = sep_inx - 1
    s2_start_inx = sep_inx + 1
    s2_end_inx = first_padding_0 - 2

    s1_tokens = self.get_transformer_sentence_tokens(s1)
    s2_tokens = self.get_transformer_sentence_tokens(s2)

    s1_feature = self.get_feature(s1)
    s2_feature = self.get_feature(s2)

    def boundaries_match(transformer_token_boundary, sentence_feature_boundary):
      # Two spans match when they overlap by at least one unit.
      left = max(transformer_token_boundary[0], sentence_feature_boundary[0])
      right = min(transformer_token_boundary[1], sentence_feature_boundary[1])
      return left < right

    def build_token_map(tokens, features):
      # feature index => indices of the transformer tokens overlapping it
      return {
        i: [
          j for j, token in enumerate(tokens)
          if boundaries_match(token['boundaries'], feature['boundaries'])
        ]
        for i, feature in enumerate(features)
      }

    def update_base_table(table, tokens, features, max_len, offset):
      # Writes `val` symmetrically for every (token, connected token) pair.
      # `max_len` is the number of sentence tokens present in the encoded
      # pair; pairs that reach beyond it are skipped.
      token_map = build_token_map(tokens, features)
      for i, feature in enumerate(features):
        # presumably the index of the token this one is attached to, as
        # produced by get_sentence_tokens -- TODO confirm
        connected_node_id = feature['token_connection_ids']
        for _x in token_map[i]:
          for _y in token_map[connected_node_id]:
            if _x < max_len and _y < max_len:
              table[offset + _x][offset + _y] = val
              table[offset + _y][offset + _x] = val
      return table

    base_table = update_base_table(base_table, s1_tokens, s1_feature, s1_end_inx - s1_start_inx + 1, s1_start_inx)
    base_table = update_base_table(base_table, s2_tokens, s2_feature, s2_end_inx - s2_start_inx + 1, s2_start_inx)

    # The table stays 2-D here; no per-head dimension is added.
    return base_table

  def get_feature(self, s):
    """Return one feature dict per token of ``s``.

    Each token yielded by ``get_sentence_tokens`` is reduced to the keys
    ``boundaries``, ``token_connection_ids`` and ``token_id``.
    """
    all_tokens = get_sentence_tokens(s)
    return [
      {
        'boundaries': token['token_boundaries'],
        'token_connection_ids': token['token_connection_ids'],
        'token_id': token['token_id']
      } for token in all_tokens
    ]

class FinalTokeniser:
  """Merges the base ``input_ids`` with the features of the enabled enriched tokenisers.

  Attributes are set in ``__init__``:
    _tokeniser: the wrapped transformer tokeniser.
    _config: mapping; ``tokeniser_list`` selects the enriched tokenisers and
      the whole mapping is handed to two-sentence tokenisers.
  """

  def __init__(self, tokeniser, config):
    self._tokeniser = tokeniser
    self._config = config

  @staticmethod
  def _prepare_for_model(tokeniser, s1, s2, padding, truncation, max_length):
    """Join two id sequences via ``tokeniser.prepare_for_model`` and return its ``input_ids``."""
    return tokeniser.prepare_for_model(
      s1,
      s2,
      padding=padding,
      truncation=truncation,
      max_length=max_length,
    )['input_ids']

  def apply_tokenisers(self, s1, s2, tokeniser_list, padding, truncation, max_length):
    """Encode the pair and attach one feature per tokeniser in ``tokeniser_list``.

    Args:
      s1, s2: the two sentences.
      tokeniser_list: enriched tokeniser instances, applied in order.
      padding, truncation, max_length: forwarded to ``prepare_for_model`` and
        to two-sentence tokenisers.

    Returns:
      dict with ``input_ids`` plus one entry per tokeniser, keyed by its
      ``feature_key`` (a later tokeniser with the same key overwrites an
      earlier one).
    """
    # Only the ids are needed, so offsets are not requested. This also keeps
    # the call usable with tokenisers that cannot produce offset mappings.
    encoded_base = self._tokeniser.batch_encode_plus([s1, s2], add_special_tokens=False)
    s1_input_ids = encoded_base['input_ids'][0]
    s2_input_ids = encoded_base['input_ids'][1]

    _prepare_for_model = FinalTokeniser._prepare_for_model

    data = {
      'input_ids': _prepare_for_model(self._tokeniser, s1_input_ids, s2_input_ids, padding, truncation, max_length),
    }

    for tokeniser in tokeniser_list:
      if tokeniser.type == TokeniserType.ONE_SENTENCE:
        # Per-sentence features are computed separately and then joined the
        # same way as the ids, so that they line up with them.
        enriched_data_s1 = tokeniser.enrich_tokens(s1)
        enriched_data_s2 = tokeniser.enrich_tokens(s2)

        _s1_input_ids = [x['input_id'] for x in enriched_data_s1]
        _s2_input_ids = [x['input_id'] for x in enriched_data_s2]

        # The enriched tokeniser must have produced exactly the base ids.
        assert(s1_input_ids == _s1_input_ids)
        assert(s2_input_ids == _s2_input_ids)

        s1_feature = [x[tokeniser.feature_key] for x in enriched_data_s1]
        s2_feature = [x[tokeniser.feature_key] for x in enriched_data_s2]

        s1_s2_feature = _prepare_for_model(self._tokeniser, s1_feature, s2_feature,  padding, truncation, max_length)
      else:
        s1_s2_feature = tokeniser.enrich_tokens(s1, s2,  padding, truncation, max_length, self._config)

      data[tokeniser.feature_key] = tokeniser.post_processing(s1_s2_feature)

    return data

  def tokenise_everything(self, s1, s2, padding, truncation, max_length):
    """Instantiate the tokenisers named in the config and apply them to the pair.

    Recognised names in ``config['tokeniser_list']`` are ``pos``,
    ``attention_dummy``, ``attention_dep`` and ``attention_one``; any other
    name is ignored.

    Returns:
      dict as produced by ``apply_tokenisers``.
    """
    # NOTE(review): the fallback ['dep'] matches none of the recognised names,
    # so a config without 'tokeniser_list' yields only 'input_ids'. Kept as-is
    # for backward compatibility -- confirm whether 'attention_dep' was meant.
    tokeniser_list = self._config.get('tokeniser_list', ['dep'])

    # Ordered (name, factory) pairs. The lambdas defer the class lookup until
    # a tokeniser is actually requested.
    factories = (
      ('pos', lambda: PosTagIdEnrichedTokeniser(self._tokeniser)),
      ('attention_dummy', lambda: AttentionEnhencerDummyEnrichedTokeniser(self._tokeniser)),
      ('attention_dep', lambda: AttentionEnhencerDependancyTreeEnrichedTokenise(self._tokeniser)),
      ('attention_one', lambda: AttentionEnhencerOneEnrichedTokenise(self._tokeniser)),
    )
    list_of_tokenisers = [make() for name, make in factories if name in tokeniser_list]

    return self.apply_tokenisers(
      s1,
      s2,
      list_of_tokenisers,
      padding,
      truncation,
      max_length
    )

def preprocess_dataset_final(examples, tokenizer, truncation, max_length, padding, config):
  """Tokenise one sentence pair and add the enriched features to the encoding.

  The pair is encoded directly with ``tokenizer`` and again through
  ``FinalTokeniser``. Both paths must agree on ``input_ids``; every entry of
  the enriched result is then copied into the direct encoding, which is
  returned.
  """
  first, second = examples["sentence1"], examples["sentence2"]

  encoded = tokenizer(first, second, truncation=truncation, max_length=max_length, padding=padding)
  enriched = FinalTokeniser(tokeniser=tokenizer, config=config).tokenise_everything(
    first,
    second,
    padding=padding,
    truncation=truncation,
    max_length=max_length,
  )

  # Guard against the two tokenisation paths drifting apart.
  assert encoded['input_ids'] == enriched['input_ids']

  for name in enriched:
    encoded[name] = enriched[name]
  return encoded

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [11]:
# Load the three GLUE/MRPC splits; DATASET_PART (possibly empty) is appended
# to each split name.
dataset_train, dataset_eval, dataset_test = [
    load_dataset("glue", "mrpc", split=f"{split_name}{DATASET_PART}")
    for split_name in ("train", "validation", "test")
]

# Experiment configuration consumed by the enriched tokenisers.
config = dict(
    att_dep_tree_value=0.8,
    att_dep_tree_pad_value=0.,
    tokeniser_list=['attention_dep'],
)

# Bind everything except the example itself so the result can be passed to Dataset.map.
preprocess_dataset_with_full_v2 = partial(
    preprocess_dataset_final,
    tokenizer=model_tokenizer,
    truncation=TRUNCATION,
    max_length=MAX_LEN,
    padding=PADDING,
    config=config,
)

collator = DataCollatorWithPadding(model_tokenizer)

def prepare_dataloader(dataset, collator, shuffle=False, batch_size=None, preprocess_fn=None):
  """Preprocess a sentence-pair dataset and wrap it in a ``DataLoader``.

  Args:
    dataset: dataset exposing ``map``, ``remove_columns``, ``rename_column``
      and ``set_format``, with the columns ``sentence1``, ``sentence2``,
      ``idx`` and ``label``.
    collator: ``collate_fn`` handed to the ``DataLoader``.
    shuffle: whether the loader reshuffles every epoch. Defaults to False,
      which is what this notebook has always used for all three splits.
      NOTE(review): consider ``shuffle=True`` for the training split.
    batch_size: batch size; defaults to the notebook-level ``BATCH_SIZE``.
    preprocess_fn: per-example function passed to ``dataset.map``; defaults to
      the notebook-level ``preprocess_dataset_with_full_v2``.

  Returns:
    torch.utils.data.DataLoader over the preprocessed dataset.
  """
  if batch_size is None:
    batch_size = BATCH_SIZE
  if preprocess_fn is None:
    preprocess_fn = preprocess_dataset_with_full_v2

  dataset = dataset.map(preprocess_fn, batched=False)
  # Drop the raw text and index columns before the rows reach the collator.
  dataset = dataset.remove_columns(["sentence1", "sentence2", "idx"])
  dataset = dataset.rename_column("label", "labels")
  dataset.set_format("torch")
  return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collator)


# Build one loader per split, in train / eval / test order.
train_dataloader, eval_dataloader, test_dataloader = [
    prepare_dataloader(split_dataset, collator)
    for split_dataset in (dataset_train, dataset_eval, dataset_test)
]

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [12]:
# Sanity check: print the keys of the first batch of every loader.
for loader in (train_dataloader, eval_dataloader, test_dataloader):
  for first_batch in loader:
    print(first_batch.keys())
    break

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'attention_enhencer'])
dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'attention_enhencer'])
dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'attention_enhencer'])


In [13]:
# Echo the experiment configuration so it is recorded in the notebook output.
config


{'att_dep_tree_value': 0.8,
 'att_dep_tree_pad_value': 0.0,
 'tokeniser_list': ['attention_dep']}

In [14]:

# AdamW over all model parameters with the notebook-level learning rate.
optimizer = AdamW(model.parameters(), lr=LR)

# One scheduler step is taken per training batch, so the schedule spans
# every batch of every epoch; the linear schedule uses no warm-up steps.
num_training_steps = len(train_dataloader) * NUM_TRAIN_EPOCHS
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [15]:
def _expand_enhancer_over_heads(batch, num_heads):
  """Broadcast ``batch['attention_enhencer']`` from (B, L, L) to (B, num_heads, L, L)."""
  batch_size, _, seq_len = batch['attention_enhencer'].size()
  # expand() creates a broadcast view, so no per-head copy is made here.
  batch['attention_enhencer'] = batch['attention_enhencer'][:, None, :, :].expand(
    [batch_size, num_heads, seq_len, seq_len]
  )
  return batch


def _evaluate_split(model, dataloader, num_heads, device, split_name):
  """Run ``model`` over ``dataloader`` without gradients and print accuracy and F1."""
  accuracy_metric = evaluate.load("accuracy")
  f1_metric = evaluate.load("f1")
  model.eval()
  for batch in dataloader:
    batch = _expand_enhancer_over_heads(batch, num_heads)
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
      outputs = model(**batch)

    predictions = torch.argmax(outputs.logits, dim=-1)
    accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])
    f1_metric.add_batch(predictions=predictions, references=batch["labels"])

  acc = accuracy_metric.compute()
  f1 = f1_metric.compute()
  print(f"{split_name} accuracy {acc['accuracy']:.4f}")
  print(f"{split_name} F1 {f1['f1']:.4f}")


def train_eval_test(model, optimizer, lr_scheduler,  train_dataloader, eval_dataloader,test_dataloader,  num_train_epochs, num_training_steps, device):
  """Train ``model`` and report eval/test accuracy and F1 after every epoch.

  Args:
    model: model called as ``model(**batch)``; its output must expose
      ``loss`` and ``logits``.
    optimizer, lr_scheduler: stepped once per training batch.
    train_dataloader, eval_dataloader, test_dataloader: loaders whose batches
      contain ``labels`` and a (B, L, L) ``attention_enhencer`` tensor.
    num_train_epochs: number of passes over ``train_dataloader``.
    num_training_steps: total number of training batches; only used as the
      length of the progress bar.
    device: device every batch tensor is moved to.

  Returns:
    None. Metrics are printed.
  """
  # Use the model's own head count when it exposes one; 12 is the value this
  # notebook previously hard-coded.
  model_config = getattr(model, 'config', None)
  number_of_attention_heads = getattr(model_config, 'num_attention_heads', 12)

  progress_bar = tqdm(range(num_training_steps))

  for epoch in range(num_train_epochs):
    print(f"Epoch {epoch}")
    model.train()
    for batch in train_dataloader:
      batch = _expand_enhancer_over_heads(batch, number_of_attention_heads)
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      loss = outputs.loss
      loss.backward()

      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()
      progress_bar.update(1)

    _evaluate_split(model, eval_dataloader, number_of_attention_heads, device, "Eval")
    _evaluate_split(model, test_dataloader, number_of_attention_heads, device, "Test")


# Run the full train / eval / test cycle with the objects built above.
train_eval_test(
  model,
  optimizer,
  lr_scheduler,
  train_dataloader,
  eval_dataloader,
  test_dataloader,
  num_train_epochs=NUM_TRAIN_EPOCHS,
  num_training_steps=num_training_steps,
  device=device,
)

  0%|          | 0/4590 [00:00<?, ?it/s]

Epoch 0
Eval accuracy 0.8848
Eval F1 0.9191
Test accuracy 0.8516
Test F1 0.8947
Epoch 1
Eval accuracy 0.8873
Eval F1 0.9207
Test accuracy 0.8603
Test F1 0.9000
Epoch 2
Eval accuracy 0.9020
Eval F1 0.9293
Test accuracy 0.8754
Test F1 0.9075
Epoch 3
Eval accuracy 0.9044
Eval F1 0.9315
Test accuracy 0.8643
Test F1 0.9019
Epoch 4
Eval accuracy 0.8946
Eval F1 0.9247
Test accuracy 0.8701
Test F1 0.9064
Epoch 5
Eval accuracy 0.8848
Eval F1 0.9156
Test accuracy 0.8771
Test F1 0.9076
Epoch 6
Eval accuracy 0.8922
Eval F1 0.9211
Test accuracy 0.8748
Test F1 0.9064
Epoch 7
Eval accuracy 0.8995
Eval F1 0.9277
Test accuracy 0.8777
Test F1 0.9106
Epoch 8
Eval accuracy 0.8995
Eval F1 0.9277
Test accuracy 0.8852
Test F1 0.9157
Epoch 9
Eval accuracy 0.8995
Eval F1 0.9279
Test accuracy 0.8823
Test F1 0.9146


In [16]:
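# import os
# from huggingface_hub import login

# SECURITY: never commit an access token. The token that was hard-coded here
# has been removed and must be revoked; read it from the environment instead.
# huggingface_token = os.environ["HF_TOKEN"]

# login(token=huggingface_token)
# m = model_name.split('/')[-1]
# model.push_to_hub(f"VitaliiVrublevskyi/mrpc_{m}_base_dummy_attention")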

In [17]:
# Dummy [1, ...1, 000]
# V100
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# Epoch 0
# Eval accuracy 0.8848
# Eval F1 0.9188
# Test accuracy 0.8736
# Test F1 0.9082
# Epoch 1
# Eval accuracy 0.8971
# Eval F1 0.9261
# Test accuracy 0.8875
# Test F1 0.9171

In [18]:
# Rand [r, ...r, 000]
# V100
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# Epoch 0
# Eval accuracy 0.7010
# Eval F1 0.8146
# Test accuracy 0.6846
# Test F1 0.7993
# Epoch 1
# Eval accuracy 0.6838
# Eval F1 0.8122
# Test accuracy 0.6649
# Test F1 0.7987

In [19]:
# Dep Tree 1.2 [1, ...1, 000]
# V100
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# Epoch 0
# Eval accuracy 0.8652
# Eval F1 0.9073
# Test accuracy 0.8435
# Test F1 0.8914
# Epoch 1
# Eval accuracy 0.8922
# Eval F1 0.9239
# Test accuracy 0.8788
# Test F1 0.9119

In [20]:
# Dep Tree 0.8 [1, ...1, 000]
# V100
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# Epoch 0
# Eval accuracy 0.8407
# Eval F1 0.8896
# Test accuracy 0.8174
# Test F1 0.8717
# Epoch 1
# Eval accuracy 0.8603
# Eval F1 0.9016
# Test accuracy 0.8493
# Test F1 0.8906

In [21]:
# Dummy [1, ...1, 000]
# V100
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# Epoch 0
# Eval accuracy 0.8750
# Eval F1 0.9125
# Test accuracy 0.8365
# Test F1 0.8849
# Epoch 1
# Eval accuracy 0.8873
# Eval F1 0.9179
# Test accuracy 0.8713
# Test F1 0.9040
# Epoch 2
# Eval accuracy 0.8922
# Eval F1 0.9241
# Test accuracy 0.8696
# Test F1 0.9067
# Epoch 3
# Eval accuracy 0.8995
# Eval F1 0.9287
# Test accuracy 0.8794
# Test F1 0.9116
# Epoch 4
# Eval accuracy 0.8897
# Eval F1 0.9220
# Test accuracy 0.8777
# Test F1 0.9099
# CommitInfo(commit_url='https://huggingface.co/VitaliiVrublevskyi/mrpc_deberta-base_base_dummy_attention/commit/59064cbb6a87216ca69584729002ab6fa5c2b139', commit_message='Upload DebertaForSequenceClassificationV2', commit_description='', oid='59064cbb6a87216ca69584729002ab6fa5c2b139', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
# Dep Tree 1 [1, ...1, 000]
# V100
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# Epoch 0
# Eval accuracy 0.7010
# Eval F1 0.8201
# Test accuracy 0.6951
# Test F1 0.8124
# Epoch 1
# Eval accuracy 0.6838
# Eval F1 0.8122
# Test accuracy 0.6649
# Test F1 0.7987

In [23]:
# Baseline, empty enhancer
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# NUM_TRAIN_EPOCHS = 10

# Epoch 0
# Eval accuracy 0.8333
# Eval F1 0.8885
# Test accuracy 0.8081
# Test F1 0.8713
# Epoch 1
# Eval accuracy 0.8995
# Eval F1 0.9287
# Test accuracy 0.8713
# Test F1 0.9071
# Epoch 2
# Eval accuracy 0.8922
# Eval F1 0.9225
# Test accuracy 0.8696
# Test F1 0.9041
# Epoch 3
# Eval accuracy 0.9044
# Eval F1 0.9319
# Test accuracy 0.8730
# Test F1 0.9071
# Epoch 4
# Eval accuracy 0.8873
# Eval F1 0.9199
# Test accuracy 0.8754
# Test F1 0.9098
# Epoch 5
# Eval accuracy 0.8873
# Eval F1 0.9190
# Test accuracy 0.8852  <------------------
# Test F1 0.9154 <------------------
# Epoch 6
# Eval accuracy 0.8873
# Eval F1 0.9201
# Test accuracy 0.8748
# Test F1 0.9097
# Epoch 7
# Eval accuracy 0.8603
# Eval F1 0.9052
# Test accuracy 0.8330
# Test F1 0.8850
# Epoch 8
# Eval accuracy 0.9044
# Eval F1 0.9322
# Test accuracy 0.8719
# Test F1 0.9072
# Epoch 9
# Eval accuracy 0.9020
# Eval F1 0.9298
# Test accuracy 0.8736
# Test F1 0.9072

In [24]:
# Dep Enhancer, 1.2 increase, 0 pad
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# NUM_TRAIN_EPOCHS = 10
# Epoch 0
# Eval accuracy 0.6740
# Eval F1 0.7899
# Test accuracy 0.6910
# Test F1 0.7983
# Epoch 1
# Eval accuracy 0.7010
# Eval F1 0.8201
# Test accuracy 0.6916
# Test F1 0.8104
# Epoch 2
# Eval accuracy 0.8627
# Eval F1 0.9076
# Test accuracy 0.8267
# Test F1 0.8808
# Epoch 3
# Eval accuracy 0.8897
# Eval F1 0.9220
# Test accuracy 0.8545
# Test F1 0.8940
# Epoch 4
# Eval accuracy 0.8701
# Eval F1 0.9103
# Test accuracy 0.8452
# Test F1 0.8907
# Epoch 5
# Eval accuracy 0.8775
# Eval F1 0.9161
# Test accuracy 0.8533
# Test F1 0.8970
# Epoch 6
# Eval accuracy 0.8775
# Eval F1 0.9161
# Test accuracy 0.8516
# Test F1 0.8959
# Epoch 7
# Eval accuracy 0.8873
# Eval F1 0.9201
# Test accuracy 0.8701
# Test F1 0.9047
# Epoch 8
# Eval accuracy 0.8897
# Eval F1 0.9220
# Test accuracy 0.8736 <--------------
# Test F1 0.9075 <--------------
# Epoch 9
# Eval accuracy 0.8971
# Eval F1 0.9281
# Test accuracy 0.8667
# Test F1 0.9039

In [25]:
# Dep Enhancer, 1.1 increase, 0 pad
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# NUM_TRAIN_EPOCHS = 10
# Epoch 0
# Eval accuracy 0.6838
# Eval F1 0.8122
# Test accuracy 0.6649
# Test F1 0.7987
# Epoch 1
# Eval accuracy 0.6961
# Eval F1 0.8182
# Test accuracy 0.6829
# Test F1 0.8072
# Epoch 2
# Eval accuracy 0.8750
# Eval F1 0.9140
# Test accuracy 0.8487
# Test F1 0.8933
# Epoch 3
# Eval accuracy 0.8725
# Eval F1 0.9044
# Test accuracy 0.8475
# Test F1 0.8818
# Epoch 4
# Eval accuracy 0.8627
# Eval F1 0.9060
# Test accuracy 0.8464
# Test F1 0.8928
# Epoch 5
# Eval accuracy 0.8848
# Eval F1 0.9180
# Test accuracy 0.8684
# Test F1 0.9039
# Epoch 6
# Eval accuracy 0.8750
# Eval F1 0.9113
# Test accuracy 0.8678
# Test F1 0.9029
# Epoch 7
# Eval accuracy 0.8799
# Eval F1 0.9139
# Test accuracy 0.8707
# Test F1 0.9032
# Epoch 8
# Eval accuracy 0.8897
# Eval F1 0.9212
# Test accuracy 0.8771   <--------------
# Test F1 0.9089         <--------------
# Epoch 9
# Eval accuracy 0.8824
# Eval F1 0.9172
# Test accuracy 0.8719
# Test F1 0.9062

In [26]:
# Dep Enhancer, 0.9 increase, 0 pad
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# NUM_TRAIN_EPOCHS = 10
# Epoch 0
# Eval accuracy 0.8701
# Eval F1 0.9115
# Test accuracy 0.8359
# Test F1 0.8874
# Epoch 1
# Eval accuracy 0.8873
# Eval F1 0.9210
# Test accuracy 0.8742
# Test F1 0.9097
# Epoch 2
# Eval accuracy 0.8824
# Eval F1 0.9134
# Test accuracy 0.8835
# Test F1 0.9127
# Epoch 3
# Eval accuracy 0.8848
# Eval F1 0.9180
# Test accuracy 0.8730
# Test F1 0.9081
# Epoch 4
# Eval accuracy 0.9020
# Eval F1 0.9306
# Test accuracy 0.8794
# Test F1 0.9116
# Epoch 5
# Eval accuracy 0.9167
# Eval F1 0.9401
# Test accuracy 0.8846    <--------------
# Test F1 0.9148          <--------------
# Epoch 6
# Eval accuracy 0.8995
# Eval F1 0.9292
# Test accuracy 0.8788
# Test F1 0.9130
# Epoch 7
# Eval accuracy 0.9020
# Eval F1 0.9306
# Test accuracy 0.8800
# Test F1 0.9130
# Epoch 8
# Eval accuracy 0.9044
# Eval F1 0.9319
# Test accuracy 0.8817
# Test F1 0.9139
# Epoch 9
# Eval accuracy 0.9044
# Eval F1 0.9319
# Test accuracy 0.8817
# Test F1 0.9141

In [None]:
# Dep Enhancer, 0.8 increase, 0 pad
# MAX_LEN = 256
# BATCH_SIZE = 8
# SEED = 42
# NUM_TRAIN_EPOCHS = 10
# Epoch 0
# Eval accuracy 0.8848
# Eval F1 0.9191
# Test accuracy 0.8516
# Test F1 0.8947
# Epoch 1
# Eval accuracy 0.8873
# Eval F1 0.9207
# Test accuracy 0.8603
# Test F1 0.9000
# Epoch 2
# Eval accuracy 0.9020
# Eval F1 0.9293
# Test accuracy 0.8754
# Test F1 0.9075
# Epoch 3
# Eval accuracy 0.9044
# Eval F1 0.9315
# Test accuracy 0.8643
# Test F1 0.9019
# Epoch 4
# Eval accuracy 0.8946
# Eval F1 0.9247
# Test accuracy 0.8701
# Test F1 0.9064
# Epoch 5
# Eval accuracy 0.8848
# Eval F1 0.9156
# Test accuracy 0.8771
# Test F1 0.9076
# Epoch 6
# Eval accuracy 0.8922
# Eval F1 0.9211
# Test accuracy 0.8748
# Test F1 0.9064
# Epoch 7
# Eval accuracy 0.8995
# Eval F1 0.9277
# Test accuracy 0.8777
# Test F1 0.9106
# Epoch 8
# Eval accuracy 0.8995
# Eval F1 0.9277
# Test accuracy 0.8852
# Test F1 0.9157
# Epoch 9
# Eval accuracy 0.8995
# Eval F1 0.9279
# Test accuracy 0.8823
# Test F1 0.9146