In [1]:
# my cool cool imports
import os
import pickle
import random

import matplotlib.pyplot as plt

In [2]:
# default imports
from tqdm import tqdm
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset
import transformers
import sklearn as skl


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# checkpoint to probe (base DeBERTa) and the architecture family it is treated as
MODEL_NAME = "microsoft/deberta-v3-large"
model_type = "encoder"

In [4]:
# Load the tokenizer and the masked-LM checkpoint named by MODEL_NAME.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
model = transformers.AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

# Use the GPU when one is visible; otherwise stay on the CPU instead of
# crashing inside an unconditional `.cuda()` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Last expression of the cell: moves the model and displays its repr.
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForMaskedLM(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=Tr

In [5]:
# BOOLQ dataset
# Load the dataset once and take both splits from the same object, instead of
# calling `load_dataset` a second time for the validation split.
boolq = load_dataset("super_glue", "boolq")
train = boolq["train"]
val = boolq["validation"]

In [6]:
def get_encoder_hidden_states(model, tokenizer, input_text, layer=-1):
    """
    Runs an encoder model over `input_text` and extracts a single hidden-state vector.

    The text is tokenized (with truncation requested), the token ids are moved to
    `model.device`, and the model is called without gradient tracking. From the
    returned hidden-state tuple the entry at index `layer` (by default the last one)
    is selected, and the vector at the final token position of the first sequence
    is returned.

    Returns a numpy array (shape (hidden_dim,), assuming hidden states are laid out
    as (batch, seq_len, hidden_dim))
    """
    # text -> token ids, placed on the same device as the model
    encoding = tokenizer(input_text, truncation=True, return_tensors="pt")
    token_ids = encoding.input_ids.to(model.device)

    # inference only, so no autograd graph is needed
    with torch.no_grad():
        model_out = model(token_ids, output_hidden_states=True)

    # requested layer first, then the last token of batch element 0
    layer_states = model_out["hidden_states"][layer]
    last_token_state = layer_states[0, -1]

    return last_token_state.detach().cpu().numpy()

def get_encoder_decoder_hidden_states(model, tokenizer, input_text, layer=-1):
    """
    Runs an encoder-decoder model and extracts one vector from the encoder side.

    The full `input_text` is tokenized for the encoder, while the decoder is fed the
    tokenization of the empty string. The model is called without gradient tracking,
    and the encoder hidden states at index `layer` (by default the last) are read at
    the final token position of the first sequence.

    Returns a numpy array (shape (hidden_dim,), assuming hidden states are laid out
    as (batch, seq_len, hidden_dim))
    """
    target_device = model.device

    # encoder sees the whole text; decoder only gets the tokenized empty string
    encoder_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(target_device)
    decoder_ids = tokenizer("", return_tensors="pt").input_ids.to(target_device)

    # inference only, so no autograd graph is needed
    with torch.no_grad():
        model_out = model(
            encoder_ids,
            decoder_input_ids=decoder_ids,
            output_hidden_states=True,
        )

    # requested encoder layer, then the last token of batch element 0
    layer_states = model_out["encoder_hidden_states"][layer]
    last_token_state = layer_states[0, -1]

    return last_token_state.detach().cpu().numpy()

def get_decoder_hidden_states(model, tokenizer, input_text, layer=-1):
    """
    Runs a decoder model over `input_text` and extracts a single hidden-state vector.

    The tokenizer's EOS token is appended to the text before tokenization. The model
    is called without gradient tracking, and the hidden states at index `layer`
    (by default the last) are read at the final token position of the first sequence.

    Returns a numpy array (shape (hidden_dim,), assuming hidden states are laid out
    as (batch, seq_len, hidden_dim))
    """
    # the EOS token is appended so that the final position read below is the EOS token
    text_with_eos = input_text + tokenizer.eos_token
    token_ids = tokenizer(text_with_eos, return_tensors="pt").input_ids.to(model.device)

    # inference only, so no autograd graph is needed
    with torch.no_grad():
        model_out = model(token_ids, output_hidden_states=True)

    # requested layer, then the last token of batch element 0
    layer_states = model_out["hidden_states"][layer]
    last_token_state = layer_states[0, -1]

    return last_token_state.detach().cpu().numpy()

def get_hidden_states(model, tokenizer, input_text, layer=-1, model_type="encoder"):
    """
    Dispatches to the hidden-state extractor that matches `model_type`.

    `model_type` must be one of "encoder", "encoder_decoder" or "decoder"; any other
    value raises a KeyError from the lookup table. The remaining arguments are passed
    through to the selected extractor, whose return value is returned unchanged.
    """
    extractors = {
        "encoder": get_encoder_hidden_states,
        "encoder_decoder": get_encoder_decoder_hidden_states,
        "decoder": get_decoder_hidden_states,
    }
    extract = extractors[model_type]

    return extract(model, tokenizer, input_text, layer=layer)

In [7]:
# specifies embedding fn
# `model`, `tokenizer` and `model_type` are captured as default arguments, i.e. at
# definition time. A plain lambda would look the globals up on every call, so a later
# cell that rebinds `model` (the classifier cell does) would silently change which
# object is used to embed text.
def EMBEDDING_FN(input_text, _model=model, _tokenizer=tokenizer, _model_type=model_type):
    """
    Returns the hidden-state vector that `get_hidden_states` computes for `input_text`.

    The underscore-prefixed parameters only exist to bind the language model,
    tokenizer and configured model type; callers pass `input_text` alone.
    """
    return get_hidden_states(_model, _tokenizer, input_text, model_type=_model_type)

## Now let's write code for formatting data and for getting all the hidden states.

In [8]:
# formats an individual example
def format_boolq(passage, question, label):
    """
    Renders one example as prompt text.

    The result starts with a newline, followed by a PASSAGE line, a QUESTION line
    and an ANSWER line (each newline-terminated), with the three arguments
    interpolated as given.
    """
    prompt = f"""
PASSAGE: {passage}
QUESTION: {question}
ANSWER: {label}
"""
    return prompt

In [9]:
# featurizes the entire dataset
def featurizer_benchmark(data):
    """
    Builds one contrast feature per example of a split.

    `data` must support `data["passage"]` and `data["question"]`, each giving a sized,
    iterable column; both columns must have the same length.

    For every (passage, question) pair the prompt is rendered twice with
    `format_boolq`, once with label 1 and once with label 0, and the feature is
    EMBEDDING_FN(label-1 prompt) - EMBEDDING_FN(label-0 prompt).

    Returns a list with one feature per example, in dataset order.
    Raises ValueError if the two columns differ in length.
    """

    # featurizes an individual example
    def featurizer(passage, question):
        return EMBEDDING_FN(format_boolq(passage, question, 1)) - EMBEDDING_FN(format_boolq(passage, question, 0))

    # Look each column up once: doing it inside the loop repeats loop-invariant work
    # (and, depending on the container, may rebuild the whole column every time).
    passages = data["passage"]
    questions = data["question"]
    if len(passages) != len(questions):
        raise ValueError("'passage' and 'question' columns must have the same length")

    features = []
    # zip() has no length, so the total is passed for tqdm's progress bar
    for passage, question in tqdm(zip(passages, questions), total=len(passages)):
        features.append(featurizer(passage, question))

    return features

In [10]:
# specifies featurizer fn: the dataset-level featurizer that is applied to each split
FEATURIZER_FN = featurizer_benchmark

In [11]:
# featurizes train, val
# (slow cell: the recorded run below took about 25 min for train and 8 min for val)
X_train = FEATURIZER_FN(train)
X_val = FEATURIZER_FN(val)

  0%|          | 0/9427 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


100%|██████████| 9427/9427 [24:35<00:00,  6.39it/s]
100%|██████████| 3270/3270 [07:57<00:00,  6.84it/s]


In [14]:
# pulls the "label" column out of each split
y_train, y_val = train["label"], val["label"]

In [15]:
# data storage: where the featurized splits and their labels are cached as a pickle
FILENAME = "boolq-deberta-large.pkl"
FILEPATH = f"data/{FILENAME}"  # relative path, resolved against the current working directory

In [16]:
# store everything in pkl!
# Create the target directory first: otherwise open() raises FileNotFoundError when
# it is missing, and the features computed by the long featurization run are not saved.
os.makedirs(os.path.dirname(FILEPATH) or ".", exist_ok=True)
with open(FILEPATH, "wb") as f:
    pickle.dump((X_train, y_train, X_val, y_val), f)

In [7]:
# reload the cached (X_train, y_train, X_val, y_val) tuple from the pickle
# NOTE: unpickling can execute arbitrary code, so only load cache files that were
# written by this notebook (or come from a source you trust)
with open(FILEPATH, mode="rb") as cache_file:
    X_train, y_train, X_val, y_val = pickle.load(cache_file)

In [34]:
# defines model
# NOTE: this rebinds `model`, which up to this point referred to the DeBERTa network;
# from here on `model` is the scikit-learn classifier trained on the features.
# The MLP is seeded (random_state) so the reported scores are reproducible across runs.
# Alternatives that can be swapped in:
# model = skl.linear_model.LogisticRegression()
# model = skl.svm.SVC()
model = skl.neural_network.MLPClassifier(hidden_layer_sizes=(100, 100, 100), random_state=0)
# model = skl.neighbors.KNeighborsClassifier(n_neighbors=100)
# model  = skl.ensemble.RandomForestClassifier(n_estimators=100)

In [35]:
# trains on train: fits the classifier on the featurized training split and its labels
model.fit(X_train, y_train)

In [36]:
# classifier score on each split, rounded to two decimals
train_score = model.score(X_train, y_train)
print("train score:", round(train_score, 2))

val_score = model.score(X_val, y_val)
print("val score:", round(val_score, 2))

train score: 0.96
val score: 0.76
