In [1]:
# my cool cool imports
import matplotlib.pyplot as plt
import random
import pickle

In [2]:
# default imports
from tqdm import tqdm
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# BOOLQ dataset
train = load_dataset("super_glue", "boolq")["train"]
val = load_dataset("super_glue", "boolq")["validation"]

In [4]:
# base deberta
model_type = "encoder_decoder"
MODEL_NAME = "allenai/unifiedqa-t5-large"

In [5]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.cuda()

Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 24.1MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.79k/1.79k [00:00<00:00, 7.26MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 25.0/25.0 [00:00<00:00, 71.9kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.24k/1.24k [00:00<00:00, 8.58MB/s]
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Downloading pytorch_model.bin: 100%|██████████| 2.95G/2.95G [01:42<00:00, 28.7MB/s]
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 1.07MB/s]


T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [6]:
def get_encoder_hidden_states(model, tokenizer, input_text, layer=-1):
    """
    Given an encoder model and some text, gets the encoder hidden states (in a given layer, by default the last) 
    on that input text (where the full text is given to the encoder).

    Returns a numpy array of shape (hidden_dim,)
    """
    # tokenize
    encoder_text_ids = tokenizer(input_text, truncation=True, return_tensors="pt").input_ids.to(model.device)

    # forward pass
    with torch.no_grad():
        output = model(encoder_text_ids, output_hidden_states=True)

    # get the appropriate hidden states
    hs_tuple = output["hidden_states"]
    
    hs = hs_tuple[layer][0, -1].detach().cpu().numpy()

    return hs

def get_encoder_decoder_hidden_states(model, tokenizer, input_text, layer=-1):
    """
    Given an encoder-decoder model and some text, gets the encoder hidden states (in a given layer, by default the last) 
    on that input text (where the full text is given to the encoder).

    Returns a numpy array of shape (hidden_dim,)
    """
    # tokenize
    encoder_text_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
    decoder_text_ids = tokenizer("", return_tensors="pt").input_ids.to(model.device)

    # forward pass
    with torch.no_grad():
        output = model(encoder_text_ids, decoder_input_ids=decoder_text_ids, output_hidden_states=True)

    # get the appropriate hidden states
    hs_tuple = output["encoder_hidden_states"]
    hs = hs_tuple[layer][0, -1].detach().cpu().numpy()

    return hs

def get_decoder_hidden_states(model, tokenizer, input_text, layer=-1):
    """
    Given a decoder model and some text, gets the hidden states (in a given layer, by default the last) on that input text

    Returns a numpy array of shape (hidden_dim,)
    """
    # tokenize (adding the EOS token this time)
    input_ids = tokenizer(input_text + tokenizer.eos_token, return_tensors="pt").input_ids.to(model.device)

    # forward pass
    with torch.no_grad():
        output = model(input_ids, output_hidden_states=True)

    # get the last layer, last token hidden states
    hs_tuple = output["hidden_states"]
    hs = hs_tuple[layer][0, -1].detach().cpu().numpy()

    return hs

def get_hidden_states(model, tokenizer, input_text, layer=-1):
    fn = {"encoder": get_encoder_hidden_states, "encoder_decoder": get_encoder_decoder_hidden_states,
          "decoder": get_decoder_hidden_states}[model_type]

    return fn(model, tokenizer, input_text, layer=layer)

In [7]:
# specifies embedding fn
# default model_type = "encoder" is fine, since we use deberta
EMBEDDING_FN = lambda input_text: get_hidden_states(model, tokenizer, input_text)

## Now let's write code for formatting data and for getting all the hidden states.

In [8]:
# formats an individual exaple
def format_boolq(passage, question, label):

    return f"""
PASSAGE: {passage}
QUESTION: {question}
ANSWER: {label}
"""

In [9]:
# featurizes the entire dataset
def featurizer_benchmark(data):

    # featurizes an individual example
    def featurizer(passage, question):
        return EMBEDDING_FN(format_boolq(passage, question, 1)) - EMBEDDING_FN(format_boolq(passage, question, 0))

    # featurizes the entire dataset
    features = []
    for i in tqdm(range(len(data["passage"]))):
        passage = data["passage"][i]
        question = data["question"][i]
        features.append(featurizer(passage, question))

    return features

In [10]:
# specifies featurizer fn
FEATURIZER_FN = featurizer_benchmark

In [11]:
# featurizes train, test
X_train = FEATURIZER_FN(train)
X_val = FEATURIZER_FN(val)

  2%|▏         | 203/9427 [00:22<16:24,  9.37it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1027 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 9427/9427 [19:11<00:00,  8.19it/s]
100%|██████████| 3270/3270 [06:03<00:00,  8.99it/s]


In [12]:
# extracts labels
y_train = train["label"]
y_val = val["label"]

In [None]:
# data storage
FILENAME = "boolq-uqa-large.pkl"
FILEPATH = f"data/{FILENAME}"

In [17]:
# store everything in pkl!
with open(FILEPATH, "wb") as f:
    pickle.dump((X_train, y_train, X_val, y_val), f)

In [11]:
# load everything from pkl!
with open(FILEPATH, "rb") as f:
    X_train, y_train, X_val, y_val = pickle.load(f)

In [35]:
# defines model
# model = LogisticRegression()
# model = SVC()
# model = MLPClassifier(hidden_layer_sizes=(100, 100, 100))
# model = KNeighborsClassifier(n_neighbors=100)
# model  = RandomForestClassifier(n_estimators=100)

In [36]:
# trains on train
model.fit(X_train, y_train)

In [37]:
# model score on train, val
print("train score:", round(model.score(X_train, y_train), 2))
print("val score:", round(model.score(X_val, y_val), 2))

train score: 1.0
val score: 0.76


In [25]:
# RANDOM FOREST CLASSIFIER
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# k nearest neighbors
from sklearn.neighbors import KNeighborsClassifier