## Let's implement CCS from scratch.
This will deliberately be a simple (but less efficient) implementation to make everything as clear as possible.

In [3]:
from tqdm import tqdm
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM
from sklearn.linear_model import LogisticRegression

# Evaluation data: the test split of the amazon_polarity dataset
# (the review text is read from its "content" column and the label from "label" further down).
data = load_dataset("amazon_polarity")["test"]

# Here are a few different model options you can play around with:
model_name = "deberta"
# model_name = "gpt-j"
# model_name = "t5"

# if you want to cache the model weights somewhere, you can specify that here
cache_dir = None

# Each branch defines `model_type`, `tokenizer` and `model`; `model_type` selects
# which hidden-state extraction function is appropriate for the loaded model.
if model_name == "deberta":
    model_type = "encoder"
    tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v2-xxlarge", cache_dir=cache_dir)
    model = AutoModelForMaskedLM.from_pretrained("microsoft/deberta-v2-xxlarge", cache_dir=cache_dir)
    model.cuda()
elif model_name == "gpt-j":
    model_type = "decoder"
    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", cache_dir=cache_dir)
    model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", cache_dir=cache_dir)
    model.cuda()
elif model_name == "t5":
    model_type = "encoder_decoder"
    tokenizer = AutoTokenizer.from_pretrained("t5-11b", cache_dir=cache_dir)
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-11b", cache_dir=cache_dir)
    model.parallelize()  # T5 is big enough that we may need to run it on multiple GPUs
else:
    # Fail fast: merely printing would leave `model`, `tokenizer` and `model_type`
    # undefined and the notebook would only crash later with a confusing NameError.
    raise NotImplementedError(
        "Not implemented! Unknown model_name: {!r} (expected 'deberta', 'gpt-j' or 't5')".format(model_name)
    )

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-v2-xxlarge and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## First let's write code for extracting hidden states given a model and text. 
How we do this exactly will depend on the type of model.

In [4]:
def get_encoder_hidden_states(model, tokenizer, input_text, layer=-1):
    """
    Embed `input_text` with an encoder model.

    The whole text is tokenized (with truncation enabled) and passed through the
    model once; the hidden state of the final token position in the requested
    layer (the last layer by default) is returned.

    Returns a numpy array of shape (hidden_dim,)
    """
    # text -> token ids, placed on the same device as the model
    encoded = tokenizer(input_text, truncation=True, return_tensors="pt")
    token_ids = encoded.input_ids.to(model.device)

    # inference only, so no gradients are tracked
    with torch.no_grad():
        forward_out = model(token_ids, output_hidden_states=True)

    # pick the layer, then batch element 0, last token position
    layer_states = forward_out["hidden_states"][layer]
    last_token_state = layer_states[0, -1]

    return last_token_state.detach().cpu().numpy()

def get_encoder_decoder_hidden_states(model, tokenizer, input_text, layer=-1):
    """
    Embed `input_text` with the encoder half of an encoder-decoder model.

    The full text goes to the encoder while the decoder is fed the tokenization
    of the empty string. The encoder hidden state of the final token position in
    the requested layer (the last layer by default) is returned.

    Returns a numpy array of shape (hidden_dim,)
    """
    # encoder sees the real text; decoder sees the tokenized empty string
    encoder_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
    decoder_ids = tokenizer("", return_tensors="pt").input_ids.to(model.device)

    # inference only, so no gradients are tracked
    with torch.no_grad():
        forward_out = model(encoder_ids, decoder_input_ids=decoder_ids, output_hidden_states=True)

    # encoder-side states: chosen layer, batch element 0, last token position
    layer_states = forward_out["encoder_hidden_states"][layer]
    last_token_state = layer_states[0, -1]

    return last_token_state.detach().cpu().numpy()

def get_decoder_hidden_states(model, tokenizer, input_text, layer=-1):
    """
    Embed `input_text` with a decoder model.

    The tokenizer's EOS token is appended to the text before tokenizing, and the
    hidden state of the final token position in the requested layer (the last
    layer by default) is returned.

    Returns a numpy array of shape (hidden_dim,)
    """
    # append EOS so the final position read below corresponds to the end of the appended text
    text_with_eos = input_text + tokenizer.eos_token
    token_ids = tokenizer(text_with_eos, return_tensors="pt").input_ids.to(model.device)

    # inference only, so no gradients are tracked
    with torch.no_grad():
        forward_out = model(token_ids, output_hidden_states=True)

    # chosen layer, batch element 0, last token position
    layer_states = forward_out["hidden_states"][layer]
    last_token_state = layer_states[0, -1]

    return last_token_state.detach().cpu().numpy()

def get_hidden_states(model, tokenizer, input_text, layer=-1, model_type="encoder"):
    """
    Dispatch to the hidden-state extraction function that matches `model_type`.

    `model_type` must be one of "encoder", "encoder_decoder" or "decoder";
    any other value raises a KeyError from the lookup table below.

    Returns whatever the selected extraction function returns for
    (model, tokenizer, input_text, layer).
    """
    extractors = {
        "encoder": get_encoder_hidden_states,
        "encoder_decoder": get_encoder_decoder_hidden_states,
        "decoder": get_decoder_hidden_states,
    }
    extract = extractors[model_type]

    return extract(model, tokenizer, input_text, layer=layer)

In [5]:
# specifies embedding fn: maps a piece of text to its hidden-state vector
# model_type is forwarded explicitly so that switching model_name in the setup cell
# (e.g. to "gpt-j" or "t5") uses the matching extraction function instead of
# silently falling back to the "encoder" default; for deberta this is unchanged.
EMBEDDING_FN = lambda input_text: get_hidden_states(model, tokenizer, input_text, model_type=model_type)

## Now let's write code for formatting data and for getting all the hidden states.

In [6]:
# prompt scaffolding for IMDB dataset
# included in base CCS implementation
def format_imdb(text, label):
    """
    Build a zero-shot sentiment prompt for one example.

    `text` is the review text and `label` selects the sentiment word placed in
    the prompt (0 -> "negative", 1 -> "positive"). The returned string is a
    one-line statement of that sentiment followed by a newline and the text.

    (This is just one example of a simple, manually created prompt.)
    """
    sentiment = ["negative", "positive"][label]
    header = "The following movie review expresses a " + sentiment + " sentiment:\n"
    return header + text

In [7]:
# featurizer for supervised CCS
def featurizer_benchmark(text):
    """
    Featurize `text` as a contrast between its two prompted embeddings.

    The text is formatted once with the positive label (1) and once with the
    negative label (0); the result is the positive-prompt embedding minus the
    negative-prompt embedding.
    """
    positive_embedding = EMBEDDING_FN(format_imdb(text, 1))
    negative_embedding = EMBEDDING_FN(format_imdb(text, 0))
    return positive_embedding - negative_embedding

In [8]:
# specifies featurizer fn: the callable applied to each raw text below to
# produce that example's feature vector
FEATURIZER_FN = featurizer_benchmark

In [9]:
# subsamples dataset: draws N distinct example indices without replacement
N = 100
# A fixed seed makes the drawn subsample (and every number computed from it below)
# reproducible under Restart Kernel -> Run All.
SEED = 0
rng = np.random.default_rng(SEED)
subsample = data[rng.choice(len(data), N, replace=False)]

In [10]:
# featurizes subsample
# X stacks one FEATURIZER_FN output per sampled text (one row per example);
# each call runs the model, so this is the slow cell and tqdm shows progress.
X = np.array([FEATURIZER_FN(text) for text in tqdm(subsample["content"])])
# y holds the dataset labels of the same examples, in the same order as X's rows
y = np.array(subsample["label"])

In [11]:
# gets difference of means
# exgy  = mean feature vector over examples with y == 1  (E[x | y = 1])
exgy = X[y == 1].mean(axis = 0)
# exgny = mean feature vector over examples with y == 0  (E[x | y = 0])
exgny = X[y == 0].mean(axis = 0)
# diff is the direction pointing from the y == 0 mean to the y == 1 mean
diff = exgy - exgny

# Metrics
- salience (proportion of the total variance of X that lies along the diff direction)
<!-- - power (proportion of y explained by diff) -->
- accuracy (proportion of correct predictions)

In [12]:
# gets covariance matrix of X
# np.cov treats each row as a variable, so X is transposed to obtain the
# feature-by-feature covariance
cmx = np.cov(X.T)

# gives salience as the Rayleigh quotient of cmx along diff, normalized by the trace of cmx
# (if diff were an eigenvector of cmx this would be = e.val / sum e.vals)
print("salience: ", np.round(diff.T @ cmx @ diff / diff.dot(diff) / np.trace(cmx), 2))

salience:  0.07


In [29]:
# overall mean x (mean feature vector over all examples)
ex = X.mean(axis = 0)

# gets vector of diff scores: each centered example projected onto diff
# (diff is not normalized, so scores are scaled by its length)
X_diffs = (X - ex) @ diff

# gets sign of diff scores (+1 / -1; np.sign gives 0 for a score of exactly 0)
X_signs = np.sign(X_diffs)

In [None]:
# summarizes diff scores for positive and negative examples
# (This cell previously contained only an unfinished `import matn`, which raises
#  ModuleNotFoundError on a fresh Run All. No plotting library is imported by this
#  notebook, so a per-class text summary built with numpy is printed instead.)
for label_value, label_name in ((0, "negative"), (1, "positive")):
    class_scores = X_diffs[y == label_value]
    print("{}: n={}, mean={:.3f}, std={:.3f}".format(
        label_name, len(class_scores), class_scores.mean(), class_scores.std()))

In [35]:
# converts y to (1, -1) vector (label 0 -> -1, anything else -> +1)
y_signs = np.where(y == 0, -1, 1)

# gets Pr[correct]
# X_signs.dot(y_signs) is (#agreements - #disagreements); dividing by the number of
# examples and mapping [-1, 1] -> [0, 1] gives the fraction of agreements.
# An example whose sign is exactly 0 contributes nothing to the dot product and so
# counts as half correct.
pr_correct = (X_signs.dot(y_signs) / len(y_signs) + 1) / 2
print("classifier accuracy: ", pr_correct)

classifier accuracy:  0.7


In [38]:
# alternatively, just gets sklearn to do it
# NOTE: the model is scored on the same X, y it was fit on, so the printed value is
# in-sample (training) accuracy, not accuracy on held-out examples.
lr = LogisticRegression(class_weight='balanced')
lr.fit(X, y)
print("Logistic regression accuracy: {}".format(lr.score(X, y)))

Logistic regression accuracy: 0.99
