In [1]:
import matplotlib.pyplot as plt
import numpy as np  

In [2]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, GPT2LMHeadModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the pretrained "gpt2" checkpoint and the tokenizer that belongs to it.
# The two loads are independent of each other.
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

### Get unembedding matrix and its pseudo-inverse

In [4]:
# The weight of the model's output-embedding layer is the "unembedding" matrix
# used below to turn hidden states into vocabulary logits. It is detached from
# the autograd graph because it is only used for analysis.
# TODO: find where `get_output_embeddings` is documented, and check that this
# works for model families other than GPT-2.
lm_head = model.get_output_embeddings()
unembed = lm_head.weight.detach()

In [5]:
# Unpack the two dimensions of the unembedding matrix and show them as the
# cell's output.
vocab_size, hidden_dim = unembed.size()
(vocab_size, hidden_dim)

(50257, 768)

In [52]:
# Sanity check: projecting the last hidden state through the unembedding matrix
# by hand must reproduce the logits the model returns.
test = 'Tom and Lily were playing with their toys in the living room. They liked to build towers and bridges with their blocks and cars. Tom was very proud of his tall tower. '
inputs = tokenizer(test, return_tensors="pt")
outputs = model(
    **inputs,
    labels=inputs["input_ids"],
    output_hidden_states=True,
)

# Logits as computed by the model vs. logits rebuilt from the last hidden state.
test_logits = outputs.logits.detach()
explicit_logits = torch.matmul(outputs.hidden_states[-1], unembed.T)

assert torch.allclose(test_logits, explicit_logits)

## Decoding earlier layers

In [58]:
# Compare three next-token distributions around position `idx`: one decoded
# from an intermediate hidden state, and two taken from the model's final
# logits (at the previous and at the current position).
# NOTE(review): despite its name, `l0` is decoded from hidden_states[3], not
# hidden_states[0] -- confirm which layer was intended.
idx = 12
l0 = torch.softmax(outputs.hidden_states[3] @ unembed.T, dim=-1)[0, idx]
lfin_prev = torch.softmax(test_logits[0, idx - 1], dim=-1)
lfin = torch.softmax(test_logits[0, idx], dim=-1)

# Top-3 probabilities and token ids of each distribution, shown as cell output.
l0.topk(3), lfin_prev.topk(3), lfin.topk(3)

(torch.return_types.topk(
 values=tensor([0.9272, 0.0592, 0.0056], grad_fn=<TopkBackward0>),
 indices=tensor([198, 383, 262])),
 torch.return_types.topk(
 values=tensor([0.2535, 0.1819, 0.1436]),
 indices=tensor([ 13,  11, 618])),
 torch.return_types.topk(
 values=tensor([0.1807, 0.1396, 0.0722]),
 indices=tensor([  198, 20037,  1119])))

In [87]:
def layer_predictions(outputs, unembed, token_idx, layers, k=3, tokenizer=None):
    """
    Print the top-k token predictions decoded from several hidden states.

    For every requested entry of ``outputs.hidden_states`` the hidden vector at
    ``[0, token_idx]`` is projected through ``unembed``, soft-maxed, and its
    ``k`` most probable tokens are printed. The same is printed for the last
    hidden state ("Final"). If layer 0 is among ``layers``, the line after it
    shows the last hidden state's top-k at ``token_idx - 1`` ("F Prev Tok").

    NOTE(review): intermediate hidden states are projected as-is. If the model
    applies a final normalisation only to its last hidden state (HF GPT-2 is
    understood to do so -- confirm), intermediate rows are not directly
    comparable with the "Final" row.

    Args:
        outputs: Model output exposing ``hidden_states``, a sequence of tensors
            indexed as ``[batch, position, hidden]``. Only batch element 0 is used.
        unembed (torch.Tensor): The detached weights of the output embeddings,
            used as ``hidden @ unembed.T``.
        token_idx (int): The position to analyze. Negative values count from
            the end of the sequence, as in normal Python indexing.
        layers (List[int]): Indices into ``outputs.hidden_states`` to decode.
        k (int): The number of top predictions to print per row.
        tokenizer: Object providing ``convert_ids_to_tokens``. When omitted, a
            module-level variable named ``tokenizer`` is used, so existing
            notebook calls keep working.

    Raises:
        ValueError: If an entry of ``layers`` is not a valid index into
            ``outputs.hidden_states``.
        IndexError: If ``token_idx`` is outside the sequence.
        NameError: If no tokenizer is passed and no global ``tokenizer`` exists.
    """
    if tokenizer is None:
        # Backward-compatible fallback: the original version silently relied on
        # a notebook-level `tokenizer` variable.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError(
                "layer_predictions needs a tokenizer: pass `tokenizer=` or "
                "define a global `tokenizer`."
            )

    hidden_states = outputs.hidden_states
    num_hidden_states = len(hidden_states)

    # Check if all layers are within range
    for layer in layers:
        if layer >= num_hidden_states or layer < 0:
            raise ValueError(
                f"Layer {layer} is out of bounds: outputs has {num_hidden_states} "
                f"hidden states (valid indices 0..{num_hidden_states - 1})."
            )

    # Validate and normalise the position so that a negative index neither wraps
    # silently nor gets misreported as "the first token in the sequence".
    seq_len = hidden_states[-1].shape[1]
    if not -seq_len <= token_idx < seq_len:
        raise IndexError(
            f"token_idx {token_idx} is out of range for a sequence of length {seq_len}."
        )
    if token_idx < 0:
        token_idx += seq_len

    unembed_t = unembed.T

    def top_k_at(layer, position):
        # Project only the one position that is needed instead of soft-maxing
        # the whole (sequence x vocabulary) matrix; no gradients are required.
        with torch.no_grad():
            probabilities = F.softmax(hidden_states[layer][0, position] @ unembed_t, dim=-1)
        return torch.topk(probabilities, k)

    # Formatting function
    def format_topk(topk):
        tokens = tokenizer.convert_ids_to_tokens(topk.indices.tolist())
        probabilities = topk.values.tolist()
        return ', '.join([f"{token:10}: {prob:.4f}" for token, prob in zip(tokens, probabilities)])

    # Top-k of the last hidden state at the current and at the previous position
    topk_final_current = top_k_at(-1, token_idx)
    topk_final_previous = top_k_at(-1, token_idx - 1) if token_idx > 0 else None

    print(f'Token Index {token_idx} Top {k}')
    # Process each requested layer
    for layer in layers:
        print(f"Layer {layer:3}:  {format_topk(top_k_at(layer, token_idx))}")
        if layer == 0:
            # Shown right after layer 0 so the previous position's final
            # prediction can be read next to the earliest representation.
            if topk_final_previous is not None:
                print(f"F Prev Tok: {format_topk(topk_final_previous)}")
            else:
                print("No predictions for the previous token: It is the first token in the sequence.")

    # Format and print the final layer predictions
    print(f"\nFinal:      {format_topk(topk_final_current)}")







In [None]:

# Inputs for the per-layer inspection below: the unembedding matrix, the
# tokenized sentence, and a forward pass that keeps every hidden state.
unembed = model.get_output_embeddings().weight.detach()

test_sentence = "Tom and Lily were playing with their toys in the living room. They liked to build towers and bridges with their blocks and cars."
inputs = tokenizer(test_sentence, return_tensors="pt")
outputs = model(
    **inputs,
    labels=inputs["input_ids"],
    output_hidden_states=True,
)




In [107]:
# Position in the tokenized sentence to inspect. The inspection cell increments
# this after every run, so re-run this cell to start again from the first token.
token_index_of_interest = 0

In [123]:
# Show the token at the current position, the token the model should predict
# next, a little surrounding context, and the per-layer predictions.
print(f'current token: {tokenizer.decode(inputs["input_ids"][0, token_index_of_interest])}')
if token_index_of_interest + 1 < inputs["input_ids"].shape[1]:
    print(f'target  token: {tokenizer.decode(inputs["input_ids"][0, token_index_of_interest+1])}')
else:
    # The last position has no following token to use as a target.
    print('target  token: <none: last token of the sequence>')

# Clamp the slice start at 0: at position 0 a start of -1 would wrap around to
# the end of the sequence and produce an empty context.
context_start = max(token_index_of_interest - 1, 0)
relevant_part = inputs["input_ids"][0, context_start: token_index_of_interest + 2]
print(tokenizer.decode(relevant_part.tolist()))
print('\n')

# Every hidden state except the last one, which layer_predictions prints
# separately as "Final" (this is range(12) for the "gpt2" checkpoint).
layers_of_interest = list(range(len(outputs.hidden_states) - 1))

# Get the layer predictions
layer_predictions(outputs, unembed, token_index_of_interest, layers_of_interest)

# Advance to the next position so that re-running this cell steps through the
# sentence. NOTE: this makes the cell intentionally non-idempotent.
token_index_of_interest +=1

current token:  to
target  token:  build
 liked to build


Token Index 15 Top 3
Layer   0:  Ġto       : 0.0036, to        : 0.0007, Ġin       : 0.0006
F Prev Tok: Ġthe      : 0.2205, Ġto       : 0.1781, Ġit       : 0.0948
Layer   1:  Ġthe      : 0.9984, Ġa        : 0.0011, Ġ"        : 0.0003
Layer   2:  Ġthe      : 0.9856, Ġbe       : 0.0101, Ġget      : 0.0012
Layer   3:  Ġthe      : 0.7298, Ġbe       : 0.1872, Ġmake     : 0.0454
Layer   4:  Ġthe      : 0.3626, Ġbe       : 0.3429, Ġmake     : 0.1533
Layer   5:  Ġbe       : 0.6678, Ġmake     : 0.2545, Ġkeep     : 0.0304
Layer   6:  Ġbe       : 0.7680, Ġmake     : 0.1996, Ġuse      : 0.0101
Layer   7:  Ġbe       : 0.7565, Ġmake     : 0.1565, Ġdo       : 0.0315
Layer   8:  Ġbe       : 0.5986, Ġdo       : 0.2438, Ġthe      : 0.0481
Layer   9:  Ġthe      : 0.9324, ,         : 0.0663, Ġ"        : 0.0008
Layer  10:  Ġthe      : 0.9999, ,         : 0.0001, Ġuse      : 0.0000
Layer  11:  Ġthe      : 0.9956, ,         : 0.0044, Ġ"        : 0.00