In [None]:
import sys
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
import time
from collections import defaultdict
from pathlib import Path

import circuitsvis as cv
import einops
import numpy as np
import torch as t
from IPython.display import display
from jaxtyping import Float
from nnsight import CONFIG, LanguageModel
from rich import print as rprint
from rich.table import Table
from torch import Tensor
import string as s

# Silence log output (every level up to sys.maxsize is disabled) and suppress the
# UserWarning raised from huggingface_hub's token utilities.
import logging
import warnings

logging.disable(sys.maxsize)
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="huggingface_hub.utils._token",
)

# Choose the compute device: MPS first, then CUDA, falling back to CPU.
if t.backends.mps.is_available():
    device = t.device("mps")
elif t.cuda.is_available():
    device = t.device("cuda")
else:
    device = t.device("cpu")
print(device)

# Gradients are switched off globally for the rest of the notebook.
t.set_grad_enabled(False)

# Locate the "exercises" directory relative to the current working directory and put it on
# sys.path, so that the project-local imports below resolve.
chapter = "exercises"
exercises_dir = Path(os.getcwd().split(chapter)[0] + "/" + chapter).resolve()
section_dir = exercises_dir / "part42_function_vectors_and_model_steering"
if str(exercises_dir) not in sys.path:
    sys.path.append(str(exercises_dir))

# These imports must come after the sys.path update above.
from plotly_utils import imshow
import part42_function_vectors_and_model_steering.solutions as solutions
import part42_function_vectors_and_model_steering.tests as tests

# True when this code is executed as the main module.
MAIN = __name__ == "__main__"

In [None]:
# Wrap GPT-J-6B in an nnsight LanguageModel (requested in bfloat16 with device_map='auto'),
# and keep a shorthand handle on its tokenizer.
model = LanguageModel(
    "EleutherAI/gpt-j-6b",
    device_map="auto",
    torch_dtype=t.bfloat16,
)
tokenizer = model.tokenizer

# Architecture sizes, read straight from the model config.
N_HEADS = model.config.n_head
N_LAYERS = model.config.n_layer
D_MODEL = model.config.n_embd
# Per-head width is derived by splitting the model dimension evenly across heads.
D_HEAD = D_MODEL // N_HEADS

# One line per quantity; the last entry carries a trailing newline so that a blank line
# separates this summary from the config dump below.
print(
    f"Number of heads: {N_HEADS}",
    f"Number of layers: {N_LAYERS}",
    f"Model dimension: {D_MODEL}",
    f"Head dimension: {D_HEAD}\n",
    sep="\n",
)

print("Entire config: ", model.config)

In [None]:
# Calling the tokenizer directly returns a dictionary holding the input ids and other data.
# When tensors are requested, the result has a batch dimension by default.
encoded_example = tokenizer("This must be Thursday", return_tensors="pt")
print(encoded_example)

# decode: a flat list of token ids is turned into one concatenated string.
single_sequence_ids = [40, 1239, 714, 651, 262, 8181, 286, 48971, 12545, 13]
print(tokenizer.decode(single_sequence_ids))

# batch_decode: shown first on 1D input, then on 2D (ragged) input.
flat_ids = [4711, 2456, 481, 307, 6626, 510]
nested_ids = [[1212, 6827, 481, 307, 1978], [2396, 481, 428, 530]]
print(tokenizer.batch_decode(flat_ids))
print(tokenizer.batch_decode(nested_ids))

# tokenize: splits a sentence into string tokens (the special Ġ character stands in for a
# prepended space).
string_tokens = tokenizer.tokenize("This sentence will be tokenized")
print(string_tokens)

# Final expression of the cell, so the notebook displays it: a padded batch of two prompts
# with different lengths.
model.tokenizer(["Hello world", "Hello"], return_tensors="pt", padding=True)

In [None]:
# Whether traces are executed remotely (True) or on local hardware (False).
REMOTE = True

# Remote execution needs an NDIF API key. Join the NDIF community Discord
# (https://nnsight.net/status/) to request one, then expose it to this notebook through the
# NDIF_API_KEY environment variable instead of pasting it into the source.
# NOTE(review): a key was previously hardcoded in this cell; treat it as leaked and rotate it.
ndif_api_key = os.environ.get("NDIF_API_KEY")
if ndif_api_key:
    CONFIG.set_default_api_key(ndif_api_key)
elif REMOTE:
    # Do not abort: nnsight may already hold a key from an earlier set_default_api_key call
    # (presumably persisted in its config - confirm for the installed version).
    print(
        "NDIF_API_KEY is not set; remote traces will use whatever API key nnsight "
        "already has configured (if any)."
    )

prompt = 'The Eiffel Tower is in the city of'

# Show the structure of the final transformer block. This does not need to run inside the
# trace, so it is printed up front.
print(model.transformer.h[-1])

with model.trace(prompt, remote=REMOTE):
    # Save the model's hidden states (first element of the last block's output)
    hidden_states = model.transformer.h[-1].output[0].save()

    # Save the model's logit output for the first batch element at the final position
    logits = model.lm_head.output[0, -1].save()

# Get the model's logit output, and its next token prediction
print(f"logits.shape = {logits.value.shape} = (vocab_size,)")
print("Predicted token ID =", predicted_token_id := logits.value.argmax().item())
print(f"Predicted token = {tokenizer.decode(predicted_token_id)!r}")

# Print the shape of the model's residual stream
print(f"resid.shape = {hidden_states.value.shape} = (batch_size, seq_len, d_model)")

In [None]:
# Number of tokens the prompt encodes to.
seq_len = len(model.tokenizer.encode(prompt))

# Deliberate mistake for demonstration purposes: position `seq_len` is one past the last
# valid sequence index, so the in-place write below is out of bounds. Without scan/validate
# the failure only surfaces when the trace is actually executed.
try:
    with model.trace(prompt, remote=REMOTE):
        # Snapshot the untouched output before editing it in place
        original_output = model.transformer.h[-1].output[0].clone().save()
        # Out-of-range write (this is the intended error)
        model.transformer.h[-1].output[0][:, seq_len] = 0
        modified_output = model.transformer.h[-1].output[0].save()

except Exception as err:
    # Catch and display the failure rather than letting the cell raise
    print(f"Uninformative error message:\n  {type(err).__name__}: {err}")

In [None]:
# Same intervention as the previous cell, but with scan=True / validate=True and a valid
# index (seq_len-1, i.e. the final sequence position), so the trace is expected to succeed.
try:
    with model.trace(prompt, remote=REMOTE, scan=True, validate=True):
        # Snapshot the untouched output before editing it in place
        original_output = model.transformer.h[-1].output[0].clone().save()
        print(f"{model.transformer.h[-1].output.shape=}\n")
        # Zero out the final sequence position of the last block's output
        model.transformer.h[-1].output[0][:, seq_len-1] = 0
        modified_output = model.transformer.h[-1].output[0].save()

except Exception as e:
    print(f"Informative error message:\n  {e.__class__.__name__}: {e}")

else:
    # Only compare the saved tensors when the trace actually ran; otherwise the names would
    # be undefined (fresh kernel) or hold stale values from an earlier run.
    # Both saved proxies are read through `.value`, matching the rest of the notebook.
    print(original_output.value[0,-1,:], modified_output.value[0,-1,:])

In [None]:
# Final-position activations before and after the intervention. Both saved proxies are read
# through `.value` so that actual tensors (not a proxy object) are indexed and printed.
print(original_output.value[0,-1,:], modified_output.value[0,-1,:])

In [None]:
# String tokens for the prompt, with the special 'Ġ' marker swapped for a plain space so
# the visualisation labels are readable.
tokens = [tok.replace('Ġ', ' ') for tok in tokenizer.tokenize(prompt)]

try:
    with model.trace(prompt, remote=REMOTE, scan=True, validate=True):
        # Input to layer 0's attention dropout module (presumably the post-softmax attention
        # pattern - confirm against the GPT-J implementation in use).
        attention_proxy = model.transformer.h[0].attn.attn_dropout.input.save()
except Exception as e:
    print(f"Informative error message:\n  {e.__class__.__name__}: {e}")
else:
    # Post-process only when the trace succeeded; otherwise the proxy would be undefined on
    # a fresh kernel, or stale on a re-run.
    # Index 0 is taken on the first axis (assumed to be the batch dimension - TODO confirm).
    attention = attention_proxy.value[0]
    print(attention.shape)

    # An expression inside a block is not auto-rendered by the notebook, so display it
    # explicitly.
    display(
        cv.attention.attention_patterns(
            tokens=tokens,
            attention=attention,
        )
    )