# Setup

In [1]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-433516bc-e720-da28-c01d-1c260e27e9c7)


In [2]:
import plotly.io as pio
try:
    import google.colab
    print("Running as a Colab notebook")
    pio.renderers.default = "colab"
    %pip install transformer-lens fancy-einsum
    %pip install -U kaleido # kaleido only works if you restart the runtime. Required to write figures to disk (final cell)
except:
    print("Running as a Jupyter notebook")
    pio.renderers.default = "vscode"
    from IPython import get_ipython
    ipython = get_ipython()

Running as a Colab notebook
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformer-lens
  Downloading transformer_lens-1.2.2-py3-none-any.whl (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.9/88.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fancy-einsum
  Downloading fancy_einsum-0.0.3-py3-none-any.whl (6.2 kB)
Collecting datasets>=2.7.1 (from transformer-lens)
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops>=0.6.0 (from transformer-lens)
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jaxtyping>=0.2.11 (from transformer-lens)
  Downloading jaxtyping-0.2.19-py3-none-any.whl (24 kB)
Coll

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [3]:
!pip install 'torchtyping'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtyping
  Downloading torchtyping-0.1.4-py3-none-any.whl (17 kB)
Installing collected packages: torchtyping
Successfully installed torchtyping-0.1.4


In [4]:
import torch
from fancy_einsum import einsum
from transformer_lens import HookedTransformer, HookedTransformerConfig, utils, ActivationCache
from torchtyping import TensorType as TT
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import einops
from typing import List, Union, Optional
from functools import partial
import pandas as pd
from pathlib import Path
import urllib.request
from bs4 import BeautifulSoup
from tqdm import tqdm
from datasets import load_dataset
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # https://stackoverflow.com/q/62691279
torch.set_grad_enabled(False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
model = HookedTransformer.from_pretrained(
    "gpt2-large",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    refactor_factored_attn_matrices=True,
    device=device,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-large into HookedTransformer


# Finding 2:  The activation of the “an-neuron” correlates with the ‘ an’ token being predicted.

Make sure tokens_of_interest_strs, neuron_layers, and neuron_indices are all the same size

In [None]:
# Load dataset
dataset = load_dataset("NeelNanda/pile-10k", split="train")

##Analyze variables in "Congruence of every neuron with a token" code

In [None]:
tokens_of_interest_strs = [" decline"]
tokens_of_interest = torch.tensor([model.to_single_token(token_str) for token_str in tokens_of_interest_strs], device=device)

mlp_output_weights = torch.cat([block.mlp.W_out for block in model.blocks], dim=0)# (n_layer * d_mlp, d_model)

token_congruence_with_each_neuron_figs = []
for i, token_of_interest_str in enumerate(tokens_of_interest_strs[:1]):
        token_of_interest_dot_product = torch.einsum("d, nd -> n", model.embed.W_E[tokens_of_interest[i]], mlp_output_weights)
        neuron_names = [f"Layer {i//model.cfg.d_mlp}" + (f" Neuron {i%model.cfg.d_mlp}" if i%model.cfg.d_mlp != 0 else "") for i in range(mlp_output_weights.shape[0])]
        token_congruence_with_each_neuron_fig = px.scatter(x=neuron_names,
                y=token_of_interest_dot_product.cpu(), 
                labels={"x":"Neuron", "y":f"Congruence (W_out • Token)"},
                hover_name=neuron_names,
                title=f"Congruence of '{token_of_interest_str}' Token with each Neuron Output Weights",
        )
        token_congruence_with_each_neuron_fig.update_layout(xaxis={"dtick": model.cfg.d_mlp})
        neuron_total_index= (neuron_layers[i] * model.cfg.d_mlp) + neuron_indices[i]
        # token_congruence_with_each_neuron_fig.add_annotation(x=neuron_total_index - 800,
        #                                                 y=token_of_interest_dot_product[neuron_total_index],
        #                                                 text=f"Layer {neuron_layers[i]} Neuron {neuron_indices[i]}", showarrow=True, ax=-100, ay=0)
        token_congruence_with_each_neuron_fig.show()
        token_congruence_with_each_neuron_figs.append(token_congruence_with_each_neuron_fig)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
len(neuron_names)

184320

In [None]:
5120*36

184320

In [None]:
model.embed.W_E.shape

torch.Size([50257, 1280])

In [None]:
model.embed.W_E[tokens_of_interest[i]].shape

torch.Size([1280])

In [None]:
mlp_output_weights.shape

torch.Size([184320, 1280])

In [None]:
block.mlp.W_out.shape

NameError: ignored

## Congruence of every neuron automation

In [None]:
def get_congr_token(token):
    tokens_of_interest_strs = [token]
    tokens_of_interest = torch.tensor([model.to_single_token(token_str) for token_str in tokens_of_interest_strs], device=device)

    mlp_output_weights = torch.cat([block.mlp.W_out for block in model.blocks], dim=0)# (n_layer * d_mlp, d_model)

    token_congruence_with_each_neuron_figs = []
    for i, token_of_interest_str in enumerate(tokens_of_interest_strs[:1]):
            token_of_interest_dot_product = torch.einsum("d, nd -> n", model.embed.W_E[tokens_of_interest[i]], mlp_output_weights)
            neuron_names = [f"Layer {i//model.cfg.d_mlp}" + (f" Neuron {i%model.cfg.d_mlp}" if i%model.cfg.d_mlp != 0 else "") for i in range(mlp_output_weights.shape[0])]
            token_congruence_with_each_neuron_fig = px.scatter(x=neuron_names,
                    y=token_of_interest_dot_product.cpu(), 
                    labels={"x":"Neuron", "y":f"Congruence (W_out • Token)"},
                    hover_name=neuron_names,
                    title=f"Congruence of '{token_of_interest_str}' Token with each Neuron Output Weights",
            )
            token_congruence_with_each_neuron_fig.update_layout(xaxis={"dtick": model.cfg.d_mlp})
            neuron_total_index= (neuron_layers[i] * model.cfg.d_mlp) + neuron_indices[i]
            # token_congruence_with_each_neuron_fig.add_annotation(x=neuron_total_index - 800,
            #                                                 y=token_of_interest_dot_product[neuron_total_index],
            #                                                 text=f"Layer {neuron_layers[i]} Neuron {neuron_indices[i]}", showarrow=True, ax=-100, ay=0)
            token_congruence_with_each_neuron_fig.show()
            token_congruence_with_each_neuron_figs.append(token_congruence_with_each_neuron_fig)

In [None]:
get_congr_token(" increase")

Output hidden; open in https://colab.research.google.com to view.

In [None]:
get_congr_token(" incline")

AssertionError: ignored

In [None]:
def get_congr_token_noplot(token):
    tokens_of_interest_strs = [token]
    tokens_of_interest = torch.tensor([model.to_single_token(token_str) for token_str in tokens_of_interest_strs], device=device)

    mlp_output_weights = torch.cat([block.mlp.W_out for block in model.blocks], dim=0)# (n_layer * d_mlp, d_model)

    # token_congruence_with_each_neuron_figs = []
    for i, token_of_interest_str in enumerate(tokens_of_interest_strs[:1]):
            token_of_interest_dot_product = torch.einsum("d, nd -> n", model.embed.W_E[tokens_of_interest[i]], mlp_output_weights)
            # neuron_names = [f"Layer {i//model.cfg.d_mlp}" + (f" Neuron {i%model.cfg.d_mlp}" if i%model.cfg.d_mlp != 0 else "") for i in range(mlp_output_weights.shape[0])]
            # token_congruence_with_each_neuron_fig = px.scatter(x=neuron_names,
            #         y=token_of_interest_dot_product.cpu(), 
            #         labels={"x":"Neuron", "y":f"Congruence (W_out • Token)"},
            #         hover_name=neuron_names,
            #         title=f"Congruence of '{token_of_interest_str}' Token with each Neuron Output Weights",
            )
            # token_congruence_with_each_neuron_fig.update_layout(xaxis={"dtick": model.cfg.d_mlp})
            # neuron_total_index= (neuron_layers[i] * model.cfg.d_mlp) + neuron_indices[i]
            # token_congruence_with_each_neuron_fig.add_annotation(x=neuron_total_index - 800,
            #                                                 y=token_of_interest_dot_product[neuron_total_index],
            #                                                 text=f"Layer {neuron_layers[i]} Neuron {neuron_indices[i]}", showarrow=True, ax=-100, ay=0)
            # token_congruence_with_each_neuron_fig.show()
            # token_congruence_with_each_neuron_figs.append(token_congruence_with_each_neuron_fig)