In [1]:
DEVELOPMENT_MODE = False
try:
    import google.colab

    IN_COLAB = True
    print("Running as a Colab notebook")
    %pip install git+https://github.com/jbloomAus/SAELens

except:
    IN_COLAB = False
    print("Running as a Jupyter notebook - intended for development only!")
    from IPython import get_ipython

    ipython = get_ipython()
    # Code to automatically update the HookedTransformer code as its edited without restarting the kernel
    ipython.magic("load_ext autoreload")
    ipython.magic("autoreload 2")

Running as a Jupyter notebook - intended for development only!


  ipython.magic("load_ext autoreload")
  ipython.magic("autoreload 2")


In [2]:
import torch
if torch.cuda.is_available():
    device = "cuda"
# elif torch.backends.mps.is_available():
#     device = "mps"
else:
    device = "cpu"
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x1ad9724f050>

In [3]:
from jaxtyping import Float
from torch import Tensor
import torch
import torch.nn.functional as F
def get_cosine_similarity(
    dict_elements_1: Float[Tensor, "n_dense n_dict_1"],
    dict_elements_2: Float[Tensor, "n_dense n_dict_2"],
    p_norm: int = 2,
    dim: int = 1,
) -> Float[Tensor, "n_dict_1 n_dict_2"]:
    """Get the cosine similarity between the alive dictionary elements of two runs.

    Args:
        dict_elements_1: The alive dictionary elements of the first run.
        dict_elements_2: The alive dictionary elements of the second run.

    Returns:
        The cosine similarity between the alive dictionary elements of the two runs.
    """
    # Compute cosine similarity in pytorch
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dict_elements_1 = dict_elements_1.to(device)
    dict_elements_2 = dict_elements_2.to(device)

    # Normalize the tensors
    dict_elements_1 = F.normalize(dict_elements_1, p=p_norm, dim=dim)
    dict_elements_2 = F.normalize(dict_elements_2, p=p_norm, dim=dim)

    # Compute cosine similarity using matrix multiplication
    cosine_sim: Float[Tensor, "n_dict_1 n_dict_2"] = torch.mm(dict_elements_1, dict_elements_2.T)
    return cosine_sim

In [4]:
import tqdm
from sae_lens import SAE
import numpy as np

hook_name_to_sae = []
length = 8013
d_sae = 24576
layers = 12
acts = torch.zeros((layers, length, d_sae)).to(device)
for layer in tqdm.tqdm(range(layers)):
    sae, cfg_dict, _ = SAE.from_pretrained(
        "gpt2-small-res-jb",
        f"blocks.{layer}.hook_resid_pre",
        device=device,
    )
    hook_name_to_sae.append(sae)
    with open(f"acts/activation0001_layer{layer}.npy", "rb") as f:
        for idx in range(length):
            acts[layer, idx] = torch.from_numpy(np.load(f)).to(device)

  from .autonotebook import tqdm as notebook_tqdm
This SAE has non-empty model_from_pretrained_kwargs. 
For optimal performance, load the model like so:
model = HookedSAETransformer.from_pretrained_no_processing(..., **cfg.model_from_pretrained_kwargs)
100%|██████████| 12/12 [00:52<00:00,  4.34s/it]


In [5]:
from sae_lens import HookedSAETransformer

model: HookedSAETransformer = HookedSAETransformer.from_pretrained("gpt2-small").to(
    device
)

from datasets import load_dataset

# Load the Skylion007/openwebtext dataset
dataset = load_dataset('Skylion007/openwebtext')
length = 8013
# Print the first example from the dataset
doc_length = torch.zeros([length]).to(device)

for idx in tqdm.tqdm(range(length)):
#for idx in tqdm.tqdm(range(10)):
    example = dataset['train'][idx]
    tokens = model.to_tokens([example['text']], prepend_bos=True)
    doc_length[idx] = tokens.shape[1]

weights = F.normalize(doc_length, p=1, dim=0)
expanded_weights = weights.unsqueeze(0).unsqueeze(2).expand(12, 8013, 24576)
acts_weighted = (acts * expanded_weights).sum(dim=1)
acts_weighted.shape

Loaded pretrained model gpt2-small into HookedTransformer
Moving model to device:  cuda


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
100%|██████████| 8013/8013 [00:15<00:00, 510.55it/s]


torch.Size([12, 24576])

In [6]:
cos_sim = torch.zeros((layers, d_sae, d_sae))
for layer in tqdm.tqdm(range(12)):
    cos_sim[layer] = get_cosine_similarity(hook_name_to_sae[layer].W_dec, hook_name_to_sae[layer].W_dec).cpu()
    for length in range(d_sae):
        cos_sim[layer][length][length] = 0
cos_sim.shape

100%|██████████| 12/12 [00:07<00:00,  1.66it/s]


torch.Size([12, 24576, 24576])

In [7]:
indices = []
values = []
for layer in tqdm.tqdm(range(12)):
    value, index = cos_sim[layer].flatten().topk(100)
    indices.append(index)
    values.append(value)

100%|██████████| 12/12 [00:32<00:00,  2.71s/it]


In [8]:
freq_w_cos = []
for layer in tqdm.tqdm(range(12)):
    freq_layer = []
    for idx in range(100):
        freq_layer.append(indices[layer][idx]//24576)
        freq_layer.append(indices[layer][idx]%24576)
        freq = set(freq_layer)
        freq_w_cos.append(freq)

100%|██████████| 12/12 [00:00<00:00, 361.50it/s]


In [None]:
freq_w_cos = []
for layer in tqdm.tqdm(range(12)):
    freq_layer = []
    for idx in range(5):
        freq_layer.append((indices[layer][idx]//24576).item())
        freq_layer.append((indices[layer][idx]%24576).item())
        freq = set(freq_layer)
    freq_w_cos.append(freq)

100%|██████████| 12/12 [00:00<00:00, 1403.36it/s]


In [89]:
import requests
with open('explanations_high_cos.json', 'w') as f:
    pbar = tqdm.tqdm(total=120)
    NEURONPEDIA_DOMAIN = "https://neuronpedia.org"
    model = "gpt2-small"
    dataset = "res-jb"
    for layer in range(12):
        for idx in range(10):
            index1 = (indices[layer][idx]//24576).item()
            index2 = (indices[layer][idx]%24576).item()           
            url = f"{NEURONPEDIA_DOMAIN}/api/feature/{model}/{layer}-{dataset}/{index2}"
            result = requests.get(url).json()
            if "explanations" in result and result["explanations"]:
                f.write(result["explanations"][0]["description"])
                f.write("\n")
            url = f"{NEURONPEDIA_DOMAIN}/api/feature/{model}/{layer}-{dataset}/{index1}"
            result = requests.get(url).json()
            if "explanations" in result and result["explanations"]:
                f.write(result["explanations"][0]["description"])
                f.write("\n")
            pbar.update(1)

62it [00:43,  1.42it/s][00:00<?, ?it/s]
100%|██████████| 120/120 [01:18<00:00,  1.48it/s]

In [41]:
import plotly.express as px
import pandas as pd
import plotly.colors as pc

stats_dfs = []
colors = pc.n_colors("rgb(5, 200, 200)", "rgb(200, 10, 10)", 13, colortype="rgb")
for layer in tqdm.tqdm(range(12)):
    cos_sim_df = pd.DataFrame({"cos": acts_weighted[layer][list(freq_w_cos[layer])].cpu().numpy(), "layer": layer})
    stats_dfs.append(cos_sim_df)
cos_df = pd.concat(stats_dfs, axis=0)

# TODO: how can I improve this plot?
# RESULT1: Cos sim by layers
fig = px.box(
    cos_df,
    x="layer",
    y = "cos",
    width=1200,
    height=600,
    color="layer",
    color_discrete_sequence=colors,
    title=f"cosine similarity in the same layer in GPT2-Small-Res-JB",
    labels={"layer": "Layer", "cos": "cos"},
)
fig.update_xaxes(showticklabels=True, dtick=1)

# increase font size
fig.update_layout(font=dict(size=16))
fig.show()

100%|██████████| 12/12 [00:00<00:00, 4000.93it/s]


In [42]:
indices = []
values = []
for layer in tqdm.tqdm(range(12)):
    value, index = cos_sim[layer].flatten().topk(100, largest=False)
    indices.append(index)
    values.append(value)

100%|██████████| 12/12 [00:37<00:00,  3.15s/it]


In [43]:
freq_w_cos = []
for layer in tqdm.tqdm(range(12)):
    freq_layer = []
    for idx in range(100):
        freq_layer.append((indices[layer][idx]//24576).item())
        freq_layer.append((indices[layer][idx]%24576).item())
        freq = set(freq_layer)
    freq_w_cos.append(freq)
import plotly.express as px
import pandas as pd
import plotly.colors as pc

stats_dfs = []
colors = pc.n_colors("rgb(5, 200, 200)", "rgb(200, 10, 10)", 13, colortype="rgb")
for layer in tqdm.tqdm(range(12)):
    cos_sim_df = pd.DataFrame({"cos": acts_weighted[layer][list(freq_w_cos[layer])].cpu().numpy(), "layer": layer})
    stats_dfs.append(cos_sim_df)
cos_df = pd.concat(stats_dfs, axis=0)

# TODO: how can I improve this plot?
# RESULT1: Cos sim by layers
fig = px.box(
    cos_df,
    x="layer",
    y = "cos",
    width=1200,
    height=600,
    color="layer",
    color_discrete_sequence=colors,
    title=f"cosine similarity in the same layer in GPT2-Small-Res-JB",
    labels={"layer": "Layer", "cos": "cos"},
)
fig.update_xaxes(showticklabels=True, dtick=1)

# increase font size
fig.update_layout(font=dict(size=16))
fig.show()

100%|██████████| 12/12 [00:00<00:00, 1176.03it/s]
100%|██████████| 12/12 [00:00<00:00, 2659.67it/s]


In [45]:
indecs_max = []
indices = []
values = []
values_max = []
for layer in tqdm.tqdm(range(12)):
    value, index = acts_weighted[layer].topk(100)
    value_max, index_max = acts[layer].max(dim=1)[0].topk(100)
    indices.append(index)
    values.append(value)
    indecs_max.append(index_max)
    values_max.append(value_max)

100%|██████████| 12/12 [00:00<00:00, 748.55it/s]


In [49]:
indices[0]

tensor([ 9337, 13908,  4172, 12555, 14015,  8687,  6167,  4027, 12167, 14009,
         7632,   745, 22148,  4274, 23825, 15145,   556, 18851, 11633,   182,
        21611, 18069, 18877, 15186, 19792, 18178, 12839,  8722, 22543, 16410,
         3816,  6686,  7474, 23733,   363, 21616, 18336,  3508,  2218, 20750,
         2945, 15684,  2718, 15205,  2448, 19595, 19430,  8472,  5748,  9440,
          450,  6530, 18597,  6911,  3219, 22518, 16934,  2213, 17593,  6635,
        15430, 20824,  1169,  7795,  1934,  2924, 11787, 12204,  9758,  3382,
        18861,  6715, 15082, 18458, 21686, 19489, 16852, 16627, 16491, 14697,
        15286,  3336,  6522, 10358, 21061,  2326, 12047, 12687, 23116,  9462,
        10633,  4942, 11641, 20246, 16149, 14687, 14149,  8317, 23823, 24038],
       device='cuda:0')

In [82]:
import requests
import json
from typing import Any
def get_neuronpedia_feature(
    feature: int,  f, layer: int, model: str = "gpt2-small", dataset: str = "res-jb",
) -> dict[str, Any]:
    """Fetch a feature from Neuronpedia API."""
    NEURONPEDIA_DOMAIN = "https://neuronpedia.org"
    url = f"{NEURONPEDIA_DOMAIN}/api/feature/{model}/{layer}-{dataset}/{feature}"
    result = requests.get(url).json()
    if "explanations" in result and result["explanations"]:
        f.write(result["explanations"][0]["description"])
    else:
        return None
    

In [85]:
with open('explanations.json', 'w') as f:
    pbar = tqdm.tqdm(total=1200)
    NEURONPEDIA_DOMAIN = "https://neuronpedia.org"
    model = "gpt2-small"
    dataset = "res-jb"
    for layer in range(12):
        for idx in range(100):
            feature=indices[layer][idx].item()
            url = f"{NEURONPEDIA_DOMAIN}/api/feature/{model}/{layer}-{dataset}/{feature}"
            result = requests.get(url).json()
            if "explanations" in result and result["explanations"]:
                f.write(result["explanations"][0]["description"])
                f.write("\n")
                print(result["explanations"][0]["description"])
            pbar.update(1)

 18%|█▊        | 218/1200 [01:33<06:59,  2.34it/s]
  1%|▏         | 16/1200 [00:05<05:50,  3.38it/s]

terms related to environmental issues, specifically related to overfishing and its consequences


  2%|▏         | 18/1200 [00:05<05:39,  3.48it/s]

references to creatures or beings described as "globsters" or similar terms


  2%|▏         | 26/1200 [00:08<06:10,  3.17it/s]

references to different types of hair


  2%|▏         | 27/1200 [00:09<06:48,  2.87it/s]

negative sentiments or drawbacks in the text


  3%|▎         | 33/1200 [00:11<06:27,  3.01it/s]

complex terms related to mapping or categorization


  4%|▍         | 50/1200 [00:15<05:33,  3.45it/s]

dates and events


  6%|▌         | 71/1200 [00:22<06:18,  2.98it/s]

mentions of individuals and their relationships in conversations


  6%|▌         | 74/1200 [00:23<05:47,  3.24it/s]

words related to fires and celebrations


  6%|▋         | 78/1200 [00:24<05:52,  3.19it/s]

descriptions of scenic views or photography


  7%|▋         | 82/1200 [00:25<05:37,  3.31it/s]

phrases related to legal matters and conflicts


  7%|▋         | 87/1200 [00:27<05:48,  3.20it/s]

names of specific individuals


  8%|▊         | 101/1200 [00:31<06:47,  2.70it/s]

technical language related to computers and networks


  8%|▊         | 102/1200 [00:31<06:30,  2.81it/s]

word sequences related to transportation and location


  9%|▊         | 103/1200 [00:32<06:13,  2.94it/s]

proper nouns or names


  9%|▊         | 104/1200 [00:32<06:00,  3.04it/s]

mentions of personal interactions or connections


  9%|▉         | 105/1200 [00:32<06:13,  2.94it/s]

names of people or places


  9%|▉         | 106/1200 [00:33<05:58,  3.05it/s]

non-textual characters and noise, likely not related to meaningful patterns in the text


  9%|▉         | 107/1200 [00:33<05:49,  3.13it/s]

mentions of digital communication or coding-related text


  9%|▉         | 108/1200 [00:33<05:54,  3.08it/s]

names of cities


  9%|▉         | 109/1200 [00:34<06:20,  2.86it/s]

references related to significant events or incidents


  9%|▉         | 110/1200 [00:34<07:17,  2.49it/s]

terms related to technology or programming


  9%|▉         | 111/1200 [00:34<06:50,  2.65it/s]

names of places or people


  9%|▉         | 112/1200 [00:35<06:24,  2.83it/s]

phrases related to legal, political, and technical terms


 10%|▉         | 114/1200 [00:35<06:04,  2.98it/s]

references to specific names, probably related to people or places


 10%|▉         | 115/1200 [00:36<06:03,  2.99it/s]

proper nouns and technical terms of various subjects


 10%|▉         | 116/1200 [00:36<06:06,  2.96it/s]

words related to legal terms and agreements


 10%|▉         | 119/1200 [00:37<05:54,  3.05it/s]

words associated with technology, such as code, script, and email


 10%|█         | 121/1200 [00:38<05:35,  3.22it/s]

terms related to support or approval


 10%|█         | 126/1200 [00:39<05:29,  3.26it/s]

issues and controversies related to politics and government


 11%|█         | 131/1200 [00:41<05:59,  2.97it/s]

terms related to financial concepts and entities in negative contexts


 11%|█         | 132/1200 [00:41<05:52,  3.03it/s]

email subscription and newsletter related phrases


 12%|█▏        | 141/1200 [00:44<05:39,  3.12it/s]

references to specific names or places


 12%|█▏        | 143/1200 [00:45<05:30,  3.20it/s]

mentions of film festivals and premieres


 12%|█▏        | 144/1200 [00:45<05:26,  3.24it/s]

address-related information, such as "address," "looking address," and "zip code."


 12%|█▏        | 145/1200 [00:45<05:32,  3.17it/s]

phrases related to financial figures and business outcomes


 12%|█▏        | 149/1200 [00:46<05:20,  3.28it/s]

keywords related to geopolitical events and locations


 13%|█▎        | 151/1200 [00:47<05:11,  3.36it/s]

references to specific educational institutions and dates


 14%|█▍        | 167/1200 [00:52<05:26,  3.16it/s]

bits of code related to data processing and analysis


 14%|█▍        | 169/1200 [00:52<05:20,  3.22it/s]

mentions of celebrities, specifically Taylor Swift


 14%|█▍        | 170/1200 [00:53<05:31,  3.10it/s]

geographical locations


 14%|█▍        | 171/1200 [00:53<05:24,  3.17it/s]

terms related to public facilities and infrastructure


 14%|█▍        | 173/1200 [00:54<06:04,  2.82it/s]

various letters and symbols, potentially related to specific sources or organizations


 16%|█▋        | 195/1200 [01:00<05:03,  3.31it/s]

people's names with the pattern of having high activation for the last few characters


 17%|█▋        | 201/1200 [01:02<04:59,  3.34it/s]

terms related to supply chain and logistics


 17%|█▋        | 202/1200 [01:02<05:43,  2.90it/s]

names of individuals and their occupations


 17%|█▋        | 203/1200 [01:03<08:11,  2.03it/s]

terms related to specific proper nouns, such as 'Japanese whisky', 'Sikh faith', 'Deadpool', 'Airbnb', 'WWE CEO', 'Toronto', and 'Trump'


 17%|█▋        | 204/1200 [01:04<07:30,  2.21it/s]

references to specific technologies or products


 17%|█▋        | 205/1200 [01:04<07:57,  2.08it/s]

code snippets or programming-related terms


 17%|█▋        | 206/1200 [01:04<06:52,  2.41it/s]

proper names or identifiers of individuals or entities


 17%|█▋        | 207/1200 [01:05<06:07,  2.70it/s]

phrases related to electronic devices and technology


 17%|█▋        | 208/1200 [01:05<05:55,  2.79it/s]

various terms related to specific occupations and professional activities


 17%|█▋        | 209/1200 [01:05<05:50,  2.83it/s]

surnames, potentially from news articles or academic works


 18%|█▊        | 210/1200 [01:06<06:14,  2.64it/s]

names of specific locations


 18%|█▊        | 211/1200 [01:06<05:45,  2.87it/s]

mentions of authorities or officials in the context of news or reports


 18%|█▊        | 212/1200 [01:06<05:37,  2.93it/s]

 words indicating illumination or clarity, and numeric quantification or measurement


 18%|█▊        | 213/1200 [01:07<05:28,  3.01it/s]

words related to forceful actions or outcomes


 18%|█▊        | 214/1200 [01:07<05:30,  2.99it/s]

verbs relating to actions or decision-making


 18%|█▊        | 215/1200 [01:07<05:38,  2.91it/s]

 adjectives describing personal items or characteristics


 18%|█▊        | 216/1200 [01:08<05:40,  2.89it/s]

phrases related to legal and political compliance


 18%|█▊        | 217/1200 [01:08<05:44,  2.85it/s]

phrases related to soccer leagues and teams


 18%|█▊        | 218/1200 [01:09<05:34,  2.94it/s]

verbs that imply actions or events with a significant impact or consequence


 18%|█▊        | 219/1200 [01:09<05:30,  2.97it/s]

abbreviations and acronyms


 18%|█▊        | 220/1200 [01:09<05:33,  2.94it/s]

terms related to enclosed spaces within a building, particularly hallways and doorways


 18%|█▊        | 221/1200 [01:10<05:32,  2.94it/s]

 surnames or names of specific individuals


 18%|█▊        | 222/1200 [01:10<05:15,  3.10it/s]

strong verbs/actions and their outcomes/events


 19%|█▊        | 223/1200 [01:10<05:12,  3.13it/s]

topics related to specific names or entities


 19%|█▊        | 224/1200 [01:10<05:23,  3.02it/s]

technical terms related to technology, such as "holograms," "antennas," and "neural arrays."


 19%|█▉        | 225/1200 [01:11<05:18,  3.06it/s]

words related to legal and political terms


 19%|█▉        | 226/1200 [01:11<05:37,  2.89it/s]

phrases related to political events and developments


 19%|█▉        | 227/1200 [01:12<05:28,  2.96it/s]

terms related to specific concepts or entities, such as "cities", "gravity", "avalanches", or "fruit flies"


 19%|█▉        | 228/1200 [01:12<05:30,  2.94it/s]

words related to political endorsements and statements


 19%|█▉        | 229/1200 [01:12<05:28,  2.95it/s]

numerical sequences and model names


 19%|█▉        | 230/1200 [01:13<05:28,  2.95it/s]

political figures, political actions, and governmental terms


 19%|█▉        | 231/1200 [01:13<05:21,  3.01it/s]

phrases related to restructuring or changes


 19%|█▉        | 232/1200 [01:13<05:32,  2.91it/s]

words related to geology, particularly focusing on natural phenomena like volcanoes


 19%|█▉        | 233/1200 [01:14<08:11,  1.97it/s]

mentions of locations or entities within a certain context


 20%|█▉        | 234/1200 [01:14<07:29,  2.15it/s]

adjectives related to certainty or extremity


 20%|█▉        | 235/1200 [01:15<06:49,  2.36it/s]

dates and data related to government reports and international relationships


 20%|█▉        | 236/1200 [01:15<06:20,  2.53it/s]

phrases related to knocking or impact


 20%|█▉        | 237/1200 [01:15<06:09,  2.61it/s]

dates mentioned in the year 2018


 20%|█▉        | 238/1200 [01:16<05:56,  2.70it/s]

negative or alarming terms


 20%|█▉        | 239/1200 [01:16<05:38,  2.84it/s]

references to specific individuals


 20%|██        | 240/1200 [01:17<06:11,  2.58it/s]

mentions of legal or political actions and events


 20%|██        | 241/1200 [01:17<05:51,  2.73it/s]

specific terms related to various societal topics and concepts


 20%|██        | 243/1200 [01:18<05:19,  3.00it/s]

 names of specific individuals


 20%|██        | 244/1200 [01:18<05:05,  3.13it/s]

legal terms and procedures


 20%|██        | 245/1200 [01:18<05:19,  2.99it/s]

phrases related to tools or equipment


 20%|██        | 246/1200 [01:19<05:27,  2.91it/s]

words related to conflict, controversy, and opposing viewpoints


 21%|██        | 247/1200 [01:19<05:27,  2.91it/s]

names of individuals


 21%|██        | 248/1200 [01:19<05:26,  2.92it/s]

file deletion commands in text


 21%|██        | 249/1200 [01:20<05:17,  2.99it/s]

phrases related to early or initial stages of development or growth


 21%|██        | 250/1200 [01:20<05:19,  2.97it/s]

words related to moral judgment or assessment of situations


 21%|██        | 251/1200 [01:20<05:06,  3.10it/s]

phrases starting with "Those"


 21%|██        | 252/1200 [01:21<05:10,  3.05it/s]

information presented as a series of related topics


 21%|██        | 253/1200 [01:21<05:40,  2.78it/s]

legal and political terms or phrases


 21%|██        | 254/1200 [01:21<05:55,  2.66it/s]

university and college names


 21%|██▏       | 255/1200 [01:22<05:45,  2.74it/s]

phrases related to actions or activities


 21%|██▏       | 256/1200 [01:22<06:24,  2.46it/s]

portions of text that are formatted differently than the main body of the text


 21%|██▏       | 257/1200 [01:23<05:54,  2.66it/s]

activities related to physical combat or conflict


 22%|██▏       | 259/1200 [01:23<06:46,  2.32it/s]

terms related to criminal activities and law enforcement


 22%|██▏       | 260/1200 [01:24<06:52,  2.28it/s]

phrases related to legal or political actions


 22%|██▏       | 261/1200 [01:24<06:16,  2.50it/s]

phrases containing the word "as" with varying contexts


 22%|██▏       | 262/1200 [01:25<05:51,  2.67it/s]

the word "this"


 22%|██▏       | 263/1200 [01:25<06:18,  2.48it/s]

descriptions of joint efforts or collaborations


 22%|██▏       | 264/1200 [01:25<05:51,  2.66it/s]

citations in a specific format


 22%|██▏       | 265/1200 [01:26<05:52,  2.65it/s]

self-reflective phrases


 22%|██▏       | 266/1200 [01:26<05:28,  2.84it/s]

words related to automation and stabilization


 22%|██▏       | 267/1200 [01:26<05:14,  2.97it/s]

phrases related to specific events or occurrences


 22%|██▏       | 268/1200 [01:27<05:04,  3.06it/s]

phrases in quotation marks


 22%|██▏       | 269/1200 [01:27<04:55,  3.15it/s]

code-related terms and instructions


 23%|██▎       | 271/1200 [01:28<05:08,  3.01it/s]

terms related to technology and media companies


 23%|██▎       | 272/1200 [01:28<05:01,  3.08it/s]

phrases that start with "Each."


 23%|██▎       | 276/1200 [01:29<05:01,  3.06it/s]

terms related to computer programming and code snippets


 23%|██▎       | 277/1200 [01:29<05:10,  2.98it/s]

phrases related to political and social unrest


 23%|██▎       | 278/1200 [01:30<05:03,  3.04it/s]

prepositions followed by verbs that describe physical actions or behaviors


 23%|██▎       | 279/1200 [01:30<05:14,  2.92it/s]

references to the topic of Scientology


 23%|██▎       | 280/1200 [01:31<05:28,  2.80it/s]

words related to specific entities such as names of people, places, or organizations


 23%|██▎       | 281/1200 [01:31<05:23,  2.84it/s]

words related to gourmet food


 24%|██▎       | 283/1200 [01:32<05:08,  2.98it/s]

phrases related to sections of text signifying examples or subsets


 24%|██▎       | 284/1200 [01:32<05:02,  3.03it/s]

phrases related to actions or scenarios that can be done or experienced without certain conditions or constraints


 24%|██▍       | 285/1200 [01:32<05:07,  2.98it/s]

sentences about personal relationships and life events, with a focus on marriage, friendship, and family


 24%|██▍       | 287/1200 [01:33<04:47,  3.18it/s]

tokens indicating code snippets or commands


 24%|██▍       | 288/1200 [01:33<04:38,  3.27it/s]

proper nouns related to individuals, likely of political or public significance


 24%|██▍       | 290/1200 [01:34<04:46,  3.18it/s]

country names


 24%|██▍       | 291/1200 [01:35<07:15,  2.09it/s]

non-English words and terms


 24%|██▍       | 293/1200 [01:35<05:49,  2.60it/s]

references, trivia, quotes, articles, notes, and images


 24%|██▍       | 294/1200 [01:35<05:31,  2.74it/s]

instances where an alternative action or situation is mentioned


 25%|██▍       | 296/1200 [01:36<05:11,  2.90it/s]

words related to technology or internet-related terms


 25%|██▍       | 297/1200 [01:37<06:28,  2.32it/s]

conjunctions introducing contrasting information


 25%|██▍       | 298/1200 [01:37<05:46,  2.61it/s]

famous people's names


 25%|██▍       | 299/1200 [01:38<06:59,  2.15it/s]

phrases related to political or governmental events


 25%|██▌       | 301/1200 [01:38<05:59,  2.50it/s]

terms related to technology and software development


 25%|██▌       | 302/1200 [01:39<06:50,  2.19it/s]

names of countries


 25%|██▌       | 303/1200 [01:39<06:28,  2.31it/s]

mentions of specific TV shows and movies


 25%|██▌       | 304/1200 [01:40<06:16,  2.38it/s]

phrases related to physical actions and locations


 25%|██▌       | 305/1200 [01:40<05:58,  2.50it/s]

text related to criminal activities


 26%|██▌       | 306/1200 [01:40<05:50,  2.55it/s]

phrases related to precautions, preparedness, and potential harm


 26%|██▌       | 307/1200 [01:41<05:46,  2.58it/s]

technical terms and data points scattered within sentences


 26%|██▌       | 308/1200 [01:41<05:50,  2.55it/s]

words related to legal or criminal situations


 26%|██▌       | 309/1200 [01:42<05:44,  2.59it/s]

mentions of specific names or entities in various contexts


 26%|██▌       | 310/1200 [01:43<08:48,  1.68it/s]

names of individuals, organizations, and places mentioned in news articles


 26%|██▌       | 311/1200 [01:43<09:25,  1.57it/s]

mentions of physical actions or events involving conflict or confrontation


 26%|██▌       | 312/1200 [01:44<08:20,  1.77it/s]

instructions or guidelines within text


 26%|██▌       | 313/1200 [01:44<08:26,  1.75it/s]

mentions of personal experiences and interactions with specific individuals


 26%|██▌       | 314/1200 [01:45<07:26,  1.98it/s]

numerical and mathematical expressions


 26%|██▋       | 315/1200 [01:45<06:30,  2.27it/s]

phrases related to specific actions or events


 26%|██▋       | 316/1200 [01:45<05:56,  2.48it/s]

descriptive and impactful words related to various situations and objects


 26%|██▋       | 317/1200 [01:46<06:14,  2.36it/s]

terms related to mathematical equations and figures


 26%|██▋       | 318/1200 [01:46<06:00,  2.44it/s]

phrases related to technology and innovation


 27%|██▋       | 319/1200 [01:46<05:41,  2.58it/s]

 dates and events


 27%|██▋       | 320/1200 [01:47<05:25,  2.70it/s]

elements related to American football games


 27%|██▋       | 321/1200 [01:47<05:18,  2.76it/s]

descriptions or characteristics of a spiral relationship


 27%|██▋       | 322/1200 [01:48<05:20,  2.74it/s]

phrases related to legal or criminal actions


 27%|██▋       | 323/1200 [01:48<05:00,  2.92it/s]

economic terms


 27%|██▋       | 324/1200 [01:49<06:51,  2.13it/s]

words related to policies, regulations, and government activities


 27%|██▋       | 325/1200 [01:49<06:06,  2.39it/s]

terms related to exponential growth


 27%|██▋       | 326/1200 [01:49<05:37,  2.59it/s]

quotations and reported statements


 27%|██▋       | 327/1200 [01:50<05:44,  2.53it/s]

technical terms related to technology comparisons


 27%|██▋       | 328/1200 [01:50<06:30,  2.23it/s]

terms related to technology infrastructures and services


 27%|██▋       | 329/1200 [01:51<06:16,  2.31it/s]

phrases related to governmental activities and politics


 28%|██▊       | 330/1200 [01:51<05:51,  2.47it/s]

details related to physical actions or movements, especially involving body parts


 28%|██▊       | 331/1200 [01:52<06:44,  2.15it/s]

phrases related to people's names


 28%|██▊       | 332/1200 [01:52<08:03,  1.79it/s]

words related to authority and commands


 28%|██▊       | 333/1200 [01:53<07:10,  2.01it/s]

patterns of characters in a structured format that look like file paths or commands


 28%|██▊       | 334/1200 [01:53<08:18,  1.74it/s]

information related to recent events or news articles


 28%|██▊       | 335/1200 [01:54<07:28,  1.93it/s]

adjectives that emphasize extremity or importance


 28%|██▊       | 336/1200 [01:54<06:42,  2.15it/s]

mentions of people in politics or news


 28%|██▊       | 337/1200 [01:55<06:14,  2.30it/s]

phrases related to featuring something, typically in a context that emphasizes its importance or significance


 28%|██▊       | 338/1200 [01:55<06:02,  2.38it/s]

texts related to political ideologies and governmental systems


 28%|██▊       | 339/1200 [01:55<06:27,  2.22it/s]

references to legal and political topics, especially related to freedom of speech and government actions


 28%|██▊       | 340/1200 [01:56<05:57,  2.40it/s]

words related to beliefs and convictions


 28%|██▊       | 341/1200 [01:56<05:36,  2.55it/s]

terms related to biology, genetics, and medical research


 28%|██▊       | 342/1200 [01:56<05:22,  2.66it/s]

HTML tags and comments


 29%|██▊       | 343/1200 [01:57<05:21,  2.67it/s]

terms related to military or legal investigations


 29%|██▊       | 344/1200 [01:57<06:26,  2.22it/s]

parentheses


 29%|██▉       | 345/1200 [01:58<05:54,  2.41it/s]

information related to military activities, conflicts, and diplomatic statements


 29%|██▉       | 346/1200 [01:58<05:45,  2.47it/s]

words related to computer programming and coding


 29%|██▉       | 347/1200 [01:58<05:33,  2.56it/s]

email-related terms and phrases


 29%|██▉       | 348/1200 [01:59<05:32,  2.56it/s]

mentions of reports or articles in news


 29%|██▉       | 349/1200 [01:59<05:26,  2.60it/s]

information related to incidents, accidents, and crime events


 29%|██▉       | 350/1200 [02:00<05:15,  2.70it/s]

sentences that express disagreement or negative opinions


 29%|██▉       | 351/1200 [02:00<05:17,  2.67it/s]

different contexts and subjects like community services, health issues, political activities, and legal matters


 29%|██▉       | 352/1200 [02:00<05:11,  2.72it/s]

political figures and related terms


 29%|██▉       | 353/1200 [02:01<05:14,  2.70it/s]

names of individuals or groups, potentially related to movie credits


 30%|██▉       | 354/1200 [02:01<05:07,  2.75it/s]

proper names or entities


 30%|██▉       | 355/1200 [02:01<05:13,  2.70it/s]

information related to sports, particularly about players and their performances


 30%|██▉       | 356/1200 [02:02<05:04,  2.77it/s]

terms related to power dynamics, conflicts, and opposition


 30%|██▉       | 357/1200 [02:02<04:58,  2.82it/s]

actions related to improvement or change


 30%|██▉       | 358/1200 [02:03<06:24,  2.19it/s]

technical terms or words related to computer systems and networks


 30%|██▉       | 359/1200 [02:03<06:35,  2.13it/s]

phrases related to interpersonal interactions and dialogue


 30%|███       | 360/1200 [02:04<05:57,  2.35it/s]

technical terms related to technology and announcements


 30%|███       | 361/1200 [02:04<05:37,  2.48it/s]

numerical values within a specific range


 30%|███       | 362/1200 [02:05<06:33,  2.13it/s]

technical terms and acronyms related to technology and finance


 30%|███       | 363/1200 [02:05<05:54,  2.36it/s]

verbs related to actions or states that involve a significant change or impact


 30%|███       | 364/1200 [02:05<05:25,  2.57it/s]

technical terms related to software development and project management


 30%|███       | 365/1200 [02:06<05:07,  2.72it/s]

references to laws or regulations


 30%|███       | 366/1200 [02:06<05:04,  2.74it/s]

phrases related to focusing on specific issues or elements


 31%|███       | 367/1200 [02:06<04:51,  2.86it/s]

words related to legal or official processes


 31%|███       | 368/1200 [02:07<05:06,  2.72it/s]

technical terms and system-specific words


 31%|███       | 369/1200 [02:07<04:57,  2.80it/s]

multi-word phrases related to people and their characteristics or actions, particularly those relating to medical conditions or social situations


 31%|███       | 370/1200 [02:07<04:34,  3.02it/s]

 instances where a positive attribute is acknowledged with a contrasting element


 31%|███       | 371/1200 [02:08<04:31,  3.05it/s]

software and technology-related terms


 31%|███       | 372/1200 [02:08<04:39,  2.96it/s]

instances of specific entities or events, such as "ISIS" and "Fox," along with instances of comparison or speculation using "if" statements


 31%|███       | 373/1200 [02:08<04:46,  2.88it/s]

mentions of legal or political terms and phrases


 31%|███       | 374/1200 [02:09<04:42,  2.92it/s]

references related to political figures and their actions


 31%|███▏      | 375/1200 [02:09<06:03,  2.27it/s]

ambiguous statements or uncertainties


 31%|███▏      | 376/1200 [02:10<05:37,  2.44it/s]

country names


 31%|███▏      | 377/1200 [02:10<05:32,  2.47it/s]

names of artists and entertainers


 32%|███▏      | 378/1200 [02:10<05:00,  2.74it/s]

mentions of specific geographical locations, cities, and places


 32%|███▏      | 379/1200 [02:11<04:54,  2.78it/s]

keywords related to significant events or developments


 32%|███▏      | 380/1200 [02:11<04:51,  2.81it/s]

references to specific organizations and locations


 32%|███▏      | 381/1200 [02:11<05:30,  2.48it/s]

terms related to financial investments and blockchain technology


 32%|███▏      | 382/1200 [02:12<05:18,  2.57it/s]

consecutive words ending with a semicolon


 32%|███▏      | 383/1200 [02:12<04:54,  2.77it/s]

mentions of choices and comparisons


 32%|███▏      | 384/1200 [02:12<04:46,  2.85it/s]

phrases related to social issues and political discussions


 32%|███▏      | 385/1200 [02:13<04:32,  2.99it/s]

words related to medical conditions such as "euphoria" and "glutamate."


 32%|███▏      | 386/1200 [02:13<05:48,  2.33it/s]

mentions of specific people and their actions or characteristics


 32%|███▏      | 387/1200 [02:14<05:20,  2.54it/s]

words related to government actions and legislative processes


 32%|███▏      | 388/1200 [02:14<05:16,  2.57it/s]

legal and procedural terms


 32%|███▏      | 389/1200 [02:14<04:58,  2.72it/s]

terms related to political debates and issues


 32%|███▎      | 390/1200 [02:15<04:47,  2.82it/s]

instances of the word "an" in the text


 33%|███▎      | 391/1200 [02:15<04:47,  2.81it/s]

phrases related to technical errors and instructions


 33%|███▎      | 392/1200 [02:15<04:48,  2.80it/s]

legal terms and courtroom-related phrases


 33%|███▎      | 393/1200 [02:16<04:46,  2.81it/s]

proper nouns and terms related to various fields like movies, books, and news articles


 33%|███▎      | 394/1200 [02:16<04:37,  2.91it/s]

verbs indicating an action performed by a specific person


 33%|███▎      | 395/1200 [02:17<04:46,  2.81it/s]

economic and financial terms and figures


 33%|███▎      | 396/1200 [02:17<04:37,  2.90it/s]

geographical locations, states, and countries


 33%|███▎      | 397/1200 [02:17<04:23,  3.04it/s]

references to specific entities or topics within various contexts


 33%|███▎      | 398/1200 [02:17<04:30,  2.97it/s]

mentions of specific places or locations


 33%|███▎      | 399/1200 [02:18<04:30,  2.96it/s]

phrases related to describing a situation or giving context


 33%|███▎      | 400/1200 [02:18<04:19,  3.08it/s]

terms related to food additives like monosodium glutamate (MSG)


 33%|███▎      | 401/1200 [02:18<04:24,  3.03it/s]

 words related to specific activities and tasks performed voluntarily and without pay


 34%|███▎      | 402/1200 [02:19<04:35,  2.89it/s]

copyright and legal terms


 34%|███▎      | 403/1200 [02:19<04:25,  3.00it/s]

locations, names, and titles with a mix of lowercase and special characters


 34%|███▎      | 404/1200 [02:20<04:29,  2.96it/s]

negatively charged words associated with personal criticism


 34%|███▍      | 405/1200 [02:20<04:22,  3.03it/s]

phrases related to taking care of others in a selfless and voluntary manner


 34%|███▍      | 406/1200 [02:20<04:24,  3.00it/s]

combinations of words that hint at causing harm or negative consequences


 34%|███▍      | 407/1200 [02:20<04:10,  3.16it/s]

prepositional phrases starting with 'of'


 34%|███▍      | 408/1200 [02:21<04:09,  3.18it/s]

phrases containing the word "on" followed by a specific word or phrase


 34%|███▍      | 409/1200 [02:21<04:14,  3.11it/s]

phrases related to legal issues and regulatory matters


 34%|███▍      | 410/1200 [02:21<04:10,  3.15it/s]

phrases related to actions or events


 34%|███▍      | 411/1200 [02:22<06:36,  1.99it/s]

pronouns followed by verbs


 34%|███▍      | 412/1200 [02:23<05:55,  2.22it/s]

prepositional phrases with a verb indicating an action


 34%|███▍      | 413/1200 [02:23<05:22,  2.44it/s]

punctuation marks typically used for emphasis or organization


 34%|███▍      | 414/1200 [02:23<05:08,  2.55it/s]

phrases indicating certainty or assurance


 35%|███▍      | 415/1200 [02:24<04:43,  2.77it/s]

phrases expressing contrasting or opposing ideas


 35%|███▍      | 416/1200 [02:24<04:28,  2.92it/s]

sentences that end with a period


 35%|███▍      | 417/1200 [02:24<04:13,  3.09it/s]

technical terms and jargon related to various fields or professions


 35%|███▍      | 418/1200 [02:25<04:13,  3.08it/s]

verbs connected to technology and actions related to internet security


 35%|███▍      | 419/1200 [02:25<04:09,  3.14it/s]

verbs related to actions or events in progress


 35%|███▌      | 420/1200 [02:25<04:07,  3.15it/s]

terms related to technology and specific technical names


 35%|███▌      | 421/1200 [02:25<04:08,  3.13it/s]

countries


 35%|███▌      | 422/1200 [02:26<04:21,  2.97it/s]

dates and specific time periods


 35%|███▌      | 423/1200 [02:26<04:23,  2.95it/s]

references to people's names with apostrophes indicating possessiveness


 35%|███▌      | 424/1200 [02:26<04:10,  3.10it/s]

phrases related to completing tasks or achieving goals


 35%|███▌      | 425/1200 [02:27<04:00,  3.23it/s]

names of individuals or characters


 36%|███▌      | 426/1200 [02:27<03:50,  3.36it/s]

action verbs related to physical movement


 36%|███▌      | 427/1200 [02:27<03:50,  3.35it/s]

phrases related to location and events


 36%|███▌      | 428/1200 [02:28<03:53,  3.30it/s]

phrases and sentences that end with a period


 36%|███▌      | 429/1200 [02:28<04:03,  3.16it/s]

the word "you"


 36%|███▌      | 430/1200 [02:28<04:06,  3.12it/s]

personal pronouns followed by actions


 36%|███▌      | 431/1200 [02:29<04:07,  3.10it/s]

technical terms related to scientific research or study


 36%|███▌      | 432/1200 [02:29<04:15,  3.01it/s]

words related to medical and healthcare contexts


 36%|███▌      | 433/1200 [02:29<04:10,  3.06it/s]

phrases related to financial and economic activities, including investments, commodities, and energy sources


 36%|███▌      | 434/1200 [02:30<04:07,  3.10it/s]

political figures and terms related to politics


 36%|███▋      | 435/1200 [02:30<05:27,  2.34it/s]

terms related to instructions or technical details


 36%|███▋      | 436/1200 [02:31<05:14,  2.43it/s]

words related to technical terms or programming syntax


 36%|███▋      | 437/1200 [02:31<04:59,  2.55it/s]

terms related to legal or political matters, such as legislation, citizenship, and court decisions


 36%|███▋      | 438/1200 [02:31<04:34,  2.78it/s]

prepositions and conjunctions


 37%|███▋      | 439/1200 [02:32<04:19,  2.94it/s]

verbs related to physical actions or movements


 37%|███▋      | 440/1200 [02:32<04:13,  3.00it/s]

numerical values that seem to represent specific data points


 37%|███▋      | 441/1200 [02:32<04:06,  3.07it/s]

terms related to political or social issues or situations


 37%|███▋      | 442/1200 [02:33<04:05,  3.09it/s]

adjectives with morally or emotionally charged undertones


 37%|███▋      | 443/1200 [02:33<04:01,  3.14it/s]

phrases related to describing or advertising a specific content or product


 37%|███▋      | 444/1200 [02:33<04:05,  3.08it/s]

technical terms and concepts related to project management and work tasks


 37%|███▋      | 445/1200 [02:34<04:07,  3.05it/s]

words related to technical details and processes, such as precision, terminations, and machining


 37%|███▋      | 446/1200 [02:34<04:05,  3.08it/s]

 specific references to the Su-24 bomber aircraft being shot down by Turkish Air Force


 37%|███▋      | 447/1200 [02:34<04:04,  3.09it/s]

specific phrases related to legal matters


 37%|███▋      | 448/1200 [02:34<03:48,  3.29it/s]

words that are either misspelled or intentionally altered


 37%|███▋      | 449/1200 [02:35<04:17,  2.92it/s]

prepositions indicating movement or change in location


 38%|███▊      | 450/1200 [02:35<04:12,  2.97it/s]

websites or references related to specific articles


 38%|███▊      | 451/1200 [02:35<04:08,  3.01it/s]

special characters or symbols


 38%|███▊      | 452/1200 [02:36<04:07,  3.02it/s]

instances of words related to confidence and self-belief


UnicodeEncodeError: 'charmap' codec can't encode character '\u0122' in position 85: character maps to <undefined>

In [93]:
indices = []
values = []
for layer in tqdm.tqdm(range(12)):
    value, index = acts_weighted[layer].topk(100, largest=False)
    indices.append(index)
    values.append(value)

with open('explanations.json', 'w') as f:
    pbar = tqdm.tqdm(total=1200)
    NEURONPEDIA_DOMAIN = "https://neuronpedia.org"
    model = "gpt2-small"
    dataset = "res-jb"
    for layer in range(12):
        for idx in range(100):
            feature=indices[layer][idx].item()
            url = f"{NEURONPEDIA_DOMAIN}/api/feature/{model}/{layer}-{dataset}/{feature}"
            result = requests.get(url).json()
            if "explanations" in result and result["explanations"]:
                f.write(result["explanations"][0]["description"])
                f.write("\n")
            pbar.update(1)

100%|██████████| 12/12 [00:00<00:00, 12003.73it/s]
  2%|▏         | 21/1200 [00:07<07:01,  2.80it/s]
 43%|████▎     | 517/1200 [02:35<04:06,  2.77it/s]

UnicodeEncodeError: 'charmap' codec can't encode character '\u010a' in position 97: character maps to <undefined>