In [None]:
!pip install sae_lens

Collecting sae_lens
  Downloading sae_lens-6.3.1-py3-none-any.whl.metadata (5.6 kB)
Collecting automated-interpretability<1.0.0,>=0.0.5 (from sae_lens)
  Downloading automated_interpretability-0.0.13-py3-none-any.whl.metadata (852 bytes)
Collecting babe<0.0.8,>=0.0.7 (from sae_lens)
  Downloading babe-0.0.7-py3-none-any.whl.metadata (10 kB)
Collecting datasets<3.0.0,>=2.17.1 (from sae_lens)
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting plotly-express<0.5.0,>=0.4.1 (from sae_lens)
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting pytest-profiling<2.0.0,>=1.7.0 (from sae_lens)
  Downloading pytest_profiling-1.8.1-py3-none-any.whl.metadata (15 kB)
Collecting python-dotenv<2.0.0,>=1.0.1 (from sae_lens)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting pyzmq==26.0.0 (from sae_lens)
  Downloading pyzmq-26.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.2 kB)
Collecting safetensors<0.5.0,>=0.4.2

In [None]:
import torch
from functools import partial
from sae_lens import SAE, HookedSAETransformer

# --- Configuration ---
MODEL_NAME = "gpt2-small"
RELEASE    = "gpt2-small-res-jb"
SAE_ID     = "blocks.7.hook_resid_pre"
# Prefer the GPU, but fall back to CPU so the cell still runs without one.
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"

# --- Steering parameters ---
feature_id = 2      # placeholder: replace with a real feature index from Neuronpedia
strength   = 50.0   # multiplier applied to the steering vector
pos        = None   # None = all token positions, or an int for a single position

# --- Load SAE and model ---
# The SAE is loaded first because its metadata records how the base model has
# to be loaded (SAELens warns when these kwargs are ignored).
sae = SAE.from_pretrained(
    release=RELEASE,
    sae_id=SAE_ID,
)[0]
sae.to(DEVICE)

model_kwargs = sae.cfg.metadata.model_from_pretrained_kwargs or {}
if model_kwargs:
    # Loading path recommended by the SAELens warning for SAEs that carry
    # non-empty model_from_pretrained_kwargs.
    model = HookedSAETransformer.from_pretrained_no_processing(
        MODEL_NAME, device=DEVICE, **model_kwargs
    )
else:
    model = HookedSAETransformer.from_pretrained(MODEL_NAME, device=DEVICE)

# Name of the model hook point this SAE was trained on.
hook_name = sae.cfg.metadata.hook_name

Loaded pretrained model gpt2-small into HookedTransformer


This SAE has non-empty model_from_pretrained_kwargs. 
For optimal performance, load the model like so:
model = HookedSAETransformer.from_pretrained_no_processing(..., **cfg.model_from_pretrained_kwargs)
  sae = SAE.from_pretrained(


In [None]:
# Display the metadata object attached to the loaded SAE's config.
sae.cfg.metadata

SAEMetadata({'sae_lens_version': '6.3.1', 'sae_lens_training_version': None, 'model_name': 'gpt2-small', 'hook_name': 'blocks.7.hook_resid_pre', 'hook_head_index': None, 'dataset_path': 'Skylion007/openwebtext', 'context_size': 128, 'model_from_pretrained_kwargs': {'center_writing_weights': True}, 'neuronpedia_id': 'gpt2-small/7-res-jb', 'prepend_bos': True})

In [None]:
# instantiate an object to hold activations from a dataset
from sae_lens import ActivationsStore

# Device string handed to the activation store below.
device = "cuda"

# a convenient way to instantiate an activation store is to use the from_sae method
# NOTE(review): the saved output of this cell is a UnicodeDecodeError; its cause is
# not visible from this cell — confirm the dataset can be streamed before relying on it.
activation_store = ActivationsStore.from_sae(
    model=model,
    dataset=sae.cfg.metadata.dataset_path,
    sae=sae,
    streaming=True,
    # fairly conservative parameters here so can use same for larger
    # models without running out of memory.
    store_batch_size_prompts=8,
    train_batch_size_tokens=4096,
    n_batches_in_buffer=32,
    device=device,
)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa5 in position 1: invalid start byte

In [None]:
# model, sae, activation_store, steering_feature

In [None]:
from tqdm import tqdm
from functools import partial
import re

def find_max_activation(model, sae, activation_store, feature_idx, num_batches=100):
    """
    Find the maximum activation for a given feature index. This is useful for
    calibrating the right amount of the feature to add.

    Args:
        model: Object exposing ``run_with_cache(tokens, stop_at_layer=...,
            names_filter=...)`` that returns ``(output, cache)``.
        sae: SAE exposing ``cfg.metadata.hook_name`` and ``encode``.
        activation_store: Object exposing ``get_batch_tokens()``.
        feature_idx: Index of the SAE feature (last axis of ``sae.encode``).
        num_batches: Number of token batches to scan.

    Returns:
        The largest activation seen for ``feature_idx`` as a Python float
        (0.0 if nothing exceeded zero).

    Raises:
        ValueError: If no layer index can be parsed from the SAE hook name.
    """
    import torch  # local import so the function does not depend on another cell's imports

    # The hook name and layer do not change between batches, so resolve them once.
    hook_name = sae.cfg.metadata.hook_name
    layer_match = re.search(r"\.(\d+)\.", hook_name)
    if layer_match is None:
        raise ValueError(f"Cannot parse a layer index from hook name {hook_name!r}")
    layer = int(layer_match.group(1))

    max_activation = 0.0

    pbar = tqdm(range(num_batches))
    # Only forward activations are needed; skip autograd bookkeeping.
    with torch.no_grad():
        for _ in pbar:
            tokens = activation_store.get_batch_tokens()

            # Stop right after the hooked layer: later layers are not needed.
            _, cache = model.run_with_cache(
                tokens,
                stop_at_layer=layer + 1,
                names_filter=[hook_name],
            )
            feature_acts = sae.encode(cache[hook_name])

            # Collapse every leading axis into one. Unlike squeeze()+flatten(0, 1)
            # this keeps the feature axis intact when a leading axis has size 1.
            feature_acts = feature_acts.reshape(-1, feature_acts.shape[-1])
            batch_max_activation = feature_acts[:, feature_idx].max().item()
            max_activation = max(max_activation, batch_max_activation)

            pbar.set_description(f"Max activation: {max_activation:.4f}")

    return max_activation


def steering(
    activations, hook, steering_strength=1.0, steering_vector=None, max_act=1.0
):
    """Forward-hook function that adds a scaled steering vector to activations.

    Args:
        activations: Activations at the hook point.
        hook: Hook object supplied by the hooking machinery; not used here.
        steering_strength: Multiplier applied on top of ``max_act``.
        steering_vector: Direction to add. ``None`` (the default) means
            "no steering" and the activations are returned unchanged.
        max_act: Reference activation magnitude used to scale the vector.

    Returns:
        ``activations + max_act * steering_strength * steering_vector``, or
        ``activations`` itself when no steering vector is given.
    """
    if steering_vector is None:
        # Multiplying a float by the None default would raise TypeError.
        return activations
    # Note if the feature fires anyway, we'd be adding to that here.
    return activations + max_act * steering_strength * steering_vector


def generate_with_steering(
    model,
    sae,
    prompt,
    steering_feature,
    max_act,
    steering_strength=1.0,
    max_new_tokens=95,
):
    """Generate text from ``prompt`` while steering along one SAE feature.

    The decoder row ``sae.W_dec[steering_feature]`` is added (scaled by
    ``max_act * steering_strength`` via ``steering``) at the SAE's hook point
    for the duration of the ``model.generate`` call.

    Args:
        model: Object exposing ``to_tokens``, ``hooks``, ``generate``,
            ``tokenizer.decode`` and ``cfg.device``.
        sae: SAE exposing ``W_dec`` and a config carrying ``hook_name`` and
            ``prepend_bos``.
        prompt: Text to continue.
        steering_feature: Row index into ``sae.W_dec``.
        max_act: Reference activation magnitude for the feature.
        steering_strength: Extra multiplier on the steering vector.
        max_new_tokens: Number of tokens to generate.

    Returns:
        The text decoded from the first sequence returned by ``model.generate``.
    """

    def _cfg_field(name):
        # This notebook's SAE keeps hook_name / prepend_bos under cfg.metadata;
        # fall back to a flat cfg attribute for configs that expose them directly.
        metadata = getattr(sae.cfg, "metadata", None)
        value = getattr(metadata, name, None)
        return getattr(sae.cfg, name) if value is None else value

    prepend_bos = _cfg_field("prepend_bos")
    hook_name = _cfg_field("hook_name")

    input_ids = model.to_tokens(prompt, prepend_bos=prepend_bos)

    steering_vector = sae.W_dec[steering_feature].to(model.cfg.device)

    steering_hook = partial(
        steering,
        steering_vector=steering_vector,
        steering_strength=steering_strength,
        max_act=max_act,
    )

    # standard transformerlens syntax for a hook context for generation
    with model.hooks(fwd_hooks=[(hook_name, steering_hook)]):
        output = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            # Read the device from the model instead of a notebook-level global
            # that may be undefined or stale after a kernel restart.
            stop_at_eos=str(model.cfg.device) != "mps",
            prepend_bos=prepend_bos,
        )

    return model.tokenizer.decode(output[0])


# Choose a feature to steer towards
steering_feature = 20115

# Find the maximum activation for this feature
max_act = find_max_activation(model, sae, activation_store, steering_feature)
print(f"Maximum activation for feature {steering_feature}: {max_act:.4f}")

# note we could also get the max activation from Neuronpedia (https://www.neuronpedia.org/api-doc#tag/lookup/GET/api/feature/{modelId}/{layer}/{index})

# Generate text without steering for comparison
prompt = "Once upon a time"
normal_text = model.generate(
    prompt,
    max_new_tokens=95,
    # Ask the model which device it is on rather than relying on a `device`
    # variable defined in another cell.
    stop_at_eos=str(model.cfg.device) != "mps",
    # prepend_bos is stored in the SAE metadata, like hook_name.
    prepend_bos=sae.cfg.metadata.prepend_bos,
)

print("\nNormal text (without steering):")
print(normal_text)

# Generate text with steering
steered_text = generate_with_steering(
    model, sae, prompt, steering_feature, max_act, steering_strength=2.0
)
print("Steered text:")
print(steered_text)

RuntimeError: All proxies failed

# SAELENS

In [None]:
def find_max_activation(model, sae, activation_store, feature_idx, num_batches=100):
    """
    Find the maximum activation for a given feature index. This is useful for
    calibrating the right amount of the feature to add.

    Args:
        model: Object exposing ``run_with_cache(tokens, stop_at_layer=...,
            names_filter=...)`` that returns ``(output, cache)``.
        sae: SAE exposing ``cfg.metadata.hook_name`` and ``encode``.
        activation_store: Object exposing ``get_batch_tokens()``.
        feature_idx: Index of the SAE feature (last axis of ``sae.encode``).
        num_batches: Number of token batches to scan.

    Returns:
        The largest activation seen for ``feature_idx`` as a Python float
        (0.0 if nothing exceeded zero).

    Raises:
        ValueError: If no layer index can be parsed from the SAE hook name.
    """
    import torch  # local import so the function does not depend on another cell's imports

    # Hook name and layer are loop-invariant: resolve them once up front.
    hook_name = sae.cfg.metadata.hook_name
    layer_match = re.search(r"\.(\d+)\.", hook_name)
    if layer_match is None:
        raise ValueError(f"Cannot parse a layer index from hook name {hook_name!r}")
    layer = int(layer_match.group(1))

    max_activation = 0.0

    pbar = tqdm(range(num_batches))
    # Inference only: no gradients are needed for a max over activations.
    with torch.no_grad():
        for _ in pbar:
            tokens = activation_store.get_batch_tokens()

            # Run only up to (and including) the hooked layer.
            _, cache = model.run_with_cache(
                tokens,
                stop_at_layer=layer + 1,
                names_filter=[hook_name],
            )
            feature_acts = sae.encode(cache[hook_name])

            # Merge all leading axes; squeeze()+flatten(0, 1) would drop the
            # feature axis whenever a leading axis happens to have size 1.
            feature_acts = feature_acts.reshape(-1, feature_acts.shape[-1])
            batch_max_activation = feature_acts[:, feature_idx].max().item()
            max_activation = max(max_activation, batch_max_activation)

            pbar.set_description(f"Max activation: {max_activation:.4f}")

    return max_activation



# instantiate an object to hold activations from a dataset
from sae_lens import ActivationsStore

# a convenient way to instantiate an activation store is to use the from_sae method
# NOTE(review): `device` is not assigned in this cell; it must already exist in the
# kernel from another cell.
activation_store = ActivationsStore.from_sae(
    model=model,
    dataset=sae.cfg.metadata.dataset_path,
    sae=sae,
    streaming=True,
    # fairly conservative parameters here so can use same for larger
    # models without running out of memory.
    store_batch_size_prompts=8,
    train_batch_size_tokens=4096,
    n_batches_in_buffer=32,
    device=device,
)

# Feature index to calibrate; max_act is the largest activation found for it
# over find_max_activation's default number of batches.
steering_feature = 1230
max_act = find_max_activation(model, sae, activation_store, steering_feature)


In [None]:
# from transformer_lens import HookedTransformer
from sae_lens import SAE, HookedSAETransformer

# model = HookedSAETransformer.from_pretrained("gpt2-small", device="cuda")
# sae, sae_cfg, _ = SAE.from_pretrained(
#     release="gpt2-small-res-jb",  # <- Release name
#     sae_id="blocks.7.hook_resid_pre",  # <- SAE id (not always a hook point!)
#     device="cuda",
# )

# ==========================================================================
# Which SAE family to load; interpolated into the release name below.
SAE_TYPE = "res"    # res/ mlp/ att
MODEL_NAME = "gemma-2-2b"
RELEASE = f"gemma-scope-2b-pt-{SAE_TYPE}-canonical"
# Layer whose SAE is loaded; interpolated into the SAE id below.
LAYER = 25
SAE_ID = f"layer_{LAYER}/width_16k/canonical"

# Load the base model and the matching SAE onto the GPU.
model = HookedSAETransformer.from_pretrained(MODEL_NAME, device="cuda")
# The second and third returned values are bound here but not used in the
# cells visible in this notebook.
sae, cfg, _ = SAE.from_pretrained(
    release=RELEASE,  # <- Release name
    sae_id=SAE_ID,  # <- SAE id (not always a hook point!)
    device="cuda",
)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Generate text without steering for comparison
prompt = "I am feeling good"
normal_text = model.generate(
    prompt,
    max_new_tokens=65,
    # str() makes the comparison work whether cfg.device is a string or a torch.device.
    stop_at_eos=str(model.cfg.device) != "mps",
    # prepend_bos is stored in the SAE metadata, like hook_name.
    prepend_bos=sae.cfg.metadata.prepend_bos,
)
print(normal_text)

NameError: name 'model' is not defined

In [None]:
from tqdm import tqdm
from functools import partial
import re


def steering(
    activations, hook, steering_strength=1.0, steering_vector=None, max_act=1.0
):
    """Add a scaled steering vector to the activations at a hook point.

    Args:
        activations: Activations at the hook point.
        hook: Hook object passed in by the hooking machinery; unused.
        steering_strength: Multiplier applied on top of ``max_act``.
        steering_vector: Direction to add; with the default ``None`` the
            activations are returned untouched.
        max_act: Reference activation magnitude used to scale the vector.

    Returns:
        ``activations + max_act * steering_strength * steering_vector``, or
        ``activations`` itself when ``steering_vector`` is ``None``.
    """
    if steering_vector is None:
        # The None default cannot be multiplied; treat it as "no steering".
        return activations
    # Note if the feature fires anyway, we'd be adding to that here.
    return activations + max_act * steering_strength * steering_vector

device = "cuda"

def generate_with_steering(
    model,
    sae,
    prompt,
    steering_feature,
    max_act,
    steering_strength=1.0,
    max_new_tokens=64,
):
    """Generate text from ``prompt`` while steering along one SAE feature.

    The decoder row ``sae.W_dec[steering_feature]`` is added (scaled by
    ``max_act * steering_strength`` via ``steering``) at the SAE's hook point
    while ``model.generate`` runs.

    Args:
        model: Object exposing ``to_tokens``, ``hooks``, ``generate``,
            ``tokenizer.decode`` and ``cfg.device``.
        sae: SAE exposing ``W_dec`` and a config carrying ``hook_name`` and
            ``prepend_bos``.
        prompt: Text to continue.
        steering_feature: Row index into ``sae.W_dec``.
        max_act: Reference activation magnitude for the feature.
        steering_strength: Extra multiplier on the steering vector.
        max_new_tokens: Number of tokens to generate.

    Returns:
        The text decoded from the first sequence returned by ``model.generate``.
    """

    def _cfg_field(name):
        # Other cells of this notebook read the hook name from cfg.metadata;
        # look there first and only then fall back to a flat cfg attribute.
        metadata = getattr(sae.cfg, "metadata", None)
        value = getattr(metadata, name, None)
        return getattr(sae.cfg, name) if value is None else value

    prepend_bos = _cfg_field("prepend_bos")
    hook_name = _cfg_field("hook_name")

    input_ids = model.to_tokens(prompt, prepend_bos=prepend_bos)

    steering_vector = sae.W_dec[steering_feature].to(model.cfg.device)

    steering_hook = partial(
        steering,
        steering_vector=steering_vector,
        steering_strength=steering_strength,
        max_act=max_act,
    )

    # standard transformerlens syntax for a hook context for generation
    with model.hooks(fwd_hooks=[(hook_name, steering_hook)]):
        output = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=0.5,
            top_p=0.9,
            # Use the model's own device rather than a notebook-level global.
            stop_at_eos=str(model.cfg.device) != "mps",
            prepend_bos=prepend_bos,
        )

    return model.tokenizer.decode(output[0])

# Steer the prompt along SAE feature 1 and show the resulting text.
prompt = "I am feeling good"
steered_text = generate_with_steering(
    model=model,
    sae=sae,
    prompt=prompt,
    steering_feature=1,
    max_act=2.0,
    steering_strength=20.0,
)
# One print call, newline-separated: same two output lines as before.
print("Steered text:", steered_text, sep="\n")

# Get explanations

In [None]:
import pandas as pd
# Absolute path of the CSV on the mounted drive; later cells write back to it.
path = "/content/drive/MyDrive/LLM Interpretability/datasets/negative_feelings.csv"
# "utf-8-sig" strips a leading byte-order mark if the file has one.
df = pd.read_csv(path, encoding="utf-8-sig")
# Preview the first rows.
df.head()

Unnamed: 0,type,layer,class1,class2,Neuron_ID,Contribution,Explanation
0,mlp,24,devastated,embarrassed,282,12699340.0,hyperlinks and web addresses
1,mlp,24,embarrassed,furious,282,12655612.0,hyperlinks and web addresses
2,mlp,24,ashamed,embarrassed,282,12571800.0,hyperlinks and web addresses
3,mlp,24,disappointed,embarrassed,282,12484344.0,hyperlinks and web addresses
4,mlp,24,disgusted,embarrassed,282,12375024.0,hyperlinks and web addresses


In [None]:
import requests

def get_feature(model_id, source, index):
    """Return the first Neuronpedia explanation text for a feature, or None.

    Best-effort lookup: any network/HTTP failure, non-JSON body, or a feature
    without explanations yields ``None`` instead of raising.

    Args:
        model_id: Neuronpedia model id used in the URL path.
        source: Neuronpedia source id used in the URL path.
        index: Feature index within that source.

    Returns:
        The ``description`` of the first entry in the response's
        ``explanations`` list, or ``None`` when it is unavailable.
    """
    url = f"https://www.neuronpedia.org/api/feature/{model_id}/{source}/{index}"
    try:
        # A timeout keeps one stalled request from hanging the whole loop.
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()            # throws if the status is an HTTP error
        feature = resp.json()
    except (requests.RequestException, ValueError):
        # Connection/HTTP errors, timeouts, or a body that is not valid JSON.
        return None

    if not isinstance(feature, dict):
        return None

    # The list of explanations may be missing, null, or empty.
    explanations = feature.get("explanations") or []
    if not explanations:
        return None

    first = explanations[0]
    if not isinstance(first, dict):
        return None
    return first.get("description")

In [None]:
from tqdm.auto import tqdm

# Neuronpedia model id. Kept under its own name so the `model` variable that
# earlier cells bind to the loaded transformer is not overwritten by a string.
NEURONPEDIA_MODEL_ID = "gemma-2-2b"
# Upper bound on how many rows (by position) are processed in one run.
MAX_ROWS = 25_000

if "Explanation" not in df.columns:
    df["Explanation"] = None

rows_to_fetch = df.iloc[:MAX_ROWS]

try:
    for idx, row in tqdm(
        rows_to_fetch.iterrows(), total=len(rows_to_fetch), desc="Fetching features"
    ):
        # Missing values read back from the CSV are NaN rather than None, so an
        # `is not None` test would wrongly treat them as already filled in.
        if pd.notna(row["Explanation"]):
            continue

        source = f"{row['layer']}-gemmascope-{row['type']}-16k"
        feat = get_feature(NEURONPEDIA_MODEL_ID, source, row["Neuron_ID"])
        if feat is not None:
            df.at[idx, "Explanation"] = feat
finally:
    # Save whatever was fetched even if the loop is interrupted part-way.
    df.to_csv(path, index=False, encoding="utf-8-sig")


Fetching features:   0%|          | 0/50000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import pandas as pd
import asyncio
import aiohttp
from tqdm.asyncio import tqdm

# Neuronpedia model id; fetch_feature below reads it as a global.
# NOTE(review): this rebinds `model`, which earlier cells bind to the loaded
# HookedSAETransformer — re-run the model-loading cell before generating text again.
model = "gemma-2-2b"

# Rows processed in this run: the first 25,000 by position.
chunk = df.iloc[0:25000]

# --- Define async API call ---
async def fetch_feature(session, idx, row, model_id=None):
    """Fetch the first Neuronpedia explanation for one row.

    Args:
        session: Object whose ``get(url, timeout=...)`` returns an async
            context manager yielding a response with ``raise_for_status()``
            and an awaitable ``json()``.
        idx: Row label, returned unchanged so results can be matched back.
        row: Mapping/Series with ``type``, ``layer``, ``Neuron_ID`` and,
            optionally, ``Explanation``.
        model_id: Neuronpedia model id. Defaults to the notebook-level
            ``model`` variable when omitted.

    Returns:
        ``(idx, explanation)``. The existing explanation is returned as-is when
        the row already has one; otherwise the fetched description, or ``None``
        if the request failed or the feature has no explanations.
    """
    explanation = row.get("Explanation")
    # Missing cells loaded from CSV are NaN, not None; pd.notna covers both.
    if pd.notna(explanation):
        return idx, explanation

    if model_id is None:
        model_id = model  # notebook-level Neuronpedia model id

    source = f"{row['layer']}-gemmascope-{row['type']}-16k"
    url = f"https://www.neuronpedia.org/api/feature/{model_id}/{source}/{row['Neuron_ID']}"

    try:
        async with session.get(url, timeout=10) as resp:
            resp.raise_for_status()
            data = await resp.json()
    except Exception:
        # Deliberately best-effort: one failed request must not abort the batch.
        return idx, None

    explanations = data.get("explanations") if isinstance(data, dict) else None
    if not explanations:
        return idx, None

    first = explanations[0]
    description = first.get("description") if isinstance(first, dict) else None
    return idx, description

# --- Async executor ---
async def run():
    """Fetch explanations for every row of ``chunk`` and return the results.

    Returns:
        A list of the values returned by ``fetch_feature``, in completion
        order (not row order).
    """
    session_timeout = aiohttp.ClientTimeout(total=10)
    pool = aiohttp.TCPConnector(limit=10)  # Controls concurrency level
    async with aiohttp.ClientSession(connector=pool, timeout=session_timeout) as session:
        pending = [
            fetch_feature(session, row_label, row)
            for row_label, row in chunk.iterrows()
        ]
        progress = tqdm.as_completed(pending, total=len(pending), desc="Async fetching")
        # Await each coroutine as it finishes, collecting results in that order.
        return [await finished for finished in progress]

# --- Run & apply results ---
results = await run()
for idx, explanation in results:
    df.at[idx, "Explanation"] = explanation

df.to_csv(path, index=False, encoding="utf-8-sig")

Async fetching: 100%|██████████| 50000/50000 [00:15<00:00, 3152.09it/s]


In [None]:
# Write the frame to `path` without the index column, using the same
# "utf-8-sig" encoding the file was read with.
df.to_csv(path, index=False, encoding="utf-8-sig")

# STEERING PIPELINE


In [1]:
!pip install sae_lens



In [None]:
# from transformer_lens import HookedTransformer
from sae_lens import SAE, HookedSAETransformer

# model = HookedSAETransformer.from_pretrained("gpt2-small", device="cuda")
# sae, sae_cfg, _ = SAE.from_pretrained(
#     release="gpt2-small-res-jb",  # <- Release name
#     sae_id="blocks.7.hook_resid_pre",  # <- SAE id (not always a hook point!)
#     device="cuda",
# )

# ==========================================================================

# Single place to choose the device; both loaders below use it.
device = "cuda"
MODEL_NAME = "gemma-2-2b"

# Which SAE family and layer to load; both are interpolated into the ids below.
SAE_TYPE = "res"    # res/ mlp/ att
LAYER = 25
SAE_ID = f"layer_{LAYER}/width_16k/canonical"
RELEASE = f"gemma-scope-2b-pt-{SAE_TYPE}-canonical"


# Pass `device` instead of repeating the "cuda" literal, so changing the
# constant above actually changes where the model and SAE are placed.
model = HookedSAETransformer.from_pretrained(MODEL_NAME, device=device)
sae, cfg, _ = SAE.from_pretrained(
    release=RELEASE,  # <- Release name
    sae_id=SAE_ID,  # <- SAE id (not always a hook point!)
    device=device,
)



config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [None]:
from functools import partial


def steering(
    activations, hook, steering_strength=1.0, steering_vector=None, max_act=1.0
):
    """Hook function: shift activations along a steering vector.

    Args:
        activations: Activations at the hook point.
        hook: Hook object provided by the caller of the hook; unused.
        steering_strength: Multiplier applied on top of ``max_act``.
        steering_vector: Direction to add. When left at ``None`` nothing is
            added and the input is returned as-is.
        max_act: Reference activation magnitude used to scale the vector.

    Returns:
        ``activations + max_act * steering_strength * steering_vector``, or
        the unmodified ``activations`` when there is no steering vector.
    """
    if steering_vector is None:
        # Without this guard the default argument raises TypeError below.
        return activations
    # Note if the feature fires anyway, we'd be adding to that here.
    return activations + max_act * steering_strength * steering_vector


def generate_with_steering(
    model,
    sae,
    prompt,
    steering_feature,
    max_act,
    steering_strength=1.0,
    max_new_tokens=64,
):
    """Generate text from ``prompt`` while steering along one SAE feature.

    Adds ``sae.W_dec[steering_feature]`` (scaled by
    ``max_act * steering_strength`` via ``steering``) at the SAE's hook point
    for the duration of ``model.generate``.

    Args:
        model: Object exposing ``to_tokens``, ``hooks``, ``generate``,
            ``tokenizer.decode`` and ``cfg.device``.
        sae: SAE exposing ``W_dec`` and a config carrying ``hook_name`` and
            ``prepend_bos``.
        prompt: Text to continue.
        steering_feature: Row index into ``sae.W_dec``.
        max_act: Reference activation magnitude for the feature.
        steering_strength: Extra multiplier on the steering vector.
        max_new_tokens: Number of tokens to generate.

    Returns:
        The text decoded from the first sequence returned by ``model.generate``.
    """

    def _cfg_field(name):
        # Prefer cfg.metadata (where this notebook's SAE keeps hook_name and
        # prepend_bos); fall back to a flat attribute on cfg otherwise.
        metadata = getattr(sae.cfg, "metadata", None)
        value = getattr(metadata, name, None)
        return getattr(sae.cfg, name) if value is None else value

    prepend_bos = _cfg_field("prepend_bos")
    hook_name = _cfg_field("hook_name")

    input_ids = model.to_tokens(prompt, prepend_bos=prepend_bos)

    steering_vector = sae.W_dec[steering_feature].to(model.cfg.device)

    steering_hook = partial(
        steering,
        steering_vector=steering_vector,
        steering_strength=steering_strength,
        max_act=max_act,
    )

    # standard transformerlens syntax for a hook context for generation
    with model.hooks(fwd_hooks=[(hook_name, steering_hook)]):
        output = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=0.5,
            top_p=0.9,
            # Derive the device from the model, not from a global set in another cell.
            stop_at_eos=str(model.cfg.device) != "mps",
            prepend_bos=prepend_bos,
        )

    return model.tokenizer.decode(output[0])

def generate_without_steering(model, sae, prompt, max_new_tokens):
    """Generate a baseline (unsteered) continuation of ``prompt``.

    Args:
        model: Object exposing ``generate`` and ``cfg.device``.
        sae: SAE whose config supplies ``prepend_bos`` (looked up under
            ``cfg.metadata`` first, then directly on ``cfg``).
        prompt: Text to continue.
        max_new_tokens: Number of tokens to generate.

    Returns:
        Whatever ``model.generate`` returns for the prompt.
    """
    # This notebook's SAE keeps prepend_bos in cfg.metadata; a flat cfg
    # attribute is used only when the metadata does not provide it.
    metadata = getattr(sae.cfg, "metadata", None)
    prepend_bos = getattr(metadata, "prepend_bos", None)
    if prepend_bos is None:
        prepend_bos = sae.cfg.prepend_bos

    return model.generate(
        prompt,
        max_new_tokens=max_new_tokens,
        # str() keeps the check valid for both string and torch.device values.
        stop_at_eos=str(model.cfg.device) != "mps",
        prepend_bos=prepend_bos,
    )