In [None]:
%pip install neuronpedia

from dotenv import load_dotenv

# Load NEURONPEDIA_API_KEY from .env file
load_dotenv()


### [less safe] Set Neuronpedia API key manually (get your key from neuronpedia.org/account)
# import os
# os.environ["NEURONPEDIA_API_KEY"] = "YOUR_KEY_HERE"

In [1]:
import json
from neuronpedia.butanium_dictionary_learning.dictionary_learning import CrossCoder
from neuronpedia.np_vector import NPVector

crosscoder = CrossCoder.from_pretrained("Butanium/gemma-2-2b-crosscoder-l13-mu4.1e-02-lr1e-04", from_hub=True)

FEATURE_INDEX = 221
FEATURE_NAME = "dates and time"

vector = crosscoder.encoder.weight[1][:, FEATURE_INDEX].detach().tolist()

# upload the vector to Neuronpedia
np_vector = NPVector.new(
    label=FEATURE_NAME,
    model_id="gemma-2-2b-it",
    layer_num=13,
    hook_type="hook_resid_pre",
    vector=vector,
    default_steer_strength=20,
)
# this gives us the ID of the vector, which we can use for steering
print(np_vector)

Sending POST request to http://localhost:3000/api/vector/new
Got a successful response.
NPVector(label='date', model_id='gemma-2-2b-it', source='13-neuronpedia-resid-pre', index='140777935', values=[-0.01878019981086254, -0.02243487909436226, 0.008205173537135124, 0.03408874198794365, -0.05568249151110649, 0.01886882819235325, 0.003709812415763736, 0.003696431871503592, -0.02429807186126709, -0.01971487514674664, -0.02307866141200066, -0.01886068470776081, -0.009600664488971233, -0.01018965430557728, -0.001868776045739651, 0.01672543957829475, 0.0007540361839346588, 0.0314553938806057, 0.0110303582623601, -0.02755042351782322, -0.01177451573312283, 0.04587624967098236, -0.04838990420103073, -0.02832342684268951, 0.02760405279695988, 0.01983129046857357, 0.02793074026703835, 0.0210083145648241, -0.03405335173010826, 0.0224988367408514, 0.002040979918092489, 0.001244997140020132, 0.007838851772248745, 0.01316340919584036, -0.006727463565766811, -0.003431570483371615, 0.0414169579744339, 

In [2]:

# do the steering using Neuronpedia
responseJson = np_vector.steer_chat(steered_chat_messages=[{"role": "user", "content": "Write a one sentence story."}])

print(json.dumps(responseJson, indent=2))

Sending POST request to http://localhost:3000/api/steer-chat/
Got a successful response.
{
  "STEERED": {
    "chat_template": [
      {
        "content": "Write a one sentence story.",
        "role": "user"
      },
      {
        "content": "The setting sun bled crimson into the endless expanse of Tuesday.",
        "role": "model"
      }
    ],
    "raw": "<bos><start_of_turn>user\nWrite a one sentence story.<end_of_turn>\n<start_of_turn>model\nThe setting sun bled crimson into the endless expanse of Tuesday. \n\n\n<end_of_turn><eos>"
  },
  "DEFAULT": {
    "chat_template": [
      {
        "content": "Write a one sentence story.",
        "role": "user"
      },
      {
        "content": "The last whisper of the wind carried a single, forgotten dandelion seed across the barren landscape.",
        "role": "model"
      }
    ],
    "raw": "<bos><start_of_turn>user\nWrite a one sentence story.<end_of_turn>\n<start_of_turn>model\nThe last whisper of the wind carried a single, 

In [4]:
# open the browser to continue steering
import webbrowser
webbrowser.open(responseJson["shareUrl"])

True