In [1]:
# Report the GPU, driver and CUDA version visible to this kernel before any model is loaded.
!nvidia-smi

Tue Aug  5 14:22:18 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-32GB           On  | 00000000:3B:00.0 Off |                    0 |
| N/A   30C    P0              26W / 250W |      0MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import os

# SECURITY: API tokens must never be pasted into the notebook -- they end up in
# version control and in every shared copy. They are read from the environment
# instead. Any token that was previously written in this cell should be treated
# as leaked and revoked with the issuing service.
#
# Both names are None when the corresponding variable is not set.
NEURONPEDIA_KEY = os.environ.get("NEURONPEDIA_API_KEY")
GEMMA2B_KEY = os.environ.get("HF_TOKEN")  # Hugging Face access token

In [None]:
import os
# Directory used as the Hugging Face cache (something like $SCRATCH/hf_models).
PATH = "/users/k24086575/inf_narrative_msc/k24086575"

# Point both cache variables at that directory. This runs before the cells that
# import huggingface_hub / sae_lens further down.
os.environ.update(HF_HOME=PATH, TRANSFORMERS_CACHE=PATH)

In [None]:
# Keep the model cache directory under its own name: it is passed as `cache_dir`
# when the model is loaded, while PATH is re-bound to dataset locations by later cells.
CACHE_PATH = PATH

In [None]:
import os

from huggingface_hub import login
import torch

# SECURITY: the Hugging Face access token is read from the HF_TOKEN environment
# variable instead of being written into the notebook. A token that was
# previously committed here should be revoked.
# When HF_TOKEN is unset this is None and `login` falls back to asking for a token.
GEMMA2B_KEY = os.environ.get("HF_TOKEN")
login(token=GEMMA2B_KEY)

# The notebook only runs inference, so gradient tracking is switched off globally.
torch.set_grad_enabled(False)

In [None]:
import torch
# Ask PyTorch to release cached GPU memory it is not using before the model is loaded in the next cell.
torch.cuda.empty_cache()

In [7]:
from sae_lens import SAE, HookedSAETransformer

# Checkpoint name and target device for the base language model.
MODEL_NAME = "gemma-2-2b"
device = "cuda"

# Load the model once; `cache_dir` points at the Hugging Face cache configured above.
model = HookedSAETransformer.from_pretrained(
    MODEL_NAME,
    device=device,
    cache_dir=CACHE_PATH,
)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Loaded pretrained model gemma-2-2b into HookedTransformer


In [8]:
from functools import partial

def generate_without_steering(model, sae, prompt, max_new_tokens):
    """Generate a continuation of ``prompt`` with no steering hook attached.

    Args:
        model: object exposing ``generate(...)`` and ``cfg.device``
            (the loaded HookedSAETransformer in this notebook).
        sae: only ``sae.cfg.prepend_bos`` is read, so the prompt is tokenized
            the same way as in the steered runs.
        prompt: passed to ``model.generate`` unchanged.
        max_new_tokens: forwarded to ``model.generate``.

    Returns:
        Whatever ``model.generate`` returns for these arguments.
    """
    # Generation is not stopped at EOS when the model sits on an "mps" device;
    # on every other device it is.
    halt_on_eos = not (model.cfg.device == "mps")
    return model.generate(
        prompt,
        max_new_tokens=max_new_tokens,
        stop_at_eos=halt_on_eos,
        prepend_bos=sae.cfg.prepend_bos,
    )

    
def steering(
    activations, hook, steering_strength=1.0, steering_vector=None, max_act=1.0
):
    """Forward hook that shifts activations along a steering direction.

    Args:
        activations: value produced at the hook point.
        hook: hook-point object supplied by the hooking machinery; not used here.
        steering_strength: multiplier applied to the steering vector
            (a negative value pushes in the opposite direction).
        steering_vector: direction to add. Required in practice; the ``None``
            default only exists so the function can be specialised with
            ``functools.partial``.
        max_act: additional scale factor applied to the steering vector.

    Returns:
        ``activations + max_act * steering_strength * steering_vector``.

    Raises:
        TypeError: if ``steering_vector`` was not supplied.
    """
    if steering_vector is None:
        # Same exception type the arithmetic below would raise, but with a
        # message that names the actual mistake.
        raise TypeError(
            "steering() requires a steering_vector; got None"
        )
    # Note if the feature fires anyway, we'd be adding to that here.
    return activations + max_act * steering_strength * steering_vector


def generate_with_steering(
    model,
    sae,
    prompt,
    steering_feature,
    max_act=1.0,
    steering_strength=1.0,
    max_new_tokens=64,
):
    """Generate text while adding one SAE decoder direction at the SAE's hook point.

    Args:
        model: hooked transformer used for tokenization, generation and decoding.
        sae: sparse autoencoder; its ``W_dec`` supplies the steering vector and
            its ``cfg`` supplies ``hook_name`` and ``prepend_bos``.
        prompt: text to continue.
        steering_feature: index into ``sae.W_dec`` selecting the direction to add.
        max_act: scale factor forwarded to the ``steering`` hook.
        steering_strength: multiplier forwarded to the ``steering`` hook;
            negative values steer away from the feature.
        max_new_tokens: number of tokens to generate.

    Returns:
        The decoded first sequence of the generation output. The prompt and any
        special tokens (e.g. ``<bos>``) are part of the decoded string.

    Note:
        Sampling here uses ``temperature=0.5`` and ``top_p=0.9``, whereas
        ``generate_without_steering`` leaves those at the library defaults, so
        differences between the two outputs are not due to steering alone.
    """
    input_ids = model.to_tokens(prompt, prepend_bos=sae.cfg.prepend_bos)

    # Row of the decoder matrix for the chosen feature, moved to the model's device.
    steering_vector = sae.W_dec[steering_feature].to(model.cfg.device)

    steering_hook = partial(
        steering,
        steering_vector=steering_vector,
        steering_strength=steering_strength,
        max_act=max_act,
    )

    # Decide EOS handling from the model that was passed in (as
    # generate_without_steering does) instead of a notebook-global `device`.
    stop_at_eos = not (model.cfg.device == "mps")

    # standard transformerlens syntax for a hook context for generation
    with model.hooks(fwd_hooks=[(sae.cfg.hook_name, steering_hook)]):
        output = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=0.5,
            top_p=0.9,
            stop_at_eos=stop_at_eos,
            prepend_bos=sae.cfg.prepend_bos,
        )

    return model.tokenizer.decode(output[0])

def steer(mode, sae, prompt, steering_feature, max_act=1.0, steering_strength=50.0, max_new_tokens=64):
    """Generate unsteered, positively steered and negatively steered text for one feature.

    Args:
        mode: the model to generate with. The parameter name is kept as-is for
            backward compatibility; callers pass the model positionally.
        sae: sparse autoencoder that owns the feature.
        prompt: text to continue.
        steering_feature: index of the SAE feature to steer with.
        max_act: scale factor forwarded to the steered generations.
        steering_strength: magnitude of the steering; it is applied once as given
            and once negated.
        max_new_tokens: number of tokens generated in each of the three runs.

    Returns:
        Tuple ``(without_steering, with_pos_steering, with_neg_steering)``.
    """
    # Use the arguments that were actually passed in. Reading the notebook
    # globals `model` / `NEURON_ID` here would silently steer a different
    # feature whenever the caller had not re-bound those globals first.
    model = mode
    without_steering = generate_without_steering(model, sae, prompt, max_new_tokens=max_new_tokens)
    with_pos_steering = generate_with_steering(
        model=model,
        sae=sae,
        prompt=prompt,
        steering_feature=steering_feature,
        max_act=max_act,
        steering_strength=steering_strength,
        max_new_tokens=max_new_tokens,
    )
    with_neg_steering = generate_with_steering(
        model=model,
        sae=sae,
        prompt=prompt,
        steering_feature=steering_feature,
        max_act=max_act,
        steering_strength=-steering_strength,
        max_new_tokens=max_new_tokens,
    )
    return without_steering, with_pos_steering, with_neg_steering

In [9]:
# Single-feature sanity check: choose a prompt and one SAE feature, then load
# the SAE that feature belongs to. Nothing is generated in this cell.
prompt = "How do you feel?"

# Feature to steer with: SAE family ("res" or "mlp"), layer and feature index.
NEURON_ID = 2793
LAYER = 4
SAE_TYPE = "mlp"    # res/ mlp

# Identifiers of the canonical 16k-wide SAE for that family and layer.
SAE_ID = f"layer_{LAYER}/width_16k/canonical"  # SAE id (not always a hook point!)
RELEASE = f"gemma-scope-2b-pt-{SAE_TYPE}-canonical"  # release name

# Only `sae` is used by the following cells; the other two returned values are kept unused.
sae, cfg, _ = SAE.from_pretrained(release=RELEASE, sae_id=SAE_ID, device="cuda")

In [10]:
# Run the three generations (unsteered, positive, negative) for the feature chosen above.
# The trailing positional arguments are max_act=1.0 and steering_strength=50.
output = steer(model, sae, prompt, NEURON_ID, 1.0, 50)

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

In [11]:
# output[0] is the first element returned by steer(): the generation without steering.
print("Without Steering")
output[0]

Without Steering


'How do you feel? Are you a man or woman who is in extreme discomfort or agony? Do you feel sad about the physical and mental ailments or illnesses that are piling up? Do you feel drained by the day and you have no energy to plan your day? If you find yourself in this situation when you do any of this, then maybe'

In [12]:
# output[1] is the generation produced with the positive steering strength.
print("Steered text:")
output[1]

Steered text:


"<bos>How do you feel?\n\n[User 0001]\n\nI feel like I'm about to burst. I'm so excited! I'm going to my first ever Disney cruise in 3 days! I'm so excited I can't even describe it. I'm so excited I'm going to"

In [13]:
# output[2] is the generation produced with the negated steering strength.
print("Negative Steered: ")
output[2]

Negative Steered: 


'<bos>How do you feel?\n\nHow do you feel?\n\nHow do you feel?\n\nHow do you feel?\n\nHow do you feel?\n\nHow do you feel?\n\nHow do you feel?\n\nHow do you feel?\n\nHow do you feel?\n\nHow do you feel?\n\nHow do you feel?\n\nHow do you'

# Steer each model 

In [14]:
from collections import defaultdict
import logging
from tqdm import tqdm

# 1) SAE cache: loaded SAEs keyed by (sae_type, layer), so each one is fetched only once.
sae_cache = {}


def get_sae(sae_type, layer):
    """Return the canonical 16k-wide SAE for ``sae_type`` at ``layer``, loading it on first use.

    Args:
        sae_type: SAE family inserted into the release name (this notebook uses "res" and "mlp").
        layer: layer number inserted into the SAE id.

    Returns:
        The SAE object, loaded onto "cuda" and memoised in ``sae_cache``.
    """
    cache_key = (sae_type, layer)
    try:
        return sae_cache[cache_key]
    except KeyError:
        pass

    # Cache miss: load the SAE; the config and third returned value are not needed.
    loaded_sae, _cfg, _unused = SAE.from_pretrained(
        release=f"gemma-scope-2b-pt-{sae_type}-canonical",
        sae_id=f"layer_{layer}/width_16k/canonical",
        device="cuda",
    )
    sae_cache[cache_key] = loaded_sae
    return loaded_sae

In [43]:
# prompt = "What do you think about robbing a bank?"
# prompt = "What is the good life according to Aristotle?"
# prompt = "Should I rob a bank?"

# mapper = {
#     "math": "Solve for x: 3x² – 5x + 2 = 0, and show your work.",
#     "programming": "Solve for x: 3x² – 5x + 2 = 0, and show your work.",
#     "mmlu": "Solve for x: 3x² – 5x + 2 = 0, and show your work.",
#     "empathetic_diloague": "how do you feel",
    
#     "negative_feelings": "how do you feel?",
#     "positive_feelings": "how do you feel?",
#     "law_and_policy": "should I rob a bank?",
#     "philosophy_and_ethics": "should I rob a bank?",
# }

# Prompt used for each feature category; the category name is the dictionary key.
mapper = {
    "math": "Give me the Fibonacci sequence",
    "programming": "Give me the Fibonacci sequence",
    "mmlu": "Give me the Fibonacci sequence",
    "empathetic_dialogue": "What makes you happy",

    "negative_feelings": "What makes you happy?",
    "positive_feelings": "What makes you happy?",
    "law_and_policy": "Can I break the law?",
    "philosophy_and_ethics": "Can I break the law?",
}
# The key used to be misspelled "empathetic_diloague", which would raise KeyError
# for the category name "empathetic_dialogue". The old spelling is kept as an
# alias so existing lookups keep working.
mapper["empathetic_diloague"] = mapper["empathetic_dialogue"]


# mapper = {
#     "math": "Give me the Fibonacci sequence",
#     "programming": "Give me the Fibonacci sequence",
#     "mmlu": "Give me the Fibonacci sequence",
#     "empathetic_diloague": "What's in your mind",
    
#     "negative_feelings": "What's in your mind?",
#     "positive_feelings": "What's in your mind?",
#     "law_and_policy": "Is it legal to rob?",
#     "philosophy_and_ethics": "Is it legal to rob?",
# }



In [44]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

# BETWEEN_CLASS = ["empathetic_dialogue", "math", "programming", "mmlu"]
WITHIN_CLASS = ["law_and_policy", "negative_feelings", "positive_feelings", "philosophy_and_ethics"]

# VARIABLES TO CHANGE
# NAME = WITHIN_CLASS[1]
PATH = "./datasets/dot_product/Judge/"
output_path = "./datasets/dot_product/steered/"

# Create the output directory up front so a long run cannot fail at the final write.
os.makedirs(output_path, exist_ok=True)

for NAME in WITHIN_CLASS:
    # 1) LOAD THE DATA
    print(NAME)
    df = pd.read_csv(os.path.join(PATH, f"{NAME}_with_explanations.csv"))
    print(df.shape)

    # Keep rows whose last column equals "yes", then only the 'res' / 'mlp' SAE types.
    last_col_name = df.columns[-1]
    df = df[df[last_col_name] == "yes"]
    print(df.shape)
    # .copy() makes the filtered frame independent, so adding the result columns
    # below never writes into a view of the original frame.
    df = df[df['type'].isin(['res', 'mlp'])].copy()

    # STEERING

    # The prompt depends only on the category, not on the individual row.
    prompt = mapper[NAME]

    # 2) Prepare accumulators (one entry per row, in row order)
    original = []
    steered_pos = []
    steered_neg = []

    # 3) Iterate one-by-one, with caching
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Steering neurons"):
        # unpack row
        # NEURON_ID is bound at module level as well as passed explicitly, so the
        # result does not depend on whether `steer` honours its
        # `steering_feature` argument.
        SAE_TYPE  = row["type"]
        LAYER     = row["layer"]
        NEURON_ID = row["Neuron_ID"]

        # get cached SAE
        sae = get_sae(SAE_TYPE, LAYER)

        # one unsteered generation plus one at +50 and one at -50
        data = steer(model, sae=sae, prompt=prompt, steering_feature=NEURON_ID, steering_strength=+50)

        # save out
        original.append(data[0])
        steered_pos.append(data[1])
        steered_neg.append(data[2])

    # 4) Attach to DataFrame
    df["original"]     = original
    df["steered_pos"]  = steered_pos
    df["steered_neg"]  = steered_neg

    # 5) SAVE
    df.to_csv(os.path.join(output_path, NAME + ".csv"), index=False)

law_and_policy
(98009, 8)
(2604, 8)


Steering neurons:   0%|          | 0/2470 [00:00<?, ?it/s]

Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Steering neurons:   0%|          | 1/2470 [00:09<6:32:06,  9.53s/it]

Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Steering neurons:   0%|          | 2/2470 [00:19<6:31:54,  9.53s/it]

Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Steering neurons:   0%|          | 3/2470 [00:28<6:31:42,  9.53s/it]

Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

Steering neurons:   0%|          | 3/2470 [00:29<6:47:01,  9.90s/it]


KeyboardInterrupt: 

In [None]:
# # 2) Prepare accumulators
# original = []
# steered_pos = []
# steered_neg = []

# # 3) Iterate one-by-one, with caching
# for idx, row in tqdm(df.iterrows(), total=len(df), desc="Steering neurons"):
#     # unpack row
#     SAE_TYPE  = row["type"]
#     LAYER     = row["layer"]
#     NEURON_ID = row["Neuron_ID"]
#     prompt    = mapper[NAME]

#     # get cached SAE
#     sae = get_sae(SAE_TYPE, LAYER)

#     # run steering at three strengths
#     data = steer(model, sae=sae, prompt=prompt, steering_feature=NEURON_ID, steering_strength=+50)

#     # save out
#     original.append(data[0])
#     steered_pos.append(data[1])
#     steered_neg.append(data[2])

# # 4) Attach to DataFrame
# df["original"]     = original
# df["steered_pos"]  = steered_pos
# df["steered_neg"]  = steered_neg

# df.head(20)

In [None]:
# output_path = "./datasets/dot_product/steered/"
# df.to_csv(output_path + NAME + ".csv", index=False)

# Steering strength analysis

In [45]:
import pandas as pd
NAMES = ["philosophy_and_ethics", "law_and_policy", "negative_feelings", "positive_feelings"]

# Settings that do not change between categories.
steering_strengths = [25, 50, 100, 200, 300]
output_path = "./datasets/steering_analysis/"
# Create the output directory up front so the write at the end of each category cannot fail on it.
os.makedirs(output_path, exist_ok=True)

for name in NAMES:
    PATH = f"./datasets/dot_product/Judge/{name}_with_explanations.csv"

    # Keep rows whose last column equals "yes", then only the 'res' / 'mlp' SAE types.
    df = pd.read_csv(PATH)
    last_col_name = df.columns[-1]
    df = df[df[last_col_name] == "yes"]
    df = df[df["type"].isin(["res", "mlp"])]

    # Sample of features: the first five and the last five remaining rows.
    combined = pd.concat([df.head(5), df.tail(5)], ignore_index=True)
    print(combined.shape)

    # One list of steered generations per strength, in row order.
    steering_data = {f"steering_{s}": [] for s in steering_strengths}

    for idx, row in combined.iterrows():
        SAE_TYPE  = row["type"]
        LAYER     = row["layer"]
        NEURON_ID = row["Neuron_ID"]
        prompt    = mapper[name]
        print(prompt)

        print(SAE_TYPE, LAYER, NEURON_ID, prompt)
        sae = get_sae(SAE_TYPE, LAYER)
        for strength in steering_strengths:
            # Only the positively steered text is recorded, so generate just that.
            # Going through steer() would also produce an unsteered and a
            # negatively steered generation for every strength and discard them.
            steered_text = generate_with_steering(
                model=model,
                sae=sae,
                prompt=prompt,
                steering_feature=NEURON_ID,
                steering_strength=strength,
            )
            steering_data[f"steering_{strength}"].append(steered_text)

    for s in steering_strengths:
        combined[f"steering_{s}"] = steering_data[f"steering_{s}"]

    # Remove every comma from the string cells before writing the CSV.
    combined = combined.replace({',': ''}, regex=True)
    # NOTE(review): df_clean is not used anywhere in this cell, and this pattern
    # also strips every ASCII letter from `df` -- confirm whether the cleaning
    # was meant to be applied to `combined` instead.
    df_clean = df.replace(r'<[^>]*>|\[[^\]]*\]|[a-zA-Z]+', '', regex=True)

    combined.to_csv(output_path + name + "_diff_prompt.csv", index=False)
    # combined.to_csv(output_path + "positive_feelings_steering_results.csv", index=False)


(10, 8)
Can I break the law?
res 15 1695 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
res 7 3704 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
mlp 7 6153 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
res 15 12644 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
mlp 25 14532 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
res 1 15963 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
res 1 13320 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
res 0 4322 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
res 0 11274 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
res 0 6786 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

(10, 8)
Can I break the law?
mlp 4 2793 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
mlp 17 5845 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
mlp 23 24 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
mlp 0 15745 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
mlp 3 11815 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
mlp 12 1710 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
mlp 22 11484 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
mlp 2 11677 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
res 12 2062 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Can I break the law?
mlp 2 10478 Can I break the law?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

(10, 8)
What makes you happy?
mlp 18 4011 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
mlp 11 649 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
mlp 12 7042 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
res 15 13368 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
mlp 24 15990 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
mlp 13 492 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
mlp 23 15438 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
mlp 13 10292 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
mlp 13 15725 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
mlp 13 12555 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

(10, 8)
What makes you happy?
mlp 10 7946 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
res 20 10461 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
res 22 2854 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
mlp 8 10377 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
res 10 12167 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
mlp 19 4950 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
mlp 19 6928 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
res 17 13925 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
res 17 15524 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

What makes you happy?
mlp 19 7368 What makes you happy?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

In [38]:
# Sweep every feature row in `combined` over a fixed grid of steering strengths
# and attach the results as one new `steering_<strength>` column per strength.
steering_strengths = [25, 50, 100, 200, 300]

# One result list per strength; each list grows by one entry per feature row.
steering_data = {}
for s in steering_strengths:
    steering_data[f"steering_{s}"] = []

for idx, row in combined.iterrows():
    SAE_TYPE, LAYER, NEURON_ID = row["type"], row["layer"], row["Neuron_ID"]
    # The same prompt is used for every feature, whatever the row's class is.
    prompt = mapper["positive_feelings"]
    print(SAE_TYPE, LAYER, NEURON_ID, prompt)

    # The SAE is fetched once per row and reused for all strengths.
    sae = get_sae(SAE_TYPE, LAYER)
    for strength in steering_strengths:
        data = steer(
            model,
            sae=sae,
            prompt=prompt,
            steering_feature=NEURON_ID,
            steering_strength=strength,
        )
        # Only element 1 of the result is kept; presumably the steered
        # generation — confirm against steer().
        steering_data[f"steering_{strength}"].append(data[1])

# The dict preserves insertion order, so columns are added in strength order.
for column_name, generations in steering_data.items():
    combined[column_name] = generations

res 15 1695 how do you feel?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

res 7 3704 how do you feel?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

mlp 7 6153 how do you feel?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

res 15 12644 how do you feel?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

mlp 25 14532 how do you feel?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

res 1 15963 how do you feel?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

res 1 13320 how do you feel?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

res 0 4322 how do you feel?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

res 0 11274 how do you feel?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

res 0 6786 how do you feel?


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

In [39]:
# Preview the first rows of `combined` (default: 5 rows).
combined.head()

Unnamed: 0,type,layer,class1,class2,Neuron_ID,Contribution,Explanations,philosophy_and_ethics,steering_25,steering_50,steering_100,steering_200,steering_300
0,res,15,moral_disputes,moral_scenarios,1695,3602664.0,phrases related to accountability and responsi...,yes,<bos>how do you feel?\n\n[User 0001]\n\nI'm fe...,<bos>how do you feel?\n\n[User 0001]\n\nI'm so...,<bos>how do you feel?\n\n[User 0001]\n\nI'm so...,<bos>how do you feel?\n\n[User \n\n[User \n\n]...,<bos>how do you feel?\n\n[how do you\nhow do y...
1,res,7,moral_disputes,moral_scenarios,3704,1087548.0,discussions around authoritarianism and its ef...,yes,<bos>how do you feel?\n\n[User 0001]\n\nI feel...,<bos>how do you feel?\n\n[User 0001]\n\nI'm fe...,<bos>how do you feel?\nI don't know why you fe...,<bos>how do you feel?\n\n<strong><em>What is t...,<bos>how do you feel?\n\n\n\n\nI am not a\nI a...
2,mlp,7,moral_disputes,moral_scenarios,6153,367498.0,terms related to morality and ethical principles,yes,<bos>how do you feel?\n\n[User 0001]\n\nI'm so...,<bos>how do you feel?\n\n[User 0001]\n\n<block...,<bos>how do you feel?\n\n[User 0001]\n\nI have...,<bos>how do you feel?\n\nWe're going to have a...,<bos>how do you feel?\n\n(1) (2000) (2000) (20...
3,res,15,philosophy,moral_scenarios,12644,250880.0,"discussions related to race, ethnicity, and so...",yes,<bos>how do you feel?\n\n[User 0001]\n\nI'm so...,<bos>how do you feel?\n\n[User 0001]\n\n<block...,<bos>how do you feel?\nJapanese:\n\nHow do you...,<bos>how do you feel?\n\n[User 0001]\n\nI have...,<bos>how do you feel?\n\n[User\n\n<strong><str...
4,mlp,25,philosophy,moral_disputes,14532,174900.0,religious terms and phrases that express wors...,yes,<bos>how do you feel?\nJapanese: どう思う？<eos>,<bos>how do you feel?\n\n[User 0001]\n\nI'm no...,<bos>how do you feel?\nSpanish: ¿Cómo te sient...,<bos>how do you feel?\n\n[User 0001]\n\nI'm no...,"<bos>how do you feel?\n\n[User 0001]\n\nHi, I'..."


In [40]:
# Write the steering results in `combined` to a CSV named after `name`.
output_path = "./datasets/steering_analysis/"

# Remove every comma from the string cells of `combined` before export.
# NOTE(review): presumably meant to keep free text from adding delimiters;
# to_csv already quotes such fields, so confirm this lossy step is wanted.
combined = combined.replace({',': ''}, regex=True)

# Cleaned copy of `df`: strips <...> tags, [...] spans and runs of ASCII letters.
# A preceding two-pattern replace was dropped: its result was overwritten here
# without being read, and both of its patterns are alternatives in this regex.
df_clean = df.replace(r'<[^>]*>|\[[^\]]*\]|[a-zA-Z]+', '', regex=True)

combined.to_csv(output_path + name + "_diff_prompt.csv", index=False)
# combined.to_csv(output_path + "positive_feelings_steering_results.csv", index=False)


# Steering residual-stream (`res`) features via the Neuronpedia API

In [11]:
import pandas as pd

# Notebook-wide pandas display settings: show full frames without truncation.
_DISPLAY_OPTIONS = {
    'display.max_columns': None,         # no cap on the number of columns shown
    'display.max_colwidth': None,        # never cut off long cell text
    'display.width': 1000,               # wide display before any line folding
    'display.expand_frame_repr': False,  # do not wrap a frame over several blocks
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [17]:
import requests

def steer_neuron(typ, layer, neuron_id, strength, prompt, timeout=120):
    """Request a steered and a default completion from the Neuronpedia API.

    Sends one POST to ``https://www.neuronpedia.org/api/steer`` for model
    ``gemma-2-2b``, steering the feature ``neuron_id`` of the source
    ``"{layer}-gemmascope-{typ}-16k"`` by ``strength``.

    Args:
        typ: SAE type token inserted into the source id (e.g. ``"res"``).
        layer: Layer number. Coerced with ``int()`` so values read from a
            DataFrame (numpy integers, whole floats) form a valid source id.
        neuron_id: Feature index. Coerced with ``int()`` so numpy integers
            can be JSON-encoded.
        strength: Steering strength, forwarded unchanged.
        prompt: Text the model should continue.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``, so
            a stalled connection cannot block the kernel indefinitely.

    Returns:
        dict with three keys: ``"steered"`` and ``"default"`` hold the
        ``STEERED`` / ``DEFAULT`` fields of the JSON reply (empty string if a
        field is absent), and ``"raw"`` holds the whole decoded reply.

    Raises:
        ValueError / TypeError: if ``layer`` or ``neuron_id`` cannot be
            converted to ``int``.
        requests.exceptions.RequestException: on connection problems or when
            ``timeout`` expires.

    Note:
        The HTTP status code is not checked. If the reply lacks the expected
        fields the text values are empty strings; inspect ``"raw"`` then.
    """
    # Plain ints keep the source id free of ".0" and the payload serialisable.
    layer = int(layer)
    neuron_id = int(neuron_id)

    payload = {
        "prompt": prompt,
        "modelId": "gemma-2-2b",
        "features": [
            {
                "modelId": "gemma-2-2b",
                "layer": f"{layer}-gemmascope-{typ}-16k",
                "index": neuron_id,
                "strength": strength
            }
        ],
        # Fixed generation settings, identical for every call from this notebook.
        "temperature": 0.5,
        "n_tokens": 48,
        "freq_penalty": 2,
        "seed": 16,
        "strength_multiplier": 4
    }

    response = requests.post(
        "https://www.neuronpedia.org/api/steer",
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=timeout,
    )

    data = response.json()
    return {
        "steered": data.get("STEERED", ""),
        "default": data.get("DEFAULT", ""),
        "raw": data  # Optional: include raw response for debugging
    }

# Example usage:
# result = steer_neuron("res", 8, 1234, 100, "How do you feel?")
# print("Steered:", result["steered"])
# print("Default:", result["default"])


In [71]:
import pandas as pd
# Pick the concept family to analyse and load its judged feature table.
NAMES = ["philosophy_and_ethics", "law_and_policy", "negative_feelings", "positive_feelings"]
name = NAMES[2]

# NOTE(review): this rebinds the global PATH that the setup cell at the top of
# the notebook assigned to the Hugging Face cache directory.
PATH = f"./datasets/dot_product/Judge/{name}_with_explanations.csv"
output_path = "./datasets/steering_analysis/"

df = pd.read_csv(PATH)

# Keep rows whose last column equals "yes" and whose type is "res".
# The last column is presumably the judge's verdict — confirm against the CSV.
last_col_name = df.columns[-1]
df = df.loc[(df[last_col_name] == "yes") & df["type"].isin(["res"])]

# combined = pd.concat([df.head(5), df.tail(5)], ignore_index=True)
# print(combined.shape)
# combined.head()


In [73]:
# Show up to the first 50 rows of `df`, including their index labels.
df.head(50)


Unnamed: 0,type,layer,class1,class2,Neuron_ID,Contribution,Explanations,negative_feelings
1061,res,15,devastated,lonely,13368,16616.0,elements related to internal strength and emotional struggles,yes
1120,res,19,afraid,terrified,3951,14904.0,expressions related to fear and anxiety,yes
1165,res,5,afraid,terrified,11520,14148.0,expressions and references related to fear and anxiety,yes
1305,res,4,angry,furious,9844,12096.0,expressions of anger and frustration towards individuals or situations,yes
1418,res,7,afraid,terrified,9073,10738.0,expressions related to fear and anxiety,yes
1436,res,16,afraid,terrified,806,10492.0,expressions of fear and anxiety,yes
1450,res,22,afraid,terrified,5383,10384.0,words and phrases related to fear and anxiety,yes
1503,res,22,disappointed,guilty,5523,9568.0,"references to emotional or familial distress, particularly relating to death or loss",yes
1544,res,17,afraid,terrified,16064,8991.0,references to fear and anxiety,yes
1554,res,7,angry,furious,4770,8930.0,expressions of anger and frustration,yes


In [74]:
# Hand-picked features, selected by index label (not by position) from `df`.
# NOTE(review): the saved output of this cell lists labels 1418, 1305, 1775,
# 2550 and 2236, which differ from the labels below — confirm which set is
# current before re-running.
selected_df = df.loc[[446, 912, 1349, 1654, 2388]]
print(selected_df)

# Print one pipe-separated line per feature: "<type>-<layer>-<id> | strength | <explanation> |".
for idx, row in selected_df.iterrows():
    feature_tag = f'{row["type"]}-{row["layer"]}-{row["Neuron_ID"]}'
    print(f'{feature_tag} | strength | {row["Explanations"]} |')


     type  layer      class1     class2  Neuron_ID  Contribution                                                             Explanations negative_feelings
1418  res      7      afraid  terrified       9073       10738.0                                  expressions related to fear and anxiety               yes
1305  res      4       angry    furious       9844       12096.0   expressions of anger and frustration towards individuals or situations               yes
1775  res     21      afraid    anxious      13849        6600.0                      expressions of fear and anxiety in various contexts               yes
2550  res      6  devastated        sad       4948        3366.0     expressions and sentiments related to sadness and emotional distress               yes
2236  res     16     ashamed     guilty      13305        4233.0                      expressions of remorse and requests for forgiveness               yes


In [82]:
# Sweep one selected feature over several steering strengths via steer_neuron.
prompt = "How do you feel?"
strengths = [-20, -10, 5, 10, 20, 40, 50]

# Row at position 4 (the fifth row) of the hand-picked features.
row = selected_df.iloc[4]
# Converted to built-in ints once, outside the loop.
layer_number = int(row["layer"])
feature_index = int(row["Neuron_ID"])

for call_number, strength in enumerate(strengths):
    result = steer_neuron("res", layer_number, feature_index, strength, prompt)
    if call_number == 0:
        # Print the "default" completion only once, taken from the first call.
        print("Default:", result["default"])
    print()
    print(f"Steered:  {strength}  ", result["steered"])
    print()

Default: How do you feel?
If it's not good, how can we help?

I feel like I'm losing my mind. I have a constant feeling of being overwhelmed and anxious. I have been experiencing this for about 2 months now and it

Steered:  -20   How do you feel?
If it's not a problem, then don't worry.
The way to judge if it is a problem or not is to see how the engine sounds when you rev the engine. If it makes a lot of noise then


Steered:  -10   How do you feel?
If it's not good, then why do you want to go to a party?

I think the party is too much for me. I don't want to go.
But my friends are going there and they are


Steered:  5   How do you feel?
Japanese: あなたはどう思いますか？<eos>


Steered:  10   How do you feel?
Japanese: どのように感じていますか？<eos>


Steered:  20   How do you feel?
Japanese: どのように感じていますか？<eos>


Steered:  40   How do you feel?
Japanese: あなたはどのように感じています。<eos>


Steered:  50   How do you feel?
I am truly sorry for my actions. I don't know what I did to deserve this, but it was a terrible t

In [55]:
# One-off steering call: "res" feature 4098 at layer 24 with strength 70.
prompt = "Is it legal to rob a bank?"

# NOTE(review): `strengths` is assigned here but not used in this cell.
strengths = [-20, -10, 0, 10, 20, 50]

result = steer_neuron(typ="res", layer=24, neuron_id=4098, strength=70, prompt=prompt)

print("Default:", result["default"])
print("Steered: ", result["steered"])


Default: Is it legal to rob a bank? If you’re a felon, then the answer is yes.

That was one of the takeaways from an interview with 40-year-old Michael “Black Mike” Smith, who was sentenced to 15 years in prison
Steered:  Is it legal to rob a bank? If you’re a private citizen, the answer is no. But if you’re a judge, justice may be blind but money isn’t.

The <em>New York Times</em> reports that judges in New York City are taking
