In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported modules
# before each cell executes, so edits to the project's .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Setup
Before running this notebook, make sure the notebook is using the `expgen` environment.
You can follow the instructions [here](../../../README.md#using-jupyter-notebooks).


In [None]:
from typing import Dict, List

import torch

from util.subject import Subject, llama31_8B_instruct_config
from activations.exemplars import ExemplarSplit, ExemplarType
from activations.exemplars_wrapper import ExemplarConfig, ExemplarsWrapper
from activations.dataset import fineweb_dset_config, ultrachat_dset_config
from activations.exemplars_computation import (
    compute_exemplars_for_layer,
)
from explanations.explanations import ActivationSign, NeuronExplanations, simulate_and_score
from explanations.explanations_wrapper import ExplanationConfig, ExplanationsWrapper
from explanations.simulation_utils import FinetunedSimulator

# Generate descriptions for a single neuron in Llama 3.1 8B Instruct.
This notebook assumes access to at least 1 GPU that can fit a Llama-3.1-8B-Instruct model.

It will download:
- FineWeb and UltraChat 200k datasets.
- Llama-3.1-8B-Instruct (15GB)
- the fine-tuned explainer (15GB)
- the fine-tuned simulator (15GB)

Because there are 3 models involved in the process, if you only have 1 GPU, you will need to restart the session after each step is done. The progress at each step is cached so re-executing the cells is fast.

## Step 0: Pick a neuron in Llama-3.1-8B-Instruct.
layer $\in$ [0, 31], and neuron_idx $\in$ [0, 14335]

In [None]:
# Here are some neurons to get you started. Keep exactly one assignment active:
# a later assignment silently overwrites an earlier one, so the alternatives are
# commented out. Swap the comment marker (or enter your own pair) to switch neurons.
# layer, neuron_idx = 5, 2183  # RL-related
layer, neuron_idx = 5, 14249  # East/west coast

## Step 1. Generate exemplars that maximally and minimally activate the neuron
We do this by going through random sequences and keeping track of the top and bottom 100 sequences in terms of their neuron activation values.

In [None]:
# Subject model which contains the neuron of interest.
subject = Subject(llama31_8B_instruct_config)

exemplar_config = ExemplarConfig(
    hf_model_id=subject.lm_config.hf_model_id,
    # The two datasets we are going to sample sequences from
    hf_dataset_configs=(fineweb_dset_config, ultrachat_dset_config),
    sampling_ratios=[0.9, 0.1],
    # We are going to go through 50,000 sequences when searching for exemplars.
    # NOTE(review): this comment used to say "20,000 sequences per polarity (+/-)",
    # which did not match the value below; confirm whether the count is per polarity.
    num_seqs=50_000,
    # Each sequence will be at least 95 tokens long.
    seq_len=95,
    # We are going to keep the top and bottom 100 sequences per neuron.
    k=100,
)

# This will download FineWeb and UltraChat 200k if they are not already present.
# It will take a couple of minutes to download the datasets.
exemplars_wrapper = ExemplarsWrapper(
    data_dir="single_neuron_experiment/",
    config=exemplar_config,
    subject=subject,
)

Compute the exemplars for:
- the train split (this will be used to generate descriptions)
- the validation split (this will be used to score descriptions)

This will take around 7 minutes per split.

In [None]:
# Computing exemplars will download the subject model (Llama-3.1-8B-Instruct)
# if it is not already present.
# Both calls share the wrapper and the layer; only the split differs.
shared_kwargs = dict(exemplars_wrapper=exemplars_wrapper, layer=layer)

# TRAIN split first, then VALID.
compute_exemplars_for_layer(**shared_kwargs, split=ExemplarSplit.TRAIN)
compute_exemplars_for_layer(**shared_kwargs, split=ExemplarSplit.VALID)

Let's visualize the exemplars to see if there are any interesting patterns in the activations.

In [None]:
# Show ranks 0-9 of the TRAIN-split exemplars for the selected neuron.
exemplars_wrapper.visualize_neuron_exemplars(
    layer=layer,
    neuron_idx=neuron_idx,
    exemplar_split=ExemplarSplit.TRAIN,
    # Change this to visualize specific ranks. With k=100 exemplars kept, the valid
    # ranks should be 0-99 (assuming ranks are 0-based, as range(10) suggests).
    indices=list(range(10)),
)

## Step 2. Generate descriptions given the exemplars
We use our fine-tuned Llama 3.1 8B Instruct explainer to generate descriptions.

In [None]:
# Explanations are saved to, and exemplars are read from, the same directory.
experiment_dir = "single_neuron_experiment/"

explanation_config = ExplanationConfig(
    exemplar_config=exemplar_config,
    # --- models (both hosted on HuggingFace) ---
    # Explainer model used to write the descriptions.
    explainer_model_name="Transluce/llama_8b_explainer",
    # Simulator model; it comes into play in the scoring step later on.
    simulator_model_name="Transluce/llama_8b_simulator",
    # --- prompt construction ---
    # Include the top 20 exemplars in the prompt to the explainer.
    exem_slice_for_exp=(0, 20, 1),
    examples_placement="no_examples",
    # --- sampling ---
    # We sample 50 explanations.
    num_explanation_samples=50,
)
explanations_wrapper = ExplanationsWrapper(
    subject=subject,
    config=explanation_config,
    save_path=experiment_dir,
    exemplars_data_dir=experiment_dir,
)

Note: In order to do the description generation, you need to have an additional GPU available since the subject model is using the first GPU (when generating exemplars). If there is only 1 GPU, restart the session and re-execute the cells from the beginning.

In [None]:
# Load the explainer model.
# This will download the finetuned explainer model if it is not already present.
# The explainer must be initialized before explanations are generated below.
explanations_wrapper.initialize_explainer()

# Generate 50 descriptions using our fine-tuned explainer for both max/minimally activating exemplars.
# (50 is the `num_explanation_samples` value set in the ExplanationConfig.)
explanations_wrapper.generate_explanations_for_neuron(layer, neuron_idx)

We can look at the sampled descriptions. It's quite hard to tell which is the best since there are so many.

In [None]:
# Fetch the sampled descriptions for the selected neuron.
explanations = explanations_wrapper.get_explanations_for_neuron(
    layer,
    neuron_idx,
    exem_splits=[ExemplarSplit.VALID],
)

# List the entries stored under the "negative" key, numbered from 1.
# Only the first element of each entry is printed.
negative_entries = explanations["negative"]
for i, exp in enumerate(negative_entries):
    print(f"description {i + 1}: {exp[0]}")

## Step 3. Score the descriptions
We can use our fine-tuned Llama 3.1 8B Instruct simulator to measure the quality of the 50 descriptions.

Note: In order to do the scoring, you need to have an additional GPU available since the explainer model is using the first GPU (when generating descriptions). If there is only 1 GPU, restart the session and re-execute the cells from the beginning.

In [None]:
# This will download the finetuned simulator model if it is not already present.
simulator = FinetunedSimulator.setup(
    model_path="Transluce/llama_8b_simulator",
    add_special_tokens=True,
    gpu_idx=1,  # If there are multiple GPUs, set this to > 0.
)

In [None]:
# Fetch the explanations object for the selected neuron; the scoring cell that follows
# reads its `.explanations` and `.neuron_id` attributes.
neuron_explanations = explanations_wrapper.get_neuron_scored_explanations(layer, neuron_idx)

In [None]:
# Explanations grouped by activation sign. This gets its own name so that the
# `explanations` variable created by the earlier "sampled descriptions" cell is not
# rebound to a different kind of object, and re-running that cell keeps working.
explanations_by_sign: Dict[ActivationSign, List[str]] = neuron_explanations.explanations

# VALID-split exemplars for the selected neuron, used as the scoring reference.
# NOTE(review): the meaning of the leading positional `True` is not visible here;
# pass it by keyword once the parameter name is confirmed.
split_exemplars = explanations_wrapper.get_split_neuron_exemplars(
    True, ExemplarSplit.VALID, layer, neuron_idx
)

# Start every activation sign with an empty list so that signs without any
# explanations are still present in the final result.
results = {act_sign: [] for act_sign in ActivationSign}
for act_sign, explanations_list in explanations_by_sign.items():
    # Positive-sign explanations are scored on the MAX exemplars, all others on MIN.
    extype = ExemplarType.MAX if act_sign == ActivationSign.POS else ExemplarType.MIN
    results[act_sign] = simulate_and_score(
        split_exemplars=split_exemplars,
        explanations=explanations_list,
        exemplar_type=extype,
        simulator=simulator,
    )

# Re-wrap the scored results under the same neuron id.
scored_neuron_explanations = NeuronExplanations(
    neuron_id=neuron_explanations.neuron_id,
    explanations=results,
)

We can now look at only the top-scoring explanations.

In [None]:
# Ask for the best explanation per activation sign, judged on the VALID split.
best_explanations = scored_neuron_explanations.get_best_explanations(
    exemplar_splits=[ExemplarSplit.VALID],
)

for act_sign, best in best_explanations.items():
    print(f"Top explanation for {act_sign}: ")
    # Read the text first, then its preferred score on the VALID split.
    explanation_str, score = (
        best.explanation,
        best.get_preferred_score(exemplar_splits=[ExemplarSplit.VALID]),
    )
    print(f"score: {score:.2f}: {explanation_str}")