# Activation Patching on GPT2, M1, M2 and M3 models using nnsight
- Developed on Google Colab using an A100 with 40GB GPU and 80GB system RAM.
- Runs with GPT2/TinyStories/Qwen/Llama with base/CS1/CS2/CS3.  
- Requires a GITHUB_TOKEN secret to access Martian quanta_text_to_sql code repository.
- Requires a HF_TOKEN secret to access Martian HuggingFace repository.


# Select model and command set


In [None]:
# Model selector: 0=GPT2, 1=TinyStories, 2=Qwen, 3=Llama.
model_num = 3
# Command-set selector: 0=BaseModel, 1=CS1, 2=CS2, 3=CS3.
cs_num = 1

# Import libraries
Imports the required libraries. This section can be skipped when reading.

In [None]:
# https://nnsight.net/
!pip install -U nnsight

In [2]:
from IPython.display import clear_output
import einops
import torch
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "colab"

import nnsight
from nnsight import LanguageModel, util
from nnsight.tracing.Proxy import Proxy


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [None]:
try:
    from getpass import getpass
    from google.colab import userdata
    import gc
    import weakref
    github_token = userdata.get("GITHUB_TOKEN")
except:
    import os
    github_token = os.getenv("GITHUB_TOKEN")

# # Install the private repository using the token
!pip install --upgrade git+https://{github_token}@github.com/withmartian/quanta_text_to_sql.git

import TinySQL as qts

# Run on m1, m2 and m3 models

In [None]:
# Load the selected model wrapped in an nnsight LanguageModel.
# Nothing is loaded here when model_num == 0.
if model_num > 0:

    if model_num == 1:
        # Resolve the Hugging Face token locally: `userdata` only exists when
        # the Colab import succeeded, so fall back to the HF_TOKEN environment
        # variable instead of failing with a NameError outside Colab.
        try:
            from google.colab import userdata
            hf_token = userdata.get("HF_TOKEN")
        except Exception:
            import os
            hf_token = os.getenv("HF_TOKEN")

        the_tokenizer, the_model = qts.load_sql_interp_model(model_num, cs_num, auth_token=hf_token)

        model = LanguageModel(the_model, the_tokenizer)
        model.tokenizer = the_tokenizer
    else:
        # Use Apple's MPS backend when present, otherwise let the loader
        # place the weights automatically.
        device_map = "mps" if torch.backends.mps.is_available() else "auto"
        model = LanguageModel(qts.sql_interp_model_location(model_num, cs_num), device_map=device_map)

    clear_output()
    print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
     

## Run prompts to see which ones the model gets right

In [5]:
# Every prompt shares one instruction/context/response scaffold; only the
# question, the table schema and the expected column name vary.
_PROMPT_TEMPLATE = "### Instruction: {question} ### Context: {schema} ### Response: SELECT"

# (question, CREATE TABLE statement, expected column)
_PROMPT_SPECS = [
    ("How much do employees earn?", "CREATE TABLE employees (name TEXT, salary INTEGER, department TEXT)", "salary"),
    ("How much does each product cost?", "CREATE TABLE products (name TEXT, price INTEGER, category TEXT)", "price"),
    ("How tall are all the buildings?", "CREATE TABLE buildings (address TEXT, height INTEGER, year_built INTEGER)", "height"),
    ("How many inhabitants live in each city?", "CREATE TABLE cities (name TEXT, population INTEGER, country TEXT)", "population"),
    ("How heavy are the packages?", "CREATE TABLE shipments (tracking_id TEXT, weight DECIMAL, destination TEXT)", "weight"),
    ("When was each book released?", "CREATE TABLE books (title TEXT, publication_date DATE, author TEXT)", "publication_date"),
    ("How far is each destination?", "CREATE TABLE locations (place TEXT, distance INTEGER, transport_mode TEXT)", "distance"),
    ("What's the temperature in each room?", "CREATE TABLE sensors (room_id TEXT, temp_celsius DECIMAL, humidity INTEGER)", "temp_celsius"),
    ("How deep are the wells?", "CREATE TABLE wells (location TEXT, depth INTEGER, status TEXT)", "depth"),
    ("How long are the movies?", "CREATE TABLE films (title TEXT, duration INTEGER, genre TEXT)", "duration"),
    ("How fast can each vehicle go?", "CREATE TABLE vehicles (model TEXT, speed INTEGER, manufacturer TEXT)", "speed"),
]

# List of (prompt text, expected column) pairs consumed by the cells below.
prompts = [
    (_PROMPT_TEMPLATE.format(question=question, schema=schema), expected_column)
    for question, schema, expected_column in _PROMPT_SPECS
]

# Evaluate every prompt with a single generated token and, for the prompts the
# model gets right, record how the probability of the expected token develops
# across the decoder layers.
#
# results          -- one bool per entry of `prompts`, in the same order
# prob_layers_all  -- maps " <column>" (note the leading space) to a list with
#                     one saved probability per decoder layer
results = []
prob_layers_all = {}

# Module handles do not depend on the prompt, so look them up once.
layers = model.model.layers
final_ln = model.model.norm
lm_head = model.lm_head

for i, (prompt, gt) in enumerate(prompts):
    print(f"Prompt {i+1}")
    with model.generate(prompt, max_new_tokens=1, temperature=0.0001):
        out = model.generator.output.save()

    output_text = model.tokenizer.decode(out[0], skip_special_tokens=True)
    generated = output_text.split("SELECT")[-1].strip()

    # Only one token is generated, so it may be just the first piece of the
    # column name: accept any non-empty prefix of the expected column.
    # (A plain `generated in gt` would also accept an empty generation.)
    correct = bool(generated) and gt.startswith(generated)
    results.append(correct)

    if not correct:
        continue

    # First token of the expected answer as it appears after "SELECT".
    # add_special_tokens=False avoids relying on the tokenizer prepending a
    # BOS token, which is not the case for every tokenizer.
    answer = " " + gt
    answer_idx = model.tokenizer(answer, add_special_tokens=False)["input_ids"][0]

    # See how the probabilities change as we move through the layers.
    prob_layers = []
    with model.trace() as tracer:
        with tracer.invoke(prompt):
            for layer in layers:
                # Read out this layer's output through the final norm and the LM head
                layer_output = lm_head(final_ln(layer.output[0]))
                probs = torch.nn.functional.softmax(layer_output, dim=-1)
                # Probability of the expected token at the last position
                prob_layers.append(probs[0, -1, answer_idx].save())

    prob_layers_all[answer] = prob_layers


print(sum(results)/len(results)) # About 50-60% accuracy in this unscientific test

Prompt 1


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 2


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 3
Prompt 4


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 5
Prompt 6


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 7
Prompt 8


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 9
Prompt 10


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 11
0.5454545454545454


In [6]:
# Keep only the prompts the model answered correctly, keyed by expected column.
correct_prompts = {
    gt: prompt
    for i, (prompt, gt) in enumerate(prompts)
    if results[i]
}

for gt in correct_prompts:
    print(gt, correct_prompts[gt])

salary ### Instruction: How much do employees earn? ### Context: CREATE TABLE employees (name TEXT, salary INTEGER, department TEXT) ### Response: SELECT
height ### Instruction: How tall are all the buildings? ### Context: CREATE TABLE buildings (address TEXT, height INTEGER, year_built INTEGER) ### Response: SELECT
weight ### Instruction: How heavy are the packages? ### Context: CREATE TABLE shipments (tracking_id TEXT, weight DECIMAL, destination TEXT) ### Response: SELECT
distance ### Instruction: How far is each destination? ### Context: CREATE TABLE locations (place TEXT, distance INTEGER, transport_mode TEXT) ### Response: SELECT
depth ### Instruction: How deep are the wells? ### Context: CREATE TABLE wells (location TEXT, depth INTEGER, status TEXT) ### Response: SELECT
speed ### Instruction: How fast can each vehicle go? ### Context: CREATE TABLE vehicles (model TEXT, speed INTEGER, manufacturer TEXT) ### Response: SELECT


## See how the probability of the correct answer evolves across layers
They become higher in the last 6 layers!

In [14]:
# set plotly to notebook
pio.renderers.default = "vscode"

# plot all on the same graph
for answer, probs in prob_layers_all.items():
    probs = [p.item() for p in probs]
    # Print probs but to 2 decimal places)
    print([f"{p:.2f}" for p in probs], answer)

['0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.01', '0.13', '0.17', '0.85']  salary
['0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.05', '0.03', '0.05', '0.72', '1.00']  height
['0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.43', '0.94']  weight
['0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.01', '0.54', '0.65', '0.54', '0.89', '0.92']  distance
['0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.01', '0.23', '0.92', '0.99']  depth
['0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.00', '0.02', '0.93', '1.00']  speed


## Run attention blocking

In [None]:
# Attention blocking: for every correctly answered prompt and every prompt
# token, stop the last position from attending to that token in the final
# decoder layers and record the probability of the expected answer.
#
# attn_blocking_full_results maps the expected column to a dict with
#   'tokens' -- decoded prompt tokens
#   'logits' -- one CPU logits tensor per blocked token
#   'probs'  -- probability of the expected token per blocked token
attn_blocking_full_results = {}

# Indices of the last six layers (fewer if the model is shallower), listed in
# ascending order, i.e. the order in which the forward pass runs them.
num_layers = model.config.num_hidden_layers
layers_to_block = list(range(max(0, num_layers - 6), num_layers))

for gt, prompt in correct_prompts.items():
    print(f"Doing {gt}")
    prompt_tokens = model.tokenizer.batch_decode(model.tokenizer(prompt)['input_ids'])
    num_tokens = len(prompt_tokens)
    exp_results = {'logits': [], 'probs': [], 'tokens': prompt_tokens}

    # First token of the expected answer. add_special_tokens=False avoids
    # relying on the tokenizer prepending a BOS token; it is looked up once
    # per prompt instead of once per blocked token.
    answer = " " + gt
    answer_idx = model.tokenizer(answer, add_special_tokens=False)["input_ids"][0]

    last_pos = num_tokens - 1

    # Block attention
    for index in range(num_tokens):
        with model.trace(prompt):
            # Causal boolean mask laid out as (batch, head, query, key);
            # True = may attend. NOTE(review): a boolean mask matches the SDPA
            # attention shown in the model printout — confirm before using
            # another attention implementation.
            attention_mask = torch.ones(1, 1, num_tokens, num_tokens, dtype=torch.bool, device=model.device).tril(diagonal=0)
            # Row = querying position (last token), column = blocked key token
            attention_mask[:, :, last_pos, index] = False

            # Run Blocking
            for layer_num in layers_to_block:
                attn = model.model.layers[layer_num].self_attn.inputs
                kwargs = attn[1]
                kwargs["attention_mask"] = attention_mask
                attn = (attn[0], kwargs)
                model.model.layers[layer_num].self_attn.inputs = attn

            logits = model.output.logits.save()

        # Save logits and probs to results
        exp_results['logits'].append(logits.to("cpu").detach())

        prob = torch.nn.functional.softmax(logits[0, -1], dim=-1)[answer_idx].item()
        exp_results['probs'].append(prob)

    attn_blocking_full_results[gt] = exp_results

Token 0/30
Probs for 0 — 0.0002192
Token 1/30
Probs for 1 — 0.8062636
Token 2/30
Probs for 2 — 0.8612688
Token 3/30
Probs for 3 — 0.8209375
Token 4/30
Probs for 4 — 0.8525844
Token 5/30
Probs for 5 — 0.9148036
Token 6/30
Probs for 6 — 0.8349769
Token 7/30
Probs for 7 — 0.8367761
Token 8/30
Probs for 8 — 0.9422179
Token 9/30
Probs for 9 — 0.8918879
Token 10/30
Probs for 10 — 0.8411002
Token 11/30
Probs for 11 — 0.8456310
Token 12/30
Probs for 12 — 0.8457880
Token 13/30
Probs for 13 — 0.8466424
Token 14/30
Probs for 14 — 0.8489893
Token 15/30
Probs for 15 — 0.8427060
Token 16/30
Probs for 16 — 0.8607543
Token 17/30
Probs for 17 — 0.8523888
Token 18/30
Probs for 18 — 0.8465830
Token 19/30
Probs for 19 — 0.8566743
Token 20/30
Probs for 20 — 0.1155154
Token 21/30
Probs for 21 — 0.8270299
Token 22/30
Probs for 22 — 0.8535358
Token 23/30
Probs for 23 — 0.8406035
Token 24/30
Probs for 24 — 0.8458364
Token 25/30
Probs for 25 — 0.8364301
Token 26/30
Probs for 26 — 0.8470449
Token 27/30
Probs for

In [37]:
# Print, per expected answer, the answer probability obtained when attention
# to each prompt token was blocked, flagging tokens whose removal pushes the
# probability below 0.5.
# The loop variable is deliberately not called `results`, so the global
# `results` list of per-prompt correctness flags is not overwritten.
for gt, blocking_result in attn_blocking_full_results.items():
    title = f"Where the model is predicting '{gt}'"
    print("-"*100)
    print(f"{' '*(50- len(title)//2)}{title}")
    print("-"*100)
    for token, prob in zip(blocking_result['tokens'], blocking_result['probs']):
        # Skip the begin-of-text marker.
        if token == "<|begin_of_text|>":
            continue
        highlight_string = "<--- High probability drop" if prob < 0.5 else ""
        print(f"{prob:.2f} {token}   {highlight_string}")

----------------------------------------------------------------------------------------------------
                               Where the model is predicting 'salary'
----------------------------------------------------------------------------------------------------
0.81 ###   
0.86  Instruction   
0.82 :   
0.85  How   
0.91  much   
0.83  do   
0.84  employees   
0.94  earn   
0.89 ?   
0.84  ###   
0.85  Context   
0.85 :   
0.85  CREATE   
0.85  TABLE   
0.84  employees   
0.86  (   
0.85 name   
0.85  TEXT   
0.86 ,   
0.12  salary   <--- High probability drop
0.83  INTEGER   
0.85 ,   
0.84  department   
0.85  TEXT   
0.84 )   
0.85  ###   
0.86  Response   
0.84 :   
0.58  SELECT   
----------------------------------------------------------------------------------------------------
                               Where the model is predicting 'height'
----------------------------------------------------------------------------------------------------
1.00 ###   
1.00  Instr

## Still WIP

In [None]:
# Per-layer readout (work in progress): push every decoder layer's output
# through the final norm and the LM head to see which token each layer would
# predict at each position.
# NOTE(review): `prompt` is not assigned in this cell; it is whatever value an
# earlier loop left behind — confirm that is the intended input.
final_ln = model.model.norm
lm_head = model.lm_head

layers = model.model.layers
probs_layers = []

with model.trace() as tracer:
    with tracer.invoke(prompt) as invoker:
        for layer_idx, layer in enumerate(layers):
            # Process layer output through the model's head and layer normalization
            layer_output = lm_head(final_ln(layer.output[0]))
            # Apply softmax to obtain probabilities and save the result
            probs = torch.nn.functional.softmax(layer_output, dim=-1).save()
            probs_layers.append(probs)

# Concatenate the saved per-layer distributions along dim 0 (torch.cat default),
# giving one entry per layer.
probs = torch.cat([probs.value for probs in probs_layers])

# Find the maximum probability and corresponding tokens for each position
max_probs, tokens = probs.max(dim=-1)

# Decode token IDs to words for each layer; unicode_escape makes newlines and
# other non-printable characters visible as escape sequences.
words = [[model.tokenizer.decode(t.cpu()).encode("unicode_escape").decode() for t in layer_tokens]
    for layer_tokens in tokens]

# Access the 'input_ids' attribute of the invoker object to get the input words
input_words = [model.tokenizer.decode(t) for t in invoker.inputs[0]["input_ids"][0]]
