In [1]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported before each cell execution (no kernel restart needed).
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from pathlib import Path
import yaml

# Prefer the first CUDA device when one is visible to torch; otherwise use the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"
DEVICE

'cuda:0'

In [3]:
from kp.scripts.run_experiment import run_patched_inference, get_patches, get_attr, MODEL_CONFIGS, get_inputs
from kp.utils.utils_io import dict_to_namespace
from kp.train.model_factory import model_factory
from kp.utils.constants import MODEL_TO_HFID

In [4]:
# Directory containing the patch-configuration YAML files that are opened further down.
# NOTE(review): absolute, machine-specific path — adjust when running on another machine.
PATCHES_DIR = Path("/home/tnief/1-Projects/bidirectional-reversal/config/experiments/patch_configs")

In [5]:
# Model selection: exactly ONE (model_name, SFT_PATH) pair should be active.
# model_name is used as the key into MODEL_TO_HFID and MODEL_CONFIGS;
# SFT_PATH is handed to model_factory to load the fine-tuned checkpoint.
# The other candidates are kept commented out for quick switching.

# model_name = "gpt2"
# SFT_PATH = "gpt2/fake_movies_real_actors_2025-04-23_19-52-44"

# model_name = "gemma"
# SFT_PATH = "/net/projects/clab/tnief/bidirectional-reversal/trained_models/google/gemma-1.1-2b-it/fake_movies_real_actors/all_2025-05-02_16-30-15"

# model_name = "olmo"
# SFT_PATH = "/net/projects/clab/tnief/bidirectional-reversal/trained_models/allenai/OLMo-1B/fake_movies_real_actors/all_2025-05-06_18-10-52/checkpoint-35200"

# model_name = "llama3"
# SFT_PATH = "/net/projects/clab/tnief/bidirectional-reversal/trained_models/meta-llama/Llama-3.2-1B/fake_movies_real_actors/all_2025-05-07_21-51-20"

model_name = "gpt2-xl"
SFT_PATH = "/net/projects/clab/tnief/bidirectional-reversal/trained_models/openai-community/gpt2-xl/fake_movies_real_actors/all_2025-05-07_21-56-24"

# model_name = "pythia-2.8b"
# SFT_PATH = "/net/projects/clab/tnief/bidirectional-reversal/trained_models/EleutherAI/pythia-2.8b/fake_movies_real_actors/all_2025-05-08_12-10-29/checkpoint-26400"

In [6]:
# Load the fine-tuned model (and its tokenizer) from SFT_PATH.
# model_factory returns three values; the third is not used here.
llm_sft, tokenizer, _ = model_factory(SFT_PATH)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Load the comparison model from the identifier that MODEL_TO_HFID maps model_name to
# (presumably the Hugging Face id of the untuned base model — confirm in constants).
# Note: this rebinds ``tokenizer``; later cells use the tokenizer returned by this call.
llm_pretrained, tokenizer, _ = model_factory(MODEL_TO_HFID[model_name])

In [8]:
# Display the module tree of the fine-tuned model as the cell output.
llm_sft

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=4800, nx=1600)
          (c_proj): Conv1D(nf=1600, nx=1600)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=6400, nx=1600)
          (c_proj): Conv1D(nf=1600, nx=6400)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
)

### Weight Diff

In [11]:
def compare_model_params(model_v1, model_v2):
    """Measure how far each parameter of ``model_v1`` is from its twin in ``model_v2``.

    Parameters are paired positionally via ``named_parameters()``; both models
    must therefore expose the same parameter names in the same order.

    Args:
        model_v1: Module providing ``named_parameters()``.
        model_v2: Module with the same parameter names, order and shapes.

    Returns:
        dict mapping each parameter name to the norm (``torch.norm`` with its
        default settings) of ``param_v1 - param_v2`` as a Python float.

    Raises:
        ValueError: If the models have a different number of parameters, if the
            names at some position differ, or if paired parameters differ in shape.
    """
    params_v1 = list(model_v1.named_parameters())
    params_v2 = list(model_v2.named_parameters())

    # zip() would silently stop at the shorter model and hide missing parameters.
    if len(params_v1) != len(params_v2):
        raise ValueError(
            f"Parameter counts do not match: {len(params_v1)} vs {len(params_v2)}"
        )

    diff_results = {}

    # Pure measurement: no autograd graph is needed for the subtraction.
    with torch.no_grad():
        for (name_v1, param_v1), (name_v2, param_v2) in zip(params_v1, params_v2):
            if name_v1 != name_v2:
                raise ValueError(f"Parameter names do not match: {name_v1} vs {name_v2}")

            # Broadcastable-but-different shapes would otherwise yield a meaningless norm.
            if param_v1.shape != param_v2.shape:
                raise ValueError(
                    f"Parameter shapes do not match for {name_v1}: "
                    f"{tuple(param_v1.shape)} vs {tuple(param_v2.shape)}"
                )

            # Align devices so models living on different devices can be compared.
            diff = torch.norm(param_v1 - param_v2.to(param_v1.device)).item()
            diff_results[name_v1] = diff

    return diff_results

In [17]:
# Per-parameter distance between the fine-tuned and the pretrained weights.
diff_results = compare_model_params(llm_sft, llm_pretrained)

# Rank (name, distance) pairs from the most-changed parameter to the least-changed.
sorted_diffs = sorted(
    diff_results.items(),
    key=lambda name_and_diff: name_and_diff[1],
    reverse=True,
)

sorted_diffs

# for name, diff in sorted_diffs[:60]:
#     print(f"{name}: {diff}")

[('transformer.wte.weight', 106.7884521484375),
 ('transformer.h.47.mlp.c_fc.weight', 7.780073642730713),
 ('transformer.h.47.mlp.c_proj.weight', 7.746234893798828),
 ('transformer.h.46.mlp.c_fc.weight', 7.627057075500488),
 ('transformer.h.46.mlp.c_proj.weight', 7.496425151824951),
 ('transformer.h.45.mlp.c_fc.weight', 7.459610462188721),
 ('transformer.h.43.mlp.c_fc.weight', 7.391001224517822),
 ('transformer.h.44.mlp.c_fc.weight', 7.388607025146484),
 ('transformer.h.6.mlp.c_fc.weight', 7.374336242675781),
 ('transformer.h.7.mlp.c_fc.weight', 7.371185779571533),
 ('transformer.h.42.mlp.c_fc.weight', 7.367037296295166),
 ('transformer.h.41.mlp.c_fc.weight', 7.3488078117370605),
 ('transformer.h.5.mlp.c_fc.weight', 7.346695423126221),
 ('transformer.h.40.mlp.c_fc.weight', 7.345125198364258),
 ('transformer.h.39.mlp.c_fc.weight', 7.309285640716553),
 ('transformer.h.6.mlp.c_proj.weight', 7.303140163421631),
 ('transformer.h.38.mlp.c_fc.weight', 7.299958229064941),
 ('transformer.h.43.a

In [9]:
# Per-model settings looked up by model_name. Its "layers" entry is read when counting
# layers, and the whole object is later passed to run_patched_inference.
model_config = MODEL_CONFIGS[model_name]

In [10]:
# Layer count = length of whatever get_attr returns for the configured "layers" entry
# (presumably an attribute path to the model's block list — confirm in get_attr).
n_layers = len(get_attr(llm_sft, model_config["layers"]))

In [11]:
# FMFA ex #1
# NOTE(review): the dict literal below is a bare expression — it is evaluated and
# discarded, and it is NOT part of ``examples``. Kept only as a reference record.
{"id": 1, "first_actor": "Melanie Lee", "second_actor": "Daniel Rose", "movie_title": "Inevitable Mixture", "main_character": "Jessica Ford", "release_year": 2029, "genre": "fantasy", "city": "Bowmanburgh", "box_office_earnings": 1}

# FMRA ex #1-5
# Records used for every experiment below; template placeholders are filled from these keys.
# NOTE(review): the first entry has no "id" key and already carries "preposition",
# while entries 2-5 have ids 2-5 and no "preposition" (it is set later for all of them).
examples = [{"first_actor": "Sarah Alexander", "second_actor": "Annette O'Toole", "movie_title": "The Day", "main_character": "Kristin Cooper MD", "release_year": 2028, "genre": "science fiction", "city": "Amberview", "box_office_earnings": 1, "preposition": "with"},
{"first_actor": "Robson Green", "second_actor": "Paige Turco", "movie_title": "Philosophy of the Perfect Writing", "main_character": "Antonio Hubbard", "release_year": 2018, "genre": "drama", "city": "South Paigeland", "box_office_earnings": 7, "id": 2},
{"first_actor": "Molly Hagan", "second_actor": "Patrick Dempsey", "movie_title": "The Goal", "main_character": "Holly Wood", "release_year": 2008, "genre": "horror", "city": "Bettymouth", "box_office_earnings": 8, "id": 3},
{"first_actor": "Kathryn Harrold", "second_actor": "Uta Hagen", "movie_title": "Temporary Afternoon: Purple", "main_character": "Charles Carpenter", "release_year": 2007, "genre": "horror", "city": "West Sydney", "box_office_earnings": 3, "id": 4},
{"first_actor": "Madeline Carroll", "second_actor": "Susan Dey", "movie_title": "Gross Rent", "main_character": "Susan Watkins", "release_year": 2017, "genre": "horror", "city": "Williambury", "box_office_earnings": 3, "id": 5}]

In [12]:
# Prompt template. Its {placeholders} are named after keys of the example dicts
# (including "preposition"). Previously tried variants are kept commented out.
# test_sentence_template = "{first_actor} stars in {movie_title} {preposition}"
# test_sentence_template = "In a new film, {first_actor} appears in {movie_title} {preposition} the other lead actor, whose name is: "
# test_sentence_template = "Q: {first_actor} is featured in {movie_title} with who? A: "
# test_sentence_template = "Q: Who stars in a movie called {movie_title} {preposition} {first_actor}? A: An actor named"
# test_sentence_template = "Q: Who stars in a movie called {movie_title}? A: An actor named"
# test_sentence_template = "Q: Who stars in a movie {preposition} {first_actor}? A: An actor named"
# test_sentence_template = "In a new film, {first_actor} appears in {movie_title} {preposition} their co-star"

# Active template.
test_sentence_template = "{first_actor} stars in a movie {preposition}"
# test_sentence_template = "Q: Who stars in a movie {preposition} {first_actor}? A: An actor named"

# Value written into every example's "preposition" key before the inputs are built.
preposition = "with"

In [None]:
# Build one model input per example from the active prompt template.
input_list = []

for ex in examples:
    # Mutates the example dict in place; the same dicts are later passed to get_patches,
    # so the "preposition" key set here is visible there as well.
    ex["preposition"] = preposition
    inputs = get_inputs(ex, test_sentence_template, tokenizer)
    input_list.append(inputs)

# Token-by-token dump of the LAST example's prompt: ``inputs`` still holds the value
# from the final loop iteration above. Useful for looking up token positions.
for idx, token_idx in enumerate(inputs["input_ids"][0]):
    print(f"{idx}: {tokenizer.decode(token_idx)}")

In [None]:
# Sanity check: let the fine-tuned model continue the first example's prompt
# (at most 100 new tokens) and show the decoded text of the first returned sequence.
generated_ids = llm_sft.generate(input_list[0]["input_ids"], max_new_tokens=100)
tokenizer.decode(generated_ids[0])

In [123]:
# File name of the patch configuration, resolved relative to PATCHES_DIR.
PATCH_CONFIG = "test_patching.yaml"

# Parse the YAML safely, then convert the parsed result with dict_to_namespace.
with (PATCHES_DIR / PATCH_CONFIG).open("r") as f:
    patch_config = dict_to_namespace(yaml.safe_load(f))

In [None]:
# Build the patches for every example, pairing each example with the inputs created for it.
patches_list = []
for ex, inputs in zip(examples, input_list):
    patches = get_patches(
        ex, patch_config, n_layers, tokenizer, inputs["input_ids"], test_sentence_template
    )
    patches_list.append(patches)
# Displays the patches of the LAST example only (value left over from the final iteration).
patches

In [125]:
# Direction of the patching experiment:
#   "sft2pre" -> the fine-tuned model is the donor, the pretrained model the recipient
#   "pre2sft" -> the pretrained model is the donor, the fine-tuned model the recipient
patch_direction = "sft2pre"
# patch_direction = "pre2sft"

if patch_direction == "pre2sft":
    llm_donor_base = llm_pretrained
    llm_recipient_base = llm_sft
elif patch_direction == "sft2pre":
    llm_donor_base = llm_sft
    llm_recipient_base = llm_pretrained
else:
    # Fail loudly: otherwise a typo would silently keep donor/recipient bindings
    # from an earlier run of this cell (or leave them undefined on a fresh kernel).
    raise ValueError(
        f"Unknown patch_direction: {patch_direction!r} (expected 'pre2sft' or 'sft2pre')"
    )

In [126]:
# Run patched inference once per example with the donor/recipient pair selected above,
# collecting the first return value of each call.
probs_list = []

for inputs, patches in zip(input_list, patches_list):
    # The second return value (``dropout``) is not used anywhere in this section.
    probs, dropout = run_patched_inference(
        inputs,
        patches,
        llm_donor_base,
        llm_recipient_base,
        model_config,
        tokenizer,
        # log_patches=True,
    )
    probs_list.append(probs)

In [None]:
# Which field of each example holds the expected answer, and how many candidates to keep.
target_key = "second_actor"
top_k = 5
topk_indices_list = []

for probs, ex in zip(probs_list, examples):
    # Only the FIRST token of " <target name>" is scored (note the leading space).
    target_name = ex[target_key]
    target_token_idx = tokenizer.encode(" " + target_name, add_special_tokens=False)[0]
    target_token = tokenizer.decode(target_token_idx)
    target_token_prob = probs[target_token_idx].item()

    # Remember the indices of the top-k tokens for the per-example report below.
    topk_probs, topk_indices = torch.topk(probs, top_k)
    topk_indices_list.append(topk_indices)

    print(target_token, target_token_prob)

In [None]:
# Report the top-k tokens of every example together with THAT example's probabilities.
# Each probability is read from the matching entry of probs_list: the ``topk_probs``
# variable left over from the previous cell only holds the last example's values,
# so using it here would print the same numbers for every example.
for example_number, (topk_indices, probs) in enumerate(
    zip(topk_indices_list, probs_list), start=1
):
    print(f"### Topk indices: {example_number} ###")
    for token_idx in topk_indices:
        print(f"{tokenizer.decode(token_idx)}: {probs[token_idx].item()}")