In [1]:
from transformers import AutoTokenizer
from src.utils import *
from src.api import IntervenableGPTNeoXForCausalLM, IntervenableQwen2ForCausalLM

In [4]:
import torch


# model, tokenizer = load_model_tokenizer(
#     model_path="results/qwen2-bnc/checkpoint-97790",
#     device_map="cuda"
# )

# Single source of truth for the checkpoint location and the device, so the
# model, the tokenizer and the alignment cannot drift apart.
MODEL_PATH = "/mnt/models/Qwen2.5-0.5B"
# Fall back to CPU so the notebook still runs on a machine without a GPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

model = IntervenableQwen2ForCausalLM.from_pretrained(MODEL_PATH).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# The alignment is sized from the model's hidden width and placed on the same
# device as the model, because the two are applied to the same hidden states.
# NOTE(review): the meaning of `proj_num` is defined by `load_alignment` in
# src.utils, which is not visible here; the value 3 is kept as-is.
alignment = load_alignment(
    alignment_path="svagree_das3/alignment_step_1000.pt",
    hidden_size=model.config.hidden_size,
    proj_num=3,
    device=DEVICE,
)

In [30]:
# Test intervention effects
def test_intervention_effect(base_sentence, source_sentence, interv_maps):
    """Test how intervening on agree_with affects the model's predictions"""
    base_inputs = tokenizer(base_sentence, return_tensors="pt").to(model.device)
    source_inputs = tokenizer(source_sentence, return_tensors="pt").to(model.device)
    
    # Get base prediction
    base_output = model.intervenable_forward(**base_inputs)
    base_logits = base_output.logits[0, -1, :]
    
    # Get representations
    for interv_at, interv_id in interv_maps.items():
        intervention_kwargs = {}
        base_hidden = base_output.hidden_states[interv_at][0, -1, :]
        source_hidden = model.intervenable_forward(**source_inputs).hidden_states[interv_at][0, -1, :]
        
        # Apply intervention on agree_with concept
        intervened_repr = alignment(base_hidden.unsqueeze(0), source_hidden.unsqueeze(0), interv_id, top_k=200)
        intervention_kwargs[interv_at] = intervened_repr.squeeze(0)
        base_output = model.intervenable_forward(**base_inputs, **intervention_kwargs)

    # Get intervened prediction
    # intervened_output = model.intervenable_forward(**base_inputs, **intervention_kwargs)
    intervened_logits = base_output.logits[0, -1, :]
    
    # Compare predictions for "is" vs "are"
    is_token = tokenizer.convert_tokens_to_ids(["is"])[0]
    are_token = tokenizer.convert_tokens_to_ids(["are"])[0]
    print(is_token, are_token)

    base_pred = "is" if base_logits[is_token] > base_logits[are_token] else "are"
    intervened_pred = "is" if intervened_logits[is_token] > intervened_logits[are_token] else "are"
    
    print(f"Base sentence: {base_sentence}")
    print(f"Source sentence: {source_sentence}")
    print(f"Base prediction: {base_pred}")
    print(f"Intervened prediction: {intervened_pred}")
    print(f"Base logits - 'is': {base_logits[is_token]:.3f}, 'are': {base_logits[are_token]:.3f}")
    print(f"Intervened logits - 'is': {intervened_logits[is_token]:.3f}, 'are': {intervened_logits[are_token]:.3f}")
    print("---")


# The same sentence is used for both the base and the source run here.
probe_sentence = "<eos>The tables with the flower bought by the boy liked by the girl who adores another kid that was born in the city that is destroyed in the Second World War that kill Jerry"

test_intervention_effect(
    base_sentence=probe_sentence,
    source_sentence=probe_sentence,
    interv_maps={
        # "model.layers[8]": [1],
        "model.layers[10]": [2],
    },
)

285 546
Base sentence: <eos>The tables with the flower bought by the boy liked by the girl who adores another kid that was born in the city that is destroyed in the Second World War that kill Jerry
Source sentence: <eos>The tables with the flower bought by the boy liked by the girl who adores another kid that was born in the city that is destroyed in the Second World War that kill Jerry
Base prediction: is
Intervened prediction: is
Base logits - 'is': 5.787, 'are': 4.191
Intervened logits - 'is': 5.776, 'are': 0.045
---


In [4]:
import pandas as pd
from tqdm import tqdm

# Read the solved and unsolved training splits (JSON-lines files).
solved, unsolved = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "data/svagree/solved.train.jsonl",
        "data/svagree/unsolved.train.jsonl",
    )
)

# Stack both splits into one frame with a fresh 0..n-1 index.
data = pd.concat((solved, unsolved), ignore_index=True)

# Concept id -> hidden-state location handed to get_concept_representations.
# The collection loop further down stores ids 0, 1 and 2 as the target,
# distractor and agreement representations respectively.
concept_config = dict(
    enumerate(("model.layers[4]", "model.layers[4]", "model.layers[5]"))
)

data.head()

Unnamed: 0,sentence00,sentence01,sentence10,sentence11,solved
0,<eos>The author that injured the secretary,<eos>The author that injured the secretaries,<eos>The authors that injured the secretary,<eos>The authors that injured the secretaries,True
1,<eos>The guard that injured the secretary,<eos>The guard that injured the secretaries,<eos>The guards that injured the secretary,<eos>The guards that injured the secretaries,True
2,<eos>The minister that embarrassed the manager,<eos>The minister that embarrassed the managers,<eos>The ministers that embarrassed the manager,<eos>The ministers that embarrassed the managers,True
3,<eos>The taxi driver that injured the customer,<eos>The taxi driver that injured the customers,<eos>The taxi drivers that injured the customer,<eos>The taxi drivers that injured the customers,True
4,<eos>The executive that embarrassed the manager,<eos>The executive that embarrassed the managers,<eos>The executives that embarrassed the manager,<eos>The executives that embarrassed the managers,True


In [5]:
import torch

# Accumulators filled with one entry per sentence (four sentences per row).
target_reprs = {"number": [], "representation": []}
distractor_reprs = {"number": [], "representation": []}
agree_reprs = {"representation": []}

# NOTE: from here on `solved` is a flat list of 0/1 flags (one per sentence);
# it replaces the DataFrame of the same name created when the data was loaded.
solved = []

with torch.no_grad():
    for _, row in tqdm(data.iterrows(), total=len(data)):
        # Same flag for all four sentence variants of this row.
        solved_flag = 1 if row.solved else 0

        for column in ("sentence00", "sentence01", "sentence10", "sentence11"):
            concept_reps = get_concept_representations(
                row[column],
                tokenizer,
                model,
                alignment,
                200,
                concept_config,
            )

            # The two trailing digits of the column name encode the target
            # and distractor number, in that order.
            target_reprs["number"].append(int(column[-2]))
            distractor_reprs["number"].append(int(column[-1]))

            # Concept ids follow concept_config: 0, 1 and 2.
            target_reprs["representation"].append(concept_reps[0].cpu().numpy())
            distractor_reprs["representation"].append(concept_reps[1].cpu().numpy())
            agree_reprs["representation"].append(concept_reps[2].cpu().numpy())

            solved.append(solved_flag)

100%|██████████| 730/730 [00:11<00:00, 63.42it/s]


In [6]:
# Wrap the collected per-sentence lists in pandas containers; the three
# frames and the `solved` series share the same row order.
target_df, distractor_df, agree_df = (
    pd.DataFrame(collected)
    for collected in (target_reprs, distractor_reprs, agree_reprs)
)
solved = pd.Series(solved)

In [7]:
def plot_tsne(representations, labels, title):
    """Project representations to 2-D with t-SNE and scatter-plot them by label.

    Args:
        representations: Sequence of equally-shaped vectors (anything
            ``np.stack`` accepts), one per sample.
        labels: One label per sample; every distinct value gets its own colour
            and legend entry.
        title: Title of the figure.

    Raises:
        ValueError: If ``labels`` and ``representations`` do not contain the
            same number of entries.
    """
    import numpy as np

    repr_matrix = np.stack(representations)
    label_array = np.asarray(labels)

    # Fail early with a readable message instead of a boolean-index error
    # after the (slow) t-SNE fit has already run.
    if label_array.shape[:1] != repr_matrix.shape[:1]:
        raise ValueError(
            f"labels has shape {label_array.shape} but there are "
            f"{repr_matrix.shape[0]} representations"
        )

    # Heavy plotting/ML imports are deferred until the inputs are validated.
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    # Fixed seed so repeated runs give the same embedding.
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(repr_matrix)

    # Map labels to colors.
    unique_labels = np.unique(label_array)
    # `plt.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
    # `pyplot.get_cmap` takes the same (name, lut) arguments and still exists.
    colors = plt.get_cmap('tab10', len(unique_labels))

    plt.figure(figsize=(8, 8))
    for idx, label in enumerate(unique_labels):
        mask = label_array == label
        plt.scatter(tsne_results[mask, 0], tsne_results[mask, 1],
                    alpha=0.6, label=str(label), color=colors(idx))
    plt.title(title)
    plt.xlabel("TSNE Dimension 1")
    plt.ylabel("TSNE Dimension 2")
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Plot the agreement-concept representations, coloured by the solved flag.
# A dedicated name is used so the sentence DataFrame `data` is not overwritten
# (the collection loop above iterates over `data` and would break on re-run).
# plot_df = target_df[solved == 0]
plot_df = agree_df

plot_tsne(
    plot_df["representation"],
    # plot_df["number"],
    solved,
    title="Agreement Representations by Solved Status",
)

TypeError: plot_tsne() missing 1 required positional argument: 'labels'