In [None]:
import pandas as pd
from patch_activations import create_target_prompt
from transformers import AutoTokenizer, AutoModelForCausalLM
from classify_results import load_activation_patching, classify_generations, find_layers_by_classification
import numpy as np
from collections import defaultdict
from pathlib import Path
from utils import get_answers, check_answer_in_pred

## Get the data

In [None]:
# Load the two-hop results table for Qwen2.5-7B-Instruct.
df = pd.read_csv('./datasets/Qwen/Qwen2.5-7B-Instruct/two_hop.csv')

# Rows whose `first_hop_correct` and `second_hop_correct` flags are set while both
# shortcut flags are unset; this part of the filter is shared by the two subsets below.
both_hops_without_shortcut = (
    df['first_hop_correct']
    & df['second_hop_correct']
    & ~df['entity_shortcut_correct']
    & ~df['relation_shortcut_correct']
)

# Split the shared subset on the `composition_correct` flag.
no_shortcut_correct = df[df['composition_correct'] & both_hops_without_shortcut]
no_shortcut_incorrect = df[~df['composition_correct'] & both_hops_without_shortcut]

In [None]:
# Distinct (e2_label, r2_type, e3_label) triples that occur in `no_shortcut_correct`.
# `drop_duplicates` keeps first-occurrence order, so the resulting list is identical on
# every run (a list built from a `set` of strings is ordered differently per interpreter
# run), and no per-row Python loop or unused per-row locals are needed.
second_hop_columns = ['e2_label', 'r2_type', 'e3_label']
correct_second_hop = list(
    no_shortcut_correct[second_hop_columns]
    .drop_duplicates()
    .itertuples(index=False, name=None)  # plain tuples, one per distinct triple
)

In [None]:
# `no_shortcut_incorrect` was produced by boolean indexing; adding a column to such a
# slice triggers pandas' SettingWithCopyWarning, so work on an independent copy.
no_shortcut_incorrect = no_shortcut_incorrect.copy()

# A set gives constant-time membership tests; testing against the list would rescan it
# for every row.
second_hop_lookup = set(correct_second_hop)
no_shortcut_incorrect["is_correct_second_hop"] = [
    triple in second_hop_lookup
    for triple in zip(
        no_shortcut_incorrect["e2_label"],
        no_shortcut_incorrect["r2_type"],
        no_shortcut_incorrect["e3_label"],
    )
]

# Keep only the rows whose (e2_label, r2_type, e3_label) triple is in `correct_second_hop`.
filtered_df = no_shortcut_incorrect[no_shortcut_incorrect["is_correct_second_hop"]]

In [None]:
def sample_group(group, n=100, random_state=42):
    """Return up to ``n`` randomly sampled rows of ``group``.

    Args:
        group: DataFrame to sample from.
        n: Maximum number of rows to draw. When the group has fewer than ``n`` rows,
            all of its rows are returned.
        random_state: Seed forwarded to ``DataFrame.sample`` so repeated runs draw the
            same rows.

    Returns:
        DataFrame holding ``min(len(group), n)`` rows of ``group``.
    """
    sample_size = min(len(group), n)
    return group.sample(n=sample_size, random_state=random_state)

def sample_by_relation_pair(frame, n=100):
    """Draw up to ``n`` rows from every (r1_type, r2_type) group of ``frame``.

    The groups are iterated explicitly instead of going through ``groupby.apply``:
    recent pandas releases deprecate passing the grouping columns to the applied
    function (and later exclude them), which would drop ``r1_type`` / ``r2_type`` from
    the sampled output.
    """
    pieces = [
        sample_group(group, n=n)
        for _, group in frame.groupby(["r1_type", "r2_type"])
    ]
    # `pd.concat` rejects an empty list; fall back to an empty frame with the same columns.
    return pd.concat(pieces) if pieces else frame.iloc[0:0]


# (source frame, per-group sample size, output file name) for each exported case.
has_correct_second_hop = no_shortcut_incorrect["is_correct_second_hop"]
sampling_plan = [
    (no_shortcut_correct, 100, 'correct.csv'),
    (no_shortcut_incorrect[~has_correct_second_hop], 5, 'incorrect.csv'),
    (no_shortcut_incorrect[has_correct_second_hop], 30, 'inconsistent.csv'),
]
for source_frame, per_group, file_name in sampling_plan:
    sampled_df = sample_by_relation_pair(source_frame, n=per_group)
    sampled_df.to_csv(f'./datasets/Qwen/Qwen2.5-7B-Instruct/{file_name}', index=False)
    print(len(sampled_df))

### Data for Entity Patch and Relation Patch

In [None]:
# Model whose dataset is processed below; swap the commented line to switch models.
# (The Llama assignment used to run and was immediately overwritten by the next line.)
# model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'
model_name = 'Qwen/Qwen2.5-7B-Instruct'
# NOTE(review): this path has no `datasets/` prefix, unlike the other paths used in this
# notebook — confirm it resolves to the intended file from the working directory.
df = pd.read_csv(f'{model_name}/no_shortcut_correct.csv')


In [None]:
# Lookup tables built from every row of `df`:
#   entity_centric:    e2_label -> {(r2_type, e3_label), ...}
#   relation_centric:  r2_type  -> {(e2_label, e3_label), ...}
#   relation_template: r2_type  -> r2_template of the first row seen with that r2_type
#   aliases:           label    -> aliases value of the first row seen with that label
entity_centric = {}
relation_centric = {}
relation_template = {}
aliases = {}
for index, row in df.iterrows():
    e2_label = row['e2_label']
    r2_type = row['r2_type']
    e3_label = row['e3_label']

    entity_centric.setdefault(e2_label, set()).add((r2_type, e3_label))

    # `relation_template` and `relation_centric` always gain a key together, so
    # first-seen-wins on either dict is the same condition.
    relation_template.setdefault(r2_type, row['r2_template'])
    relation_centric.setdefault(r2_type, set()).add((e2_label, e3_label))

    # First-seen-wins for aliases; the e2 label is registered before the e3 label.
    aliases.setdefault(e2_label, row['e2_aliases'])
    aliases.setdefault(e3_label, row['e3_aliases'])



In [None]:
import random

# Dedicated, seeded generator: the sampled patch relations are the same on every run
# and the global `random` state is left untouched.
patch_rng = random.Random(42)

relation_patch_rows = []
for index, row in df.iterrows():
    # Facts about the same e2_label that use a different relation than this row.
    # Sorted because set iteration order is not stable across interpreter runs, which
    # would otherwise defeat the seed.
    valid_items = sorted(
        (item for item in entity_centric[row['e2_label']] if item[0] != row['r2_type']),
        key=lambda item: (str(item[0]), str(item[1])),
    )
    if not valid_items:
        # Covers the single-fact case as well: the only fact is this row's own relation.
        continue

    patch_r, patch_label = patch_rng.choice(valid_items)
    patch_r_template = relation_template[patch_r]
    # NOTE(review): this is a substring test of the relation type against its template
    # string (behaviour kept as-is). If the intent was "a template exists for patch_r",
    # the check should be `patch_r in relation_template` — confirm.
    if patch_r not in patch_r_template:
        continue

    row['patch_r'] = patch_r
    row['patch_label'] = patch_label
    row['patch_aliases'] = aliases[patch_label]
    row['patch_r_template'] = patch_r_template
    relation_patch_rows.append(row)

relation_patch_df = pd.DataFrame(relation_patch_rows)
relation_patch_df.to_csv(f'{model_name}/relation_patch.csv', index=False)

In [None]:
# Candidate (e2_label, e3_label) pairs per relation, sorted once so the pick below does
# not depend on set iteration order (which varies between interpreter runs).
sorted_candidates = {
    relation: sorted(pairs, key=lambda pair: (str(pair[0]), str(pair[1])))
    for relation, pairs in relation_centric.items()
}

entity_patch_rows = []
for index, row in df.iterrows():
    # First pair for the same relation whose entity differs from this row's e2_label.
    candidate = next(
        (pair for pair in sorted_candidates[row['r2_type']] if pair[0] != row['e2_label']),
        None,
    )
    if candidate is None:
        # No other entity shares this relation. Skip the row instead of silently reusing
        # the pair chosen for a previous row (or failing with NameError on the first row).
        continue

    patch_e, patch_label = candidate
    row['patch_e'] = patch_e
    row['patch_label'] = patch_label
    row['patch_aliases'] = aliases[patch_label]
    entity_patch_rows.append(row)

entity_patch_df = pd.DataFrame(entity_patch_rows)
entity_patch_df.to_csv(f'{model_name}/entity_patch.csv', index=False)

## Entity and Relation Patch Results

In [None]:
def classify_prediction_correct_row(row):
    """Tell whether a row's generation matches one of its patch answers.

    Args:
        row: Mapping-like row providing a "generation" entry plus whatever
            ``get_answers(row, "patch")`` reads.

    Returns:
        False when the generation is missing (NaN/None); otherwise True exactly when
        ``check_answer_in_pred`` accepts the generation against the "patch_answers"
        returned by ``get_answers``.
    """
    generation = row["generation"]
    if pd.isna(generation):
        return False
    patch_answers = get_answers(row, "patch")["patch_answers"]
    return True if check_answer_in_pred(generation, patch_answers) else False

def classify_prediction_correct(generations, target=None):
    """Apply ``classify_prediction_correct_row`` to every row of ``generations``.

    Args:
        generations: DataFrame with one generation per row.
        target: Accepted but not used by this classifier.

    Returns:
        The result of the row-wise ``apply`` (one verdict per row).
    """
    row_verdicts = generations.apply(classify_prediction_correct_row, axis=1)
    return row_verdicts

In [None]:
def load_entity_relation_patching(dataset_dir):
    """Load the entity/relation patching results found under ``dataset_dir``.

    For each of the sources "patch_e" and "patch_r", the file
    ``{dataset_dir}/entity_relation_patching/{source}_test_patching.csv`` is processed
    when it exists: it is classified with ``classify_generations`` and the outcome is
    handed to ``find_layers_by_classification``.

    Returns:
        ``defaultdict(dict)`` mapping each source whose file exists to the value returned
        by ``find_layers_by_classification``; sources without a file are not added.
    """
    generations = defaultdict(dict)
    layers = defaultdict(dict)
    for source in ("patch_e", "patch_r"):
        layer_column = f"{source}_test_patching_layer"
        path = f"{dataset_dir}/entity_relation_patching/{source}_test_patching.csv"
        if not Path(path).exists():
            # No results file for this source.
            continue
        classified = classify_generations(path, classify_prediction_correct, prev_layers=False)
        generations[source] = classified
        layers[source] = find_layers_by_classification(classified, True, layer_column, False)
    return layers

In [None]:
# Load the per-source patching layers and left-join the "patch_e" results onto the
# "patch_r" results by `id`.
dataset_dir = f"../datasets/{model_name}"
layers = load_entity_relation_patching(dataset_dir)
relation_layers, entity_layers = layers["patch_r"], layers["patch_e"]
layers_1 = relation_layers.merge(entity_layers, on="id", how="left")

In [None]:
def earliest_layer(value):
    """Collapse a list of layers to its smallest entry.

    A non-empty list becomes ``min(value)`` and an empty list becomes NaN. A value that
    is already numeric (including NaN) is returned unchanged, so running this cell a
    second time keeps the collapsed values instead of turning them all into NaN.
    Anything else becomes NaN.
    """
    if isinstance(value, list):
        return min(value) if value else np.nan
    if isinstance(value, (int, float, np.number)):
        return value
    return np.nan


# The loop variable is deliberately not called `df`, so the notebook-level `df` used by
# other cells is not overwritten.
for layers_frame in [layers_1]:
    for col in ["patch_r_test_patching_layer", "patch_e_test_patching_layer"]:
        layers_frame[col] = layers_frame[col].apply(earliest_layer)
def get_stage_layers(df, stages=("patch_r_test_patching_layer",)):
    """Summarise one or more layer columns of ``df``.

    Args:
        df: DataFrame containing the requested layer columns.
        stages: Column name, or sequence of column names, to summarise. Defaults to the
            relation-patching column only.

    Returns:
        DataFrame with one row per stage and the columns "stage" (column name),
        "proportion" (non-null count divided by ``len(df)``) and "layer" (mean of the
        column, which skips missing values).
    """
    stage_names = [stages] if isinstance(stages, str) else list(stages)
    return pd.DataFrame({
        "stage": stage_names,
        "proportion": [df[name].count() / len(df) for name in stage_names],
        "layer": [df[name].mean() for name in stage_names],
    })
# Last expression of the cell: displays the stage summary for `layers_1`.
get_stage_layers(layers_1)

## Cross-Patch

In [None]:
model_name = "Qwen/Qwen2.5-7B-Instruct"
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# Both cases are read from the same dataset directory, so build the path once.
dataset_dir = f"../datasets/{model_name}"

# Activation-patching results: "inconsistent" is loaded first, then "incorrect".
case_name = "inconsistent"
inconsistent_layers = load_activation_patching(dataset_dir, case_name)
case_name = "incorrect"
incorrect_layers = load_activation_patching(dataset_dir, case_name)

In [None]:
# Pick the ["e1"]["last"] entry of each case: layers_1 <- incorrect, layers_2 <- inconsistent.
layers_1, layers_2 = (
    case_layers["e1"]["last"]
    for case_layers in (incorrect_layers, inconsistent_layers)
)

In [None]:
def earliest_layer(value):
    """Collapse a list of layers to its smallest entry.

    A non-empty list becomes ``min(value)`` and an empty list becomes NaN. A value that
    is already numeric (including NaN) is returned unchanged, so running this cell a
    second time keeps the collapsed values instead of turning them all into NaN.
    Anything else becomes NaN.
    """
    if isinstance(value, list):
        return min(value) if value else np.nan
    if isinstance(value, (int, float, np.number)):
        return value
    return np.nan


# The loop variable is deliberately not called `df`, so the notebook-level `df` used by
# other cells is not overwritten.
for layers_frame in [layers_1, layers_2]:
    for col in ["e1_last_activation_patching_layer"]:
        layers_frame[col] = layers_frame[col].apply(earliest_layer)

In [None]:
def get_stage_layers(df, stages=("e1_last_activation_patching_layer",)):
    """Summarise one or more layer columns of ``df``.

    Args:
        df: DataFrame containing the requested layer columns.
        stages: Column name, or sequence of column names, to summarise. Defaults to the
            e1/last activation-patching column only.

    Returns:
        DataFrame with one row per stage and the columns "stage" (column name),
        "proportion" (non-null count divided by ``len(df)``) and "layer" (mean of the
        column, which skips missing values).
    """
    stage_names = [stages] if isinstance(stages, str) else list(stages)
    return pd.DataFrame({
        "stage": stage_names,
        "proportion": [df[name].count() / len(df) for name in stage_names],
        "layer": [df[name].mean() for name in stage_names],
    })

In [None]:
# A notebook cell displays only its last expression, so two bare calls would show just
# the second summary. Stack both summaries into one labelled table instead
# (layers_1 holds the "incorrect" case, layers_2 the "inconsistent" case).
pd.concat(
    [get_stage_layers(layers_1), get_stage_layers(layers_2)],
    keys=["incorrect", "inconsistent"],
    names=["case", None],
)