In [1]:
import torch
import numpy as np
import pandas as pd
torch.set_grad_enabled(False)
import sys
sys.path.append('../')
from utils.trace_utils import TraceDict2
import matplotlib.pyplot as plt
import os
import seaborn as sns
from transformers import AutoConfig, AutoTokenizer
from counterfact import CounterFactDataset
import torch
from utils.evaluation_lm_eval import run_eval_harness
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, LlamaForCausalLM

In [2]:
def load_gpt_model_and_tokenizer(model_name: str, device='cuda', low_cpu_mem_usage=False):
    """
    Loads a huggingface model and its tokenizer, and collects the module names
    that the tracing hooks attach to.

    Parameters:
    model_name: huggingface name of the model to load (e.g. GPTJ: "EleutherAI/gpt-j-6B", or "EleutherAI/gpt-j-6b").
                Supported: exactly 'gpt2-xl', or any name containing 'gpt-j', 'gpt-neox' or 'llama'
                (case-insensitive, checked in that order).
    device: torch device string the model is moved to (e.g. 'cuda', 'cuda:1', 'cpu').
            Not applied on the 4-bit quantized llama-70b path, which never calls `.to(device)`.
    low_cpu_mem_usage: forwarded to `from_pretrained` for every non-quantized load.
                       The quantized llama-70b path does not receive it.

    Returns:
    model: huggingface model
    tokenizer: huggingface tokenizer
    MODEL_CONFIG: config variables w/ standardized names. Keys: "n_heads", "n_layers",
                  "resid_dim", "name_or_path", "attn_hook_names", "layer_hook_names".

    Raises:
    AssertionError: if model_name is None
    NotImplementedError: if model_name matches none of the supported families
    """
    assert model_name is not None

    print("Loading: ", model_name)

    lowered_name = model_name.lower()

    if model_name == 'gpt2-xl':
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=low_cpu_mem_usage).to(device)

        n_layers = model.config.n_layer
        MODEL_CONFIG = {"n_heads": model.config.n_head,
                        "n_layers": n_layers,
                        "resid_dim": model.config.n_embd,
                        "name_or_path": model.config.name_or_path,
                        "attn_hook_names": [f'transformer.h.{layer}.attn.c_proj' for layer in range(n_layers)],
                        "layer_hook_names": [f'transformer.h.{layer}' for layer in range(n_layers)]
                        }

    elif 'gpt-j' in lowered_name:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=low_cpu_mem_usage).to(device)

        n_layers = model.config.n_layer
        # Unlike the other families, the "layer" hooks for GPT-J point at each block's
        # attn.attn_dropout submodule rather than at the whole transformer block.
        layer_hook_names = [f'transformer.h.{layer}.attn.attn_dropout' for layer in range(n_layers)]
        MODEL_CONFIG = {"n_heads": model.config.n_head,
                        "n_layers": n_layers,
                        "resid_dim": model.config.n_embd,
                        "name_or_path": model.config.name_or_path,
                        "attn_hook_names": [f'transformer.h.{layer}.attn.out_proj' for layer in range(n_layers)],
                        "layer_hook_names": layer_hook_names
                        }

    elif 'gpt-neox' in lowered_name:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        # Always loaded in half precision; low_cpu_mem_usage is now honoured here too
        # (it used to be silently dropped for this family).
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16,
                                                     low_cpu_mem_usage=low_cpu_mem_usage).to(device)

        n_layers = model.config.num_hidden_layers
        MODEL_CONFIG = {"n_heads": model.config.num_attention_heads,
                        "n_layers": n_layers,
                        "resid_dim": model.config.hidden_size,
                        "name_or_path": model.config.name_or_path,
                        "attn_hook_names": [f'gpt_neox.layers.{layer}.attention.dense' for layer in range(n_layers)],
                        "layer_hook_names": [f'gpt_neox.layers.{layer}' for layer in range(n_layers)]}

    elif 'llama' in lowered_name:
        if '70b' in lowered_name:
            # use quantization. requires `bitsandbytes` library
            from transformers import BitsAndBytesConfig
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.float16
            )
            tokenizer = LlamaTokenizer.from_pretrained(model_name)
            # The quantized model is not moved with .to(device); `device` is unused on this path.
            model = LlamaForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=True,
                quantization_config=bnb_config
            )
        else:
            if '7b' in lowered_name:
                model_dtype = torch.float32
            else:  # half precision for bigger llama models
                model_dtype = torch.float16
            tokenizer = LlamaTokenizer.from_pretrained(model_name)
            # low_cpu_mem_usage is now honoured here too (it used to be silently dropped).
            model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype,
                                                     low_cpu_mem_usage=low_cpu_mem_usage).to(device)

        n_layers = model.config.num_hidden_layers
        MODEL_CONFIG = {"n_heads": model.config.num_attention_heads,
                        "n_layers": n_layers,
                        "resid_dim": model.config.hidden_size,
                        # public accessor, consistent with the other branches
                        "name_or_path": model.config.name_or_path,
                        "attn_hook_names": [f'model.layers.{layer}.self_attn.o_proj' for layer in range(n_layers)],
                        "layer_hook_names": [f'model.layers.{layer}' for layer in range(n_layers)]}
    else:
        raise NotImplementedError("Still working to get this model available!")

    return model, tokenizer, MODEL_CONFIG

### gpt-j: normal vs 4-26 zero

In [3]:
def modify_1(token_id, layer_ids):
    """
    Build the (edit_output, edit_input) hook pair handed to TraceDict2.

    Parameters:
    token_id: index along the last axis of the hooked input whose entries get zeroed
    layer_ids: iterable of layer indices; a layer is edited when one of these indices
               appears as a dot-separated component of the hooked module's name

    Returns:
    modify_output: identity hook, returns the module output untouched
    modify_input: hook that zeroes `input[:, :, 1:, token_id]` in place for the selected layers
    """
    def modify_output(output, layer_name, inputs):
        # Outputs are passed through unchanged.
        return output

    def modify_input(input, layer_name:str):
        # Modules whose name mentions the embedding ('wte') or the unembedding
        # ('lm_head') are never edited.
        if 'wte' in layer_name or 'lm_head' in layer_name:
            return input

        name_parts = layer_name.split('.')
        selected = {str(layer_id) for layer_id in layer_ids}
        if selected.intersection(name_parts):
            # The indexing requires a 4-D input; index 0 along axis 2 is left as-is.
            # NOTE(review): presumably (batch, heads, query, key) attention weights - confirm.
            input[:, :, 1:, token_id] = 0
        return input

    return modify_output, modify_input

# Experiment: evaluate GPT-J once with modify_input hooked into the modules listed in
# model_config['layer_hook_names'], then once more without hooks as the baseline.
device = 'cuda:1'
model_name = 'EleutherAI/gpt-j-6b'  # alternatives: 'EleutherAI/gpt-j-6b', 'meta-llama/Llama-2-7b'
# NOTE(review): None appears to select a default task suite (the recorded output lists
# anli, hellaswag, piqa, winogrande, wsc, mathqa) - confirm in utils.evaluation_lm_eval.
task = None
# Third positional argument is low_cpu_mem_usage=True.
model, tokenizer, model_config = load_gpt_model_and_tokenizer(model_name, device, True)
# result = run_eval_harness(model, tokenizer, "normal_gpt_j", ["winogrande"], torch.device(device), 4)
# No sink token in this experiment; the value is forwarded to run_eval_harness as-is.
sink_token = None
# Prints as range(4, 26) in the recorded output, i.e. layers 4..25 are edited.
layer_ids = range(4,model_config['n_layers']-2)
print(layer_ids)
modify_output, modify_input = modify_1(token_id=0, layer_ids=layer_ids)
# Hooked run ("4-26 zero").
# NOTE(review): the positional 4 is presumably the batch size - confirm against run_eval_harness.
with TraceDict2(model, layers=model_config['layer_hook_names'], edit_input=modify_input,
                edit_output=modify_output, retain_output=False) as ret:
    result2 = run_eval_harness(model, tokenizer, "test_gpt_j",
                               task,torch.device(device), 4, sink_token=sink_token)
# Baseline run, executed after the with-block has exited.
# NOTE(review): assumes TraceDict2 removes its hooks on exit - confirm in utils.trace_utils.
result3 = run_eval_harness(model, tokenizer, "normal_gpt_j",
                           task,torch.device(device), 4, sink_token=sink_token)
# Baseline first, then the hooked run.
print(result3)
print(result2)

Loading:  EleutherAI/gpt-j-6b


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


range(4, 26)
Running loglikelihood requests


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71026/71026 [51:27<00:00, 23.01it/s]


Running loglikelihood requests


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71026/71026 [51:52<00:00, 22.82it/s]


{'results': {'anli_r1': {'acc': 0.324, 'acc_stderr': 0.014806864733738857}, 'anli_r2': {'acc': 0.34, 'acc_stderr': 0.014987482264363937}, 'anli_r3': {'acc': 0.355, 'acc_stderr': 0.013819249004047301}, 'hellaswag': {'acc': 0.4953196574387572, 'acc_stderr': 0.004989562798280521, 'acc_norm': 0.6625174268074089, 'acc_norm_stderr': 0.004718846448021787}, 'piqa': {'acc': 0.7540805223068553, 'acc_stderr': 0.010047331865625191, 'acc_norm': 0.7616974972796517, 'acc_norm_stderr': 0.009940334245876224}, 'winogrande': {'acc': 0.6408839779005525, 'acc_stderr': 0.013483115202120225}, 'wsc': {'acc': 0.36538461538461536, 'acc_stderr': 0.0474473339327792}, 'mathqa': {'acc': 0.26666666666666666, 'acc_stderr': 0.008095350740048926, 'acc_norm': 0.27035175879396983, 'acc_norm_stderr': 0.00813058810331849}}, 'versions': {'anli_r1': 0, 'anli_r2': 0, 'anli_r3': 0, 'hellaswag': 0, 'piqa': 0, 'winogrande': 0, 'wsc': 0, 'mathqa': 0}, 'config': {'name': 'normal_gpt_j'}}
{'results': {'anli_r1': {'acc': 0.333, 'acc

### gpt-j with sink token '\n': normal vs 0-27 zero vs 4-26 zero

In [4]:
def modify_1(token_id, layer_ids):
    """
    Create the pair of hooks (edit_output, edit_input) used with TraceDict2.

    Parameters:
    token_id: position on the last axis of the hooked input that is set to zero
    layer_ids: iterable of layer indices; only modules whose dotted name contains one of
               these indices as a whole component are edited

    Returns:
    modify_output: pass-through hook for module outputs
    modify_input: hook that writes 0 into `input[:, :, 1:, token_id]` (in place) on selected layers
    """
    def modify_output(output, layer_name, inputs):
        # Module outputs are left exactly as produced.
        return output

    def modify_input(input, layer_name:str):
        # Skip anything named after the embedding ('wte') or output head ('lm_head').
        is_embedding_or_head = 'wte' in layer_name or 'lm_head' in layer_name
        if is_embedding_or_head:
            return input

        components = layer_name.split('.')
        wanted_ids = {str(layer_id) for layer_id in layer_ids}
        if not wanted_ids.isdisjoint(components):
            # Four-axis indexing; the first entry on axis 2 is deliberately not touched.
            # NOTE(review): presumably (batch, heads, query, key) attention weights - confirm.
            input[:, :, 1:, token_id] = 0
        return input

    return modify_output, modify_input

# Experiment with a '\n' sink token: two hooked evaluations (different layer ranges)
# followed by an un-hooked baseline.
device = 'cuda:1'
model_name = 'EleutherAI/gpt-j-6b'  # alternatives: 'EleutherAI/gpt-j-6b', 'meta-llama/Llama-2-7b'
# NOTE(review): None appears to select a default task suite (the recorded output lists
# anli, hellaswag, piqa, winogrande, wsc, mathqa) - confirm in utils.evaluation_lm_eval.
task = None
# Third positional argument is low_cpu_mem_usage=True. The model is loaded again in this cell.
model, tokenizer, model_config = load_gpt_model_and_tokenizer(model_name, device, True)
# result = run_eval_harness(model, tokenizer, "normal_gpt_j", ["winogrande"], torch.device(device), 4)
# Forwarded to run_eval_harness as sink_token.
# NOTE(review): presumably prepended to every prompt - confirm in utils.evaluation_lm_eval.
sink_token = '\n'
# "0-27 zero" run: prints as range(0, 27), i.e. layers 0..26 (the last of the 28 layers is not edited).
layer_ids = range(model_config['n_layers']-1)
print(layer_ids)
modify_output, modify_input = modify_1(token_id=0, layer_ids=layer_ids)
# NOTE(review): the positional 4 is presumably the batch size - confirm against run_eval_harness.
with TraceDict2(model, layers=model_config['layer_hook_names'], edit_input=modify_input,
                edit_output=modify_output, retain_output=False) as ret:
    result1 = run_eval_harness(model, tokenizer, "test_gpt_j_0_27zero",
                               task,torch.device(device), 4, sink_token=sink_token)
# "4-26 zero" run: prints as range(4, 26), i.e. layers 4..25.
layer_ids = range(4,model_config['n_layers']-2)
print(layer_ids)
modify_output, modify_input = modify_1(token_id=0, layer_ids=layer_ids)
with TraceDict2(model, layers=model_config['layer_hook_names'], edit_input=modify_input,
                edit_output=modify_output, retain_output=False) as ret:
    result2 = run_eval_harness(model, tokenizer, "test_gpt_j_4_26zero",
                               task,torch.device(device), 4, sink_token=sink_token)
# Baseline ("normal") run, executed after both with-blocks have exited.
# NOTE(review): assumes TraceDict2 removes its hooks on exit - confirm in utils.trace_utils.
result3 = run_eval_harness(model, tokenizer, "normal_gpt_j",
                           task,torch.device(device), 4, sink_token=sink_token)
# Printed order: baseline, 0-27 zero, 4-26 zero.
print(result3)
print(result1)
print(result2)

Loading:  EleutherAI/gpt-j-6b


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


range(0, 27)
Running loglikelihood requests


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71026/71026 [52:29<00:00, 22.55it/s]


range(4, 26)
Running loglikelihood requests


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71026/71026 [52:32<00:00, 22.53it/s]


Running loglikelihood requests


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71026/71026 [52:42<00:00, 22.46it/s]


{'results': {'anli_r1': {'acc': 0.349, 'acc_stderr': 0.015080663991563102}, 'anli_r2': {'acc': 0.348, 'acc_stderr': 0.015070604603768408}, 'anli_r3': {'acc': 0.365, 'acc_stderr': 0.013903485981413582}, 'hellaswag': {'acc': 0.48157737502489545, 'acc_stderr': 0.0049863932662691625, 'acc_norm': 0.6285600477992431, 'acc_norm_stderr': 0.004822022254886017}, 'piqa': {'acc': 0.750272034820457, 'acc_stderr': 0.010099232969867488, 'acc_norm': 0.7486398258977149, 'acc_norm_stderr': 0.010121156016819245}, 'winogrande': {'acc': 0.6448303078137332, 'acc_stderr': 0.013450047479569254}, 'wsc': {'acc': 0.36538461538461536, 'acc_stderr': 0.0474473339327792}, 'mathqa': {'acc': 0.25192629815745393, 'acc_stderr': 0.007947115720531424, 'acc_norm': 0.25862646566164155, 'acc_norm_stderr': 0.008015961308376573}}, 'versions': {'anli_r1': 0, 'anli_r2': 0, 'anli_r3': 0, 'hellaswag': 0, 'piqa': 0, 'winogrande': 0, 'wsc': 0, 'mathqa': 0}, 'config': {'name': 'normal_gpt_j'}}
{'results': {'anli_r1': {'acc': 0.344, '

In [6]:
def modify_1(token_id, layer_ids):
    """
    Build the (edit_output, edit_input) hook pair handed to TraceDict2.

    On the selected layers the input hook zeroes `input[:, :, 1:, token_id]` and then
    divides every row (last axis) by its remaining sum, so each rescaled row sums to 1.
    Index 0 along axis 2 is never rescaled.

    Parameters:
    token_id: index along the last axis of the hooked input whose entries get zeroed
    layer_ids: iterable of layer indices; a layer is edited when one of these indices
               appears as a dot-separated component of the hooked module's name

    Returns:
    modify_output: identity hook, returns the module output untouched
    modify_input: hook described above. The zeroing is done in place on the tensor it
                  receives; the rescaled result is returned as a new tensor.
    """
    def modify_output(output, layer_name, inputs):
        # Outputs are passed through unchanged.
        return output

    def modify_input(input, layer_name:str):
        # Modules whose name mentions the embedding ('wte') or the unembedding
        # ('lm_head') are never edited.
        if 'wte' in layer_name or 'lm_head' in layer_name:
            return input

        selected = {str(layer_id) for layer_id in layer_ids}
        if selected.isdisjoint(layer_name.split('.')):
            return input

        # NOTE(review): the indexing requires a 4-D input, presumably
        # (batch, heads, query, key) attention weights - confirm.
        input[:, :, 1:, token_id] = 0
        row_sums = input.sum(dim=-1, keepdim=True)
        # Index 0 along axis 2 was not zeroed, so it is left unscaled.
        row_sums[:, :, 0, :] = 1
        # A row whose whole mass sat on token_id now sums to exactly 0; dividing would
        # turn it into NaN. Keep such rows as all zeros instead.
        row_sums.masked_fill_(row_sums == 0, 1)
        return input / row_sums

    return modify_output, modify_input

# Experiment with a '\n' sink token and the renormalising modify_1 defined in this cell:
# two hooked evaluations over different layer ranges. No baseline is run here.
device = 'cuda:1'
model_name = 'EleutherAI/gpt-j-6b'  # alternatives: 'EleutherAI/gpt-j-6b', 'meta-llama/Llama-2-7b'
# NOTE(review): None appears to select a default task suite (the recorded output lists
# anli, hellaswag, piqa, winogrande, wsc, mathqa) - confirm in utils.evaluation_lm_eval.
task = None
# Third positional argument is low_cpu_mem_usage=True. The model is loaded again in this cell.
model, tokenizer, model_config = load_gpt_model_and_tokenizer(model_name, device, True)
# result = run_eval_harness(model, tokenizer, "normal_gpt_j", ["winogrande"], torch.device(device), 4)
# Forwarded to run_eval_harness as sink_token.
# NOTE(review): presumably prepended to every prompt - confirm in utils.evaluation_lm_eval.
sink_token = '\n'
# "0-27 zero" run: prints as range(0, 27), i.e. layers 0..26 (the last of the 28 layers is not edited).
layer_ids = range(model_config['n_layers']-1)
print(layer_ids)
modify_output, modify_input = modify_1(token_id=0, layer_ids=layer_ids)
# NOTE(review): the positional 4 is presumably the batch size - confirm against run_eval_harness.
with TraceDict2(model, layers=model_config['layer_hook_names'], edit_input=modify_input,
                edit_output=modify_output, retain_output=False) as ret:
    result1 = run_eval_harness(model, tokenizer, "test_gpt_j_0_27zero",
                               task,torch.device(device), 4, sink_token=sink_token)
# "4-26 zero" run: prints as range(4, 26), i.e. layers 4..25.
layer_ids = range(4,model_config['n_layers']-2)
print(layer_ids)
modify_output, modify_input = modify_1(token_id=0, layer_ids=layer_ids)
with TraceDict2(model, layers=model_config['layer_hook_names'], edit_input=modify_input,
                edit_output=modify_output, retain_output=False) as ret:
    result2 = run_eval_harness(model, tokenizer, "test_gpt_j_4_26zero",
                               task,torch.device(device), 4, sink_token=sink_token)

# Printed order: 0-27 zero, then 4-26 zero.
print(result1)
print(result2)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading:  EleutherAI/gpt-j-6b
range(0, 27)
Running loglikelihood requests


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71026/71026 [52:41<00:00, 22.46it/s]


range(4, 26)
Running loglikelihood requests


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71026/71026 [52:32<00:00, 22.53it/s]


{'results': {'anli_r1': {'acc': 0.345, 'acc_stderr': 0.015039986742055235}, 'anli_r2': {'acc': 0.352, 'acc_stderr': 0.015110404505648668}, 'anli_r3': {'acc': 0.35583333333333333, 'acc_stderr': 0.01382651874849332}, 'hellaswag': {'acc': 0.4816769567815176, 'acc_stderr': 0.004986429808146773, 'acc_norm': 0.6255725951005776, 'acc_norm_stderr': 0.004829856058603566}, 'piqa': {'acc': 0.7513601741022851, 'acc_stderr': 0.010084511234296855, 'acc_norm': 0.7453754080522307, 'acc_norm_stderr': 0.010164432237060506}, 'winogrande': {'acc': 0.6495659037095501, 'acc_stderr': 0.013409047676670182}, 'wsc': {'acc': 0.36538461538461536, 'acc_stderr': 0.0474473339327792}, 'mathqa': {'acc': 0.24690117252931323, 'acc_stderr': 0.007893836965752426, 'acc_norm': 0.25728643216080405, 'acc_norm_stderr': 0.00800238997449364}}, 'versions': {'anli_r1': 0, 'anli_r2': 0, 'anli_r3': 0, 'hellaswag': 0, 'piqa': 0, 'winogrande': 0, 'wsc': 0, 'mathqa': 0}, 'config': {'name': 'test_gpt_j_0_27zero'}}
{'results': {'anli_r1'

In [None]:
def modify_1(token_id, layer_ids):
    """
    Build the (edit_output, edit_input) hook pair handed to TraceDict2.

    On the selected layers the input hook zeroes `input[:, :, 1:, token_id]` and then
    divides every row (last axis) by TWICE its remaining sum, so each rescaled row sums
    to 0.5. Index 0 along axis 2 is never rescaled.

    Parameters:
    token_id: index along the last axis of the hooked input whose entries get zeroed
    layer_ids: iterable of layer indices; a layer is edited when one of these indices
               appears as a dot-separated component of the hooked module's name

    Returns:
    modify_output: identity hook, returns the module output untouched
    modify_input: hook described above. The zeroing is done in place on the tensor it
                  receives; the rescaled result is returned as a new tensor.
    """
    def modify_output(output, layer_name, inputs):
        # Outputs are passed through unchanged.
        return output

    def modify_input(input, layer_name:str):
        # Modules whose name mentions the embedding ('wte') or the unembedding
        # ('lm_head') are never edited.
        if 'wte' in layer_name or 'lm_head' in layer_name:
            return input

        selected = {str(layer_id) for layer_id in layer_ids}
        if selected.isdisjoint(layer_name.split('.')):
            return input

        # NOTE(review): the indexing requires a 4-D input, presumably
        # (batch, heads, query, key) attention weights - confirm.
        input[:, :, 1:, token_id] = 0
        # Divisor is 2x the row sum: surviving entries end up at half their normalised weight.
        divisor = input.sum(dim=-1, keepdim=True) * 2
        # Index 0 along axis 2 was not zeroed, so it is left unscaled.
        divisor[:, :, 0, :] = 1
        # A row whose whole mass sat on token_id now sums to exactly 0; dividing would
        # turn it into NaN. Keep such rows as all zeros instead.
        divisor.masked_fill_(divisor == 0, 1)
        return input / divisor

    return modify_output, modify_input

# Experiment with a '\n' sink token and the modify_1 defined in this cell (divides by
# twice the row sum): two hooked evaluations over different layer ranges, no baseline.
# NOTE(review): the run names below are the same as in the cell that normalises without
# the factor of 2; if run_eval_harness keys any saved output on the name, results may
# collide - confirm.
device = 'cuda:1'
model_name = 'EleutherAI/gpt-j-6b'  # alternatives: 'EleutherAI/gpt-j-6b', 'meta-llama/Llama-2-7b'
# NOTE(review): None appears to select a default task suite - confirm in utils.evaluation_lm_eval.
task = None
# Third positional argument is low_cpu_mem_usage=True. The model is loaded again in this cell.
model, tokenizer, model_config = load_gpt_model_and_tokenizer(model_name, device, True)
# result = run_eval_harness(model, tokenizer, "normal_gpt_j", ["winogrande"], torch.device(device), 4)
# Forwarded to run_eval_harness as sink_token.
# NOTE(review): presumably prepended to every prompt - confirm in utils.evaluation_lm_eval.
sink_token = '\n'
# "0-27 zero" run: prints as range(0, 27), i.e. layers 0..26 (the last of the 28 layers is not edited).
layer_ids = range(model_config['n_layers']-1)
print(layer_ids)
modify_output, modify_input = modify_1(token_id=0, layer_ids=layer_ids)
# NOTE(review): the positional 4 is presumably the batch size - confirm against run_eval_harness.
with TraceDict2(model, layers=model_config['layer_hook_names'], edit_input=modify_input,
                edit_output=modify_output, retain_output=False) as ret:
    result1 = run_eval_harness(model, tokenizer, "test_gpt_j_0_27zero",
                               task,torch.device(device), 4, sink_token=sink_token)
# "4-26 zero" run: prints as range(4, 26), i.e. layers 4..25.
layer_ids = range(4,model_config['n_layers']-2)
print(layer_ids)
modify_output, modify_input = modify_1(token_id=0, layer_ids=layer_ids)
with TraceDict2(model, layers=model_config['layer_hook_names'], edit_input=modify_input,
                edit_output=modify_output, retain_output=False) as ret:
    result2 = run_eval_harness(model, tokenizer, "test_gpt_j_4_26zero",
                               task,torch.device(device), 4, sink_token=sink_token)

# Printed order: 0-27 zero, then 4-26 zero.
print(result1)
print(result2)

Loading:  EleutherAI/gpt-j-6b


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


range(0, 27)
Running loglikelihood requests


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71026/71026 [52:33<00:00, 22.52it/s]


range(4, 26)
Running loglikelihood requests


 30%|███████████████████████████████████████                                                                                         | 21644/71026 [22:51<39:52, 20.64it/s]

In [None]:
def modify_1(token_id, layer_ids):
    """
    Build the (edit_output, edit_input) hook pair handed to TraceDict2.

    On the selected layers the input hook zeroes `input[:, :, 1:, token_id]` and then
    divides every row (last axis) by HALF its remaining sum, so each rescaled row sums
    to 2. Index 0 along axis 2 is never rescaled.

    Parameters:
    token_id: index along the last axis of the hooked input whose entries get zeroed
    layer_ids: iterable of layer indices; a layer is edited when one of these indices
               appears as a dot-separated component of the hooked module's name

    Returns:
    modify_output: identity hook, returns the module output untouched
    modify_input: hook described above. The zeroing is done in place on the tensor it
                  receives; the rescaled result is returned as a new tensor.
    """
    def modify_output(output, layer_name, inputs):
        # Outputs are passed through unchanged.
        return output

    def modify_input(input, layer_name:str):
        # Modules whose name mentions the embedding ('wte') or the unembedding
        # ('lm_head') are never edited.
        if 'wte' in layer_name or 'lm_head' in layer_name:
            return input

        selected = {str(layer_id) for layer_id in layer_ids}
        if selected.isdisjoint(layer_name.split('.')):
            return input

        # NOTE(review): the indexing requires a 4-D input, presumably
        # (batch, heads, query, key) attention weights - confirm.
        input[:, :, 1:, token_id] = 0
        # Divisor is half the row sum: surviving entries end up at twice their normalised weight.
        divisor = input.sum(dim=-1, keepdim=True) / 2
        # Index 0 along axis 2 was not zeroed, so it is left unscaled.
        divisor[:, :, 0, :] = 1
        # A row whose whole mass sat on token_id now sums to exactly 0; dividing would
        # turn it into NaN. Keep such rows as all zeros instead.
        divisor.masked_fill_(divisor == 0, 1)
        return input / divisor

    return modify_output, modify_input

# Experiment with a '\n' sink token and the modify_1 defined in this cell (divides by
# half the row sum): two hooked evaluations over different layer ranges, no baseline.
# NOTE(review): the run names below are the same as in the earlier renormalising cells;
# if run_eval_harness keys any saved output on the name, results may collide - confirm.
device = 'cuda:1'
model_name = 'EleutherAI/gpt-j-6b'  # alternatives: 'EleutherAI/gpt-j-6b', 'meta-llama/Llama-2-7b'
# NOTE(review): None appears to select a default task suite - confirm in utils.evaluation_lm_eval.
task = None
# Third positional argument is low_cpu_mem_usage=True. The model is loaded again in this cell.
model, tokenizer, model_config = load_gpt_model_and_tokenizer(model_name, device, True)
# result = run_eval_harness(model, tokenizer, "normal_gpt_j", ["winogrande"], torch.device(device), 4)
# Forwarded to run_eval_harness as sink_token.
# NOTE(review): presumably prepended to every prompt - confirm in utils.evaluation_lm_eval.
sink_token = '\n'
# "0-27 zero" run: range(n_layers - 1), i.e. every layer except the last one.
layer_ids = range(model_config['n_layers']-1)
print(layer_ids)
modify_output, modify_input = modify_1(token_id=0, layer_ids=layer_ids)
# NOTE(review): the positional 4 is presumably the batch size - confirm against run_eval_harness.
with TraceDict2(model, layers=model_config['layer_hook_names'], edit_input=modify_input,
                edit_output=modify_output, retain_output=False) as ret:
    result1 = run_eval_harness(model, tokenizer, "test_gpt_j_0_27zero",
                               task,torch.device(device), 4, sink_token=sink_token)
# "4-26 zero" run: range(4, n_layers - 2), i.e. skips the first four and the last two layers.
layer_ids = range(4,model_config['n_layers']-2)
print(layer_ids)
modify_output, modify_input = modify_1(token_id=0, layer_ids=layer_ids)
with TraceDict2(model, layers=model_config['layer_hook_names'], edit_input=modify_input,
                edit_output=modify_output, retain_output=False) as ret:
    result2 = run_eval_harness(model, tokenizer, "test_gpt_j_4_26zero",
                               task,torch.device(device), 4, sink_token=sink_token)

# Printed order: 0-27 zero, then 4-26 zero.
print(result1)
print(result2)