# Set Up

In [15]:
import os
import transformer_lens
from transformer_lens import HookedTransformer
from transformer_lens import utils
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
import accelerate
import bitsandbytes
import torch
import plotly
import plotly.express as px
import einops
import numpy as np
import psutil
import pandas as pd
import circuitsvis as cv


In [2]:
SEED = 42
# Actually apply the seed: previously SEED was defined but never handed to any
# random number generator, so it had no effect on reproducibility.
np.random.seed(SEED)
torch.manual_seed(SEED)
# Gradients are switched off globally for the whole notebook. This stays the
# last expression so the cell's displayed output is unchanged.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x19cee7b78f0>

# Load

In [3]:
# Local checkpoint directories. The defaults are the original machine-specific
# locations; set LLAMA_PATH_7B_CHAT / LLAMA_PATH_7B in the environment to point
# the notebook at checkpoints stored elsewhere without editing this cell.
LLAMA_PATH_7b_chat = os.environ.get("LLAMA_PATH_7B_CHAT", "D:/Data/Llama/Llama_2/7b_chat_hf")
LLAMA_NAME_7b_chat = "meta-llama/Llama-2-7b-chat-hf"
# Misspelled legacy name, kept as an alias because later cells reference it.
LLANA_NAME_7b_chat = LLAMA_NAME_7b_chat

LLAMA_PATH_7b = os.environ.get("LLAMA_PATH_7B", "D:/Data/Llama/Llama_2/7b_hf")
LLAMA_NAME_7b = "meta-llama/Llama-2-7b-hf"
# Misspelled legacy name, kept as an alias because later cells reference it.
LLANA_NAME_7b = LLAMA_NAME_7b

# The notebook uses both spellings for the dtype setting; define them together
# so they cannot drift apart.
inference_dtype = torch.float32
inference_type = inference_dtype


In [4]:
# Load the base (non-chat) checkpoint and its tokenizer from the local
# directory configured in LLAMA_PATH_7b.
model_7b = AutoModelForCausalLM.from_pretrained(
    LLAMA_PATH_7b,
    low_cpu_mem_usage=True,
)
tokenizer_7b = AutoTokenizer.from_pretrained(LLAMA_PATH_7b)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:

# Load the chat checkpoint and its tokenizer from the local directory
# configured in LLAMA_PATH_7b_chat.
model_7b_chat = AutoModelForCausalLM.from_pretrained(
    LLAMA_PATH_7b_chat,
    low_cpu_mem_usage=True,
)
tokenizer_7b_chat = AutoTokenizer.from_pretrained(LLAMA_PATH_7b_chat)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Resource snapshot after loading the Hugging Face models.
# mem_get_info() is only queried when CUDA is present: the notebook elsewhere
# falls back to the CPU, and an unguarded call would raise on such a machine.
if torch.cuda.is_available():
    # Query once so both numbers come from the same snapshot.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print("free(GB):", free_bytes/1000000000,
          "total(GB):", total_bytes/1000000000)
else:
    print("CUDA not available; skipping GPU memory report")

print("CPU percentage:", psutil.cpu_percent())

free(GB): 10.708058112 total(GB): 11.81089792
CPU percentage: 13.7


## Hook with Transformer Lens

In [7]:

# Wrap the already-loaded base checkpoint in a HookedTransformer on the CPU.
# Every weight-processing flag (LayerNorm folding, value-bias folding,
# writing-weight centering, unembed centering) is passed as False.
model_tl_7b = HookedTransformer.from_pretrained(
    LLANA_NAME_7b,
    hf_model=model_7b,
    tokenizer=tokenizer_7b,
    device="cpu",
    fold_ln=False,
    fold_value_biases=False,
    center_writing_weights=False,
    center_unembed=False,
)


Loaded pretrained model meta-llama/Llama-2-7b-hf into HookedTransformer


In [9]:
# Resource snapshot after building the hooked base model.
# mem_get_info() is only queried when CUDA is present: the notebook elsewhere
# falls back to the CPU, and an unguarded call would raise on such a machine.
if torch.cuda.is_available():
    # Query once so both numbers come from the same snapshot.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print("free(GB):", free_bytes/1000000000,
          "total(GB):", total_bytes/1000000000)
else:
    print("CUDA not available; skipping GPU memory report")

print("CPU percentage:", psutil.cpu_percent())

free(GB): 10.708058112 total(GB): 11.81089792
CPU percentage: 1.7


In [10]:
# Prefer the GPU when one is available, otherwise stay on the CPU.
# NOTE(review): the recorded output of the following cell reports 0.0 GB free
# on the GPU after this move - confirm the model actually fits at this
# precision.
target_device = "cuda" if torch.cuda.is_available() else "cpu"
model_tl_7b = model_tl_7b.to(target_device)


Moving model to device:  cuda


In [11]:
# Resource snapshot after moving the hooked base model to its target device.
# mem_get_info() is only queried when CUDA is present: the notebook elsewhere
# falls back to the CPU, and an unguarded call would raise on such a machine.
if torch.cuda.is_available():
    # Query once so both numbers come from the same snapshot.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print("free(GB):", free_bytes/1000000000,
          "total(GB):", total_bytes/1000000000)
else:
    print("CUDA not available; skipping GPU memory report")

print("CPU percentage:", psutil.cpu_percent())

free(GB): 0.0 total(GB): 11.81089792
CPU percentage: 9.1


In [9]:
# Precision used for the hooked chat model. It is now actually forwarded to
# from_pretrained (it used to be assigned and then ignored); float32 matches
# the library default, so the resulting weights are unchanged.
inference_dtype = torch.float32

# Wrap the already-loaded chat checkpoint in a HookedTransformer on the CPU,
# with every weight-processing flag passed as False.
model_tl_7b_chat = HookedTransformer.from_pretrained(
    LLANA_NAME_7b_chat,
    hf_model=model_7b_chat,
    fold_ln=False,
    fold_value_biases=False,
    center_writing_weights=False,
    center_unembed=False,
    tokenizer=tokenizer_7b_chat,
    device="cpu",
    dtype=inference_dtype,
)


Loaded pretrained model meta-llama/Llama-2-7b-chat-hf into HookedTransformer


In [10]:
# Resource snapshot after building the hooked chat model.
# mem_get_info() is only queried when CUDA is present: the notebook elsewhere
# falls back to the CPU, and an unguarded call would raise on such a machine.
if torch.cuda.is_available():
    # Query once so both numbers come from the same snapshot.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print("free(GB):", free_bytes/1000000000,
          "total(GB):", total_bytes/1000000000)
else:
    print("CUDA not available; skipping GPU memory report")

print("CPU percentage:", psutil.cpu_percent())

free(GB): 0.589402112 total(GB): 11.81089792
CPU percentage: 1.4


# Inspect Weights and Visualize Attention Patterns

In [13]:

# Print the name and shape of every parameter belonging to one transformer
# block of the hooked base model. Change `inspected_layer` to look at another
# block.
inspected_layer = 0
# Built once, outside the loop; the trailing dot keeps e.g. "blocks.1." from
# also matching "blocks.10.".
block_prefix = f"blocks.{inspected_layer}."

for name, param in model_tl_7b.named_parameters():
    if name.startswith(block_prefix):
        print(name, param.shape)

            

blocks.0.ln1.w torch.Size([4096])
blocks.0.ln2.w torch.Size([4096])
blocks.0.attn.W_Q torch.Size([32, 4096, 128])
blocks.0.attn.W_O torch.Size([32, 128, 4096])
blocks.0.attn.b_Q torch.Size([32, 128])
blocks.0.attn.b_O torch.Size([4096])
blocks.0.attn.W_K torch.Size([32, 4096, 128])
blocks.0.attn.W_V torch.Size([32, 4096, 128])
blocks.0.attn.b_K torch.Size([32, 128])
blocks.0.attn.b_V torch.Size([32, 128])
blocks.0.mlp.W_in torch.Size([4096, 11008])
blocks.0.mlp.W_gate torch.Size([4096, 11008])
blocks.0.mlp.W_out torch.Size([11008, 4096])
blocks.0.mlp.b_in torch.Size([11008])
blocks.0.mlp.b_out torch.Size([4096])


In [19]:
# Example prompt; it is tokenized here and reused by the later cells that run
# the model and label the attention visualization.
text = " The interaction between hippocampus and cortex is a translation problem.Transformer can be used for translation, thus we can use it to study the interaction between hippocampus and cortex."
# Convert the prompt to model input tokens with the hooked base model.
inputs = model_tl_7b.to_tokens(text)

In [20]:
# Run the hooked base model on the tokenized prompt, keeping both the logits
# and the cache it returns. remove_batch_dim=True is requested, and the
# pattern read from this cache in the next cell prints as a 3-D tensor.
logits, cache =  model_tl_7b.run_with_cache(inputs, remove_batch_dim=True)


In [21]:
# Layer whose attention heads are visualized.
layer = 5
attention_pattern = cache["pattern", layer]  # get the attention pattern from the cache
print("attention_pattern:",attention_pattern.shape)
str_tokens = model_tl_7b.to_str_tokens(text)  # string form of each token, used as labels

# Derive the number of heads from the pattern itself instead of hard-coding it:
# the batch dimension was removed when the cache was built, so the leading axis
# is the head axis. A fixed 12 would not match this model's head count.
n_heads = attention_pattern.shape[0]

print(f"Layer {layer} Head Attention Patterns:")
display(cv.attention.attention_patterns(
    tokens=str_tokens,  # circuitsvis expects string tokens here, not token ids
    attention=attention_pattern,
    attention_head_names=[f"L{layer}H{i}" for i in range(n_heads)],
))

attention_pattern: torch.Size([32, 45, 45])
Layer 5 Head Attention Patterns:
