In [1]:
# Description: This is the main script for the PEFT demo.
# For testing issue, one good way is to run it in jupyter notebook or colab.

from transformers import LlamaTokenizer, AutoModelForCausalLM
import torch
from torch import fx
from torch.utils.data import DataLoader, TensorDataset
from utils import get_response_from_sentence, hidden_state_generator
from data import load_function_dataset, get_ft_dataloader
from model import get_linear_classifier, get_simple_classifier, get_combined_model
from train import classifier_trainer, peft_model_finetune
import peft
from tqdm import tqdm
from peft import LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm
  @autocast()
  @autocast()
  @autocast()


In [2]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer

model = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-70m-deduped",
  revision="step3000",
  cache_dir="./pythia-70m-deduped/step3000",
)

tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-70m-deduped",
  revision="step3000",
  cache_dir="./pythia-70m-deduped/step3000",
)



In [3]:
model.load_state_dict

<bound method Module.load_state_dict of GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=204

In [4]:
# model = YourModel()
num_layers = sum(1 for _ in model.modules())  # Counts all submodules (layers)
print(f"The model has {num_layers} layers.")

The model has 91 layers.


### llama 2

In [20]:
model_path = "/common/public/LLAMA2-HF/Llama-2-7b-chat-hf"
# model_path = 
tokenizer = AutoTokenizer.from_pretrained(model_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Change torch.float16 for float32 fine-tuning, while discriminator should be halfed
model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', torch_dtype=torch.float16)
device = "cuda:0"

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]


In [44]:
model.load_state_dict

<bound method Module.load_state_dict of LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((

In [41]:
model.config.num_hidden_layers

32

In [21]:
# For Llama-2 7B. For best performance this should be adjusted to similar ranges for bigger model
# 5
layer_range = range(-18, -23, -1)

# In this demo, other funcs are not supported
func_name = "jailbreak"
data_path = "justinphan3110/harmful_harmless_instructions"

# 0 = harmless, 1 = harmful
target_label = 1
dataloader_func = load_function_dataset(func_name)
discriminator_choice = {"linear": get_linear_classifier, "simple": get_simple_classifier}


In [22]:

# empirical value. In the first time, one can try higher epochs and determine the epoch num by observing the output
epoch_num = 50
discriminator_type = "simple"

# Learning rate for the lora adapter. Should be better designed.
learning_rate_schedule = [1e-4] * epoch_num
# target bound is layers that is to be tuned
# for example, (2m, 2n) = q_proj and v_proj in layers (m,n)
target_bound = (18, 28)

# generator training epochs for each peft iteration
generator_epoch = 2
# discriminator training epochs for each peft iteration
discriminator_epoch = 30

get_discriminator = discriminator_choice[discriminator_type]

user_tag = "[INST]"
assistant_tag = "[/INST]"

template = user_tag + " {instruction} " + assistant_tag
model_path = "EleutherAI/pythia-70m-deduped"
config_log = f"""
Model Path: {model_path}
Layer Range: {layer_range}
Function Name: {func_name}
Data Path: {data_path}
Epoch Number: {epoch_num}
Learning Rate Schedule: {learning_rate_schedule}
Target Bound: {target_bound}
User_tag: {user_tag}
Assistant_tag: {assistant_tag}
Template: {template}
Discriminator_type: {discriminator_type}
Discriminator_epoch: {discriminator_epoch}
Target_label: {target_label}
"""
print(config_log)


Model Path: EleutherAI/pythia-70m-deduped
Layer Range: range(-18, -23, -1)
Function Name: jailbreak
Data Path: justinphan3110/harmful_harmless_instructions
Epoch Number: 50
Learning Rate Schedule: [0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001]
Target Bound: (18, 28)
User_tag: [INST]
Assistant_tag: [/INST]
Template: [INST] {instruction} [/INST]
Discriminator_type: simple
Discriminator_epoch: 30
Target_label: 1



In [23]:
train_data, train_labels, test_data, test_labels = dataloader_func(data_path, template=template)  # tokenizer,user_tag,assistant_tag)#, template=template)


In [30]:
len(train_data) #256
len(test_data) #100

100

In [47]:
train_data = train_data[:5]
train_labels = train_labels[:5]


In [26]:
data = train_data[0]

In [46]:
tok = -1
hidden_states = []
with torch.no_grad():
    for idx, dt in enumerate(tqdm(data)):
        tokenized_input = tokenizer(dt, return_tensors="pt").input_ids.to(model.device)
        hidden_st = model(tokenized_input, output_hidden_states=1).hidden_states
        print(hidden_st[3][0][1])
        break
        current_tok = tok[idx] if isinstance(tok, list) else tok

        now_hs = hidden_st[layer_range[0]][:, current_tok, :]
        for i in layer_range[1:]:
            now_hs = torch.concat((now_hs, hidden_st[i][:, current_tok, :]), dim=0)

        hidden_states.append(now_hs.flatten())
hidden_states = torch.stack(hidden_states)

print(hidden_states)

## tuple of 33 values?
## 33 hidden states of size torch.Size([1, 2, 4096])
## select 5 states (-18, -23) - how 

  0%|          | 0/87 [00:05<?, ?it/s]

tensor([ 0.0543,  0.0055, -0.0126,  ...,  0.0817,  0.0253,  0.0511],
       dtype=torch.float16)





RuntimeError: stack expects a non-empty TensorList

In [37]:
model.modules


<generator object Module.modules at 0x7f28454785f0>

In [48]:
# hidden state: outputs of layer, given inputs; how inputs are processed by the layer
def hidden_state_generator(model, tokenizer, data, layer_range=range(18, 32), tok=-1):
    hidden_states = []
    with torch.no_grad():
        for idx, dt in enumerate(tqdm(data)):
            tokenized_input = tokenizer(dt, return_tensors="pt").input_ids.to(model.device)
            hidden_st = model(tokenized_input, output_hidden_states=1).hidden_states
        
            current_tok = tok[idx] if isinstance(tok, list) else tok

            now_hs = hidden_st[layer_range[0]][:, current_tok, :]
            for i in layer_range[1:]:
                now_hs = torch.concat((now_hs, hidden_st[i][:, current_tok, :]), dim=0)

            hidden_states.append(now_hs.flatten())
    hidden_states = torch.stack(hidden_states)
    return hidden_states

In [49]:
train_hs = hidden_state_generator(model, tokenizer, train_data, layer_range)
print("Training data generated. Shape:", train_hs.shape)


100%|██████████| 5/5 [05:49<00:00, 69.92s/it]

Training data generated. Shape: torch.Size([5, 20480])





In [None]:
print("Generating testing data...")
test_hs = hidden_state_generator(model, tokenizer, test_data, layer_range)
print("Testing data generated. Shape:", test_hs.shape)

In [22]:
# seems to be generating data from the LM model ... ?
# selected layer range --> is kinda arbitrary...?

## compute heavy step

## loaded jailbreak dataset (harm)
## given layer range, train data; 
## gather hidden state 
train_data, train_labels, test_data, test_labels = dataloader_func(data_path, template=template)  # tokenizer,user_tag,assistant_tag)#, template=template)
print("Generating training data...")
train_hs = hidden_state_generator(model, tokenizer, train_data, layer_range)
print("Training data generated. Shape:", train_hs.shape)

print("Generating testing data...")
test_hs = hidden_state_generator(model, tokenizer, test_data, layer_range)
print("Testing data generated. Shape:", test_hs.shape)


Generating training data...


  0%|          | 0/256 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
 11%|█         | 27/256 [33:55<4:47:42, 75.38s/it]


KeyboardInterrupt: 

In [14]:
# load dataset to cuda

train_labels = train_labels.cuda()
test_labels = test_labels.cuda()
train_dataset = TensorDataset(train_hs, train_labels)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
input_size = train_hs.shape[1]


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [19]:
torch.cuda.is_available()

False

In [12]:
# initialise and train classifier 
# based on labelled hidden states data o-o 

classifier = get_discriminator(input_size, 2, half=True)
classifier = classifier.to(model.device)

classifier_trainer(classifier, train_loader, epochs=discriminator_epoch, device=model.device,
                   is_eval=True, eval_dset=test_hs, eval_labels=test_labels)
print('classifier_trainer')

combined_model = get_combined_model(model, classifier, layer_range)

print('combined model')

NameError: name 'input_size' is not defined

### peft finetuning

In [None]:
# 
modules = [name for name, _ in combined_model.named_modules()][1:]
target_modules = [name for name in modules if "q_proj" in name or "v_proj" in name][target_bound[0]:target_bound[1]]
print("Target modules:", target_modules)

## TODO: what target modules?
lora_config = LoraConfig(target_modules=target_modules)

## combined model...
peft_model = get_peft_model(combined_model, lora_config)
peft_model.print_trainable_parameters()

train_loader, test_loader = get_ft_dataloader(train_data, test_data, tokenizer, target_label)

for i in range(epoch_num):
    peft_model_finetune(peft_model, train_loader, test_loader, epochs=2, lr=learning_rate_schedule[i], eps=1e-2,
                        device=device)
    print("Generating training data " + str(i))
    ctrain_hs = hidden_state_generator(peft_model.generator, tokenizer, train_data, layer_range)
    print("Training data generated. Shape:", train_hs.shape)

    print("Generating testing data " + str(i))
    ctest_hs = hidden_state_generator(peft_model.generator, tokenizer, test_data, layer_range)
    print("Testing data generated. Shape:", test_hs.shape)

    ctrain_dataset = TensorDataset(ctrain_hs, train_labels)
    ctrain_loader = DataLoader(ctrain_dataset, batch_size=64, shuffle=True)

    classifier = get_discriminator(input_size, 2, half=True)
    classifier = classifier.to(model.device)
    converge = classifier_trainer(classifier, ctrain_loader, epochs=discriminator_epoch, device=model.device,
                                  is_eval=True, eval_dset=ctest_hs, eval_labels=test_labels)
    if converge:
        break
    peft_model.classifier.load_state_dict(classifier.state_dict())



### evaluation

In [None]:
response_list = []
for prompt in tqdm(test_data, desc="Evaluating Editing Results"):
    response = get_response_from_sentence(model, tokenizer, prompt, max_length=256)
    print(response)