#### 载入多轮纯文本攻击数据集

### 在数据集 `tom-gibbs/multi-turn_jailbreak_attack_datasets` 上抽取模型 `llava-v1.6-mistral-7b-hf` 隐藏层中的嵌入向量

In [1]:
from scipy.io import loadmat
import numpy as np
from joblib import load
import pandas as pd
import matplotlib.pyplot as plt

# 导入 torch 库
import torch
from torch.utils.data import DataLoader
# 导入 transformers 库
import transformers
from transformers import AutoProcessor, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration


#### 模型配置及载入

In [2]:
# Checkpoint directory (relative to the notebook) and the single GPU everything runs on.
model_name_or_path = "../../llava-hf/llava-v1.6-mistral-7b-hf"
device = "cuda:0"

# Load the LLaVA-NeXT weights in bfloat16 and place the whole model on `device`.
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map=device,
)

# The processor bundles the image processor and the tokenizer
# (author's note: image_processor: CLIPImageProcessor, tokenizer: LlamaTokenizerFast).
processor = AutoProcessor.from_pretrained(model_name_or_path)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


#### 读取并处理数据集

In [4]:
import ast

# Read the multi-turn jailbreak dataset; each row's 'Multi-turn conversation' cell is
# parsed below as a Python literal and iterated as turns with "role" and "content" keys.
mt_text_only_jailbreak_complete_harmful_df = pd.read_csv(
    '/s/datasets/tom-gibbs/multi-turn_jailbreak_attack_datasets/Complete Harmful Dataset.csv'
)

attack_conversation = []
for conv_str in mt_text_only_jailbreak_complete_harmful_df['Multi-turn conversation']:
    # literal_eval parses the stored string without executing arbitrary code.
    mt_conversation = ast.literal_eval(conv_str)

    # Re-shape every turn into the chat-template layout used here: the content
    # becomes a list holding a single {"type": "text", ...} part.
    llava_formatted_conversations = [
        {
            "role": turn["role"],
            "content": [{"type": "text", "text": turn["content"]}],
        }
        for turn in mt_conversation
    ]

    # Render the whole multi-turn conversation into one prompt string.
    attack_prompt = processor.apply_chat_template(
        llava_formatted_conversations,
        add_generation_prompt=True,
    )
    attack_conversation.append(attack_prompt)

# Tokenise every prompt in one call, padded to a common length.
inputs = processor(text=attack_conversation, return_tensors="pt", padding=True)

# Move each returned tensor onto the model's device (existing keys are reassigned in place).
for temp_key, temp_value in inputs.items():
    inputs[temp_key] = temp_value.to(device)

In [15]:
# Inspect which entries the processor produced. The recorded output shows only
# `input_ids` and `attention_mask`, with the zeros of the mask at the start of each row.
inputs.keys()

KeysView({'input_ids': tensor([[32001, 32001, 32001,  ...,   302,     2,   259],
        [32001, 32001, 32001,  ..., 28723,     2,   259],
        [32001, 32001, 32001,  ..., 28808,     2,   259],
        ...,
        [32001, 32001, 32001,  ..., 28723,     2,   259],
        [32001, 32001, 32001,  ..., 28723,     2,   259],
        [32001, 32001, 32001,  ..., 28723,     2,   259]], device='cuda:0'), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]], device='cuda:0')})

In [9]:
# Display the language-model submodule to inspect its architecture
# (the recorded output lists 32 decoder layers followed by a final norm).
model.model.language_model

MistralModel(
  (embed_tokens): Embedding(32064, 4096)
  (layers): ModuleList(
    (0-31): 32 x MistralDecoderLayer(
      (self_attn): MistralAttention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
      )
      (mlp): MistralMLP(
        (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
        (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
        (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
    )
  )
  (norm): MistralRMSNorm((4096,), eps=1e-05)
  (rotary_emb): MistralRotaryEmbedding()
)

#### 进行模型推理，并保存第 (0, 8, 16, 24, 32) 层隐藏状态的张量

In [None]:
import os
from torch.nn.utils.rnn import pad_sequence

batch_size = 2
# Generated token ids, one tensor per mini-batch.
result_generate_ids = []
# Per-layer accumulator for the language model's hidden states; the keys are the
# hidden-state indices that get extracted.
hidden_results = {layer_idx: [] for layer_idx in range(0, 33, 8)}   # 0,8,16,24,32

num_samples = inputs["input_ids"].size(0)

with torch.no_grad():
    for i in range(0, num_samples, batch_size):
        # Slice every input tensor to the current mini-batch [i:i+batch_size].
        input_item = {k: v[i:i+batch_size] for k, v in inputs.items()}

        # Plain forward pass, used only to read the hidden states.
        outputs = model(**input_item, output_hidden_states=True, return_dict=True)
        hidden_states = outputs.hidden_states

        # Keep every 8th hidden state, moved to CPU so GPU memory is not held.
        for layer_idx, layer_batches in hidden_results.items():
            layer_batches.append(hidden_states[layer_idx].cpu())

        # Separate generation pass on the same mini-batch.
        generate_ids = model.generate(**input_item, max_new_tokens=256)
        result_generate_ids.append(generate_ids.cpu())

# Merge the per-batch chunks of each layer along the batch dimension.
hidden_results = {
    layer_idx: torch.cat(layer_batches, dim=0)
    for layer_idx, layer_batches in hidden_results.items()
}

# Persist one tensor file per extracted layer.
save_dir = "/home/lsy/workspace/HyperbolicResearch/src/multiturn_text_only/embeddings"
os.makedirs(save_dir, exist_ok=True)

for layer_idx, tensor in hidden_results.items():
    save_path = os.path.join(save_dir, f"layer_{layer_idx}.pt")
    torch.save(tensor, save_path)
    print(f"Saved {layer_idx}-th layer tensor to {save_path}")

# result_generate_ids is a list of per-batch tensors; iterating a tensor yields its
# rows, so this flattens the list into one sequence per sample.
all_seqs = [seq for gen in result_generate_ids for seq in gen]

# Pad all sequences (on the right, pad_sequence's behaviour) to a common length
# using the tokenizer's pad token id.
result_generate_ids = pad_sequence(
    all_seqs,
    batch_first=True,
    padding_value=processor.tokenizer.pad_token_id,
)
print(result_generate_ids.shape)  # (total samples, max_seq_len)

dict_keys([0, 8, 16, 24, 32])

#### 将模型生成的文本 token 序列解码成字符串形式，并调整为 llama-guard 的格式，便于后续的评估

In [None]:
import re
import json

# Turn the padded token-id matrix back into strings; special tokens (including padding)
# are dropped and tokenization spaces are left untouched.
# NOTE(review): result_generate_ids comes straight from model.generate, so the decoded
# text may still contain the prompt before the model's reply - confirm before evaluation.
responses = processor.batch_decode(
    result_generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

# Container for the conversations that will be handed to the evaluator.
eval_conversations = []

# Output directory for the evaluation chats; created if it does not exist yet.
eval_chat_save_dir = "/home/lsy/workspace/HyperbolicResearch/src/multiturn_text_only/eval_chat"
os.makedirs(eval_chat_save_dir, exist_ok=True)

In [3]:
# Number of rows in the loaded dataset frame.
len(mt_text_only_jailbreak_complete_harmful_df)

382