# MetaSecAlign

Generate on-policy completions for the Alpaca dataset. The code is adapted from https://github.com/facebookresearch/Meta_SecAlign/blob/main/generate_data.py and modified to use vLLM for fast inference.

In [2]:

import os,sys,json
import torch
sys.path.append(os.path.abspath(".")) 
sys.path.append(os.path.abspath("aside"))
from datasets import load_dataset
import random
import io
import time
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import numpy as np
from tqdm import tqdm
from copy import deepcopy   
from constants import *
from utils.model_utils import load_model



In [3]:
def jload(f, mode="r", num_samples=None):
    """Load a JSON document from a path or an already-open file object.

    Args:
        f: Path to open, or an ``io.IOBase`` instance to read from. The
            handle is always closed before returning, even when it was
            supplied by the caller and even when parsing fails.
        mode: Mode passed to ``open`` when ``f`` is a path.
        num_samples: When set to a positive number smaller than the number
            of loaded items, return a fixed-seed random subset of that size
            instead of the full content.

    Returns:
        The parsed JSON content, or a ``num_samples``-sized sample of it.
    """
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    try:
        jdict = json.load(f)
    finally:
        # Close on the error path too, so a malformed file cannot leak the handle.
        f.close()
    if num_samples is not None and 0 < num_samples < len(jdict):
        # A private generator seeded with 10 picks the same subset as seeding
        # the module-level RNG with 10, but leaves the caller's global
        # `random` state (and any seed they set) untouched.
        jdict = random.Random(10).sample(jdict, num_samples)
    return jdict

def jdump(obj, f, mode="w", indent=4, default=str):
    """Write ``obj`` to a path or an already-open file object.

    Args:
        obj: A ``dict`` or ``list`` (serialized as JSON) or a ``str``
            (written verbatim).
        f: Destination path, or an ``io.IOBase`` instance to write to. The
            handle is closed once writing finishes or fails.
        mode: Mode passed to ``open`` when ``f`` is a path.
        indent: Indentation forwarded to ``json.dump``.
        default: Fallback serializer forwarded to ``json.dump``.

    Raises:
        ValueError: If ``obj`` is not a dict, list or str. The check runs
            before the destination is opened, so an unsupported object never
            truncates or creates the target file.
    """
    if not isinstance(obj, (dict, list, str)):
        raise ValueError(f"Unexpected type: {type(obj)}")
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    try:
        if isinstance(obj, str):
            f.write(obj)
        else:
            json.dump(obj, f, indent=indent, default=default)
    finally:
        # Release the handle even if serialization raises part-way through.
        f.close()

# Building blocks for the randomized fake delimiters used by
# create_injection_for_completion:
#   'mark' - wrapper templates; the '{s}' placeholder is filled with a sampled
#            role + delimiter word via str.format.
#   'inst' / 'inpt' / 'resp' - delimiter words for the instruction, data input
#            and response sections respectively.
#   'user' / 'asst' - optional role prefixes; 'user' is paired with the
#            'inst'/'inpt' words and 'asst' with the 'resp' words.
OTHER_DELM_TOKENS = {
    'mark': ['{s}', '|{s}|', '<{s}>', '[{s}]', '<|{s}|>', '[|{s}|]', '<[{s}]>', '\'\'\'{s}\'\'\'', '***{s}***'],
    'inst': ['Command', 'Rule', 'Prompt', 'Task'],
    'inpt': ['Data', 'Context', 'Text'],
    'resp': ['Output', 'Answer', 'Reply'],
    'user': ['', 'Prompter ', 'User ', 'Human '],
    'asst': ['', 'Assistant ', 'Chatbot ', 'Bot ', 'GPT ', 'AI '],
}

def create_injection_for_completion(response, instruction, input):
    """Build the text of a "completion" attack with randomized fake delimiters.

    The result is a fake response section followed by an injected instruction
    section and, when ``input`` is non-empty, an injected data section. Each
    section is introduced by a header made from one randomly chosen wrapper
    template (shared by all sections) filled with a randomly chosen
    role + delimiter word in upper, lower or original case.

    Args:
        response: Text placed under the fake response header.
        instruction: Injected instruction text.
        input: Injected data text; the data section is omitted when this is ''.

    Returns:
        The assembled sections, separated by blank lines.
    """
    header_template = np.random.choice(OTHER_DELM_TOKENS['mark']) + ':'

    def _random_label(kind):
        # Instruction/data headers take a user-side prefix, anything else an assistant one.
        speaker_key = 'asst' if kind not in ('inst', 'inpt') else 'user'
        speaker = np.random.choice(OTHER_DELM_TOKENS[speaker_key])
        keyword = np.random.choice(OTHER_DELM_TOKENS[kind])
        label = speaker + keyword
        draw = np.random.rand()
        if draw < 1/3:
            return label.upper()
        if draw < 2/3:
            return label.lower()
        return label

    # Headers are sampled in the order resp -> inst -> inpt.
    sections = [
        (header_template.format(s=_random_label('resp')), response),
        (header_template.format(s=_random_label('inst')), instruction),
    ]
    if input != '':
        sections.append((header_template.format(s=_random_label('inpt')), input))
    return '\n\n'.join(header + '\n\n' + body for header, body in sections)

def calculate_length_for_preference_dataset(dataset, tokenizer, max_length=2048):
    """Print token-length statistics for the 'chosen'/'rejected' texts of a dataset.

    For every record the longer of its two tokenized texts is taken as the
    record length. The function prints selected percentiles of those lengths,
    the average of the mean chosen and mean rejected lengths, and how many
    records exceed ``max_length``.

    Args:
        dataset: Iterable of records with 'chosen' and 'rejected' string fields.
        tokenizer: Callable accepting a list of strings and
            ``add_special_tokens=False`` and returning a mapping whose
            "input_ids" entry holds one token-id sequence per string.
        max_length: Threshold used for the "Num > ..." count (default 2048).

    Returns:
        None; the statistics are printed.
    """
    chosen_input_ids = tokenizer([d['chosen'] for d in dataset], add_special_tokens=False)["input_ids"]
    rejected_input_ids = tokenizer([d['rejected'] for d in dataset], add_special_tokens=False)["input_ids"]

    chosen_lengths = np.array([len(ids) for ids in chosen_input_ids])
    rejected_lengths = np.array([len(ids) for ids in rejected_input_ids])
    prompt_and_label_lengths = np.maximum(chosen_lengths, rejected_lengths)

    # Build the label from the same list that is passed to np.percentile so
    # the two cannot drift apart (the label previously claimed 98% while the
    # 95th percentile was computed).
    percentiles = [95, 99, 99.5, 99.9]
    label = ', '.join(f'{p}%' for p in percentiles)
    print(f'Input+Output model_max_length ({label}):', np.percentile(prompt_and_label_lengths, percentiles))
    print (f'Mean: {(np.mean(chosen_lengths) + np.mean(rejected_lengths))/2:.2f} Num > {max_length}: {np.sum(prompt_and_label_lengths>max_length)} / {len(prompt_and_label_lengths)}')

In [4]:
# Load the training split of the cleaned Alpaca dataset and report its size.
clean_data = load_dataset("yahma/alpaca-cleaned", split="train")
print (len(clean_data))

51760


In [5]:
# Load the base model through vLLM; its own sampled completions become the
# preference data.
model_path = LLAMA_PATH # llama or qwen
model, tokenizer, _, _ = load_model(model_path, use_vllm=True)

# Sample with temperature 0.8, stopping at the tokenizer's EOS token or after
# 2048 generated tokens.
sampling_params = SamplingParams(
    stop=tokenizer.eos_token,
    max_tokens=2048,
    temperature=0.8,
)

# Output files for the train/val preference splits, named after the model.
preference_data_path = {
    split: os.path.join(DATA_DIR, f"{model.m_name}_metasecalign_{split}.json")
    for split in ('train', 'val')
}


Ensure the ctx len is loaded, default is 32768
INFO 10-14 15:12:49 [utils.py:328] non-default args: {'max_model_len': 32768, 'tensor_parallel_size': 4, 'disable_log_stats': True, 'enable_chunked_prefill': False, 'model': 'meta-llama/Llama-3.1-8B-Instruct'}
INFO 10-14 15:13:05 [__init__.py:742] Resolved architecture: LlamaForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 10-14 15:13:05 [__init__.py:1815] Using max model len 32768
INFO 10-14 15:13:08 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 10-14 15:13:19 [__init__.py:216] Automatically detected platform cuda.
[1;36m(EngineCore_DP0 pid=2258182)[0;0m INFO 10-14 15:13:21 [core.py:654] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=2258182)[0;0m INFO 10-14 15:13:21 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=Decoding



[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
INFO 10-14 15:13:35 [__init__.py:1433] Found nccl from library libnccl.so.2
INFO 10-14 15:13:35 [__init__.py:1433] Found nccl from library libnccl.so.2
INFO 10-14 15:13:35 [__init__.py:1433] Found nccl from library libnccl.so.2
INFO 10-14 15:13:35 [pynccl.py:70] vLLM is u

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:00,  4.25it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:00<00:00,  3.68it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:00<00:00,  3.19it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  3.95it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  3.79it/s]
[1;36m(Worker_TP0 pid=2258223)[0;0m 


[1;36m(Worker_TP0 pid=2258223)[0;0m INFO 10-14 15:13:40 [default_loader.py:268] Loading weights took 1.07 seconds
[1;36m(Worker_TP2 pid=2258225)[0;0m INFO 10-14 15:13:40 [weight_utils.py:369] Time spent downloading weights for meta-llama/Llama-3.1-8B-Instruct: 0.719109 seconds
[1;36m(Worker_TP0 pid=2258223)[0;0m INFO 10-14 15:13:40 [gpu_model_runner.py:2392] Model loading took 3.7711 GiB and 2.482708 seconds
[1;36m(Worker_TP3 pid=2258226)[0;0m INFO 10-14 15:13:41 [default_loader.py:268] Loading weights took 1.19 seconds
[1;36m(Worker_TP3 pid=2258226)[0;0m INFO 10-14 15:13:41 [gpu_model_runner.py:2392] Model loading took 3.7711 GiB and 3.059974 seconds
[1;36m(Worker_TP2 pid=2258225)[0;0m INFO 10-14 15:13:41 [default_loader.py:268] Loading weights took 1.13 seconds
[1;36m(Worker_TP2 pid=2258225)[0;0m INFO 10-14 15:13:42 [gpu_model_runner.py:2392] Model loading took 3.7711 GiB and 4.026790 seconds
[1;36m(Worker_TP1 pid=2258224)[0;0m INFO 10-14 15:13:42 [default_loader.py:2

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|█| 67/67 [00:02<0


[1;36m(Worker_TP0 pid=2258223)[0;0m INFO 10-14 15:14:16 [custom_all_reduce.py:203] Registering 4355 cuda graph addresses
[1;36m(Worker_TP1 pid=2258224)[0;0m INFO 10-14 15:14:16 [custom_all_reduce.py:203] Registering 4355 cuda graph addresses
[1;36m(Worker_TP3 pid=2258226)[0;0m INFO 10-14 15:14:16 [custom_all_reduce.py:203] Registering 4355 cuda graph addresses
[1;36m(Worker_TP2 pid=2258225)[0;0m INFO 10-14 15:14:16 [custom_all_reduce.py:203] Registering 4355 cuda graph addresses
[1;36m(Worker_TP0 pid=2258223)[0;0m INFO 10-14 15:14:17 [gpu_model_runner.py:3118] Graph capturing finished in 3 secs, took 0.60 GiB
[1;36m(Worker_TP0 pid=2258223)[0;0m INFO 10-14 15:14:17 [gpu_worker.py:391] Free memory on device (78.58/79.19 GiB) on startup. Desired GPU memory utilization is (0.9, 71.27 GiB). Actual usage is 3.77 GiB for weight, 4.82 GiB for peak activation, 1.48 GiB for non-torch memory, and 0.6 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memo

# Adversarial Training

Only keep samples that have a non-empty data input: DPO requires the rejected response to be the model's response to the attack, and an indirect prompt injection (IPI) can only be applied to samples that contain a data input.

In [8]:
# Keep only the samples whose 'input' field is non-empty after stripping
# whitespace; these are the ones an injection can be placed into.
data_with_injection = [
    sample
    for sample in clean_data
    if sample.get('input', '').strip() != ''
]
print (len(data_with_injection))

19157


In [9]:
# Build one attacked sample per clean example. A different task is drawn at
# random and injected into the data input, either directly (90%) or disguised
# as a fake completion (10%). 'chosen_input' / 'rejected_input' are the prompts
# later used to generate the chosen (clean task) and rejected (injected task)
# responses.
num_candidates = len(data_with_injection)

# The resampling loop below can only terminate if some other instruction exists.
if len({candidate['instruction'] for candidate in data_with_injection}) < 2:
    raise ValueError("Need at least two distinct instructions to sample an injection.")

preference_data = []
for sample in tqdm(data_with_injection, total=num_candidates):
    instruction = sample['instruction']
    inpt = sample['input']

    # Draw by index: np.random.choice on a list of dicts converts the whole
    # list to an array on every call, making each draw O(len(dataset)).
    while True:
        injected_sample = data_with_injection[np.random.randint(num_candidates)]
        if injected_sample['instruction'] != instruction:
            break

    injected_prompt = injected_sample['instruction'] + ' ' + injected_sample['input']
    if np.random.rand() < 0.9:  # 90% Straightforward Attack, 10% Completion Attack
        # Place the injection before or after the real data with equal probability.
        if np.random.rand() < 0.5:
            attacked_input = injected_prompt + ' ' + inpt
        else:
            attacked_input = inpt + ' ' + injected_prompt
    else:
        # Completion attack: the sample's reference output is used as a fake
        # response, followed by the injected task under fake delimiters.
        fake_response = sample['output']
        attacked_input = inpt + '\n\n' + create_injection_for_completion(
            fake_response, injected_sample['instruction'], injected_sample['input'])

    # Only strings are stored and the source sample is never mutated, so no
    # deep copies are needed.
    preference_data.append({
                'instruction': instruction,
                'input': attacked_input,
                'chosen_input': instruction + '\n\n' + inpt,
                'rejected_input': injected_prompt,
            })
print (len(preference_data))

  0%|                                                                                         | 0/19157 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████| 19157/19157 [00:21<00:00, 899.19it/s]

19157





In [10]:
sampling_params = SamplingParams(temperature=0.8, max_tokens=2048, stop=tokenizer.eos_token)

# Interleave the prompts so that index 2*i is the clean ("chosen") prompt and
# index 2*i+1 the injected ("rejected") prompt of sample i.
conversations = []
for sample in preference_data:
    for prompt_key in ("chosen_input", "rejected_input"):
        conversations.append(tokenizer.apply_chat_template(
            [{"role": "user", "content": sample[prompt_key]}],
            tokenize=False, add_generation_prompt=True, enable_thinking=False))
outputs = model.generate(conversations, sampling_params,use_tqdm=True)

# The pairing below relies on exactly one output per prompt.
assert len(outputs) == 2 * len(preference_data), "unexpected number of generations"

# NOTE(review): EOS is appended unconditionally, so completions that were cut
# off at max_tokens are also marked as finished - confirm this is intended.
for i, sample in enumerate(preference_data):
    sample['chosen'] = outputs[2*i].outputs[0].text + tokenizer.eos_token
    sample['rejected'] = outputs[2*i+1].outputs[0].text + tokenizer.eos_token

# Hold out a random validation subset; the training split keeps the original order.
val_size = 500
preference_data_ids = np.random.permutation(len(preference_data))
val_ids = preference_data_ids[:val_size]
# Set membership is O(1); testing `i in <numpy array>` rescans the array for every sample.
val_id_set = set(val_ids.tolist())
train_dataset = [sample for i, sample in enumerate(preference_data) if i not in val_id_set]
val_dataset = [preference_data[i] for i in val_ids]
assert len(train_dataset) + len(val_dataset) == len(preference_data)

# Persist both splits, then reload them as datasets and print length statistics.
jdump(train_dataset, preference_data_path['train'])
jdump(val_dataset, preference_data_path['val'])
train_dataset = load_dataset('json', data_files=preference_data_path['train'], split='train')
val_dataset = load_dataset('json', data_files=preference_data_path['val'], split='train')
calculate_length_for_preference_dataset(train_dataset, tokenizer)
calculate_length_for_preference_dataset(val_dataset, tokenizer)

Adding requests:   0%|          | 0/38314 [00:00<?, ?it/s]

Processed prompts:   0%|                  | 0/38314 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.…

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Input+Output model_max_length (98%, 99%, 99.5%, 99.9%): [ 799.2  1095.88 1267.32 1925.88]
Mean: 215.00 Num > 2048: 15 / 18657
Input+Output model_max_length (98%, 99%, 99.5%, 99.9%): [ 818.05 1091.07 1168.42 1984.13]
Mean: 207.45 Num > 2048: 1 / 500
