## Step 1: Select an attack method to run (e.g., GCG)
- 10 samples from HarmBench
- Vicuna-7B-v1.5

In [None]:
from aisafetylab.attack.attackers.gcg import GCGMainManager
from aisafetylab.utils import ConfigManager

# YAML file that configures the GCG attack run.
config_path = './configs/gcg.yaml'

# Parse the config, build the GCG manager from it, and launch the attack.
config_manager = ConfigManager(config_path=config_path) #./configs/gcg.yaml
gcg_manager = GCGMainManager.from_config(config_manager.config)
gcg_manager.attack()

## Step 2: Generate responses after obtaining the attack prompts
## Step 3: Score the responses for safety with a scorer of your choice (e.g., LlamaGuard3)


In [None]:
import json
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from aisafetylab.models import load_model
from aisafetylab.evaluation.scorers import LlamaGuard3Scorer

def load_file(path):
    """Load records from a ``.jsonl`` or ``.json`` file.

    Args:
        path: File path as a string; its extension selects the parser.

    Returns:
        For ``.jsonl``: a list holding one parsed object per non-blank line.
        For ``.json``: whatever the document deserialises to.

    Raises:
        ValueError: If ``path`` ends with neither ``.jsonl`` nor ``.json``.
    """
    # JSON text is UTF-8 by convention; do not depend on the locale encoding.
    if path.endswith('.jsonl'):
        with open(path, encoding='utf-8') as f:
            # Blank lines (e.g. a stray empty line at the end of the file)
            # are not JSON documents, so skip them instead of crashing.
            return [json.loads(line) for line in f if line.strip()]

    if path.endswith('.json'):
        with open(path, encoding='utf-8') as f:
            return json.load(f)

    raise ValueError(f'Unsupported file format: {path}')

def evaluate_asr(path, model, scorer, batch_size=8, save_path=None):
    """Generate responses for stored attack prompts, score them, print the mean.

    Args:
        path: ``.jsonl``/``.json`` file handed to ``load_file``; every record
            must provide the keys ``final_query`` and ``goals``.
        model: Object exposing ``batch_chat(queries, batch_size=...)``.
        scorer: Object exposing
            ``batch_score(queries, responses, batch_size=...)`` whose results
            each carry a ``score`` entry.
        batch_size: Batch size forwarded to both generation and scoring.
        save_path: Optional output file. When given, the records (extended
            with ``final_response`` and ``final_score``) are dumped there as
            JSON; missing parent directories are created first.
    """
    import os  # local import: only needed to prepare the output directory

    data = load_file(path)
    queries = [d['final_query'] for d in data]
    responses = model.batch_chat(queries, batch_size=batch_size)
    # The scorer is given each record's 'goals' field (not 'final_query')
    # together with the generated response.
    raw_queries = [d['goals'] for d in data]
    score_results = scorer.batch_score(raw_queries, responses, batch_size=batch_size)
    all_scores = [r['score'] for r in score_results]
    for i, record in enumerate(data):
        record['final_response'] = responses[i]
        record["final_score"] = all_scores[i]

    print(f'ASR: {np.mean(all_scores)}')
    if save_path is not None:
        out_dir = os.path.dirname(save_path)
        if out_dir:
            # open(..., 'w') does not create intermediate directories.
            os.makedirs(out_dir, exist_ok=True)
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding must be fixed to UTF-8 rather than left to the locale.
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

# Target model: Vicuna-7B-v1.5 loaded in float16, switched to eval mode and
# moved to the selected GPU.
model_path = "lmsys/vicuna-7b-v1.5"
device = torch.device('cuda:2')
model = (
    AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)
    .eval()
    .to(device)
)
# Left-padded tokenizer; fall back to EOS as the pad token when none is set.
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='left', trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Generation settings handed to load_model below.
generation_config = {
    "do_sample": False,
    "max_new_tokens": 512
}
# Rebind `model` to the load_model wrapper; evaluate_asr calls batch_chat on it.
model = load_model(model=model, tokenizer=tokenizer, model_name='vicuna-7b-v1.5', generation_config=generation_config)
# LlamaGuard3 scorer, placed on the same device as the target model.
scorer = LlamaGuard3Scorer(
    model_path="meta-llama/Llama-Guard-3-8B",
    tokenizer_path="meta-llama/Llama-Guard-3-8B",
    device=device,
)
# Attack results to evaluate.
# NOTE(review): presumably the output of the GCG run above — confirm the path.
path = './results/vicuna_gcg.jsonl'
# Generate responses, score them, print the mean score and save the records.
evaluate_asr(
    path,
    model,
    scorer,
    batch_size=8,
    save_path="evaluation_results/vicuna_gcg_LlamaGuard3.json",
)

## Step 4: Generate responses with inference-time defense mechanisms (e.g., Goal Prioritization)
## Step 5: Score the new responses for safety

In [None]:
import json
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from aisafetylab.models import load_model
from aisafetylab.evaluation.scorers import LlamaGuard3Scorer
from aisafetylab.defense.inference_defense import create_defender_from_yaml
from aisafetylab.defense.inference_defense import batch_chat
# Target model: Vicuna-7B-v1.5 loaded in float16, switched to eval mode and
# moved to the selected GPU.
model_path = "lmsys/vicuna-7b-v1.5"
device = torch.device('cuda:3')
model = (
    AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)
    .eval()
    .to(device)
)
# Left-padded tokenizer; fall back to EOS as the pad token when none is set.
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='left', trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Generation settings handed to load_model below.
generation_config = {
    "do_sample": False,
    "max_new_tokens": 512
}
# Rebind `model` to the load_model wrapper used for the evaluation below.
model = load_model(model=model, tokenizer=tokenizer, model_name='vicuna-7b-v1.5', generation_config=generation_config)

# LlamaGuard3 scorer, placed on the same device as the target model.
scorer = LlamaGuard3Scorer(
    model_path="meta-llama/Llama-Guard-3-8B",
    tokenizer_path="meta-llama/Llama-Guard-3-8B",
    device=device,
)

def load_file(path):
    """Read a ``.jsonl`` or ``.json`` file and return its parsed content.

    Args:
        path: File path as a string; its extension selects the parser.

    Returns:
        For ``.jsonl``: a list holding one parsed object per non-blank line.
        For ``.json``: whatever the document deserialises to.

    Raises:
        ValueError: If ``path`` ends with neither ``.jsonl`` nor ``.json``.
    """
    # JSON text is UTF-8 by convention; do not depend on the locale encoding.
    if path.endswith('.jsonl'):
        with open(path, encoding='utf-8') as f:
            # Blank lines are not JSON documents; skip them rather than
            # letting json.loads raise on an empty string.
            return [json.loads(line) for line in f if line.strip()]

    if path.endswith('.json'):
        with open(path, encoding='utf-8') as f:
            return json.load(f)

    raise ValueError(f'Unsupported file format: {path}')

def evaluate_asr(path, model, scorer, batch_size=8, defenders=None, save_path=None):
    """Generate (optionally defended) responses, score them, print the mean.

    Args:
        path: ``.jsonl``/``.json`` file handed to ``load_file``; every record
            must provide the keys ``final_query`` and ``goals``.
        model: Model object passed as the first argument to ``batch_chat``.
        scorer: Object exposing
            ``batch_score(queries, responses, batch_size=...)`` whose results
            each carry a ``score`` entry.
        batch_size: Batch size forwarded to both generation and scoring.
        defenders: Passed unchanged as the third argument to ``batch_chat``.
        save_path: Optional output file. When given, the records (extended
            with ``final_response`` and ``final_score``) are dumped there as
            JSON; missing parent directories are created first.
    """
    import os  # local import: only needed to prepare the output directory

    data = load_file(path)
    queries = [d['final_query'] for d in data]
    responses = batch_chat(model, queries, defenders, batch_size=batch_size)
    # The scorer is given each record's 'goals' field (not 'final_query')
    # together with the generated response.
    raw_queries = [d['goals'] for d in data]
    score_results = scorer.batch_score(raw_queries, responses, batch_size=batch_size)
    all_scores = [r['score'] for r in score_results]
    for i, record in enumerate(data):
        record['final_response'] = responses[i]
        record["final_score"] = all_scores[i]

    print(f'ASR: {np.mean(all_scores)}')
    if save_path is not None:
        out_dir = os.path.dirname(save_path)
        if out_dir:
            # open(..., 'w') does not create intermediate directories.
            os.makedirs(out_dir, exist_ok=True)
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding must be fixed to UTF-8 rather than left to the locale.
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

# Create the list of inference-time defenders handed to evaluate_asr.
defenders = []
# YAML file describing the defender to build (goal prioritization).
defender_path = './configs/goal_prioritization.yaml'

# Attack results to evaluate.
path = './results/vicuna_gcg.jsonl'
defenders.append(create_defender_from_yaml(defender_path))
# Generate responses with the defenders applied, score them and save the records.
evaluate_asr(
    path,
    model,
    scorer,
    batch_size=8,
    defenders=defenders,
    save_path="evaluation_results/vicuna_goal_prioritization_gcg_LlamaGuard3.json",
)

## Step 6: Generate responses with training-time defense mechanisms (e.g., Safe Unlearning)
## Step 7: Score the new responses for safety

In [None]:
import IPython

# Shell command that launches training.py through the DeepSpeed launcher,
# restricted to localhost devices 6 and 7 via --include.
# The string is not raw, so Python drops the backslash-newline pair and the
# shell receives the deepspeed invocation as a single line.
command = """
deepspeed --include localhost:6,7 --master_port=20959 training.py \
    --config configs/unlearning_config.yaml
"""

# Run the command in the system shell through the active IPython session.
IPython.get_ipython().system(command)

In [None]:
import json
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from aisafetylab.models import load_model
from aisafetylab.evaluation.scorers import LlamaGuard3Scorer

# Switch to the safety-trained checkpoint.
# NOTE(review): presumably written by the training run above — confirm the path.
model_path = "./ckpts/vicuna-7b-v1.5-safeunlearning"
device = torch.device('cuda:7')
# Load in float16, switch to eval mode and move to the selected GPU.
model = (
    AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)
    .eval()
    .to(device)
)
# Left-padded tokenizer; fall back to EOS as the pad token when none is set.
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='left', trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Generation settings handed to load_model below.
generation_config = {
    "do_sample": False,
    "max_new_tokens": 512
}
# Rebind `model` to the load_model wrapper; model_name stays 'vicuna-7b-v1.5'
# even though the weights come from the fine-tuned checkpoint.
model = load_model(model=model, tokenizer=tokenizer, model_name='vicuna-7b-v1.5', generation_config=generation_config)

# LlamaGuard3 scorer, placed on the same device as the target model.
scorer = LlamaGuard3Scorer(
    model_path="meta-llama/Llama-Guard-3-8B",
    tokenizer_path="meta-llama/Llama-Guard-3-8B",
    device=device,
)
def load_file(path):
    """Parse a ``.jsonl`` or ``.json`` file chosen by its extension.

    Args:
        path: File path as a string; its extension selects the parser.

    Returns:
        For ``.jsonl``: a list holding one parsed object per non-blank line.
        For ``.json``: whatever the document deserialises to.

    Raises:
        ValueError: If ``path`` ends with neither ``.jsonl`` nor ``.json``.
    """
    # JSON text is UTF-8 by convention; do not depend on the locale encoding.
    if path.endswith('.jsonl'):
        with open(path, encoding='utf-8') as f:
            # A blank line would make json.loads raise, so filter those out.
            return [json.loads(line) for line in f if line.strip()]

    if path.endswith('.json'):
        with open(path, encoding='utf-8') as f:
            return json.load(f)

    raise ValueError(f'Unsupported file format: {path}')

def evaluate_asr(path, model, scorer, batch_size=8, save_path=None):
    """Generate responses for stored attack prompts, score them, print the mean.

    Args:
        path: ``.jsonl``/``.json`` file handed to ``load_file``; every record
            must provide the keys ``final_query`` and ``goals``.
        model: Object exposing ``batch_chat(queries, batch_size=...)``.
        scorer: Object exposing
            ``batch_score(queries, responses, batch_size=...)`` whose results
            each carry a ``score`` entry.
        batch_size: Batch size forwarded to both generation and scoring.
        save_path: Optional output file. When given, the records (extended
            with ``final_response`` and ``final_score``) are dumped there as
            JSON; missing parent directories are created first.
    """
    import os  # local import: only needed to prepare the output directory

    data = load_file(path)
    queries = [d['final_query'] for d in data]
    responses = model.batch_chat(queries, batch_size=batch_size)
    # The scorer is given each record's 'goals' field (not 'final_query')
    # together with the generated response.
    raw_queries = [d['goals'] for d in data]
    score_results = scorer.batch_score(raw_queries, responses, batch_size=batch_size)
    all_scores = [r['score'] for r in score_results]
    for i, record in enumerate(data):
        record['final_response'] = responses[i]
        record["final_score"] = all_scores[i]

    print(f'ASR: {np.mean(all_scores)}')
    if save_path is not None:
        out_dir = os.path.dirname(save_path)
        if out_dir:
            # open(..., 'w') does not create intermediate directories.
            os.makedirs(out_dir, exist_ok=True)
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding must be fixed to UTF-8 rather than left to the locale.
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

# Re-use the same attack results file as the earlier evaluation cells.
path = './results/vicuna_gcg.jsonl'
# Generate responses with the safety-trained model, score them and save the records.
# NOTE(review): the output name contains "goal" and omits "gcg", unlike the
# other cells — looks carried over from the goal-prioritization cell; confirm.
evaluate_asr(
    path,
    model,
    scorer,
    batch_size=8,
    save_path="evaluation_results/vicuna_goal_safe_unlearning_LlamaGuard3.json",
)

  from .autonotebook import tqdm as notebook_tqdm
2025-03-26 08:10:10,499	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Loading checkpoint shards: 100%|██████████| 3/3 [00:48<00:00, 16.01s/it]
[32m2025-03-26 08:11:08.460[0m | [1mINFO    [0m | [36maisafetylab.evaluation.scorers.llama_guard_3_scorer[0m:[36mload_model[0m:[36m41[0m - [1mloading model...[0m
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  3.28it/s]
[32m2025-03-26 08:11:13.924[0m | [1mINFO    [0m | [36maisafetylab.evaluation.scorers.llama_guard_3_scorer[0m:[36mload_model[0m:[36m52[0m - [1mfinish loading[0m
100%|██████████| 2/2 [00:06<00:00,  3.23s/it]
  0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 50%|█████     | 1/2 [00:00<00:00,  2.09it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|█████████

ASR: 0.0



