# Format the dataset into the following schema
```python
{
    "instruction": ...,
    "cot": ...,
    "answer": ...,
    "mutations":[{},{},...]
}
```

# Reasoning to Defend (EMNLP'25)
- 52k samples from Alpaca
- 520 samples from AdvBench
- DeepSeek-R1

In [None]:
from datasets import load_dataset

# Load the "chuhac/R2D-R1" dataset via `datasets.load_dataset`.
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("chuhac/R2D-R1")
# Preview the first record of the 'train' split as the cell output.
ds['train'][0]

In [None]:
'''
1. remove [] tags
2. get the cot and answer
'''
def clean_output(output):
    """Return `output` with the [SAFE], [UNSAFE] and [RETHINK] markers removed."""
    cleaned = output
    # Strip the markers one after another, always in this fixed order.
    for tag in ("[SAFE]", "[UNSAFE]", "[RETHINK]"):
        cleaned = cleaned.replace(tag, "")
    return cleaned

def check_harmful(output):
    """Classify a raw model output as "harmful" or "benign".

    The output is "harmful" when it contains the "[UNSAFE]" marker or the
    phrase "I'm sorry"; any other output is "benign".
    """
    markers = ("[UNSAFE]", "I'm sorry")
    flagged = any(marker in output for marker in markers)
    return "harmful" if flagged else "benign"


In [None]:
# Build the R2D-R1 records: one {"instruction", "output", "type"} dict per
# kept sample, with the prompt assembled from the 'instruction' and 'input'
# fields of each training item.
dataset = []
benign_num = 0
harmful_num = 0
for item in ds['train']:
    # A field counts as present only when it is longer than 2 characters;
    # anything shorter is treated as empty. Using a single threshold closes
    # the gap where a field of exactly 2 characters matched no branch and
    # the whole sample was silently dropped.
    present = [
        text for text in (item['instruction'], item['input'])
        if len(text) > 2
    ]
    if not present:
        # Neither field carries a usable prompt: skip the sample entirely.
        continue

    raw_output = item['output']
    # Classify on the raw output, before the tags are stripped from it.
    typ = check_harmful(raw_output)
    # Count only the samples that actually end up in `dataset`, so the
    # reported totals describe the saved data.
    if typ == "benign":
        benign_num += 1
    else:
        harmful_num += 1

    dataset.append({
        # Instruction first, then input, separated by a single space.
        "instruction": " ".join(present).strip(),
        "output": clean_output(raw_output),
        "type": typ,
    })

benign_num, harmful_num, dataset[0]

In [None]:
# save
# Write the processed R2D-R1 records to disk as indented JSON.
import json
import os

save_path = "data/processed/R2D-R1.json"
# open(..., "w") does not create missing parent directories, so make sure
# the target directory exists first.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, "w") as f:
    json.dump(dataset, f, indent=4)


# SafeChain
- 50,000 instructions from the WildJailbreak dataset using a uniform distribution.
- R1-70B to generate 5 responses

In [None]:
from datasets import load_dataset

# Load the "UWNSL/SafeChain" dataset via `datasets.load_dataset`.
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("UWNSL/SafeChain")
# Show the loaded dataset object as the cell output.
ds

In [None]:
ds['train'][0]

In [None]:
def clean_output(output):
    """Split a model response into its chain of thought and final answer.

    The text before the first "</think>" marker is the chain of thought and
    the text after the last "</think>" marker is the answer; both are
    stripped of surrounding whitespace.

    When the marker is absent the whole response is returned as the answer
    with an empty chain of thought, instead of duplicating the same text in
    both fields.
    NOTE(review): a response without "</think>" could also be a truncated
    chain of thought -- confirm which reading fits the data.

    Returns:
        A `(cot, answer)` tuple of strings.
    """
    marker = "</think>"
    if marker not in output:
        return "", output.strip()

    # split cot and answer
    cot = output.partition(marker)[0].strip()
    answer = output.rpartition(marker)[2].strip()
    return cot, answer

# Turn every SafeChain training item into an
# {"instruction", "cot", "answer", "label"} record.
dataset = []
for item in ds['train']:
    # The response is split into chain of thought and answer by clean_output.
    cot, answer = clean_output(item['response'])
    record = {
        "instruction": item['instruction'],
        "cot": cot,
        "answer": answer,
        "label": item['label'],
    }
    dataset.append(record)
dataset[0]

In [None]:
# save
# Write the SafeChain records to disk as indented JSON.
# Import locally so this cell does not depend on an earlier cell's imports.
import json
import os

save_path = "data/raw/SafeChain.json"
# open(..., "w") does not create missing parent directories, so make sure
# the target directory exists first.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, "w") as f:
    json.dump(dataset, f, indent=4)

# Circuit breaker
- self created

In [None]:
# Download circuit_breakers_train.json and parse it as JSON.
# https://raw.githubusercontent.com/GraySwanAI/circuit-breakers/refs/heads/main/data/circuit_breakers_train.json
import requests
url = "https://raw.githubusercontent.com/GraySwanAI/circuit-breakers/refs/heads/main/data/circuit_breakers_train.json"
# requests has no default timeout; bound the wait so a stalled connection
# cannot hang the notebook.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
data = response.json()
data[0]

In [None]:
# should filter again, not clean enough
# Keep only the items whose llama3 output contains "cannot"; each kept item
# is copied and extended with 'instruction', 'answer' and 'label' keys.
dataset = []

for item in data:
    answer = item['llama3_output']
    if "cannot" not in answer:
        continue

    dataset.append({
        **item,
        'instruction': item['prompt'],
        'answer': answer,
        'label': item['category'],
    })

len(dataset), len(data)

In [None]:
# save
# Write the filtered training items to disk as indented JSON.
import json
import os

save_path = "data/raw/circuit_breakers_train.json"
# Create the output directory if it does not exist yet.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, mode="w") as out_file:
    json.dump(dataset, out_file, indent=4)


In [None]:
# Download circuit_breakers_val.json and parse it as JSON.
url = "https://raw.githubusercontent.com/GraySwanAI/circuit-breakers/refs/heads/main/data/circuit_breakers_val.json"
# requests has no default timeout; bound the wait so a stalled connection
# cannot hang the notebook.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
data = response.json()
data[0]

In [None]:
# TODO: should filter again, not clean enough (no filter is applied in this
# cell: every item in `data` is kept).
# Each item is copied and extended with 'instruction' and 'answer' keys.
dataset = [
    {
        **item,
        'instruction': item['prompt'],
        'answer': item['llama3_output'],
    }
    for item in data
]

len(dataset), len(data)

In [None]:
# save
# Write the `dataset` list to disk as indented JSON.
import json

save_path = "data/raw/circuit_breakers_val.json"
with open(save_path, mode="w") as out_file:
    json.dump(obj=dataset, fp=out_file, indent=4)

# STAIR
- Ultrafeedback
- SafeRLHF
- JailBreakV

These sources are used to construct both an SFT dataset of 20k samples for CoT format alignment and a seed dataset with prompts.

In [None]:
from datasets import load_dataset

# Load the "thu-ml/STAIR-SFT" dataset via `datasets.load_dataset`.
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("thu-ml/STAIR-SFT")
# Show the loaded dataset object as the cell output.
ds

In [None]:
ds['train'][0]

In [None]:
# Distinct values of the 'source' field across the training split.
{item['source'] for item in ds['train']}

In [None]:
# Turn every STAIR-SFT training item into an
# {"instruction", "answer", "type", "source"} record and count both types.
dataset = []
harmful_num = 0
benign_num = 0
for item in ds['train']:
    source = item['source']
    # NOTE(review): only sources containing "PKU-SafeRLHF" are labelled
    # harmful; confirm whether any other source of this dataset should be
    # labelled harmful as well.
    is_harmful = "PKU-SafeRLHF" in source
    if is_harmful:
        harmful_num += 1
    else:
        benign_num += 1

    record = {
        "instruction": item['prompt'],
        "answer": item['answer'],
        "type": "harmful" if is_harmful else "benign",
        "source": source,
    }
    dataset.append(record)

harmful_num, benign_num, len(dataset)


In [None]:
dataset[0]

In [None]:
# save
# Write the STAIR-SFT records to disk as indented JSON.
# Import locally so this cell does not depend on an earlier cell's imports.
import json
import os

save_path = "data/processed/STAIR-SFT.json"
# open(..., "w") does not create missing parent directories, so make sure
# the target directory exists first.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, "w") as f:
    json.dump(dataset, f, indent=4)

# SRG
- Instructions from PKUSafeRLHF (Ji et al., 2024) and helpful instructions from Ultrafeedback (Cui et al., 2023),
- with corresponding responses re-generated by GPT-4o with guidelines (tags)

We experiment with two dataset scales: 
(1) small-scale (0.8K illegal, 2.5K helpful)
(2) large-scale (5K illegal, 30K helpful).

In [None]:
# Not released!

# AlphaAlign
- SCoT data
- Dolly
- XSTest

# Dolly

In [None]:
from datasets import load_dataset
# Load the 'train' split of "databricks/databricks-dolly-15k".
# Keys read by the processing cell below: context, instruction, category, response
ds = load_dataset("databricks/databricks-dolly-15k", split="train")
# Show the loaded dataset object as the cell output.
ds

In [None]:
ds[0]

In [None]:
# Turn every Dolly item into a {"category", "answer", "instruction"} record.
# The prompt is the item's context followed by its instruction.
dataset = []
for item in ds:
    # Join only the non-empty parts so an empty context does not leave a
    # stray leading space in front of the instruction.
    parts = [text.strip() for text in (item['context'], item['instruction'])]
    instruction = " ".join(part for part in parts if part)

    dataset.append({
        "category": item['category'],
        "answer": item['response'],
        "instruction": instruction,
    })
len(dataset)

In [None]:
# save
# Write the `dataset` list to disk as indented JSON.
import json

path = "data/raw/dolly.json"
with open(path, mode="w") as out_file:
    json.dump(obj=dataset, fp=out_file, indent=4)