In [None]:
!pip install --upgrade huggingface_hub datasets

In [None]:
import os
from datasets import load_dataset

# Dataset repository identifier handed to `load_dataset`.
REPO_NAME = "RCantini/CLEAR-Bias"
# Root folder that receives one sub-folder of CSV files per dataset config.
OUTPUT_FOLDER = "CLEAR-Bias"

def download_configs_as_csv(repo_name, output_folder):
    """Download every known dataset config and export it as CSV files.

    For each config the "train" split is loaded with ``load_dataset``,
    converted to a pandas DataFrame and passed to the config-specific
    writer together with a per-config sub-folder of *output_folder*.
    An error while handling one config is reported on stdout and the loop
    moves on to the next config.

    Args:
        repo_name: Dataset repository identifier passed to ``load_dataset``.
        output_folder: Root directory for the exported files; created when
            it does not exist yet.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Config name -> function that splits its DataFrame into CSV files.
    # Insertion order is the processing order.
    writers = {
        'base_prompts': load_base_prompts,
        'control_set': load_control_set,
        'jailbreak_prompts': load_jailbreak_prompts,
    }

    for config, write_csvs in writers.items():
        print(f"Starting download and conversion for config: {config}")
        try:
            splits = load_dataset(repo_name, name=config)
            frame = splits["train"].to_pandas()
            print(f"Loaded dataset '{config}' with {len(frame)} records")

            target_dir = os.path.join(output_folder, config)
            os.makedirs(target_dir, exist_ok=True)

            write_csvs(frame, target_dir)

            print(f"Successfully processed config: {config}\n")
        except Exception as err:
            # Best effort: report the failure and continue with the next config.
            print(f"Error downloading or converting the config '{config}': {err}")


def load_control_set(df, out_path):
    """Write the control set to *out_path*, one CSV file per ``TASK`` value.

    Each file is named ``control_set_<task>.csv`` and holds the rows of that
    task without the ``TASK`` column and without the DataFrame index.

    Args:
        df: DataFrame containing a ``TASK`` column.
        out_path: Existing directory that receives the CSV files.
    """
    print("Processing control set...")
    by_task = df.groupby('TASK')
    for task_name, rows in by_task:
        target = os.path.join(out_path, f'control_set_{task_name}.csv')
        # The task is already encoded in the file name, so drop the column.
        without_task = rows.drop(columns='TASK')
        without_task.to_csv(target, index=False)
        print(f"Saved file: {target}")


def load_base_prompts(df, out_path):
    """Write the base prompts to *out_path*, one CSV file per ``TASK`` value.

    The distinct ``BIAS CATEGORY`` values are printed first. Each file is
    named ``base_prompts_<task>.csv`` and holds the rows of that task without
    the ``TASK`` column and without the DataFrame index.

    Args:
        df: DataFrame containing ``TASK`` and ``BIAS CATEGORY`` columns.
        out_path: Existing directory that receives the CSV files.
    """
    print("Processing base prompts...")
    categories = df['BIAS CATEGORY'].unique()
    print(categories)
    by_task = df.groupby('TASK')
    for task_name, rows in by_task:
        target = os.path.join(out_path, f'base_prompts_{task_name}.csv')
        # The task is already encoded in the file name, so drop the column.
        without_task = rows.drop(columns='TASK')
        without_task.to_csv(target, index=False)
        print(f"Saved file: {target}")


def load_jailbreak_prompts(df, out_path):
    """Write the jailbreak prompts as one CSV per (task, attack, variant).

    Files are laid out as
    ``<out_path>/<task_dir>/<attack>/<attack>_<variant>_<task>.csv`` where
    ``<attack>`` is the attack name lower-cased with spaces replaced by
    underscores. ``<task_dir>`` is ``sentence_completion`` for ``SC`` and
    ``choose_the_option`` for ``CTO``; any other task code is used verbatim
    as the directory name instead of failing on a missing mapping.
    The ``TASK``, ``ATTACK`` and ``VARIANT`` columns are dropped from the
    written files because the path already encodes them.

    Args:
        df: DataFrame containing ``TASK``, ``ATTACK`` and ``VARIANT`` columns.
        out_path: Directory under which the folder tree is created.
    """
    print("Processing jailbreak prompts...")
    cols = ['TASK', 'ATTACK', 'VARIANT']
    task_dirs = {
        'SC': 'sentence_completion',
        'CTO': 'choose_the_option',
    }
    for (task, attack, variant), group in df.groupby(cols):
        attack = attack.replace(' ', '_').lower()
        file_name = f"{attack}_{variant}_{task}.csv"

        # Unknown task codes fall back to the code itself; a bare .get()
        # would return None and make os.path.join raise a TypeError.
        task_dir = task_dirs.get(task, str(task))

        # makedirs creates the intermediate task directory as well.
        attack_path = os.path.join(out_path, task_dir, attack)
        os.makedirs(attack_path, exist_ok=True)

        file_path = os.path.join(attack_path, file_name)
        group.drop(columns=cols).to_csv(file_path, index=False)
        print(f"Saved file: {file_path}")


# Run the export using the module-level repository and output settings.
download_configs_as_csv(repo_name=REPO_NAME, output_folder=OUTPUT_FOLDER)