In [None]:
!pip install pandas
!pip install openai
!pip install google-generativeai
!pip install pip install google-api-core
!pip install grpcio
!pip install pyyaml

In [2]:
from openai import OpenAI
import google.generativeai as genai
from google.api_core import exceptions
import pandas as pd
import requests
import time
import random
import os
import json
import yaml
from tqdm import tqdm

# Load API keys from the local YAML config file.
# `yaml.safe_load` returns None for an empty file, so fall back to an empty
# dict to keep the `.get(...)` lookups below from raising AttributeError.
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file) or {}

#DB_API_KEY = config.get('DEEPBRICKS_API_KEY', "")
DS_API_KEY = config.get('DEEPSEEK_API_KEY', "")
G_API_KEY = config.get('GEMINI_API_KEY', "")

# DeepSeek is reached through the OpenAI client pointed at DeepSeek's base URL.
#client_db = OpenAI(api_key=DB_API_KEY, base_url="https://api.deepbricks.ai/v1/")
client_ds = OpenAI(api_key=DS_API_KEY, base_url="https://api.deepseek.com")
genai.configure(api_key=G_API_KEY)

In [3]:
def wait_with_exponential_backoff(retries, maximum_backoff=64):
    """Sleep for an exponentially growing, jittered delay.

    The delay is ``2 ** retries`` seconds plus a random fraction of a second,
    capped at ``maximum_backoff`` seconds. The chosen delay is printed before
    sleeping.

    Args:
        retries: Number of attempts already made (exponent of the delay).
        maximum_backoff: Upper bound for the delay, in seconds.
    """
    jittered = (2 ** retries) + random.uniform(0, 1)
    # Same selection as min(jittered, maximum_backoff): the cap only wins when
    # it is strictly smaller.
    delay = maximum_backoff if maximum_backoff < jittered else jittered
    print(f"Waiting for {delay:.2f} seconds before the next attempt...")
    time.sleep(delay)


def call_api_with_backoff(api_call, *args, **kwargs):
    """Call ``api_call(*args, **kwargs)``, retrying with exponential backoff.

    Any exception triggers a wait (via ``wait_with_exponential_backoff``) and
    another attempt, up to a fixed number of retries. Once the retries are
    exhausted the last exception is re-raised unchanged, so callers still see
    the original error type.

    Args:
        api_call: Callable performing the request.
        *args: Positional arguments forwarded to ``api_call``.
        **kwargs: Keyword arguments forwarded to ``api_call``.

    Returns:
        Whatever ``api_call`` returns on the first successful attempt.

    Raises:
        Exception: The error raised by the final failed attempt.
    """
    max_retries = 5  # retries after the first attempt; waits total roughly 31-36 s
    maximum_backoff = 64
    retries = 0
    while True:
        try:
            return api_call(*args, **kwargs)
        except Exception as e:
            if retries >= max_retries:
                # Out of retries: surface the original error to the caller.
                print(f"Giving up after {retries + 1} attempts: {str(e)}")
                raise
            print(f"Unexpected error: {str(e)}. Applying exponential backoff...")
            wait_with_exponential_backoff(retries, maximum_backoff)
            retries += 1


# Disabled DeepBricks helper, kept for reference as a bare string literal: it is
# evaluated as an expression statement and never defines `invoke_db`.
# `client_db` is not created in this notebook (its setup line is commented out).
"""

def invoke_db(model_name, prompt):
    def api_call():
        return client_db.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}]
        )
    completion = call_api_with_backoff(api_call)
    return completion.choices[0].message.content
"""

def invoke_ds(prompt):
    """Send ``prompt`` to the DeepSeek chat model and return the reply text.

    The prompt is sent as a single user message through ``client_ds``
    (model ``deepseek-chat``, non-streaming), wrapped by
    ``call_api_with_backoff``.

    Args:
        prompt: Text of the user message.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    request = lambda: client_ds.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": prompt}],
        stream=False,
    )
    completion = call_api_with_backoff(request)
    first_choice = completion.choices[0]
    return first_choice.message.content


def invoke_gemini(prompt):
    """Send ``prompt`` to Gemini in a fresh chat session and return the reply text.

    Each attempt builds a new ``gemini-2.0-flash`` model object, starts a chat
    with empty history and sends the prompt; the attempt is wrapped by
    ``call_api_with_backoff``.

    Args:
        prompt: Text of the message to send.

    Returns:
        The ``text`` attribute of the response.
        NOTE(review): ``text`` may raise for responses that carry no text
        (e.g. blocked replies) - confirm against the SDK.
    """
    def ask_once():
        session = genai.GenerativeModel(
            model_name="gemini-2.0-flash"
        ).start_chat(history=[])
        return session.send_message(prompt)

    reply = call_api_with_backoff(ask_once)
    return reply.text


def invoke_ollama(model_name, prompt, url="http://localhost:11434/api/chat", timeout=None):
    """Send ``prompt`` to an Ollama chat endpoint and return the parsed reply.

    Args:
        model_name: Name of the Ollama model to query.
        prompt: Text sent as a single user message.
        url: Chat endpoint to POST to (defaults to the local Ollama server).
        timeout: Optional timeout in seconds passed to ``requests.post``;
            ``None`` waits indefinitely, as before.

    Returns:
        A ``(json_res, response)`` tuple. ``json_res`` is the decoded JSON body
        and ``response`` is ``json_res['message']['content']``. ``response`` is
        ``None`` when the body lacks that field; both are ``None`` when the
        request fails or the body is not valid JSON.
    """
    payload = {
        "model": model_name,
        "stream": False,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ]
    }
    try:
        # `json=` serializes the payload and sets the JSON Content-Type header.
        reply = requests.post(url, json=payload, timeout=timeout)
        reply.raise_for_status()
        json_res = reply.json()
    except requests.exceptions.RequestException as e:
        print(f"Connection error: {e}")
        return None, None
    except ValueError as e:
        # Older requests versions raise a plain ValueError for a non-JSON body.
        print(f"Invalid JSON in response: {e}")
        return None, None

    # Guard the lookups: the body may be a non-dict JSON value, and 'message'
    # may not be a mapping.
    message = json_res.get('message') if isinstance(json_res, dict) else None
    if isinstance(message, dict) and 'content' in message:
        response = message['content']
    else:
        print("Error: The response does not contain the 'message' or 'content' key.")
        response = None
    return json_res, response


def _visible_subdirs(parent):
    """Return the sorted names of the non-hidden sub-directories of ``parent``."""
    return sorted(
        name for name in os.listdir(parent)
        if not name.startswith('.') and os.path.isdir(os.path.join(parent, name))
    )


def process_all_jailbreak_prompts(input_base_dir, output_base_dir, models, categories):
    """Run every jailbreak prompt CSV through each model and save the responses.

    The input tree is walked as ``input_base_dir/<task>/<attack>/*.csv``
    (hidden entries are ignored). For each CSV and model, the responses are
    written to
    ``output_base_dir/<MODEL>/<task>/<attack>/<MODEL>_<csv name>.csv`` with the
    columns ``MODEL``, ``BIAS CATEGORY``, ``PROMPT`` and ``RESPONSE``.

    Args:
        input_base_dir: Root directory of the jailbreak prompt CSVs.
        output_base_dir: Root directory for the result CSVs.
        models: Model identifiers; ``"deepseek"`` and ``"gemini"`` are
            supported (case-insensitive). Others are reported and skipped.
        categories: Bias categories whose prompts are skipped. Matching is
            case-insensitive; may be empty or ``None``.
    """
    skipped_categories = {str(category).upper() for category in (categories or [])}

    for model in models:
        model_name = model.upper()

        # Resolve the backend once per model instead of once per prompt.
        # NOTE: extend this dispatch with the models you will be testing via Ollama.
        backend = model.lower()
        if backend == "deepseek":
            invoke = invoke_ds
        elif backend == "gemini":
            invoke = invoke_gemini
        else:
            print(f"Unsupported model: {model}")
            continue

        for task in _visible_subdirs(input_base_dir):
            task_path = os.path.join(input_base_dir, task)

            for attack_name in _visible_subdirs(task_path):
                attack_path = os.path.join(task_path, attack_name)

                csv_files = sorted(
                    f for f in os.listdir(attack_path)
                    if not f.startswith('.') and f.endswith('.csv')
                )
                if not csv_files:
                    print(f"No csv file found in {attack_path}")
                    continue

                for csv_file in csv_files:
                    file_path = os.path.join(attack_path, csv_file)
                    csv_base = os.path.splitext(csv_file)[0]
                    try:
                        df = pd.read_csv(file_path)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
                        continue

                    has_category = "BIAS CATEGORY" in df.columns
                    results = []
                    for _, row in df.iterrows():
                        prompt = row["PROMPT"]
                        bias_category = row["BIAS CATEGORY"] if has_category else ""

                        # Empty cells are read as NaN (a float), which has no
                        # .upper(); treat them as "no category".
                        category_key = "" if pd.isna(bias_category) else str(bias_category).upper()
                        if category_key in skipped_categories:
                            continue

                        results.append({
                            "MODEL": model_name,
                            "BIAS CATEGORY": bias_category,
                            "PROMPT": prompt,
                            "RESPONSE": invoke(prompt),
                        })

                    if results:
                        out_dir = os.path.join(output_base_dir, model_name, task, attack_name)
                        os.makedirs(out_dir, exist_ok=True)
                        output_file = os.path.join(out_dir, f"{model_name}_{csv_base}.csv")
                        df_out = pd.DataFrame(results)
                        df_out.to_csv(output_file, index=False, encoding="utf-8")
                        print(f"Saved {output_file} with {len(results)} responses")


def process_all_base_prompts(input_base_dir, output_base_dir, models):
    """Run every base prompt CSV through each model and save the responses.

    Every non-hidden ``*.csv`` file directly inside ``input_base_dir`` is read;
    each row's ``PROMPT`` is sent to the model and the answers are written to
    ``output_base_dir/<MODEL>/<MODEL>_<csv name>.csv`` with the columns
    ``MODEL``, ``BIAS CATEGORY``, ``PROMPT`` and ``RESPONSE``.

    Args:
        input_base_dir: Directory containing the base prompt CSVs.
        output_base_dir: Root directory for the result CSVs.
        models: Model identifiers; ``"deepseek"`` and ``"gemini"`` are handled
            (case-insensitive), anything else is reported per prompt and skipped.
    """
    for model in models:
        model_name = model.upper()
        backend = model.lower()

        for entry in os.listdir(input_base_dir):
            # Only visible CSV files are processed.
            is_visible_csv = entry.endswith('.csv') and not entry.startswith('.')
            if not is_visible_csv:
                continue

            source_csv = os.path.join(input_base_dir, entry)
            stem, _extension = os.path.splitext(entry)

            try:
                frame = pd.read_csv(source_csv)
            except Exception as e:
                print(f"Error reading {source_csv}: {e}")
                continue

            # Collect all (prompt, category) pairs before touching the output
            # directory or calling any model.
            pairs = [
                (
                    record["PROMPT"],
                    record["BIAS CATEGORY"] if "BIAS CATEGORY" in record else "",
                )
                for _, record in frame.iterrows()
            ]

            target_dir = os.path.join(output_base_dir, model_name)
            os.makedirs(target_dir, exist_ok=True)

            collected = []
            for prompt, bias_category in pairs:
                if backend == "deepseek":
                    answer = invoke_ds(prompt)
                elif backend == "gemini":
                    answer = invoke_gemini(prompt)
                else:
                    print(f"Unsupported model: {model}")
                    continue  # nothing to record for an unsupported model

                collected.append({
                    "MODEL": model_name,
                    "BIAS CATEGORY": bias_category,
                    "PROMPT": prompt,
                    "RESPONSE": answer,
                })

            if not collected:
                continue

            output_file = os.path.join(target_dir, f"{model_name}_{stem}.csv")
            pd.DataFrame(collected).to_csv(output_file, index=False, encoding="utf-8")
            print(f"Saved {output_file} with {len(collected)} responses")

# **Base prompts**

In [4]:
models = ["deepseek", "gemini"]

# Run all base prompts through every model and report the wall-clock time.
print("Started")
start_time = time.time()
process_all_base_prompts(
    input_base_dir="CLEAR-Bias/base_prompts",
    output_base_dir="results/base_prompts",
    models=models,
)
end_time = time.time()
total_time = (end_time - start_time) / 60  # elapsed time in minutes
hours, minutes = divmod(total_time, 60)
print(f"Total time: {hours} hours and {minutes} minutes")

Started
Saved results/base_prompts\DEEPSEEK\DEEPSEEK_base_prompts_CTO.csv with 100 responses
Saved results/base_prompts\DEEPSEEK\DEEPSEEK_base_prompts_SC.csv with 100 responses
Saved results/base_prompts\GEMINI\GEMINI_base_prompts_CTO.csv with 100 responses
Saved results/base_prompts\GEMINI\GEMINI_base_prompts_SC.csv with 100 responses
Total time: 0.0 hours and 35.0384477853775 minutes


# **Jailbreak prompts**

In [7]:
# Maps each model to the list passed as `categories` (empty: none skipped).
bias_category_no_attack = {"DEEPSEEK": []}

# Run the jailbreak prompts for each listed model and report the wall-clock time.
print("Started")
start_time = time.time()
for model in bias_category_no_attack:
    process_all_jailbreak_prompts(
        "CLEAR-Bias/jailbreak_prompts",
        "results/jailbreak_prompts",
        models=[model],
        categories=bias_category_no_attack[model],
    )
end_time = time.time()
total_time = (end_time - start_time) / 60  # elapsed time in minutes
hours, minutes = divmod(total_time, 60)
print(f"Total time: {hours} hours and {minutes} minutes")

Started
Saved results/jailbreak_prompts\DEEPSEEK\choose_the_option\machine_translation\DEEPSEEK_machine_translation_v1_CTO.csv with 100 responses
Saved results/jailbreak_prompts\DEEPSEEK\choose_the_option\machine_translation\DEEPSEEK_machine_translation_v2_CTO.csv with 100 responses
Saved results/jailbreak_prompts\DEEPSEEK\choose_the_option\machine_translation\DEEPSEEK_machine_translation_v3_CTO.csv with 100 responses
Saved results/jailbreak_prompts\DEEPSEEK\choose_the_option\obfuscation\DEEPSEEK_obfuscation_v1_CTO.csv with 100 responses
Saved results/jailbreak_prompts\DEEPSEEK\choose_the_option\obfuscation\DEEPSEEK_obfuscation_v2_CTO.csv with 100 responses
Saved results/jailbreak_prompts\DEEPSEEK\choose_the_option\obfuscation\DEEPSEEK_obfuscation_v3_CTO.csv with 100 responses
Saved results/jailbreak_prompts\DEEPSEEK\choose_the_option\prefix_injection\DEEPSEEK_prefix_injection_v1_CTO.csv with 100 responses
Saved results/jailbreak_prompts\DEEPSEEK\choose_the_option\prefix_injection\DEEP

JSONDecodeError: Expecting value: line 10 column 1 (char 9)

In [8]:
# Maps each model to the list passed as `categories` (empty: none skipped).
bias_category_no_attack = {"gemini": []}

# Run the jailbreak prompts for each listed model and report the wall-clock time.
print("Started")
start_time = time.time()
for model in bias_category_no_attack:
    process_all_jailbreak_prompts(
        "CLEAR-Bias/jailbreak_prompts",
        "results/jailbreak_prompts",
        models=[model],
        categories=bias_category_no_attack[model],
    )
end_time = time.time()
total_time = (end_time - start_time) / 60  # elapsed time in minutes
hours, minutes = divmod(total_time, 60)
print(f"Total time: {hours} hours and {minutes} minutes")

Started
Saved results/jailbreak_prompts\GEMINI\choose_the_option\machine_translation\GEMINI_machine_translation_v1_CTO.csv with 100 responses
Saved results/jailbreak_prompts\GEMINI\choose_the_option\machine_translation\GEMINI_machine_translation_v2_CTO.csv with 100 responses
Saved results/jailbreak_prompts\GEMINI\choose_the_option\machine_translation\GEMINI_machine_translation_v3_CTO.csv with 100 responses
Saved results/jailbreak_prompts\GEMINI\choose_the_option\obfuscation\GEMINI_obfuscation_v1_CTO.csv with 100 responses
Saved results/jailbreak_prompts\GEMINI\choose_the_option\obfuscation\GEMINI_obfuscation_v2_CTO.csv with 100 responses
Saved results/jailbreak_prompts\GEMINI\choose_the_option\obfuscation\GEMINI_obfuscation_v3_CTO.csv with 100 responses
Saved results/jailbreak_prompts\GEMINI\choose_the_option\prefix_injection\GEMINI_prefix_injection_v1_CTO.csv with 100 responses
Saved results/jailbreak_prompts\GEMINI\choose_the_option\prefix_injection\GEMINI_prefix_injection_v2_CTO.csv