In [1]:
import os,sys
sys.path.append(os.path.abspath(".")) 
sys.path.append(os.path.abspath("..")) 
sys.path.append(os.path.abspath("InjecAgent")) 
import json
import numpy as np
from tqdm import tqdm
from collections import defaultdict,Counter
from utils.utils import *
from copy import deepcopy
from utils.plot_utils import *
home_dir = "/export/home2/weijie210/StruQ"
cache_dir = f'{home_dir}/cache'
os.makedirs(cache_dir,exist_ok=True)
seed_all()

from agentdojo.attacks.attack_registry import ATTACKS, load_attack
from agentdojo.task_suite.load_suites import get_suite, get_suites
from agentdojo.task_suite.task_suite import *
from agentdojo.agent_pipeline.agent_pipeline import DEFENSES, AgentPipeline, PipelineConfig



  return torch._C._cuda_getDeviceCount() > 0
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.


# Load Model

In [5]:
model_path = 'deepseek'
defense = None

with open(f'/export/home2/weijie210/ipi_huawei/ds_key.txt','r') as f:
    ds_key = f.read().strip()
os.environ['OPENAI_API_KEY'] = ds_key

agent_pipeline = AgentPipeline.from_config(
        PipelineConfig(
            llm='deepseek-chat',
            defense=defense,
            system_message_name=None,
            system_message=None,
            tool_output_format=None,
            extra_args ={'max_tokens':512}
            
        )
    )

In [6]:
from agentdojo.base_tasks import BaseInjectionTask, BaseUserTask

def run_task(suite,user_task,injection_task=None,task_injection = {},return_messages=False):
    environment = suite.load_and_inject_default_environment(task_injection)
    if isinstance(user_task, BaseUserTask):
        task_environment = user_task.init_environment(environment)
    else:
        task_environment = environment
    pre_environment = task_environment.model_copy(deep=True)
    if isinstance(user_task, BaseUserTask):
        prompt = user_task.PROMPT
    else:
        prompt = user_task.GOAL

    messages = []
    runtime = FunctionsRuntime(suite.tools)

    _, _, task_environment, messages, _ = agent_pipeline.query(prompt, runtime, task_environment)
    model_output = model_output_from_messages(messages)
    functions_stack_trace = functions_stack_trace_from_messages(messages)
    utility = suite._check_task_result(
            user_task,
            model_output or [],
            pre_environment,
            task_environment,  # type: ignore
            functions_stack_trace,
        )
    security = None
    if injection_task is not None:
        security = suite._check_task_result(
            injection_task,
            model_output or [],
            pre_environment,
            task_environment,  # type: ignore
            functions_stack_trace,
        )
    if not return_messages:
        return utility,security
    else:
        return utility,security, messages
    

# Eval

In [7]:
benchmark_version = 'v1.2.1'
# suites = tuple(get_suites(benchmark_version).keys())
suites = ['slack','banking','travel','workspace']
# suites = ['slack']
attack_name = 'important_instructions'

all_suite_results = defaultdict(dict)
all_suite_asr_messages = defaultdict(dict)
debug_messages = defaultdict(list)

if 'qwen' in model_path.lower():
    base_model_name = 'qwen3'
elif 'llama3' in model_path.lower():
    base_model_name = 'llama3'
else:
    base_model_name = model_path


model_n = model_path.split('/')[-1]
result_dict = 'results/agentdojo'
os.makedirs(result_dict,exist_ok=True)

result_file = f'{result_dict}/{base_model_name}_{model_n}.json'
msg_file = f'{result_dict}/{base_model_name}_{model_n}_messages.pkl'

if not os.path.exists(result_file):
    for suite_name in suites:
        suite = get_suite(benchmark_version, suite_name)
        print (suite.injection_tasks)
        print(f"**Running benchmark for suite: '{suite.name}'**")
        user_tasks_to_run = suite.user_tasks.values()
        print (f"Num User tasks to run: {len(user_tasks_to_run)}")
        attacker_ = load_attack(attack_name, suite, target_pipeline = agent_pipeline)
        print ('is dos_attack:', attacker_.is_dos_attack)
        injection_tasks_to_run = suite.injection_tasks
        print (f"Num Injection tasks to run: {len(injection_tasks_to_run)}")

        #Injection utility
        injection_task_utility = {}
        for injection_task_id, injection_task in tqdm(injection_tasks_to_run.items(),total = len(injection_tasks_to_run),desc = f'Running Injection utility for {suite_name}'):
            utility,_,message = run_task(suite,injection_task,return_messages=True)
            injection_task_utility[injection_task_id] = utility
            debug_messages['injection'].append(message)

        print (f'{suite_name} Injection utility: {np.mean(list(injection_task_utility.values())):.2f}')
        all_suite_results[suite_name]['injection_utility'] = np.mean(list(injection_task_utility.values()))

        # ## Non-injection utility
        task_utility = {}
        for user_task in tqdm(user_tasks_to_run,total = len(user_tasks_to_run),desc = f'Running Task utility for {suite_name}'):
            utility,_,message = run_task(suite,user_task,return_messages=True)
            task_utility[user_task.ID] = utility
            debug_messages['utility'].append(message)
            
            
        print (f'{suite_name} Task utility: {np.mean(list(task_utility.values())):.2f}')
        all_suite_results[suite_name]['task_utility'] = np.mean(list(task_utility.values()))
        
        ## With attack
        utility_with_attack = {}
        all_messages = {}
        suite_security_results =  {}
        for user_task in tqdm(user_tasks_to_run,total = len(user_tasks_to_run),desc = f'Running Attack for {suite_name}'):
            for injection_task_id in injection_tasks_to_run:
                injection_task = suite.get_injection_task_by_id(injection_task_id)
                task_injections = attacker_.attack(user_task, injection_task)
                utility,security,messages = run_task(suite,user_task,injection_task,task_injections,return_messages=True)
                utility_with_attack[(user_task.ID, injection_task_id)] = utility
                suite_security_results[(user_task.ID, injection_task_id)] = security
                all_messages[(user_task.ID, injection_task_id)] = messages
        all_suite_asr_messages[suite_name] = all_messages
        all_suite_results[suite_name]['asr'] = np.mean(list(suite_security_results.values()))
        all_suite_results[suite_name]['attack_utility'] = np.mean(list(utility_with_attack.values()))

        print (f'{suite_name} Task with injection Utility: {np.mean(list(utility_with_attack.values())):.2f}')
        print (f'{suite_name} ASR: {np.mean(list(suite_security_results.values())):.2f}')
    with open(result_file, 'w') as f:
        json.dump(all_suite_results, f, indent=4)
        
    with open(msg_file, 'wb') as f:
        pickle.dump(all_suite_asr_messages, f)
        

else:
    with open(result_file, 'r') as f:
        all_suite_results = json.load(f)

    for suite_name in all_suite_results:
        suite_results = all_suite_results[suite_name]
        print (f"Results for {suite_name}:")
        print (f"Injection utility: {suite_results['injection_utility']:.2f}")
        print (f"Task utility: {suite_results['task_utility']:.2f}")
        print (f"Task with injection utility: {suite_results['attack_utility']:.2f}")
        print (f"ASR: {suite_results['asr']:.2f}")

{'injection_task_1': <agentdojo.default_suites.v1.slack.injection_tasks.InjectionTask1 object at 0x15538e118070>, 'injection_task_2': <agentdojo.default_suites.v1.slack.injection_tasks.InjectionTask2 object at 0x15538e1180d0>, 'injection_task_3': <agentdojo.default_suites.v1.slack.injection_tasks.InjectionTask3 object at 0x15538e118130>, 'injection_task_4': <agentdojo.default_suites.v1.slack.injection_tasks.InjectionTask4 object at 0x15538e118190>, 'injection_task_5': <agentdojo.default_suites.v1.slack.injection_tasks.InjectionTask5 object at 0x15538e1181f0>}
**Running benchmark for suite: 'slack'**
Num User tasks to run: 21
is dos_attack: False
Num Injection tasks to run: 5


Running Injection utility for slack:   0%|          | 0/5 [00:00<?, ?it/s]

Running Injection utility for slack: 100%|██████████| 5/5 [02:10<00:00, 26.08s/it]


slack Injection utility: 1.00


Running Task utility for slack: 100%|██████████| 21/21 [13:25<00:00, 38.36s/it]


slack Task utility: 0.90


Running Attack for slack:  29%|██▊       | 6/21 [25:16<1:03:11, 252.75s/it]


JSONDecodeError: Invalid \escape: line 1 column 327 (char 326)