Skip to content

Port from safety to redteaming #201

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jun 2, 2025
Previous commit
Next commit
Merge branch 'main' into redteam
Branch information:
pamelafox committed May 14, 2025
commit 606ef1dd5e2d11aaf5a427de3534a957a91ae52e
4 changes: 3 additions & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -38,7 +38,9 @@
"ms-vscode.vscode-node-azure-pack",
"esbenp.prettier-vscode",
"twixes.pypi-assistant",
"ms-python.vscode-python-envs"
"ms-python.vscode-python-envs",
"teamsdevapp.vscode-ai-foundry",
"ms-windows-ai-studio.windows-ai-studio"
],
// Set *default* container specific settings.json values on container create.
"settings": {
40 changes: 21 additions & 19 deletions evals/safety_evaluation.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import argparse
import asyncio
import datetime
import logging
import os
import pathlib
import sys
from typing import Optional

import requests
from azure.ai.evaluation import AzureAIProject
@@ -52,7 +54,7 @@ async def callback(
return {"messages": messages + [message]}


async def run_simulator(target_url: str, max_simulations: int):
async def run_simulator(target_url: str, max_simulations: int, scan_name: Optional[str] = None):
credential = get_azure_credential()
azure_ai_project: AzureAIProject = {
"subscription_id": os.getenv("AZURE_SUBSCRIPTION_ID"),
@@ -64,26 +66,25 @@ async def run_simulator(target_url: str, max_simulations: int):
credential=credential,
risk_categories=[
RiskCategory.Violence,
# RiskCategory.HateUnfairness,
# RiskCategory.Sexual,
# RiskCategory.SelfHarm,
RiskCategory.HateUnfairness,
RiskCategory.Sexual,
RiskCategory.SelfHarm,
],
num_objectives=1,
)
if scan_name is None:
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
scan_name = f"Safety evaluation {timestamp}"
await model_red_team.scan(
target=lambda messages, stream=False, session_state=None, context=None: callback(messages, target_url),
scan_name="Advanced-Callback-Scan",
scan_name=scan_name,
attack_strategies=[
AttackStrategy.EASY, # Group of easy complexity attacks
# AttackStrategy.MODERATE, # Group of moderate complexity attacks
# AttackStrategy.CharacterSpace, # Add character spaces
# AttackStrategy.ROT13, # Use ROT13 encoding
# AttackStrategy.UnicodeConfusable, # Use confusable Unicode characters
# AttackStrategy.CharSwap, # Swap characters in prompts
# AttackStrategy.Morse, # Encode prompts in Morse code
# AttackStrategy.Leetspeak, # Use Leetspeak
# AttackStrategy.Url, # Use URLs in prompts
# AttackStrategy.Binary, # Encode prompts in binary
AttackStrategy.DIFFICULT,
AttackStrategy.Baseline,
AttackStrategy.UnicodeConfusable, # Use confusable Unicode characters
AttackStrategy.Morse, # Encode prompts in Morse code
AttackStrategy.Leetspeak, # Use Leetspeak
AttackStrategy.Url, # Use URLs in prompts
],
output_path="Advanced-Callback-Scan.json",
)
@@ -97,28 +98,29 @@ async def run_simulator(target_url: str, max_simulations: int):
parser.add_argument(
"--max_simulations", type=int, default=200, help="Maximum number of simulations (question/response pairs)."
)
# argument for the name
parser.add_argument("--scan_name", type=str, default=None, help="Name of the safety evaluation (optional).")
args = parser.parse_args()

# Configure logging to show tracebacks for warnings and above
logging.basicConfig(
level=logging.DEBUG,
level=logging.WARNING,
format="%(message)s",
datefmt="[%X]",
handlers=[RichHandler(rich_tracebacks=False, show_path=True)],
)

# Set urllib3 and azure libraries to WARNING level to see connection issues
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("azure").setLevel(logging.DEBUG)
logging.getLogger("RedTeamLogger").setLevel(logging.DEBUG)
logging.getLogger("azure").setLevel(logging.WARNING)

# Set our application logger to INFO level
logger.setLevel(logging.INFO)

load_azd_env()

try:
asyncio.run(run_simulator(args.target_url, args.max_simulations))
asyncio.run(run_simulator(args.target_url, args.max_simulations, args.scan_name))
except Exception:
logging.exception("Unhandled exception in safety evaluation")
sys.exit(1)
You are viewing a condensed version of this merge commit. The full changes are available on the commit page on GitHub.