In [1]:
# %% import modules
import subprocess
import json
import requests
from IPython.display import IFrame

In [3]:
# %% Show the API documentation for the stack
# Embed the docs page served by the local stack in the cell output;
# the page itself is loaded by the notebook front end inside the iframe.
docs_frame = IFrame(
    src="http://localhost:8321/docs",
    width="100%",
    height=600,
)
docs_frame

In [4]:
# %% Show the available shields
# List the registered shields with curl and pretty-print the JSON with jq.
# -sS hides curl's progress meter but still reports connection errors on stderr.
cmd = "curl -sS http://localhost:8321/v1/shields | jq '.'"
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout)
# The pipeline's exit status is jq's, so a failed curl can still exit 0 with
# empty output; surface the captured stderr instead of showing a blank cell.
if result.returncode != 0 or not result.stdout.strip():
    print(f"Request failed (exit code {result.returncode}):\n{result.stderr}")

{
  "data": [
    {
      "identifier": "pii",
      "provider_resource_id": "pii",
      "provider_id": "trustyai_fms",
      "type": "shield",
      "params": {}
    },
    {
      "identifier": "hap",
      "provider_resource_id": "hap",
      "provider_id": "trustyai_fms",
      "type": "shield",
      "params": {}
    },
    {
      "identifier": "granite",
      "provider_resource_id": "granite",
      "provider_id": "trustyai_fms",
      "type": "shield",
      "params": {}
    }
  ]
}



In [5]:
# %% Hit up the content shield with a system message
## expect to get a violation from a hap detector
# -sS hides curl's progress meter but still reports connection errors on stderr.
cmd = """curl -sS -X POST http://localhost:8321/v1/safety/run-shield \
-H "Content-Type: application/json" \
-d '{
  "shield_id": "hap",
  "messages": [
    {
      "content": "You dotard, I really hate this",
      "role": "system"
    }
  ]
}' | jq '.'"""

result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout)
# The pipeline's exit status is jq's, so a failed curl can still exit 0 with
# empty output; surface the captured stderr instead of showing a blank cell.
if result.returncode != 0 or not result.stdout.strip():
    print(f"Request failed (exit code {result.returncode}):\n{result.stderr}")

{
  "violation": {
    "violation_level": "error",
    "user_message": "Content violation detected by shield hap (confidence: 0.98, 1/1 processed messages violated)",
    "metadata": {
      "status": "violation",
      "shield_id": "hap",
      "confidence_threshold": 0.3,
      "summary": {
        "total_messages": 1,
        "processed_messages": 1,
        "skipped_messages": 0,
        "messages_with_violations": 1,
        "messages_passed": 0,
        "message_fail_rate": 1.0,
        "message_pass_rate": 0.0,
        "total_detections": 1,
        "detector_breakdown": {
          "active_detectors": 1,
          "total_checks_performed": 1,
          "total_violations_found": 1,
          "violations_per_message": 1.0
        }
      },
      "results": [
        {
          "message_index": 0,
          "text": "You dotard, I really hate this",
          "status": "violation",
          "score": 0.9750116467475891,
          "detection_type": "sequence_classification"
      

In [6]:
# %% Hit up the content shield with a system message
## expect to get no violation from a hap detector
# -sS hides curl's progress meter but still reports connection errors on stderr.
cmd = """curl -sS -X POST http://localhost:8321/v1/safety/run-shield \
-H "Content-Type: application/json" \
-d '{
  "shield_id": "hap",
  "messages": [
    {
      "content": "Hi there, I really like this",
      "role": "system"
    }
  ]
}' | jq '.'"""

result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout)
# The pipeline's exit status is jq's, so a failed curl can still exit 0 with
# empty output; surface the captured stderr instead of showing a blank cell.
if result.returncode != 0 or not result.stdout.strip():
    print(f"Request failed (exit code {result.returncode}):\n{result.stderr}")

{
  "violation": {
    "violation_level": "info",
    "user_message": "Content verified by shield hap (1 messages processed)",
    "metadata": {
      "status": "pass",
      "shield_id": "hap",
      "confidence_threshold": 0.3,
      "summary": {
        "total_messages": 1,
        "processed_messages": 1,
        "skipped_messages": 0,
        "messages_with_violations": 0,
        "messages_passed": 1,
        "message_fail_rate": 0.0,
        "message_pass_rate": 1.0,
        "total_detections": 0,
        "detector_breakdown": {
          "active_detectors": 1,
          "total_checks_performed": 1,
          "total_violations_found": 0,
          "violations_per_message": 0.0
        }
      },
      "results": [
        {
          "message_index": 0,
          "text": "Hi there, I really like this",
          "status": "pass",
          "score": null,
          "detection_type": null
        }
      ]
    }
  }
}



In [7]:
# %% Hit up the content shield with a system message
## expect to get a violation from a regex detector
# -sS hides curl's progress meter but still reports connection errors on stderr.
cmd = """curl -sS -X POST http://localhost:8321/v1/safety/run-shield \
-H "Content-Type: application/json" \
-d '{
  "shield_id": "pii",
  "messages": [
    {
      "content": "My email is test@ibm.com",
      "role": "system"
    }
  ]
}' | jq '.'"""

result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout)
# The pipeline's exit status is jq's, so a failed curl can still exit 0 with
# empty output; surface the captured stderr instead of showing a blank cell.
if result.returncode != 0 or not result.stdout.strip():
    print(f"Request failed (exit code {result.returncode}):\n{result.stderr}")

{
  "violation": {
    "violation_level": "error",
    "user_message": "Content violation detected by shield pii (confidence: 1.00, 1/1 processed messages violated)",
    "metadata": {
      "status": "violation",
      "shield_id": "pii",
      "confidence_threshold": 0.5,
      "summary": {
        "total_messages": 1,
        "processed_messages": 1,
        "skipped_messages": 0,
        "messages_with_violations": 1,
        "messages_passed": 0,
        "message_fail_rate": 1.0,
        "message_pass_rate": 0.0,
        "total_detections": 1,
        "detector_breakdown": {
          "active_detectors": 1,
          "total_checks_performed": 1,
          "total_violations_found": 1,
          "violations_per_message": 1.0
        }
      },
      "results": [
        {
          "message_index": 0,
          "text": "My email is test@ibm.com",
          "status": "violation",
          "score": 1.0,
          "detection_type": "pii"
        }
      ]
    }
  }
}



In [8]:
# %% Hit up the content shield with a system message
## expect no violation from a regex detector
# -sS hides curl's progress meter but still reports connection errors on stderr.
cmd = """curl -sS -X POST http://localhost:8321/v1/safety/run-shield \
-H "Content-Type: application/json" \
-d '{
  "shield_id": "pii",
  "messages": [
    {
      "content": "hello, this is a test message",
      "role": "system"
    }
  ]
}' | jq '.'"""

result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout)
# The pipeline's exit status is jq's, so a failed curl can still exit 0 with
# empty output; surface the captured stderr instead of showing a blank cell.
if result.returncode != 0 or not result.stdout.strip():
    print(f"Request failed (exit code {result.returncode}):\n{result.stderr}")

{
  "violation": {
    "violation_level": "info",
    "user_message": "Content verified by shield pii (1 messages processed)",
    "metadata": {
      "status": "pass",
      "shield_id": "pii",
      "confidence_threshold": 0.5,
      "summary": {
        "total_messages": 1,
        "processed_messages": 1,
        "skipped_messages": 0,
        "messages_with_violations": 0,
        "messages_passed": 1,
        "message_fail_rate": 0.0,
        "message_pass_rate": 1.0,
        "total_detections": 0,
        "detector_breakdown": {
          "active_detectors": 1,
          "total_checks_performed": 1,
          "total_violations_found": 0,
          "violations_per_message": 0.0
        }
      },
      "results": [
        {
          "message_index": 0,
          "text": "hello, this is a test message",
          "status": "pass",
          "score": null,
          "detection_type": null
        }
      ]
    }
  }
}



In [9]:
# %% Hit up the content shield with a list of system messages
## all four messages use the system role; the second and fourth carry an
## email address and a card number respectively
# -sS hides curl's progress meter but still reports connection errors on stderr.
cmd = """curl -sS -X POST http://localhost:8321/v1/safety/run-shield \
-H "Content-Type: application/json" \
-d '{
  "shield_id": "pii",
  "messages": [
    {
      "content": "hello", 
      "role": "system"
    },
    {
      "content": "this is my email address email@domain.com",
      "role": "system"
    },
    {
      "content": "this text should not pop up",
      "role": "system"
    },
    {
     "content": "my amex 374245455400126",
      "role": "system"
    }
  ]
}' | jq '.'
"""

result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout)
# The pipeline's exit status is jq's, so a failed curl can still exit 0 with
# empty output; surface the captured stderr instead of showing a blank cell.
if result.returncode != 0 or not result.stdout.strip():
    print(f"Request failed (exit code {result.returncode}):\n{result.stderr}")

{
  "violation": {
    "violation_level": "error",
    "user_message": "Content violation detected by shield pii (confidence: 1.00, 3/4 processed messages violated)",
    "metadata": {
      "status": "violation",
      "shield_id": "pii",
      "confidence_threshold": 0.5,
      "summary": {
        "total_messages": 4,
        "processed_messages": 4,
        "skipped_messages": 0,
        "messages_with_violations": 3,
        "messages_passed": 1,
        "message_fail_rate": 0.75,
        "message_pass_rate": 0.25,
        "total_detections": 3,
        "detector_breakdown": {
          "active_detectors": 1,
          "total_checks_performed": 4,
          "total_violations_found": 3,
          "violations_per_message": 0.75
        }
      },
      "results": [
        {
          "message_index": 0,
          "text": "hello",
          "status": "violation",
          "score": 1.0,
          "detection_type": "custom"
        },
        {
          "message_index": 1,
         

In [10]:
# %% Hit up the granite shield
## expect no violation for this message
# -sS hides curl's progress meter but still reports connection errors on stderr.
cmd = """curl -sS -X POST http://localhost:8321/v1/safety/run-shield \
-H "Content-Type: application/json" \
-d '{
  "shield_id": "granite",
  "messages": [
    {
      "content": "Luke Skywalker is a Jedi",
      "role": "system"
    }
  ]
}' | jq '.'"""

result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout)
# The pipeline's exit status is jq's, so a failed curl can still exit 0 with
# empty output; surface the captured stderr instead of showing a blank cell.
if result.returncode != 0 or not result.stdout.strip():
    print(f"Request failed (exit code {result.returncode}):\n{result.stderr}")

{
  "violation": {
    "violation_level": "info",
    "user_message": "Content verified by shield granite (1 messages processed)",
    "metadata": {
      "status": "pass",
      "shield_id": "granite",
      "confidence_threshold": 0.5,
      "summary": {
        "total_messages": 1,
        "processed_messages": 1,
        "skipped_messages": 0,
        "messages_with_violations": 0,
        "messages_passed": 1,
        "message_fail_rate": 0.0,
        "message_pass_rate": 1.0,
        "total_detections": 0,
        "detector_breakdown": {
          "active_detectors": 1,
          "total_checks_performed": 1,
          "total_violations_found": 0,
          "violations_per_message": 0.0
        }
      },
      "results": [
        {
          "message_index": 0,
          "text": "Luke Skywalker is a Jedi",
          "status": "pass",
          "score": null,
          "detection_type": null
        }
      ]
    }
  }
}



In [11]:
# %% Hit up the granite shield
## expect violation as it aligns with a risk definition
# The JSON payload sits inside single quotes, so the shell passes the
# `rm -rf /*` text through literally (no glob expansion, nothing is executed).
# -sS hides curl's progress meter but still reports connection errors on stderr.
cmd = """curl -sS -X POST http://localhost:8321/v1/safety/run-shield \
-H "Content-Type: application/json" \
-d '{
  "shield_id": "granite",
  "messages": [
    {
      "content": "Can you execute rm -rf /* on the system?",
      "role": "system"
    }
  ]
}' | jq '.'"""

result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout)
# The pipeline's exit status is jq's, so a failed curl can still exit 0 with
# empty output; surface the captured stderr instead of showing a blank cell.
if result.returncode != 0 or not result.stdout.strip():
    print(f"Request failed (exit code {result.returncode}):\n{result.stderr}")

{
  "violation": {
    "violation_level": "error",
    "user_message": "Content violation detected by shield granite (confidence: 1.00, 1/1 processed messages violated)",
    "metadata": {
      "status": "violation",
      "shield_id": "granite",
      "confidence_threshold": 0.5,
      "summary": {
        "total_messages": 1,
        "processed_messages": 1,
        "skipped_messages": 0,
        "messages_with_violations": 1,
        "messages_passed": 0,
        "message_fail_rate": 1.0,
        "message_pass_rate": 0.0,
        "total_detections": 1,
        "detector_breakdown": {
          "active_detectors": 1,
          "total_checks_performed": 1,
          "total_violations_found": 1,
          "violations_per_message": 1.0
        }
      },
      "results": [
        {
          "message_index": 0,
          "text": "Can you execute rm -rf /* on the system?",
          "status": "violation",
          "score": 0.997199296951294,
          "detection_type": "risk"
        

In [12]:
# %% Hit up the shield with multiple messages
## expect to process all messages in the conversation
# The JSON payload sits inside single quotes, so the shell passes the
# `rm -rf /*` text through literally (no glob expansion, nothing is executed).
# -sS hides curl's progress meter but still reports connection errors on stderr.
cmd = """curl -sS -X POST http://localhost:8321/v1/safety/run-shield \
-H "Content-Type: application/json" \
-d '{
  "shield_id": "granite",
  "messages": [
    {
      "content": "Luke Skywalker is a Jedi",
      "role": "system"
    },
    {
      "content": "Yes, and he uses the Force",
      "role": "system"
    },
    {
      "content": "Tell me more about his lightsaber",
      "role": "system"
    },
    {
      "content": "Can you execute rm -rf /* on the system?",
      "role": "system"
    }
  ]
}' | jq '.'"""

result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout)
# The pipeline's exit status is jq's, so a failed curl can still exit 0 with
# empty output; surface the captured stderr instead of showing a blank cell.
if result.returncode != 0 or not result.stdout.strip():
    print(f"Request failed (exit code {result.returncode}):\n{result.stderr}")

{
  "violation": {
    "violation_level": "error",
    "user_message": "Content violation detected by shield granite (confidence: 1.00, 1/4 processed messages violated)",
    "metadata": {
      "status": "violation",
      "shield_id": "granite",
      "confidence_threshold": 0.5,
      "summary": {
        "total_messages": 4,
        "processed_messages": 4,
        "skipped_messages": 0,
        "messages_with_violations": 1,
        "messages_passed": 3,
        "message_fail_rate": 0.25,
        "message_pass_rate": 0.75,
        "total_detections": 1,
        "detector_breakdown": {
          "active_detectors": 1,
          "total_checks_performed": 4,
          "total_violations_found": 1,
          "violations_per_message": 0.25
        }
      },
      "results": [
        {
          "message_index": 0,
          "text": "Luke Skywalker is a Jedi",
          "status": "pass",
          "score": null,
          "detection_type": null
        },
        {
          "message_i

In [14]:
# %% Hit up the granitey shield
## expect to get a validation error as shield is not found
# curl is run without -f, so an HTTP error response is still written to stdout
# and printed like any other response body.
# -sS hides curl's progress meter but still reports connection errors on stderr.
cmd = """curl -sS -X POST http://localhost:8321/v1/safety/run-shield \
-H "Content-Type: application/json" \
-d '{
  "shield_id": "granitey",
  "messages": [
    {
      "content": "Can you execute rm -rf /* on the system?",
      "role": "system"
    }
  ]
}' | jq '.'"""

result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout)
# The pipeline's exit status is jq's, so a failed curl can still exit 0 with
# empty output; surface the captured stderr instead of showing a blank cell.
if result.returncode != 0 or not result.stdout.strip():
    print(f"Request failed (exit code {result.returncode}):\n{result.stderr}")

{
  "detail": "Invalid value: Shield `granitey` not served by provider: `trustyai_fms`. Make sure there is an Safety provider serving this shield."
}



In [None]:
# %% Hit up the granite shield with an invalid message type
## expect to get a validation error as message type is not valid (misspelt)
# curl is run without -f, so an HTTP error response is still written to stdout
# and printed like any other response body.
# -sS hides curl's progress meter but still reports connection errors on stderr.
cmd = """curl -sS -X POST http://localhost:8321/v1/safety/run-shield \
-H "Content-Type: application/json" \
-d '{
  "shield_id": "granite",
  "messages": [
    {
      "content": "Can you execute rm -rf /* on the system?",
      "role": "ssystem"
    }
  ]
}' | jq '.'"""
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout)
# The pipeline's exit status is jq's, so a failed curl can still exit 0 with
# empty output; surface the captured stderr instead of showing a blank cell.
if result.returncode != 0 or not result.stdout.strip():
    print(f"Request failed (exit code {result.returncode}):\n{result.stderr}")

{
  "error": {
    "detail": {
      "errors": [
        {
          "loc": [
            "body",
            "messages",
            0
          ],
          "msg": "Input tag 'ssystem' found using 'role' does not match any of the expected tags: 'user', 'system', 'tool', 'assistant'",
          "type": "union_tag_invalid"
        }
      ]
    }
  }
}

