### This notebook generates realistic compliance evaluation scenarios by leveraging AWS Bedrock's Retrieval-Augmented Generation (RAG) capabilities using the NIST control framework and specific organizational policies generated against that framework. It generates 1,000 (500 compliant, 500 non-compliant) complex, multi-policy scenarios that simulate real-world compliance situations (like employee onboarding, data access requests, or security incidents).


In [1]:
# Import required libraries
import boto3  # AWS SDK for Python
import json   # JSON handling
import time   # For rate limiting between API calls
from typing import List, Dict  # Type hints

OUTPUT_BUCKET = '183023889407-us-east-1-compliance-rule-generator'
OUTPUT_PREFIX = 'scenarios/'  # Folder path for results

# Configuration constants
KNOWLEDGE_BASE_ID = 'T8EW10IU3Z'  # AWS Bedrock Knowledge Base containing NIST policies

# Available Bedrock model ARNs with performance notes
MODELS = {
    'premium': 'arn:aws:bedrock:us-east-1:183023889407:inference-profile/global.anthropic.claude-opus-4-5-20251101-v1:0', # not available
    'good': 'arn:aws:bedrock:us-east-1:183023889407:inference-profile/global.anthropic.claude-sonnet-4-5-20250929-v1:0', # times out
    'balanced': 'arn:aws:bedrock:us-east-1:183023889407:inference-profile/us.anthropic.claude-sonnet-4-20250514-v1:0',  # recommended
    'fast_cheap': 'arn:aws:bedrock:us-east-1:183023889407:inference-profile/us.anthropic.claude-haiku-4-5-20251001-v1:0',
    'aws_native_premier': 'arn:aws:bedrock:us-east-1:183023889407:inference-profile/us.amazon.nova-premier-v1:0',
    'aws_native_pro': 'arn:aws:bedrock:us-east-1:183023889407:inference-profile/us.amazon.nova-pro-v1:0'
}
MODEL_ARN = MODELS['good']  # Default model selection

# Initialize AWS Bedrock clients
bedrock_agent_runtime = boto3.client('bedrock-agent-runtime', region_name='us-east-1')  # For knowledge base retrieval
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-east-1')  # For model inference

# JSON tool configuration for Bedrock Converse API
# Forces the model to return structured JSON with specific schema
TOOL_CONFIG = {
    "tools": [{
        "toolSpec": {
            "name": "scenario_json",
            "description": "Return compliance scenarios as JSON",
            "inputSchema": {
                "json": {
                    "type": "object",
                    "properties": {
                        "scenarios": {  # Array of scenario objects
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "scenario-id": {"type": "string"},      # Format: scenario-id-1, scenario-id-2, etc.
                                    "scenario-detail": {"type": "string"},  # Detailed scenario description (200+ words)
                                    "is-compliant": {"type": "boolean"}     # True if compliant, False if non-compliant
                                },
                                "required": ["scenario-id", "scenario-detail", "is-compliant"]
                            }
                        }
                    },
                    "required": ["scenarios"]
                }
            }
        }
    }],
    "toolChoice": {"tool": {"name": "scenario_json"}}  # Force use of the JSON tool
}

In [2]:
def retrieve_kb_context(knowledge_base_id: str, batch_size: int) -> str:
    """
    Retrieve comprehensive policy context from AWS Bedrock Knowledge Base.
    
    This function queries the knowledge base for policies covering all 20 NIST control families
    to ensure comprehensive coverage for scenario generation.
    
    Args:
        knowledge_base_id: AWS Bedrock Knowledge Base ID containing NIST policies
        batch_size: Number of scenarios per batch (used to calculate query scope)
    
    Returns:
        Combined text of all retrieved policy documents, separated by double newlines
    """
    # Comprehensive query covering all 20 NIST 800-53 control families
    kb_query = f"""Retrieve {batch_size * 15} different company policies covering NIST control families including: AC (Access Control), AT (Awareness and Training), AU (Audit and Accountability),
    CA (Assessment, Authorization, and Monitoring), CM (Configuration Management), CP (Contingency Planning), IA (Identification and Authentication), IR (Incident Response), MA (Maintenance),
    MP (Media Protection), PE (Physical and Environmental Protection), PL (Planning), PM (Program Management), PS (Personnel Security), PT (PII Processing and Transparency), RA (Risk Assessment),
    SA (System and Services Acquisition), SC (System and Communications Protection), SI (System and Information Integrity), SR (Supply Chain Risk Management)."""
    
    # Query the knowledge base using vector search
    kb_response = bedrock_agent_runtime.retrieve(
        knowledgeBaseId=knowledge_base_id,
        retrievalQuery={'text': kb_query},
        retrievalConfiguration={'vectorSearchConfiguration': {'numberOfResults': 100}}  # Max results for comprehensive context
    )
    
    # Extract text content from all retrieved results
    all_context = [result['content']['text'] for result in kb_response.get('retrievalResults', [])]
    print(f"Retrieved {len(all_context)} context chunks from knowledge base")
    
    # Combine all context with double newlines for readability
    return '\n\n'.join(all_context)

In [3]:
def generate_scenario_batch(context: str, batch_num: int, batch_size: int, model_arn: str) -> List[Dict]:
    """
    Generate one batch of compliance scenarios using Bedrock Converse API.
    
    This function creates realistic compliance scenarios by alternating between
    compliant and non-compliant scenarios across batches. Uses the JSON tool
    to ensure structured output.
    
    Args:
        context: Combined policy text from knowledge base
        batch_num: Current batch number (0-indexed)
        batch_size: Number of scenarios to generate in this batch
        model_arn: AWS Bedrock model ARN or ID
    
    Returns:
        List of scenario dictionaries with keys: scenario-id, scenario-detail, is-compliant
    """
    # Alternate between compliant (even batches) and non-compliant (odd batches) scenarios
    is_compliant = batch_num % 2 == 0
    
    # Calculate starting ID for consecutive numbering across all batches
    start_id = batch_num * batch_size + 1
    
    # Construct detailed prompt with all policy context and specific requirements
    prompt = f"""Based on the following comprehensive compliance policies and NIST controls, generate {batch_size} realistic compliance scenarios 
    that are {'compliant' if is_compliant else 'non-compliant.  Be subtle, ensuring that only one underlying policy is violated out of all the underlying policies for each scenario.'}.
    
    KNOWLEDGE BASE CONTEXT:
    {context}

    Each scenario must:
    - Reference multiple policies from the context above
    - Include specific business details (roles, systems, data, actions)
    - Be realistic and detailed (200+ words)
    - Have scenario-id format: scenario-id-{start_id}, scenario-id-{start_id+1}, etc.
    
    Return as JSON array with fields: scenario-id, scenario-detail, is-compliant"""
    
    # Extract model ID from ARN (Converse API requires model ID, not full ARN)
    model_id = model_arn.split('/')[-1] if '/' in model_arn else model_arn
    
    # Call Bedrock Converse API with JSON tool enforcement
    response = bedrock_runtime.converse(
        modelId=model_id,
        messages=[{"role": "user", "content": [{"text": prompt}]}],
        toolConfig=TOOL_CONFIG,  # Forces structured JSON output
        inferenceConfig={"maxTokens": 4096, "temperature": 0.7}  # Allow creative but controlled generation, about 3,000 words max
    )
    
    # Extract scenarios from tool use response
    if response.get('stopReason') == 'tool_use':
        for content_block in response['output']['message']['content']:
            if 'toolUse' in content_block:
                scenarios_data = content_block['toolUse']['input']
                return scenarios_data.get('scenarios', [])
    
    # Return empty list if no tool use or scenarios found
    return []

In [4]:
def generate_compliance_scenarios(
    knowledge_base_id: str = KNOWLEDGE_BASE_ID,
    model_arn: str = MODEL_ARN,
    batch_size: int = 10,
    total_scenarios: int = 100
) -> List[Dict]:
    """
    Main orchestrator function for generating compliance scenarios.
    
    This function coordinates the entire scenario generation process:
    1. Retrieves comprehensive policy context from knowledge base (once)
    2. Generates scenarios in batches to manage API limits and costs
    3. Alternates between compliant and non-compliant scenarios
    4. Implements rate limiting to avoid API throttling
    
    Args:
        knowledge_base_id: AWS Bedrock Knowledge Base ID
        model_arn: AWS Bedrock model ARN to use for generation
        batch_size: Number of scenarios per batch (default: 10)
        total_scenarios: Total number of scenarios to generate (default: 100)
    
    Returns:
        List of all generated scenario dictionaries
    """
    
    # Generate scenarios in batches
    all_scenarios = []
    for batch_num in range(total_scenarios // batch_size):
      # Retrieve all policy context once (reused across all batches)
        try:
            context = retrieve_kb_context(knowledge_base_id, batch_size)
        except Exception as e:
            print(f"Error retrieving from KB: {e}")
            return []
        try:
            # Generate one batch of scenarios
            scenarios = generate_scenario_batch(context, batch_num, batch_size, model_arn)
            all_scenarios.extend(scenarios)
            print(f"Batch {batch_num + 1}: Generated {len(scenarios)} scenarios")
        except Exception as e:
            print(f"Error generating scenario batch {batch_num + 1}: {e}")
            continue  # excplicit to go to next scenario, maybe this was a temporary glitch
        
        # Rate limiting: pause between batches to avoid API throttling
        time.sleep(2)
    
    return all_scenarios

In [5]:
def save_scenarios_to_file(scenarios: List[Dict], output_path: str):
    """
    Save generated scenarios to a JSON file with metadata.
    
    Creates a structured JSON file containing:
    - Summary statistics (total, compliant, non-compliant counts)
    - All generated scenarios
    """
    # Print scenarios to console for immediate review
    print(json.dumps(scenarios, indent=2))
    
    # Save to file with metadata and statistics
    with open(output_path, 'w') as f:
        json.dump({
            'total_scenarios': len(scenarios),
            'compliant_count': sum(1 for s in scenarios if s['is-compliant']),
            'non_compliant_count': sum(1 for s in scenarios if not s['is-compliant']),
            'scenarios': scenarios
        }, f, indent=2)

In [6]:
def save_scenarios_to_s3(scenarios: List[Dict], output_bucket: str = OUTPUT_BUCKET, output_prefix: str = OUTPUT_PREFIX, object_name: str = "scenarios.json"):
    """
    Save generated scenarios to a S3.

    """
    s3 = boto3.client('s3')
    json_data = json.dumps({"scenarios": scenarios}, indent=2)
    s3.put_object(Bucket=output_bucket, Key=output_prefix+object_name, Body=json_data)


In [7]:
# Example usage: Generate 4 scenarios in 2 batches of 2 each
# Batch 0 (even): compliant scenarios with IDs scenario-id-1, scenario-id-2
# Batch 1 (odd): non-compliant scenarios with IDs scenario-id-3, scenario-id-4
scenarios = generate_compliance_scenarios(
    knowledge_base_id=KNOWLEDGE_BASE_ID,
    model_arn=MODELS['balanced'],
    batch_size=2,
    total_scenarios=4
)
save_scenarios_to_file(scenarios, '/home/sagemaker-user/scenarios.json')
save_scenarios_to_s3(scenarios, OUTPUT_BUCKET, OUTPUT_PREFIX, "scenarios.json")


Retrieved 100 context chunks from knowledge base
Batch 1: Generated 2 scenarios
Retrieved 100 context chunks from knowledge base
Batch 2: Generated 2 scenarios
[
  {
    "scenario-id": "scenario-id-1",
    "scenario-detail": "TechCorp, a Fortune 500 technology organization with 12,000+ employees across multiple business units, is implementing a comprehensive supply chain risk management program following a security incident involving a third-party vendor. The Chief Information Security Officer (CISO) has developed an organization-level Supply Chain Risk Management Policy (SR-1) that addresses purpose, scope, roles, responsibilities, management commitment, coordination among organizational entities, and compliance with SOX, FISMA, and PCI-DSS requirements. The policy designates the Chief Risk Officer (CRO) to manage policy development and dissemination across all business units within 90 days. Additionally, TechCorp has established a Supply Chain Risk Management Plan (SR-2) that identif