## Prerequisites

Before you can use Amazon Bedrock, you must carry out the following steps:

* Sign up for an AWS account (if you don't already have one) and create an IAM role with the necessary permissions for Amazon Bedrock; see AWS Account and IAM Role.

* Request access to the foundation models (FMs) that you want to use; see Request access to FMs.


In this notebook, you will use the following foundation models in the us-east-1 (N. Virginia) Region:

| Provider Name | Foundation Model Name | Model Id |
| ------------- | --------------------- | ------------- |
| Amazon        | Nova Pro              | us.amazon.nova-pro-v1:0 |
| Mistral       | Mistral Large         | mistral.mistral-large-2402-v1:0 |

In [1]:
import boto3
import json
import pandas as pd
import uuid
from IPython.display import JSON, display, IFrame, Markdown
import os
import shutil
from urllib.parse import urlparse
import sagemaker

# Explicit boto3 session; the Bedrock and S3 clients in this notebook are created from it.
session = boto3.Session()

# Look up the caller's AWS account ID through STS and read the default region
# from boto3's configuration. Both values are reused by later cells.
sts_client = boto3.client("sts")
account_id = sts_client.get_caller_identity()["Account"]
region_name = boto3.session.Session().region_name

# SageMaker SDK session, plus the Bedrock client that later cells use to
# create and inspect evaluation jobs.
sagemaker_session = sagemaker.Session()
bedrock_client = session.client("bedrock")



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Model IDs used by the evaluation job: the generator produces the responses
# and the evaluator acts as the LLM judge that scores them.
generator_model = "us.amazon.nova-pro-v1:0"          # Amazon Nova Pro
evaluator_model = "mistral.mistral-large-2402-v1:0"  # Mistral Large

In [3]:
def upload_to_s3(local_file: str, bucket: str, s3_key: str) -> bool:
    """
    Copy a local file into an S3 bucket and report the outcome on stdout.

    The S3 client is created from the notebook-level ``session``. Any exception
    raised while creating the client or uploading is caught, printed, and
    turned into a ``False`` return value instead of propagating.

    Args:
        local_file: Path of the file to upload.
        bucket: Name of the destination bucket.
        s3_key: Object key to write inside the bucket.

    Returns:
        bool: True when the upload completed, False when any error occurred.
    """
    try:
        session.client('s3').upload_file(local_file, bucket, s3_key)
        print(f"✓ Successfully uploaded {s3_key}")
    except Exception as err:
        print(f"✗ Error uploading to S3: {str(err)}")
        return False
    return True

In [4]:
from typing import List, Dict, Optional, Any

def create_llm_judge_evaluation(
    client,
    job_name: str,
    role_arn: str,
    input_s3_uri: str,
    output_s3_uri: str,
    evaluator_model_id: str,
    generator_model_id: str,
    dataset_name: str = None,
    task_type: str = "General" # must be General for LLMaaJ
):
    """
    Submit an automated LLM-as-a-judge evaluation job via ``client.create_evaluation_job``.

    The job asks the evaluator model to score the generator model's responses
    on the dataset at ``input_s3_uri`` against every built-in metric listed in
    the function body.

    Args:
        client: Object exposing ``create_evaluation_job`` (the notebook passes
            its Bedrock client).
        job_name: Name given to the evaluation job.
        role_arn: ARN of the role passed as ``roleArn``.
        input_s3_uri: S3 URI of the evaluation dataset.
        output_s3_uri: S3 URI passed as the output location.
        evaluator_model_id: Identifier of the judge model.
        generator_model_id: Identifier of the model being evaluated.
        dataset_name: Dataset name; "CustomDataset" is used when falsy.
        task_type: Task type for the dataset/metric configuration.

    Returns:
        The response returned by ``client.create_evaluation_job``.

    Raises:
        Exception: Whatever the client call raises; the error is printed and
            then re-raised unchanged.
    """
    # Every LLM-as-judge metric requested for the job
    judge_metrics = [
        "Builtin.Correctness",
        "Builtin.Completeness",
        "Builtin.Faithfulness",
        "Builtin.Helpfulness",
        "Builtin.Coherence",
        "Builtin.Relevance",
        "Builtin.FollowingInstructions",
        "Builtin.ProfessionalStyleAndTone",
        "Builtin.Harmfulness",
        "Builtin.Stereotyping",
        "Builtin.Refusal",
    ]

    # Where the prompts live and what the dataset is called
    dataset = {
        "name": dataset_name if dataset_name else "CustomDataset",
        "datasetLocation": {"s3Uri": input_s3_uri},
    }

    # What to measure, and which model does the judging
    evaluation_config = {
        "automated": {
            "datasetMetricConfigs": [
                {
                    "taskType": task_type,
                    "dataset": dataset,
                    "metricNames": judge_metrics,
                }
            ],
            "evaluatorModelConfig": {
                "bedrockEvaluatorModels": [
                    {"modelIdentifier": evaluator_model_id}
                ]
            },
        }
    }

    # Which model generates the responses under evaluation
    inference_config = {
        "models": [
            {"bedrockModel": {"modelIdentifier": generator_model_id}}
        ]
    }

    request = {
        "jobName": job_name,
        "roleArn": role_arn,
        "applicationType": "ModelEvaluation",
        "evaluationConfig": evaluation_config,
        "inferenceConfig": inference_config,
        "outputDataConfig": {"s3Uri": output_s3_uri},
    }

    try:
        return client.create_evaluation_job(**request)
    except Exception as err:
        # Surface the failure in the notebook output, then let the caller handle it.
        print(f"Error creating evaluation job: {str(err)}")
        raise
        


def run_model_comparison(sagemaker_session_role: str,
    generator_models: List[str],
    evaluator_model: str
) -> List[Dict[str, Any]]:
    """
    Create one LLM-as-a-judge evaluation job per generator model.

    Each generator is evaluated by the same ``evaluator_model``. A generator
    whose job cannot be created is reported on stdout and skipped, so one
    failure does not stop the remaining submissions.

    Relies on notebook-level names defined in other cells: ``bedrock_client``,
    ``input_data``, ``output_path`` and ``create_llm_judge_evaluation``.

    Args:
        sagemaker_session_role: Role ARN passed to each job as ``role_arn``.
        generator_models: Model IDs to evaluate.
        evaluator_model: Model ID of the judge.

    Returns:
        One dict per successfully created job with the keys ``job_name``,
        ``job_arn``, ``generator_model``, ``evaluator_model`` and ``status``
        (always "CREATED"). Empty when ``generator_models`` is empty or every
        submission failed.
    """
    # Function-scope import: the notebook's visible import cell does not bring
    # `datetime` into scope, so building the job name raised NameError.
    from datetime import datetime

    evaluation_jobs = []
    evaluator_prefix = evaluator_model.split('.')[0]  # loop-invariant

    for index, generator_model in enumerate(generator_models, start=1):
        timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        # The timestamp only has second resolution, and the first dotted segment
        # of a model ID can be shared by several models (e.g. "us" for every
        # "us.*" ID), so the position in the list is appended to keep job names
        # distinct within one call.
        job_name = f"llmaaj-{generator_model.split('.')[0]}-{evaluator_prefix}-{timestamp}-{index}"

        try:
            response = create_llm_judge_evaluation(
                client=bedrock_client,
                job_name=job_name,
                role_arn=sagemaker_session_role,
                input_s3_uri=input_data,
                # Each job writes under its own prefix so results do not mix.
                output_s3_uri=f"{output_path}/{job_name}/",
                evaluator_model_id=evaluator_model,
                generator_model_id=generator_model,
                task_type="General"
            )

            job_info = {
                "job_name": job_name,
                "job_arn": response["jobArn"],
                "generator_model": generator_model,
                "evaluator_model": evaluator_model,
                "status": "CREATED"
            }
            evaluation_jobs.append(job_info)

            print(f"✓ Created job: {job_name}")
            print(f"  Generator: {generator_model}")
            print(f"  Evaluator: {evaluator_model}")
            print("-" * 80)

        except Exception as e:
            # Best effort by design: report the failure and move on to the next model.
            print(f"✗ Error with {generator_model}: {str(e)}")
            continue

    return evaluation_jobs

In [7]:
# Names and locations used for the evaluation dataset in S3.
account_id = sts_client.get_caller_identity()['Account']
OUTPUT_DIR = "evaluation_output"
INPUT_DATASET = "eval_dataset.jsonl"
DATA_BUCKET = f"nova-citations-{account_id}"
s3_key = f"{OUTPUT_DIR}/{INPUT_DATASET}"

# Create the bucket if it doesn't exist.
# us-east-1 is S3's default location, so CreateBucketConfiguration is omitted
# there (and when no region is configured) instead of sending an empty
# configuration or a None LocationConstraint. Every other region is named
# explicitly through LocationConstraint.
create_bucket_kwargs = {"Bucket": DATA_BUCKET}
if region_name and region_name != 'us-east-1':
    create_bucket_kwargs["CreateBucketConfiguration"] = {'LocationConstraint': region_name}

try:
    s3_client = session.client('s3')
    s3_client.create_bucket(**create_bucket_kwargs)
    print(f"✓ Created bucket {DATA_BUCKET}")
except Exception as e:
    # Re-running this cell is expected, so a bucket this account already owns
    # is reported as success rather than as an error.
    if 'BucketAlreadyOwnedByYou' in str(e):
        print(f"✓ Bucket already exists")
    else:
        print(f"✗ Error creating bucket: {str(e)}")
        raise

# Upload the dataset; upload_to_s3 returns False instead of raising, so the
# failure is turned into an exception here to stop the notebook.
upload_success = upload_to_s3(INPUT_DATASET, DATA_BUCKET, s3_key)
if not upload_success:
    raise Exception("✗ Failed to upload dataset to S3")


✓ Bucket already exists
✓ Successfully uploaded evaluation_output/eval_dataset.jsonl


In [None]:
import sagemaker
import re
# The SageMaker execution role is passed to the evaluation job as its role ARN.
sagemaker_session_role = sagemaker.get_execution_role()

# Dataset input and results output locations, both inside the data bucket.
input_data = f"s3://{DATA_BUCKET}/{s3_key}"
output_path = f"s3://{DATA_BUCKET}/{OUTPUT_DIR}"
print("input_data",input_data)
print("output_path",output_path)

try:
    # A slice of a random UUID keeps the job name different on every run.
    unique_job_sux = str(uuid.uuid4())[:16]
    evaluation_job_name = f"evalnovacitations-{unique_job_sux}"
    llm_as_judge_response = create_llm_judge_evaluation(
        client=bedrock_client,
        job_name=evaluation_job_name,
        role_arn=sagemaker_session_role,
        input_s3_uri=input_data,
        output_s3_uri=output_path,
        evaluator_model_id=evaluator_model,
        generator_model_id=generator_model,
        task_type="General"
    )
    print(f"✓ Created evaluation job: {llm_as_judge_response['jobArn']}")
except Exception as err:
    print(f"✗ Failed to create evaluation job: {str(err)}")
    raise


# Keep the job ARN so later cells can look the job up.
evaluation_job_arn = llm_as_judge_response['jobArn']


In [9]:
# Look the evaluation job up by its ARN and report its current status.
# Re-run this cell to refresh the status.
check_status = bedrock_client.get_evaluation_job(jobIdentifier=evaluation_job_arn)
job_status = check_status['status']
print(f"Job Status: {job_status}")

Job Status: InProgress
