# 02d - 运行评测

In [1]:
from pprint import pprint
import json
from dotenv import load_dotenv
load_dotenv()
import os
# Check that both the API key and the base URL are configured (present and non-empty)
all(os.environ[name] for name in ("ANTHROPIC_API_KEY", "BASE_URL"))

True

In [2]:
from anthropic import Anthropic

# Initialize the Anthropic client
client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"], base_url=os.environ["BASE_URL"])
# Note: here we switch to the relatively cheap and fast Haiku 4.5 model
model = "claude-haiku-4-5-20251001"

In [3]:
def add_user_message(messages, text):
    """Append a user-role turn whose content is `text` to `messages` (mutated in place)."""
    messages.append(dict(role="user", content=text))

def add_assistant_message(messages, text):
    """Append an assistant-role turn whose content is `text` to `messages` (mutated in place)."""
    messages.append(dict(role="assistant", content=text))

def chat(messages, **kwargs):
    """Send the full message list to Claude and return the text of its reply.

    Args:
        messages: The conversation so far, as a list of
            ``{"role": ..., "content": ...}`` dicts.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``client.messages.create``. ``model`` and ``max_tokens`` may be
            passed here to override the notebook defaults.

    Returns:
        The text of the first text content block in the response.

    Raises:
        ValueError: If the response contains no text content block.
    """
    # Build the defaults as a dict so a caller-supplied `model` / `max_tokens`
    # overrides them instead of raising
    # "TypeError: got multiple values for keyword argument".
    params = {"model": model, "max_tokens": 1000, **kwargs}
    message = client.messages.create(messages=messages, **params)

    # The first content block is not guaranteed to be text (for example, a
    # thinking block can precede it), so return the first block that is text.
    for block in message.content:
        if getattr(block, "type", None) == "text":
            return block.text

    raise ValueError(
        "Response contained no text content block "
        f"(stop_reason={getattr(message, 'stop_reason', None)!r})"
    )

In [None]:
def run_prompt(test_case):
    """Ask the model to solve the task in `test_case["task"]` and return its reply text."""
    task_prompt = f"""
Please solve the following task:

{test_case["task"]}
"""

    # Single-turn conversation: the task prompt is the only user message.
    conversation = []
    add_user_message(conversation, task_prompt)
    return chat(conversation)

In [5]:
def run_test_case(test_case):
    """Run one test case through the model and bundle the reply, the case, and a score."""
    # TODO: the score is hard-coded for now just to get the pipeline working
    # end to end; replace it with real grading logic later.
    placeholder_score = 10

    return dict(
        output=run_prompt(test_case),
        test_case=test_case,
        score=placeholder_score,
    )

In [7]:
def run_eval(dataset):
    """Run every test case in `dataset`, in order, and return the list of results."""
    return [run_test_case(test_case) for test_case in dataset]

In [8]:
# Pass the encoding explicitly so that loading the dataset does not depend on
# the platform's default locale encoding; UTF-8 is the standard JSON encoding.
with open("02c-dataset.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

dataset

[{'task': "Write a Python function that extracts the AWS region from an S3 bucket ARN string. For example, 'arn:aws:s3:::my-bucket' should return None, and 'arn:aws:s3:us-east-1:123456789012:bucket/my-bucket' should return 'us-east-1'."},
 {'task': 'Create a JSON object that represents an AWS IAM policy allowing a principal to perform s3:GetObject and s3:PutObject actions on a specific S3 bucket ARN (arn:aws:s3:::my-app-bucket/*).'},
 {'task': "Write a Regex pattern that matches valid AWS EC2 instance IDs. Instance IDs follow the format 'i-' followed by exactly 17 hexadecimal characters (example: i-0a1b2c3d4e5f6g7h8)."}]

In [9]:
# Run the whole dataset through the model; test cases are processed one after another.
results = run_eval(dataset)

In [11]:
# Display the collected results (model output, original test case, and score for each entry).
results

[{'output': '# Python Function to Extract AWS Region from S3 Bucket ARN\n\nHere\'s a comprehensive solution with multiple approaches:\n\n```python\ndef extract_region_from_s3_arn(arn: str) -> str | None:\n    """\n    Extract AWS region from an S3 bucket ARN string.\n    \n    Args:\n        arn: S3 bucket ARN string\n        \n    Returns:\n        Region string (e.g., \'us-east-1\') or None if not present\n        \n    Examples:\n        >>> extract_region_from_s3_arn(\'arn:aws:s3:::my-bucket\')\n        None\n        >>> extract_region_from_s3_arn(\'arn:aws:s3:us-east-1:123456789012:bucket/my-bucket\')\n        \'us-east-1\'\n    """\n    try:\n        # ARN format: arn:partition:service:region:account-id:resource\n        parts = arn.split(\':\')\n        \n        # Check if ARN has the correct structure (at least 6 parts)\n        if len(parts) < 6:\n            return None\n        \n        # Region is the 4th element (index 3)\n        region = parts[3]\n        \n        # R