# 02e - 基于模型的打分

In [13]:
from pprint import pprint
from statistics import mean
import json
from dotenv import load_dotenv
load_dotenv()
import os
# Check that both the API key and the base URL are configured.
# .get() is used so that a missing variable displays False here instead of
# aborting the cell with a KeyError.
bool(os.environ.get("ANTHROPIC_API_KEY") and os.environ.get("BASE_URL"))

True

In [14]:
from anthropic import Anthropic

# Initialize the Anthropic client
client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"], base_url=os.environ["BASE_URL"])
# To avoid bias, the model doing the grading should usually differ from the model that generated the answers,
# so this notebook switches from Haiku to Sonnet 4
model = "claude-sonnet-4-20250514"

In [15]:
def add_user_message(messages, text):
    """Append a user-role message carrying `text` to `messages` (in place)."""
    messages.append(dict(role="user", content=text))

def add_assistant_message(messages, text):
    """Append an assistant-role message carrying `text` to `messages` (in place)."""
    messages.append(dict(role="assistant", content=text))

def chat(messages, **kwargs):
    """Send the whole message list to the model and return the reply text.

    Args:
        messages: The conversation so far, passed as `messages` to the API.
        **kwargs: Extra keyword arguments forwarded unchanged to
            `client.messages.create` (for example `stop_sequences`).

    Returns:
        The `text` attribute of the first content block of the response.
    """
    response = client.messages.create(
        messages=messages,
        model=model,
        max_tokens=1000,
        **kwargs
    )
    first_block = response.content[0]
    return first_block.text

In [16]:
def run_prompt(test_case):
    """Ask the model to solve the task of a single test case.

    Args:
        test_case: Mapping whose "task" entry holds the task description.

    Returns:
        The reply text returned by chat() for the generated prompt.
    """
    task_prompt = f"""
Please solve the following task:

{test_case["task"]}
"""
    # A fresh single-turn conversation containing only the task prompt.
    conversation = [{"role": "user", "content": task_prompt}]
    return chat(conversation)

In [17]:
def grade_by_model(test_case, output):
    """Ask the grader model to evaluate a generated solution.

    Args:
        test_case: Test case whose "task" entry is the task description.
            Any other value is interpolated into the prompt as-is.
        output: The solution text to be graded.

    Returns:
        The grader's reply parsed with json.loads. The prompt requests an
        object with "strengths", "weaknesses", "reasoning" and "score".

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # Show the grader the task text itself rather than the repr of the whole
    # test-case dict (e.g. "{'task': '...'}").
    if isinstance(test_case, dict) and "task" in test_case:
        task = test_case["task"]
    else:
        task = test_case
    solution = output
    eval_prompt = f"""
    You are an expert code reviewer. Evaluate this AI-generated solution.
    
    Task: {task}
    Solution: {solution}
    
    Provide your evaluation as a structured JSON object with:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement  
    - "reasoning": A concise explanation of your assessment
    - "score": A number between 1-10
    """
    
    messages = []
    add_user_message(messages, eval_prompt)
    # Prefill the assistant turn with an opening JSON code fence so the reply
    # continues with the JSON body ...
    add_assistant_message(messages, "```json")
    
    # ... and stop generation at the closing fence, leaving only the JSON text.
    eval_text = chat(messages, stop_sequences=["```"])
    return json.loads(eval_text)

In [18]:
def run_test_case(test_case):
    """Generate a solution for one test case and have it graded by the model.

    Args:
        test_case: The test case passed to run_prompt() and grade_by_model().

    Returns:
        A dict with the generated "output", the original "test_case", and the
        grader's "score" and "reasoning".
    """
    generated = run_prompt(test_case)

    # Grade the generated answer with the grader model.
    grade = grade_by_model(test_case, generated)

    return dict(
        output=generated,
        test_case=test_case,
        score=grade["score"],
        reasoning=grade["reasoning"],
    )

In [19]:
def run_eval(dataset):
    """Run every test case in the dataset and print the average score.

    Args:
        dataset: Iterable of test cases; each one is passed to run_test_case().

    Returns:
        The list of per-test-case result dicts, in dataset order. The list is
        empty when the dataset is empty.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    # statistics.mean raises StatisticsError on empty input, so an empty
    # dataset is reported explicitly instead of crashing the cell.
    if not results:
        print("Average score: n/a (no test cases)")
        return results

    average_score = mean([result["score"] for result in results])
    print(f"Average score: {average_score}")

    return results

In [20]:
# Read the dataset as UTF-8 explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("02c-dataset.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

dataset

[{'task': "Write a Python function that extracts the AWS region from an S3 bucket ARN string. For example, 'arn:aws:s3:::my-bucket' should return None, and 'arn:aws:s3:us-east-1:123456789012:bucket/my-bucket' should return 'us-east-1'."},
 {'task': 'Create a JSON object that represents an AWS IAM policy allowing a principal to perform s3:GetObject and s3:PutObject actions on a specific S3 bucket ARN (arn:aws:s3:::my-app-bucket/*).'},
 {'task': "Write a Regex pattern that matches valid AWS EC2 instance IDs. Instance IDs follow the format 'i-' followed by exactly 17 hexadecimal characters (example: i-0a1b2c3d4e5f6g7h8)."}]

In [21]:
# Run the evaluation over every test case in the loaded dataset
results = run_eval(dataset)

Average score: 8


In [22]:
# Show the collected per-test-case results
results

[{'output': 'Looking at this task, I need to understand the structure of S3 ARNs and extract the region component.\n\nS3 ARNs can have different formats:\n1. Traditional bucket ARN: `arn:aws:s3:::bucket-name` (no region specified)\n2. S3 access point ARN: `arn:aws:s3:region:account-id:accesspoint/access-point-name`\n3. S3 object ARN: `arn:aws:s3:region:account-id:bucket/bucket-name`\n4. Other S3 resource ARNs that include region information\n\nThe general ARN format is: `arn:partition:service:region:account-id:resource`\n\nHere\'s my solution:\n\n```python\ndef extract_s3_region(arn):\n    """\n    Extracts the AWS region from an S3 bucket ARN string.\n    \n    Args:\n        arn (str): The S3 ARN string\n        \n    Returns:\n        str or None: The AWS region if present, None otherwise\n        \n    Examples:\n        extract_s3_region(\'arn:aws:s3:::my-bucket\') -> None\n        extract_s3_region(\'arn:aws:s3:us-east-1:123456789012:bucket/my-bucket\') -> \'us-east-1\'\n    """\