In [1]:
from vllm_judge import Judge, Metric, TemplateProcessor

In [2]:
judge = Judge.from_url(base_url="http://localhost:8080")

In [3]:
# Basic usage with format strings
result = await judge.evaluate(
    content="def fibonacci(n): return n if n <= 1 else fib(n-1) + fib(n-2)",
    criteria="Evaluate this {language} function for {use_case}",
    template_vars={
        "language": "Python",
        "use_case": "production deployment"
    },
    rubric="Assign a score between 1 and 10 based on the quality of the code, with 1 being the worst and 10 being the best"
)
result.model_dump()

{'decision': 'FAIL',
 'reasoning': "The function contains a typo ('fib' instead of 'fibonacci' in the recursive call) and lacks a proper docstring or comments, which are essential for production code. The code is also not optimized and may lead to a stack overflow for large values of n due to repeated calculations.",
 'score': 3.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "FAIL",\n    "reasoning": "The function contains a typo (\'fib\' instead of \'fibonacci\' in the recursive call) and lacks a proper docstring or comments, which are essential for production code. The code is also not optimized and may lead to a stack overflow for large values of n due to repeated calculations.",\n    "score": 3\n}',
  'template_vars': {'language': 'Python', 'use_case': 'production deployment'},
  'template_engine': 'format'}}

In [4]:
# Define once
code_review_metric = Metric(
    name="code_review",
    criteria="Review this {language} code for {purpose}",
    rubric={
        10: "Perfect {language} code for {purpose}",
        5: "Acceptable for {purpose} with improvements",
        1: "Unsuitable for {purpose}"
    },
    system_prompt="You are a {language} expert.",
    required_vars=["language", "purpose"]
)

In [5]:
python_code = "def fibonacci(n): return n if n <= 1 else fib(n-1) + fib(n-2)"
js_code = "function fibonacci(n) { return n <= 1 ? n : fibonacci(n-1) + fibonacci(n-2); }"


# Use many times with different contexts
result1 = await judge.evaluate(
    content=python_code,
    metric=code_review_metric,
    template_vars={"language": "Python", "purpose": "data science"}
)

result2 = await judge.evaluate(
    content=js_code,
    metric=code_review_metric,
    template_vars={"language": "JavaScript", "purpose": "web frontend"}
)

In [6]:
result1.model_dump()

{'decision': 'False',
 'reasoning': 'The code is a recursive implementation of the Fibonacci sequence, but it lacks an appropriate base case check and is inefficient for larger values of n due to repeated calculations. This makes it unsuitable for data science applications where performance and efficiency are crucial.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "False",\n    "reasoning": "The code is a recursive implementation of the Fibonacci sequence, but it lacks an appropriate base case check and is inefficient for larger values of n due to repeated calculations. This makes it unsuitable for data science applications where performance and efficiency are crucial.",\n    "score": 1\n}',
  'template_vars': {'language': 'Python', 'purpose': 'data science'},
  'template_engine': 'format'}}

In [7]:
result2.model_dump()

{'decision': 'false',
 'reasoning': 'The function is a correct implementation of the Fibonacci sequence, but it is not efficient for large values of n due to its exponential time complexity. This makes it unsuitable for web frontend applications where performance is critical.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "false",\n    "reasoning": "The function is a correct implementation of the Fibonacci sequence, but it is not efficient for large values of n due to its exponential time complexity. This makes it unsuitable for web frontend applications where performance is critical.",\n    "score": 1\n}',
  'template_vars': {'language': 'JavaScript', 'purpose': 'web frontend'},
  'template_engine': 'format'}}

In [8]:
# Conditional logic in templates
api_review = Metric(
    name="api_review",
    criteria="""
    Review this API endpoint:
    {% for aspect in aspects %}
    - {{ aspect }}
    {% endfor %}
    {% if security_critical %}
    Pay special attention to authentication and authorization.
    {% endif %}
    """,
    rubric="Classify as GOOD, DECENT, BAD and assign a score between 1 and 10 based on the quality of the code, with 1 being the worst and 10 being the best",
    template_engine="jinja2"
)

api_code = """
@app.route('/api/v1/users')
def get_users():
    # Get all users
    return jsonify(users)
"""

result = await judge.evaluate(
    content=api_code,
    metric=api_review,
    template_vars={
        "aspects": ["RESTful design", "Error handling", "Documentation"],
        "security_critical": True
    }
)

In [9]:
result.model_dump()

{'decision': 'DECENT',
 'reasoning': 'The API endpoint follows a basic RESTful design by using the GET method for retrieving users. However, it lacks error handling and documentation, which are crucial for a robust API. Additionally, there is no mention of authentication and authorization, which are essential for securing the API.',
 'score': 5.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "DECENT",\n    "reasoning": "The API endpoint follows a basic RESTful design by using the GET method for retrieving users. However, it lacks error handling and documentation, which are crucial for a robust API. Additionally, there is no mention of authentication and authorization, which are essential for securing the API.",\n    "score": 5\n}',
  'template_vars': {'aspects': ['RESTful design',
    'Error handling',
    'Documentation'],
   'security_critical': True},
  'template_engine': 'jinja2'}}

In [11]:
# Metric with defaults
education_metric = Metric(
    name="education",
    criteria="Evaluate for {grade_level} studying {subject}",
    template_vars={
        "grade_level": "high school"  # Default
    },
    rubric="Classify as GOOD, DECENT, BAD and assign a score between 1 and 10 based on the quality of the code, with 1 being the worst and 10 being the best",
    required_vars=["subject"]  # Only subject is required
)


In [12]:
content = "The travel agency wants to ensure their translations are not only accurate but also culturally appropriate. To achieve this they are considering creating a custom metric that allows Worldwide WanderAgency to quantify how well their translations maintain cultural context and idiomatic expressions."

# Use with default grade_level
result = await judge.evaluate(
    content=content,
    metric=education_metric,
    template_vars={"subject": "biology"}
)

result.model_dump()

{'decision': 'BAD',
 'reasoning': "The response does not address the evaluation of high school biology or provide any code for assessment. It discusses a travel agency's need for culturally appropriate translations, which is unrelated to the given context.",
 'score': None,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "BAD",\n    "reasoning": "The response does not address the evaluation of high school biology or provide any code for assessment. It discusses a travel agency\'s need for culturally appropriate translations, which is unrelated to the given context.",\n    "score": null\n}',
  'template_vars': {'grade_level': 'high school', 'subject': 'biology'},
  'template_engine': 'format'}}

In [13]:
# Override grade_level
result = await judge.evaluate(
    content=content,
    metric=education_metric,
    template_vars={
        "subject": "computer science",
        "grade_level": "undergraduate"  # Override default
    }
)
result.model_dump()

{'decision': 'DECENT',
 'reasoning': 'The response provides a clear idea of what the travel agency is aiming to achieve, which is to ensure translations are culturally appropriate. However, it does not provide any technical details about how the custom metric would be implemented, which is crucial for evaluating the quality of the code. Therefore, it is considered decent as it sets the right direction but lacks technical depth.',
 'score': 6.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "DECENT",\n    "reasoning": "The response provides a clear idea of what the travel agency is aiming to achieve, which is to ensure translations are culturally appropriate. However, it does not provide any technical details about how the custom metric would be implemented, which is crucial for evaluating the quality of the code. Therefore, it is considered decent as it sets the right direction but lacks technical depth.",\n    "score": 6\n}',
  'template_vars': {'grade_level': 