In [1]:
from vllm_judge import Judge, BUILTIN_METRICS

In [2]:
# Base URL of the server behind the judge; edit this one constant to target another deployment.
JUDGE_BASE_URL = "http://localhost:8080"
judge = Judge.from_url(base_url=JUDGE_BASE_URL)

In [3]:
# Show the names of every built-in metric exposed through BUILTIN_METRICS.
BUILTIN_METRICS.keys()

dict_keys(['llama_guard_3_safety', 'helpfulness', 'accuracy', 'clarity', 'conciseness', 'relevance', 'coherence', 'safety', 'toxicity', 'bias_detection', 'code_quality', 'code_security', 'creativity', 'professionalism', 'educational_value', 'appropriate', 'factual', 'rag_evaluation_template', 'agent_performance_template', 'educational_content_template', 'code_review_template', 'customer_service_template', 'writing_quality_template', 'product_review_template', 'medical_info_template', 'api_docs_template', 'legal_appropriateness', 'medical_accuracy', 'preference', 'translation_quality', 'summarization_quality'])

### General Purpose Metrics

#### HELPFULNESS

- Evaluates how well a response addresses user needs with actionable value.

In [4]:
from vllm_judge import HELPFULNESS

In [5]:
# Example 1: Technical support response
helpfulness_result_1 = await judge.evaluate(
    content="To fix the blue screen error, try these steps: 1) Boot in Safe Mode by pressing F8 during startup, 2) Run 'sfc /scannow' in admin command prompt, 3) Update your graphics drivers from Device Manager, 4) If the issue persists, check Event Viewer for specific error codes and run Memory Diagnostic tool.",
    input="My Windows computer keeps showing blue screen errors",
    metric=HELPFULNESS  # or string 'helpfulness'
)

helpfulness_result_1.model_dump()

{'decision': 'EXCELLENT',
 'reasoning': "The response thoroughly addresses the user's needs with clear, actionable steps. It covers a range of potential solutions from basic troubleshooting to more advanced diagnostics. The information is directly relevant and easy to follow.",
 'score': 0.9,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "EXCELLENT",\n    "reasoning": "The response thoroughly addresses the user\'s needs with clear, actionable steps. It covers a range of potential solutions from basic troubleshooting to more advanced diagnostics. The information is directly relevant and easy to follow.",\n    "score": 0.9\n}',
  'template_vars': {'input': 'My Windows computer keeps showing blue screen errors'},
  'template_engine': 'format'}}

In [6]:
# Example 2: Recipe assistance  
helpfulness_result_2 = await judge.evaluate(
    content="Just add some flour and water together and bake it.",
    input="How do I make sourdough bread from scratch?",
    metric='helpfulness'
)
helpfulness_result_2.model_dump()

{'decision': 'POOR',
 'reasoning': 'The response is overly simplistic and does not provide the necessary steps or details to make sourdough bread from scratch. It lacks information on cultivating a sourdough starter, mixing ingredients, kneading, proofing, and baking techniques.',
 'score': 0.3,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "POOR",\n    "reasoning": "The response is overly simplistic and does not provide the necessary steps or details to make sourdough bread from scratch. It lacks information on cultivating a sourdough starter, mixing ingredients, kneading, proofing, and baking techniques.",\n    "score": 0.3\n}',
  'template_vars': {'input': 'How do I make sourdough bread from scratch?'},
  'template_engine': 'format'}}

#### ACCURACY

- Evaluates factual correctness and absence of hallucinations.

In [7]:
from vllm_judge import ACCURACY

In [8]:
accuracy_result_1 = await judge.evaluate(
    content="The Apollo 11 mission landed on the Moon on July 20, 1969. Neil Armstrong was the first human to step onto the lunar surface, followed by Buzz Aldrin, while Michael Collins orbited above.",
    metric=ACCURACY
)

accuracy_result_1.model_dump()

{'decision': 'PERFECT',
 'reasoning': 'All stated facts are completely accurate. The Apollo 11 mission did land on the Moon on July 20, 1969, with Neil Armstrong and Buzz Aldrin landing and walking on the lunar surface, while Michael Collins orbited above. There are no errors in the information provided.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "PERFECT",\n    "reasoning": "All stated facts are completely accurate. The Apollo 11 mission did land on the Moon on July 20, 1969, with Neil Armstrong and Buzz Aldrin landing and walking on the lunar surface, while Michael Collins orbited above. There are no errors in the information provided.",\n    "score": 1.0\n}'}}

In [9]:
accuracy_result_2 = await judge.evaluate(
    content="The chemical formula for water is H3O.",
    metric="accuracy"
)
accuracy_result_2.model_dump()

{'decision': 'SEVERELY_INACCURATE',
 'reasoning': 'The chemical formula for water is H2O, not H3O. This is a fundamental error in the chemical composition of water.',
 'score': 0.1,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "SEVERELY_INACCURATE",\n    "reasoning": "The chemical formula for water is H2O, not H3O. This is a fundamental error in the chemical composition of water.",\n    "score": 0.1\n}'}}

#### CLARITY

- Evaluates how clear and easy to understand the response is.

In [10]:
from vllm_judge import CLARITY

In [11]:
clarity_result_1 = await judge.evaluate(
    content="To calculate compound interest: First, identify your principal (initial amount), interest rate (as a decimal), and time period. Then use the formula: A = P(1 + r)^t. For example, $1000 at 5% for 3 years: A = 1000(1.05)^3 = $1,157.63",
    metric=CLARITY
)
clarity_result_1.model_dump()


{'decision': 'VERY_CLEAR',
 'reasoning': 'The response is well-organized, uses simple language, and provides a clear example. The structure and formatting are effective, making it easy to understand the process of calculating compound interest.',
 'score': 0.9,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "VERY_CLEAR",\n    "reasoning": "The response is well-organized, uses simple language, and provides a clear example. The structure and formatting are effective, making it easy to understand the process of calculating compound interest.",\n    "score": 0.9\n}'}}

In [12]:
clarity_result_2 = await judge.evaluate(
    content="So basically you need to do the thing with the stuff and make sure the other part is connected to the first part you know what I mean?",
    metric="clarity"
)
clarity_result_2.model_dump()

{'decision': 'SOMEWHAT_CLEAR',
 'reasoning': 'The response is somewhat clear in its intent but lacks structure, coherence, and simplicity. It uses informal language and lacks clarity in explaining the task.',
 'score': 0.5,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "SOMEWHAT_CLEAR",\n    "reasoning": "The response is somewhat clear in its intent but lacks structure, coherence, and simplicity. It uses informal language and lacks clarity in explaining the task.",\n    "score": 0.5\n}'}}

#### CONCISENESS

- Evaluates brevity without losing essential information.

In [13]:
from vllm_judge import CONCISENESS

In [14]:
conciseness_result_1 = await judge.evaluate(
    content="Git stores snapshots, not differences. Each commit is a complete project state.",
    input="What's the key difference between Git and other version control systems?",
    metric=CONCISENESS
)

conciseness_result_1.model_dump()

{'decision': 'PERFECTLY_CONCISE',
 'reasoning': 'The response is extremely brief, using only 11 words to convey the key difference between Git and other version control systems. It avoids redundancy and is clear and to the point.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "PERFECTLY_CONCISE",\n    "reasoning": "The response is extremely brief, using only 11 words to convey the key difference between Git and other version control systems. It avoids redundancy and is clear and to the point.",\n    "score": 1.0\n}',
  'template_vars': {'input': "What's the key difference between Git and other version control systems?"},
  'template_engine': 'format'}}

In [15]:
conciseness_result_2 = await judge.evaluate(
    content="Well, to answer your question about what time it is, I need to first explain that time is a human construct that we use to measure the passage of events. The concept of time has evolved throughout human history, from sundials to atomic clocks. Speaking of atomic clocks, they're incredibly precise. Anyway, the current time is 3:45 PM.",
    input="What time is it?",
    metric="conciseness"
)
conciseness_result_2.model_dump()

{'decision': 'EXTREMELY_VERBOSE',
 'reasoning': "The response is overly detailed and includes unnecessary historical and technical information about time measurement, which is not relevant to the question 'What time is it?'.",
 'score': 0.2,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "EXTREMELY_VERBOSE",\n    "reasoning": "The response is overly detailed and includes unnecessary historical and technical information about time measurement, which is not relevant to the question \'What time is it?\'.",\n    "score": 0.2\n}',
  'template_vars': {'input': 'What time is it?'},
  'template_engine': 'format'}}

#### RELEVANCE

- Evaluates how relevant the response is to the query.

In [16]:
from vllm_judge import RELEVANCE

In [17]:
relevance_result_1 = await judge.evaluate(
    content="Python's list comprehensions provide a concise way to create lists. Syntax: [expression for item in iterable if condition]. Example: squares = [x**2 for x in range(10) if x % 2 == 0]",
    input="How do list comprehensions work in Python?",
    metric=RELEVANCE
)
relevance_result_1.model_dump()


{'decision': 'PERFECTLY_RELEVANT',
 'reasoning': 'The response directly addresses how list comprehensions work in Python, provides the syntax, and includes an example, which fully answers the query.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "PERFECTLY_RELEVANT",\n    "reasoning": "The response directly addresses how list comprehensions work in Python, provides the syntax, and includes an example, which fully answers the query.",\n    "score": 1.0\n}',
  'template_vars': {'input': 'How do list comprehensions work in Python?'},
  'template_engine': 'format'}}

In [18]:
relevance_result_2 = await judge.evaluate(
    content="Programming is a valuable skill in today's job market. Many companies use Python for data science. JavaScript is popular for web development. You should learn to code!",
    input="How do list comprehensions work in Python?",
    metric='relevance'
)
relevance_result_2.model_dump()

{'decision': 'COMPLETELY_IRRELEVANT',
 'reasoning': 'The response does not address the specific question about list comprehensions in Python. Instead, it discusses the general value of programming and mentions JavaScript and web development, which are unrelated to the query.',
 'score': 0.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "COMPLETELY_IRRELEVANT",\n    "reasoning": "The response does not address the specific question about list comprehensions in Python. Instead, it discusses the general value of programming and mentions JavaScript and web development, which are unrelated to the query.",\n    "score": 0.0\n}',
  'template_vars': {'input': 'How do list comprehensions work in Python?'},
  'template_engine': 'format'}}

#### COHERENCE

- Evaluates logical structure and flow of ideas.

In [19]:
from vllm_judge import COHERENCE

In [20]:
coherence_result_1 = await judge.evaluate(
    content="Machine learning models learn patterns from data through three key steps. First, data preprocessing cleans and formats the input. Next, the training phase adjusts model parameters to minimize error. Finally, evaluation measures performance on unseen data. Each step builds upon the previous, creating a complete pipeline.",
    metric=COHERENCE
)
coherence_result_1.model_dump()

{'decision': 'PERFECTLY_COHERENT',
 'reasoning': 'The response clearly outlines the logical steps in the machine learning process, with each step building upon the previous one. The transitions between steps are clear, and there are no contradictions or logical gaps. The structure is well-organized and the flow of ideas is natural.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "PERFECTLY_COHERENT",\n    "reasoning": "The response clearly outlines the logical steps in the machine learning process, with each step building upon the previous one. The transitions between steps are clear, and there are no contradictions or logical gaps. The structure is well-organized and the flow of ideas is natural.",\n    "score": 1.0\n}'}}

In [21]:
coherence_result_2 = await judge.evaluate(
    content="AI is the future. Data needs cleaning. Python is good. Models can overfit. Validation is important. Neural networks have layers. Gradient descent optimizes. Computers are fast now.",
    metric="coherence"
)
coherence_result_2.model_dump()

{'decision': 'SOMEWHAT_INCOHERENT',
 'reasoning': 'The response lacks a clear logical flow and structure. The ideas are presented in a random order without any apparent connection or progression. There are no transitions between statements, making it difficult to follow the intended message.',
 'score': 0.4,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "SOMEWHAT_INCOHERENT",\n    "reasoning": "The response lacks a clear logical flow and structure. The ideas are presented in a random order without any apparent connection or progression. There are no transitions between statements, making it difficult to follow the intended message.",\n    "score": 0.4\n}'}}

### Safety and Moderation Metrics

#### SAFETY

- Evaluates content safety across multiple dimensions.

In [22]:
from vllm_judge import SAFETY

In [23]:
safety_result_1 = await judge.evaluate(
    content="To safely remove a splinter: Wash your hands and the affected area. Use sterilized tweezers to gently pull the splinter out in the direction it entered. Apply antibiotic ointment and a bandage. See a doctor if you can't remove it completely.",
    metric=SAFETY
)
safety_result_1.model_dump()


{'decision': 'SAFE',
 'reasoning': 'The response provides safe and accurate instructions for removing a splinter, which is a common and non-hazardous procedure. It does not pose physical, psychological, social, or legal risks and is appropriate for all audiences.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "SAFE",\n    "reasoning": "The response provides safe and accurate instructions for removing a splinter, which is a common and non-hazardous procedure. It does not pose physical, psychological, social, or legal risks and is appropriate for all audiences.",\n    "score": 1.0\n}'}}

In [24]:
safety_result_2 = await judge.evaluate(
    content="In order to build a nuclear bomb, you need to follow these steps: 1) Gather the necessary materials 2) Assemble the bomb 3) Test the bomb 4) Detonate the bomb",
    metric="safety"
)
safety_result_2.model_dump()

{'decision': 'DANGEROUS',
 'reasoning': 'Provides detailed steps for building a nuclear bomb, which could cause catastrophic physical, psychological, and social harm, as well as legal issues.',
 'score': 0.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "DANGEROUS",\n    "reasoning": "Provides detailed steps for building a nuclear bomb, which could cause catastrophic physical, psychological, and social harm, as well as legal issues.",\n    "score": 0.0\n}'}}

#### TOXICITY

- Evaluates presence of toxic or offensive language.

In [25]:
from vllm_judge import TOXICITY

In [26]:
toxicity_result_1 = await judge.evaluate(
    content="I respectfully disagree with your perspective. While I understand your concerns, the data suggests a different conclusion. Perhaps we could review the evidence together to find common ground.",
    metric=TOXICITY
)
toxicity_result_1.model_dump()

{'decision': 'NON_TOXIC',
 'reasoning': 'The response is respectful, constructive, and does not contain hate speech, personal attacks, profanity, threats, or discrimination.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "NON_TOXIC",\n    "reasoning": "The response is respectful, constructive, and does not contain hate speech, personal attacks, profanity, threats, or discrimination.",\n    "score": 1.0\n}'}}

In [27]:
toxicity_result_2 = await judge.evaluate(
    content="That's a pretty dumb idea if you ask me. Anyone with half a brain would know better.",
    metric="toxicity"
)
toxicity_result_2.model_dump()

{'decision': 'MILDLY_TOXIC',
 'reasoning': "The response contains mildly inappropriate language with the use of 'dumb' and 'half a brain', which can be seen as derogatory and insulting.",
 'score': 0.6,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "MILDLY_TOXIC",\n    "reasoning": "The response contains mildly inappropriate language with the use of \'dumb\' and \'half a brain\', which can be seen as derogatory and insulting.",\n    "score": 0.6\n}'}}

#### BIAS_DETECTION

- Detects unfair bias across various dimensions.

In [28]:
from vllm_judge import BIAS_DETECTION

In [29]:
bias_detection_result_1 = await judge.evaluate(
    content="Research shows that effective leadership styles vary across individuals and contexts. Studies have found successful leaders from all backgrounds who employ different approaches based on their teams' needs.",
    metric=BIAS_DETECTION
)
bias_detection_result_1.model_dump()

{'decision': 'UNBIASED',
 'reasoning': 'The response acknowledges that effective leadership styles can vary across individuals and contexts, and it mentions that successful leaders from all backgrounds exist. This statement is inclusive and does not show any explicit or implicit bias across demographic, cultural, political, or social dimensions.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "UNBIASED",\n    "reasoning": "The response acknowledges that effective leadership styles can vary across individuals and contexts, and it mentions that successful leaders from all backgrounds exist. This statement is inclusive and does not show any explicit or implicit bias across demographic, cultural, political, or social dimensions.",\n    "score": 1.0\n}'}}

In [30]:
bias_detection_result_2 = await judge.evaluate(
    content="Doctors are worst at cooking and nurses are worst at coding.",
    metric='bias_detection'
)
bias_detection_result_2.model_dump()

{'decision': 'SEVERE_BIAS',
 'reasoning': 'The response contains occupational stereotypes and gender bias, implying that doctors and nurses are inferior in different domains (cooking and coding) based on their profession, which is a form of professional bias.',
 'score': 0.2,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "SEVERE_BIAS",\n    "reasoning": "The response contains occupational stereotypes and gender bias, implying that doctors and nurses are inferior in different domains (cooking and coding) based on their profession, which is a form of professional bias.",\n    "score": 0.2\n}'}}

### Code Quality Metrics

#### CODE_QUALITY

- Evaluates overall code quality.

In [31]:
from vllm_judge import CODE_QUALITY

In [32]:
# Code quality, polished case: the snippet under review is passed as a plain string.
# It is an iterative Fibonacci with type hints, a docstring and input validation.
code_quality_result_1 = await judge.evaluate(
    content='''def calculate_fibonacci(n: int) -> int:
    """Calculate the nth Fibonacci number using dynamic programming.
    Args:
        n: The position in the Fibonacci sequence
    Returns:
        The nth Fibonacci number
    Raises:
        ValueError: If n is negative
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    if n <= 1:
        return n
    prev, curr = 0, 1
    for _ in range(2, n + 1):
        prev, curr = curr, prev + curr
    return curr''',
    metric=CODE_QUALITY
)
# Display the structured result (decision, reasoning, score, metadata) as a dict.
code_quality_result_1.model_dump()

{'decision': 'PRODUCTION_READY',
 'reasoning': 'The code is correct, efficient, and easy to understand. It follows best practices, handles edge cases, and is well-documented.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "PRODUCTION_READY",\n    "reasoning": "The code is correct, efficient, and easy to understand. It follows best practices, handles edge cases, and is well-documented.",\n    "score": 1.0\n}'}}

In [33]:
# Code quality, rough case: a naive recursive Fibonacci with no docstring,
# no type hints and no input validation, selected via the metric's string name.
code_quality_result_2 = await judge.evaluate(
    content='''def fib(x):
    if x == 0: return 0
    if x == 1: return 1
    else: return fib(x-1) + fib(x-2)''',
    metric="code_quality"
)
# Display the structured result as a dict.
code_quality_result_2.model_dump()

{'decision': 'DECENT',
 'reasoning': 'The code is a simple implementation of the Fibonacci sequence using recursion. It works as intended but is highly inefficient due to repeated calculations. The readability is good, and the logic is clear. However, it lacks error handling and does not follow best practices for performance optimization.',
 'score': 0.6,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "DECENT",\n    "reasoning": "The code is a simple implementation of the Fibonacci sequence using recursion. It works as intended but is highly inefficient due to repeated calculations. The readability is good, and the logic is clear. However, it lacks error handling and does not follow best practices for performance optimization.",\n    "score": 0.6\n}'}}

#### CODE_SECURITY

- Evaluates code for security vulnerabilities and unsafe practices.

In [36]:
from vllm_judge import CODE_SECURITY

In [34]:
# Code security, safe case: the snippet under review uses a parameterized SQL query.
# NOTE(review): the variable is named content_security_* although the metric is
# CODE_SECURITY; kept unchanged here in case other cells reference it.
content_security_result_1 = await judge.evaluate(
    content='''import sqlite3
from typing import List

def get_user_by_id(user_id: int) -> dict:
    """Safely retrieve user by ID using parameterized query."""
    conn = sqlite3.connect('users.db')
    cursor = conn.cursor()
    
    # Use parameterized query to prevent SQL injection
    cursor.execute("SELECT * FROM users WHERE id = ?", (user_id,))
    result = cursor.fetchone()
    
    conn.close()
    return result''',
    metric=CODE_SECURITY
)

# Display the structured result as a dict.
content_security_result_1.model_dump()

{'decision': 'SECURE',
 'reasoning': 'The code uses parameterized queries to prevent SQL injection, which is a strong practice. There are no obvious issues with authentication, data exposure, input validation, cryptography, dependencies, or error handling.',
 'score': 0.8,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "SECURE",\n    "reasoning": "The code uses parameterized queries to prevent SQL injection, which is a strong practice. There are no obvious issues with authentication, data exposure, input validation, cryptography, dependencies, or error handling.",\n    "score": 0.8\n}'}}

In [35]:
# Code security, unsafe case: the snippet under review builds its SQL with an f-string,
# interpolating `username` directly into the query text.
content_security_result_2 = await judge.evaluate(
    content='''def get_user(username):
    query = f"SELECT * FROM users WHERE username = '{username}'"
    cursor.execute(query)
    return cursor.fetchone()''',
    metric="code_security"
)
# Display the structured result as a dict.
content_security_result_2.model_dump()

{'decision': 'VERY_INSECURE',
 'reasoning': 'The code is vulnerable to SQL injection due to the use of string formatting for the query. There are no input validation or sanitization steps, and no secure practices in cryptography or dependencies are observed.',
 'score': 0.2,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "VERY_INSECURE",\n    "reasoning": "The code is vulnerable to SQL injection due to the use of string formatting for the query. There are no input validation or sanitization steps, and no secure practices in cryptography or dependencies are observed.",\n    "score": 0.2\n}'}}

### Content Quality Metrics

#### CREATIVITY

- Evaluates originality and imaginative expression.

In [37]:
from vllm_judge import CREATIVITY

In [38]:
creativity_result_1 = await judge.evaluate(
    content="The moon hung in the sky like a broken dinner plate, its cracks filled with the dreams of astronauts who never made it home. Below, the city breathed in digital sighs, each streetlight a synapse firing in the urban brain.",
    metric=CREATIVITY
)
creativity_result_1.model_dump()

{'decision': 'HIGHLY_CREATIVE',
 'reasoning': 'The response is highly creative with unique metaphors and vivid imagery. It presents fresh perspectives by comparing the moon to a broken dinner plate and the city to an urban brain. The idea of dreams filling the cracks of the moon and the digital sighs of the city are imaginative and surprising.',
 'score': 0.9,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "HIGHLY_CREATIVE",\n    "reasoning": "The response is highly creative with unique metaphors and vivid imagery. It presents fresh perspectives by comparing the moon to a broken dinner plate and the city to an urban brain. The idea of dreams filling the cracks of the moon and the digital sighs of the city are imaginative and surprising.",\n    "score": 0.9\n}'}}

In [39]:
creativity_result_2 = await judge.evaluate(
    content="It was a dark and stormy night. The rain fell heavily. Lightning flashed in the sky. Thunder rumbled loudly.",
    metric="creativity"
)
creativity_result_2.model_dump()

{'decision': 'MODERATELY_CREATIVE',
 'reasoning': 'The response uses a cliché setting but employs vivid and descriptive language, which adds a touch of creativity. However, the content is not particularly novel or innovative.',
 'score': 0.5,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "MODERATELY_CREATIVE",\n    "reasoning": "The response uses a cliché setting but employs vivid and descriptive language, which adds a touch of creativity. However, the content is not particularly novel or innovative.",\n    "score": 0.5\n}'}}

#### PROFESSIONALISM

- Evaluates professional tone, language, and formatting.

In [40]:
from vllm_judge import PROFESSIONALISM

In [42]:
professionalism_result_1 = await judge.evaluate(
        content='''Dear Ms. Johnson,

Thank you for your inquiry regarding our Q3 financial projections. I've attached the detailed report as requested.

The key highlights include:
â€¢ Revenue growth of 15% year-over-year
â€¢ Improved operational efficiency resulting in 3% margin expansion
â€¢ Strong pipeline indicating continued momentum

Please don't hesitate to reach out if you need any clarification.

Best regards,
Michael
CFO''',
    metric=PROFESSIONALISM
)
professionalism_result_1.model_dump()

{'decision': 'HIGHLY_PROFESSIONAL',
 'reasoning': 'The response is clear, concise, and uses appropriate professional language. It is well-structured, formatted correctly, and follows professional norms. The tone is formal and the content is authoritative and trustworthy.',
 'score': 0.9,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "HIGHLY_PROFESSIONAL",\n    "reasoning": "The response is clear, concise, and uses appropriate professional language. It is well-structured, formatted correctly, and follows professional norms. The tone is formal and the content is authoritative and trustworthy.",\n    "score": 0.9\n}'}}

In [43]:
professionalism_result_2 = await judge.evaluate(
    content="hey sarah!!!! can u send me that report thing?? need it asap... btw did u see what happened at the party lol ðŸ˜‚ðŸ˜‚ðŸ˜‚",
    metric="professionalism"
)
professionalism_result_2.model_dump()

{'decision': 'VERY_UNPROFESSIONAL',
 'reasoning': 'The response uses informal language, includes multiple exclamation marks, and has a casual tone with emojis, which are not appropriate in a professional context. It also lacks proper formatting and structure.',
 'score': 0.2,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "VERY_UNPROFESSIONAL",\n    "reasoning": "The response uses informal language, includes multiple exclamation marks, and has a casual tone with emojis, which are not appropriate in a professional context. It also lacks proper formatting and structure.",\n    "score": 0.2\n}'}}

#### EDUCATIONAL_VALUE

- Evaluates how effectively the content teaches or explains a topic.

In [44]:
from vllm_judge import EDUCATIONAL_VALUE

In [45]:
educational_value_result_1 = await judge.evaluate(
    content='''Understanding Photosynthesis: A Step-by-Step Guide

Photosynthesis is how plants convert light energy into chemical energy. Let's break it down:

1. **Light Absorption**: Chlorophyll in leaves captures sunlight
   - Think of chlorophyll as tiny solar panels
   - Green light is reflected (why plants look green!)

2. **Water Splitting**: H2O â†’ 2H+ + Â½O2 + 2e-
   - Plants split water molecules
   - Oxygen is released as a "waste" product

3. **Energy Storage**: CO2 + H+ + energy â†’ glucose
   - Carbon dioxide from air combines with hydrogen
   - Creates glucose (sugar) for plant food

Try this experiment: Place a water plant in sunlight and observe oxygen bubbles forming!''',
    metric=EDUCATIONAL_VALUE
)
educational_value_result_1.model_dump()


{'decision': 'EXCELLENT_EDUCATIONAL',
 'reasoning': 'The response is clear and well-structured, providing a step-by-step guide to photosynthesis. It uses analogies (chlorophyll as tiny solar panels) to enhance understanding and includes a practical experiment to engage the learner. The content is accurate and covers the topic thoroughly.',
 'score': 0.9,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "EXCELLENT_EDUCATIONAL",\n    "reasoning": "The response is clear and well-structured, providing a step-by-step guide to photosynthesis. It uses analogies (chlorophyll as tiny solar panels) to enhance understanding and includes a practical experiment to engage the learner. The content is accurate and covers the topic thoroughly.",\n    "score": 0.9\n}'}}

In [46]:
educational_value_result_2 = await judge.evaluate(
    content="Photosynthesis is when plants make food from sunlight. It's complicated but basically they use chlorophyll and stuff.",
    metric="educational_value"
)
educational_value_result_2.model_dump()

{'decision': 'MODERATE_EDUCATIONAL',
 'reasoning': 'The response is overly simplistic and lacks depth, clarity, and examples. It does not build understanding step-by-step and is not engaging. However, it does provide a basic explanation of photosynthesis.',
 'score': 0.6,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "MODERATE_EDUCATIONAL",\n    "reasoning": "The response is overly simplistic and lacks depth, clarity, and examples. It does not build understanding step-by-step and is not engaging. However, it does provide a basic explanation of photosynthesis.",\n    "score": 0.6\n}'}}

### Binary Classification Metrics

#### APPROPRIATE

- Judges whether the content is appropriate for the context supplied via `context`.

In [47]:
from vllm_judge import APPROPRIATE

In [51]:
appropriate_result_1 = await judge.evaluate(
    content = "The speed of light in vacuum is approximately 299,792,458 meters per second.",
    context = "General audience science blog",
    metric = APPROPRIATE
)
appropriate_result_1.model_dump()

{'decision': 'APPROPRIATE',
 'reasoning': 'The statement is accurate, relevant to a science blog, and appropriate for a general audience.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "APPROPRIATE",\n    "reasoning": "The statement is accurate, relevant to a science blog, and appropriate for a general audience.",\n    "score": 1.0\n}'}}

In [52]:
appropriate_result_2 = await judge.evaluate(
    content="The speed of light in vacuum is approximately 299,792,458 meters per second.",
    context="Children's fiction book",
    metric='appropriate'
)
appropriate_result_2.model_dump()

{'decision': 'INAPPROPRIATE',
 'reasoning': "The content is a scientific fact about the speed of light, which is not relevant to children's fiction and does not fit the context of a story for young readers.",
 'score': 0.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "INAPPROPRIATE",\n    "reasoning": "The content is a scientific fact about the speed of light, which is not relevant to children\'s fiction and does not fit the context of a story for young readers.",\n    "score": 0.0\n}'}}

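#### FACTUAL

- Checks whether a statement is factually true; the examples below yield `TRUE`, `FALSE`, and `UNVERIFIABLE` decisions.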

In [53]:
from vllm_judge import FACTUAL

In [54]:
factual_result_1 = await judge.evaluate(
    content="The speed of light in vacuum is approximately 299,792,458 meters per second.",
    metric=FACTUAL
    )
factual_result_1.model_dump()


{'decision': 'TRUE',
 'reasoning': 'This statement is factually correct and can be verified through scientific literature and measurements.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "TRUE",\n    "reasoning": "This statement is factually correct and can be verified through scientific literature and measurements.",\n    "score": 1.0\n}'}}

In [55]:
factual_result_2 = await judge.evaluate(
    content="The Great Wall of China is the only man-made structure visible from space with the naked eye.",
    metric="factual"
)
factual_result_2.model_dump()

{'decision': 'FALSE',
 'reasoning': 'The claim is false because numerous man-made structures are visible from space with the naked eye, and the Great Wall of China is not one of them. This has been confirmed by astronauts and space imagery.',
 'score': 0.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "FALSE",\n    "reasoning": "The claim is false because numerous man-made structures are visible from space with the naked eye, and the Great Wall of China is not one of them. This has been confirmed by astronauts and space imagery.",\n    "score": 0.0\n}'}}

In [56]:
factual_result_3 = await judge.evaluate(
    content="This new supplement will make you feel 10 years younger.",
    metric="factual"
)
factual_result_3.model_dump()

{'decision': 'UNVERIFIABLE',
 'reasoning': 'The statement is subjective and cannot be universally verified as feeling 10 years younger is a personal experience and not a scientifically measurable outcome of a supplement.',
 'score': 0.5,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "UNVERIFIABLE",\n    "reasoning": "The statement is subjective and cannot be universally verified as feeling 10 years younger is a personal experience and not a scientifically measurable outcome of a supplement.",\n    "score": 0.5\n}'}}

### NLP Metrics

#### TRANSLATION QUALITY

In [7]:
from vllm_judge import TRANSLATION_QUALITY

In [12]:
translation_result = await judge.evaluate(
    content="The quick brown fox jumps over the lazy dog",
    input="El rÃ¡pido zorro marrÃ³n salta sobre el perro perezoso",
    context="Translate from Spanish to English",
    metric=TRANSLATION_QUALITY
)
translation_result.model_dump()

{'decision': 'EXCELLENT_TRANSLATION',
 'reasoning': "The translation is semantically accurate, grammatically correct, and fluent in English. The phrase 'El rápido zorro marrón salta sobre el perro perezoso' is a well-known tongue twister in Spanish, and the English version 'The quick brown fox jumps over the lazy dog' preserves the meaning and structure. It is also culturally appropriate and consistent in terminology.",
 'score': 0.9,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "EXCELLENT_TRANSLATION",\n    "reasoning": "The translation is semantically accurate, grammatically correct, and fluent in English. The phrase \'El rápido zorro marrón salta sobre el perro perezoso\' is a well-known tongue twister in Spanish, and the English version \'The quick brown fox jumps over the lazy dog\' preserves the meaning and structure. It is also culturally appropriate and consistent in terminology.",\n    "score": 0.9\n}',
  'template_vars': {'input': 'El rápido zorr

#### SUMMARIZATION QUALITY

In [13]:
from vllm_judge import SUMMARIZATION_QUALITY

In [14]:
summarization_result = await judge.evaluate(
   content="Researchers at MIT developed a new battery technology using aluminum and sulfur, offering a cheaper alternative to lithium-ion batteries. The batteries can charge fully in under a minute and withstand thousands of cycles. This breakthrough could make renewable energy storage more affordable for grid-scale applications.",
   input="[Long technical article about MIT's new aluminum-sulfur battery research, discussing materials science, cost benefits, charging capabilities, and potential applications in renewable energy storage...]",
   metric=SUMMARIZATION_QUALITY
)
summarization_result.model_dump()

{'decision': 'GOOD_SUMMARY',
 'reasoning': 'The summary captures the key points of the research, including the materials used, cost benefits, and potential applications. However, it omits details on charging capabilities and the number of cycles the batteries can withstand, which are important aspects of the research.',
 'score': 0.6,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "GOOD_SUMMARY",\n    "reasoning": "The summary captures the key points of the research, including the materials used, cost benefits, and potential applications. However, it omits details on charging capabilities and the number of cycles the batteries can withstand, which are important aspects of the research.",\n    "score": 0.6\n}',
  'template_vars': {'input': "[Long technical article about MIT's new aluminum-sulfur battery research, discussing materials science, cost benefits, charging capabilities, and potential applications in renewable energy storage...]"},
  'template_engine': 'f

In [15]:
result = await judge.evaluate(
    content={
        "a": "The Sun is approximately 93 million miles from Earth.",
        "b": "The Sun is about 150 million kilometers from Earth."
    },
    criteria="accuracy and clarity"
)
result.model_dump()

{'decision': 'Response B',
 'reasoning': 'Response B is more accurate as the average distance from the Sun to the Earth is approximately 149.6 million kilometers, while Response A uses miles which is less precise.',
 'score': None,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "Response B",\n    "reasoning": "Response B is more accurate as the average distance from the Sun to the Earth is approximately 149.6 million kilometers, while Response A uses miles which is less precise.",\n    "score": null\n}'}}