In [1]:
from vllm_judge import Judge, JudgeConfig, Metric, BUILTIN_METRICS

In [2]:
# Show the names under which the metrics in BUILTIN_METRICS are registered
# (the mapping is imported from vllm_judge in the first cell).
BUILTIN_METRICS.keys()

dict_keys(['helpfulness', 'accuracy', 'clarity', 'conciseness', 'relevance', 'safety', 'toxicity', 'code_quality', 'code_security', 'creativity', 'professionalism', 'educational_value', 'preference', 'appropriate', 'factual', 'medical_accuracy', 'legal_appropriateness', 'educational_content_template', 'code_review_template', 'customer_service_template', 'writing_quality_template', 'product_review_template', 'medical_info_template', 'api_docs_template'])

In [2]:
# Build the Judge that every later `judge.evaluate(...)` cell goes through.
# NOTE(review): presumably this URL is the model-serving endpoint (e.g. a local
# vLLM server) — confirm it is running on port 8080 before executing the notebook.
judge = Judge.from_url(base_url="http://localhost:8080")

In [4]:
res = await judge.evaluate(content="I want to bump the version to 1.0.1, is it a good idea?",
                           criteria="Check the professional tone.")

In [5]:
# Display the evaluation result as a plain dict; the recorded output has the
# keys 'decision', 'reasoning', 'score' and 'metadata'.
res.model_dump()

{'decision': False,
 'reasoning': 'The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.',
 'score': None,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": false,\n    "reasoning": "The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.",\n    "score": null\n}'}}

In [5]:
res = await judge.evaluate(content="I want to bump the version to 1.0.1, is it a good idea?",
                           criteria="Check the professional tone.",
                           rubric="Assign a score between 0 and 10 based on the professional tone. 0 is the worst and 10 is the best.")
res.model_dump()

{'decision': 5,
 'reasoning': 'The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.',
 'score': 5.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": 5,\n    "reasoning": "The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.",\n    "score": 5\n}'}}

In [6]:
res = await judge.evaluate(content="I want to bump the version to 1.0.1, is it a good idea?",
                           criteria="Check the professional tone.",
                           rubric={
                               0: "The response is not professional.",
                               5: "The response is somewhat professional.",
                               10: "The response is very professional."
                           })
res.model_dump()

{'decision': 5,
 'reasoning': 'The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.',
 'score': 5.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": 5,\n    "reasoning": "The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.",\n    "score": 5\n}'}}

In [8]:
# Reusable metric bundling criteria, rubric and scale so they can be passed to
# `judge.evaluate(metric=...)` in one argument.
# The rubric asks for a category label while `scale` is numeric; in the
# recorded runs the label came back in 'decision' and a number in 'score'.
professional_tone_metric = Metric(
    name="professional_tone",
    criteria="Assess the professional tone of the provided email body.",
    rubric="Classify the text into 'professional', 'moderate', or 'non-professional' categories.",
    scale=(1, 10),
)

In [9]:
res = await judge.evaluate(response="I want to bump the version to 1.0.1, is it a good idea?",
                           metric=professional_tone_metric)
res.model_dump()

{'decision': 'moderate',
 'reasoning': 'The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.',
 'score': 5.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "moderate",\n    "reasoning": "The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.",\n    "score": 5\n}'}}

In [10]:
res = await judge.evaluate(response="Holy shit, this is a great!",
                           metric=professional_tone_metric)
res.model_dump()

{'decision': 'non-professional',
 'reasoning': 'The response uses informal and expletive language, which is not appropriate for a professional context.',
 'score': 1.0,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": "non-professional",\n    "reasoning": "The response uses informal and expletive language, which is not appropriate for a professional context.",\n    "score": 1\n}'}}

In [1]:
from vllm_judge.api import JudgeClient

# Client for the judge API server, used by the cells below.
# NOTE(review): port 9090 is the API server, not the model endpoint; the
# recorded health-check output reports http://localhost:8080 as its base_url.
client = JudgeClient("http://localhost:9090")

In [2]:
# Check that the API server is reachable before evaluating; the recorded reply
# includes status, version, model, base_url, uptime and the metric count.
await client.health_check()

{'status': 'healthy',
 'version': '0.1.0',
 'model': 'qwen2',
 'base_url': 'http://localhost:8080',
 'uptime_seconds': 62.64390587806702,
 'total_evaluations': 1,
 'active_connections': 0,
 'metrics_available': 24}

In [3]:
result = await client.evaluate(
    response="Python is great!",
    criteria="technical accuracy"
)
result.model_dump() 

{'decision': False,
 'reasoning': 'The response lacks technical detail and does not provide a substantive explanation of why Python is great.',
 'score': None,
 'metadata': {'model': 'qwen2',
  'raw_response': '{\n    "decision": false,\n    "reasoning": "The response lacks technical detail and does not provide a substantive explanation of why Python is great.",\n    "score": null\n}'}}