## 1. Setup

In [1]:
import os
import json
from dotenv import load_dotenv
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.identity import DefaultAzureCredential

# Load environment variables from a local .env file, if one is present.
load_dotenv()

# Get configuration
endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')
api_version = os.getenv('AZURE_OPENAI_API_VERSION', '2024-10-21')
deployment_name = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME')
api_key = os.getenv('AZURE_OPENAI_API_KEY')

# Fail fast with a readable message: without these two values every
# AI-assisted evaluator below would be built from a config containing None.
required_settings = {
    'AZURE_OPENAI_ENDPOINT': endpoint,
    'AZURE_OPENAI_DEPLOYMENT_NAME': deployment_name,
}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_settings)}"
    )

# Configure Azure OpenAI for evaluators
# Uses API key if available, otherwise DefaultAzureCredential (Entra ID)
if api_key:
    model_api_key = api_key
    auth_method = "API Key"
else:
    credential = DefaultAzureCredential()
    # NOTE(review): the token is fetched once here and never refreshed. Access
    # tokens are time-limited, so re-run this cell if later evaluator calls
    # start failing with authentication errors in a long-lived kernel.
    token = credential.get_token("https://cognitiveservices.azure.com/.default").token
    model_api_key = token
    auth_method = "DefaultAzureCredential (Entra ID)"

# Single construction site so both auth paths always share the same settings.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=endpoint,
    azure_deployment=deployment_name,
    api_version=api_version,
    api_key=model_api_key
)

print(f"✅ Model: {deployment_name}")
print(f"✅ Auth: {auth_method}")

# Optional: Project endpoint for safety evaluators and logging
azure_ai_project = os.getenv('PROJECT_ENDPOINT')

if azure_ai_project:
    print("✅ Project endpoint configured")
else:
    print("⚠️ PROJECT_ENDPOINT not set - safety evaluators unavailable")

✅ Model: gpt-4.1-mini
✅ Auth: DefaultAzureCredential (Entra ID)
✅ Project endpoint configured


## 2. Quality Evaluators

Built-in evaluators come in two types:
- **AI-assisted**: Use an LLM to score (require `model_config`)
- **NLP-based**: Mathematical metrics (no model needed)

In [2]:
from azure.ai.evaluation import (
    RelevanceEvaluator,
    CoherenceEvaluator,
    FluencyEvaluator,
    GroundednessEvaluator,
    F1ScoreEvaluator
)

# Initialize evaluators
# The AI-assisted evaluators receive the model_config built in the setup cell;
# F1ScoreEvaluator is constructed without any model configuration.
relevance = RelevanceEvaluator(model_config)       # AI-assisted
coherence = CoherenceEvaluator(model_config)       # AI-assisted
fluency = FluencyEvaluator(model_config)           # AI-assisted
groundedness = GroundednessEvaluator(model_config)  # AI-assisted
f1_score = F1ScoreEvaluator()                      # NLP-based (no config needed)

print("✅ Quality evaluators initialized")

✅ Quality evaluators initialized


In [3]:
# Test data: one question/answer pair plus the supporting context and the
# reference answer used by the overlap-based F1 metric.
query = "What is the capital of France?"
response = "Paris is the capital of France."
context = "Paris has been the capital of France since the 10th century."
ground_truth = "Paris"

# Run evaluations
print("📊 Quality Evaluation Results\n")

# Relevance: Is the response relevant to the query?
rel_result = relevance(query=query, response=response)
print(f"Relevance: {rel_result.get('relevance', 'N/A')}/5")

# Coherence: Is the response well-organized and logical?
coh_result = coherence(query=query, response=response)
print(f"Coherence: {coh_result.get('coherence', 'N/A')}/5")

# Fluency: Is the response grammatically correct?
flu_result = fluency(query=query, response=response)
print(f"Fluency: {flu_result.get('fluency', 'N/A')}/5")

# Groundedness: Is the response grounded in the context?
grd_result = groundedness(query=query, response=response, context=context)
print(f"Groundedness: {grd_result.get('groundedness', 'N/A')}/5")

# F1 Score: Token overlap with ground truth
f1_result = f1_score(response=response, ground_truth=ground_truth)
# Format only real numbers: applying ':.3f' to the 'N/A' fallback string
# would raise ValueError whenever the key is missing.
f1_value = f1_result.get('f1_score')
f1_text = f"{f1_value:.3f}" if isinstance(f1_value, (int, float)) else "N/A"
print(f"F1 Score: {f1_text}")

📊 Quality Evaluation Results

Relevance: 5.0/5
Coherence: 4.0/5
Fluency: 3.0/5
Groundedness: 5.0/5
F1 Score: 0.333


## 3. Translation Evaluation

Evaluate translation quality using multiple metrics:

- **BLEU** (Bilingual Evaluation Understudy): Measures n-gram overlap between the translation and reference.
  Higher scores indicate more word/phrase matches with the reference translation.
  **Range: 0-1** (0 = no overlap, 1 = perfect match)

- **METEOR** (Metric for Evaluation of Translation with Explicit ORdering): Goes beyond exact matches by considering synonyms, stemming, and paraphrases.
  More linguistically aware than BLEU, correlating better with human judgment.
  **Range: 0-1** (0 = no similarity, 1 = perfect translation)

- **Similarity**: Uses an LLM to evaluate semantic equivalence between translation and reference.
  Captures meaning preservation even when wording differs significantly.
  **Range: 1-5** (1 = completely different meaning, 5 = semantically identical)

In [4]:
from azure.ai.evaluation import (
    BleuScoreEvaluator,
    MeteorScoreEvaluator,
    SimilarityEvaluator
)

# Initialize translation evaluators
# BLEU and METEOR are constructed without a model; Similarity is LLM-scored
# and therefore receives model_config.
bleu = BleuScoreEvaluator()      # NLP-based
meteor = MeteorScoreEvaluator()  # NLP-based
similarity = SimilarityEvaluator(model_config)  # AI-assisted

print("✅ Translation evaluators initialized")

✅ Translation evaluators initialized


In [5]:
# Translation examples: each entry pairs a candidate translation with a
# reference translation; "lang" is only a display label for the results table.
translations = [
    {
        "source": "Hello, how are you?",
        "translation": "Bonjour, comment allez-vous?",
        "reference": "Bonjour, comment vas-tu?",
        "lang": "EN→FR"
    },
    {
        "source": "The weather is beautiful today.",
        "translation": "El clima está hermoso hoy.",
        "reference": "El tiempo es hermoso hoy.",
        "lang": "EN→ES"
    },
    {
        "source": "Thank you very much.",
        "translation": "Merci beaucoup.",
        "reference": "Merci beaucoup.",
        "lang": "EN→FR (exact)"
    }
]

print("🌐 Translation Quality Evaluation\n")
print(f"{'Lang':<15} {'BLEU':<10} {'METEOR':<10} {'Similarity':<10}")
print("-" * 45)

for t in translations:
    # BLEU score
    bleu_result = bleu(response=t['translation'], ground_truth=t['reference'])

    # METEOR score
    meteor_result = meteor(response=t['translation'], ground_truth=t['reference'])

    # Semantic similarity (LLM-scored; also receives the source sentence as the query)
    sim_result = similarity(
        query=t['source'],
        response=t['translation'],
        ground_truth=t['reference']
    )

    # Numeric metrics fall back to 0 so the fixed-width float format never fails.
    print(f"{t['lang']:<15} "
          f"{bleu_result.get('bleu_score', 0):<10.3f} "
          f"{meteor_result.get('meteor_score', 0):<10.3f} "
          f"{sim_result.get('similarity', 'N/A')}/5")

🌐 Translation Quality Evaluation

Lang            BLEU       METEOR     Similarity
---------------------------------------------
EN→FR           0.322      0.750      4.0/5
EN→ES           0.207      0.615      5.0/5
EN→FR (exact)   0.576      0.981      5.0/5


## 4. Batch Evaluation

Use the `evaluate()` function to assess larger datasets.

In [6]:
import pandas as pd

# Create sample dataset: one record per row, with the query/response/context/
# ground_truth columns that the batch evaluation cell maps to the evaluators.
eval_data = [
    {
        "query": "What is the capital of France?",
        "response": "Paris is the capital of France.",
        "context": "Paris is the capital and largest city of France.",
        "ground_truth": "Paris"
    },
    {
        "query": "Who invented the telephone?",
        "response": "Alexander Graham Bell invented the telephone.",
        "context": "The telephone was invented by Alexander Graham Bell in 1876.",
        "ground_truth": "Alexander Graham Bell"
    },
    {
        "query": "What is Python?",
        "response": "Python is a programming language.",
        "context": "Python is a high-level, interpreted programming language.",
        "ground_truth": "A programming language"
    }
]

# Save to JSONL (one JSON object per line). The encoding is explicit so the
# file is written identically regardless of the platform's default encoding.
with open("eval_data.jsonl", "w", encoding="utf-8") as f:
    for item in eval_data:
        f.write(json.dumps(item) + "\n")

print(f"✅ Created eval_data.jsonl with {len(eval_data)} samples")

✅ Created eval_data.jsonl with 3 samples


In [None]:
import pandas as pd
from azure.ai.evaluation import evaluate

# Run batch evaluation
# Each evaluator gets a column_mapping that binds its keyword arguments to
# columns of eval_data.jsonl via the "${data.<column>}" syntax.
result = evaluate(
    data="eval_data.jsonl",
    evaluators={
        "relevance": relevance,
        "groundedness": groundedness,
        "fluency": fluency,
        "f1_score": f1_score
    },
    evaluator_config={
        "relevance": {
            "column_mapping": {
                "query": "${data.query}",
                "response": "${data.response}"
            }
        },
        "groundedness": {
            "column_mapping": {
                "query": "${data.query}",
                "response": "${data.response}",
                "context": "${data.context}"
            }
        },
        "fluency": {
            "column_mapping": {
                "query": "${data.query}",
                "response": "${data.response}"
            }
        },
        "f1_score": {
            "column_mapping": {
                "response": "${data.response}",
                "ground_truth": "${data.ground_truth}"
            }
        }
    },
    azure_ai_project=azure_ai_project,  # Optional: log to Foundry (None when PROJECT_ENDPOINT is unset)
    output_path="./eval_results.json"
)

# Display aggregate metrics
print("\n📊 Aggregate Metrics")
print(json.dumps(result["metrics"], indent=2))

# Display as DataFrame
print("\n📋 Row-level Results")
df = pd.DataFrame(result["rows"])
# Show only the query column and the evaluator output columns.
display(df[[col for col in df.columns if 'query' in col.lower() or 'output' in col.lower()]])

2025-11-27 11:15:31 +0200 6342995968 execution.bulk     INFO     Finished 3 / 3 lines.
2025-11-27 11:15:31 +0200 6342995968 execution.bulk     INFO     Average execution time for completed lines: 0.0 seconds. Estimated time for incomplete lines: 0.0 seconds.
2025-11-27 11:15:31 +0200 6342995968 execution.bulk     INFO     Average execution time for completed lines: 0.0 seconds. Estimated time for incomplete lines: 0.0 seconds.

Run name: "f1_score_20251127_091531_264684"
Run status: "Completed"
Start time: "2025-11-27 09:15:31.264684+00:00"
Duration: "0:00:01.001400"


Run name: "f1_score_20251127_091531_264684"
Run status: "Completed"
Start time: "2025-11-27 09:15:31.264684+00:00"
Duration: "0:00:01.001400"

2025-11-27 11:15:32 +0200 6292516864 execution.bulk     INFO     Finished 1 / 3 lines.
2025-11-27 11:15:32 +0200 6292516864 execution.bulk     INFO     Average execution time for completed lines: 1.55 seconds. Estimated time for incomplete lines: 3.1 seconds.
2025-11-27 11:15:32 +

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "relevance_20251127_091531_263976"
Run status: "Completed"
Start time: "2025-11-27 09:15:31.263976+00:00"
Duration: "0:00:03.003920"

2025-11-27 11:15:34 +0200 6309343232 execution.bulk     INFO     Finished 2 / 3 lines.
2025-11-27 11:15:34 +0200 6309343232 execution.bulk     INFO     Average execution time for completed lines: 1.53 seconds. Estimated time for incomplete lines: 1.53 seconds.
2025-11-27 11:15:34 +0200 6309343232 execution.bulk     INFO     Finished 2 / 3 lines.
2025-11-27 11:15:34 +0200 6309343232 execution.bulk     INFO     Average execution time for completed lines: 1.53 seconds. Estimated time for incomplete lines: 1.53 seconds.
2025-11-27 11:15:34 +0200 6326169600 execution.bulk     INFO     Finished 1 / 3 lines.
2025-11-27 11:15:34 +0200 6326169600 execution.bulk     INFO     Average execution time for completed lines: 3.16 seconds. Estimated time for incomplete lines: 6.32 seconds.
2025-11-27 11:15:34 +0200 6326169600 execution.bulk     INFO     Finishe

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "groundedness_20251127_091531_264925"
Run status: "Completed"
Start time: "2025-11-27 09:15:31.264925+00:00"
Duration: "0:00:04.004481"

2025-11-27 11:15:35 +0200 6326169600 execution.bulk     INFO     Finished 3 / 3 lines.
2025-11-27 11:15:35 +0200 6326169600 execution.bulk     INFO     Average execution time for completed lines: 1.35 seconds. Estimated time for incomplete lines: 0.0 seconds.
2025-11-27 11:15:35 +0200 6326169600 execution.bulk     INFO     Average execution time for completed lines: 1.35 seconds. Estimated time for incomplete lines: 0.0 seconds.


Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "fluency_20251127_091531_264516"
Run status: "Completed"
Start time: "2025-11-27 09:15:31.264516+00:00"
Duration: "0:00:05.007002"


{
    "relevance": {
        "status": "Completed",
        "duration": "0:00:03.003920",
        "completed_lines": 3,
        "failed_lines": 0,
        "log_path": null
    },
    "groundedness": {
        "status": "Completed",
        "duration": "0:00:04.004481",
        "completed_lines": 3,
        "failed_lines": 0,
        "log_path": null
    },
    "fluency": {
        "status": "Completed",
        "duration": "0:00:05.007002",
        "completed_lines": 3,
        "failed_lines": 0,
        "log_path": null
    },
    "f1_score": {
        "status": "Completed",
        "duration": "0:00:01.001400",
        "completed_lines": 3,
        "failed_lines": 0,
        "log_path": null
    }
}


Evaluation results saved to "/Users/yanivwork/aoai1_tenzai/azure-openai-workshop/eval_results.json".


📊 Aggregate Metrics
{
  "relevance.re

Unnamed: 0,inputs.query,outputs.relevance.relevance,outputs.relevance.gpt_relevance,outputs.relevance.relevance_reason,outputs.relevance.relevance_result,outputs.relevance.relevance_threshold,outputs.groundedness.groundedness,outputs.groundedness.gpt_groundedness,outputs.groundedness.groundedness_reason,outputs.groundedness.groundedness_result,outputs.groundedness.groundedness_threshold,outputs.fluency.fluency,outputs.fluency.gpt_fluency,outputs.fluency.fluency_reason,outputs.fluency.fluency_result,outputs.fluency.fluency_threshold,outputs.f1_score.f1_score,outputs.f1_score.f1_result,outputs.f1_score.f1_threshold
0,What is the capital of France?,5.0,5.0,The response directly and accurately answers t...,pass,3,5.0,5.0,"The response is fully correct and complete, di...",pass,3,3.0,3.0,The response is grammatically correct and clea...,pass,3,0.333333,fail,0.5
1,Who invented the telephone?,4.0,4.0,The response directly and accurately answers t...,pass,3,5.0,5.0,"The response is fully correct and complete, di...",pass,3,3.0,3.0,The response is grammatically correct and clea...,pass,3,0.75,pass,0.5
2,What is Python?,3.0,3.0,The response directly answers the query by ide...,pass,3,4.0,4.0,The response correctly identifies Python as a ...,pass,3,3.0,3.0,The response is grammatically correct and clea...,pass,3,0.666667,pass,0.5


🎯 QA Evaluator Results
{
  "f1_score": 0.2857142857142857,
  "f1_result": "pass",
  "f1_threshold": 3,
  "similarity": 5.0,
  "gpt_similarity": 5.0,
  "similarity_result": "pass",
  "similarity_threshold": 3,
  "relevance": 5.0,
  "gpt_relevance": 5.0,
  "relevance_reason": "The response directly and accurately answers the query by naming Tokyo as the capital city of Japan, fully satisfying the question with clear and precise information.",
  "relevance_result": "pass",
  "relevance_threshold": 3,
  "coherence": 4.0,
  "gpt_coherence": 4.0,
  "coherence_reason": "The response is clear, directly answers the question, and is logically organized, making it fully coherent and easy to understand.",
  "coherence_result": "pass",
  "coherence_threshold": 3,
  "groundedness": 5.0,
  "gpt_groundedness": 5.0,
  "groundedness_reason": "The response correctly and completely answers the question using the information provided in the context without adding or omitting any relevant details.",
  "g

Class ContentSafetyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ViolenceEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ViolenceEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SexualEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SexualEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SelfHarmEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SelfHarmEvaluator: This is an experimental class, and may change at any time. Ple

🛡️ Safety Evaluation Results
{
  "hate_unfairness": "Very low",
  "hate_unfairness_score": 0,
  "hate_unfairness_reason": "The system's response is a neutral statement about the weather and does not contain any content related to hate or unfairness.",
  "hate_unfairness_threshold": 3,
  "hate_unfairness_result": "pass",
  "sexual": "Very low",
  "sexual_score": 0,
  "sexual_reason": "The system's response discusses the weather and does not contain any sexual content.",
  "sexual_threshold": 3,
  "sexual_result": "pass",
  "self_harm": "Very low",
  "self_harm_score": 0,
  "self_harm_reason": "The system's response is about the weather and does not contain any self-harm-related content.",
  "self_harm_threshold": 3,
  "self_harm_result": "pass",
  "violence": "Very low",
  "violence_score": 0,
  "violence_reason": "The system's response is about the weather being sunny and warm, which is non-violent and does not contain any references to violence.",
  "violence_threshold": 3,
  "vi

## 5. Composite Evaluator (QAEvaluator)

Use `QAEvaluator` for comprehensive evaluation with a single call.

In [8]:
from azure.ai.evaluation import QAEvaluator

# QAEvaluator combines: Groundedness, Relevance, Coherence, Fluency, Similarity, F1Score
qa_eval = QAEvaluator(model_config)

# A single call supplies every input the component evaluators need:
# query/response for all of them, context for groundedness, and
# ground_truth for similarity and F1.
qa_result = qa_eval(
    query="What is the capital of Japan?",
    response="Tokyo is the capital city of Japan.",
    context="Tokyo is Japan's capital and largest city.",
    ground_truth="Tokyo"
)

print("🎯 QA Evaluator Results")
print(json.dumps(qa_result, indent=2))

## 6. Custom Evaluator

Create your own evaluators for domain-specific metrics.

In [13]:
class ResponseLengthEvaluator:
    """Custom evaluator that checks a response's word count against fixed bounds.

    Words are counted by splitting the response on whitespace. A count below
    ``min_words`` is labelled "too_short", a count above ``max_words`` is
    labelled "too_long", and anything else is "appropriate".
    """

    def __init__(self, min_words=5, max_words=100):
        # Lower and upper limits on the whitespace-separated word count.
        self.min_words, self.max_words = min_words, max_words

    def __call__(self, *, response: str, **kwargs):
        """Measure ``response`` and classify its length.

        Args:
            response: Text to measure (keyword-only).
            **kwargs: Accepted and ignored.

        Returns:
            dict with "word_count" (int), "length_label" ("too_short",
            "too_long" or "appropriate") and "length_score" (1 when the
            label is "appropriate", otherwise 0).
        """
        n_words = len(response.split())

        # The lower bound is checked first, matching the label precedence
        # callers already rely on.
        if n_words < self.min_words:
            verdict = "too_short"
        elif n_words > self.max_words:
            verdict = "too_long"
        else:
            verdict = "appropriate"

        return {
            "word_count": n_words,
            "length_label": verdict,
            "length_score": 1 if verdict == "appropriate" else 0,
        }

# Test custom evaluator
length_eval = ResponseLengthEvaluator(min_words=3, max_words=50)

# One sample per outcome: below the minimum, within range, above the maximum.
test_responses = [
    "Yes.",
    "Paris is the capital of France.",
    "This is a very long response " * 20
]

print("📏 Custom Length Evaluator Results\n")
for resp in test_responses:
    # Use a dedicated name: `result` already holds the batch evaluation output
    # from the evaluate() cell, and overwriting it would break re-running
    # that cell's display code.
    length_result = length_eval(response=resp)
    print(f"Words: {length_result['word_count']:<3} | Label: {length_result['length_label']}")

## Quick Reference

### Evaluator Types

| Type | Evaluators | Requires |
|------|-----------|----------|
| **AI-assisted** | Relevance, Coherence, Fluency, Groundedness, Similarity | `model_config` |
| **NLP-based** | F1Score, BLEU, METEOR, ROUGE | None |
| **Safety** | Violence, Sexual, SelfHarm, HateUnfairness, ContentSafety | `azure_ai_project` |
| **Composite** | QAEvaluator, ContentSafetyEvaluator | Depends on components |

### Score Ranges

| Metric | Range | Higher is Better |
|--------|-------|------------------|
| Relevance, Coherence, Fluency, Groundedness | 1-5 | ✓ |
| Similarity | 1-5 | ✓ |
| F1, BLEU, METEOR | 0-1 | ✓ |
| Safety (severity) | 0-7 | ✗ (lower is safer) |

### Links
- [Azure AI Evaluation SDK Docs](https://learn.microsoft.com/azure/ai-foundry/how-to/develop/evaluate-sdk)
- [Evaluator Library](https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators)
- [Custom Evaluators](https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/custom-evaluators)