In [1]:
import openai
import json
from typing import List, Tuple, Dict, Optional
from collections import defaultdict
import instructor
import weave
from set_env import set_env
from pydantic import BaseModel, Field
from typing import List, Literal, Union, Any
from pprint import pprint
from instructor_models import TaskDescription, CombinedTaskDescription, Criterion, EvaluationCriteria, PythonAssertion, LLMAssertion, CriterionAssertions
import asyncio
import nest_asyncio
from criterion_assertion_map import CriterionAssertionMap
from combined_scorer import AssertionScorer, predict_passthrough


In [2]:
# Make the API credentials available to this kernel through the project-local
# `set_env` helper. NOTE(review): where the helper looks the values up
# (.env file, Colab/Kaggle secrets, ...) is defined in set_env, not here.
set_env("OPENAI_API_KEY")
set_env("WANDB_API_KEY")

        Unable to set WANDB_API_KEY=WANDB_API_KEY,
        not in colab or Secrets not set, not kaggle
        or Secrets not set, no .env/dotenv/env file
        in the current working dir or parent dirs.[0m


loading envfile='/Users/anishshah/Documents/Manual Library/GitHub(1)/improve-evals/.env' with dotenv_values(envfile)


In [3]:
# nest_asyncio is only needed when an event loop is already running (as it is
# inside a Jupyter/IPython kernel), so that the later `asyncio.run(...)` cells
# can start a nested loop instead of raising RuntimeError.
# Being able to import IPython only proves the package is installed, so ask
# IPython whether an interactive shell is actually active.
try:
    import IPython
    in_jupyter = IPython.get_ipython() is not None
except ImportError:
    in_jupyter = False
if in_jupyter:
    nest_asyncio.apply()

In [4]:
# NOTE(review): `random` is imported here rather than in the imports cell at
# the top of the notebook.
import random
# The random numeric suffix means each run initialises a (most likely) different
# Weave project name of the form "evalgen_test_<n>".
weave.init(f"evalgen_test_{random.randint(0, 1000000)}")

Logged in as Weights & Biases user: a-sh0ts.
View Weave data at https://wandb.ai/a-sh0ts/evalgen_test_914532/weave


<weave.weave_client.WeaveClient at 0x14ece72c0>

In [5]:
# Async OpenAI client wrapped by instructor; the functions below call
# `client.chat.completions.create(..., response_model=...)` on it.
client = instructor.from_openai(openai.AsyncOpenAI())

In [6]:
# Positional layout of one annotated example; fields are read by index
# (dp[0] .. dp[5]) throughout this notebook. Consumers guard dp[4] and dp[5]
# with length checks and the inline fallback dataset uses 4-element tuples, so
# the trailing optional fields may be missing entirely rather than None.
DataPoint = Tuple[dict, dict, Literal[0, 1], Optional[str], Optional[str], Optional[str]]  # (input, output, annotation, note, human_description_for_task_or_judge, human_description_for_metric_details)

In [7]:
# OpenAI model name passed as `model=` to the instructor chat-completion calls below.
MODEL = "gpt-4o-2024-08-06"

In [8]:
# Select the annotated dataset the evaluators are generated from.
# Each datapoint is read positionally as (input, output, annotation, note, ...).
TEST_TASK = "medical"
if TEST_TASK == "medical":
    data = weave.ref("weave:///a-sh0ts/medical_data_results/object/medical_data_annotations:4utHXhRnO2oquowlrvJxCztretSxtFavBUHVciMZJBw").get()
elif TEST_TASK == "product":
    # No dataset is wired up for this task yet. Fail here with a clear message
    # instead of leaving `data` undefined and hitting a NameError in a later cell.
    raise NotImplementedError('No dataset is configured for TEST_TASK == "product".')
else:
    data = [
        ({"text": "Summarize the impact of climate change on polar bears."}, {"text": "Climate change is reducing sea ice, which polar bears rely on for hunting seals."}, 1, "Accurate and relevant."),
        ({"text": "Explain the process of photosynthesis."}, {"text": "Photosynthesis is the process by which plants use sunlight to synthesize foods from carbon dioxide and water."}, 1, "Correct and detailed."),
        ({"text": "What are the main causes of the American Civil War?"}, {"text": "The main causes were slavery, states' rights, and economic differences."}, 1, "Concise and accurate."),
        ({"text": "Describe the symptoms of COVID-19."}, {"text": "COVID-19 is caused by a virus that originated in bats."}, 0, "Irrelevant and incorrect."),
        ({"text": "What is the significance of the Magna Carta?"}, {"text": "The Magna Carta was a document that limited the power of the king and established certain legal rights."}, 1, "Historically accurate and relevant.")
    ]

In [9]:
pprint(data[0])

WeaveList([{'input': "Dialogue: [doctor] hey dylan what's going on so i lift quite a bit of weights i try to stay in shape as much as i can i'm not like normal people i lift heavy weights and my elbow is extremely sore which elbow is it [patient] actually it's both my elbows but my right elbow is hurting me the most [doctor] okay and you said you lift a lot of weights [patient] mm-hmm [doctor] did you play any sports when you were younger [patient] no anything you can think of primarily it was basketball baseball and football [doctor] okay and did your elbows hurt at that time or is this a a new injury [patient] it's new [doctor] when did it start [patient] probably year and a half ago [doctor] okay on both elbows about a year and a half ago [patient] yeah [doctor] okay have you taken anything for the pain [patient] ibuprofen eight hundred milligrams three times a day [doctor] okay and does anything make it better or worse [patient] the more i use my hands or my arms the more it hurts 

In [10]:
# TODO: Batch this as opposed to one at a time
# or sample the dataset and ensure that taking into tokens (maybe something fun with a distribution)
# distribution = more stuff we can grab and throw into prompt in smart way
@weave.op()
async def get_task_description(data: List[DataPoint]) -> str:
    """Build a task description by refining it with one datapoint at a time.

    For every datapoint the model is shown the description accumulated so far
    together with the datapoint's input, output, annotation and note, and is
    asked for an updated description.

    Args:
        data: Annotated datapoints; only positions 0-3 are read.

    Returns:
        The description after the last datapoint, or "" when `data` is empty.
    """
    running_description = ""

    for example in data:
        input_data = example[0]
        output_data = example[1]
        annotation = example[2]
        note = example[3]
        verdict = "Correct" if annotation == 1 else "Incorrect"

        prompt = f"""
        Current task description: {running_description}

        New datapoint:
        Input: {input_data}
        Output: {output_data}
        Annotation: {verdict}
        Note: {note}

        Based on this new datapoint and the current task description, provide an updated, more refined task description. 
        If this is the first datapoint, create an initial task description.
        Focus on:
        1. The nature of the input and output data
        2. The specific information being extracted or transformed
        3. Any formatting or style requirements
        4. Evaluation criteria (based on the annotation and note)

        Keep the description concise yet comprehensive.
        """

        user_message = {"role": "user", "content": prompt}
        response = await client.chat.completions.create(
            model=MODEL,
            messages=[user_message],
            response_model=TaskDescription
        )

        candidate = response.description

        # TODO: Add guardrails to prevent LLM from saying no update needed
        # A reply that opens with "no update needed" keeps the previous description.
        if not candidate.lower().startswith("no update needed"):
            running_description = candidate

    return running_description

In [11]:
llm_task_description = await get_task_description(data)

In [12]:
@weave.op()
async def combine_human_and_llm_descriptions(data: List[DataPoint], llm_description: str) -> str:
    """Merge human-written task descriptions into the LLM-generated one.

    Args:
        data: Annotated datapoints. A truthy value at position 4, when the
            datapoint is long enough to have one, is treated as a human
            description of the task.
        llm_description: Description produced from the datapoints by the LLM.

    Returns:
        `llm_description` unchanged when no datapoint carries a human
        description; otherwise the `description` field of the model's
        `CombinedTaskDescription` response.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. A set would be
    # iterated in a hash-dependent order that changes between interpreter runs,
    # so the same data could produce a different prompt on every kernel restart.
    human_descriptions = list(dict.fromkeys(
        dp[4] for dp in data if len(dp) > 4 and dp[4]
    ))

    if not human_descriptions:
        return llm_description

    human_context = "\n".join(f"- {desc}" for desc in human_descriptions)

    prompt = f"""
    LLM-generated task description:
    {llm_description}

    Additional human-provided context:
    {human_context}

    Your task is to create a comprehensive, coherent task description that combines insights from both the LLM-generated description and the human-provided context. Ensure that:
    1. The final description is clear and concise.
    2. It incorporates key points from both sources.
    3. Any contradictions are resolved logically.
    4. The description maintains a professional tone.
    5. It provides a complete picture of the task requirements and evaluation criteria.

    Please provide the combined description in a single, well-structured paragraph.
    """

    response = await client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        response_model=CombinedTaskDescription
    )

    return response.description

In [13]:
finalized_task_description = await combine_human_and_llm_descriptions(data, llm_task_description)

In [14]:
finalized_task_description

'The task involves extracting and summarizing structured medical information from a dialogue transcript between a doctor and a patient. This includes the patient\'s complaints, medical history, examination findings, and treatment plans. You will input a WeaveDict containing the dialogue text and output a WeaveDict summarizing these details in a bullet list format. The summary must cover key medical components: chief complaint, history of present illness, physical examination findings, symptoms, medication changes with dosages, and follow-up instructions. If information is lacking, use \'N/A\'. It is crucial to strictly adhere to the format "-key: value" and ensure that personal identifiable information (PII) is omitted, using terms like \'the patient\' instead of names. The task demands conciseness, completeness, accuracy, privacy observance, and format adherence. Evaluation criteria include these aspects to ensure all required information is presented without unnecessary details, stay

In [15]:
def format_single_datapoint(dp: DataPoint, finalized_task_description: str) -> str:
    """Render one annotated datapoint as a plain-text block for use in a prompt.

    Args:
        dp: Datapoint read positionally: input mapping, output mapping,
            annotation (1 is rendered as "Correct", anything else as
            "Incorrect") and note. A truthy sixth element, when present, is
            appended as "Metrics Details".
        finalized_task_description: Text placed on the leading
            "Task Description:" line.

    Returns:
        The newline-joined text block.
    """
    def render_mapping(mapping) -> str:
        # One indented "Key: value" line per entry, with the key capitalised.
        return "\n".join(f"  {name.capitalize()}: {val}" for name, val in mapping.items())

    input_data = dp[0]
    output_data = dp[1]
    annotation = dp[2]
    note = dp[3]
    verdict = "Correct" if annotation == 1 else "Incorrect"

    lines = [f"Task Description: {finalized_task_description}", ""]
    lines += ["Input:", render_mapping(input_data), ""]
    lines += ["Output:", render_mapping(output_data), ""]
    lines += [f"Annotation: {verdict}", f"Note: {note}"]

    # The metrics details live at index 5 and are optional.
    if len(dp) > 5 and dp[5]:
        lines.append(f"Metrics Details: {dp[5]}")

    return "\n".join(lines)

In [16]:
formatted_dp = format_single_datapoint(data[0], finalized_task_description)

In [17]:
pprint(formatted_dp)

('Task Description: The task involves extracting and summarizing structured '
 'medical information from a dialogue transcript between a doctor and a '
 "patient. This includes the patient's complaints, medical history, "
 'examination findings, and treatment plans. You will input a WeaveDict '
 'containing the dialogue text and output a WeaveDict summarizing these '
 'details in a bullet list format. The summary must cover key medical '
 'components: chief complaint, history of present illness, physical '
 'examination findings, symptoms, medication changes with dosages, and '
 "follow-up instructions. If information is lacking, use 'N/A'. It is crucial "
 'to strictly adhere to the format "-key: value" and ensure that personal '
 "identifiable information (PII) is omitted, using terms like 'the patient' "
 'instead of names. The task demands conciseness, completeness, accuracy, '
 'privacy observance, and format adherence. Evaluation criteria include these '
 'aspects to ensure all r

In [18]:
@weave.op()
async def process_criteria(formatted_dp: str, finalized_task_description: str) -> EvaluationCriteria:
    """Ask the model for 1-3 evaluation criteria based on one formatted datapoint.

    Args:
        formatted_dp: Text rendering of a single annotated datapoint.
        finalized_task_description: Task description placed at the top of the prompt.

    Returns:
        The structured `EvaluationCriteria` response (not a string); callers
        read its `criteria` attribute.
    """
    prompt = f"""
Task Description: {finalized_task_description}

Analyze the following annotated datapoint:

{formatted_dp}

Generate 1-3 evaluation criteria that can be used to assess the quality of outputs for this task. Consider the following guidelines:

1. If a 'Metrics Details' field is present in the datapoint, prioritize this information as it provides the most important evaluation criteria.
2. Focus on general aspects of quality that can be used across multiple outputs.
3. Consider criteria that address potential misalignment between LLM outputs and human preferences.
4. Include criteria that can be evaluated both by code and by LLM-based evaluators.
5. Think about criteria that might reveal hallucinations, instruction-following, or other common LLM issues.
6. Generate criteria that could help in debugging or improving the LLM pipeline.

Provide each criterion as a concise statement, followed by a brief explanation of why it's important and how it might be evaluated (e.g., via code, LLM evaluator, or human judgment).

Return the criteria in this format:
[Criterion]: [Brief explanation and evaluation method]
[Criterion]: [Brief explanation and evaluation method]
[Criterion]: [Brief explanation and evaluation method]

Aim for a mix of straightforward, code-evaluable criteria and more nuanced criteria that might require LLM or human evaluation.
"""
    response = await client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        response_model=EvaluationCriteria
    )
    return response

@weave.op()
async def generate_criteria(data: List[DataPoint], finalized_task_description: str) -> List[Criterion]:
    """Collect evaluation criteria for every datapoint, querying them concurrently.

    Args:
        data: Annotated datapoints; each one is formatted and sent to
            `process_criteria` on its own.
        finalized_task_description: Task description shared by all prompts.

    Returns:
        One flat list with the criteria of all datapoints, in datapoint order.
        Near-duplicate criteria are not removed.
    """
    async def criteria_for(datapoint):
        rendered = format_single_datapoint(datapoint, finalized_task_description)
        evaluation_criteria = await process_criteria(rendered, finalized_task_description)
        return evaluation_criteria.criteria

    per_datapoint = await asyncio.gather(*(criteria_for(dp) for dp in data))

    # TODO: Add an additional check to see if a nearly identical criterion is already in the list
    return [criterion for batch in per_datapoint for criterion in batch]

In [19]:
# Generate criteria
criteria = await generate_criteria(data, finalized_task_description)

In [20]:
criteria[0]

Criterion(criterion='Completeness and presence of key medical components', explanation='The summary must address all required fields, such as chief complaint, history of present illness, physical examination findings, etc. Missing components can lead to incomplete medical information. This can be evaluated using code-based assertions to check for the presence of the specified keys.', evaluation_method='code')

In [21]:
criteria

[Criterion(criterion='Completeness and presence of key medical components', explanation='The summary must address all required fields, such as chief complaint, history of present illness, physical examination findings, etc. Missing components can lead to incomplete medical information. This can be evaluated using code-based assertions to check for the presence of the specified keys.', evaluation_method='code'),
 Criterion(criterion='Adherence to formatting and privacy standards', explanation='The structured format (key-value bullet points) and the non-disclosure of PII help in maintaining consistency and privacy, essential for handling sensitive medical information. Format adherence can be assessed through code, while privacy compliance may require LLM evaluation to ensure no personal details are included.', evaluation_method='llm'),
 Criterion(criterion='Conciseness and relevance', explanation='The output must present information succinctly and focus on relevant medical details withou

In [22]:
@weave.op()
async def create_candidate_assertions(formatted_data_string: str, criterion: Criterion) -> CriterionAssertions:
    """Ask the model for 1-3 testable assertions that check one criterion.

    Args:
        formatted_data_string: Text rendering of the annotated data that is
            embedded in the prompt.
        criterion: The criterion to generate assertions for; its fields are
            embedded in the prompt as a dict.

    Returns:
        The structured `CriterionAssertions` response from the model.
    """
    # model_dump() is the pydantic v2 replacement for the deprecated .dict().
    prompt = f"""
Given the following evaluation criterion and annotated data, generate 1-3 specific, testable assertions:

Criterion: {criterion.model_dump()}

Annotated data: {formatted_data_string}

Your task is to create assertions that can be used to evaluate LLM outputs based on this criterion. Follow these guidelines:

1. Make each assertion clear, concise, and directly related to the criterion
2. For Python assertions:
   - Provide a valid Python method that can be used within a unittest.TestCase class
   - Ensure the method name is in snake case and starts with test_
   - The method should take 'self' as the only input, where 'self.output' is a dictionary containing the LLM output being evaluated
   - The 'self.output' dictionary will have the same keys and shape as the output in the annotated data
   - Use unittest assertion methods (e.g., self.assertTrue, self.assertEqual) to test the output
   - The test should pass if the assertion is met, and fail otherwise
   - Only use the keys and shapes present in the annotated data output for your assertions
3. For LLM assertions:
   - Provide a clear, detailed prompt for an LLM to evaluate the assertion
   - The prompt should guide the LLM to return "PASS" or "FAIL" based on the evaluation
4. Include a mix of positive and negative assertions where appropriate
5. Consider edge cases and potential failure modes for the criterion
6. Aim for assertions that could be applied across multiple types of outputs

Ensure that your assertions are directly evaluable and avoid vague or subjective language. Focus on creating assertions that align with human preferences and can be used to validate the quality of LLM-generated evaluations.

Format your response as a JSON object with the following structure:
{{
  "assertions": [
    {{
      "test_name": "Name of the test case method in snake case",
      "text" or "code": "Assertion text or code",
      "evaluation_type": "python" or "llm"
    }},
    ...
  ]
}}
"""
    response = await client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        response_model=CriterionAssertions
    )
    return response

In [23]:
#TODO: improve this function
def format_all_datapoints(data: List[DataPoint], finalized_task_description: str) -> str:
    """Render every annotated datapoint into one text block for use in a prompt.

    Args:
        data: Annotated datapoints; only positions 0-3 (input, output,
            annotation, note) are read.
        finalized_task_description: Text placed on the leading
            "Task Description:" line.

    Returns:
        The task description followed by one numbered "Example N:" section per
        datapoint (numbering starts at 1), each ending with a dashed separator.
    """
    formatted = [f"Task Description: {finalized_task_description}\n"]

    for i, dp in enumerate(data, 1):
        input_data, output_data, annotation, note = dp[0], dp[1], dp[2], dp[3]

        formatted.extend([
            f"Example {i}:",
            "Input:",
            # default=str keeps formatting from raising TypeError when a value is
            # not JSON-native; JSON-serialisable data renders exactly as before.
            json.dumps(input_data, indent=2, default=str),
            "",
            "Output:",
            json.dumps(output_data, indent=2, default=str),
            "",
            f"Annotation: {'Correct' if annotation == 1 else 'Incorrect'}",
            f"Note: {note}",
            "\n" + "-"*50 + "\n"  # Separator between examples
        ])

    return "\n".join(formatted)

In [24]:
formatted_data = format_all_datapoints(data, finalized_task_description)

In [25]:
pprint(formatted_data)

('Task Description: The task involves extracting and summarizing structured '
 'medical information from a dialogue transcript between a doctor and a '
 "patient. This includes the patient's complaints, medical history, "
 'examination findings, and treatment plans. You will input a WeaveDict '
 'containing the dialogue text and output a WeaveDict summarizing these '
 'details in a bullet list format. The summary must cover key medical '
 'components: chief complaint, history of present illness, physical '
 'examination findings, symptoms, medication changes with dosages, and '
 "follow-up instructions. If information is lacking, use 'N/A'. It is crucial "
 'to strictly adhere to the format "-key: value" and ensure that personal '
 "identifiable information (PII) is omitted, using terms like 'the patient' "
 'instead of names. The task demands conciseness, completeness, accuracy, '
 'privacy observance, and format adherence. Evaluation criteria include these '
 'aspects to ensure all r

In [26]:
# evalgen.ipynb

@weave.op()
async def generate_all_assertions(criteria, formatted_data):
    """Generate candidate assertions for every criterion concurrently.

    Args:
        criteria: Criteria to generate assertions for.
        formatted_data: Text rendering of the annotated data, shared by all prompts.

    Returns:
        A `CriterionAssertionMap` built via `from_assertions` from the
        (criterion, assertions) pairs, in the order of `criteria`.
    """
    async def assertions_for(criterion):
        generated = await create_candidate_assertions(formatted_data, criterion)
        return criterion, generated.assertions

    pairs = await asyncio.gather(*map(assertions_for, criteria))

    # Build the map through the alternative constructor.
    return CriterionAssertionMap.from_assertions(pairs)

In [27]:
# Usage
all_assertions = await generate_all_assertions(criteria, formatted_data)

In [28]:
all_assertions.criterion_to_assertions

{'Completeness and presence of key medical components': [PythonAssertion(test_name='test_presence_of_required_fields', code="def test_presence_of_required_fields(self):\n    required_fields = ['chief complaint', 'history of present illness', 'physical examination', 'symptoms experienced by the patient', 'new medications prescribed or changed', 'follow-up instructions']\n    for field in required_fields:\n        self.assertIn(field, self.output)\n", evaluation_type='python'),
  PythonAssertion(test_name='test_correctness_of_field_values', code="def test_correctness_of_field_values(self):\n    self.assertEqual(self.output['chief complaint'], 'Bilateral elbow pain, right worse than left.')\n    self.assertEqual(self.output['new medications prescribed or changed'], 'N/A')\n", evaluation_type='python'),
  LLMAssertion(test_name='evaluate_key_medical_components', text="Evaluate the given summary to ensure it contains all key medical components such as chief complaint, history of present ill

In [29]:
# Split the generated assertions by implementation type. Every criterion stays
# a key in both dicts, with an empty list when it has no assertion of that type.

# Code-based (unittest-style) assertions
python_assertions = {
    name: list(filter(lambda item: isinstance(item, PythonAssertion), candidates))
    for name, candidates in all_assertions.criterion_to_assertions.items()
}

# Assertions judged by an LLM
llm_assertions = {
    name: list(filter(lambda item: isinstance(item, LLMAssertion), candidates))
    for name, candidates in all_assertions.criterion_to_assertions.items()
}

In [30]:
# List the names of the code-based assertions, grouped by criterion.
print("Python Assertions:")
for criterion, assertions in python_assertions.items():
    if not assertions:
        continue  # nothing to show for this criterion
    print(f"\n{criterion}:")
    print("\n".join(f"  - {assertion.test_name}" for assertion in assertions))

Python Assertions:

Completeness and presence of key medical components:
  - test_presence_of_required_fields
  - test_correctness_of_field_values

Adherence to formatting and privacy standards:
  - test_format_adherence

Conciseness and relevance:
  - test_conciseness_within_word_limit
  - test_format_adherence_bullet_points

Inclusion of Key Components:
  - test_inclusion_of_all_components
  - test_correct_formatting

Privacy and PII Omission:
  - test_no_pii_in_output
  - test_patient_reference_as_generic

Conciseness and Word Limit:
  - test_summary_word_limit
  - test_format_conciseness

Word count limitation:
  - test_word_count_within_limit
  - test_word_count_edge_case

Presence of targeted keys:
  - test_presence_of_all_medical_components
  - test_correct_format_and_structure
  - test_absence_of_personal_information


In [31]:
# List the names of the LLM-judged assertions, grouped by criterion.
print("\nLLM Assertions:")
for criterion, assertions in llm_assertions.items():
    if not assertions:
        continue  # nothing to show for this criterion
    print(f"\n{criterion}:")
    print("\n".join(f"  - {assertion.test_name}" for assertion in assertions))


LLM Assertions:

Completeness and presence of key medical components:
  - evaluate_key_medical_components

Adherence to formatting and privacy standards:
  - test_no_inclusion_of_pii
  - test_conciseness_and_completeness

Conciseness and relevance:
  - test_medical_relevance_check

Inclusion of Key Components:
  - validate_completeness

Privacy and PII Omission:
  - llm_response_does_not_contain_pii

Conciseness and Word Limit:
  - test_information_completeness

Word count limitation:
  - evaluate_word_count_limit

Absence of personal identifiable information (PII):
  - absence_of_pii_check
  - use_of_placeholders_for_pii
  - no_personal_info_in_output


In [32]:
def convert_datapoint_to_example(task_description: str, data: List[DataPoint]) -> List[Dict[str, Any]]:
    """Turn positional datapoints into the dict rows used as the evaluation dataset.

    Args:
        task_description: Stored unchanged on every row.
        data: Annotated datapoints; only positions 0-3 are read.

    Returns:
        One dict per datapoint with the keys "task_description", "input_data",
        "model_output" (the output nested under "output"), "annotation" and
        "note".
    """
    return [
        {
            "task_description": task_description,
            "input_data": dp[0],
            "model_output": {"output": dp[1]},
            "annotation": dp[2],
            "note": dp[3],
        }
        for dp in data
    ]

In [33]:
annotation_examples = convert_datapoint_to_example(finalized_task_description, data)

In [34]:

# Initialize the AssertionScorer with the assertions.
# The judge uses the notebook-wide MODEL constant so that changing the model in
# the configuration cell also changes the model used for scoring.
scorer = AssertionScorer(
    criterion_assertion_map=all_assertions,
    llm_model=MODEL,
    prompt_template="""
Task Description:
{task_description}

Evaluate the following output based on the given task, input, and assertion:

Input:
{input_data}

Output:
{model_output}

Assertion:
{assertion_text}

Consider the task description and input when evaluating the output against the assertion.
Respond with either 'PASS' if the output meets the assertion criteria in the context of the task and input, or 'FAIL' if it does not.
""",
    system_prompt="You are an AI assistant evaluating the quality of text outputs based on given tasks, inputs, and assertions."
)


# TODO: figure out how to get each examples individual results as opposed to aggregate
# Create a custom summarize function?
evaluation = weave.Evaluation(
    scorers=[scorer],
    dataset=annotation_examples,
)


# NOTE(review): `assertion_results` is assigned again by a later cell with a
# differently shaped per-example result; here it holds the aggregate summary.
assertion_results = asyncio.run(evaluation.evaluate(predict_passthrough))



In [35]:
assertion_results

{'AssertionScorer': {'Conciseness and Word Limit': {'test_information_completeness': {'score': {'mean': 1.0}},
   'test_summary_word_limit': {'score': {'mean': 0.0}},
   'test_format_conciseness': {'score': {'mean': 0.0}}},
  'Privacy and PII Omission': {'llm_response_does_not_contain_pii': {'score': {'mean': 1.0}},
   'test_no_pii_in_output': {'score': {'mean': 0.0}},
   'test_patient_reference_as_generic': {'score': {'mean': 0.0}}},
  'Inclusion of Key Components': {'validate_completeness': {'score': {'mean': 1.0}},
   'test_inclusion_of_all_components': {'score': {'mean': 0.0}},
   'test_correct_formatting': {'score': {'mean': 0.0}}},
  'Completeness and presence of key medical components': {'evaluate_key_medical_components': {'score': {'mean': 1.0}},
   'test_presence_of_required_fields': {'score': {'mean': 0.0}},
   'test_correctness_of_field_values': {'score': {'mean': 0.0}}},
  'Absence of personal identifiable information (PII)': {'absence_of_pii_check': {'score': {'mean': 0.33

In [36]:
@weave.op()
async def evaluate(scorer: AssertionScorer, annotation_examples: List[Dict[str, Any]]) -> Dict[str, Dict[str, List[Tuple[Dict[str, Any], int]]]]:
    """Score every example with all assertions and keep the per-example results.

    Args:
        scorer: Scorer whose `score(...)` result is iterated as
            criterion -> assertion name -> score.
        annotation_examples: Rows with the keys "task_description",
            "input_data", "model_output" (containing "output") and "annotation".

    Returns:
        criterion -> assertion name -> list of (score, human_annotation)
        pairs, one pair per example, in the order of `annotation_examples`.
        `score` is whatever the scorer returned for that assertion.
    """
    async def process_example(example):
        result = await scorer.score(
            model_output={"output": example["model_output"]["output"]},
            task_description=example["task_description"],
            input_data=example["input_data"]
        )
        return result, example["annotation"]

    # Run all examples concurrently; gather preserves the input order.
    results = await asyncio.gather(*[process_example(example) for example in annotation_examples])

    # Regroup from "per example" to "per criterion / per assertion".
    criterion_assertion_results = {}
    for criterion_results, human_annotation in results:
        for criterion, per_assertion in criterion_results.items():
            bucket = criterion_assertion_results.setdefault(criterion, {})
            for assertion_name, score in per_assertion.items():
                bucket.setdefault(assertion_name, []).append((score, human_annotation))

    return criterion_assertion_results

In [37]:
assertion_results = asyncio.run(evaluate(scorer, annotation_examples))

In [38]:
assertion_results

{'Completeness and presence of key medical components': {'evaluate_key_medical_components': [({'score': 1,
     'result': 'PASS',
     'type': 'llm'},
    1),
   ({'score': 1, 'result': 'PASS', 'type': 'llm'}, 1),
   ({'score': 1, 'result': 'PASS', 'type': 'llm'}, 1)],
  'test_correctness_of_field_values': [({'score': 0,
     'result': 'ERROR',
     'type': 'code'},
    1),
   ({'score': 0, 'result': 'ERROR', 'type': 'code'}, 1),
   ({'score': 0, 'result': 'ERROR', 'type': 'code'}, 1)],
  'test_presence_of_required_fields': [({'score': 0,
     'result': 'FAIL',
     'type': 'code'},
    1),
   ({'score': 0, 'result': 'FAIL', 'type': 'code'}, 1),
   ({'score': 0, 'result': 'FAIL', 'type': 'code'}, 1)]},
 'Adherence to formatting and privacy standards': {'test_no_inclusion_of_pii': [({'score': 1,
     'result': 'PASS',
     'type': 'llm'},
    1),
   ({'score': 1, 'result': 'PASS', 'type': 'llm'}, 1),
   ({'score': 1, 'result': 'PASS', 'type': 'llm'}, 1)],
  'test_conciseness_and_complet

1. **Selectivity**:
   ```python
   selectivity = passes / total_outputs
   ```
   Selectivity measures how often an assertion passes LLM outputs. A lower selectivity means the assertion is more "picky" or strict.

2. **Coverage**:
   ```python
   coverage = fails_on_bad / total_bad if total_bad > 0 else 1  # no bad outputs: coverage is treated as perfect
   ```
   Coverage measures how well our assertions catch the outputs that humans marked as bad. A higher coverage means we're better at identifying problematic outputs.

3. **False Failure Rate (FFR)**:
   ```python
   ffr = fails_on_good / total_good if total_good > 0 else 0
   ```
   FFR shows how often our assertions incorrectly fail outputs that humans thought were good. A lower FFR is better, as it means we're not being overly strict.

4. **Alignment**:
   ```python
   alignment = 2 * (coverage * (1 - ffr)) / (coverage + (1 - ffr)) if (coverage + (1 - ffr)) > 0 else 0
   ```
   Alignment combines coverage and FFR into a single score. It represents how well our automated evaluations match human judgments overall.

These metrics help us refine our assertion set over time, aiming to catch more bad outputs while avoiding false alarms on good ones.

In [39]:
def calculate_metrics(assertion_results: Dict[str, Dict[str, List[Tuple[Dict[str, Any], int]]]]) -> Dict[str, Any]:
    """
    Score every assertion, and every criterion as a whole, against human annotations.

    Args:
        assertion_results: Mapping of criterion -> assertion name -> list of
            (score_dict, human_annotation) pairs. Each score_dict is read for its
            'score' key (1 counts as a pass, 0 as a fail), and the 'type' of the
            first pair is reported as the assertion's type. An annotation of 0
            counts as a bad output; every other annotation counts towards
            total_good, but only an annotation of 1 can produce a fails_on_good.

    Returns:
        Mapping of criterion -> {"per_assertion": ..., "criterion_metrics": ...}.
        Both levels hold selectivity, coverage, ffr, alignment and the raw counts
        they were derived from; per-assertion entries additionally hold "type".
        Criterion-level figures are computed from the counts summed over all of
        the criterion's assertions.
    """
    count_keys = ("total_outputs", "total_good", "total_bad", "passes", "fails_on_bad", "fails_on_good")

    def tally(results):
        """Count passes and failures, split by the human annotation, in one pass."""
        total_outputs = len(results)
        passes = total_bad = fails_on_bad = fails_on_good = 0
        for score_dict, human_annotation in results:
            score = score_dict['score']
            if score == 1:
                passes += 1
            failed = score == 0
            if human_annotation == 0:
                total_bad += 1
                if failed:
                    fails_on_bad += 1
            elif human_annotation == 1 and failed:
                fails_on_good += 1
        return {
            "total_outputs": total_outputs,
            "total_good": total_outputs - total_bad,
            "total_bad": total_bad,
            "passes": passes,
            "fails_on_bad": fails_on_bad,
            "fails_on_good": fails_on_good,
        }

    def summarize(total_outputs, total_good, total_bad, passes, fails_on_bad, fails_on_good):
        """Derive the rate metrics from raw counts; shared by both reporting levels."""
        selectivity = passes / total_outputs if total_outputs > 0 else 0
        # With no bad outputs there is nothing to miss, so coverage is treated as perfect.
        coverage = fails_on_bad / total_bad if total_bad > 0 else 1
        # With no good outputs there can be no false failures.
        ffr = fails_on_good / total_good if total_good > 0 else 0

        # Alignment is the harmonic mean of coverage and (1 - ffr).
        pass_rate_on_good = 1 - ffr
        denominator = coverage + pass_rate_on_good
        if denominator > 0:
            alignment = 2 * (coverage * pass_rate_on_good) / denominator
        else:
            alignment = 0

        return {
            "selectivity": selectivity,
            "coverage": coverage,
            "ffr": ffr,
            "alignment": alignment,
            "total_outputs": total_outputs,
            "total_good": total_good,
            "total_bad": total_bad,
            "passes": passes,
            "fails": total_outputs - passes,
            "fails_on_bad": fails_on_bad,
            "fails_on_good": fails_on_good,
        }

    report = {}
    for criterion, assertions in assertion_results.items():
        per_assertion = {}
        running_totals = dict.fromkeys(count_keys, 0)

        for assertion, results in assertions.items():
            counts = tally(results)
            eval_type = results[0][0]['type'] if results else "unknown"
            per_assertion[assertion] = {"type": eval_type, **summarize(**counts)}

            # Fold this assertion's counts into the criterion-level totals.
            for key in count_keys:
                running_totals[key] += counts[key]

        report[criterion] = {
            "per_assertion": per_assertion,
            "criterion_metrics": summarize(**running_totals),
        }

    return report

In [40]:
# Compute per-assertion and per-criterion metrics for every scored assertion
metrics = calculate_metrics(assertion_results)
# Pretty-print the nested metrics dictionary as indented JSON
print("Assertion metrics:", json.dumps(metrics, indent=2))

Assertion metrics: {
  "Completeness and presence of key medical components": {
    "per_assertion": {
      "evaluate_key_medical_components": {
        "type": "llm",
        "selectivity": 1.0,
        "coverage": 1,
        "ffr": 0.0,
        "alignment": 1.0,
        "total_outputs": 3,
        "total_good": 3,
        "total_bad": 0,
        "passes": 3,
        "fails": 0,
        "fails_on_bad": 0,
        "fails_on_good": 0
      },
      "test_correctness_of_field_values": {
        "type": "code",
        "selectivity": 0.0,
        "coverage": 1,
        "ffr": 1.0,
        "alignment": 0.0,
        "total_outputs": 3,
        "total_good": 3,
        "total_bad": 0,
        "passes": 0,
        "fails": 3,
        "fails_on_bad": 0,
        "fails_on_good": 3
      },
      "test_presence_of_required_fields": {
        "type": "code",
        "selectivity": 0.0,
        "coverage": 1,
        "ffr": 1.0,
        "alignment": 0.0,
        "total_outputs": 3,
        "total

In [41]:
def select_best_assertions(
    metrics: Dict[str, Any],
    assertion_results: Dict[str, Dict[str, List[Tuple[Dict[str, Any], int]]]],
    num_assertions_per_criterion: Optional[int] = None
) -> Dict[str, Dict[str, str]]:
    """
    Select the subset of assertions that maximize the alignment score per criterion.

    Args:
        metrics: Dictionary containing metrics per criterion and per assertion,
            in the shape returned by calculate_metrics.
        assertion_results: Original assertion results used by calculate_metrics.
        num_assertions_per_criterion: Number of assertions to select per criterion.
            If None, every non-empty subset of the criterion's assertions is scored
            with calculate_metrics and the subset with the highest criterion-level
            alignment is kept (ties go to the first subset found, i.e. the smallest).
            This search evaluates 2**n - 1 subsets for n assertions.
            If an integer, the top-N assertions by individual alignment are kept;
            0 selects no assertions.

    Returns:
        A dictionary mapping each criterion to its selected assertions and their types.
        Format: {criterion: {assertion_name: assertion_type, ...}, ...}

    Raises:
        KeyError: If a criterion or selected assertion is missing from `metrics`.
    """
    from itertools import combinations

    best_assertions = {}

    for criterion in assertion_results.keys():
        assertion_names = list(assertion_results[criterion].keys())

        # Compare against None explicitly: a plain truthiness test would treat an
        # explicit 0 as "not specified" and silently run the exhaustive search.
        if num_assertions_per_criterion is None:
            # Track the best-scoring subset seen so far; -1 is below any real alignment.
            max_alignment = -1
            best_subset = []

            # Enumerate all non-empty subsets, smallest first.
            for r in range(1, len(assertion_names) + 1):
                for subset in combinations(assertion_names, r):
                    # Restrict the results to this criterion and this subset only.
                    subset_assertion_results = {
                        criterion: {
                            assertion: assertion_results[criterion][assertion]
                            for assertion in subset
                        }
                    }

                    subset_metrics = calculate_metrics(subset_assertion_results)
                    alignment = subset_metrics[criterion]['criterion_metrics']['alignment']

                    # Strict comparison keeps the earliest subset on ties.
                    if alignment > max_alignment:
                        max_alignment = alignment
                        best_subset = subset

            # Assertion types come from the caller-supplied full metrics.
            best_assertions[criterion] = {
                assertion_name: metrics[criterion]['per_assertion'][assertion_name]['type']
                for assertion_name in best_subset
            }
        else:
            per_assertion_metrics = metrics[criterion]['per_assertion']

            # Rank assertions by their individual alignment, best first.
            sorted_assertions = sorted(
                per_assertion_metrics.items(),
                key=lambda item: item[1]['alignment'],
                reverse=True
            )

            selected_assertions = sorted_assertions[:num_assertions_per_criterion]

            best_assertions[criterion] = {
                assertion_name: assertion_metrics['type']
                for assertion_name, assertion_metrics in selected_assertions
            }

    return best_assertions

In [42]:
# Requires 'metrics' and 'assertion_results' from the cells above
best_assertions = select_best_assertions(
    metrics,
    assertion_results,
    num_assertions_per_criterion=None  # None: search assertion subsets for the best criterion alignment
)

# 'best_assertions' maps each criterion to {assertion_name: assertion_type} for the selected assertions
print(json.dumps(best_assertions, indent=2))

{
  "Completeness and presence of key medical components": {
    "evaluate_key_medical_components": "llm"
  },
  "Adherence to formatting and privacy standards": {
    "test_no_inclusion_of_pii": "llm"
  },
  "Conciseness and relevance": {
    "test_medical_relevance_check": "llm"
  },
  "Inclusion of Key Components": {
    "validate_completeness": "llm"
  },
  "Privacy and PII Omission": {
    "llm_response_does_not_contain_pii": "llm"
  },
  "Conciseness and Word Limit": {
    "test_information_completeness": "llm"
  },
  "Word count limitation": {
    "evaluate_word_count_limit": "llm"
  },
  "Absence of personal identifiable information (PII)": {
    "use_of_placeholders_for_pii": "llm"
  },
  "Presence of targeted keys": {
    "test_absence_of_personal_information": "code"
  }
}


In [43]:
# TODO: Also add filtering based on criteria so that no two assertions address the same criterion
# Display the selected assertions per criterion
best_assertions

{'Completeness and presence of key medical components': {'evaluate_key_medical_components': 'llm'},
 'Adherence to formatting and privacy standards': {'test_no_inclusion_of_pii': 'llm'},
 'Conciseness and relevance': {'test_medical_relevance_check': 'llm'},
 'Inclusion of Key Components': {'validate_completeness': 'llm'},
 'Privacy and PII Omission': {'llm_response_does_not_contain_pii': 'llm'},
 'Conciseness and Word Limit': {'test_information_completeness': 'llm'},
 'Word count limitation': {'evaluate_word_count_limit': 'llm'},
 'Absence of personal identifiable information (PII)': {'use_of_placeholders_for_pii': 'llm'},
 'Presence of targeted keys': {'test_absence_of_personal_information': 'code'}}

In [44]:
def filter_assertion_results(
    assertion_results: Dict[str, Dict[str, Any]],
    best_assertions: Dict[str, Dict[str, str]]
) -> Dict[str, Dict[str, Any]]:
    """
    Keep only the results belonging to the selected assertions.

    Args:
        assertion_results: Mapping of criterion -> assertion name -> results.
        best_assertions: Mapping of criterion -> selected assertion names.

    Returns:
        A new mapping, ordered like `best_assertions`, holding the results of
        each selected assertion. Criteria absent from `assertion_results` are
        omitted, as are assertion names absent from their criterion's results.
        The result objects themselves are not copied.
    """
    kept_by_criterion = {}

    for criterion, chosen_names in best_assertions.items():
        # Nothing to keep for a criterion that was never scored.
        if criterion not in assertion_results:
            continue

        available = assertion_results[criterion]
        kept = {}
        for assertion_name in chosen_names:
            if assertion_name in available:
                kept[assertion_name] = available[assertion_name]
        kept_by_criterion[criterion] = kept

    return kept_by_criterion

# Keep only the results of the assertions chosen in best_assertions
filtered_assertion_results = filter_assertion_results(assertion_results, best_assertions)

# Recalculate per-assertion and per-criterion metrics on the reduced assertion set
new_metrics = calculate_metrics(filtered_assertion_results)

In [52]:
def select_best_criteria(
    metrics: Dict[str, Any],
    alignment_threshold: float,
    num_criteria: Optional[int] = None
) -> Dict[str, Any]:
    """
    Pick the criteria whose criterion-level alignment meets a threshold.

    Args:
        metrics: Mapping of criterion -> data, where each data entry exposes
            data['criterion_metrics']['alignment'].
        alignment_threshold: Minimum alignment (inclusive) a criterion needs.
        num_criteria: If given, keep at most this many of the qualifying
            criteria, taken from the top of the ranking.

    Returns:
        A dictionary of the selected criteria and their original data entries,
        ordered by alignment from highest to lowest.
    """
    def alignment_of(entry):
        return entry[1]['criterion_metrics']['alignment']

    # Rank every criterion first, highest alignment at the front.
    ranked = list(metrics.items())
    ranked.sort(key=alignment_of, reverse=True)

    # Then drop the ones that fall short of the threshold.
    qualifying = []
    for entry in ranked:
        if alignment_of(entry) >= alignment_threshold:
            qualifying.append(entry)

    # Optionally cap how many criteria are returned.
    if num_criteria is not None:
        qualifying = qualifying[:num_criteria]

    return dict(qualifying)

# Example usage: pick criteria from the metrics recomputed on the selected assertions
alignment_threshold = 0.7  # Minimum criterion-level alignment (inclusive) a criterion must reach
num_criteria = 3  # Upper bound on the number of criteria kept; fewer are returned if few meet the threshold
best_criteria = select_best_criteria(new_metrics, alignment_threshold, num_criteria)

# Report each selected criterion with its alignment, rounded to two decimals
print(f"Selected top {num_criteria} Criteria based on alignment threshold {alignment_threshold}:")
for criterion in best_criteria:
    alignment = best_criteria[criterion]['criterion_metrics']['alignment']
    print(f"- {criterion} (Alignment: {alignment:.2f})")

Selected top 3 Criteria based on alignment threshold 0.7:
- Completeness and presence of key medical components (Alignment: 1.00)
- Adherence to formatting and privacy standards (Alignment: 1.00)
- Conciseness and relevance (Alignment: 1.00)


In [53]:
# Display the selected criteria with their per-assertion and criterion-level metrics
best_criteria

{'Completeness and presence of key medical components': {'per_assertion': {'evaluate_key_medical_components': {'type': 'llm',
    'selectivity': 1.0,
    'coverage': 1,
    'ffr': 0.0,
    'alignment': 1.0,
    'total_outputs': 3,
    'total_good': 3,
    'total_bad': 0,
    'passes': 3,
    'fails': 0,
    'fails_on_bad': 0,
    'fails_on_good': 0}},
  'criterion_metrics': {'selectivity': 1.0,
   'coverage': 1,
   'ffr': 0.0,
   'alignment': 1.0,
   'total_outputs': 3,
   'total_good': 3,
   'total_bad': 0,
   'passes': 3,
   'fails': 0,
   'fails_on_bad': 0,
   'fails_on_good': 0}},
 'Adherence to formatting and privacy standards': {'per_assertion': {'test_no_inclusion_of_pii': {'type': 'llm',
    'selectivity': 1.0,
    'coverage': 1,
    'ffr': 0.0,
    'alignment': 1.0,
    'total_outputs': 3,
    'total_good': 3,
    'total_bad': 0,
    'passes': 3,
    'fails': 0,
    'fails_on_bad': 0,
    'fails_on_good': 0}},
  'criterion_metrics': {'selectivity': 1.0,
   'coverage': 1,
   'f

In [47]:
# Requires all_assertions, best_criteria, and the original criteria list from earlier cells.
# NOTE(review): this cell is numbered In [47] but sits after In [52]/In [53], which define
# best_criteria. The map shown in the next cell's output lists nine criteria even though
# select_best_criteria is called with num_criteria = 3, so this cell was presumably last run
# against an earlier best_criteria value — restart the kernel and run all cells to confirm.

# Create a new CriterionAssertionMap instance to hold only the selected assertions
filtered_criterion_assertion_map = CriterionAssertionMap()

# Create a dictionary for quick lookup of original Criterion objects by their criterion text
original_criteria = {c.criterion: c for c in criteria}

# Filter and add assertions based on best_criteria
for criterion_name, criterion_data in best_criteria.items():
    # Criteria without a matching original Criterion object are skipped silently
    if criterion_name in original_criteria:
        original_criterion = original_criteria[criterion_name]
        for assertion_name in criterion_data['per_assertion'].keys():
            # NOTE(review): this lookup does not depend on assertion_name; it could presumably
            # be hoisted above this loop — confirm get_assertions_by_criterion has no side effects.
            assertions = all_assertions.get_assertions_by_criterion(criterion_name)
            if assertions:
                for assertion in assertions:
                    # Match the stored assertion to the selected one by its test name
                    if assertion.test_name == assertion_name:
                        filtered_criterion_assertion_map.add_assertion(
                            original_criterion,
                            assertion
                        )

# Now filtered_criterion_assertion_map contains only the best criteria and their corresponding assertions

In [48]:
# Inspect the assertion-name -> criterion mapping held by the filtered map
filtered_criterion_assertion_map.assertion_to_criterion

{'evaluate_key_medical_components': 'Completeness and presence of key medical components',
 'test_no_inclusion_of_pii': 'Adherence to formatting and privacy standards',
 'test_medical_relevance_check': 'Conciseness and relevance',
 'validate_completeness': 'Inclusion of Key Components',
 'llm_response_does_not_contain_pii': 'Privacy and PII Omission',
 'test_information_completeness': 'Conciseness and Word Limit',
 'evaluate_word_count_limit': 'Word count limitation',
 'use_of_placeholders_for_pii': 'Absence of personal identifiable information (PII)',
 'test_absence_of_personal_information': 'Presence of targeted keys'}

In [49]:
# Initialize the AssertionScorer with the filtered assertions.
# The judge model comes from the notebook-level MODEL constant so that it stays
# in sync with the rest of the notebook instead of repeating the model name here.
final_judge = AssertionScorer(
    name="final_judge",
    criterion_assertion_map=filtered_criterion_assertion_map,
    llm_model=MODEL,
    prompt_template="""
Task Description:
{task_description}

Evaluate the following output based on the given task, input, and assertion:

Input:
{input_data}

Output:
{model_output}

Assertion:
{assertion_text}

Consider the task description and input when evaluating the output against the assertion.
Respond with either 'PASS' if the output meets the assertion criteria in the context of the task and input, or 'FAIL' if it does not.
""",
    system_prompt="You are an AI assistant evaluating the quality of text outputs based on given tasks, inputs, and assertions."
)


# Score the annotated examples with the final judge as the only scorer
evaluation = weave.Evaluation(
    scorers=[final_judge],
    dataset=annotation_examples,
)


# predict_passthrough is used as the model under evaluation; the judge scores its output
final_judge_results = asyncio.run(evaluation.evaluate(predict_passthrough))

In [50]:
# Display the aggregated evaluation results produced by the final judge
final_judge_results

{'final_judge': {'Conciseness and Word Limit': {'test_information_completeness': {'score': {'mean': 0.6666666666666666}}},
  'Privacy and PII Omission': {'llm_response_does_not_contain_pii': {'score': {'mean': 1.0}}},
  'Inclusion of Key Components': {'validate_completeness': {'score': {'mean': 1.0}}},
  'Completeness and presence of key medical components': {'evaluate_key_medical_components': {'score': {'mean': 1.0}}},
  'Absence of personal identifiable information (PII)': {'use_of_placeholders_for_pii': {'score': {'mean': 1.0}}},
  'Adherence to formatting and privacy standards': {'test_no_inclusion_of_pii': {'score': {'mean': 1.0}}},
  'Conciseness and relevance': {'test_medical_relevance_check': {'score': {'mean': 1.0}}},
  'Word count limitation': {'evaluate_word_count_limit': {'score': {'mean': 1.0}}},
  'Presence of targeted keys': {'test_absence_of_personal_information': {'score': {'mean': 1.0}}}},
 'model_latency': {'mean': 0.28713266054789227}}

In [51]:
# Publish the configured scorer to the Weave project under the name "final_judge"
weave.publish(final_judge, name="final_judge")

ðŸ“¦ Published to https://wandb.ai/a-sh0ts/evalgen_test_914532/weave/objects/final_judge/versions/wGi4q1QkH7YDldsGIwnw0XP5BZkhdjpXlaUtN5UwXV4


ObjectRef(entity='a-sh0ts', project='evalgen_test_914532', name='final_judge', digest='wGi4q1QkH7YDldsGIwnw0XP5BZkhdjpXlaUtN5UwXV4', extra=())