In [1]:
import openai
import json
from typing import List, Tuple, Dict, Optional
from collections import defaultdict
import instructor
import weave
from set_env import set_env
from pydantic import BaseModel, Field
from typing import List, Literal, Union
import pandas as pd

In [2]:
# Register each API key this notebook needs through set_env, in order.
for secret_name in ("OPENAI_API_KEY", "WANDB_API_KEY"):
    set_env(secret_name)

        Unable to set WANDB_API_KEY=WANDB_API_KEY,
        not in colab or Secrets not set, not kaggle
        or Secrets not set, no .env/dotenv/env file
        in the current working dir or parent dirs.[0m


loading envfile='/Users/anishshah/Documents/Manual Library/GitHub(1)/improve-evals/.env' with dotenv_values(envfile)


In [3]:
# Wrap the OpenAI client with instructor so that chat-completion calls accept a
# `response_model` argument (used by the generation functions in this notebook).
client = instructor.from_openai(openai.OpenAI())

In [4]:
# One annotated example, laid out as:
#   (input, output, annotation, note,
#    human_description_for_task_or_judge, human_description_for_metric_details)
# `annotation` is 1 or 0; format_data renders 1 as "Correct" and anything else as "Incorrect".
# NOTE(review): the sample `data` list in this notebook uses 4-element tuples
# (the two optional descriptions are omitted) — confirm which shape is intended.
DataPoint = Tuple[dict, dict, Literal[0, 1], str, Optional[str], Optional[str]]

In [5]:
# Model name passed to the chat-completion calls in this notebook.
MODEL = "gpt-4o-2024-08-06"

In [None]:
# Prompt template for the medical-record extraction task.
# `{transcript}` is a literal placeholder (this is a plain string, not an f-string);
# presumably it is filled in later with str.format — TODO confirm.
medical_task = """
You are extracting insights from some medical records.
The records contain a medical note and a
dialogue between a doctor and a patient. You need
to extract values for the following: Chief
complaint, History of present illness, Physical
examination, symptoms experienced by the patient,
New medications prescribed or changed, including
dosages (N/A if not provided), and Follow-up
instructions (N/A if not provided). Your answer
should not include any personal identifiable
information (PII) such as name, age, gender, or
ID. Use "the patient" instead of their name, for
example. Return your answer as a bullet list,
where each bullet is formatted like •chief
complaint: xx. If there is no value for the key,
the value should be N/A. Keep your response
around 150 words (you may have to summarize some
extracted values to stay within the word limit).
{transcript}
"""
# Human-written summary of what the medical metrics check and how each check is implemented.
medical_metric_details = """
word count, presence of the six targeted keys, and absence of PII, with the first two implemented via code- based assertions and the last via an LLM evaluator
"""
# CSV location passed to load_medical_data.
medical_dataset_url = "https://raw.githubusercontent.com/wyim/aci-bench/main/data/challenge_data/train.csv"

def load_medical_data(url: str) -> pd.DataFrame:
    """Read the dataset CSV at `url` into a DataFrame.

    The frame is returned exactly as parsed by `pd.read_csv`; no conversion to
    the notebook's DataPoint tuples happens here, so the return type is a
    DataFrame rather than a list of datapoints.

    Args:
        url: Path or URL of the CSV file to read.

    Returns:
        The parsed CSV as a pandas DataFrame.
    """
    return pd.read_csv(url)

In [None]:
# Prompt template for the e-commerce product-description task.
# `{document}` is a literal placeholder (this is a plain string, not an f-string);
# presumably it is filled in later with str.format — TODO confirm.
product_task = """
You are an expert copywriter. You need to write an e-
commerce product description based on the product
details and customer reviews. Your description
should be SEO-optimized. It should use an active
voice and include the product's features,
benefits, unique selling points without
overpromising, and a call to action for the buyer
• Benefits describe how product features will
work for the buyer, addressing exactly how the
product will improve their lives. Clearly
distinguish between features (e.g., lightweight,
USB-chargeable) and benefits (e.g., convenience,
nutritious drinks on-the-go). Don't mention
weaknesses of the product or use generic or
repetitive language. Don't make up review text or
quotes. Don't include any links. Don't cite the
reviews too heavily. Divide your description into
readable chunks divided by relevant subheadings.
Keep your description around 200 words, no more
than 300, in Markdown format.
{document}
"""
# Human-written summary of what the product metrics check and which of them needs an LLM.
product_metric_details = """
absence of negative reviews, absence of links, adherence to markdown format, and word count limitation, with only the first criterion requiring LLM implementation
"""
# No dataset location has been filled in for the product task yet.
product_dataset_url = ""

In [None]:
# Select which task's prompt, metric notes and dataset URL the rest of the
# notebook works with. An unrecognised TEST_TASK leaves all three set to None.
TEST_TASK = "medical"
if TEST_TASK == "medical":
    task = medical_task
    metric_details = medical_metric_details
    # Mirrors the other branches, which all assign `dataset_url`.
    dataset_url = medical_dataset_url
elif TEST_TASK == "product":
    task = product_task
    metric_details = product_metric_details
    dataset_url = product_dataset_url
else:
    task = None
    metric_details = None
    dataset_url = None

In [None]:
# data = ML use cases from call transcripts

In [16]:
# Hand-annotated sample set. Each entry is (input, output, annotation, note);
# format_data renders annotation 1 as "Correct" and 0 as "Incorrect".
data = [
    (
        {"text": "Summarize the impact of climate change on polar bears."},
        {"text": "Climate change is reducing sea ice, which polar bears rely on for hunting seals."},
        1,
        "Accurate and relevant.",
    ),
    (
        {"text": "Explain the process of photosynthesis."},
        {"text": "Photosynthesis is the process by which plants use sunlight to synthesize foods from carbon dioxide and water."},
        1,
        "Correct and detailed.",
    ),
    (
        {"text": "What are the main causes of the American Civil War?"},
        {"text": "The main causes were slavery, states' rights, and economic differences."},
        1,
        "Concise and accurate.",
    ),
    (
        {"text": "Describe the symptoms of COVID-19."},
        {"text": "COVID-19 is caused by a virus that originated in bats."},
        0,
        "Irrelevant and incorrect.",
    ),
    (
        {"text": "What is the significance of the Magna Carta?"},
        {"text": "The Magna Carta was a document that limited the power of the king and established certain legal rights."},
        1,
        "Historically accurate and relevant.",
    ),
]

In [42]:
def format_data(data: "List[DataPoint]") -> List[str]:
    """Render each annotated datapoint as a numbered, human-readable block.

    Args:
        data: Annotated examples. Only the first four fields of each tuple are
            read: the input and output dicts (each must contain a "text" key),
            the annotation, and the free-text note.

    Returns:
        One formatted string per datapoint, in input order and numbered from 1.
        An annotation equal to 1 is shown as "Correct", anything else as
        "Incorrect". The result is a list, not a single string; join the
        entries to build prompt text.
    """
    return [
        f"{position}. Input: {datapoint[0]['text']}\n"
        f"   Output: {datapoint[1]['text']}\n"
        f"   Annotation: {'Correct' if datapoint[2] == 1 else 'Incorrect'}\n"
        f"   Note: {datapoint[3]}"
        for position, datapoint in enumerate(data, start=1)
    ]

In [44]:
# Keep the key names (Input / Output / Annotation / Note) in the formatted string.
formatted_data = format_data(data)
# Separate the per-datapoint blocks with a blank line.
formatted_data_string = "\n\n".join(formatted_data)
print(formatted_data_string)

1. Input: Summarize the impact of climate change on polar bears.
   Output: Climate change is reducing sea ice, which polar bears rely on for hunting seals.
   Annotation: Correct
   Note: Accurate and relevant.

2. Input: Explain the process of photosynthesis.
   Output: Photosynthesis is the process by which plants use sunlight to synthesize foods from carbon dioxide and water.
   Annotation: Correct
   Note: Correct and detailed.

3. Input: What are the main causes of the American Civil War?
   Output: The main causes were slavery, states' rights, and economic differences.
   Annotation: Correct
   Note: Concise and accurate.

4. Input: Describe the symptoms of COVID-19.
   Output: COVID-19 is caused by a virus that originated in bats.
   Annotation: Incorrect
   Note: Irrelevant and incorrect.

5. Input: What is the significance of the Magna Carta?
   Output: The Magna Carta was a document that limited the power of the king and established certain legal rights.
   Annotation: Corre

In general, use refinement instead of dumping all the data at once.

In [None]:
def get_task_description(formatted_data: List[str]) -> str:
    """Placeholder for deriving a task description from the formatted datapoints.

    Planned approach (not implemented yet):
      1. Read each attribute of the datapoints individually (inputs, then outputs).
      2. Combine both sets of learnings to determine the underlying intent of the task.
      3. Present that intent in a form that is usable by an LLM.

    Currently does nothing and returns None.
    """
    return None

In [None]:
def combine_human_and_llm_descriptions(human_description: str, llm_description: str) -> str:
    """Placeholder for merging the human and LLM descriptions into one.

    Planned approach (not implemented yet):
      - Use the LLM description as the primary description.
      - Use the human description to fill in any gaps or provide additional context.

    Currently does nothing and returns None.
    """
    return None

In [31]:
# Structured response for generate_criteria: a list of 3 to 5 criterion strings.
# (Kept as a `#` comment rather than a docstring so the schema sent to the model is unchanged.)
class EvaluationCriteria(BaseModel):
    # `min_length` / `max_length` are the Pydantic v2 names for list-size bounds;
    # the former `min_items` / `max_items` keywords are deprecated there.
    criteria: List[str] = Field(
        ...,
        min_length=3,
        max_length=5,
        description="List of 3-5 evaluation criteria generated from the annotated data"
    )

def generate_criteria(formatted_data_string: str) -> List[str]:
    """Ask the model for 3-5 evaluation criteria based on the annotated data.

    Args:
        formatted_data_string: The annotated examples as prompt-ready text; it
            is interpolated verbatim into the prompt.

    Returns:
        The `criteria` list of the EvaluationCriteria response.
    """
    prompt = f"""
Analyze the following annotated data and generate 3-5 evaluation criteria:

{formatted_data_string}

Your task is to generate evaluation criteria that can be used to assess the quality of outputs for this task. When generating criteria, consider the following guidelines from recent research on LLM evaluation:

1. Focus on general aspects of quality that can be used across multiple outputs
2. Consider criteria that address potential misalignment between LLM outputs and human preferences
3. Include criteria that can be evaluated both by code and by LLM-based evaluators
4. Think about criteria that might reveal hallucinations, instruction-following, or other common LLM issues
5. Generate criteria that could help in debugging or improving the LLM pipeline

Provide each criterion as a concise statement, followed by a brief explanation of why it's important and how it might be evaluated (e.g., via code, LLM evaluator, or human judgment).

Return the criteria in this format:
[Criterion]: [Brief explanation and evaluation method]
[Criterion]: [Brief explanation and evaluation method]
[Criterion]: [Brief explanation and evaluation method]

Aim for a mix of straightforward, code-evaluable criteria and more nuanced criteria that might require LLM or human evaluation.
"""
    # Single user turn; the reply is parsed into EvaluationCriteria via response_model.
    request_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        response_model=EvaluationCriteria,
        messages=request_messages,
        model=MODEL,
        max_tokens=300,
    )
    return completion.criteria

In [32]:
# Generate criteria from the joined prompt text. Passing the `formatted_data`
# list instead would interpolate its Python repr (brackets, quotes, escaped
# newlines) into the prompt.
criteria = generate_criteria(formatted_data_string)
print("Generated criteria:")
for i, criterion in enumerate(criteria, 1):
    print(f"{i}. {criterion}")

Generated criteria:
1. Relevance: The output should align closely with the input question or prompt, providing information that is directly related. This can be evaluated using keyword matching algorithms or manual human judgment to ensure the response stays on topic.
2. Accuracy: The information provided in the output must be factually correct and up-to-date. This can be evaluated by cross-referencing with reliable databases or sources, potentially using an LLM evaluator to assess factual consistency.
3. Completeness: The response should fully address all aspects of the input question, not omitting any key components. This could be checked by an LLM for thoroughness or by a rubric used by human evaluators.
4. Clarity: The output should be easy to understand and communicated in a clear manner, avoiding unnecessary complexity or ambiguity. This might be evaluated by language models for readability or by human judges for comprehensibility.
5. Correctness (for factual prompts): Ensure the

In [38]:
# Candidate assertion that is meant to be evaluated with Python code.
class PythonAssertion(BaseModel):
    # Assertion body; the generation prompt asks for Python code that uses `output`.
    text: str
    # Literal tag that distinguishes this variant from LLMAssertion.
    evaluation_type: Literal["python"]

# Candidate assertion that is meant to be evaluated by an LLM.
class LLMAssertion(BaseModel):
    # Assertion body; the generation prompt asks for a prompt that yields "PASS" or "FAIL".
    text: str
    # Literal tag that distinguishes this variant from PythonAssertion.
    evaluation_type: Literal["llm"]

# Structured response for create_candidate_assertions: 1 to 3 assertions for one criterion.
# (Kept as a `#` comment rather than a docstring so the schema sent to the model is unchanged.)
class CriterionAssertions(BaseModel):
    # `min_length` / `max_length` are the Pydantic v2 names for list-size bounds;
    # the former `min_items` / `max_items` keywords are deprecated there.
    assertions: List[Union[PythonAssertion, LLMAssertion]] = Field(
        ...,
        min_length=1,
        max_length=3,
        description="Generate 1-3 specific, testable assertions that can be used to evaluate LLM outputs based on the given criterion"
    )

def create_candidate_assertions(formatted_data_string: str, criterion: str) -> List[Union[PythonAssertion, LLMAssertion]]:
    """Ask the model for 1-3 testable assertions for a single criterion.

    Args:
        formatted_data_string: The annotated examples as prompt-ready text; it
            is interpolated verbatim into the prompt.
        criterion: The evaluation criterion the assertions should cover.

    Returns:
        The `assertions` list of the CriterionAssertions response (the list
        itself, not the wrapping model), each item tagged "python" or "llm"
        through its `evaluation_type`.
    """
    prompt = f"""
Given the following evaluation criterion and annotated data, generate 1-3 specific, testable assertions:

Criterion: {criterion}

Annotated data: {formatted_data_string}

Your task is to create assertions that can be used to evaluate LLM outputs based on this criterion. Follow these guidelines:

1. Make each assertion clear, concise, and directly related to the criterion
2. Specify whether each assertion should be evaluated using Python code or an LLM
3. For Python assertions:
   - Provide valid Python code that can be executed to evaluate the assertion
   - Use 'output' as the variable name for the LLM output being evaluated
   - Return True if the assertion passes, False otherwise
4. For LLM assertions:
   - Provide a clear, detailed prompt for an LLM to evaluate the assertion
   - The prompt should guide the LLM to return "PASS" or "FAIL" based on the evaluation
5. Include a mix of positive and negative assertions where appropriate
6. Consider edge cases and potential failure modes for the criterion
7. Aim for assertions that could be applied across multiple types of outputs

Ensure that your assertions are directly evaluable and avoid vague or subjective language. Focus on creating assertions that align with human preferences and can be used to validate the quality of LLM-generated evaluations.

Format your response as a JSON object with the following structure:
{{
  "assertions": [
    {{
      "text": "Assertion text or code",
      "evaluation_type": "python" or "llm"
    }},
    ...
  ]
}}
"""
    response = client.chat.completions.create(
        model=MODEL,
        max_tokens=1000,
        messages=[{"role": "user", "content": prompt}],
        response_model=CriterionAssertions
    )
    return response.assertions

In [39]:
# Create candidate assertions for every criterion and print them.
# The prompt is built from the joined `formatted_data_string`; passing the
# `formatted_data` list would interpolate its Python repr instead.
# NOTE(review): `assertions` is reassigned on every iteration, so after the
# loop it only holds the candidates of the last criterion.
for criterion in criteria:
    assertions = create_candidate_assertions(formatted_data_string, criterion)
    print(f"\nCriterion: {criterion}")
    print("Candidate assertions:")
    for assertion in assertions:
        print(f"  Type: {assertion.evaluation_type.upper()}")
        # Only the heading differs between the two kinds; the body is indented the same way.
        print("  Code:" if assertion.evaluation_type == 'python' else "  Prompt:")
        print("    " + assertion.text.replace('\n', '\n    '))
    print("-" * 80)


Criterion: Relevance: The output should align closely with the input question or prompt, providing information that is directly related. This can be evaluated using keyword matching algorithms or manual human judgment to ensure the response stays on topic.
Candidate assertions:
  Type: PYTHON
  Code:
    Check if the output contains key phrases or terms that directly relate to the input question or prompt. For example, for the input 'Describe the symptoms of COVID-19', check if the terms like 'symptoms', 'fever', 'cough' are present in the output.
  Type: LLM
  Prompt:
    Prompt LLM: 'Given the input and output pair, evaluate whether the output provides relevant and directly related information to the input. Consider if the output by itself adequately addresses the main topic requested in the input prompt.'
  Type: PYTHON
  Code:
    Ensure the output stays on topic by checking if more than 60% of the output sentences contain thematic words found in the input. Return True if this con

In [None]:
def evaluate_python_assertion(assertion, datum):
    """Placeholder for evaluating a Python assertion against one datum.

    Not implemented yet: both arguments are ignored and None is returned.
    """
    return None

In [None]:
def evaluate_llm_assertion(assertion, datum):
    """Placeholder for evaluating an LLM assertion against one datum.

    Not implemented yet: both arguments are ignored and None is returned.
    """
    return None

In [None]:
def evaluate_assertions(datum, assertion):
    """Dispatch one assertion to the evaluator matching its `evaluation_type`.

    Args:
        datum: The single example to evaluate; passed through unchanged.
        assertion: Object with `evaluation_type` and `text` attributes. Only
            `text` is forwarded to the evaluator.

    Returns:
        Whatever the chosen evaluator returns: evaluate_python_assertion when
        `evaluation_type` is 'python', evaluate_llm_assertion otherwise.
    """
    if assertion.evaluation_type == 'python':
        return evaluate_python_assertion(assertion.text, datum)
    return evaluate_llm_assertion(assertion.text, datum)

In [None]:
# Evaluate assertions: evaluate_assertions takes one datum and one assertion,
# so run it for every (assertion, datum) pair and collect one result list per
# assertion, keyed by the assertion text. This is the Dict[str, List[int]]
# shape that calculate_metrics expects.
# NOTE(review): `assertions` only holds the candidates of the last criterion
# handled by the candidate-generation loop — confirm that is intended.
assertion_results = {
    assertion.text: [evaluate_assertions(datum, assertion) for datum in data]
    for assertion in assertions
}

1. **Selectivity**:
   ```python
   selectivity = passes / total_outputs
   ```
   Selectivity measures how often an assertion passes LLM outputs. A lower selectivity means the assertion is more "picky" or strict.

2. **Coverage**:
   ```python
   coverage = fails_on_bad / total_bad if total_bad > 0 else 0
   ```
   Coverage measures how well our assertions catch the outputs that humans marked as bad. A higher coverage means we're better at identifying problematic outputs.

3. **False Failure Rate (FFR)**:
   ```python
   ffr = fails_on_good / total_good if total_good > 0 else 0
   ```
   FFR shows how often our assertions incorrectly fail outputs that humans thought were good. A lower FFR is better, as it means we're not being overly strict.

4. **Alignment**:
   ```python
   alignment = 2 * (coverage * (1 - ffr)) / (coverage + (1 - ffr)) if (coverage + (1 - ffr)) > 0 else 0
   ```
   Alignment is the harmonic mean of coverage and (1 − FFR), combining both into a single score. It represents how well our automated evaluations match human judgments overall.

These metrics help us refine our assertion set over time, aiming to catch more bad outputs while avoiding false alarms on good ones.

We need to get the alignment score right.

In [None]:
def calculate_metrics(data: "List[DataPoint]", assertion_results: Dict[str, List[int]]) -> Dict[str, Dict[str, float]]:
    """Compute selectivity, coverage, FFR and alignment for every assertion.

    Args:
        data: Annotated examples. Only the annotation (third field) of each
            tuple is read, so both 4-field and 6-field datapoints work.
        assertion_results: Maps each assertion to its per-datapoint results in
            the same order as `data`; 0 is treated as a failure, and the sum of
            the list is taken as the number of passes.

    Returns:
        For each assertion, a dict with:
          - "selectivity": passes / number of datapoints
          - "coverage": share of annotation-0 datapoints the assertion failed
          - "ffr": share of non-0 datapoints where annotation is 1 and the assertion failed
          - "alignment": harmonic mean of coverage and (1 - ffr)
        Any ratio whose denominator would be zero is reported as 0.
    """
    # Index the annotation rather than unpacking, so the tuple length is irrelevant.
    annotations = [datapoint[2] for datapoint in data]
    total_outputs = len(annotations)
    total_bad = sum(1 for annotation in annotations if annotation == 0)
    total_good = total_outputs - total_bad

    metrics = {}
    for assertion, results in assertion_results.items():
        passes = sum(results)
        fails_on_bad = sum(
            1 for annotation, result in zip(annotations, results)
            if annotation == 0 and result == 0
        )
        fails_on_good = sum(
            1 for annotation, result in zip(annotations, results)
            if annotation == 1 and result == 0
        )

        selectivity = passes / total_outputs if total_outputs > 0 else 0
        coverage = fails_on_bad / total_bad if total_bad > 0 else 0
        ffr = fails_on_good / total_good if total_good > 0 else 0
        alignment = 2 * (coverage * (1 - ffr)) / (coverage + (1 - ffr)) if (coverage + (1 - ffr)) > 0 else 0

        metrics[assertion] = {
            "selectivity": selectivity,
            "coverage": coverage,
            "ffr": ffr,
            "alignment": alignment
        }

    return metrics

In [None]:
# Calculate per-assertion metrics and print them as indented JSON.
metrics = calculate_metrics(data, assertion_results)
print("Assertion metrics:", json.dumps(metrics, indent=2))

In [None]:
# Rethink this, especially for the Python case, because a unittest may not always be a pure single-assertion test (it may be a test suite).
def select_best_assertions(assertions: Dict[str, List[str]], metrics: Dict[str, Dict[str, float]]) -> Dict[str, str]:
    """Pick, for every criterion, the candidate assertion with the highest alignment.

    Args:
        assertions: Maps each criterion to its candidate assertions.
        metrics: Maps each assertion to its metric dict; only the 'alignment'
            entry is read.

    Returns:
        A dict mapping each criterion to its best-scoring assertion. On a tie
        the earliest candidate in the list wins.
    """
    return {
        criterion: max(candidates, key=lambda candidate: metrics[candidate]['alignment'])
        for criterion, candidates in assertions.items()
    }

In [None]:
# Select the best assertion per criterion and print the selection as indented JSON.
# NOTE(review): select_best_assertions calls `.items()` on its first argument,
# but `assertions` as assigned in the candidate-generation loop is the list
# returned by create_candidate_assertions — confirm a criterion -> candidates
# mapping is built before this cell runs.
best_assertions = select_best_assertions(assertions, metrics)
print("Best assertions:", json.dumps(best_assertions, indent=2))

In [None]:
def calculate_overall_metrics(data: "List[DataPoint]", best_assertions: Dict[str, str], assertion_results: Dict[str, List[int]]) -> Dict[str, float]:
    """Compute coverage, FFR and alignment for the selected assertions as a set.

    A datapoint counts as failed when at least one selected assertion has a
    result of 0 for it.

    Args:
        data: Annotated examples. Only the annotation (third field) of each
            tuple is read, so both 4-field and 6-field datapoints work.
        best_assertions: Maps each criterion to its selected assertion; only
            the values are used.
        assertion_results: Maps each assertion to its per-datapoint results in
            the same order as `data`; 0 is treated as a failure.

    Returns:
        A dict with "coverage" (share of annotation-0 datapoints that failed),
        "ffr" (failed annotation-1 datapoints over all non-0 datapoints) and
        "alignment" (harmonic mean of coverage and 1 - ffr). Any ratio whose
        denominator would be zero is reported as 0.
    """
    # Index the annotation rather than unpacking, so the tuple length is irrelevant.
    annotations = [datapoint[2] for datapoint in data]
    total_outputs = len(annotations)
    total_bad = sum(1 for annotation in annotations if annotation == 0)
    total_good = total_outputs - total_bad

    selected = list(best_assertions.values())

    def fails_any(index: int) -> bool:
        # True as soon as one selected assertion failed on this datapoint.
        return any(assertion_results[assertion][index] == 0 for assertion in selected)

    fails_on_bad = sum(
        1 for index, annotation in enumerate(annotations)
        if annotation == 0 and fails_any(index)
    )
    fails_on_good = sum(
        1 for index, annotation in enumerate(annotations)
        if annotation == 1 and fails_any(index)
    )

    coverage = fails_on_bad / total_bad if total_bad > 0 else 0
    ffr = fails_on_good / total_good if total_good > 0 else 0
    alignment = 2 * (coverage * (1 - ffr)) / (coverage + (1 - ffr)) if (coverage + (1 - ffr)) > 0 else 0

    return {
        "coverage": coverage,
        "ffr": ffr,
        "alignment": alignment
    }

In [None]:
# Calculate metrics for the selected assertions taken together and print them as indented JSON.
overall_metrics = calculate_overall_metrics(data, best_assertions, assertion_results)
print("Overall metrics:", json.dumps(overall_metrics, indent=2))

In [None]:
# Generate the final report: the selected assertion per criterion, the metric
# dict of each selected assertion, and the combined metrics.
report = {
    "final_assertions": best_assertions,
    # Restrict the per-assertion metrics to the assertions that were selected.
    "assertion_metrics": {assertion: metrics[assertion] for assertion in best_assertions.values()},
    "overall_metrics": overall_metrics
}

print("\nFinal Report:")
print(json.dumps(report, indent=2))