In [1]:
import weave
from set_env import set_env
import nest_asyncio
from evalforge.evalforge import EvalForge, convert_datapoint_to_example
from evalforge.combined_scorer import predict_passthrough
import asyncio
from evalforge.evalforge_alignment import calculate_alignment_metrics, format_alignment_metrics

In [2]:
# Load each credential this notebook needs through the project's `set_env` helper.
for required_key in ("OPENAI_API_KEY", "WANDB_API_KEY"):
    set_env(required_key)
print("Env set")

Env set


In [3]:
# Decide whether to patch asyncio for nested event loops.
# NOTE(review): a successful import only shows that IPython is installed, not
# that this code is running inside a live kernel — confirm that is acceptable.
try:
    import IPython
except ImportError:
    in_jupyter = False
else:
    in_jupyter = True

if in_jupyter:
    # Allow event-loop re-entry so asyncio calls can run from notebook cells.
    nest_asyncio.apply()

In [4]:
import random

# A random numeric suffix gives each run its own Weave project name.
project_suffix = random.randint(0, 1000000)
weave.init(f"evalgen_test_{project_suffix}")

Logged in as Weights & Biases user: a-sh0ts.
View Weave data at https://wandb.ai/a-sh0ts/evalgen_test_117112/weave


<weave.weave_client.WeaveClient at 0x149b88050>

In [5]:
# Selects the dataset for this run. "medical" loads published annotations from
# Weave; any value other than "medical"/"product" falls back to the small inline
# sample at the bottom of this cell.
TEST_TASK = "medical"
# How many datapoints to draw from the full medical set for `data`.
SAMPLE_SIZE = 10

if TEST_TASK == "medical":
    all_data = weave.ref("weave:///a-sh0ts/medical_data_results/object/medical_data_annotations:7GcCtWgyPTWtKY48Z7v5VxwCNZXTTTpSMbmubAbyHT8").get()
    # `data` is a random subset; `all_data` keeps the complete set for later cells.
    data = random.sample(all_data, SAMPLE_SIZE)
elif TEST_TASK == "product":
    # This branch used to be a bare `pass`, which left `data` and `all_data`
    # undefined (or silently stale from an earlier kernel run) and only failed
    # in a later cell. Fail here, where the cause is obvious.
    raise NotImplementedError('TEST_TASK "product" has no dataset configured yet')
else:
    # Inline fallback. Each entry is a 4-tuple of
    # (input dict, output dict, 0/1 label, free-text note).
    # NOTE(review): the label presumably marks the output as acceptable (1) or
    # not (0) — confirm against the annotation format expected downstream.
    all_data = data = [
        ({"text": "Summarize the impact of climate change on polar bears."}, {"text": "Climate change is reducing sea ice, which polar bears rely on for hunting seals."}, 1, "Accurate and relevant."),
        ({"text": "Explain the process of photosynthesis."}, {"text": "Photosynthesis is the process by which plants use sunlight to synthesize foods from carbon dioxide and water."}, 1, "Correct and detailed."),
        ({"text": "What are the main causes of the American Civil War?"}, {"text": "The main causes were slavery, states' rights, and economic differences."}, 1, "Concise and accurate."),
        ({"text": "Describe the symptoms of COVID-19."}, {"text": "COVID-19 is caused by a virus that originated in bats."}, 0, "Irrelevant and incorrect."),
        ({"text": "What is the significance of the Magna Carta?"}, {"text": "The Magna Carta was a document that limited the power of the king and established certain legal rights."}, 1, "Historically accurate and relevant.")
    ]

In [6]:
forger = EvalForge()

results = await forger.predict(data)

In [7]:
# Unpack the forge results into notebook-level names, one section at a time.
_forged_section = results["forged_judges"]
_raw_section = results["raw_judges"]

# Forged judge and its diagnostics.
forged_judge = _forged_section["judge"]
forged_judges_metrics = _forged_section["alignment_metrics"]
forged_judges_assertion_results = _forged_section["assertion_results"]
forged_judges_summary = _forged_section["summary"]

# Raw judges: same four keys as the forged section.
raw_judges = _raw_section["judge"]
raw_judges_metrics = _raw_section["alignment_metrics"]
raw_judges_assertion_results = _raw_section["assertion_results"]
raw_judges_summary = _raw_section["summary"]

# Top-level entries reused by the later evaluation cells.
annotation_examples = results["annotation_examples"]
finalized_task_description = results["finalized_task_description"]


In [8]:
# Show the forged-judge summary followed by the raw-judge summary, one per line.
print(forged_judges_summary, raw_judges_summary, sep="\n")

| Criterion                               | Assertion                               | Type      | Alignment |
|-----------------------------------------|-----------------------------------------|-----------|-----------|
| Accuracy and Completeness                | **OVERALL**                             |           | 0.57     |
|                                         | check_llm_for_completeness_and_accuracy  | llm       | 0.57     |
| Adherence to Format and Privacy Constrai | **OVERALL**                             |           | 0.50     |
|                                         | output_format_adherence_and_pii_constrai | llm       | 0.00     |
|                                         | test_output_format_and_structure         | code      | 0.00     |
| Conciseness and Clarity                  | **OVERALL**                             |           | 0.50     |
|                                         | clarity_of_bullet_points                 | llm       | 0.00     |
|         

In [9]:
@weave.op()
async def run_assertions_and_calculate_metrics(forger, judge, data, task_description):
    """Run a judge's assertions over `data` and return the formatted alignment metrics.

    Args:
        forger: Object exposing an awaitable `run_assertions(judge, examples)`.
        judge: Judge handed to `forger.run_assertions`.
        data: Datapoints converted with `convert_datapoint_to_example`.
        task_description: Task description passed to that same conversion.

    Returns:
        Whatever `format_alignment_metrics` produces for the computed metrics
        (this notebook prints it as a table, so presumably a string).
    """
    examples = convert_datapoint_to_example(task_description, data)
    assertion_results = await forger.run_assertions(judge, examples)
    return format_alignment_metrics(calculate_alignment_metrics(assertion_results))

In [10]:
print(await run_assertions_and_calculate_metrics(forger, forged_judge, data, finalized_task_description))

| Criterion                               | Assertion                               | Type      | Alignment |
|-----------------------------------------|-----------------------------------------|-----------|-----------|
| Accuracy and Completeness                | **OVERALL**                             |           | 0.57     |
|                                         | check_llm_for_completeness_and_accuracy  | llm       | 0.57     |
| Adherence to Format and Privacy Constrai | **OVERALL**                             |           | 0.50     |
|                                         | output_format_adherence_and_pii_constrai | llm       | 0.00     |
|                                         | test_output_format_and_structure         | code      | 0.00     |
| Conciseness and Clarity                  | **OVERALL**                             |           | 0.50     |
|                                         | clarity_of_bullet_points                 | llm       | 0.00     |
|         

In [11]:
all_annotation_examples = convert_datapoint_to_example(finalized_task_description, all_data)
evaluation = weave.Evaluation(
    scorers=[forged_judge],
    dataset=all_annotation_examples,
)


final_judge_results = asyncio.run(evaluation.evaluate(predict_passthrough))

In [12]:
# Publish the forged judge to Weave under a fixed object name; the call stays
# the cell's last expression so the returned reference is displayed.
final_judge_name = "final_judge"
weave.publish(forged_judge, name=final_judge_name)

📦 Published to https://wandb.ai/a-sh0ts/evalgen_test_117112/weave/objects/final_judge/versions/ERVJXhRHsv2oRW0icPbeUAi1XsOczSqjoxMwWRHITXk


ObjectRef(entity='a-sh0ts', project='evalgen_test_117112', name='final_judge', digest='ERVJXhRHsv2oRW0icPbeUAi1XsOczSqjoxMwWRHITXk', extra=())