In [1]:
import weave
from set_env import set_env
import nest_asyncio
from evalforge.evalforge import EvalForge, convert_datapoint_to_example
from evalforge.combined_scorer import predict_passthrough
import asyncio
from evalforge.evalforge_alignment import calculate_alignment_metrics, format_alignment_metrics

In [2]:
# Run set_env for each API key name this notebook relies on, then confirm.
for env_key in ("OPENAI_API_KEY", "WANDB_API_KEY"):
    set_env(env_key)
print("Env set")

Env set


In [3]:
# Apply nest_asyncio only when this code is actually running inside an
# IPython/Jupyter shell.
try:
    import IPython

    # Being able to import IPython only proves the package is installed;
    # get_ipython() returns None unless an interactive shell is running.
    in_jupyter = IPython.get_ipython() is not None
except ImportError:
    in_jupyter = False

if in_jupyter:
    nest_asyncio.apply()

In [4]:
import random

# Each run initialises Weave with a project name carrying a random numeric suffix.
run_suffix = random.randint(0, 1000000)
weave.init(f"evalgen_test_{run_suffix}")

Logged in as Weights & Biases user: a-sh0ts.
View Weave data at https://wandb.ai/a-sh0ts/evalgen_test_379476/weave


<weave.trace.weave_client.WeaveClient at 0x147cd9250>

In [5]:
# Select the annotated datapoints that are handed to the forge.
TEST_TASK = "medical"
SAMPLE_SIZE = 10  # upper bound on the number of datapoints drawn from the full dataset

if TEST_TASK == "medical":
    all_data = weave.ref("weave:///a-sh0ts/medical_data_results/object/medical_data_annotations:7GcCtWgyPTWtKY48Z7v5VxwCNZXTTTpSMbmubAbyHT8").get()
    # Clamp the sample size: random.sample raises ValueError when asked for
    # more items than the population holds.
    # NOTE(review): no random seed is set in the visible cells, so the sampled
    # subset differs from run to run.
    data = random.sample(all_data, min(SAMPLE_SIZE, len(all_data)))
elif TEST_TASK == "product":
    # No dataset is wired up for this task. Fail here with a clear message
    # instead of leaving `data` undefined and hitting a NameError in a later cell.
    raise NotImplementedError('TEST_TASK "product" has no dataset configured yet')
else:
    # Small inline fallback dataset.
    # Presumably each tuple is (input, output, 0/1 label, annotator note) — TODO confirm.
    all_data = data = [
        ({"text": "Summarize the impact of climate change on polar bears."}, {"text": "Climate change is reducing sea ice, which polar bears rely on for hunting seals."}, 1, "Accurate and relevant."),
        ({"text": "Explain the process of photosynthesis."}, {"text": "Photosynthesis is the process by which plants use sunlight to synthesize foods from carbon dioxide and water."}, 1, "Correct and detailed."),
        ({"text": "What are the main causes of the American Civil War?"}, {"text": "The main causes were slavery, states' rights, and economic differences."}, 1, "Concise and accurate."),
        ({"text": "Describe the symptoms of COVID-19."}, {"text": "COVID-19 is caused by a virus that originated in bats."}, 0, "Irrelevant and incorrect."),
        ({"text": "What is the significance of the Magna Carta?"}, {"text": "The Magna Carta was a document that limited the power of the king and established certain legal rights."}, 1, "Historically accurate and relevant.")
    ]

In [6]:
# Build an EvalForge instance using its default constructor arguments.
forger = EvalForge()

# Run the forge over the selected datapoints. `results` is indexed by key in the
# next cell (e.g. "forged_judges", "raw_judges", "annotation_examples").
results = await forger.predict(data)

🍩 https://wandb.ai/a-sh0ts/evalgen_test_379476/r/call/01921103-380e-7920-9c5d-07cb829eb840


In [7]:
# Unpack the forged-judge section of the prediction results.
forged_section = results["forged_judges"]
forged_judge = forged_section["judge"]
forged_judges_metrics = forged_section["alignment_metrics"]
forged_judges_assertion_results = forged_section["assertion_results"]
forged_judges_summary = forged_section["summary"]

# Unpack the raw-judge section, which uses the same four keys.
raw_section = results["raw_judges"]
raw_judges = raw_section["judge"]
raw_judges_metrics = raw_section["alignment_metrics"]
raw_judges_assertion_results = raw_section["assertion_results"]
raw_judges_summary = raw_section["summary"]

# Top-level entries of the results mapping.
annotation_examples = results["annotation_examples"]
finalized_task_description = results["finalized_task_description"]


In [8]:
# Print the forged-judge summary followed by the raw-judge summary, one per line.
print(forged_judges_summary, raw_judges_summary, sep="\n")

| Criterion                               | Assertion                               | Type      | Alignment |
|-----------------------------------------|-----------------------------------------|-----------|-----------|
| Adherence to format and completeness     | **OVERALL**                             |           | 0.54     |
|                                         | evaluate_structure_and_completeness      | llm       | 0.29     |
|                                         | test_adheres_to_bullet_point_format      | code      | 0.00     |
| Exclusion of personal identifiable infor | **OVERALL**                             |           | 0.71     |
|                                         | evaluate_exclusion_of_pii                | llm       | 0.71     |
| Accuracy and Relevance of Transformed No | **OVERALL**                             |           | 0.54     |
|                                         | relevance_of_transformed_notes           | llm       | 0.29     |
|         

In [15]:
# Display the task description taken from results["finalized_task_description"].
finalized_task_description

"The task requires transforming detailed medical dialogues between a doctor and a patient into structured medical notes in a standardized bullet-point list format. The dialogues provide comprehensive information on the patient's medical history, current symptoms, examination findings, familial history, allergies, medications, and future healthcare plans. The output should be a concise, structured list containing the mandatory fields: Chief complaint, History of present illness, Physical examination findings, Symptoms, New medications with dosages, and Follow-up instructions. Missing data should be marked as 'N/A'. The process demands the accurate extraction and summary of relevant medical information while strictly omitting any personally identifiable information (PII) to ensure privacy, substituting terms like 'the patient' for specific identifiers. Notes should maintain consistent punctuation and formatting, with new medications distinctly separated from existing ones. Evaluation foc

In [9]:
@weave.op()
async def run_assertions_and_calculate_metrics(forger, judge, data, task_description):
    """Run a judge's assertions over ``data`` and return the formatted alignment metrics.

    Args:
        forger: Object exposing an awaitable ``run_assertions(judge, examples)``.
        judge: Judge passed through to ``forger.run_assertions``.
        data: Datapoints handed to ``convert_datapoint_to_example``.
        task_description: Task description handed to ``convert_datapoint_to_example``.

    Returns:
        The value produced by ``format_alignment_metrics`` for the metrics that
        ``calculate_alignment_metrics`` derives from the assertion results.
    """
    examples = convert_datapoint_to_example(task_description, data)
    assertion_results = await forger.run_assertions(judge, examples)
    return format_alignment_metrics(calculate_alignment_metrics(assertion_results))

In [10]:
print(await run_assertions_and_calculate_metrics(forger, forged_judge, data, finalized_task_description))

🍩 https://wandb.ai/a-sh0ts/evalgen_test_379476/r/call/01921104-8ce1-7962-a215-02563ec93d50
| Criterion                               | Assertion                               | Type      | Alignment |
|-----------------------------------------|-----------------------------------------|-----------|-----------|
| Exclusion of personal identifiable infor | **OVERALL**                             |           | 0.50     |
|                                         | evaluate_exclusion_of_pii                | llm       | 0.50     |
| Adherence to format and completeness     | **OVERALL**                             |           | 0.50     |
|                                         | evaluate_structure_and_completeness      | llm       | 0.00     |
|                                         | test_adheres_to_bullet_point_format      | code      | 0.00     |
| Accuracy and Relevance of Transformed No | **OVERALL**                             |           | 0.54     |
|                            

In [11]:
# Disabled: evaluate the forged judge on every datapoint in `all_data` with weave.Evaluation.
# all_annotation_examples = convert_datapoint_to_example(finalized_task_description, all_data)
# evaluation = weave.Evaluation(
#     scorers=[forged_judge],
#     dataset=all_annotation_examples,
# )


# final_judge_results = asyncio.run(evaluation.evaluate(predict_passthrough))

In [12]:
# Publish the forged judge to Weave under the object name "final_judge".
# The returned reference is the cell's displayed value.
weave.publish(forged_judge, name="final_judge")

📦 Published to https://wandb.ai/a-sh0ts/evalgen_test_379476/weave/objects/final_judge/versions/SZSv6IC9FAdCL6wY6ZmRVoQcpWEyd917Y7KubRFNHWA


ObjectRef(entity='a-sh0ts', project='evalgen_test_379476', name='final_judge', digest='SZSv6IC9FAdCL6wY6ZmRVoQcpWEyd917Y7KubRFNHWA', extra=())

In [13]:
# Export the forged judge via its own export() method.
# NOTE(review): the export destination/format is defined in evalforge, not here — confirm.
forged_judge.export()

In [14]:
# Finish the Weave session that weave.init() opened earlier in the notebook.
weave.finish()