In [1]:
import weave
from set_env import set_env
import nest_asyncio
from evalforge.evalforge import EvalForge, convert_datapoint_to_example
from evalforge.evalforge_alignment import calculate_alignment_metrics, format_alignment_metrics

In [2]:
# Make the credentials used by this notebook available, one variable at a time.
# NOTE(review): set_env is a local helper; presumably it loads/prompts for the
# named environment variable — confirm against set_env.py.
for env_key in ("OPENAI_API_KEY", "WANDB_API_KEY"):
    set_env(env_key)
print("Env set")

Env set


In [3]:
# Treat "IPython is importable" as the signal that we are running in a notebook.
try:
    import IPython
except ImportError:
    in_jupyter = False
else:
    in_jupyter = True

# Patch asyncio only in the notebook case.
# NOTE(review): presumably needed because Jupyter already runs an event loop — confirm.
if in_jupyter:
    nest_asyncio.apply()

In [4]:
import random

# A random numeric suffix gives every run its own Weave project name.
run_suffix = random.randint(0, 1000000)
weave.init(f"evalforge_test_{run_suffix}")

Logged in as Weights & Biases user: a-sh0ts.
View Weave data at https://wandb.ai/a-sh0ts/evalgen_test_514403/weave


<weave.trace.weave_client.WeaveClient at 0x15d783bc0>

In [5]:
# Which annotated dataset to use: "medical" pulls published annotations from
# Weave, "product" is not wired up yet, anything else uses the inline sample.
TEST_TASK = "medical"
# Upper bound on the number of annotated datapoints handed to EvalForge.
SAMPLE_SIZE = 10
# Set to an int to draw the same sample on every run; None keeps it random.
SAMPLE_SEED = None

if TEST_TASK == "medical":
    all_data = weave.ref("weave:///a-sh0ts/medical_data_results/object/medical_data_annotations:7GcCtWgyPTWtKY48Z7v5VxwCNZXTTTpSMbmubAbyHT8").get()
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the size of the dataset.
    data = random.Random(SAMPLE_SEED).sample(all_data, min(SAMPLE_SIZE, len(all_data)))
elif TEST_TASK == "product":
    # Fail here with a clear message instead of leaving `data` undefined and
    # hitting a NameError in a later cell.
    raise NotImplementedError(f"No dataset is configured for TEST_TASK={TEST_TASK!r} yet")
else:
    # Small inline fallback dataset.
    # NOTE(review): each tuple looks like (input, output, 0/1 annotation, annotation note) — confirm.
    all_data = data = [
        ({"text": "Summarize the impact of climate change on polar bears."}, {"text": "Climate change is reducing sea ice, which polar bears rely on for hunting seals."}, 1, "Accurate and relevant."),
        ({"text": "Explain the process of photosynthesis."}, {"text": "Photosynthesis is the process by which plants use sunlight to synthesize foods from carbon dioxide and water."}, 1, "Correct and detailed."),
        ({"text": "What are the main causes of the American Civil War?"}, {"text": "The main causes were slavery, states' rights, and economic differences."}, 1, "Concise and accurate."),
        ({"text": "Describe the symptoms of COVID-19."}, {"text": "COVID-19 is caused by a virus that originated in bats."}, 0, "Irrelevant and incorrect."),
        ({"text": "What is the significance of the Magna Carta?"}, {"text": "The Magna Carta was a document that limited the power of the king and established certain legal rights."}, 1, "Historically accurate and relevant.")
    ]

In [6]:
# Build an EvalForge instance with no constructor arguments.
forger = EvalForge()

# Run the forge on the selected datapoints. The top-level `await` relies on
# IPython/Jupyter; it is not valid in a plain Python script.
results = await forger.predict(data)

🍩 https://wandb.ai/a-sh0ts/evalgen_test_514403/r/call/01921118-16d4-70d0-86c4-6bf8e5ff172d


In [7]:
# Results for the forged judge.
forged_group = results["forged_judges"]
forged_judge = forged_group["judge"]
forged_judges_metrics = forged_group["alignment_metrics"]
forged_judges_assertion_results = forged_group["assertion_results"]
forged_judges_summary = forged_group["summary"]

# Same four fields for the raw judges.
raw_group = results["raw_judges"]
raw_judges = raw_group["judge"]
raw_judges_metrics = raw_group["alignment_metrics"]
raw_judges_assertion_results = raw_group["assertion_results"]
raw_judges_summary = raw_group["summary"]

# Top-level entries that do not belong to either judge group.
annotation_examples = results["annotation_examples"]
finalized_task_description = results["finalized_task_description"]


In [8]:
# Show the forged-judge summary followed by the raw-judge summary, one per line.
print(forged_judges_summary, raw_judges_summary, sep="\n")

| Criterion                               | Assertion                               | Type      | Alignment |
|-----------------------------------------|-----------------------------------------|-----------|-----------|
| Accuracy and Completeness                | **OVERALL**                             |           | 0.86     |
|                                         | check_accuracy_and_completeness          | llm       | 0.86     |
| Clarity and Conciseness                  | **OVERALL**                             |           | 0.00     |
|                                         | evaluate_organization_and_conciseness    | llm       | 0.00     |
| Adherence to Structure and Format        | **OVERALL**                             |           | 0.60     |
|                                         | structure_and_format_llm_check           | llm       | 0.60     |

| Criterion                               | Assertion                               | Type      | Alignment |
|--------

In [9]:
# Display the task description returned under results["finalized_task_description"].
finalized_task_description

"The task involves transforming dialogues between doctors and patients and accompanying medical notes into precise and structured medical summaries. The input consists of textual transcriptions detailing both the dialogues and comprehensive medical notes. The aim is to extract and systematically organize essential clinical data into categorized bullet points such as chief complaint, history of present illness, physical examination findings, symptoms, medication instructions, and follow-up guidelines. Summaries must exclude all personally identifiable information (PII), including names, ages, genders, ensuring confidentiality by referring to 'the patient.' Each section must be comprehensive and indicate 'N/A' if specific information is lacking while maintaining brevity within a 150-word limit. A clear distinction between newly prescribed medications and existing ones, with detailed dosage information or 'N/A' for unchanged dosages, is essential. The evaluation criteria focus on accuracy

In [10]:
@weave.op()
async def run_assertions_and_calculate_metrics(forger, judge, data, task_description):
    """Run `judge` over `data` and return the formatted alignment metrics.

    Args:
        forger: Object exposing an awaitable `run_assertions(judge, examples)`.
        judge: Judge handed through to `forger.run_assertions`.
        data: Datapoints converted with `convert_datapoint_to_example`.
        task_description: Task description passed to the conversion helper.

    Returns:
        The output of `format_alignment_metrics` for the computed metrics.
    """
    examples = convert_datapoint_to_example(task_description, data)
    assertion_results = await forger.run_assertions(judge, examples)
    return format_alignment_metrics(calculate_alignment_metrics(assertion_results))

In [11]:
# Re-run the forged judge on the sampled `data` and print the formatted metrics.
# The top-level `await` relies on IPython/Jupyter.
print(await run_assertions_and_calculate_metrics(forger, forged_judge, data, finalized_task_description))

🍩 https://wandb.ai/a-sh0ts/evalgen_test_514403/r/call/01921119-3799-7b41-899a-8eaa2c63aa2e
| Criterion                               | Assertion                               | Type      | Alignment |
|-----------------------------------------|-----------------------------------------|-----------|-----------|
| Accuracy and Completeness                | **OVERALL**                             |           | 0.79     |
|                                         | check_accuracy_and_completeness          | llm       | 0.79     |
| Adherence to Structure and Format        | **OVERALL**                             |           | 0.71     |
|                                         | structure_and_format_llm_check           | llm       | 0.71     |



In [12]:
# Disabled sketch: evaluate the forged judge over the full dataset with weave.Evaluation.
# NOTE(review): `asyncio` is not imported and `predict_passthrough` is not defined in
# the visible cells, so this cannot be re-enabled as written — confirm before uncommenting.
# all_annotation_examples = convert_datapoint_to_example(finalized_task_description, all_data)
# evaluation = weave.Evaluation(
#     scorers=[forged_judge],
#     dataset=all_annotation_examples,
# )


# final_judge_results = asyncio.run(evaluation.evaluate(predict_passthrough))

In [13]:
# Publish the forged judge to the current Weave project under the name "final_judge".
weave.publish(forged_judge, name="final_judge")

📦 Published to https://wandb.ai/a-sh0ts/evalgen_test_514403/weave/objects/final_judge/versions/N7ZhN6Al1f2oN0yXDOEx0kspbMVMkkpIjxSjKyKK4z4


ObjectRef(entity='a-sh0ts', project='evalgen_test_514403', name='final_judge', digest='N7ZhN6Al1f2oN0yXDOEx0kspbMVMkkpIjxSjKyKK4z4', extra=())

In [14]:
# Export the forged judge.
# NOTE(review): destination and format are decided by `export()` itself — check its definition.
forged_judge.export()

In [15]:
# End the Weave session started by weave.init() earlier in the notebook.
weave.finish()