In [None]:
!pip install judgeval==0.0.51 graphiti-core neo4j pandas langgraph openai langchain




# **IMPORT NECESSARY PACKAGES**

In [None]:
# Standard Library
import os
import sys
import json
import uuid
import asyncio
import logging
import warnings
from datetime import datetime, timezone
from pathlib import Path
from typing import TypedDict, Sequence, Annotated
# Environment Variables
from dotenv import load_dotenv


# Data Processing
import pandas as pd
from tqdm import tqdm

# IPython/Jupyter
import ipywidgets as widgets
from IPython.display import display

# LangGraph
from langgraph.graph import StateGraph, START, END, add_messages
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import ToolNode

# LangChain
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI

# Judgeval
from judgeval.judgment_client import JudgmentClient
from judgeval.scorers.judgeval_scorers.api_scorers import FaithfulnessScorer
from judgeval.common.tracer import Tracer
from judgeval.integrations.langgraph import JudgevalCallbackHandler
from judgeval.data import Example
from judgeval.data.datasets import EvalDataset
from judgeval.scorers import AnswerCorrectnessScorer
from judgeval.scorers import InstructionAdherenceScorer

# Graphiti
from graphiti_core import Graphiti
from graphiti_core.nodes import EpisodeType
from graphiti_core.edges import EntityEdge
from graphiti_core.utils.maintenance.graph_data_operations import clear_data
from graphiti_core.search.search_config_recipes import NODE_HYBRID_SEARCH_EPISODE_MENTIONS

# Misc
import warnings



# **INITIALIZE OPENAI API KEY**

In [None]:
# Do not write the key into the notebook: take it from the environment
# (optionally populated from a local .env file) and fail fast when it is missing.
load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
    raise EnvironmentError(
        "OPENAI_API_KEY is not set. Export it or add it to a .env file before running this notebook."
    )

# **READ THE USDA FOOD DATASET**

In [None]:


# Directory holding the USDA CSV exports
folder_path = "USDA-Food-Dataset"

# Names of the entries in that directory that end in ".csv"
csv_files = list(filter(lambda entry: entry.endswith('.csv'), os.listdir(folder_path)))

# Read only the header row of each CSV (nrows=0) and report its column names
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    try:
        df = pd.read_csv(file_path, nrows=0)
        separator = "-" * 50
        print(f"{file}:\n{list(df.columns)}\n{separator}")
    except Exception as e:
        print(f"Failed to read {file}: {e}")


food_calorie_conversion_factor.csv:
['food_nutrient_conversion_factor_id', 'protein_value', 'fat_value', 'carbohydrate_value']
--------------------------------------------------
food_nutrient_conversion_factor.csv:
['id', 'fdc_id']
--------------------------------------------------
food_attribute_type.csv:
['id', 'name', 'description']
--------------------------------------------------
food_attribute.csv:
['id', 'fdc_id', 'seq_num', 'food_attribute_type_id', 'name', 'value']
--------------------------------------------------
market_acquisition.csv:
['fdc_id', 'brand_description', 'expiration_date', 'label_weight', 'location', 'acquisition_date', 'sales_type', 'sample_lot_nbr', 'sell_by_date', 'store_city', 'store_name', 'store_state', 'upc_code']
--------------------------------------------------
foundation_food.csv:
['fdc_id', 'NDB_number', 'footnote']
--------------------------------------------------
food_update_log_entry.csv:
['id', 'description', 'last_updated']
------------------

# **CREATE KNOWLEDGE GRAPH TRIPLETS FROM THE USDA FOOD DATASET**

In [None]:
folder_path = "USDA-Food-Dataset/"

def read_csv_safe(path):
    """Read the CSV file at `path` into a DataFrame.

    `low_memory=False` is passed to pandas so the file is not parsed in
    internal chunks, which avoids mixed-type inference within a column.
    """
    frame = pd.read_csv(path, low_memory=False)
    return frame

# Load every USDA table used by the triplet extraction; each one lives in "<folder_path><table name>.csv"
def _load_table(table_name):
    """Read `<folder_path><table_name>.csv` through read_csv_safe."""
    return read_csv_safe(f"{folder_path}{table_name}.csv")

food = _load_table("food")
food_nutrient = _load_table("food_nutrient")
nutrient = _load_table("nutrient")
food_category = _load_table("food_category")
food_component = _load_table("food_component")
food_attribute = _load_table("food_attribute")
food_attribute_type = _load_table("food_attribute_type")
food_portion = _load_table("food_portion")
measure_unit = _load_table("measure_unit")
food_calorie_conv = _load_table("food_calorie_conversion_factor")
food_protein_conv = _load_table("food_protein_conversion_factor")
food_nutrient_conv = _load_table("food_nutrient_conversion_factor")
market_acquisition = _load_table("market_acquisition")
input_food = _load_table("input_food")
foundation_food = _load_table("foundation_food")

# Collects the (subject, predicate, object) tuples appended by the extraction code
triplets = list()

# Helper: pick the food description out of a (possibly merged) row
def get_desc(row):
    """Return `row["description"]` if present, else `row["description_x"]`, else "Unknown Food".

    `row` only needs a dict-style `.get(key, default)` method.
    """
    fallback = row.get("description_x", "Unknown Food")
    return row.get("description", fallback)

#  FOOD–NUTRIENT RELATIONSHIPS
# Join each food_nutrient row to its food (on fdc_id) and to its nutrient (nutrient_id -> nutrient.id).
# Every joined row yields a "contains" triplet (the amount is not part of it); rows whose nutrient
# name is exactly "Energy" additionally yield a "has calorie value" triplet built from amount and unit_name.

merged_fn = food_nutrient.merge(food, on="fdc_id").merge(nutrient, left_on="nutrient_id", right_on="id")
for _, row in merged_fn.iterrows():
    food_name = get_desc(row)
    nutrient_name = row["name"]
    triplets.append( (food_name, "contains", nutrient_name) )
    if nutrient_name == "Energy":
        triplets.append( (food_name, "has calorie value", f"{row['amount']} {row['unit_name']}") )

#  FOOD–CATEGORY RELATIONSHIPS
# Left join, so foods without a matching category are kept and then skipped by the notna check.
# description_x / description_y are the pandas default suffixes for a column present in both frames;
# they are read here as the food's and the category's description respectively
# (assumes food_category has a `description` column — TODO confirm).

merged_fc = food.merge(food_category, left_on="food_category_id", right_on="id", how="left")
for _, row in merged_fc.iterrows():
    food_name = row.get("description_x", "Unknown Food")
    category_name = row.get("description_y")
    if pd.notna(category_name):
        triplets.append( (food_name, "belongs to category", category_name) )

#  FOOD–COMPONENT RELATIONSHIPS
# assumes food_component has a `name` column that survives the merge unsuffixed — TODO confirm

merged_comp = food_component.merge(food, on="fdc_id")
for _, row in merged_comp.iterrows():
    food_name = get_desc(row)
    triplets.append( (food_name, "is source of", row["name"]) )

#  FOOD–ATTRIBUTE RELATIONSHIPS
# food_attribute and food_attribute_type both have a `name` column, so the second merge suffixes them;
# `name_y` is the one coming from food_attribute_type and is embedded in the predicate text.
# NOTE(review): `value` is appended without a NaN check, unlike the category section above.

merged_attr = food_attribute.merge(food, on="fdc_id").merge(food_attribute_type, left_on="food_attribute_type_id", right_on="id")
for _, row in merged_attr.iterrows():
    food_name = get_desc(row)
    attr_type = row["name_y"]
    attr_value = row["value"]
    triplets.append( (food_name, f"has attribute ({attr_type})", attr_value) )

#  FOOD–PORTION RELATIONSHIPS
# The object is a single string: "<amount> <measure unit name> (<portion_description>)".

merged_portion = food_portion.merge(food, on="fdc_id").merge(measure_unit, left_on="measure_unit_id", right_on="id")
for _, row in merged_portion.iterrows():
    food_name = get_desc(row)
    portion_desc = f"{row['amount']} {row['name']} ({row['portion_description']})"
    triplets.append( (food_name, "has portion", portion_desc) )

#  FOOD–CALORIE CONVERSION FACTOR
# Subjects here are synthetic "ConversionFactor-<id>" labels, not food names; no join to `food` is done.

for _, row in food_calorie_conv.iterrows():
    factor_id = f"ConversionFactor-{row['food_nutrient_conversion_factor_id']}"
    triplets.append( (factor_id, "has protein value", row["protein_value"]) )
    triplets.append( (factor_id, "has fat value", row["fat_value"]) )
    triplets.append( (factor_id, "has carbohydrate value", row["carbohydrate_value"]) )

#  FOOD–PROTEIN & NUTRIENT CONVERSION FACTORS
# Uses the same "ConversionFactor-<id>" subject labels as the calorie section above.

for _, row in food_protein_conv.iterrows():
    factor_id = f"ConversionFactor-{row['food_nutrient_conversion_factor_id']}"
    triplets.append( (factor_id, "has protein conversion factor", row["value"]) )

# NOTE(review): subject and object are the raw fdc_id / id values rather than a food description
# or a "ConversionFactor-<id>" label, so these triplets do not share subjects with the others — confirm intent.
for _, row in food_nutrient_conv.iterrows():
    triplets.append( (row["fdc_id"], "has nutrient conversion factor id", row["id"]) )

#  MARKET ACQUISITION RELATIONSHIPS
# location and brand_description are optional: a triplet is emitted only for non-null values.

merged_market = market_acquisition.merge(food, on="fdc_id")
for _, row in merged_market.iterrows():
    food_name = get_desc(row)
    if pd.notna(row.get("location")):
        triplets.append( (food_name, "acquired from location", row["location"]) )
    if pd.notna(row.get("brand_description")):
        triplets.append( (food_name, "has brand", row["brand_description"]) )

#  INPUT FOOD RELATIONSHIPS
# The join key is input_food.fdc_of_input_food -> food.fdc_id.
# NOTE(review): pandas applies the '_input' / '_food' suffixes only to columns present in both frames.
# If input_food has no `description` column, neither `description_food` nor `description_input` exists
# and both lookups fall back to their "Unknown ..." defaults — verify against the CSV schema.

merged_input = input_food.merge(food, left_on="fdc_of_input_food", right_on="fdc_id", suffixes=('_input', '_food'))
for _, row in merged_input.iterrows():
    food_name = row.get("description_food", "Unknown Food")
    input_food_name = row.get("description_input", "Unknown Input Food")
    triplets.append( (food_name, "has input food", input_food_name) )

#  FOUNDATION FOOD NDB NUMBERS

merged_foundation = foundation_food.merge(food, on="fdc_id")
for _, row in merged_foundation.iterrows():
    food_name = get_desc(row)
    triplets.append( (food_name, "has NDB number", row["NDB_number"]) )

#  FINAL OUTPUT
# One row per triplet, in the order the sections above appended them.

triplet_df = pd.DataFrame(triplets, columns=["Subject", "Predicate", "Object"])


# Save as pretty JSON: a list of {"Subject", "Predicate", "Object"} records, indented by 2 spaces
output_path_json = "usda_full_kg_triplets.json"
triplet_df.to_json(output_path_json, orient="records", indent=2)
print(f"Extracted {len(triplets)} triplets and saved to {output_path_json}")


Extracted 263941 triplets and saved to usda_full_kg_triplets.json


In [None]:
# Distinct predicate strings, in order of first appearance in the triplet table
unique_predicates = pd.unique(triplet_df["Predicate"])

print("\nUnique relationship types:")
for rel in unique_predicates:
    print(rel)


Unique relationship types:
contains
has calorie value
belongs to category
has attribute (Common Name)
has attribute (Attribute)
has portion
has protein value
has fat value
has carbohydrate value
has protein conversion factor
has nutrient conversion factor id
acquired from location
has brand
has input food
has NDB number


# **SET UP LOGGING AND CONFIGURE GRAPHITI**

In [None]:

# Pull variables from a local .env file (if any) into the process environment
load_dotenv()

# Silence every warning category for the rest of the session
warnings.simplefilter("ignore")

# Configure logging
def setup_logging():
    """Set the root logger to INFO with one stdout handler and quiet chatty libraries.

    Returns:
        The root logger.
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    # A handler is attached only when the root logger has none, so re-running the cell does not duplicate output
    if len(root_logger.handlers) == 0:
        stdout_handler = logging.StreamHandler(sys.stdout)
        stdout_handler.setLevel(logging.INFO)
        stdout_handler.setFormatter(
            logging.Formatter('%(name)s - %(levelname)s - %(message)s')
        )
        root_logger.addHandler(stdout_handler)

    # These loggers are restricted to ERROR and above
    for noisy_name in (
        "httpx",
        "openai",
        "neo4j",
        "neo4j.notifications",
        "graphiti_core",
        "asyncio",
        "urllib3",
    ):
        logging.getLogger(noisy_name).setLevel(logging.ERROR)

    return root_logger

logger = setup_logging()

# Confirm logger initialization
logger.info("Logging setup completed.")

# Neo4j connection settings come from the environment (or .env) so that no credential is stored
# in the notebook. The URI and user fall back to the local defaults this notebook used before.
# NOTE: a password that was previously committed in this cell should be rotated.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://127.0.0.1:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD")
if not neo4j_password:
    raise EnvironmentError(
        "NEO4J_PASSWORD is not set. Export it or add it to a .env file before running this notebook."
    )

# Initialize Graphiti client
client = Graphiti(neo4j_uri, neo4j_user, neo4j_password)

logger.info("Connected to Neo4j via Graphiti!")


root - INFO - Logging setup completed.


root - INFO - Connected to Neo4j via Graphiti!


# **GENERATING A DATABASE SCHEMA**

In [None]:

# One-time reset, intentionally left commented out: run it only the first time, before ingesting triplets.
# It passes the Graphiti driver to clear_data (presumably wiping the existing graph — confirm before enabling).
#await clear_data(client.driver)
await client.build_indices_and_constraints()

# **INGESTING THE KNOWLEDGE GRAPH TRIPLETS FOR CONTEXT RETRIEVAL**

In [None]:


logger = logging.getLogger(__name__)

async def ingest_usda_triplets(client, batch_size=10, max_triplets=5000):
    """Ingest USDA knowledge-graph triplets into Graphiti as JSON episodes.

    Reads `usda_full_kg_triplets.json` from the current working directory and
    sends each triplet to `client.add_episode`, awaiting `batch_size` calls at a time.

    Args:
        client: Object exposing an async `add_episode(...)` method (the Graphiti client in this notebook).
        batch_size: Number of episodes awaited concurrently per batch.
        max_triplets: Ingest at most this many triplets from the start of the file;
            `None` ingests all of them. Defaults to 5000, the limit that used to be hardcoded.

    Returns:
        None. A failing episode is logged and counted; it does not abort the run.
    """
    json_file_path = Path.cwd() / 'usda_full_kg_triplets.json'
    with open(json_file_path, encoding='utf-8') as file:
        triplets = json.load(file)
    if max_triplets is not None:
        triplets = triplets[:max_triplets]

    total = len(triplets)
    failed = 0

    with tqdm(
        total=total,
        desc="Ingesting USDA Triplets",
        bar_format='{desc}: {percentage:3.0f}% |{bar}|'
    ) as pbar:
        for i in range(0, total, batch_size):
            batch = triplets[i:i + batch_size]
            tasks = []

            for idx, triplet in enumerate(batch):
                episode_body = {
                    "Subject": triplet.get("Subject"),
                    "Predicate": triplet.get("Predicate"),
                    "Object": triplet.get("Object")
                }

                # dict.get only falls back when the key is absent: a null or numeric
                # subject must still be turned into a usable string name.
                subject = triplet.get("Subject")
                episode_name = str(subject) if subject is not None else f'Triplet {i+idx}'

                tasks.append(
                    client.add_episode(
                        name=episode_name,
                        episode_body=json.dumps(episode_body),
                        source_description='USDA KG Triplet',
                        source=EpisodeType.json,
                        reference_time=datetime.now(timezone.utc),
                    )
                )

            # return_exceptions=True collects every outcome: without it only the first failure
            # of a batch is raised while the remaining calls keep running unobserved.
            results = await asyncio.gather(*tasks, return_exceptions=True)
            for idx, result in enumerate(results):
                if isinstance(result, BaseException):
                    failed += 1
                    logger.error(f"Error ingesting triplet {i + idx}: {result}")

            pbar.update(len(batch))

    if failed:
        logger.warning(f"{failed} of {total} USDA triplets failed to ingest.")
    logger.info("Finished ingesting all USDA triplets.")


In [None]:
# Run the ingestion with the default batch size, using the notebook's Graphiti client
await ingest_usda_triplets(client)

Ingesting USDA Triplets: 100% |██████████|

__main__ - INFO - Finished ingesting all USDA triplets.





In [None]:


user_name = 'health_user'

# Create user node in Graphiti with a health-specific context
await client.add_episode(
    name='User Creation',
    episode_body=f'{user_name} is interested in learning about nutrition and health topics.',
    source=EpisodeType.text,
    reference_time=datetime.now(timezone.utc),
    source_description='HealthBot',
)

# Retrieve the created user's node UUID
nl = await client._search(user_name, NODE_HYBRID_SEARCH_EPISODE_MENTIONS)
user_node_uuid = nl.nodes[0].uuid if nl.nodes else None

# Retrieve the HealthBot node UUID (optional)
nl = await client._search('HealthBot', NODE_HYBRID_SEARCH_EPISODE_MENTIONS)
healthbot_node_uuid = nl.nodes[0].uuid if nl.nodes else None

# Utility function to convert entity edges to a clean facts string
def edges_to_facts_string(entities: "list[EntityEdge]") -> str:
    """Render the `fact` of every edge as a bulleted list, one "- fact" per line.

    Args:
        entities: Edges exposing a `fact` attribute. The annotation is quoted so it is
            not evaluated when the function is defined.

    Returns:
        The bulleted facts joined by newlines, or 'No facts found.' when `entities` is empty or None.
    """
    if not entities:
        return 'No facts found.'
    # Every bullet uses the same "- " prefix (the first one used to be emitted as "-fact")
    return '\n'.join(f'- {edge.fact}' for edge in entities)


# **DEFINE THE GET_HEALTH_DATA TOOL**

In [None]:
# Define the health data retrieval tool
@tool
async def get_health_data(query: str) -> str:
    """Search the Graphiti graph for information about foods and nutrients."""
    # The docstring above is kept verbatim: the @tool decorator uses it as the tool description.
    # Search around the HealthBot node and cap the result at 25 edges.
    search_options = {
        'center_node_uuid': healthbot_node_uuid,
        'num_results': 25,
    }
    matching_edges = await client.search(query, **search_options)
    return edges_to_facts_string(matching_edges)

# Register the tool
tools = [get_health_data]
tool_node = ToolNode(tools)

# Initialize LLM and bind with tools
llm = ChatOpenAI(model='gpt-4o-mini', temperature=0.1).bind_tools(tools)

# Test the tool node with a health query
await tool_node.ainvoke({'messages': [await llm.ainvoke('nutrients in broccoli')]})

{'messages': [ToolMessage(content='-Broccoli, raw contains Protein\n- Broccoli, raw contains Vitamin K (Dihydrophylloquinone)\n- Broccoli, raw contains Folate, total\n- Broccoli, raw contains Sodium, Na\n- Broccoli, raw contains Fatty acids, total saturated\n- Broccoli, raw contains PUFA 18:2\n- Broccoli, raw contains PUFA 18:3\n- Broccoli, raw contains Thiamin\n- Broccoli, raw contains SFA 14:0\n- Broccoli, raw contains Vitamin B-6\n- Broccoli, raw contains Vitamin K (phylloquinone)\n- Broccoli, raw contains Vitamin E (alpha-tocopherol)\n- Broccoli, raw contains Calcium, Ca\n- Broccoli, raw contains Fiber, total dietary\n- Broccoli, raw contains Proline\n- Broccoli, raw contains Tocopherol, beta\n- Broccoli, raw contains PUFA 20:4c\n- Broccoli, raw contains Cryptoxanthin, beta\n- Broccoli, raw contains Iron, Fe\n- Broccoli, raw contains Phytofluene\n- Broccoli, raw contains Pantothenic acid\n- Broccoli, raw contains Carotene, beta\n- Broccoli, raw contains Fiber, soluble\n- Broccoli, 

# **INITIALIZE JUDGEVAL**

In [None]:
# Judgment credentials are read from the environment (or .env) instead of being written into the notebook.
judgment_api_key = os.environ.get("JUDGMENT_API_KEY")
judgment_org_id = os.environ.get("JUDGMENT_ORG_ID")
if not judgment_api_key or not judgment_org_id:
    raise EnvironmentError(
        "JUDGMENT_API_KEY and JUDGMENT_ORG_ID must be set. Export them or add them to a .env file."
    )

# Tracer plus LangGraph callback handler (the handler is passed in the graph config's callbacks)
judgment_tracer = Tracer(
    api_key=judgment_api_key,
    project_name="graphiti-agent",
    organization_id=judgment_org_id
)
handler = JudgevalCallbackHandler(judgment_tracer)

# Client used to run evaluations
judgment_client = JudgmentClient(
    judgment_api_key=judgment_api_key,
    organization_id=judgment_org_id
)
faithfulness_scorer = FaithfulnessScorer(threshold=0.5)

# **THE CHATBOT FUNCTION**

In [None]:
class State(TypedDict):
    # Conversation messages; annotated with LangGraph's add_messages so that messages returned by a node
    # are merged into this list
    messages: Annotated[list, add_messages]
    # User name supplied by the caller (e.g. 'health_user' in this notebook)
    user_name: str
    # NOTE(review): declared as str, but the single-call example in this notebook passes None — confirm
    # whether this field should be optional
    user_node_uuid: str

async def chatbot(state: State):
    """Agent node: ask the LLM for the next message and score final answers with Judgeval.

    Args:
        state: Graph state; only `state['messages']` is read.

    Returns:
        dict: `{'messages': [response]}`, where `response` is the LLM reply
        (a final answer or a tool-call request).

    Faithfulness is scored only when the reply is a final answer (text content and no
    tool calls). It used to be scored when the last message was the human turn, which
    scored the repr of a tool-call request and skipped the real answer produced after
    the tool round-trip.
    """
    history = list(state.get('messages') or [])

    # System prompt
    system_message = SystemMessage(
        content=(
            "You are HealthBot, an expert clinical nutrition and health assistant with advanced knowledge of food science and biochemistry. "
            "All your responses are grounded in authoritative databases such as the USDA FoodData Central and your extensive medical and nutritional expertise. "
            "Provide clear, confident, and concise answers with relevant numeric details, standard ranges, and practical advice, as a professional nutrition specialist would communicate to patients or users. "
            "If a specific value is not directly available in the context, integrate your domain knowledge to provide standard values or best-known estimates without mentioning lack of data or disclaimers. "
            "Never reveal missing context or limitations in your knowledge. "
            "Always maintain a trustworthy, direct, and professional tone, ensuring the user receives an actionable and confident response."
        )
    )

    messages = [system_message] + history

    # Get LLM response with tool binding
    response = await llm.ainvoke(messages)

    # The question being answered is the most recent human turn; after a tool round-trip
    # the last message is a tool result, so looking only at the last message is not enough.
    last_human = next(
        (message for message in reversed(history) if isinstance(message, HumanMessage)),
        None,
    )
    is_final_answer = bool(getattr(response, 'content', None)) and not getattr(response, 'tool_calls', None)

    # Evaluate with Judgeval using context retrieved from Graphiti for the user's question
    if last_human is not None and is_final_answer:
        search_results = await client.search(
            last_human.content,
            num_results=25,
        )
        retrieval_context = [edge.fact for edge in search_results] if search_results else []

        example = Example(
            input=last_human.content,
            actual_output=response.content,
            retrieval_context=retrieval_context,
        )
        evaluation_results = judgment_client.run_evaluation(
            examples=[example],
            scorers=[faithfulness_scorer],
            model="gpt-4o",
            project_name="graphiti-agent",
            eval_run_name=f"health_eval_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        )
        print("\nJudgeval Evaluation Results:", evaluation_results)

    return {'messages': [response]}

# **SET UP THE AGENT**

In [None]:
async def should_continue(state, config):
    """Route after the agent node: 'continue' if the newest message carries tool calls, else 'end'."""
    newest = state['messages'][-1]
    return 'continue' if newest.tool_calls else 'end'

In [None]:


# In-memory checkpointer handed to the compiled graph
memory = MemorySaver()

graph_builder = StateGraph(State)

# Nodes: the LLM agent and the tool executor
graph_builder.add_node('agent', chatbot)
graph_builder.add_node('tools', tool_node)

# Wiring: START -> agent; agent -> tools or END depending on should_continue; tools -> agent
agent_routes = {'continue': 'tools', 'end': END}
graph_builder.add_edge(START, 'agent')
graph_builder.add_conditional_edges('agent', should_continue, agent_routes)
graph_builder.add_edge('tools', 'agent')

graph = graph_builder.compile(checkpointer=memory)


# **RUNNING THE AGENT FOR A SINGLE CALL**

In [None]:


# Example test input
test_input = {
    'messages': [
        {
            'role': 'user',
            'content': 'What are the ingredients in 1% Milk? And what does Broccoli contain?'
        }
    ],
    'user_name': 'health_user',
    'user_node_uuid': None,
}

# Config with Judgeval handler and thread_id
config = {
    'configurable': {'thread_id': uuid.uuid4().hex},
    'callbacks': [handler],
}

# Run the agent graph with this input
final_state = await graph.ainvoke(test_input, config=config)

# Print the output assistant message
if 'messages' in final_state and isinstance(final_state['messages'], list):
    last_message = final_state['messages'][-1]
    print("\nAssistant:", last_message.content if hasattr(last_message, "content") else last_message)
else:
    print("No messages returned.")


                     


Judgeval Evaluation Results: [ScoringResult(success=True, scorers_data=[ScorerData(name='Faithfulness', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because there are no contradictions between the output and the retrieval context. Everything is perfectly aligned and factually consistent. Great job!', strict_mode=None, evaluation_model='gpt-4o', error=None, evaluation_cost=None, verbose_logs=None, additional_metadata={'claims': [{'claim': 'A tool call was made for getting health data on 1% Milk ingredients.', 'quote': "Tool call: [{'name': 'get_health_data', 'args': {'query': '1% Milk ingredients'}, 'id': 'call_Rx7RSC9VKq0aQO22XngVshdO', 'type': 'tool_call'}"}, {'claim': 'A tool call was made for getting health data on Broccoli nutrients.', 'quote': "{'name': 'get_health_data', 'args': {'query': 'Broccoli nutrients'}, 'id': 'call_uConwqOzOMexgQodN5rCAFYr', 'type': 'tool_call'}"}, {'claim': 'The tool call for 1% Milk ingredients has an ID of call_Rx7RSC9VKq0aQO22Xng

# **RUNNING THE AGENT IN INTERACTIVE MODE**

In [None]:


# Output widget that collects the conversation transcript
conversation_output = widgets.Output()

# Fresh thread id for the interactive session
config = dict(configurable=dict(thread_id=uuid.uuid4().hex))

user_state = dict(user_name=user_name)

# Define process_input to handle user messages
async def process_input(user_state: State, user_input: str):
    """Echo the user's message, stream it through the graph and append the AI replies to the transcript.

    Any exception raised while streaming is written to the transcript as 'Error: ...'.
    """
    conversation_output.append_stdout(f'\nUser: {user_input}\n')
    conversation_output.append_stdout('\nAssistant: ')

    graph_state = dict(
        messages=[dict(role='user', content=user_input)],
        user_name=user_state['user_name'],
    )

    try:
        async for event in graph.astream(graph_state, config=config):
            for value in event.values():
                if 'messages' not in value:
                    continue
                newest = value['messages'][-1]
                # Only AI messages are shown
                if not (isinstance(newest, AIMessage) and hasattr(newest, 'content')):
                    continue
                conversation_output.append_stdout(newest.content)
    except Exception as e:
        conversation_output.append_stdout(f'Error: {e}')

# Strong references to in-flight tasks: the event loop only keeps weak references,
# so a task whose handle is dropped can be garbage-collected before it finishes.
_pending_tasks = set()

# Setup input box and submit button
def on_submit(b):
    """Button callback: clear the input box and process the typed message in a background task.

    Args:
        b: The button instance passed by ipywidgets; unused.
    """
    user_input = input_box.value
    input_box.value = ''
    if not user_input.strip():
        # Nothing was typed: do not send an empty message to the agent
        return
    task = asyncio.create_task(process_input(user_state, user_input))
    _pending_tasks.add(task)
    task.add_done_callback(_pending_tasks.discard)

# Input controls; the button forwards clicks to on_submit
input_box = widgets.Text(placeholder='Type your message here...')
submit_button = widgets.Button(description='Send')
submit_button.on_click(on_submit)

# Display initial greeting and UI
conversation_output.append_stdout('Assistant: Hello, how can I help you today?')

chat_ui = widgets.VBox(children=[input_box, submit_button, conversation_output])
display(chat_ui)

VBox(children=(Text(value='', placeholder='Type your message here...'), Button(description='Send', style=Butto…

# **INSTRUCTION ADHERENCE UNIT TEST**

In [None]:

# Hand-written input/output pair used as a unit test for instruction adherence
example = Example(
    input="List three key nutrients found in spinach. Then tell me which one is highest in quantity per 100 grams.",
    actual_output=(
        "1. Iron\n"
        "2. Vitamin K\n"
        "3. Folate\n"
        "Among these, Vitamin K is highest in quantity per 100 grams of spinach."
    )
)

# ============ Initialize InstructionAdherenceScorer ============
scorer = InstructionAdherenceScorer(threshold=0.8)

# ============ Run unit test evaluation ============
# Evaluations go through the Judgeval client; `client` is the Graphiti client in this notebook
# and has no run_evaluation method.
results = judgment_client.run_evaluation(
    examples=[example],
    scorers=[scorer],
    model="gpt-4o-mini",
    project_name="graphiti-agent",
    eval_run_name="instruction_adherence_test"
)

# ============ Print results ============
print("Unit test evaluation results:")
for r in results:
    print(r)


                     

Unit test evaluation results:
ScoringResult(            success=True,             scorer_data=[ScorerData(name='Instruction Adherence', threshold=0.8, success=True, score=1.0, reason="[Verdict(instruction='1. List three key nutrients found in spinach.', analysis='The LLM output correctly listed three key nutrients: Iron, Vitamin K, and Folate.', score=1.0), Verdict(instruction='2. Identify and specify the nutrient that is highest in quantity per 100 grams of spinach.', analysis='The LLM correctly identified Vitamin K as the nutrient that is highest in quantity per 100 grams of spinach.', score=1.0)]", strict_mode=None, evaluation_model='gpt-4o-mini', error=None, evaluation_cost=None, verbose_logs=None, additional_metadata=None)],             data_object=Example(input=List three key nutrients found in spinach. Then tell me which one is highest in quantity per 100 grams., actual_output=1. Iron
2. Vitamin K
3. Folate
Among these, Vitamin K is highest in quantity per 100 grams of spinach.,

# **EVALUATION OF THE AGENT ON THE NUTRIBENCH BENCHMARK DATASET**

NutriBench is a benchmark dataset designed to evaluate nutrition question-answering systems. It contains realistic meal descriptions paired with nutritional information such as carbohydrate, protein, fat, and energy values. The dataset combines natural-language meal entries with structured nutrient data to test models on understanding, retrieval, and reasoning in nutrition-related tasks. It is valuable for developing and benchmarking healthbots, diet planners, and food analysis systems.

In [None]:


# ============ Load NutriBench CSV benchmark ============
# Only the first 50 rows are kept
df = pd.read_csv("NutriBench.csv").iloc[:50]


correctness_scorer = AnswerCorrectnessScorer(threshold=0.8)

# ============ Initialize examples list ============
examples = list()

async def generate_examples():
    """Run `chatbot` on every row of `df` and append one Example per row to the module-level `examples` list.

    NOTE(review): `chatbot` is called directly rather than through the compiled graph, so a reply that
    only requests a tool call is recorded as-is — confirm this is intended.
    NOTE(review): `expected_output` falls back to "" when the row has no such column — verify that the
    benchmark CSV provides it.
    """
    progress = tqdm(df.iterrows(), total=len(df), desc="Generating examples")
    for _, row in progress:
        user_input = row["meal_description"]

        # State passed to the chatbot node for this benchmark row
        state = dict(
            messages=[dict(role="user", content=user_input)],
            user_name="benchmark_user",
            user_node_uuid="benchmark_uuid",
        )

        bot_response = await chatbot(state)
        first_reply = bot_response['messages'][0]

        # No retrieval context is attached to benchmark examples
        examples.append(
            Example(
                input=user_input,
                actual_output=first_reply.content,
                expected_output=row.get("expected_output", ""),
                retrieval_context=[],
            )
        )

# ============ Main entrypoint ============
if __name__ == "__main__":
    # Run the asynchronous example generation with progress bar
    asyncio.run(generate_examples())

    # Build EvalDataset
    dataset = EvalDataset(examples=examples)

    # Evaluate the dataset using AnswerCorrectnessScorer
    print("Running evaluation with Judgeval...")
    results = client.run_evaluation(
        examples=dataset.examples,
        scorers=[correctness_scorer],
        model="gpt-4o",
        project_name="graphiti-agent",
        eval_run_name=f"nutribench_correctness_eval_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    )

    # Print results summary
    print("Evaluation complete. Results summary:")
    for r in results:
        print(r)


Generating examples: 100%|██████████| 50/50 [01:06<00:00,  1.34s/it]


Running evaluation with Judgeval...
                     

Evaluation complete. Results summary:
ScoringResult(            success=True,             scorer_data=[ScorerData(name='Answer Correctness', threshold=0.8, success=True, score=1.0, reason="The score is 1.00 because the model's output perfectly matches the expected output with no incorrect or inconsistent statements.", strict_mode=None, evaluation_model='gpt-4o', error=None, evaluation_cost=None, verbose_logs=None, additional_metadata=None)],             data_object=Example(input=For a quick snack, I reached for 360g of bottled water, keeping it simple and hydrating., actual_output=Bottled water is an excellent choice for hydration, as it contains no calories, sugars, or fats. Drinking 360 grams (approximately 360 mL) of water provides essential hydration without any additional nutrients or calories. 

For optimal hydration, aim to drink water throughout the day, especially if you're active or in a hot environment. The general recommendation is about 2 to 3 liters (or 8 to 12 cups) of w