In [1]:
#%pip install --quiet llama-index-llms-mistralai llama-index-llms-groq llama-index-llms-anthropic llama-index-llms-gemini llama-index-llms-openai llama-index-llms-openai-like

In [1]:
import sys
import os

# Make the parent of the notebook's directory importable so that Python
# recognizes "app" as a package.
# A notebook has no real __file__, so the current working directory stands in
# for the notebook's location (this is what the former
# os.path.abspath("__file__") trick resolved to).
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, ".."))
# Re-running this cell must not pile up duplicate sys.path entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
from dotenv import load_dotenv
# Load environment variables (e.g. DEEPSEEK_API_KEY, read further down via
# os.getenv) from a .env file into the process environment.
# NOTE(review): no path is passed, so python-dotenv locates the file itself —
# presumably by searching upward from the working directory; confirm the .env
# file sits at or above this notebook's directory.
load_dotenv()

True

In [3]:
import time
from typing import Dict, List
from tqdm.auto import tqdm

import pandas as pd
from datasets import Dataset
from llama_index.llms.gemini import Gemini
from llama_index.llms.openai import OpenAI
from llama_index.llms.anthropic import Anthropic
from llama_index.llms.mistralai import MistralAI
from llama_index.llms.openai_like import OpenAILike
from llama_index.llms.groq import Groq
from prettytable import PrettyTable
from ragas import evaluate
from ragas.llms import LlamaIndexLLMWrapper
from ragas.metrics import (
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)
from google.generativeai.types import RequestOptions
from google.api_core import retry

from app.workflows.shared import graph_store

  from pandas.core import (




In [4]:
# Import flows
from app.workflows.naive_text2cypher import NaiveText2CypherFlow
from app.workflows.naive_text2cypher_retry import NaiveText2CypherRetryFlow
from app.workflows.text2cypher_retry_check import NaiveText2CypherRetryCheckFlow
from app.workflows.iterative_planner import IterativePlanningFlow

In [5]:
# Load the benchmark set: a semicolon-separated file whose columns include the
# natural-language "Question" and its reference "Cypher" query.
test_df = pd.read_csv("test_data.csv", sep=";")
test_df.head()

Unnamed: 0,Question,Cypher
0,Who acted in Tom Hanks’s highest-rated movie?,MATCH (p:Person {name: 'Tom Hanks'})-[:ACTED_I...
1,Which movie starring Keanu Reeves has the most...,"MATCH (meg:Actor {name: ""Keanu Reeves""})-[:ACT..."
2,Who directed the most recent movie starring Ha...,"MATCH (p:Person {name: ""Halle Berry""})-[:ACTED..."
3,What is the highest-rated movie from the 1990s...,MATCH (m:Movie)-[:DIRECTED]-(d:Person) WHERE m...
4,"For all movies starring Keanu Reeves, find the...","MATCH (keanu:Person {name: ""Keanu Reeves""})-[:..."


In [6]:
async def evaluate_flow_llm_combination(flow_name, flow, llm_name, llm, test_df, graph_store, timeout=90):
    """Benchmark one workflow/LLM pairing over every question in ``test_df``.

    A single instance of ``flow`` answers each question in turn while the
    elapsed time per question is recorded. The reference Cypher of each row is
    executed against ``graph_store`` to obtain a ground-truth string. The
    collected answers are then scored with the ragas ``answer_relevancy``
    metric, judged by OpenAI ``gpt-4o-2024-11-20`` at temperature 0.

    Args:
        flow_name: Label of the workflow, used in the progress bar and in
            failure messages.
        flow: Callable invoked as ``flow(llm=..., timeout=...)``; the returned
            object must expose an awaitable ``run(input=...)``.
        llm_name: Label of the LLM, used in the progress bar and in failure
            messages.
        llm: LLM object handed to the workflow.
        test_df: DataFrame with ``Question`` and ``Cypher`` columns.
        graph_store: Object exposing ``structured_query(cypher)``.
        timeout: Value forwarded to the workflow constructor as ``timeout``
            (seconds). Defaults to 90.

    Returns:
        dict with ``'answer_relevancy'`` (the metric result returned by ragas),
        ``'avg_latency'`` (mean seconds per question) and ``'timeout/errors'``
        (number of questions whose workflow run raised).

    Raises:
        ValueError: If ``test_df`` has no rows.
    """
    # Fail early with a clear message instead of an obscure downstream error
    # (or a ZeroDivisionError when averaging latencies).
    if len(test_df) == 0:
        raise ValueError("test_df is empty; nothing to evaluate")

    results = []
    latencies = []
    ground_truth = []
    timeouts = 0
    flow_instance = flow(llm=llm, timeout=timeout)

    # Add tqdm progress bar
    for _, row in tqdm(test_df.iterrows(), total=len(test_df),
                       desc=f"Evaluating {flow_name} with {llm_name}"):
        question = row['Question']

        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments.
        start = time.perf_counter()
        try:
            data = await flow_instance.run(input=question)
        except Exception as exc:
            # Only ordinary errors are treated as a failed answer;
            # KeyboardInterrupt and task cancellation are BaseExceptions and
            # deliberately propagate so the run can be stopped.
            tqdm.write(f"{flow_name} with {llm_name} failed on {question!r}: {exc!r}")
            data = {"answer": "timeout/error", "question": question}
            timeouts += 1
        latencies.append(time.perf_counter() - start)
        results.append(data)

        # Best effort: a reference query that cannot be executed is recorded
        # as "missing" rather than aborting the benchmark.
        try:
            ground_truth.append(str(graph_store.structured_query(row['Cypher'])))
        except Exception:
            ground_truth.append("missing")

    # Create evaluation dataset
    df = pd.DataFrame(results)
    df['ground_truth'] = ground_truth
    df['latencies'] = latencies
    dataset = Dataset.from_pandas(df)

    # Run evaluation
    result = evaluate(
        dataset,
        metrics=[answer_relevancy],
        llm=LlamaIndexLLMWrapper(OpenAI(model="gpt-4o-2024-11-20", temperature=0))
    )

    return {
        'answer_relevancy': result['answer_relevancy'],
        'avg_latency': sum(latencies) / len(latencies),
        'timeout/errors': timeouts
    }

async def run_grid_search(
    flows: List[type],
    llms: List[tuple],
    test_df: pd.DataFrame,
    graph_store: object
):
    """Evaluate every workflow/LLM pairing and collect the per-pairing metrics.

    Args:
        flows: Workflow classes; each is passed to
            ``evaluate_flow_llm_combination`` and labelled by its ``__name__``.
        llms: ``(llm_name, llm)`` pairs.
        test_df: Benchmark DataFrame forwarded to the evaluation.
        graph_store: Graph store forwarded to the evaluation.

    Returns:
        A list with one dict per successfully evaluated pairing, holding
        ``'flow'``, ``'llm'`` and every key returned by
        ``evaluate_flow_llm_combination``. Pairings that are skipped or whose
        evaluation raises are left out.
    """
    # IterativePlanningFlow is only benchmarked with these LLM labels; every
    # other pairing of that flow is skipped.
    iterative_planning_llms = ("gpt-4o", "1.5pro")
    results = []

    for flow in flows:
        flow_name = getattr(flow, "__name__", repr(flow))
        for llm_name, llm in llms:
            # Decide on skipping before announcing the run, so the log does
            # not list pairings that were never evaluated.
            if flow_name == "IterativePlanningFlow" and llm_name not in iterative_planning_llms:
                continue

            print(f"\nEvaluating {flow_name} with {llm_name}")
            try:
                result = await evaluate_flow_llm_combination(
                    flow_name=flow_name,
                    flow=flow,
                    llm_name=llm_name,
                    llm=llm,
                    test_df=test_df,
                    graph_store=graph_store
                )
            except Exception as exc:
                # Best effort: one failing pairing must not abort the grid, but
                # the reason is reported. KeyboardInterrupt and cancellation
                # are not caught, so the whole search can still be stopped.
                print(f"Skipping {flow_name} with {llm_name}: {exc!r}")
                continue

            print(result)
            results.append({
                'flow': flow_name,
                'llm': llm_name,
                **result
            })

    return results

In [7]:
flows = [
    IterativePlanningFlow,
    NaiveText2CypherFlow,
    NaiveText2CypherRetryFlow,
    NaiveText2CypherRetryCheckFlow
]  # Add your flows

google_retry = dict(retry=retry.Retry(initial=0.1, multiplier=2, timeout=61))
llms = [
    ("1.5pro", Gemini(model="models/gemini-1.5-pro", temperature=0, request_options=google_retry)),
    ("1.5flash", Gemini(model="models/gemini-1.5-flash", temperature=0, request_options=google_retry)),
    #("2.0flash", Gemini(model="models/gemini-2.0-flash-exp", temperature=0)), # rate limits
    ("gpt-4o", OpenAI(model="gpt-4o", temperature=0)),
    #("gpt-4o-mini", OpenAI(model="gpt-4o-mini", temperature=0)),
    ("o1", OpenAI(model="o1-preview", temperature=0)), #no tools
    ("o1-mini", OpenAI(model="o1-mini", temperature=0)), #no tools
    ("sonnet 3.5", Anthropic(model="claude-3-5-sonnet-latest", max_tokens=8076)),
    ("haiku 3.5", Anthropic(model="claude-3-5-haiku-latest", max_tokens=8076)),
    ("mistral medium", MistralAI(model="mistral-medium")),
    ("mistral large", MistralAI(model="mistral-large-latest")),
    ("ministral 8b", MistralAI(model="ministral-8b-latest")),
    ("codestral", MistralAI(model="codestral-latest")),
    ("deepsek-v3", OpenAILike(
                            model="deepseek-chat",
                            api_base="https://api.deepseek.com/beta",
                            api_key=os.getenv("DEEPSEEK_API_KEY"),
                        )
                    ),
    #("groq llama3 70b", Groq(model="llama3-70b-8192")) # Rate limits
    
    
    
]  # Add your LLMs

results = await run_grid_search(
    flows=flows, llms=llms, test_df=test_df, graph_store=graph_store
)



Evaluating IterativePlanningFlow with 1.5pro





Evaluating IterativePlanningFlow with 1.5pro:   0%|          | 0/53 [00:00<?, ?it/s]

Exception in callback Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-89eff2539276', bound_args=<BoundArgumen...borations?'})>, instance=<app.workflow...t 0x371826c50>, context=<_contextvars...t 0x3732c6880>)(<WorkflowHand...son_invalid')>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273
handle: <Handle Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-89eff2539276', bound_args=<BoundArgumen...borations?'})>, instance=<app.workflow...t 0x371826c50>, context=<_contextvars...t 0x3732c6880>)(<WorkflowHand...son_invalid')>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273>
Traceback (most recent call last):
  File "/Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/workflow/workflow.py", line 247, in _task
    new_ev = await instrumented_step(**kwargs)
      

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9949993984215767, 0.0, 0.0, 0.0, 0.0, 0.9255446080804511, 0.9480529023410917, 0.0, 0.0, 0.0, 0.0, 0.9642308647001295, 0.0, 0.0, 0.9114772262523193, 0.9942132787505584, 0.0, 0.0, 0.0, 0.0, 0.9768152775889568, 0.0, 0.9999999999999997, 0.0, 0.0, 0.0, 0.9201242237573121, 0.0, 0.0, 0.0, 0.9198666530615033, 0.0, 0.0, 0.0, 0.0, 0.9508387619187723, 0.0, 0.9271700932374024, 0.0, 0.8543403085593829, 0.0, 0.0, 0.0, 0.0, 0.9577391872133875, 0.9269296617738968, 0.0, 0.0, 0.0, 0.0, 0.9204039942955068, 0.9480071491023597, 0.0], 'avg_latency': 20.9698898027528, 'timeout/errors': 1}

Evaluating IterativePlanningFlow with 1.5flash

Evaluating IterativePlanningFlow with gpt-4o


Evaluating IterativePlanningFlow with gpt-4o:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9249969323612245, 0.0, 0.9703706704997245, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9761839216407041, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9894001515725316, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9829147621654565, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9541285442605668, 0.9435206746557657, 0.0, 0.9137268890630655, 0.0, 0.0, 0.0, 0.0, 0.9761055803559793, 0.0, 0.0], 'avg_latency': 24.676482110653282, 'timeout/errors': 0}

Evaluating IterativePlanningFlow with o1

Evaluating IterativePlanningFlow with o1-mini

Evaluating IterativePlanningFlow with sonnet 3.5

Evaluating IterativePlanningFlow with haiku 3.5

Evaluating IterativePlanningFlow with mistral medium

Evaluating IterativePlanningFlow with mistral large

Evaluating IterativePlanningFlow with ministral 8b

Evaluating IterativePlanningFlow with codestral

Evaluating IterativePlanningFlow with deepsek-v3

Evaluating NaiveText2CypherFlow with 1.5pro


Evaluating NaiveText2CypherFlow with 1.5pro:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9949993984215767, 0.9999999999999997, 0.9829458623660029, 0.0, 0.9285420055616255, 0.9289127551632536, 0.0, 0.9279729760430796, 0.9138625391009302, 0.9693385662237514, 0.9822890690442501, 0.9815878612887233, 0.9834085418855416, 0.9473388558800022, 0.9722568794852787, 0.9918676554847439, 0.970388925384084, 0.0, 0.0, 0.0, 0.9698168922983722, 0.9859132176101052, 0.9999999999999997, 0.920313695958005, 0.0, 0.9808597363335845, 0.9841965080362601, 0.0, 0.9662863170070782, 0.9525348561975923, 0.0, 0.0, 0.9429483073310371, 0.0, 0.9405507443246995, 0.0, 0.9801954893722957, 0.0, 0.9659747295340795, 0.9342005042693198, 0.9526292927227692, 0.9782739401827311, 0.9220202616154977, 0.9363917957984552, 0.9577391872133875, 0.9794759023153303, 0.0, 0.949448057893644, 0.0, 0.9354601131431354, 0.9214679572332067, 0.9258073854794538, 0.9594228248090867], 'avg_latency': 5.560009983350646, 'timeout/errors': 0}

Evaluating NaiveText2CypherFlow with 1.5flash


Evaluating NaiveText2CypherFlow with 1.5flash:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.0, 0.0, 0.0, 0.868427630395574, 0.0, 0.0, 0.0, 0.0, 0.9390897823426072, 0.0, 0.9811312962559807, 0.9812882381392732, 0.7022547748389808, 0.0, 0.9595701709516368, 0.9852627441742084, 0.9770365279014507, 0.0, 0.0, 0.0, 0.9716567433158619, 0.0, 0.9999999999999997, 0.9835002538672065, 0.0, 0.9758379485398123, 0.8968597827510463, 0.0, 0.9589372584411966, 0.9808843871828601, 0.9278099116689581, 0.0, 0.0, 0.0, 0.901160990124608, 0.0, 0.0, 0.0, 0.9819629724143226, 0.950164015461644, 0.9132657113647218, 0.9728073226427538, 0.9544806450369636, 0.9518690999170353, 0.9494555171314145, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9428459750464738, 0.0], 'avg_latency': 2.5880306171921066, 'timeout/errors': 0}

Evaluating NaiveText2CypherFlow with gpt-4o


Evaluating NaiveText2CypherFlow with gpt-4o:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9657327243945076, 0.9999999999999997, 0.9375330045636842, 0.9010290459807635, 0.9605657321793997, 0.9350400094034544, 0.0, 0.0, 0.9934139521065269, 0.9915098288465175, 0.9811312962559807, 0.9946073296702288, 0.9445508530783734, 0.9827927063581554, 0.9861571370714334, 1.0000000000000007, 0.9868758664548074, 0.0, 0.0, 0.0, 0.9766275485615704, 0.0, 0.9999999999999997, 0.9843437232262001, 0.0, 0.9808597363335845, 0.9325856535984051, 0.0, 0.9755375532741734, 0.9657233691152433, 0.0, 0.0, 0.0, 0.0, 0.9695576502211706, 0.0, 0.0, 0.9575512236078362, 0.9938601358184028, 0.950826407231879, 0.0, 0.9891942344856485, 0.0, 0.9532096039316021, 0.9577391872133875, 0.9947890884543034, 0.0, 0.9741363567387146, 0.0, 0.0, 0.9622659012423194, 0.9397697598222309, 0.9603533645059152], 'avg_latency': 6.400268572681355, 'timeout/errors': 0}

Evaluating NaiveText2CypherFlow with o1


Evaluating NaiveText2CypherFlow with o1:   0%|          | 0/53 [00:00<?, ?it/s]

Exception in callback Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-d37867f039f1', bound_args=<BoundArgumen...t coactor?'})>, instance=<app.workflow...t 0x39c444dd0>, context=<_contextvars...t 0x39cbac3c0>)(<WorkflowHand... 90 seconds')>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273
handle: <Handle Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-d37867f039f1', bound_args=<BoundArgumen...t coactor?'})>, instance=<app.workflow...t 0x39c444dd0>, context=<_contextvars...t 0x39cbac3c0>)(<WorkflowHand... 90 seconds')>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273>
Traceback (most recent call last):
  File "/Users/tomazbratanic/anaconda3/lib/python3.11/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/Users/tomazbratanic/anaconda3

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9698381379575419, 0.991883372863564, 0.9522521626850273, 0.0, 0.942012643854067, 0.0, 0.0, 0.0, 0.982389201845823, 0.9915098288465175, 0.9811312962559807, 0.9870248274456653, 0.9454009572001332, 0.0, 0.9857034584907215, 1.0000000000000007, 0.9868758664548074, 0.9302439356386728, 0.0, 0.9703728014088556, 0.9716567433158619, 0.0, 0.9999999999999997, 0.9243550354028119, 0.0, 0.9473799328107688, 0.9325856535984051, 0.9935214378119879, 0.9755475209846317, 0.0, 0.0, 0.9823336724926705, 0.0, 0.0, 0.0, 0.0, 0.9854165919630992, 0.9509460358753588, 0.9885703990220498, 0.9479915559256562, 0.0, 0.9891942344856485, 0.0, 0.9594903487948646, 0.9577391872133875, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9707571598838278, 0.9540298013668037, 0.0], 'avg_latency': 43.13295695466815, 'timeout/errors': 6}

Evaluating NaiveText2CypherFlow with o1-mini


Evaluating NaiveText2CypherFlow with o1-mini:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9698381379575419, 0.8960060811673188, 0.9636473187944211, 0.0, 0.0, 0.9404996523894532, 0.0, 0.0, 0.9862670672545785, 0.9897821171067838, 0.9777838577915207, 0.9815878612887233, 0.9999999999999997, 0.9543152496511772, 0.9900287509083733, 1.0000000000000007, 0.9789842515161288, 0.933925371730142, 0.0, 0.0, 0.9716567433158619, 0.9876378239773619, 0.9999999999999997, 0.9832346997476925, 0.0, 0.9316647613772527, 0.9263037002614539, 0.9935214378119879, 0.9763550654188721, 0.9055741128316251, 0.907096345912365, 0.987424973641085, 0.0, 0.9473074988639838, 0.7011909536023416, 0.0, 0.0, 0.0, 0.988356076582302, 0.9504591914705919, 0.9282761656723273, 0.0, 0.9692229584623321, 0.9581519334612917, 0.9230292002889063, 0.9211511286236757, 0.0, 0.0, 0.0, 0.0, 0.9702831999679834, 0.9428459750464738, 0.0], 'avg_latency': 18.622604729994286, 'timeout/errors': 0}

Evaluating NaiveText2CypherFlow with sonnet 3.5


Evaluating NaiveText2CypherFlow with sonnet 3.5:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.0, 0.9944069945793942, 0.0, 0.9646510737698675, 0.0, 0.9379414579385582, 0.0, 0.0, 0.9952693239961747, 0.9333858459571623, 0.9726840988312248, 0.0, 0.0, 0.9684445906055764, 0.9485667761051945, 1.0000000000000007, 0.9752124430223609, 0.9552068627268738, 0.9282913807188248, 0.9925700514858112, 0.9780473066011658, 0.0, 0.9658103289292521, 0.9835002538672065, 0.9816326165286043, 0.9808597363335845, 0.9065545735054968, 0.9339092423229959, 0.9763201528661081, 0.9149956258001112, 0.0, 0.0, 0.944594847007599, 0.9355072921648319, 0.0, 0.0, 0.0, 0.9434322079357994, 0.9698121060239716, 0.9321174698786034, 0.0, 0.0, 0.0, 0.0, 0.9223642376775355, 0.9592042801395082, 0.0, 0.0, 0.0, 0.9604461912517355, 0.9724614392766927, 0.8812406487399423, 0.9603776384202555], 'avg_latency': 10.551478862762451, 'timeout/errors': 0}

Evaluating NaiveText2CypherFlow with haiku 3.5


Evaluating NaiveText2CypherFlow with haiku 3.5:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9657327243945076, 0.9939396147892711, 0.963730652364781, 0.9629511597776221, 0.9405020213400409, 0.9350400094034544, 0.0, 0.0, 0.984380925241465, 0.9802448935653318, 0.9811312962559807, 0.9923944666796998, 0.0, 0.9660794336880724, 0.9861571370714334, 1.0000000000000007, 0.9849616380214904, 0.953028751523341, 0.0, 0.0, 0.9716567433158619, 0.0, 0.9999999999999997, 0.9843437232262001, 0.0, 0.9808597363335845, 0.9325856535984051, 0.0, 0.9705203979467253, 0.9386306469979009, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9641837929870715, 0.9480096204619874, 0.9526292927227692, 0.9802635080018304, 0.9544806450369636, 0.0, 0.932494296348182, 0.963375069628961, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'avg_latency': 8.430957956134149, 'timeout/errors': 0}

Evaluating NaiveText2CypherFlow with mistral medium


Evaluating NaiveText2CypherFlow with mistral medium:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.0, 0.9999992315976121, 0.0, 0.9654929737467811, 0.0, 0.0, 0.8993553956736534, 0.9499302377728162, 0.9829364345949441, 0.9068146233681055, 0.9811487792868933, 0.0, 0.9714542516939324, 0.0, 0.0, 1.0000000000000007, 0.986872263181592, 0.0, 0.0, 0.0, 0.9780473066011658, 0.9876378239773619, 0.9999999999999997, 0.9821611368876274, 0.0, 0.9758244953320939, 0.9075692161228366, 0.0, 0.973331753615342, 0.9657977992603329, 0.0, 0.0, 0.0, 0.0, 0.9695576502211706, 0.0, 0.0, 0.0, 0.9906964422588626, 0.0, 0.0, 0.0, 0.0, 0.9425118355063016, 0.9577391872133875, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8751113727043601, 0.9110526343452997, 0.0], 'avg_latency': 9.121988710367456, 'timeout/errors': 0}

Evaluating NaiveText2CypherFlow with mistral large


Evaluating NaiveText2CypherFlow with mistral large:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9698381379575419, 0.9999999999999997, 0.9917410337814058, 0.0, 0.9398810917210249, 0.9351767895249209, 0.0, 0.0, 0.9934139521065269, 0.9915098288465175, 0.9785793851982176, 0.9879687406986418, 0.9999999999999997, 0.9684445906055764, 0.9861571370714334, 1.0000000000000007, 0.9868758664548074, 0.9475609202071656, 0.0, 0.993715649075868, 0.9780473066011658, 0.0, 0.9999999999999997, 0.9843437232262001, 0.0, 0.9808597363335845, 0.9325856535984051, 0.0, 0.973331753615342, 0.9182138132621529, 0.0, 0.9823336724926705, 0.9761883717526239, 0.0, 0.9684779445473014, 0.0, 0.9867076731616654, 0.0, 0.9938601358184028, 0.9514227086203831, 0.0, 0.9933359189253742, 0.9692229584623321, 0.9493584178114727, 0.9223642376775355, 0.0, 0.0, 0.0, 0.0, 0.9625393303337467, 0.9761055803559793, 0.9999999999999997, 0.9614745058376358], 'avg_latency': 7.4956045960480315, 'timeout/errors': 0}

Evaluating NaiveText2CypherFlow with ministral 8b


Evaluating NaiveText2CypherFlow with ministral 8b:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9432607444420148, 0.0, 0.0, 0.0, 0.0, 0.9442099793876872, 0.8940678761774965, 0.954800394367851, 0.9902133765867328, 0.0, 0.9811312962559807, 0.9946073296702288, 0.0, 0.0, 0.9861571370714334, 1.0000000000000007, 0.0, 0.9523551094380557, 0.0, 0.0, 0.0, 0.9876378239773619, 0.0, 0.9843437232262001, 0.9999999999999996, 0.9808597363335845, 0.9212942731120256, 0.0, 0.9722238699306972, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9418363036163026, 0.0, 0.0, 0.0, 0.9938601358184028, 0.950826407231879, 0.0, 0.0, 0.9692005103817681, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6842213535297891, 0.0, 0.9480071491023597, 0.0], 'avg_latency': 4.845444256404661, 'timeout/errors': 0}

Evaluating NaiveText2CypherFlow with codestral


Evaluating NaiveText2CypherFlow with codestral:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [1.0000000000000002, 0.9999999999999997, 0.906095589152942, 0.0, 0.9419228444778501, 0.9048920423722824, 0.0, 0.9043111046382611, 0.9832641318884785, 0.0, 0.9811312962559807, 0.9879687406986418, 0.0, 0.9594982515001487, 0.9861571370714334, 1.0000000000000007, 0.9868758664548074, 0.9804424007796445, 0.0, 0.0, 0.9780407212033543, 0.9876378239773619, 0.9999999999999997, 0.9843437232262001, 0.0, 0.9808597363335845, 0.9325856535984051, 0.0, 0.9705203979467253, 0.9755147293659722, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9854165919630992, 0.0, 0.9938601358184028, 0.9501499874605438, 0.8981691129001453, 0.0, 0.9692229584623321, 0.9551478681000564, 0.9587235269065059, 0.9947890884543034, 0.9414193578517919, 0.0, 0.0, 0.8904839740825327, 0.9806689918500003, 0.9999999999999997, 0.0], 'avg_latency': 4.1370605927593305, 'timeout/errors': 0}

Evaluating NaiveText2CypherFlow with deepsek-v3


Evaluating NaiveText2CypherFlow with deepsek-v3:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [1.0000000000000002, 0.9999999999999997, 0.9593673492202157, 0.9638091737929537, 0.942834321006389, 0.9404996523894532, 0.0, 0.9350066673379316, 0.6469694948370815, 0.7125455500122048, 0.9811312962559807, 0.9923944666796998, 0.8620897928332057, 0.9824025228798795, 0.9831994358616831, 1.0000000000000007, 0.9752124430223609, 0.9477445230572829, 0.0, 0.0, 0.9780473066011658, 0.9876378239773619, 0.9999999999999997, 0.9843437232262001, 0.0, 0.9808597363335845, 0.9325856535984051, 0.0, 0.9722238699306972, 0.9260028629319894, 0.0, 0.9823336724926705, 0.9429483073310371, 0.0, 0.9695576502211706, 0.0, 0.7210516896295088, 0.9485932479116707, 0.9938601358184028, 0.9479358557825054, 0.0, 0.9567789843559971, 0.9692229584623321, 0.9554847154390048, 0.9552985855575084, 0.9533480125085106, 0.0, 0.9590019288101268, 0.0, 0.0, 0.9761055803559793, 0.8793638451311437, 0.9455296716518425], 'avg_latency': 9.383746704965267, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryFlow with 

Evaluating NaiveText2CypherRetryFlow with 1.5pro:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9949993984215767, 0.9999999999999997, 0.981525302339416, 0.0, 0.933145615788041, 0.9358692676563941, 0.0, 0.9279729760430796, 0.9138625391009302, 0.9548227818031693, 0.9812985975440629, 0.9815878612887233, 0.9834085418855416, 0.9473388558800022, 0.9449424251562402, 0.9918676554847439, 0.970388925384084, 0.0, 0.0, 0.0, 0.9766275485615704, 0.9859132176101052, 0.9999999999999997, 0.9843437232262001, 0.0, 0.9808597363335845, 0.9325856535984051, 0.0, 0.926123087937656, 0.9723686910298389, 0.0, 0.0, 0.9295510876728713, 0.0, 0.9503799018529571, 0.0, 0.9896423751422004, 0.0, 0.9670172411655281, 0.9364415160174651, 0.9430649940141521, 0.9701604888701411, 0.9220202616154977, 0.934832781093656, 0.9556721547366936, 0.0, 0.0, 0.0, 0.0, 0.9404325936566242, 0.9214679572332067, 0.9510159190223876, 0.9540664796145446], 'avg_latency': 5.8469507154428735, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryFlow with 1.5flash


Evaluating NaiveText2CypherRetryFlow with 1.5flash:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.0, 0.0, 0.0, 0.868427630395574, 0.0, 0.0, 0.0, 0.0, 0.9559969164365331, 0.9475921690068857, 0.9811312962559807, 0.9812882381392732, 0.7022547748389808, 0.0, 0.959491805088199, 0.9852627441742084, 0.9757759315952912, 0.0, 0.0, 0.0, 0.9733136783977647, 0.0, 0.9999999999999997, 0.9830538815406801, 0.0, 0.9758244953320939, 0.8968597827510463, 0.0, 0.9589372584411966, 0.973253196141069, 0.9278099116689581, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9819629724143226, 0.0, 0.0, 0.9728073226427538, 0.9544806450369636, 0.9435206746557657, 0.9556721547366936, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9428459750464738, 0.0], 'avg_latency': 2.7681290383608834, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryFlow with gpt-4o


Evaluating NaiveText2CypherRetryFlow with gpt-4o:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9657327243945076, 0.9999999999999997, 0.9023760622500072, 0.9452949688076595, 0.9367585597877222, 0.9350400094034544, 0.0, 0.0, 0.9934139521065269, 0.9915098288465175, 0.9811312962559807, 0.9946073296702288, 0.0, 0.9827927063581554, 0.9861571370714334, 1.0000000000000007, 0.9868753850257347, 0.9513140050364456, 0.0, 0.0, 0.9766275485615704, 0.0, 0.9999999999999997, 0.9843437232262001, 0.0, 0.9808597363335845, 0.9325856535984051, 0.0, 0.9755475209846317, 0.9657977992603329, 0.0, 0.0, 0.0, 0.0, 0.9695576502211706, 0.0, 0.0, 0.0, 0.9938601358184028, 0.936222784946945, 0.0, 0.9891942344856485, 0.9692229584623321, 0.9551478681000564, 0.9582313570599467, 0.9230335057240566, 0.0, 0.9398634401125218, 0.0, 0.0, 0.9827091336149906, 0.9480071491023597, 0.0], 'avg_latency': 6.323774616673307, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryFlow with o1


Evaluating NaiveText2CypherRetryFlow with o1:   0%|          | 0/53 [00:00<?, ?it/s]

Exception in callback Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-890c658788af', bound_args=<BoundArgumen...the 1980s?'})>, instance=<app.workflow...t 0x39c21b950>, context=<_contextvars...t 0x39b975b80>)(<WorkflowHand... 90 seconds')>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273
handle: <Handle Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-890c658788af', bound_args=<BoundArgumen...the 1980s?'})>, instance=<app.workflow...t 0x39c21b950>, context=<_contextvars...t 0x39b975b80>)(<WorkflowHand... 90 seconds')>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273>
Traceback (most recent call last):
  File "/Users/tomazbratanic/anaconda3/lib/python3.11/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/Users/tomazbratanic/anaconda3

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9762441454949505, 0.9981356648597979, 0.9603243367272034, 0.0, 0.940034176549107, 0.9350400094034544, 0.0, 0.0, 0.9727118637519528, 0.9823589929883504, 0.9811312962559807, 0.9815309684396892, 0.9999999999999997, 0.9822074311407417, 0.9919645578268432, 1.0000000000000007, 0.9868758664548074, 0.0, 0.0, 0.0, 0.9766275485615704, 0.0, 0.9999999999999997, 0.9843437232262001, 0.0, 0.9808597363335845, 0.9038845106394758, 0.0, 0.9755475209846317, 0.0, 0.0, 0.0, 0.9429483073310371, 0.0, 0.9318649638538083, 0.9460752349759095, 0.9873532137609486, 0.9603430703560699, 0.9938601358184028, 0.9486134678339414, 0.0, 0.9949473724034338, 0.0, 0.9493584178114727, 0.8677648246733828, 0.0, 0.9963897106865262, 0.0, 0.0, 0.0, 0.9765887083200195, 0.9480071491023597, 0.9724919907650187], 'avg_latency': 37.8336232158373, 'timeout/errors': 2}

Evaluating NaiveText2CypherRetryFlow with o1-mini


Evaluating NaiveText2CypherRetryFlow with o1-mini:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9782252247788868, 0.9999999999999997, 0.9636473187944211, 0.0, 0.9290843891476014, 0.9404996523894532, 0.8940678761774965, 0.929902462926219, 0.9754407463636939, 0.9917908327234836, 0.9777838577915207, 0.9833719501224033, 0.9999999999999997, 0.0, 0.9919645578268432, 1.0000000000000007, 0.9750384440467895, 0.9257801943757804, 0.0, 0.0, 0.9716567433158619, 0.9876378239773619, 0.9999999999999997, 0.9832346997476925, 0.0, 0.9382577484153852, 0.9284152038660985, 0.0, 0.9722238699306972, 0.9655592862884793, 0.936931128093033, 0.9823336724926705, 0.0, 0.0, 0.9695576502211706, 0.0, 0.98480305891685, 0.9747312358829058, 0.9627086599298204, 0.9479358557825054, 0.0, 0.0, 0.9692229584623321, 0.9543073168093846, 0.959215696753065, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9735505589310472, 0.9480071491023597, 0.9724919907650187], 'avg_latency': 18.089810812248373, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryFlow with sonnet 3.5


Evaluating NaiveText2CypherRetryFlow with sonnet 3.5:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.0, 0.0, 0.0, 0.96296727381604, 0.0, 0.9404996523894532, 0.0, 0.0, 0.9862670672545785, 0.940582326953558, 0.9811312962559807, 0.9815878612887233, 0.9516463075507904, 0.9684445906055764, 0.9513966422248687, 1.0000000000000007, 0.9752124430223609, 0.95138453213159, 0.9253460679813403, 0.9885640654071898, 0.9775740539213006, 0.0, 0.9984579713493207, 0.9835002538672065, 0.9816326165286043, 0.9808597363335845, 0.9065545735054968, 0.9339092423229959, 0.9705203979467253, 0.946692894568661, 0.0, 0.0, 0.9185168487562884, 0.9355072921648319, 0.0, 0.0, 0.0, 0.9462381550596642, 0.9590975223001369, 0.9314704004086329, 0.0, 0.9713772866934182, 0.0, 0.9562213244285905, 0.9223642376775355, 0.9931318962120503, 0.0, 0.0, 0.0, 0.9471219007175238, 0.9735505589310472, 0.0, 0.0], 'avg_latency': 10.008387799532908, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryFlow with haiku 3.5


Evaluating NaiveText2CypherRetryFlow with haiku 3.5:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9657327243945076, 0.9926635593850252, 0.9645604374427137, 0.9654929737467811, 0.9393931985162386, 0.9368236083612099, 0.9798847463863979, 0.0, 0.984380925241465, 0.9730446653480348, 0.9822890690442501, 0.9879687406986418, 0.0, 0.9607769066032951, 0.9727036297829864, 0.9999982041003173, 0.9869260662896142, 0.9717793449565969, 0.0, 0.0, 0.9716567433158619, 0.0, 0.9999999999999997, 0.9843437232262001, 0.0, 0.9808597363335845, 0.9325856535984051, 0.0, 0.9705203979467253, 0.9501855612498624, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9622030027962776, 0.949091812576909, 0.0, 0.9802635080018304, 0.9544806450369636, 0.9578448708925477, 0.9414014298164762, 0.9848857337747692, 0.9855649049707987, 0.0, 0.0, 0.0, 0.8692431162894424, 0.0, 0.0], 'avg_latency': 8.573774513208642, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryFlow with mistral medium


Evaluating NaiveText2CypherRetryFlow with mistral medium:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.0, 0.0, 0.9883353796243588, 0.0, 0.0, 0.0, 0.90942661979734, 0.9499302377728162, 0.9829364345949441, 0.9918993554553642, 0.9811312962559807, 0.0, 0.9714542516939324, 0.0, 0.0, 1.0000000000000007, 0.9868758664548074, 0.0, 0.0, 0.0, 0.9780473066011658, 0.9876378239773619, 0.9999999999999997, 0.9821611368876274, 0.0, 0.9808597363335845, 0.9075692161228366, 0.0, 0.9744656638150588, 0.9657977992603329, 0.0, 0.0, 0.0, 0.0, 0.9695576502211706, 0.0, 0.0, 0.0, 0.9887137434335269, 0.0, 0.0, 0.0, 0.9692229584623321, 0.948666193615724, 0.9577391872133875, 0.0, 0.996391734048732, 0.947888475522662, 0.0, 0.0, 0.8746332271113539, 0.9110526343452997, 0.0], 'avg_latency': 10.786852071870047, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryFlow with mistral large


Evaluating NaiveText2CypherRetryFlow with mistral large:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9698381379575419, 0.9926635593850252, 0.9633628930933863, 0.0, 0.9398810917210249, 0.9404996523894532, 0.0, 0.0, 0.9902133765867328, 0.9927128394114716, 0.9785227514694217, 0.9876541029476497, 0.9928929400705281, 0.0, 0.9861571370714334, 1.0000000000000007, 0.9868758664548074, 0.9475609202071656, 0.0, 0.9852442844726689, 0.9780473066011658, 0.9876378239773619, 0.9999999999999997, 0.9843437232262001, 0.0, 0.9808597363335845, 0.9325856535984051, 0.0, 0.9722238699306972, 0.9384249673343837, 0.0, 0.9823336724926705, 0.9681469337119668, 0.0, 0.9695576502211706, 0.0, 0.9873532137609486, 0.0, 0.9938601358184028, 0.9494735676892087, 0.0, 0.0, 0.9692229584623321, 0.9551478681000564, 0.9587235269065059, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9761055803559793, 0.9999999999999997, 0.0], 'avg_latency': 8.96845271452418, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryFlow with ministral 8b


Evaluating NaiveText2CypherRetryFlow with ministral 8b:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9432607444420148, 0.0, 0.0, 0.8332942357603933, 0.0, 0.9442222335234108, 0.8940678761774965, 0.0, 0.9902133765867328, 0.0, 0.9811312962559807, 0.9946073296702288, 0.9834085418855416, 0.0, 0.9861571370714334, 0.0, 0.9870264659592275, 0.9494563321980131, 0.0, 0.0, 0.9818315882582876, 0.9876378239773619, 0.0, 0.9843437232262001, 0.9999999999999996, 0.9808597363335845, 0.9212942731120256, 0.0, 0.9722238699306972, 0.0, 0.0, 0.9823336724926705, 0.0, 0.0, 0.9419343409998188, 0.0, 0.0, 0.0, 0.9938601358184028, 0.9493167108943178, 0.9136871810772552, 0.0, 0.9692229584623321, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9432001718644712, 0.6812820455708867, 0.0, 0.9999999999999997, 0.0], 'avg_latency': 6.328750223483679, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryFlow with codestral


Evaluating NaiveText2CypherRetryFlow with codestral:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.0, 0.9926635593850252, 0.906095589152942, 0.0, 0.9340629740074061, 0.9215786077211869, 0.0, 0.0, 0.9832641318884785, 0.9030582200687157, 0.9811312962559807, 0.9879687406986418, 0.0, 0.0, 0.9861637486159204, 1.0000000000000007, 0.9868758664548074, 0.0, 0.0, 0.0, 0.9780473066011658, 0.9876378239773619, 0.9999999999999997, 0.9836490395797503, 0.0, 0.9808597363335845, 0.9075692161228366, 0.0, 0.9722238699306972, 0.9755147293659722, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9938601358184028, 0.950826407231879, 0.0, 0.9833636841509931, 0.9692229584623321, 0.9552852577667915, 0.959215696753065, 0.0, 0.9963897106865262, 0.0, 0.0, 0.9669946921047551, 0.9827091336149906, 0.9999999999999997, 0.9257232421434884], 'avg_latency': 4.466209056242457, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryFlow with deepsek-v3


Evaluating NaiveText2CypherRetryFlow with deepsek-v3:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [1.0000000000000002, 0.9926675757107128, 0.9547759651044775, 0.96296727381604, 0.9446934006117536, 0.9404996523894532, 0.0, 0.929902462926219, 0.9902133765867328, 0.9945060315676182, 0.9811312962559807, 0.9879687406986418, 0.9999999999999997, 0.9822074311407417, 0.9831994358616831, 1.0000000000000007, 0.9752124430223609, 0.9477445230572829, 0.0, 0.0, 0.9780473066011658, 0.9876378239773619, 0.9999999999999997, 0.9836490395797503, 0.0, 0.9808597363335845, 0.9325856535984051, 0.0, 0.9722238699306972, 0.9497690333003902, 0.0, 0.9823336724926705, 0.0, 0.0, 0.9695576502211706, 0.0, 0.9855289591952735, 0.9603430703560699, 0.9938601358184028, 0.9491975236265486, 0.0, 0.9567789843559971, 0.9692229584623321, 0.9564322352561702, 0.9577391872133875, 0.9545236159444737, 0.0, 0.9579893993730488, 0.0, 0.0, 0.9761055803559793, 0.8806189101831011, 0.9827736536299247], 'avg_latency': 8.738093628073639, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryCheckFlow with 1.5pro


Evaluating NaiveText2CypherRetryCheckFlow with 1.5pro:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.0, 0.0, 0.9789754765041323, 0.0, 0.933145615788041, 0.0, 0.8932524174822011, 0.9392877335453655, 0.9499390148841105, 0.9936787229169831, 0.9812985975440629, 0.9815878612887233, 0.9788135788784246, 0.9822074311407417, 0.9338229960777937, 0.9918676554847439, 0.9712225737865573, 0.0, 0.0, 0.9769148727387837, 0.9760979675176578, 0.979386143163597, 0.9999999999999997, 0.9843437232262001, 0.9999999999999996, 0.9808622779543327, 0.9841965080362601, 0.9378404227690478, 0.9829147621654565, 0.9615149903069304, 0.8902267583906981, 0.0, 0.9429483073310371, 0.936625483686242, 0.9811048831220389, 0.0, 0.9876993566451443, 0.8708772594757432, 0.9665979505249748, 0.9417981625465718, 0.0, 0.9782739401827311, 0.9116252360963268, 0.9623303360895122, 0.9577391872133875, 0.8033302130551366, 0.0, 0.0, 0.0, 0.0, 0.9214679572332067, 0.9258073854794538, 0.0], 'avg_latency': 24.418314605389, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryCheckFlow with 1.5flash


Evaluating NaiveText2CypherRetryCheckFlow with 1.5flash:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9509333034032509, 0.0, 0.9496447589181729, 0.96296727381604, 0.0, 0.0, 0.8810710174850951, 0.929902462926219, 0.9869489819518035, 0.0, 0.9815172205187372, 0.9812908772984864, 0.8116629560266118, 0.0, 0.9572863817830366, 0.9852627441742084, 0.9770365279014507, 0.0, 0.0, 0.993715649075868, 0.9716567433158619, 0.0, 0.9999999999999997, 0.9835002538672065, 0.0, 0.9758244953320939, 0.9384354144825379, 0.0, 0.9589372584411966, 0.9772565261157965, 0.0, 0.0, 0.92900600180436, 0.9487322701580444, 0.9402273663651542, 0.0, 0.0, 0.0, 0.9819629724143226, 0.0, 0.0, 0.9834699543976256, 0.9544806450369636, 0.0, 0.9252710549570624, 0.9656561988484483, 0.9600255425886335, 0.0, 0.0, 0.0, 0.9077145697181291, 0.9428459750464738, 0.0], 'avg_latency': 9.48330335797004, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryCheckFlow with gpt-4o


Evaluating NaiveText2CypherRetryCheckFlow with gpt-4o:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9671011955821859, 0.9999999999999997, 0.8931246003902141, 0.9242550638916244, 0.9476105668059368, 0.9342669210209628, 0.8892010271322833, 0.0, 0.9934139521065269, 0.0, 0.9811312962559807, 0.9946073296702288, 0.0, 0.0, 0.9861571370714334, 1.0000000000000007, 0.9868758664548074, 0.9514959089009977, 0.9831047427692097, 0.993715649075868, 0.9766275485615704, 0.9876886614701613, 0.9999999999999997, 0.9843437232262001, 0.0, 0.9808597363335845, 0.9325856535984051, 0.9935214378119879, 0.9755475209846317, 0.965574508825064, 0.9628908982229029, 0.0, 0.9429483073310371, 0.9525475739979096, 0.9695576502211706, 0.9349720023024141, 0.9873532137609486, 0.9641928908477081, 0.9938601358184028, 0.950826407231879, 0.9526292927227692, 0.9891942344856485, 0.9692229584623321, 0.9532096039316021, 0.9587235269065059, 0.9947890884543034, 0.0, 0.9735682697407017, 0.0, 0.9689757319668647, 0.9131967771563381, 0.9397697598222309, 0.972089926021123], 'avg_latency': 19.832896776919096, 'timeo

Evaluating NaiveText2CypherRetryCheckFlow with o1:   0%|          | 0/53 [00:00<?, ?it/s]

Exception in callback Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-980765e38c7a', bound_args=<BoundArgumen...borations.'})>, instance=<app.workflow...t 0x38b13ed10>, context=<_contextvars...t 0x389864540>)(<WorkflowHand... 90 seconds')>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273
handle: <Handle Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-980765e38c7a', bound_args=<BoundArgumen...borations.'})>, instance=<app.workflow...t 0x38b13ed10>, context=<_contextvars...t 0x389864540>)(<WorkflowHand... 90 seconds')>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273>
Traceback (most recent call last):
  File "/Users/tomazbratanic/anaconda3/lib/python3.11/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/Users/tomazbratanic/anaconda3

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9520177476994767, 0.9944069945793942, 0.9595723320382299, 0.8829590632844657, 0.0, 0.9404996523894532, 0.9162630962431524, 0.951429942498501, 0.967922756851984, 0.9915098288465175, 0.9811312962559807, 0.9946073296702288, 0.0, 0.9822074311407417, 0.9919645578268432, 1.0000000000000007, 0.9868758664548074, 0.0, 0.9568375143271908, 0.9898662754420519, 0.9737869310776298, 0.9876378239773619, 0.9999999999999997, 0.9243550354028119, 0.0, 0.9316647613772527, 0.9325856535984051, 0.9935214378119879, 0.9755475209846317, 0.9213610113525955, 0.9628908982229029, 0.9823336724926705, 0.9429483073310371, 0.9444921364142574, 0.0, 0.9388999973158662, 0.9854165919630992, 0.0, 0.9887137434335269, 0.0, 0.9134368319900062, 0.9840300340601204, 0.0, 0.0, 0.9577391872133875, 0.9783749422489306, 0.0, 0.0, 0.0, 0.9660987405550338, 0.9765887083200195, 0.9480071491023597, 0.9668371873161713], 'avg_latency': 57.08801124230871, 'timeout/errors': 12}

Evaluating NaiveText2CypherRetryCheckFlow 

Evaluating NaiveText2CypherRetryCheckFlow with o1-mini:   0%|          | 0/53 [00:00<?, ?it/s]

Exception in callback Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-3d94de073c70', bound_args=<BoundArgumen...Db rating?'})>, instance=<app.workflow...t 0x38b4ff990>, context=<_contextvars...t 0x38bb36280>)(<WorkflowHand... 90 seconds')>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273
handle: <Handle Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-3d94de073c70', bound_args=<BoundArgumen...Db rating?'})>, instance=<app.workflow...t 0x38b4ff990>, context=<_contextvars...t 0x38bb36280>)(<WorkflowHand... 90 seconds')>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273>
Traceback (most recent call last):
  File "/Users/tomazbratanic/anaconda3/lib/python3.11/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/Users/tomazbratanic/anaconda3

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9698381379575419, 0.9999999999999997, 0.9380749430114615, 0.96296727381604, 0.0, 0.9404996523894532, 0.0, 0.9516470190389784, 0.9731439212941444, 0.9933106697977027, 0.977808976256882, 0.9879687406986418, 0.9999999999999997, 0.9822074311407417, 0.9919645578268432, 1.0000000000000007, 0.960481232553707, 0.9302439356386728, 0.9253460679813403, 0.9674315371811254, 0.9716567433158619, 0.9876378239773619, 0.9999999999999997, 0.9832346997476925, 0.8117670568414453, 0.9730746074302837, 0.9325856535984051, 0.9561660551923232, 0.970941451111163, 0.0, 0.0, 0.9823336724926705, 0.9429483073310371, 0.936625483686242, 0.0, 0.9349720023024141, 0.98480305891685, 0.9678152701756186, 0.963465177108462, 0.9479358557825054, 0.9258616689375421, 0.0, 0.9692229584623321, 0.9489968032087311, 0.959215696753065, 0.0, 0.0, 0.9720096832548482, 0.8741396770968072, 0.9622935416464163, 0.9827091336149906, 0.9999999999999997, 0.9614769351283551], 'avg_latency': 36.518278027480505, 'timeout/err

Evaluating NaiveText2CypherRetryCheckFlow with sonnet 3.5:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.0, 0.9655881142833124, 0.9541081270915077, 0.9443685475439971, 0.9375780299676801, 0.9379414579385582, 0.889506127964839, 0.9427715559301196, 0.9862670672545785, 0.9213952157616031, 0.9811312962559807, 0.9815878612887233, 0.9516463075507904, 0.9684445906055764, 0.9513966422248687, 1.0000000000000007, 0.9752124430223609, 0.9552068627268738, 0.9253460679813403, 0.9955726444655474, 0.9780473066011658, 0.9859132176101052, 0.965810328929252, 0.9835002538672065, 0.977688812620778, 0.9808597363335845, 0.9065545735054968, 0.9322713959520562, 0.9581198093452529, 0.9145248363192667, 0.9257768845967592, 0.0, 0.9185168487562884, 0.9355072921648319, 0.9572404222493636, 0.9091600545098816, 0.890414118306032, 0.9513928520800445, 0.9662405781160267, 0.9332089187787759, 0.9210439614645202, 0.9713772866934182, 0.0, 0.9533404814100103, 0.9223642376775355, 0.9931318962120503, 0.0, 0.0, 0.0, 0.9436346056458241, 0.9735505589310472, 0.9110526343452997, 0.9613562632819619], 'avg_latenc

Evaluating NaiveText2CypherRetryCheckFlow with haiku 3.5:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9541231771926014, 0.9887625023904828, 0.0, 0.9951972973941526, 0.9405020213400409, 0.0, 0.0, 0.0, 0.9815769906778226, 0.9584563094421202, 0.9811312962559807, 0.9879687406986418, 0.0, 0.0, 0.9861571370714334, 1.0000000000000007, 0.9858801064758925, 0.9072580739589305, 0.9253460679813403, 0.9716834748591658, 0.9716567433158619, 0.9876378239773619, 0.9999999999999997, 0.9252683848530624, 0.9999999999999996, 0.9808597363335845, 0.9021367098680466, 0.0, 0.0, 0.9680406742941555, 0.9404476852077694, 0.0, 0.9369321383178691, 0.9444921364142574, 0.0, 0.9349720023024141, 0.9753608566955568, 0.9490410055832256, 0.9622030027962776, 0.949091812576909, 0.9499830962527267, 0.9802635080018304, 0.9545079598454699, 0.0, 0.932494296348182, 0.0, 0.954225766717783, 0.0, 0.0, 0.0, 0.861739683515038, 0.911713173722888, 0.0], 'avg_latency': 28.195346337444377, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryCheckFlow with mistral medium


Evaluating NaiveText2CypherRetryCheckFlow with mistral medium:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.0, 0.0, 0.958683490121173, 0.9407890575368567, 0.0, 0.9377077359081946, 0.902056594099986, 0.9495358503172892, 0.0, 0.0, 0.0, 0.9815878612887233, 0.0, 0.9822074311407417, 0.0, 1.0000000000000007, 0.9868758664548074, 0.0, 0.9456173241163923, 0.0, 0.0, 0.9876378239773619, 0.9999999999999997, 0.9821611368876274, 0.0, 0.9808597363335845, 0.9125515656853621, 0.9935214378119879, 0.9455309627146599, 0.9778740174501396, 0.0, 0.0, 0.0, 0.0, 0.9695576502211706, 0.0, 0.0, 0.0, 0.9906964422588626, 0.0, 0.0, 0.0, 0.9692229584623321, 0.9563760112890249, 0.959215696753065, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8703501520177978, 0.9110526343452997, 0.0], 'avg_latency': 31.521116436652417, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryCheckFlow with mistral large


Evaluating NaiveText2CypherRetryCheckFlow with mistral large:   0%|          | 0/53 [00:00<?, ?it/s]

Exception in callback Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-e4ef0b8b767d', bound_args=<BoundArgumen...er movies?'})>, instance=<app.workflow...t 0x38e2e2a10>, context=<_contextvars...t 0x371bc92c0>)(<WorkflowHand... 90 seconds')>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273
handle: <Handle Dispatcher.span.<locals>.wrapper.<locals>.handle_future_result(span_id='Workflow.run...-e4ef0b8b767d', bound_args=<BoundArgumen...er movies?'})>, instance=<app.workflow...t 0x38e2e2a10>, context=<_contextvars...t 0x371bc92c0>)(<WorkflowHand... 90 seconds')>) at /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:273>
Traceback (most recent call last):
  File "/Users/tomazbratanic/anaconda3/lib/python3.11/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/Users/tomazbratanic/anaconda3

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9684696667698637, 0.9987301628761093, 0.0, 0.0, 0.9398810917210249, 0.0, 0.957219700810317, 0.0, 0.9902133765867328, 0.969142194318283, 0.9785227514694217, 0.9879687406986418, 0.9999999999999997, 0.9822074311407417, 0.9861571370714334, 1.0000000000000007, 0.9868758664548074, 0.9465278068898991, 0.9831047427692097, 0.993715649075868, 0.9780473066011658, 0.9876378239773619, 0.999999367853207, 0.9843437232262001, 0.9999999999999996, 0.9808597363335845, 0.9325856535984051, 0.9935214378119879, 0.973331753615342, 0.9668350557248203, 0.9270836636876161, 0.9823336724926705, 0.9420791683500069, 0.9421763402493224, 0.9695576502211706, 0.9333423881434034, 0.0, 0.0, 0.9938601358184028, 0.9498628900820877, 0.9526292927227692, 0.9891998196429118, 0.9692229584623321, 0.9560739875934825, 0.9582313570599467, 0.9668163257782861, 0.0, 0.0, 0.0, 0.9666649996681015, 0.9761055803559793, 0.9540298013668037, 0.0], 'avg_latency': 26.787904685398317, 'timeout/errors': 2}

Evaluating Naiv

Evaluating NaiveText2CypherRetryCheckFlow with ministral 8b:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9432607444420148, 0.0, 0.9334791061446959, 0.0, 0.0, 0.9999999999999996, 0.9572023881272562, 0.0, 0.0, 0.0, 0.9811312962559807, 0.9946073296702288, 0.9999999999999997, 0.0, 0.9861571370714334, 0.0, 0.0, 0.9523551094380557, 0.9831047427692097, 0.0, 0.9894001515725316, 0.9876378239773619, 0.0, 0.9843437232262001, 0.9999999999999996, 0.9808597363335845, 0.9212942731120256, 0.9935214378119879, 0.9722238699306972, 0.0, 0.0, 0.0, 0.970621195270962, 0.9525475739979096, 0.9519070995596022, 0.0, 0.0, 0.8903424251537171, 0.9938601358184028, 0.0, 0.0, 0.0, 0.9692229584623321, 0.0, 0.0, 0.0, 0.9249182012387213, 0.0, 0.0, 0.0, 0.9761055803559793, 0.9480071491023597, 0.0], 'avg_latency': 16.203270858188844, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryCheckFlow with codestral


Evaluating NaiveText2CypherRetryCheckFlow with codestral:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [1.0000000000000002, 0.9926635593850252, 0.927670070950112, 0.0, 0.9333663484368565, 0.0, 0.8991747184391953, 0.6960530974132588, 0.9832641318884785, 0.9823589929883504, 0.9811312962559807, 0.9923944666796998, 0.0, 0.0, 0.9861571370714334, 0.9999987724128648, 0.9870264659592275, 0.9804424007796445, 0.9253460679813403, 0.993715649075868, 0.9780473066011658, 0.9876378239773619, 0.9999999999999997, 0.9828886656671516, 0.0, 0.9808597363335845, 0.9075692161228366, 0.9935214378119879, 0.9722446069509113, 0.9680406742941555, 0.9628908982229029, 0.0, 0.0, 0.8608897276861768, 0.9695576502211706, 0.9349720023024141, 0.9878824008609187, 0.0, 0.9938601358184028, 0.9594386260371484, 0.9526292927227692, 0.9837753867407325, 0.9692229584623321, 0.953079784979778, 0.959215696753065, 0.0, 0.9963987265920059, 0.0, 0.0, 0.0, 0.0, 0.9480071491023597, 0.9804908088510119], 'avg_latency': 12.907477113435853, 'timeout/errors': 0}

Evaluating NaiveText2CypherRetryCheckFlow with deepsek-v3


Evaluating NaiveText2CypherRetryCheckFlow with deepsek-v3:   0%|          | 0/53 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/53 [00:00<?, ?it/s]

{'answer_relevancy': [0.9714692521745475, 0.0, 0.9593673492202157, 0.9454351770227799, 0.9420713214535477, 0.9404996523894532, 0.8940328086391339, 0.929902462926219, 0.0, 0.9926784086730578, 0.9811312962559807, 0.9901816036891709, 0.0, 0.9822074311407417, 0.9919645578268432, 1.0000000000000007, 0.9752124430223609, 0.9477445230572829, 0.9831047427692097, 0.9921124854281648, 0.9780473066011658, 0.9876378239773619, 0.9999999999999997, 0.9546426942445786, 0.9999999999999996, 0.9808622779543327, 0.9325856535984051, 0.9935221191711868, 0.9722238699306972, 0.9788472754788455, 0.93437985480061, 0.9823336724926705, 0.9412079000574947, 0.9476003707601599, 0.9695576502211706, 0.9349720023024141, 0.0, 0.9485932479116707, 0.9938601358184028, 0.9517208593146352, 0.9355287745932864, 0.9949473724034338, 0.9692229584623321, 0.9567678026789093, 0.9577391872133875, 0.9840112870295821, 0.0, 0.9635655593384452, 0.0, 0.9625393303337467, 0.9735505589310472, 0.8806189101831011, 0.0], 'avg_latency': 29.1097523

In [8]:
def print_results(results: List[Dict]) -> None:
    """Print all benchmark runs as one table, best mean answer relevancy first.

    Args:
        results: One dict per benchmark run. The keys read here are
            ``'flow'``, ``'llm'``, ``'answer_relevancy'`` (a sequence of
            per-question scores), ``'timeout/errors'`` and ``'avg_latency'``.

    Returns:
        None. The table is written to stdout.

    A run whose ``'answer_relevancy'`` sequence is empty is displayed as
    ``N/A`` and ranked last instead of raising ``ZeroDivisionError``.
    """
    def mean_relevancy(result):
        # None marks "no scores recorded"; averaging an empty sequence
        # would otherwise divide by zero.
        scores = result['answer_relevancy']
        return sum(scores) / len(scores) if len(scores) else None

    # Compute each mean once and reuse it for both ranking and display.
    scored = [(mean_relevancy(result), result) for result in results]

    # Highest mean first; runs without scores sort below every scored run.
    scored.sort(
        key=lambda pair: float('-inf') if pair[0] is None else pair[0],
        reverse=True,
    )

    # Create table
    table = PrettyTable()
    table.field_names = ["Flow", "LLM", "Answer Relevancy", "Timeouts/Errors", "Avg Latency (s)"]

    # Add rows
    for answer_relevancy, result in scored:
        table.add_row([
            result['flow'],
            result['llm'],
            "N/A" if answer_relevancy is None else f"{answer_relevancy:.3f}",
            f"{result['timeout/errors']}",
            f"{result['avg_latency']:.2f}"
        ])

    print("\nGrid Search Results:")
    print(table)

# Show every run in a single table, ranked by mean answer relevancy.
print_results(results)


Grid Search Results:
+--------------------------------+----------------+------------------+-----------------+-----------------+
|              Flow              |      LLM       | Answer Relevancy | Timeouts/Errors | Avg Latency (s) |
+--------------------------------+----------------+------------------+-----------------+-----------------+
| NaiveText2CypherRetryCheckFlow |   sonnet 3.5   |      0.843       |        0        |      25.45      |
| NaiveText2CypherRetryCheckFlow |   deepsek-v3   |      0.837       |        0        |      29.11      |
| NaiveText2CypherRetryCheckFlow |     gpt-4o     |      0.820       |        0        |      19.83      |
| NaiveText2CypherRetryCheckFlow |    o1-mini     |      0.816       |        2        |      36.52      |
| NaiveText2CypherRetryCheckFlow | mistral large  |      0.789       |        2        |      26.79      |
| NaiveText2CypherRetryCheckFlow |       o1       |      0.746       |        12       |      57.09      |
|   NaiveText2C

In [9]:
def print_by_flow_results(results: List[Dict]) -> None:
    """Print all benchmark runs grouped by flow, best mean relevancy first within each flow.

    Rows are ordered by the pair (flow name, mean answer relevancy) in
    descending order, so flows appear in reverse alphabetical order and, inside
    one flow, the highest-scoring LLM comes first.

    Args:
        results: One dict per benchmark run. The keys read here are
            ``'flow'``, ``'llm'``, ``'answer_relevancy'`` (a sequence of
            per-question scores), ``'timeout/errors'`` and ``'avg_latency'``.

    Returns:
        None. The table is written to stdout.

    A run whose ``'answer_relevancy'`` sequence is empty is displayed as
    ``N/A`` and placed last within its flow instead of raising
    ``ZeroDivisionError``.
    """
    def mean_relevancy(result):
        # None marks "no scores recorded"; averaging an empty sequence
        # would otherwise divide by zero.
        scores = result['answer_relevancy']
        return sum(scores) / len(scores) if len(scores) else None

    # Compute each mean once and reuse it for both ranking and display.
    scored = [(mean_relevancy(result), result) for result in results]

    # Sort first by flow, then by answer relevancy (both descending);
    # -inf keeps score-less runs at the bottom of their flow group.
    scored.sort(
        key=lambda pair: (
            pair[1]['flow'],
            float('-inf') if pair[0] is None else pair[0],
        ),
        reverse=True,
    )

    # Create table
    table = PrettyTable()
    table.field_names = ["Flow", "LLM", "Answer Relevancy", "Timeouts/Errors", "Avg Latency (s)"]

    # Add rows
    for answer_relevancy, result in scored:
        table.add_row([
            result['flow'],
            result['llm'],
            "N/A" if answer_relevancy is None else f"{answer_relevancy:.3f}",
            f"{result['timeout/errors']}",
            f"{result['avg_latency']:.2f}"
        ])

    print("\nGrid Search Results:")
    print(table)

# Show the same runs grouped by flow, best mean answer relevancy first within each flow.
print_by_flow_results(results)


Grid Search Results:
+--------------------------------+----------------+------------------+-----------------+-----------------+
|              Flow              |      LLM       | Answer Relevancy | Timeouts/Errors | Avg Latency (s) |
+--------------------------------+----------------+------------------+-----------------+-----------------+
|   NaiveText2CypherRetryFlow    |   deepsek-v3   |      0.732       |        0        |       8.74      |
|   NaiveText2CypherRetryFlow    |    o1-mini     |      0.674       |        0        |      18.09      |
|   NaiveText2CypherRetryFlow    |     1.5pro     |      0.669       |        0        |       5.85      |
|   NaiveText2CypherRetryFlow    | mistral large  |      0.625       |        0        |       8.97      |
|   NaiveText2CypherRetryFlow    |       o1       |      0.621       |        2        |      37.83      |
|   NaiveText2CypherRetryFlow    |   sonnet 3.5   |      0.616       |        0        |      10.01      |
|   NaiveText2C

In [10]:
def print_pivot_results(results: List[Dict]) -> None:
    """Print a pivot table of mean answer relevancy: one row per LLM, one column per flow.

    Args:
        results: One dict per benchmark run. The keys read here are
            ``'flow'``, ``'llm'`` and ``'answer_relevancy'`` (a sequence of
            per-question scores).

    Returns:
        None. The table is written to stdout.

    Cells show ``N/A`` when an (LLM, flow) combination has no run, or when the
    run's ``'answer_relevancy'`` sequence is empty (previously this raised
    ``ZeroDivisionError``). If several runs share the same (LLM, flow) pair,
    the last one in ``results`` wins.
    """
    # Get unique flows and LLMs, alphabetically ordered for a stable layout
    flows = sorted({r['flow'] for r in results})
    llms = sorted({r['llm'] for r in results})

    # Create table
    table = PrettyTable()

    # Set field names with flows as columns
    table.field_names = ["LLM"] + flows

    # Map llm -> flow -> formatted mean relevancy
    relevancy_dict = {}
    for result in results:
        scores = result['answer_relevancy']
        # Guard the average: an empty score list has no mean.
        if len(scores):
            cell = f"{sum(scores) / len(scores):.3f}"
        else:
            cell = "N/A"
        relevancy_dict.setdefault(result['llm'], {})[result['flow']] = cell

    # Add rows for each LLM; combinations that were never run fall back to "N/A"
    for llm in llms:
        row = [llm]
        for flow in flows:
            row.append(relevancy_dict[llm].get(flow, "N/A"))
        table.add_row(row)

    print("\nGrid Search Results (Answer Relevancy):")
    print(table)