In [1]:
import sys
import os

# Make the parent of the notebook directory importable so that Python
# recognizes "app" as an importable package.
# A notebook kernel defines no __file__, so the current working directory
# stands in for the notebook's location.
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, ".."))
# Guard the append so re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; the recorded
# cell output shows the call returned True.
# NOTE(review): python-dotenv's default lookup may also search parent
# directories, not only the current working directory -- confirm.
load_dotenv()

True

In [3]:
import time
from typing import Dict, List

import pandas as pd
from datasets import Dataset
from llama_index.llms.gemini import Gemini
from llama_index.llms.openai import OpenAI
from prettytable import PrettyTable
from ragas import evaluate
from ragas.llms import LlamaIndexLLMWrapper
from ragas.metrics import (
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)
from google.generativeai.types import RequestOptions
from google.api_core import retry

from app.workflows.shared import graph_store

  from pandas.core import (




In [4]:
# Import flows
from app.workflows.naive_text2cypher import NaiveText2CypherFlow
from app.workflows.naive_text2cypher_retry import NaiveText2CypherRetryFlow
from app.workflows.iterative_planner import IterativePlanningFlow




In [5]:
# Benchmark data: a semicolon-separated CSV whose rows carry a natural-language
# question ("Question") and its reference query ("Cypher").
test_df = pd.read_csv("test_data.csv", sep=";")
test_df.head(5)

Unnamed: 0,Question,Cypher
0,Who acted in Tom Hanks’s highest-rated movie?,MATCH (p:Person {name: 'Tom Hanks'})-[:ACTED_I...
1,Which movie starring Keanu Reeves has the most...,"MATCH (meg:Actor {name: ""Keanu Reeves""})-[:ACT..."
2,Who directed the most recent movie starring Ha...,"MATCH (p:Person {name: ""Halle Berry""})-[:ACTED..."
3,What is the highest-rated movie from the 1990s...,MATCH (m:Movie)-[:DIRECTED]-(d:Person) WHERE m...
4,"For all movies starring Keanu Reeves, find the...","MATCH (keanu:Person {name: ""Keanu Reeves""})-[:..."


In [6]:
async def evaluate_flow_llm_combination(flow_name, flow, llm_name, llm, test_df, graph_store):
    """Benchmark one workflow/LLM pairing against the test questions.

    Each question in ``test_df`` is run through a single ``flow`` instance
    (created with a 90 second timeout) while its latency is measured. The
    reference answer for every row is produced by executing the row's
    ``Cypher`` query against ``graph_store``. The collected answers are then
    scored with the ragas ``answer_relevancy`` metric, judged by
    ``gpt-4o-2024-11-20`` at temperature 0.

    Args:
        flow_name: Display name of the workflow. Accepted for the caller's
            convenience; not used in the computation.
        flow: Callable invoked as ``flow(llm=..., timeout=...)``; the object
            it returns must expose an awaitable ``run(input=...)``.
        llm_name: Display name of the LLM; not used in the computation.
        llm: LLM object handed to the workflow.
        test_df: DataFrame with ``Question`` and ``Cypher`` columns.
        graph_store: Object exposing ``structured_query(cypher)``.

    Returns:
        dict with three keys:
            ``answer_relevancy``: whatever ``result['answer_relevancy']``
                yields for the ragas evaluation result.
            ``avg_latency``: mean seconds per question (NaN when ``test_df``
                has no rows).
            ``timeout/errors``: number of questions whose flow run raised.
    """
    flow_instance = flow(llm=llm, timeout=90)
    results = []
    latencies = []
    ground_truth = []
    timeouts = 0

    for _, row in test_df.iterrows():
        question = row['Question']

        # perf_counter is monotonic, so a system clock adjustment cannot
        # distort (or make negative) the measured latency.
        start = time.perf_counter()
        try:
            data = await flow_instance.run(input=question)
        except Exception:
            # Deliberate best-effort: a failed or timed-out question is
            # recorded and the benchmark moves on. Only Exception is caught so
            # that kernel interrupts and task cancellation still propagate.
            data = {"answer": "timeout/error", "question": question}
            timeouts += 1
        latencies.append(time.perf_counter() - start)
        results.append(data)

        try:
            ground_truth.append(str(graph_store.structured_query(row['Cypher'])))
        except Exception:
            # The reference query itself could not be executed.
            ground_truth.append("missing")

    # Create evaluation dataset
    df = pd.DataFrame(results)
    df['ground_truth'] = ground_truth
    df['latencies'] = latencies
    dataset = Dataset.from_pandas(df)

    # Run evaluation
    result = evaluate(
        dataset,
        metrics=[answer_relevancy],
        llm=LlamaIndexLLMWrapper(OpenAI(model="gpt-4o-2024-11-20", temperature=0))
    )

    # An empty benchmark has no meaningful average; report NaN instead of
    # failing with ZeroDivisionError.
    avg_latency = sum(latencies) / len(latencies) if latencies else float("nan")

    return {
        'answer_relevancy': result['answer_relevancy'],
        'avg_latency': avg_latency,
        'timeout/errors': timeouts
    }

async def run_grid_search(
    flows: List[type],
    llms: List[tuple],
    test_df: pd.DataFrame,
    graph_store: object
):
    """Evaluate every workflow against every LLM and collect the outcomes.

    Args:
        flows: Workflow classes; each must have a ``__name__`` and is passed
            on to ``evaluate_flow_llm_combination`` as ``flow``.
        llms: ``(display_name, llm)`` pairs.
        test_df: Benchmark frame forwarded unchanged to the evaluator.
        graph_store: Graph store forwarded unchanged to the evaluator.

    Returns:
        A list with one dict per combination that evaluated successfully:
        the keys ``flow`` and ``llm`` plus everything returned by
        ``evaluate_flow_llm_combination``. Combinations whose evaluation
        raised are reported on stdout and left out of the list.
    """
    results = []

    for flow in flows:
        for llm_name, llm in llms:
            try:
                print(f"\nEvaluating {flow.__name__} with {llm_name}")

                result = await evaluate_flow_llm_combination(
                    flow_name=flow.__name__,
                    flow=flow,
                    llm_name=llm_name,
                    llm=llm,
                    test_df=test_df,
                    graph_store=graph_store
                )

                results.append({
                    'flow': flow.__name__,
                    'llm': llm_name,
                    **result
                })
            except Exception as exc:
                # Best-effort grid: one failing combination must not abort the
                # rest, but it should not vanish silently either. Catching
                # Exception (not BaseException) keeps kernel interrupts and
                # task cancellation working.
                print(f"Skipping {flow.__name__} with {llm_name}: {exc!r}")
                continue

    return results

In [None]:
flows = [
    IterativePlanningFlow,
    NaiveText2CypherFlow,
    NaiveText2CypherRetryFlow,
]  # Add your flows

google_retry = dict(retry=retry.Retry(initial=0.1, multiplier=2, timeout=61))
llms = [
    ("1.5pro", Gemini(model="models/gemini-1.5-pro", temperature=0, request_options=google_retry)),
    ("1.5flash", Gemini(model="models/gemini-1.5-flash", temperature=0, request_options=google_retry)),
    #("2.0flash", Gemini(model="models/gemini-2.0-flash-exp", temperature=0)), # rate limits
    ("gpt-4o", OpenAI(model="gpt-4o", temperature=0)),
    #("gpt-4o-mini", OpenAI(model="gpt-4o-mini", temperature=0)),
    #("o1", OpenAI(model="o1-preview", temperature=0)), no tools
    #("o1-mini", OpenAI(model="o1-mini", temperature=0)), no tools
]  # Add your LLMs

results = await run_grid_search(
    flows=flows, llms=llms, test_df=test_df, graph_store=graph_store
)


In [8]:
def _mean_relevancy(scores) -> float:
    """Collapse an answer-relevancy result into one float (NaN if nothing was scored)."""
    # NOTE(review): some ragas versions appear to return one aggregate number
    # rather than per-row scores -- accept both shapes.
    if isinstance(scores, (int, float)):
        return float(scores)
    values = list(scores)
    if not values:
        return float("nan")
    return sum(values) / len(values)


def print_results(results: List[Dict]):
    """Print the grid-search outcomes as a table, best answer relevancy first.

    Args:
        results: One dict per flow/LLM combination with the keys ``flow``,
            ``llm``, ``answer_relevancy`` (iterable of scores or a single
            number), ``timeout/errors`` and ``avg_latency``.

    Rows are ordered by mean answer relevancy, highest first. Combinations
    whose mean is NaN (no scores, or a NaN score) are listed last. Rows with
    equal means keep their input order.
    """
    # Create table
    table = PrettyTable()
    table.field_names = ["Flow", "LLM", "Answer Relevancy", "Timeouts/Errors", "Avg Latency (s)"]

    # Compute each mean once and keep it next to its result.
    scored = [(_mean_relevancy(result['answer_relevancy']), result) for result in results]

    def _rank(pair):
        mean = pair[0]
        # NaN is the only float that differs from itself. It compares false
        # against everything, so it is ranked explicitly to keep the sort
        # order well defined.
        is_nan = mean != mean
        return (is_nan, 0.0 if is_nan else -mean)

    # Add rows
    for mean, result in sorted(scored, key=_rank):
        timeout_errors = result['timeout/errors']

        table.add_row([
            result['flow'],
            result['llm'],
            f"{mean:.3f}",
            f"{timeout_errors}",
            f"{result['avg_latency']:.2f}"
        ])

    print("\nGrid Search Results:")
    print(table)

# Render the comparison table for the grid-search results collected above.
print_results(results)


Grid Search Results:
+---------------------------+----------+------------------+-----------------+-----------------+
|            Flow           |   LLM    | Answer Relevancy | Timeouts/Errors | Avg Latency (s) |
+---------------------------+----------+------------------+-----------------+-----------------+
| NaiveText2CypherRetryFlow |  1.5pro  |      0.686       |        0        |       6.32      |
| NaiveText2CypherRetryFlow |  gpt-4o  |      0.639       |        0        |       9.32      |
|    NaiveText2CypherFlow   |  1.5pro  |      0.574       |        0        |       5.84      |
|    NaiveText2CypherFlow   |  gpt-4o  |      0.505       |        0        |       9.17      |
| NaiveText2CypherRetryFlow | 1.5flash |      0.402       |        0        |       3.24      |
|    NaiveText2CypherFlow   | 1.5flash |      0.398       |        0        |       2.91      |
|   IterativePlanningFlow   |  1.5pro  |      0.132       |        21       |      24.82      |
|   IterativePlann