In [1]:
#%pip install --quiet llama-index-llms-mistralai llama-index-llms-groq llama-index-llms-anthropic llama-index-llms-gemini llama-index-llms-openai llama-index-llms-openai-like

In [1]:
import sys
import os

# Put the parent of the notebook's working directory on sys.path so that
# project packages located there can be imported (later cells import from
# ``workflows`` -- presumably it lives there; confirm).
#
# A notebook kernel does not define ``__file__``. The previous code passed the
# *string* "__file__" to abspath(), which only resolved to the working
# directory by accident; use the working directory explicitly instead.
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, ".."))
# Guard the append so that re-running this cell does not add duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
from dotenv import load_dotenv
# Load environment variables from a .env file; later cells read API keys from
# the environment (e.g. DEEPSEEK_API_KEY).
# NOTE(review): called without arguments, python-dotenv locates the file itself
# (searching from the working directory upward in interactive sessions) --
# confirm it picks up the intended .env.
load_dotenv()

True

In [3]:
import time
from typing import Dict, List
from tqdm.auto import tqdm

import pandas as pd
from datasets import Dataset
from llama_index.llms.gemini import Gemini
from llama_index.llms.openai import OpenAI
from llama_index.llms.anthropic import Anthropic
from llama_index.llms.mistralai import MistralAI
from llama_index.llms.openai_like import OpenAILike
from llama_index.llms.groq import Groq
from prettytable import PrettyTable
from ragas import evaluate
from ragas.llms import LlamaIndexLLMWrapper
from ragas.metrics import (
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)
from google.generativeai.types import RequestOptions
from google.api_core import retry

from workflows.shared import graph_store

  from pandas.core import (




In [4]:
# Import flows
from workflows.naive_text2cypher import NaiveText2CypherFlow
from workflows.naive_text2cypher_retry import NaiveText2CypherRetryFlow
from workflows.text2cypher_retry_check import NaiveText2CypherRetryCheckFlow
from workflows.iterative_planner import IterativePlanningFlow

In [5]:
# Load the semicolon-separated benchmark set. The evaluation loop reads its
# "Question" and "Cypher" columns.
test_df = pd.read_csv("test_data.csv", sep=";")
test_df.head()

Unnamed: 0,Question,Cypher
0,Who acted in Tom Hanks’s highest-rated movie?,MATCH (p:Person {name: 'Tom Hanks'})-[:ACTED_I...
1,Which movie starring Keanu Reeves has the most...,"MATCH (meg:Actor {name: ""Keanu Reeves""})-[:ACT..."
2,Who directed the most recent movie starring Ha...,"MATCH (p:Person {name: ""Halle Berry""})-[:ACTED..."
3,What is the highest-rated movie from the 1990s...,MATCH (m:Movie)-[:DIRECTED]-(d:Person) WHERE m...
4,"For all movies starring Keanu Reeves, find the...","MATCH (keanu:Person {name: ""Keanu Reeves""})-[:..."


In [6]:
async def evaluate_flow_llm_combination(
    flow_name, flow, llm_name, llm, test_df, graph_store, timeout=90
):
    """Benchmark one workflow class with one LLM over every row of ``test_df``.

    For each row the workflow is run on the ``Question`` text while its
    latency is measured, and the row's ``Cypher`` query is executed against
    ``graph_store`` to obtain a ground-truth string. The collected answers are
    then scored with the ragas ``answer_relevancy`` metric, judged by
    ``gpt-4o-2024-11-20`` at temperature 0.

    Args:
        flow_name: Display name of the workflow (used in the progress bar).
        flow: Workflow class, instantiated as ``flow(llm=llm, timeout=timeout)``.
        llm_name: Display name of the LLM (used in the progress bar).
        llm: LLM instance handed to the workflow.
        test_df: DataFrame with ``Question`` and ``Cypher`` columns.
        graph_store: Object exposing ``structured_query(cypher)``.
        timeout: Value forwarded as the workflow's ``timeout`` (default 90;
            presumably seconds -- confirm against the workflow base class).

    Returns:
        dict with ``answer_relevancy`` (whatever ``evaluate`` exposes under
        that key), ``avg_latency`` (mean elapsed time per question) and
        ``timeout/errors`` (number of workflow runs that raised).
    """
    results = []
    latencies = []
    ground_truth = []
    timeouts = 0
    flow_instance = flow(llm=llm, timeout=timeout)

    for _, row in tqdm(test_df.iterrows(), total=len(test_df),
                       desc=f"Evaluating {flow_name} with {llm_name}"):
        question = row['Question']

        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by system clock adjustments.
        start = time.perf_counter()
        try:
            data = await flow_instance.run(input=question)
        except Exception:
            # Best effort: a failed or timed-out run is recorded as a
            # placeholder answer and counted. Only ``Exception`` is caught so
            # that KeyboardInterrupt / task cancellation still stop the run.
            data = {"answer": "timeout/error", "question": question}
            timeouts += 1
        latencies.append(time.perf_counter() - start)
        results.append(data)

        try:
            ground_truth.append(str(graph_store.structured_query(row['Cypher'])))
        except Exception:
            # The reference query itself may fail; mark the row instead of aborting.
            ground_truth.append("missing")

    # Create evaluation dataset
    df = pd.DataFrame(results)
    df['ground_truth'] = ground_truth
    df['latencies'] = latencies
    dataset = Dataset.from_pandas(df)

    # Run evaluation
    result = evaluate(
        dataset,
        metrics=[answer_relevancy],
        llm=LlamaIndexLLMWrapper(OpenAI(model="gpt-4o-2024-11-20", temperature=0))
    )

    return {
        'answer_relevancy': result['answer_relevancy'],
        # Guard against an empty benchmark set rather than dividing by zero.
        'avg_latency': sum(latencies) / len(latencies) if latencies else 0.0,
        'timeout/errors': timeouts
    }

async def run_grid_search(
    flows: List[type],
    llms: List[tuple],
    test_df: pd.DataFrame,
    graph_store: object
):
    """Evaluate every (workflow, LLM) combination and collect the scores.

    Args:
        flows: Workflow classes to benchmark; ``__name__`` is used as the label.
        llms: ``(label, llm_instance)`` pairs.
        test_df: Benchmark DataFrame forwarded to the evaluator.
        graph_store: Graph store forwarded to the evaluator.

    Returns:
        A list with one dict per successfully evaluated combination: the keys
        ``flow`` and ``llm`` plus everything returned by
        ``evaluate_flow_llm_combination``. Combinations that are skipped or
        that raise are reported on stdout and left out of the list.
    """
    results = []

    for flow in flows:
        for llm_name, llm in llms:
            # IterativePlanningFlow is only benchmarked with these two LLMs;
            # every other LLM is skipped for that flow. Checking before the
            # "Evaluating" message keeps the log truthful.
            if flow.__name__ == "IterativePlanningFlow" and llm_name not in ("gpt-4o", "1.5pro"):
                print(f"\nSkipping {flow.__name__} with {llm_name}")
                continue

            print(f"\nEvaluating {flow.__name__} with {llm_name}")
            try:
                result = await evaluate_flow_llm_combination(
                    flow_name=flow.__name__,
                    flow=flow,
                    llm_name=llm_name,
                    llm=llm,
                    test_df=test_df,
                    graph_store=graph_store
                )
            except Exception as exc:
                # Keep the grid search going, but say what failed instead of
                # dropping the combination silently. KeyboardInterrupt and
                # task cancellation are deliberately not caught.
                print(f"Failed {flow.__name__} with {llm_name}: {exc!r}")
                continue

            print(result)
            results.append({
                'flow': flow.__name__,
                'llm': llm_name,
                **result
            })

    return results

In [None]:
flows = [
    IterativePlanningFlow,
    NaiveText2CypherFlow,
    NaiveText2CypherRetryFlow,
    NaiveText2CypherRetryCheckFlow
]  # Add your flows

google_retry = dict(retry=retry.Retry(initial=0.1, multiplier=2, timeout=61))
llms = [
    ("1.5pro", Gemini(model="models/gemini-1.5-pro", temperature=0, request_options=google_retry)),
    ("1.5flash", Gemini(model="models/gemini-1.5-flash", temperature=0, request_options=google_retry)),
    #("2.0flash", Gemini(model="models/gemini-2.0-flash-exp", temperature=0)), # rate limits
    ("gpt-4o", OpenAI(model="gpt-4o", temperature=0)),
    #("gpt-4o-mini", OpenAI(model="gpt-4o-mini", temperature=0)),
    ("o1", OpenAI(model="o1-preview", temperature=0)), #no tools
    ("o1-mini", OpenAI(model="o1-mini", temperature=0)), #no tools
    ("sonnet 3.5", Anthropic(model="claude-3-5-sonnet-latest", max_tokens=8076)),
    ("haiku 3.5", Anthropic(model="claude-3-5-haiku-latest", max_tokens=8076)),
    ("mistral medium", MistralAI(model="mistral-medium")),
    ("mistral large", MistralAI(model="mistral-large-latest")),
    ("ministral 8b", MistralAI(model="ministral-8b-latest")),
    ("codestral", MistralAI(model="codestral-latest")),
    ("deepsek-v3", OpenAILike(
                            model="deepseek-chat",
                            api_base="https://api.deepseek.com/beta",
                            api_key=os.getenv("DEEPSEEK_API_KEY"),
                        )
                    ),
    #("groq llama3 70b", Groq(model="llama3-70b-8192")) # Rate limits



]  # Add your LLMs

results = await run_grid_search(
    flows=flows, llms=llms, test_df=test_df, graph_store=graph_store
)


In [8]:
def print_results(results: List[Dict]):
    """Print a leaderboard of all flow/LLM runs, best answer relevancy first.

    Args:
        results: One dict per run with the keys ``flow``, ``llm``,
            ``answer_relevancy`` (a sequence of per-question scores, or an
            already aggregated number), ``timeout/errors`` and ``avg_latency``.
    """
    import math

    def mean_score(scores):
        # Accept a pre-aggregated number as well as a list of per-sample scores.
        if isinstance(scores, (int, float)):
            return float(scores)
        scores = list(scores)
        # An empty score list has no mean; report NaN instead of dividing by zero.
        return sum(scores) / len(scores) if scores else float("nan")

    # Compute each mean once and reuse it for both sorting and display.
    scored = [(mean_score(result['answer_relevancy']), result) for result in results]

    # NaN compares false against everything, which would make the sort order
    # unpredictable; rank NaN rows explicitly below every real score.
    scored.sort(
        key=lambda pair: (not math.isnan(pair[0]), 0.0 if math.isnan(pair[0]) else pair[0]),
        reverse=True,
    )

    # Create table
    table = PrettyTable()
    table.field_names = ["Flow", "LLM", "Answer Relevancy", "Timeouts/Errors", "Avg Latency (s)"]

    # Add rows
    for relevancy, result in scored:
        table.add_row([
            result['flow'],
            result['llm'],
            f"{relevancy:.3f}",
            f"{result['timeout/errors']}",
            f"{result['avg_latency']:.2f}"
        ])

    print("\nGrid Search Results:")
    print(table)

# Overall leaderboard across every flow/LLM pair in `results`.
print_results(results)


Grid Search Results:
+--------------------------------+----------------+------------------+-----------------+-----------------+
|              Flow              |      LLM       | Answer Relevancy | Timeouts/Errors | Avg Latency (s) |
+--------------------------------+----------------+------------------+-----------------+-----------------+
| NaiveText2CypherRetryCheckFlow |   sonnet 3.5   |      0.843       |        0        |      25.45      |
| NaiveText2CypherRetryCheckFlow |   deepsek-v3   |      0.837       |        0        |      29.11      |
| NaiveText2CypherRetryCheckFlow |     gpt-4o     |      0.820       |        0        |      19.83      |
| NaiveText2CypherRetryCheckFlow |    o1-mini     |      0.816       |        2        |      36.52      |
| NaiveText2CypherRetryCheckFlow | mistral large  |      0.789       |        2        |      26.79      |
| NaiveText2CypherRetryCheckFlow |       o1       |      0.746       |        12       |      57.09      |
|   NaiveText2C

In [9]:
def print_by_flow_results(results: List[Dict]):
    """Print all runs grouped by flow, best answer relevancy first within a flow.

    Flows appear in descending name order (the whole sort is reversed).

    Args:
        results: One dict per run with the keys ``flow``, ``llm``,
            ``answer_relevancy`` (a sequence of per-question scores, or an
            already aggregated number), ``timeout/errors`` and ``avg_latency``.
    """
    import math

    def mean_score(scores):
        # Accept a pre-aggregated number as well as a list of per-sample scores.
        if isinstance(scores, (int, float)):
            return float(scores)
        scores = list(scores)
        # An empty score list has no mean; report NaN instead of dividing by zero.
        return sum(scores) / len(scores) if scores else float("nan")

    # Compute each mean once and reuse it for both sorting and display.
    scored = [(mean_score(result['answer_relevancy']), result) for result in results]

    # Sort by flow, then by relevancy. NaN is ranked explicitly below every
    # real score because comparing against NaN would make the order unpredictable.
    scored.sort(
        key=lambda pair: (
            pair[1]['flow'],
            not math.isnan(pair[0]),
            0.0 if math.isnan(pair[0]) else pair[0],
        ),
        reverse=True,
    )

    # Create table
    table = PrettyTable()
    table.field_names = ["Flow", "LLM", "Answer Relevancy", "Timeouts/Errors", "Avg Latency (s)"]

    # Add rows
    for relevancy, result in scored:
        table.add_row([
            result['flow'],
            result['llm'],
            f"{relevancy:.3f}",
            f"{result['timeout/errors']}",
            f"{result['avg_latency']:.2f}"
        ])

    print("\nGrid Search Results:")
    print(table)

# Same `results`, grouped by flow so that LLMs can be compared within a flow.
print_by_flow_results(results)


Grid Search Results:
+--------------------------------+----------------+------------------+-----------------+-----------------+
|              Flow              |      LLM       | Answer Relevancy | Timeouts/Errors | Avg Latency (s) |
+--------------------------------+----------------+------------------+-----------------+-----------------+
|   NaiveText2CypherRetryFlow    |   deepsek-v3   |      0.732       |        0        |       8.74      |
|   NaiveText2CypherRetryFlow    |    o1-mini     |      0.674       |        0        |      18.09      |
|   NaiveText2CypherRetryFlow    |     1.5pro     |      0.669       |        0        |       5.85      |
|   NaiveText2CypherRetryFlow    | mistral large  |      0.625       |        0        |       8.97      |
|   NaiveText2CypherRetryFlow    |       o1       |      0.621       |        2        |      37.83      |
|   NaiveText2CypherRetryFlow    |   sonnet 3.5   |      0.616       |        0        |      10.01      |
|   NaiveText2C

In [16]:
def print_pivot_results(results: List[Dict]):
    """Print an LLM x flow matrix of mean answer-relevancy scores.

    Rows are LLM labels in sorted order; columns are flows in the order they
    first appear in ``results``. Combinations without a result show "N/A".

    Args:
        results: One dict per run with the keys ``flow``, ``llm`` and
            ``answer_relevancy`` (a sequence of per-question scores, or an
            already aggregated number).
    """
    def mean_score(scores):
        # Accept a pre-aggregated number as well as a list of per-sample scores.
        if isinstance(scores, (int, float)):
            return float(scores)
        scores = list(scores)
        # An empty score list has no mean; report NaN instead of dividing by zero.
        return sum(scores) / len(scores) if scores else float("nan")

    # dict.fromkeys() de-duplicates while keeping first-seen order. A plain
    # set() would give a column order that can change between kernel sessions
    # because string hashing is randomised.
    flows = list(dict.fromkeys(r['flow'] for r in results))
    llms = sorted(set(r['llm'] for r in results))

    # Create table
    table = PrettyTable()

    # Set field names with flows as columns
    table.field_names = ["LLM"] + flows

    # llm -> flow -> formatted mean relevancy
    relevancy_dict = {}
    for result in results:
        relevancy = mean_score(result['answer_relevancy'])
        relevancy_dict.setdefault(result['llm'], {})[result['flow']] = f"{relevancy:.3f}"

    # Add rows for each LLM
    for llm in llms:
        table.add_row([llm] + [relevancy_dict[llm].get(flow, "N/A") for flow in flows])

    print("\nGrid Search Results (Answer Relevancy):")
    print(table)

In [17]:
# LLM x flow matrix of mean answer relevancy for the same `results`.
print_pivot_results(results)


Grid Search Results (Answer Relevancy):
+----------------+-----------------------+----------------------+---------------------------+--------------------------------+
|      LLM       | IterativePlanningFlow | NaiveText2CypherFlow | NaiveText2CypherRetryFlow | NaiveText2CypherRetryCheckFlow |
+----------------+-----------------------+----------------------+---------------------------+--------------------------------+
|    1.5flash    |          N/A          |        0.445         |           0.411           |             0.558              |
|     1.5pro     |         0.303         |        0.706         |           0.669           |             0.703              |
|   codestral    |          N/A          |        0.620         |           0.548           |             0.725              |
|   deepsek-v3   |          N/A          |        0.729         |           0.732           |             0.837              |
|     gpt-4o     |         0.163         |        0.622         |     