In [None]:
import sys
import os

# Append the parent of the working directory to sys.path so that Python
# recognizes "app" as an importable package.
# `__file__` is typically not defined inside a notebook kernel, so the working
# directory stands in for the notebook's location (the previous
# os.path.abspath("__file__") string-literal trick resolved to the same place).
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, ".."))
if parent_dir not in sys.path:  # re-running this cell must not stack duplicates
    sys.path.append(parent_dir)

In [None]:
from dotenv import load_dotenv
# Read settings from a .env file (presumably the API keys the LLM clients need —
# confirm against the project's setup notes).
# NOTE(review): in an interactive session python-dotenv is expected to start its
# search in the current working directory — confirm if the file is not picked up.
load_dotenv()

In [None]:
import time
from typing import Callable, Dict, List, Tuple

import pandas as pd
from datasets import Dataset
from google.api_core import retry
from google.generativeai.types import RequestOptions
from llama_index.llms.gemini import Gemini
from llama_index.llms.openai import OpenAI
from prettytable import PrettyTable
from ragas import evaluate
from ragas.llms import LlamaIndexLLMWrapper
from ragas.metrics import (
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)

from app.workflows.utils import graph_store

In [None]:
# Import flows
from app.workflows.naive_text2cypher import NaiveText2CypherFlow
from app.workflows.naive_text2cypher_retry import NaiveText2CypherRetryFlow
from app.workflows.iterative_planner import IterativePlanningFlow

In [None]:
# Benchmark set: a semicolon-separated CSV whose rows supply the "Question" sent
# to each flow and the reference "Cypher" query used to fetch the ground truth.
test_df = pd.read_csv("test_data.csv", sep=";")
test_df.head()

In [None]:
async def evaluate_flow_llm_combination(
    flow_name, flow, llm_name, llm, test_df, graph_store, judge_llm=None
):
    """Benchmark one flow/LLM pairing on the test questions.

    Each row of ``test_df`` is sent through a single instance of ``flow``; the
    elapsed time of every run is recorded and the collected outputs are scored
    with the ragas ``answer_relevancy`` metric.

    Args:
        flow_name: Label for the flow, used only in diagnostic messages.
        flow: Workflow class or factory. It is called as
            ``flow(llm=llm, timeout=90)`` and the result must expose an
            awaitable ``run(input=...)``.
        llm_name: Label for the LLM, used only in diagnostic messages.
        llm: LLM instance handed to the flow.
        test_df: DataFrame with a ``Question`` and a ``Cypher`` column.
        graph_store: Object exposing ``structured_query(cypher)``; queried once
            per row to obtain the ground truth.
        judge_llm: Optional evaluator LLM passed to ragas. When omitted, a
            ``gpt-4o-2024-11-20`` model at temperature 0 wrapped in
            ``LlamaIndexLLMWrapper`` is used.

    Returns:
        Dict with ``answer_relevancy`` (the value ragas reports for the metric)
        and ``avg_latency`` (mean seconds per question).

    Raises:
        ValueError: If ``test_df`` has no rows.
    """
    if len(test_df) == 0:
        # Fail early with a clear message instead of a ZeroDivisionError later.
        raise ValueError("test_df is empty: nothing to evaluate")

    tag = f"[{flow_name} / {llm_name}]"
    flow_instance = flow(llm=llm, timeout=90)

    results = []
    latencies = []
    ground_truth = []

    for _, row in test_df.iterrows():
        question = row['Question']

        # perf_counter is monotonic, so latencies cannot be skewed by clock changes.
        started = time.perf_counter()
        try:
            data = await flow_instance.run(input=question)
        except Exception as exc:
            # `except Exception` (not a bare except) lets task cancellation and
            # KeyboardInterrupt propagate, so the benchmark can be stopped.
            print(f"{tag} flow failed on {question!r}: {exc!r}")
            # Placeholder record so the failed question still gets scored.
            data = {"answer": "timeout/error", "question": question}
        latencies.append(time.perf_counter() - started)
        results.append(data)

        try:
            ground_truth.append(graph_store.structured_query(row['Cypher']))
        except Exception as exc:
            print(f"{tag} ground-truth query failed for {question!r}: {exc!r}")
            ground_truth.append("missing")

    # Create evaluation dataset
    df = pd.DataFrame(results)
    df['ground_truth'] = [str(el) for el in ground_truth]
    df['latencies'] = latencies
    dataset = Dataset.from_pandas(df)

    if judge_llm is None:
        judge_llm = LlamaIndexLLMWrapper(
            OpenAI(model="gpt-4o-2024-11-20", temperature=0)
        )

    # Run evaluation
    result = evaluate(dataset, metrics=[answer_relevancy], llm=judge_llm)

    return {
        'answer_relevancy': result['answer_relevancy'],
        'avg_latency': sum(latencies) / len(latencies)
    }

async def run_grid_search(
    flows: List[Callable[..., object]],
    llms: List[Tuple[str, object]],
    test_df: pd.DataFrame,
    graph_store: object
) -> List[Dict[str, object]]:
    """Evaluate every flow against every LLM and collect the scores.

    Args:
        flows: Workflow classes (anything with a ``__name__`` that
            ``evaluate_flow_llm_combination`` can instantiate).
        llms: ``(label, llm_instance)`` pairs.
        test_df: Benchmark questions, forwarded unchanged to the evaluator.
        graph_store: Graph store, forwarded unchanged to the evaluator.

    Returns:
        One dict per combination, in flow-major order, holding ``flow`` (the
        class name), ``llm`` (the label) and every key returned by
        ``evaluate_flow_llm_combination``.
    """
    # Materialise once: the pairs are walked again for every flow, so a
    # one-shot iterable would otherwise only cover the first flow.
    llm_pairs = list(llms)
    results = []

    for flow in flows:
        flow_name = flow.__name__
        for llm_name, llm in llm_pairs:
            print(f"\nEvaluating {flow_name} with {llm_name}")

            scores = await evaluate_flow_llm_combination(
                flow_name=flow_name,
                flow=flow,
                llm_name=llm_name,
                llm=llm,
                test_df=test_df,
                graph_store=graph_store
            )

            results.append({'flow': flow_name, 'llm': llm_name, **scores})

    return results

In [None]:
import warnings
warnings.filterwarnings('ignore')


flows = [
    IterativePlanningFlow,
    NaiveText2CypherFlow,
    NaiveText2CypherRetryFlow,
]  # Add your flows

google_retry = dict(retry=retry.Retry(initial=0.1, multiplier=2, timeout=61))
llms = [
    ("1.5pro", Gemini(model="models/gemini-1.5-pro", temperature=0, request_options=google_retry)),
    #("1.5flash", Gemini(model="models/gemini-1.5-flash", temperature=0, request_options=google_retry)),
    #("2.0flash", Gemini(model="models/gemini-2.0-flash-exp", temperature=0)), # rate limits 
    ("gpt-4o", OpenAI(model="gpt-4o", temperature=0)),
    #("gpt-4o-mini", OpenAI(model="gpt-4o-mini", temperature=0)),
    #("o1", OpenAI(model="o1-preview", temperature=0)), no tools
    #("o1-mini", OpenAI(model="o1-mini", temperature=0)), no tools
]  # Add your LLMs

results = await run_grid_search(
    flows=flows, llms=llms, test_df=test_df, graph_store=graph_store
)


In [None]:
def _mean_relevancy(value):
    """Collapse a list/tuple of per-sample scores to its mean; return scalars as-is."""
    if isinstance(value, (list, tuple)):
        # An empty score list has no mean; NaN keeps the row printable.
        return sum(value) / len(value) if value else float("nan")
    return value


def _relevancy_sort_key(score):
    """Sort key that ranks real, non-NaN numbers above any other value."""
    comparable = isinstance(score, (float, int)) and score == score  # NaN != NaN
    return (comparable, score if comparable else 0.0)


def print_results(results: List[Dict]):
    """Print the grid-search results as a table, best answer relevancy first.

    Args:
        results: Dicts with the keys ``flow``, ``llm``, ``answer_relevancy``
            (a number, or a list of per-sample numbers that is averaged) and
            ``avg_latency`` (seconds).

    Rows are ordered by the same aggregated relevancy value that is displayed,
    so list-valued and scalar scores can be mixed. Scores that are not plain
    numbers (or are NaN) are listed last and shown via ``str``.
    """
    # Create table
    table = PrettyTable()
    table.field_names = ["Flow", "LLM", "Answer Relevancy", "Avg Latency (s)"]

    # Aggregate first, then sort on the aggregated value (stable for ties).
    scored = [(_mean_relevancy(entry['answer_relevancy']), entry) for entry in results]
    scored.sort(key=lambda pair: _relevancy_sort_key(pair[0]), reverse=True)

    # Add rows
    for score, entry in scored:
        if isinstance(score, (float, int)):
            shown = f"{score:.3f}"
        else:
            shown = str(score)
        table.add_row([
            entry['flow'],
            entry['llm'],
            shown,
            f"{entry['avg_latency']:.2f}"
        ])

    print("\nGrid Search Results:")
    print(table)
    
# Print the summary table for the grid-search results computed above.
print_results(results)