In [1]:
import sys
import os

# Insert the parent directory of "app" into sys.path
# so that Python recognizes "app" as an importable package.
notebook_dir = os.path.dirname(os.path.abspath("__file__"))
parent_dir = os.path.abspath(os.path.join(notebook_dir, ".."))
sys.path.append(parent_dir)

In [2]:
from dotenv import load_dotenv
load_dotenv()  # This looks for .env in the current working directory

True

In [4]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

from ragas import evaluate
from ragas.llms import LlamaIndexLLMWrapper

from app.workflows.utils import graph_store

In [5]:
# Import flows
from app.workflows.naive_text2cypher import NaiveText2CypherFlow
from app.workflows.naive_text2cypher_retry import NaiveText2CypherRetryFlow
from app.workflows.iterative_planner import IterativePlanningFlow




In [6]:
import pandas as pd

test_df = pd.read_csv('test_data.csv', delimiter=";")
test_df.head()

Unnamed: 0,Question,Cypher
0,Who acted in Tom Hanks’s highest-rated movie?,MATCH (p:Person {name: 'Tom Hanks'})-[:ACTED_I...
1,Which movie starring Keanu Reeves has the most...,"MATCH (meg:Actor {name: ""Keanu Reeves""})-[:ACT..."
2,Who directed the most recent movie starring Ha...,"MATCH (p:Person {name: ""Halle Berry""})-[:ACTED..."
3,What is the highest-rated movie from the 1990s...,MATCH (m:Movie)-[:DIRECTED]-(d:Person) WHERE m...
4,"For all movies starring Keanu Reeves, find the...","MATCH (keanu:Person {name: ""Keanu Reeves""})-[:..."


In [7]:
import time
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from typing import List, Dict
from prettytable import PrettyTable

async def evaluate_flow_llm_combination(flow_name, flow, llm_name, llm, test_df, graph_store):
    results = []
    latencies = []
    ground_truth = []

    flow_instance = flow(llm=llm, timeout=90)
    
    for i, row in test_df.iterrows():
        question = row['Question']
        print(f"Processing question: {question}")
        
        start = time.time()
        try:
            data = await flow_instance.run(input=question)
        except:
            data = "timeout"
            
        try:
            ground_truth.append(graph_store.structured_query(row['Cypher']))
        except Exception as e:
            ground_truth.append("missing")
            
        end = time.time()
        latencies.append(end - start)
        results.append(data)
    # Create evaluation dataset
    df = pd.DataFrame(results)
    df['ground_truth'] = [str(el) for el in ground_truth]
    df['latencies'] = latencies
    dataset = Dataset.from_pandas(df)
    
    # Run evaluation
    result = evaluate(
        dataset,
        metrics=[answer_relevancy],
        llm=LlamaIndexLLMWrapper(OpenAI(model="gpt-4o-2024-11-20", temperature=0))
    )
    
    return {
        'answer_relevancy': result['answer_relevancy'],
        'avg_latency': sum(latencies) / len(latencies),
        'timeout_count': results.count("timeout")
    }

async def run_grid_search(
    flows: List[callable],
    llms: List[object],
    test_df: pd.DataFrame,
    graph_store: object
):
    results = []
    
    for flow in flows:
        for llm_name, llm in llms:
            print(f"\nEvaluating {flow.__name__} with {llm_name}")
            
            result = await evaluate_flow_llm_combination(
                flow_name=flow.__name__,
                flow=flow,
                llm_name=llm_name,
                llm=llm,
                test_df=test_df,
                graph_store=graph_store
            )
            
            results.append({
                'flow': flow.__name__,
                'llm': llm_name,
                **result
            })
    
    return results

In [8]:
from llama_index.llms.gemini import Gemini
from llama_index.llms.openai import OpenAI


google_llm = Gemini(
    model="models/gemini-1.5-pro",
)

openai_llm = OpenAI(model="gpt-4o-2024-11-20", temperature=0)

In [9]:
flows = [NaiveText2CypherFlow, NaiveText2CypherRetryFlow, IterativePlanningFlow]  # Add your flows
llms = [("1.5pro", google_llm), ("gpt-4o",openai_llm)]  # Add your LLMs

results = await run_grid_search(
    flows=flows,
    llms=llms,
    test_df=test_df,
    graph_store=graph_store
)



Evaluating NaiveText2CypherFlow with 1.5pro
Processing question: Who acted in Tom Hanks’s highest-rated movie?
Processing question: Which movie starring Keanu Reeves has the most actors in common with a Tom Hanks movie?
Processing question: Who directed the most recent movie starring Halle Berry, and which other actors starred in that director’s earlier movies?
Processing question: What is the highest-rated movie from the 1990s, who directed it, and which other films did that director make?
Processing question: For all movies starring Keanu Reeves, find the director who worked the most times with him and list the co-stars across those collaborations.


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]


Evaluating NaiveText2CypherFlow with gpt-4o
Processing question: Who acted in Tom Hanks’s highest-rated movie?
Processing question: Which movie starring Keanu Reeves has the most actors in common with a Tom Hanks movie?
Processing question: Who directed the most recent movie starring Halle Berry, and which other actors starred in that director’s earlier movies?
Processing question: What is the highest-rated movie from the 1990s, who directed it, and which other films did that director make?
Processing question: For all movies starring Keanu Reeves, find the director who worked the most times with him and list the co-stars across those collaborations.


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]


Evaluating NaiveText2CypherRetryFlow with 1.5pro
Processing question: Who acted in Tom Hanks’s highest-rated movie?
Processing question: Which movie starring Keanu Reeves has the most actors in common with a Tom Hanks movie?
Processing question: Who directed the most recent movie starring Halle Berry, and which other actors starred in that director’s earlier movies?
Processing question: What is the highest-rated movie from the 1990s, who directed it, and which other films did that director make?
Processing question: For all movies starring Keanu Reeves, find the director who worked the most times with him and list the co-stars across those collaborations.


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]


Evaluating NaiveText2CypherRetryFlow with gpt-4o
Processing question: Who acted in Tom Hanks’s highest-rated movie?
Processing question: Which movie starring Keanu Reeves has the most actors in common with a Tom Hanks movie?
Processing question: Who directed the most recent movie starring Halle Berry, and which other actors starred in that director’s earlier movies?
Processing question: What is the highest-rated movie from the 1990s, who directed it, and which other films did that director make?
Processing question: For all movies starring Keanu Reeves, find the director who worked the most times with him and list the co-stars across those collaborations.


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

In [12]:
def print_results(results: List[Dict]):
    # Create table
    table = PrettyTable()
    table.field_names = ["Flow", "LLM", "Answer Relevancy", "Avg Latency (s)", "Timeouts"]
    
    # Sort results by answer relevancy
    sorted_results = sorted(results, key=lambda x: x['answer_relevancy'], reverse=True)
    
    # Add rows
    for result in sorted_results:
        # Handle different data types for answer_relevancy
        if isinstance(result['answer_relevancy'], list):
            answer_relevancy = sum(result['answer_relevancy']) / len(result['answer_relevancy'])
        else:
            answer_relevancy = result['answer_relevancy']
            
        table.add_row([
            result['flow'],
            result['llm'],
            f"{answer_relevancy:.3f}" if isinstance(answer_relevancy, (float, int)) else str(answer_relevancy),
            f"{result['avg_latency']:.2f}",
            result['timeout_count']
        ])
    
    print("\nGrid Search Results:")
    print(table)
    
print_results(results)


Grid Search Results:
+---------------------------+--------+------------------+-----------------+----------+
|            Flow           |  LLM   | Answer Relevancy | Avg Latency (s) | Timeouts |
+---------------------------+--------+------------------+-----------------+----------+
|    NaiveText2CypherFlow   | 1.5pro |      0.767       |       8.23      |    0     |
|    NaiveText2CypherFlow   | gpt-4o |      0.583       |      12.86      |    0     |
| NaiveText2CypherRetryFlow | 1.5pro |      0.767       |       8.62      |    0     |
| NaiveText2CypherRetryFlow | gpt-4o |      0.763       |      11.30      |    0     |
+---------------------------+--------+------------------+-----------------+----------+
