In [21]:
import opik
# use_local=False: configure the SDK for a hosted Opik workspace rather than a
# locally running Opik instance (the trace URL logged later points at comet.com).
opik.configure(use_local=False)

OPIK: Existing Opik clients will not use updated values for "url", "api_key", "workspace".
OPIK: Opik is already configured. You can check the settings by viewing the config file at /Users/akshay/.opik.config


In [22]:
from dotenv import load_dotenv
# Load key/value pairs from a .env file into the process environment.
# NOTE(review): presumably this supplies the API keys used by later cells — confirm.
load_dotenv()

True

# Setup Workflow

In [24]:
from workflow import RAGWorkflow
import asyncio

def load_workflow(model_option):
    """Build a RAGWorkflow for the model selected by ``model_option``.

    Args:
        model_option: Display name of the model. Only the exact string
            "Gemma3" selects the "gemma3" model; every other value falls
            back to "llama3.2".

    Returns:
        A RAGWorkflow constructed with the resolved ``model_name``.
    """
    resolved_name = "gemma3" if model_option == "Gemma3" else "llama3.2"
    return RAGWorkflow(model_name=resolved_name)

In [25]:
# Label of the model under evaluation; it is also passed as the Opik experiment name further down.
model_name = 'Gemma3'
# Any value other than 'Gemma3' (e.g. 'DeepSeek-R1') makes load_workflow fall back to "llama3.2".
workflow  = load_workflow(model_name)

# Trace RAG calls 

In [26]:
from llama_index.core import Settings
from llama_index.core.callbacks import CallbackManager
from opik.integrations.llama_index import LlamaIndexCallbackHandler

# A callback handler to automatically log all LlamaIndex operations to Opik
opik_callback_handler = LlamaIndexCallbackHandler()

# Integrate the handler into LlamaIndex's settings.
# This replaces Settings.callback_manager with a manager holding only the Opik handler.
Settings.callback_manager = CallbackManager([opik_callback_handler])

In [27]:
# Ingest the documents under ./eval-data/paul_graham into the workflow.
# NOTE(review): the cell output shows a VectorStoreIndex being returned — presumably the index that later queries run against; confirm in workflow.py.
await workflow.ingest_documents("./eval-data/paul_graham")

OPIK: Started logging traces to the "Default Project" project at https://www.comet.com/opik/akshayp/redirect/projects?name=Default%20Project.


<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x33e383c80>

In [28]:
# Smoke-test the workflow with a single question before running the full evaluation.
response = await workflow.query("Who is Paul Graham?")
print(response)

Paul is an entrepreneur who made Y Combinatore (YC), a startup accelerator that funds new startups. He was a programmer in his adolescence and later decided to study philosophy in college before realizing his passion for AI. In 2010, Robert Morris’s advice led him to realize he needed to hand over YC and he began painting as a new activity. He ultimately gave up painting after losing interest in the project.


# Evaluation

In [29]:
from opik import Opik

# Client created without arguments.
# NOTE(review): presumably it picks up the settings stored by opik.configure() — confirm.
client = Opik()
# Fetch the dataset named "Test dataset", creating it if it does not exist yet.
dataset = client.get_or_create_dataset(name="Test dataset")

In [11]:
import pandas as pd

# Load the evaluation set; the Question, Answer and Context columns are consumed below.
# NOTE(review): the file appears to carry a saved index, which loads as an "Unnamed: 0" column; it is not used by later cells.
df = pd.read_csv("./eval-data/test.csv")

Unnamed: 0,Question,Answer,Context
0,What was the very first programming language P...,He used an early version of Fortran on the IBM...,The language we used was an early version of F...
1,Which microcomputer did Paul Graham’s father f...,A TRS-80.,Computers were expensive in those days and it ...
2,What was the name of the startup Paul Graham c...,Viaweb.,"We started a new company we called Viaweb, aft..."
3,Which friend of Paul Graham was the person res...,Robert Tappan Morris (often referred to as “Ro...,I remember when my friend Robert Morris got ki...
4,What was the title of the second Lisp book tha...,*ANSI Common Lisp.*,So with my unerring nose for financial opportu...


In [15]:
# Turn every CSV row into a dataset item (question, reference answer, reference context)

qa_pairs = [
    {"input": question, "expected_output": answer, "context": context}
    for question, answer, context in zip(df["Question"], df["Answer"], df["Context"])
]
qa_pairs[0]


{'input': 'What was the very first programming language Paul Graham used when he began learning to program on the IBM 1401?',
 'expected_output': 'He used an early version of Fortran on the IBM 1401.',
 'context': 'The language we used was an early version of Fortran. You had to type programs on punch cards, then stack them in the card reader and press a button to load the program into memory and run it.'}

In [16]:
# Upload the question/answer/context items built above to the Opik dataset.
dataset.insert(qa_pairs)

In [30]:
from opik import track

@track
async def my_llm_application(input: str) -> str:
    """Answer ``input`` with the RAG workflow and return the reply as text.

    Args:
        input: The question forwarded to ``workflow.query``.

    Returns:
        The workflow's response converted with ``str()``.
    """
    return str(await workflow.query(input))

def evaluation_task(x):
    """Run the RAG application on one dataset item and return its answer.

    ``my_llm_application`` is declared with ``async def``, so calling it only
    creates a coroutine object. This task is a plain synchronous function, so
    the coroutine has to be driven to completion here; returning it directly
    would hand the metrics an un-awaited coroutine instead of the answer text.

    Args:
        x: Dataset item; only its ``"input"`` entry (the question) is read.

    Returns:
        dict: ``{"output": <result of my_llm_application>}``.
    """
    # Local imports keep this cell runnable on its own.
    import asyncio
    import concurrent.futures
    import contextvars

    coro = my_llm_application(x['input'])

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread, so asyncio.run() is safe.
        return {"output": asyncio.run(coro)}

    # An event loop is already running in this thread (e.g. the task was called
    # directly from a notebook cell), where asyncio.run() would raise. Drive the
    # coroutine on a helper thread instead, inside a copy of the caller's
    # context so context variables set by the caller remain visible.
    caller_context = contextvars.copy_context()
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        answer = pool.submit(caller_context.run, asyncio.run, coro).result()
    return {"output": answer}

In [31]:
from opik.evaluation.metrics import (
    Hallucination,
    AnswerRelevance,
    ContextPrecision,
    ContextRecall
)

# Define the metrics
# Each metric is instantiated with its default settings (no arguments are passed).
# NOTE(review): these are presumably LLM-judged metrics that need a judge-model API key in the environment — confirm.
hallucination_metric = Hallucination()
answer_relevance_metric = AnswerRelevance()
context_precision_metric = ContextPrecision()
context_recall_metric = ContextRecall() 

In [32]:
from opik.evaluation import evaluate

# Run evaluation_task over the dataset and score the outputs with all four metrics.
evaluation = evaluate(
    dataset=dataset,
    task=evaluation_task,
    experiment_name=model_name,
    scoring_metrics=[
        hallucination_metric,
        answer_relevance_metric,
        context_precision_metric,
        context_recall_metric,
    ],
    # Record the model that was actually evaluated. The previous hard-coded
    # "gpt-3.5-turbo" did not match the workflow built from model_name.
    experiment_config={
        "model": model_name
    }
)

Evaluation: 100%|██████████| 5/5 [00:15<00:00,  3.09s/it]
