# Benchmark Experiments
In practice, models are often evaluated using popular industry benchmarks to assess their performance in a reliable and standardized way. However, these benchmarks may be too generic, overly clean, or already seen during training. Instead, we can use our golden dataset of generated queries to benchmark models in a way that is more representative of our real-world production use case.

## 1. Setup

### 1.1 Install & Import
Install the necessary packages and import modules.

In [None]:
%pip install -r requirements.txt

In [None]:
%load_ext autoreload
%autoreload 2

__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
import chromadb
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
from datetime import datetime
from openai import OpenAI as OpenAIClient
import voyageai
from functions.llm import *
from functions.embed import *
from functions.chroma import *
from functions.evaluate import *
from functions.visualize import *
from dotenv import load_dotenv
load_dotenv()

### 1.2 Load Data
Load our curated data of cafe reviews as well as the golden dataset of generated queries.

In [None]:
# The file is parsed as JSON Lines: every line holds one standalone JSON
# object, and each object becomes one row of the DataFrame.
with open('data/cafes.json', 'r') as f:
    df_cafes = pd.DataFrame([json.loads(line) for line in f])
df_cafes.head()

In [None]:
# The golden dataset is parsed as JSON Lines as well: one JSON object per
# line, one DataFrame row per object.
with open('queries/golden_dataset.json', 'r') as f:
    golden_dataset = pd.DataFrame([json.loads(line) for line in f])
golden_dataset.head()

## 2. Run Benchmark Experiments

### 2.1 Infrastructure for Running Tests
Set up helper functions that abstract the repeated I/O and Chroma operations involved in running benchmark experiments.

In [None]:
def extract_documents(df_data):
    """Split a corpus DataFrame into parallel id / text / metadata lists.

    Args:
        df_data: DataFrame with 'id', 'text', 'name' and 'address' columns.

    Returns:
        A tuple ``(ids, documents, metadatas)`` where ``ids`` and
        ``documents`` are plain lists taken from the 'id' and 'text'
        columns, and ``metadatas`` is a list with one
        ``{'name': ..., 'address': ...}`` dict per row.
    """
    metadata_columns = ['name', 'address']
    return (
        df_data['id'].tolist(),
        df_data['text'].tolist(),
        df_data[metadata_columns].to_dict(orient='records'),
    )

def extract_queries(df_queries):
    """Pull query texts, query ids and relevance judgements from a query frame.

    Reads from the ``df_queries`` argument rather than from a notebook-level
    global, so the function works for any query DataFrame it is handed.

    Args:
        df_queries: DataFrame with a 'query' column (query text) and an 'id'
            column.

    Returns:
        A tuple ``(queries, query_ids, qrels)``. ``queries`` and
        ``query_ids`` are lists taken from the 'query' and 'id' columns.
        ``qrels`` is a DataFrame with columns 'query-id', 'corpus-id' and
        'score', holding one row per query with a score of 1.
    """
    queries = df_queries['query'].tolist()
    query_ids = df_queries['id'].tolist()
    # The same id is used for both 'query-id' and 'corpus-id'.
    # NOTE(review): presumably each golden query carries the id of the
    # document it was generated from - confirm against the dataset builder.
    qrels = pd.DataFrame(
        {
            "query-id": query_ids,
            "corpus-id": query_ids,
            "score": 1
        }
    )
    return queries, query_ids, qrels


def format_query_embeddings_lookup(query_ids, queries, query_embeddings):
    """Index query texts and their embeddings by query id.

    The three arguments are walked in lockstep; iteration stops at the
    shortest of them, and a repeated id keeps its last entry.

    Args:
        query_ids: Iterable of query identifiers (used as dict keys).
        queries: Iterable of query texts, aligned with ``query_ids``.
        query_embeddings: Iterable of embeddings, aligned with ``query_ids``.

    Returns:
        Dict mapping each query id to ``{"text": ..., "embedding": ...}``.
    """
    lookup = {}
    for query_id, text, vector in zip(query_ids, queries, query_embeddings):
        lookup[query_id] = {"text": text, "embedding": vector}
    return lookup

def create_and_populate_chroma_collection(collection_name, ids, documents, metadatas, embeddings):
    """Get or create a cosine-space Chroma collection and load the corpus into it.

    Args:
        collection_name: Name of the collection to fetch or create.
        ids: Document ids.
        documents: Document texts, aligned with ``ids``.
        metadatas: Per-document metadata dicts, aligned with ``ids``.
        embeddings: Precomputed document embeddings, aligned with ``ids``.

    Returns:
        The Chroma collection after the documents have been added.
    """
    # A fresh default client is built on every call; the collection is
    # configured with "hnsw:space" set to "cosine".
    collection = chromadb.Client().get_or_create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},
    )

    # Insertion is delegated to the project's batching helper.
    collection_add_in_batches(
        collection=collection,
        ids=ids,
        texts=documents,
        metadatas=metadatas,
        embeddings=embeddings,
    )
    return collection

def save_results(results, model):
    """Write benchmark results to a timestamped JSON file under ``results/``.

    The output directory is created on demand, so the first run in a fresh
    checkout no longer fails with ``FileNotFoundError``.

    Args:
        results: JSON-serializable benchmark results.
        model: Model name; stored in the payload and used as the file-name
            prefix (``results/<model>-<YYYY-MM-DD--HH-MM-SS>.json``).

    Raises:
        TypeError: If ``results`` cannot be serialized by ``json.dump``.
    """
    results_dir = Path("results")
    # Create the directory if needed; a no-op when it already exists.
    results_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
    results_to_save = {
        "model": model,
        "results": results
    }

    output_path = results_dir / f'{model}-{timestamp}.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results_to_save, f)

def run_jina_experiment(api_key, df_data, df_queries):
    """Run the retrieval benchmark with Jina embeddings and save the results.

    Embeds the queries and the corpus through ``jina_embed_in_batches``,
    indexes the corpus in the ``cafes-jina-embeddings-v3`` Chroma collection,
    runs ``run_benchmark`` and writes its output with ``save_results``.

    Args:
        api_key: Jina API key forwarded to ``jina_embed_in_batches``.
        df_data: Corpus DataFrame, as accepted by ``extract_documents``.
        df_queries: Query DataFrame, as accepted by ``extract_queries``.
    """
    # Only used for naming the collection and the results file; it is not
    # passed to the embedding helper.
    model = "jina-embeddings-v3"

    ids, documents, metadatas = extract_documents(df_data)
    queries, query_ids, qrels = extract_queries(df_queries)

    # Queries and passages are embedded with different input types.
    query_vectors = jina_embed_in_batches(
        JINA_API_KEY=api_key,
        input_type="retrieval.query",
        texts=queries,
    )
    lookup = format_query_embeddings_lookup(query_ids, queries, query_vectors)

    document_vectors = jina_embed_in_batches(
        JINA_API_KEY=api_key,
        input_type="retrieval.passage",
        texts=documents,
    )

    collection = create_and_populate_chroma_collection(
        f'cafes-{model}', ids, documents, metadatas, document_vectors
    )

    benchmark_results = run_benchmark(
        query_embeddings_lookup=lookup,
        collection=collection,
        qrels=qrels
    )
    save_results(benchmark_results, model)

def run_voyage_experiment(api_key, df_data, df_queries):
    """Run the retrieval benchmark with Voyage embeddings and save the results.

    Embeds the queries and the corpus through ``voyage_embed_in_batches``,
    indexes the corpus in the ``cafes-voyage-3-large`` Chroma collection,
    runs ``run_benchmark`` and writes its output with ``save_results``.

    Args:
        api_key: Voyage API key used to construct the ``voyageai.Client``.
        df_data: Corpus DataFrame, as accepted by ``extract_documents``.
        df_queries: Query DataFrame, as accepted by ``extract_queries``.
    """
    # Only used for naming the collection and the results file; it is not
    # passed to the embedding helper.
    model = "voyage-3-large"
    voyage_client = voyageai.Client(api_key=api_key)

    ids, documents, metadatas = extract_documents(df_data)
    queries, query_ids, qrels = extract_queries(df_queries)

    # Queries and documents are embedded with different input types.
    query_vectors = voyage_embed_in_batches(
        voyage_client=voyage_client,
        input_type="query",
        texts=queries,
    )
    lookup = format_query_embeddings_lookup(query_ids, queries, query_vectors)

    document_vectors = voyage_embed_in_batches(
        voyage_client=voyage_client,
        input_type="document",
        texts=documents,
    )

    collection = create_and_populate_chroma_collection(
        f'cafes-{model}', ids, documents, metadatas, document_vectors
    )

    benchmark_results = run_benchmark(
        query_embeddings_lookup=lookup,
        collection=collection,
        qrels=qrels
    )
    save_results(benchmark_results, model)

def run_openai_experiment(model, api_key, df_data, df_queries):
    """Run the retrieval benchmark with an OpenAI embedding model and save the results.

    Embeds the queries and the corpus through ``openai_embed_in_batches``,
    indexes the corpus in the ``cafes-openai-<model>`` Chroma collection,
    runs ``run_benchmark`` and writes its output with ``save_results``.

    Args:
        model: OpenAI embedding model name, forwarded to the embedding helper
            and used to name the collection and the results file.
        api_key: OpenAI API key used to construct the client.
        df_data: Corpus DataFrame, as accepted by ``extract_documents``.
        df_queries: Query DataFrame, as accepted by ``extract_queries``.
    """
    client = OpenAIClient(api_key=api_key)

    ids, documents, metadatas = extract_documents(df_data)
    queries, query_ids, qrels = extract_queries(df_queries)

    # Queries and documents go through the same model with identical settings.
    query_vectors = openai_embed_in_batches(
        openai_client=client,
        texts=queries,
        model=model
    )
    lookup = format_query_embeddings_lookup(query_ids, queries, query_vectors)

    document_vectors = openai_embed_in_batches(
        openai_client=client,
        texts=documents,
        model=model,
    )

    collection = create_and_populate_chroma_collection(
        f'cafes-openai-{model}', ids, documents, metadatas, document_vectors
    )

    benchmark_results = run_benchmark(
        query_embeddings_lookup=lookup,
        collection=collection,
        qrels=qrels
    )
    save_results(benchmark_results, model)
    
def run_openai_large_experiment(api_key, df_data, df_queries):
    """Run ``run_openai_experiment`` with the ``text-embedding-3-large`` model."""
    run_openai_experiment("text-embedding-3-large", api_key, df_data, df_queries)

def run_openai_small_experiment(api_key, df_data, df_queries):
    """Run ``run_openai_experiment`` with the ``text-embedding-3-small`` model."""
    run_openai_experiment("text-embedding-3-small", api_key, df_data, df_queries)

In [None]:
# API keys are read from the environment (populated by load_dotenv() above).
# os.getenv returns None when a variable is missing, so an unset key only
# surfaces once the corresponding experiment tries to use it.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
JINA_API_KEY = os.getenv("JINA_API_KEY")

# Each call embeds the full corpus and query set with one model and saves
# its results. run_voyage_experiment is defined above but not invoked here.
run_openai_small_experiment(OPENAI_API_KEY, df_cafes, golden_dataset)
run_openai_large_experiment(OPENAI_API_KEY, df_cafes, golden_dataset)
run_jina_experiment(JINA_API_KEY, df_cafes, golden_dataset)