# Compare Embedding Models

In [17]:
import json
import pandas as pd
import polars as pl

from uptrain.operators import CsvReader, JsonReader
from uptrain import Settings, EvalLLM, APIClient, Evals
from uptrain.operators import VectorSearch

In [3]:
# Settings constructed with no explicit arguments; the cells below pass this
# object to each operator's setup() call.
settings = Settings()
# Evaluation client built from those settings.
# NOTE(review): eval_client is rebound to an APIClient in the final cell and is
# not used before that in the visible cells -- confirm this instance is needed.
eval_client = EvalLLM(settings=settings)

## Define all the experiment parameters

In [4]:
# Name under which the evaluation run is submitted in the final cell.
experiment_name = "Embedding-Model-Experiment-v1"

# Embedding models to compare; the retrieval cell runs one search per entry.
embedding_models = [
    "MiniLM-L6-v2",
    "mpnet-base-v2",
    "all-distilroberta-v1",
]

# Number of documents requested per query.
top_k = 5

# Metric handed to the vector search. The original cell lists
# "cosine_similarity" as the alternative value.
distance_metric = "l2_distance"

## Read the queries and documents

In [5]:
# Read the query set from the JSON-lines file and keep the reader's "output".
queries = (
    JsonReader(fpath="data/fiqa_queries_select.jsonl")
    .setup(settings)
    .run()["output"]
)

# Read the corpus CSV and keep only its "document" column as a plain list.
documents = (
    CsvReader(fpath="data/fiqa_documents_select.csv")
    .setup(settings)
    .run()["output"]["document"]
    .to_list()
)

## Retrieve the top k documents for each query

In [None]:
# Run the same vector search once per embedding model and gather the per-model
# result frames, so they can be stacked with a single concat at the end instead
# of re-copying the accumulated rows on every iteration.
per_model_results = []

for embedding_model in embedding_models:
    results = VectorSearch(
        embeddings_model=embedding_model,
        top_k=top_k,
        col_in_query="question",
        col_in_document="document",
        documents=documents,
        distance_metric=distance_metric,
    ).setup(settings).run(queries)["output"]
    # Tag every row with the model that produced it; the evaluation cell uses
    # this "embedding_model" column as its experiment column.
    results = results.with_columns([pl.lit(embedding_model).alias("embedding_model")])
    per_model_results.append(results)

# pl.concat rejects an empty list, so fall back to an empty frame (the value
# the original accumulator started from) when no models are configured.
top_k_documents = pl.concat(per_model_results) if per_model_results else pl.DataFrame()

In [None]:
# Rebind `settings` with an access token. The value below is a masked
# placeholder and has to be replaced with a real token before running.
settings = Settings(uptrain_access_token="up-**********************")

# Rebind `eval_client` to an APIClient built from the token-bearing settings.
eval_client = APIClient(settings=settings)

# Submit the stacked retrieval results as one experiment: the context-relevance
# check is requested, with "embedding_model" as the experiment column.
# NOTE(review): "retrieval_rank" is assumed to be a column produced by the
# vector search step -- confirm against its output schema.
results = eval_client.evaluate_experiments(
    experiment_name,
    data=top_k_documents,
    checks=[Evals.CONTEXT_RELEVANCE],
    exp_columns=["embedding_model"],
    metadata={"uptrain_index_columns": ["question", "retrieval_rank"]},
)