### Evaluating context relevance on TREC DL track (2021, 2022, ...) with NIST human annotations on passage retrieval rankings

In [None]:
import ir_datasets

# Load the judged subsets of the TREC DL 2021 and 2022 passage tracks and keep
# the relevance judgments (qrels) returned by each dataset's qrels_dict().
trec_dl_2021 = ir_datasets.load(
    "msmarco-passage-v2/trec-dl-2021/judged"
)  # 53 queries
qrels_2021 = trec_dl_2021.qrels_dict()

trec_dl_2022 = ir_datasets.load(
    "msmarco-passage-v2/trec-dl-2022/judged"
)  # 76 queries
qrels_2022 = trec_dl_2022.qrels_dict()

# trec_dl_2023 = ir_datasets.load('msmarco-passage-v2/trec-dl-2023')  # TODO check with Cortex team to see if the

# import json

# # process the raw qrels data
# qrels_2023 = {}
# for i, row in trec_2023_raw_qrels.iterrows():
#     query_id = str(row["QUERY_ID"])
#     if query_id not in qrels_2023:
#         qrels_2023[query_id] = {}
#     doc_scores = json.loads(row["DOC_SCORES"])
#     for pair in doc_scores:
#         doc_id = pair['doc']
#         score = pair['score']
#         qrels_2023[query_id][doc_id] = score

In [None]:
from trulens.feedback.v2.feedback import ContextRelevance

# Scoring criteria on the 0-3 scale without explicit per-score rubrics.
# NOTE: an identical `old_criteria` is assigned again in the provider-setup
# cell further down; whichever cell ran last defines the value in use.
old_criteria = """
- CONTEXT that is IRRELEVANT to the QUESTION should score 0.
- CONTEXT that is RELEVANT to some of the QUESTION should get an intermediate score.
- CONTEXT that is RELEVANT to most of the QUESTION should get a score closer to 3.
- CONTEXT that is RELEVANT to the entirety of the QUESTION should get a score of 3, which is the full mark.
- CONTEXT must be relevant and helpful for answering the entire QUESTION to get a score of 3.
"""

In [None]:
from collections import defaultdict

import ir_datasets


def generate_trec_dl_benchmark(
    max_samples_per_query_per_score: int = 3,
    dataset_path: str = "msmarco-passage-v2/trec-dl-2021/judged",
):
    """Yield sampled (query, passage, relevance) records from one dataset.

    For every query that has relevance judgments, the judged passages that
    also appear in the dataset's scoreddocs are grouped by relevance label
    (0, 1, 2, 3). Within each label the passages with the highest retrieval
    score are kept, up to ``max_samples_per_query_per_score``. Passages whose
    text was already emitted for the same query are skipped.

    Args:
        max_samples_per_query_per_score: Maximum number of passages kept per
            query for each relevance label.
        dataset_path: ``ir_datasets`` identifier of the dataset to sample.

    Yields:
        dict with keys ``query_id``, ``query``, ``doc_id``,
        ``expected_response`` (passage text) and ``expected_score`` (the
        relevance label divided by 3).
    """
    # Load the dataset once; every view below comes from this single handle.
    dataset = ir_datasets.load(dataset_path)
    queries = {q.query_id: q for q in dataset.queries_iter()}
    qrels = dataset.qrels_dict()
    docs_store = dataset.docs_store()

    print("Total number of queries:", len(queries))
    print("Total number of qrels:", len(qrels))

    # Single pass over scoreddocs, keeping only judged (query, doc) pairs.
    # Rescanning the whole run for every (query, label) pair is avoided.
    ranked_by_query = defaultdict(list)
    for scored_doc in dataset.scoreddocs_iter():
        judged = qrels.get(scored_doc.query_id)
        if judged is not None and scored_doc.doc_id in judged:
            ranked_by_query[scored_doc.query_id].append(scored_doc)

    for query_id, query in queries.items():
        if query_id not in qrels:
            print("query_id not found in qrels")
            continue  # Skip queries without relevance judgments

        judged = qrels[query_id]

        # Stable sort by retrieval score, best first; ties keep run order.
        ranked = sorted(
            ranked_by_query.get(query_id, ()),
            key=lambda scored: scored.score,
            reverse=True,
        )

        sampled_docs = []
        for label in (0, 1, 2, 3):
            with_label = [
                scored.doc_id
                for scored in ranked
                if judged[scored.doc_id] == label
            ]
            sampled_docs.extend(with_label[:max_samples_per_query_per_score])

        seen_texts = set()  # deduplication of identical passages
        for doc_id in sampled_docs:
            doc = docs_store.get(doc_id)
            if not doc:
                continue
            # Resolve the text field before using it so that documents
            # exposing `body` instead of `text` are actually supported.
            text = doc.text if hasattr(doc, "text") else doc.body
            if text in seen_texts:
                continue
            seen_texts.add(text)
            yield {
                "query_id": query_id,
                "query": query.text,
                "doc_id": doc_id,
                "expected_response": text,
                "expected_score": judged[doc_id] / 3,  # Normalize to [0, 1]
            }

In [None]:
# trec_2021_samples = list(generate_trec_dl_benchmark(max_samples_per_query_per_score=3, dataset_path="msmarco-passage-v2/trec-dl-2021/judged"))
# trec_2022_samples = list(generate_trec_dl_benchmark(max_samples_per_query_per_score=3, dataset_path="msmarco-passage-v2/trec-dl-2022/judged"))
# trec_combined = trec_2021_samples + trec_2022_samples

import pandas as pd

# trec_combined_df = pd.DataFrame(trec_combined)
# trec_combined_df.to_csv("trec_dl_2021_2022_benchmark.csv", index=False)

# Load the benchmark samples from the CSV on disk; the commented-out lines
# above show how that CSV was generated and written.
trec_combined_df = pd.read_csv("trec_dl_2021_2022_benchmark.csv")
print(len(trec_combined_df))

In [None]:
print(len(qrels_2021), len(qrels_2022))

In [None]:
from trulens.benchmark.benchmark_frameworks.experiments.dataset_preprocessing import (
    visualize_expected_score_distribution,
)

# Collect the expected_score of every benchmark row and plot its distribution.
trec_combined_relevance_scores = [
    entry["expected_score"] for _, entry in trec_combined_df.iterrows()
]
visualize_expected_score_distribution(trec_combined_relevance_scores)

In [None]:
trec_combined_df

In [None]:
import os

from trulens.core.session import TruSession
from trulens.providers.openai import OpenAI

# os.environ["OPENAI_API_KEY"] = "sk-..."

# Snowflake connection settings read from environment variables. In this cell
# they are only referenced by the commented-out Snowflake/Cortex lines below.
connection_params = {
    "account": os.environ.get("SNOWFLAKE_ACCOUNT"),
    "user": os.environ.get("SNOWFLAKE_USER"),
    "password": os.environ.get("SNOWFLAKE_USER_PASSWORD"),
    "database": os.environ.get("SNOWFLAKE_DATABASE"),
    "schema": os.environ.get("SNOWFLAKE_SCHEMA"),
    "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE"),
    "role": os.environ.get("SNOWFLAKE_ROLE"),
    "init_server_side": False,  # Set to True to enable server side feedback functions
}

# connector = SnowflakeConnector(**connection_params)
# session = TruSession(connector=connector)

# Default TruSession (no connector passed).
# NOTE(review): reset_database() presumably discards previously logged
# records - confirm before running against a database you want to keep.
session = TruSession()
session.reset_database()


# snowpark_session = Session.builder.configs(connection_params).create()

# LLM providers under evaluation; the Cortex-hosted models are disabled.
gpt_4o = OpenAI(model_engine="gpt-4o")
gpt_4o_mini = OpenAI(model_engine="gpt-4o-mini")
# llama3_405b = Cortex(snowflake.connector.connect(**connection_params), model_engine="llama3.1-405b")
# mistral_large = Cortex(snowflake.connector.connect(**connection_params), model_engine="mistral-large")
# llama3_1_8b = Cortex(snowflake.connector.connect(**connection_params), model_engine="llama3.1-8b")


# Providers iterated by the experiment loop.
PROVIDERS = [
    gpt_4o,
    gpt_4o_mini,
]


# criteria without explicit rubrics
old_criteria = """
- CONTEXT that is IRRELEVANT to the QUESTION should score 0.
- CONTEXT that is RELEVANT to some of the QUESTION should get an intermediate score.
- CONTEXT that is RELEVANT to most of the QUESTION should get a score closer to 3.
- CONTEXT that is RELEVANT to the entirety of the QUESTION should get a score of 3, which is the full mark.
- CONTEXT must be relevant and helpful for answering the entire QUESTION to get a score of 3.
"""


def trulens_context_relevance(
    provider, query: str, context: str, gt_score: float
) -> str:
    """Rate ``context`` against ``query`` using the provider's default criteria.

    Args:
        provider: Object exposing ``context_relevance_with_cot_reasons``.
        query: The question text.
        context: The passage being rated.
        gt_score: Ground-truth score carried through unchanged.

    Returns:
        ``"<score>;<gt_score>;<reasons>"`` where score and reasons are the
        first and second items of the provider's result.
    """
    outcome = provider.context_relevance_with_cot_reasons(
        question=query, context=context
    )
    score = outcome[0]
    reasons = outcome[1]
    return f"{score};{gt_score};{reasons}"


def trulens_context_relevance_no_rubric(
    provider, query: str, context: str, gt_score: float
) -> str:
    """Rate ``context`` against ``query`` using ``old_criteria`` as the criteria.

    Args:
        provider: Object exposing ``context_relevance_with_cot_reasons``.
        query: The question text.
        context: The passage being rated.
        gt_score: Ground-truth score carried through unchanged.

    Returns:
        ``"<score>;<gt_score>;<reasons>"`` where score and reasons are the
        first and second items of the provider's result.
    """
    outcome = provider.context_relevance_with_cot_reasons(
        question=query, context=context, criteria=old_criteria
    )
    score = outcome[0]
    reasons = outcome[1]
    return f"{score};{gt_score};{reasons}"

In [None]:
ContextRelevance.criteria

In [None]:
from trulens.apps.basic import TruBasicApp
from trulens.core import Feedback
from trulens.core import Provider

THRESHOLD = 0.5  # for passage retrieval annotation, we consider a score of 0.5 or above as relevant


class CustomTermFeedback(Provider):
    """Per-record confusion-matrix and error terms.

    Every method takes a semicolon-delimited string whose first field is the
    feedback score and whose second field is the ground-truth score; any
    further fields are ignored. The feedback score is binarized at 0.5 and
    the ground-truth score at ``THRESHOLD``.
    """

    def true_positive(self, output: str) -> float:
        """Return 1.0 when feedback and ground truth are both relevant."""
        fields = output.split(";")
        predicted_relevant = float(fields[0]) >= 0.5
        actually_relevant = float(fields[1]) >= THRESHOLD
        if predicted_relevant and actually_relevant:
            return 1.0
        return 0.0

    def true_negative(self, output: str) -> float:
        """Return 1.0 when feedback and ground truth are both non-relevant."""
        fields = output.split(";")
        predicted_relevant = float(fields[0]) >= 0.5
        actually_relevant = float(fields[1]) >= THRESHOLD
        if not predicted_relevant and not actually_relevant:
            return 1.0
        return 0.0

    def false_positive(self, output: str) -> float:
        """Return 1.0 when feedback says relevant but ground truth does not."""
        fields = output.split(";")
        predicted_relevant = float(fields[0]) >= 0.5
        actually_relevant = float(fields[1]) >= THRESHOLD
        if predicted_relevant and not actually_relevant:
            return 1.0
        return 0.0

    def false_negative(self, output: str) -> float:
        """Return 1.0 when feedback says non-relevant but ground truth is relevant."""
        fields = output.split(";")
        predicted_relevant = float(fields[0]) >= 0.5
        actually_relevant = float(fields[1]) >= THRESHOLD
        if not predicted_relevant and actually_relevant:
            return 1.0
        return 0.0

    def term_absolute_error(self, output: str) -> float:
        """Return the absolute difference between feedback and ground truth."""
        fields = output.split(";")
        predicted = float(fields[0])
        actual = float(fields[1])
        return abs(predicted - actual)

    def raw_gt_score(self, output: str) -> float:
        """Return the ground-truth field multiplied by 3."""
        fields = output.split(";")
        return float(fields[1]) * 3

    def raw_feedback_score(self, output: str) -> float:
        """Return the feedback field multiplied by 3."""
        fields = output.split(";")
        return float(fields[0]) * 3


# One shared provider instance whose bound methods back every Feedback below.
custom_term_feedback = CustomTermFeedback()

# Each Feedback wraps one CustomTermFeedback method and is attached with
# .on_output(). higher_is_better is True for the agreement terms and the raw
# scores, and False for the two error counts and the absolute error.
f_tp = Feedback(
    custom_term_feedback.true_positive,
    name="True Positive",
    higher_is_better=True,
).on_output()
f_tn = Feedback(
    custom_term_feedback.true_negative,
    name="True Negative",
    higher_is_better=True,
).on_output()
f_fp = Feedback(
    custom_term_feedback.false_positive,
    name="False Positive",
    higher_is_better=False,
).on_output()
f_fn = Feedback(
    custom_term_feedback.false_negative,
    name="False Negative",
    higher_is_better=False,
).on_output()
f_abs_err = Feedback(
    custom_term_feedback.term_absolute_error,
    name="Absolute Error",
    higher_is_better=False,
).on_output()
f_raw_gt_score = Feedback(
    custom_term_feedback.raw_gt_score,
    name="Raw GT Score",
    higher_is_better=True,
).on_output()
f_raw_feedback_score = Feedback(
    custom_term_feedback.raw_feedback_score,
    name="Raw Feedback Score",
    higher_is_better=True,
).on_output()

# Full set of feedbacks handed to TruBasicApp in run_experiment_for_provider.
CUSTOM_FEEDBACK_FUNCS = [
    f_tp,
    f_tn,
    f_fp,
    f_fn,
    f_abs_err,
    f_raw_gt_score,
    f_raw_feedback_score,
]


def run_experiment_for_provider(provider, func_wrapper, dataset_df, app_name):
    """Run ``func_wrapper`` on every row of ``dataset_df`` under a TruBasicApp.

    Args:
        provider: Passed as first argument to the wrapped function; its
            ``model_engine`` attribute is used to build the app version.
        func_wrapper: Callable wrapped by ``TruBasicApp``.
        dataset_df: DataFrame with ``query``, ``expected_response`` and
            ``expected_score`` columns.
        app_name: Application name given to ``TruBasicApp``.

    A failure while processing one row is printed and the loop continues
    with the next row.
    """
    recorder = TruBasicApp(
        func_wrapper,
        app_name=app_name,
        app_version=f"{provider.model_engine}-context-relevance",
        feedbacks=CUSTOM_FEEDBACK_FUNCS,
    )

    for row_idx, record in dataset_df.iterrows():
        question = record["query"]
        passage = record["expected_response"]
        gt_score = record["expected_score"]

        try:
            # Invoke the wrapped function inside the app's context manager.
            with recorder:
                recorder.app(provider, question, passage, gt_score)
        except Exception as err:
            print(
                f"Error {err} in run_feedback_experiment row {row_idx} with first arg {question} and second arg {passage}"
            )


# with concurrent.futures.ThreadPoolExecutor() as executor:
#     futures = [executor.submit(run_experiment_for_provider, provider, trec_doc_2022) for provider in PROVIDERS]
#     concurrent.futures.wait(futures)

### Run experiments

In [None]:
PROVIDERS

In [None]:
# Run the default-criteria context relevance function over the combined
# benchmark once per provider; all runs share the same app name.
for provider in PROVIDERS:
    print(f"Running provider: {provider.model_engine}")
    run_experiment_for_provider(
        provider,
        trulens_context_relevance,
        trec_combined_df,
        "trec_dl_2021_2022_combined",
    )

### Evaluate Gaurav's prompt, UMBRELA prompt, and zero-shot categorical prompt

In [None]:
gaurav_prompt = """
You are an expert search result rater. You are given a user query and a search result. Your task is to rate the search result based on its relevance to the user query. You should rate the search result on a scale of 0 to 3, where:
    0: The search result has no relevance to the user query.
    1: The search result has low relevance to the user query. In this case the search result may contain some information which seems very slightly related to the user query but not enough information to answer the user query. The search result contains some references or very limited information about some entities present in the user query. In case the query is a statement on a topic, the search result should be tangentially related to it.
    2: The search result has medium relevance to the user query. If the user query is a question, the search result may contain some information that is relevant to the user query but not enough information to answer the user query. If the user query is a search phrase/sentence, either the search result is centered around about most but not all entities present in the user query, or if all the entities are present in the result, the search result while not being centered around it has medium level of relevance. In case the query is a statement on a topic, the search result should be related to the topic.
    3: The search result has high relevance to the user query. If the user query is a question, the search result contains information that can answer the user query. Otherwise if the search query is a search phrase/sentence, it provides relevant information about all entities that are present in the user query and the search result is centered around the entities mentioned in the query. In case the query is a statement on a topic, the search result should be either be directly addressing it or be on the same topic.
    
    You should think step by step about the user query and the search result and rate the search result. You should also provide a reasoning for your rating.
    
    Use the following format:
    Rating: Example Rating
    Reasoning: Example Reasoning
    
    ### Examples
    Example:
    Example 1:
    INPUT:
    User Query: What is the definition of an accordion?
    Search Result: Accordion definition, Also called piano accordion. a portable wind instrument having a large bellows for forcing air through small metal reeds, a keyboard for the right hand, and buttons for sounding single bass notes or chords for the left hand. a similar instrument having single-note buttons instead of a keyboard.
    OUTPUT:
    Rating: 3
    Reasoning: In this case the search query is a question. The search result directly answers the user question for the definition of an accordion, hence it has high relevance to the user query.
    
    Example 2:
    INPUT:
    User Query: dark horse
    Search Result: Darkhorse is a person who everyone expects to be last in a race. Think of it this way. The person who looks like he can never get laid defies the odds and gets any girl he can by being sly,shy and cunning. Although he\'s not a player, he can really charm the ladies.
    OUTPUT:
    Rating: 3
    Reasoning: In this case the search query is a search phrase mentioning \'dark horse\'. The search result contains information about the term \'dark horse\' and provides a definition for it and is centered around it. Hence it has high relevance to the user query.
    
    Example 3:
    INPUT:
    User Query: Global warming and polar bears
    Search Result: Polar bear The polar bear is a carnivorous bear whose native range lies largely within the Arctic Circle, encompassing the Arctic Ocean, its surrounding seas and surrounding land masses. It is a large bear, approximately the same size as the omnivorous Kodiak bear (Ursus arctos middendorffi).
    OUTPUT:
    Rating: 2
    Reasoning: In this case the search query is a search phrase mentioning two entities \'Global warming\' and \'polar bears\'. The search result contains is centered around the polar bear which is one of the two entities in the search query. Therefore it addresses most of the entities present and hence has medium relevance. 
    
    Example 4:
    INPUT:
    User Query: Snowflake synapse private link
    Search Result: "This site can\'t be reached" error when connecting to Snowflake via Private Connectivity\nThis KB article addresses an issue that prevents connections to Snowflake failing with: "This site can\'t be reached" ISSUE: Attempting to reach Snowflake via Private Connectivity fails with the "This site can\'t be reached" error
    OUTPUT:
    Rating: 1
    Reasoning: In this case the search result is a search query mentioning \'Snowflake synapse private link\'. However the search result doesn\'t contain information about it. However it shows an error message for a generic private link which is tangentially related to the query, since snowflake synapse private link is a type of private link. Hence it has low relevance to the user query.
    
    Example 5:
    INPUT:
    User Query: The Punisher is American.
    Search Result: The Rev(Samuel Smith) is a fictional character, a supervillain appearing in American comic books published by Marvel Comics. Created by Mike Baron and Klaus Janson, the character made his first appearance in The Punisher Vol. 2, #4 (November 1987). He is an enemy of the Punisher.
    OUTPUT:
    Rating: 1
    Reasoning: In this case the search query is a statement concerning the Punisher. However the search result is about a character called Rev, who is an enemy of the Punisher. The search result is tangentially related to the user query but does not address topic about Punisher being an American. Hence it has low relevance to the user query.

    Example 6:
    INPUT:
    User Query: query_history
    Search Result: The function task_history() is not enough for the purposes when the required result set is more than 10k.If we perform UNION between information_schema and account_usage , then we will get more than 10k records along with recent records as from information_schema.query_history to snowflake.account_usage.query_history is 45 mins behind.
    OUTPUT:
    Rating: 1
    Reasoning: In this case the search query mentioning one entity \'query_history\'. The search result is neither centered around it and neither has medium relevance, it only contains an unimportant reference to it. Hence it has low relevance to the user query.
    
    Example 7:
    INPUT:
    User Query: Who directed pulp fiction?
    Search Result: Life on Earth first appeared as early as 4.28 billion years ago, soon after ocean formation 4.41 billion years ago, and not long after the formation of the Earth 4.54 billion years ago.
    OUTPUT:
    Rating: 0
    Reasoning: In the case the search query is a question. However the search result does is completely unrelated to it. Hence the search result is completely irrelevant to the movie pulp fiction. 
    ###
    
    Now given the user query and search result below, rate the search result based on its relevance to the user query and provide a reasoning for your rating.
   

"""


umbrela_prompt = """
Given a query and a passage, you must provide a score on an
integer scale of 0 to 3 with the following meanings:
0 = represent that the passage has nothing to do with the query,
1 = represents that the passage seems related to the query but
does not answer it,
2 = represents that the passage has some answer for the query,
but the answer may be a bit unclear, or hidden amongst extraneous
information and
3 = represents that the passage is dedicated to the query and
contains the exact answer.
Important Instruction: Assign category 1 if the passage is
somewhat related to the topic but not completely, category 2 if
passage presents something very important related to the entire
topic but also has some extra information and category 3 if the
passage only and entirely refers to the topic. If none of the
above satisfies give it category 0.
Query: {query}
Passage: {passage}
Split this problem into steps:
Consider the underlying intent of the search.
Measure how well the content matches a likely intent of the query
(M).
Measure how trustworthy the passage is (T).
Consider the aspects above and the relative importance of each,
and decide on a final score (O). Final score must be an integer
value only.
Do not provide any code in result. Provide each score in the
format of: ##final score: score without providing any reasoning.
Always provide an output of the final category score described above (the final score
on a scale of 0 to 3).

"""


zero_shot_categorical_prompt = """
Given a query and a passage, you just categorize the passage based on how well it answers the query. The categories that can be assigned are the following:

IRRELEVANT -- if a passage is categorized as irrelevant to a query, this means that the passage has nothing to do with the query.
FAIR -- if a passage is categorized as fair to a query, this means that the passage has low relevance to the query. The passage contains some information which seems very slightly related to the query but not enough information to answer the query. The passage contains some references or very limited information about some entities present in the query. In case the query is a statement on a topic, the passage should be tangentially related to it.
GOOD -- if a passage is categorized as good to a query, this means that the passage has medium relevance to the query. If the query is a question, the passage may contain some information that is relevant to the query but not enough information to answer the query. If the query is a phrase/sentence, either the result is centered around most but not all entities present in the query, or if all the entities are present in the passage, the passage, while not being centered around it, has a medium level of relevance. In case the query is a statement on a topic, the passage should be related to the topic.
EXCELLENT -- if a passage is categorized as excellent to a query, this means that the passage has a high relevance to the query. If the query is a question, the passage should contain information that can answer the query. Otherwise if the query is a phrase/sentence, it provides relevant information about all entities that are present in the query and the passage is centered around the entities mentioned in the query. In case the query is a statement on a topic, the passage should be either directly addressing it or be on the same topic.

You should think step by step about the query and the passage and provide a categorization. You should also provide a reasoning for your categorization. If you absolutely cannot figure out a categorization, assign IRRELEVANT.


Query: {query}
Passage: {passage}

Provide the output in the format of: ##Categorization: <category chosen for the query passage pair>
Always provide an output of the final categoriy described above.

"""

In [None]:
from trulens.feedback.v2.feedback import ContextRelevance

# Concatenate ContextRelevance's system and user prompts (blank line between)
# and print the result for inspection.
trulens_prompt = (
    ContextRelevance.system_prompt + "\n\n" + ContextRelevance.user_prompt
)
print(f"TruLens prompt: \n\n {trulens_prompt}")

In [None]:
# NOTE: this rebinds the name `OpenAI`, which an earlier cell imported from
# trulens.providers.openai. Re-running that earlier cell's
# OpenAI(model_engine=...) calls after this import would use this class.
from openai import OpenAI

# Client used by the prompt-evaluation functions below; no arguments are passed.
client = OpenAI()


# Function to rate context relevance
def gaurav_prompt_relevance(
    query: str, passage: str, model_engine="gpt-4o"
) -> dict:
    """Rate ``passage`` against ``query`` with the few-shot ``gaurav_prompt``.

    Args:
        query: The user query text.
        passage: The search result / passage text.
        model_engine: Model name passed to the chat completions call.

    Returns:
        dict with ``rating`` (int, or None if no parsable ``Rating:`` line was
        found), ``reasoning`` (text after ``Reasoning:``, or None) and
        ``raw_response`` (the stripped model output).
    """
    response = client.chat.completions.create(
        model=model_engine,
        messages=[
            {
                "role": "system",
                "content": gaurav_prompt,
            },
            {
                "role": "user",
                "content": f""" INPUT:
                            User Query: {query}
                            Search Result: {passage}
                            OUTPUT:\n""",
            },
        ],
    )

    # The message content may be None; treat that as an empty answer.
    output = (response.choices[0].message.content or "").strip()

    rating = None
    reasoning = None
    for raw_line in output.split("\n"):
        line = raw_line.strip()
        if line.startswith("Rating:"):
            # Guard only the int conversion so that one malformed rating line
            # does not stop the remaining lines from being parsed.
            try:
                rating = int(line.split(":", 1)[1].strip())
            except ValueError as e:
                print(f"Error parsing response: {e}")
        elif line.startswith("Reasoning:"):
            # Split once: the reasoning text itself may contain colons.
            reasoning = line.split(":", 1)[1].strip()

    return {"rating": rating, "reasoning": reasoning, "raw_response": output}


def umbrela_prompt_relevance(
    query: str, passage: str, model_engine="gpt-4o"
) -> dict:
    """Rate ``passage`` against ``query`` with the UMBRELA prompt.

    Args:
        query: The query text substituted into the prompt.
        passage: The passage text substituted into the prompt.
        model_engine: Model name passed to the chat completions call.

    Returns:
        dict with ``rating`` (int parsed from a ``##final score:`` or
        ``Final score:`` line, else None), ``reasoning`` (always None here)
        and ``raw_response`` (the stripped model output).
    """
    # The whole prompt, query and passage included, is sent as the system message.
    filled_prompt = umbrela_prompt.format(query=query, passage=passage)
    response = client.chat.completions.create(
        model=model_engine,
        messages=[{"role": "system", "content": filled_prompt}],
    )

    output = response.choices[0].message.content.strip()

    rating = None
    reasoning = None
    score_prefixes = ("##final score:", "Final score:")
    try:
        for answer_line in output.split("\n"):
            if answer_line.startswith(score_prefixes):
                rating = int(answer_line.split(":")[1].strip())
    except Exception as e:
        print(f"Error parsing response: {e}")

    return {"rating": rating, "reasoning": reasoning, "raw_response": output}


def categorical_prompt_relevance(
    query: str, passage: str, model_engine="gpt-4o"
) -> dict:
    """Rate ``passage`` against ``query`` with the zero-shot categorical prompt.

    Args:
        query: The query text substituted into the prompt.
        passage: The passage text substituted into the prompt.
        model_engine: Model name passed to the chat completions call.

    Returns:
        dict with ``rating`` (0-3 mapped from the category, or None when no
        known category was found), ``reasoning`` (text after ``Reasoning:``,
        or None) and ``raw_response`` (the stripped model output).
    """
    response = client.chat.completions.create(
        model=model_engine,
        messages=[
            {
                "role": "system",
                "content": zero_shot_categorical_prompt.format(
                    query=query, passage=passage
                ),
            },
        ],
    )

    # The message content may be None; treat that as an empty answer.
    output = (response.choices[0].message.content or "").strip()

    category = None
    reasoning = None
    for raw_line in output.split("\n"):
        line = raw_line.strip()
        if line.startswith("##Categorization:"):
            # Upper-case so "Good" / "good" still map to the GOOD label.
            category = line.split(":", 1)[1].strip().upper()
        elif line.startswith("Reasoning:"):
            # Split once: the reasoning text itself may contain colons.
            reasoning = line.split(":", 1)[1].strip()

    CATEGORY_TO_RATING = {
        "IRRELEVANT": 0,
        "FAIR": 1,
        "GOOD": 2,
        "EXCELLENT": 3,
    }
    # Unknown or missing categories yield None.
    rating = CATEGORY_TO_RATING.get(category)

    return {"rating": rating, "reasoning": reasoning, "raw_response": output}

In [None]:
trec_combined_df

In [None]:
import pandas as pd

# For each model, rate every benchmark row with the three prompts and write
# one results CSV per model.
for model in ["gpt-4o", "gpt-4o-mini"]:
    results = []

    # Iterate over the DataFrame rows
    for _, row in trec_combined_df.iterrows():
        query = row["query"]
        passage = row["expected_response"]
        # expected_score is the raw label divided by 3, so scale first and
        # round afterwards. int() on the fraction truncates 1/3 and 2/3 to 0
        # and would turn labels 1 and 2 into 0.
        ground_truth = int(
            round(float(row["expected_score"]) * 3)
        )  # recover raw score {0, 1, 2, 3}

        # print(f"Query: {query}")
        # print(f"Passage: {passage}")

        # Gaurav Prompt
        gaurav_result = gaurav_prompt_relevance(
            query, passage, model_engine=model
        )
        gaurav_rating = gaurav_result["rating"]

        # Umbrela Prompt
        umbrela_result = umbrela_prompt_relevance(
            query, passage, model_engine=model
        )
        umbrela_rating = umbrela_result["rating"]

        # Categorical Prompt
        categorical_result = categorical_prompt_relevance(
            query, passage, model_engine=model
        )
        categorical_rating = categorical_result["rating"]

        # Append results to the list
        results.append({
            "query_id": row[
                "query_id"
            ],  # Assuming 'query_id' column exists in trec_combined_df
            "query": query,
            "passage": passage,
            "ground_truth": ground_truth,
            "gaurav_rating": gaurav_rating,
            "umbrela_rating": umbrela_rating,
            "categorical_rating": categorical_rating,
        })

    # Convert results into a DataFrame
    results_df = pd.DataFrame(results)

    # Save results to CSV for further analysis
    results_df.to_csv(f"{model}_3_prompts_results.csv", index=False)

    # Inspect the DataFrame
    print(results_df.head())