In [2]:
# Build 🏗️

# Data: NVIDIA 10-K Filings
# Model: OpenAI text-embedding-3-small, GPT-3.5-turbo
# Tooling: LangChain or LlamaIndex (you choose)
# Vector Store: FAISS
# Additional Component: Add one of the following: 1) visibility with WandB OR 2) evaluation with RAGAS
# Ship 🚢

# Evaluate your answers to the following questions
# "Who is the E-VP, Operations - and how old are they?"
# "What is the gross carrying amount of Total Amortizable Intangible Assets for Jan 29, 2023?"
# Record <10 min loom video walkthrough
# $$ Extra Credit: Deploy to public URL on HF with Chainlit front end

In [4]:
import os
from operator import itemgetter

import faiss
import numpy as np
import pandas as pd
import pdfplumber
from datasets import Dataset
from dotenv import load_dotenv
from langchain import hub
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain.retrievers import MultiQueryRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.llms.huggingface_text_gen_inference import HuggingFaceTextGenInference
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas.testset.generator import TestsetGenerator
from tqdm import tqdm

from questions import question_list

# RAGAS metrics passed to every `evaluate(...)` call in this notebook.
metrics = [faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness]

# Path prefix under which the cached artifacts (FAISS index, CSV files) live.
out_fp = './data/'

# Stdout streaming handler; defined here for use with LLM calls.
callbacks = [StreamingStdOutCallbackHandler()]

# Read settings from a .env file; kept as the last expression so the cell
# shows whether a file was loaded.
load_dotenv()

True

In [2]:
# Default document for this notebook (see the "Data" line in the header cell).
NVIDIA_10K_URL = "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/1cbe8fe7-e08a-46e3-8dcc-b429fc06c1a4.pdf"


def load_pdfs(url=NVIDIA_10K_URL):
    """Load a PDF with ``PyPDFLoader`` and return the loaded page documents.

    Args:
        url: Location of the PDF to load. Defaults to the filing this
            notebook analyses, so ``load_pdfs()`` behaves as before.

    Returns:
        The list produced by ``PyPDFLoader(url).load()``; each item exposes
        ``page_content`` and ``metadata``.
    """
    return PyPDFLoader(url).load()


pdf_pages = load_pdfs()
# Whole filing as one string, pages separated by newlines.
full_text = '\n'.join(page.page_content for page in pdf_pages)

In [3]:
# Inspect the metadata attached to the first loaded page (source URL and page index).
pdf_pages[0].metadata

{'source': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/1cbe8fe7-e08a-46e3-8dcc-b429fc06c1a4.pdf',
 'page': 0}

![Chunking Consideration1](./assets/midterm_1.PNG)

![Chunking Consideration2](./assets/midterm_2.PNG)

![Eyeballing Chunksize](./assets/midterm_3.PNG)

In [7]:
# Split the loaded pages into overlapping chunks for embedding and retrieval.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=700)
documents = text_splitter.split_documents(pdf_pages)

print("Original Pages: ", len(pdf_pages))
print("Number page chunks: ", len(documents))

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
faiss_fn = 'nvidia_10k_faiss_index.bin'
faiss_path = out_fp + faiss_fn
if os.path.exists(faiss_path):
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # pickled data; only point this at an index this notebook wrote itself.
    vector_store = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization=True)
else:
    # First run (or cache deleted): embed the chunks once and persist the index
    # so later runs take the cheap load path above.
    vector_store = FAISS.from_documents(documents, embeddings)
    vector_store.save_local(faiss_path)
retriever = vector_store.as_retriever()

mixtral_llm = HuggingFaceTextGenInference(
    # The endpoint can be overridden through the environment (load_dotenv() ran
    # earlier); the fallback is the original hard-coded server.
    inference_server_url=os.environ.get("MIXTRAL_INFERENCE_URL", "http://KNEDDLALP001:8090/"),
    max_new_tokens=4000,
    temperature=0.01,
    repetition_penalty=1.03
)

openai_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
template = """Answer the question based only on the following context. If you cannot answer the question with the context, respond with 'I don't know'. You'll get a big bonus and a potential promotion if you provide a high quality answer:

Context:
{context}

Question:
{question}
"""
prompt_template = ChatPromptTemplate.from_template(template)


def build_rag_chain(llm, retriever, prompt):
    """Build the baseline retrieval-augmented QA chain for one LLM.

    Invoke the result with ``{"question": "<user question>"}``.

    Args:
        llm: Model that receives the formatted prompt.
        retriever: Retriever that is fed the question to produce the context.
        prompt: Prompt template with ``{context}`` and ``{question}`` slots.

    Returns:
        A chain whose output is a dict with two keys: ``"response"`` (the LLM
        output) and ``"context"`` (the retrieved documents).
    """
    return (
        # "question": taken from the input dict.
        # "context" : the question piped into the retriever.
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        # Re-assigns "context" to itself; kept exactly as in the original pipeline.
        | RunnablePassthrough.assign(context=itemgetter("context"))
        # "response": prompt formatted from context + question, piped into the LLM.
        # "context" : passed through so callers can inspect what was retrieved.
        | {"response": prompt | llm, "context": itemgetter("context")}
    )


retrieval_augmented_qa_chain_openai = build_rag_chain(openai_llm, retriever, prompt_template)
retrieval_augmented_qa_chain_mixtral = build_rag_chain(mixtral_llm, retriever, prompt_template)

Original Pages:  96
Number page chunks:  219


  warn_deprecated(



In [9]:
# NOTE(review): everything in this cell is disabled by being wrapped in a string
# literal. The disabled code sends each entry of `question_list` through both
# baseline chains and records, per question, the response text, the joined
# context and the set of source page numbers. The following cell reads
# 'ground_truth.csv' from disk instead of using these dictionaries.
'''
openai_results = {}

for i, question in tqdm(enumerate(question_list), total=len(question_list)):
    response = retrieval_augmented_qa_chain_openai.invoke({"question" : question})
    openai_results[i] = {"question": question,
                  "response": response["response"].content,
                  "context": '\n'.join([document.page_content for document in response["context"]]),
                  "page_numbers": set([document.metadata['page'] for document in response["context"]])}

mixtral_results = {}
for i, question in tqdm(enumerate(question_list), total=len(question_list)):
    response = retrieval_augmented_qa_chain_mixtral.invoke({"question" : question})
    mixtral_results[i] = {"question": question,
                  "response": response["response"],
                  "context": '\n'.join([document.page_content for document in response["context"]]),
                  "page_numbers": set([document.metadata['page'] for document in response["context"]])}
'''

100%|██████████| 2/2 [00:01<00:00,  1.11it/s]
100%|██████████| 2/2 [00:03<00:00,  1.68s/it]


In [8]:
# NOTE(review): the string literal below is disabled code. It would turn the
# per-model result dictionaries into frames, rename "response" to
# "ground_truth", and inner-join them 1:1 on "question" with _openai/_mixtral
# suffixes. Only the final two lines run: they load the previously saved CSV
# and display it.
'''
ground_truth_openai = pd.DataFrame(openai_results).T.rename(
    columns={"response": "ground_truth"}
)
ground_truth_mixtral = pd.DataFrame(mixtral_results).T.rename(
    columns={"response": "ground_truth"}
)

ground_truth = pd.merge(
    ground_truth_openai,
    ground_truth_mixtral,
    on="question",
    how="inner",
    validate="1:1",
    suffixes=['_openai', '_mixtral']
)

# ground_truth.to_csv(out_fp + 'ground_truth.csv', index=False)
'''
# Requires out_fp + 'ground_truth.csv' to exist already.
ground_truth = pd.read_csv(out_fp + 'ground_truth.csv')
ground_truth

Unnamed: 0,question,ground_truth_openai,context_openai,page_numbers_openai,ground_truth_mixtral,context_mixtral,page_numbers_mixtral
0,"Who is the E-VP, Operations - and how old are ...",Debora Shoquist is the Executive Vice Presiden...,"Table of Contents\nstrategy , planning, report...","{11, 12, 84}","\nAnswer:\nDebora Shoquist is the E-VP, Operat...","Table of Contents\nstrategy , planning, report...","{11, 12, 84}"
1,What is the gross carrying amount of Total Amo...,"$3,539 million",Table of Contents\nNVIDIA Corporation and Sub...,"{67, 68, 62, 63}",\nAnswer:\nThe gross carrying amount of Total ...,Table of Contents\nNVIDIA Corporation and Sub...,"{67, 68, 62, 63}"


In [11]:
"""
@dataclass
class TestsetGenerator:
    generator_llm: BaseRagasLLM
    critic_llm: BaseRagasLLM
    embeddings: BaseRagasEmbeddings
    docstore: DocumentStore

    @classmethod
    def with_openai(
        cls,
        generator_llm: str = "gpt-3.5-turbo-16k",
        critic_llm: str = "gpt-4",
        embeddings: str = "text-embedding-ada-002",
        docstore: t.Optional[DocumentStore] = None,
        run_config: t.Optional[RunConfig] = None,
        chunk_size: int = 1024    
"""

# Generator configuration: gpt-3.5-turbo writes the questions, gpt-4 acts as
# critic, and the same embedding model as the vector store is used.
generator_settings = dict(
    generator_llm="gpt-3.5-turbo",
    critic_llm="gpt-4",
    embeddings="text-embedding-3-small",
    chunk_size=1024,
)
generator = TestsetGenerator.with_openai(**generator_settings)

# Share of each question type in the synthetic test set.
question_mix = {simple: 0.25, reasoning: 0.25, multi_context: 0.5}
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=10,
    distributions=question_mix,
)

Filename and doc_id are the same for all nodes.                   
Generating: 100%|██████████| 10/10 [00:24<00:00,  2.45s/it]


In [12]:
# Spot-check one generated sample. A seeded generator keeps the pick stable
# across Restart & Run All; change the seed to look at a different sample.
rng = np.random.default_rng(42)
idx = int(rng.choice(len(testset.test_data)))
sample = testset.test_data[idx]

print('idx: ', idx)
print(sample.question)
print(sample.ground_truth)
print(len(sample.contexts))
print(sample.contexts)

idx:  7
"What financial units are in the statements and how is net income per share calculated?"
The financial units mentioned in the statements are Compute & Networking and Graphics reporting units. Net income per share is calculated as net income divided by basic weighted average shares for basic net income per share, and as net income divided by diluted weighted average shares for diluted net income per share.
2
['been anti-dilutive15 40 21 \n(1)    Calculated as net income divided by basic weighted average shares.\n(2)    Calculated as net income divided by diluted weighted average shares.\nNote 6 - Goodwill\nAs of Janua ry 28, 2024, the total carrying amount of goodwill was $4.4 billion, consisting of goodwill balances allocated to our Compute & Networking and\nGraphics reporting units of $4.1 billion and $370  million, respectively . As of January 29, 2023, the total carrying amount of goodwill was $4.4 billion, consisting of\ngoodwill balances allocated to our Compute & Networki

In [9]:
# Load the cached test set when present; otherwise persist the one generated
# above so later runs evaluate against the same questions.
# NOTE(review): when the cache exists it wins over the freshly generated
# `testset`, exactly as the original cell behaved.
testset_csv = out_fp + 'testset.csv'
if os.path.exists(testset_csv):
    testset_df = pd.read_csv(testset_csv)
else:
    testset_df = testset.to_pandas()
    os.makedirs(out_fp, exist_ok=True)
    testset_df.to_csv(testset_csv, index=False)
testset_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,episode_done
0,How are marketable equity securities recorded ...,['Table of Contents\nNVIDIA Corporation and S...,Marketable equity securities are recorded as l...,simple,True
1,What factors contributed to the decrease in ca...,['Table of Contents\nLiquidity and Capital Res...,Lower marketable securities maturities and hig...,simple,True
2,"""What investments fall under 'Investments in N...",['Table of Contents\nNVIDIA Corporation and S...,Investments in non-affiliated entities for NVI...,reasoning,True
3,"""How are marketable equity securities with fai...",['primarily investments in privately held comp...,Marketable equity securities with fair values ...,reasoning,True
4,"""What investments fall under 'Investments in N...",['Table of Contents\nNVIDIA Corporation and S...,NVIDIA Corp's investments in non-affiliated en...,multi_context,True
5,"""What industries benefit from NVIDIA computing...",['Table of Contents\nResearchers and developer...,"Researchers, developers, gamers, professional ...",multi_context,True
6,"""Which countries are being audited for taxes a...","[""While we believe that we have adequately pro...","Germany, India, Israel, and Taiwan are being a...",multi_context,True
7,"""What financial units are in the statements an...",['been anti-dilutive15 40 21 \n(1) Calculat...,The financial units mentioned in the statement...,multi_context,True
8,"""What role do solution architects play in opti...","['example, our solution architects work with C...",Our solution architects work with CSPs to prov...,multi_context,True
9,"""What are the restrictions on Covered Officers...",['COMPENSATION RECOVERY POLICY\nlaw or contrac...,No Covered Officer shall be entitled to indemn...,reasoning,True


In [28]:
test_questions = testset_df["question"].values.tolist()
test_groundtruths = testset_df["ground_truth"].values.tolist()


def collect_answers(chain, questions, to_text):
    """Run every question through ``chain`` and gather answers and contexts.

    Args:
        chain: Chain invoked with ``{"question": ...}`` that returns a dict
            with ``"response"`` and ``"context"`` keys.
        questions: List of question strings.
        to_text: Callable that turns the ``"response"`` value into plain text.

    Returns:
        ``(answers, contexts)``: a list of answer strings and, aligned with
        it, a list holding the ``page_content`` of each retrieved document.
    """
    answers, contexts = [], []
    for question in tqdm(questions, total=len(questions)):
        response = chain.invoke({"question": question})
        answers.append(to_text(response["response"]))
        contexts.append([doc.page_content for doc in response["context"]])
    return answers, contexts


# The Mixtral answers are required by the response-dataset cell that follows
# (it reads answers_mixtral / contexts_mixtral), so they are collected here
# rather than relying on values left over from an earlier kernel session.
# The Mixtral chain's response is used as-is.
answers_mixtral, contexts_mixtral = collect_answers(
    retrieval_augmented_qa_chain_mixtral, test_questions, lambda response: response
)

# The OpenAI chat model's response carries its text in `.content`.
answers_openai, contexts_openai = collect_answers(
    retrieval_augmented_qa_chain_openai, test_questions, lambda message: message.content
)


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:15<00:00,  1.53s/it]


In [30]:
# One column per field; both models' answers and contexts sit side by side so
# each evaluation can rename its own pair to "answer" / "contexts".
response_columns = dict(
    question=test_questions,
    answer_mixtral=answers_mixtral,
    contexts_mixtral=contexts_mixtral,
    answer_openai=answers_openai,
    contexts_openai=contexts_openai,
    ground_truth=test_groundtruths,
)
response_dataset = Dataset.from_dict(response_columns)

# Keep a CSV copy of the raw responses next to the other artifacts.
response_dataset_df = pd.DataFrame(response_dataset)
response_dataset_df.to_csv(out_fp + "response_dataset.csv", index=False)
response_dataset_df.head()

Unnamed: 0,question,answer_mixtral,contexts_mixtral,answer_openai,contexts_openai,ground_truth
0,How are marketable equity securities recorded ...,\nAnswer:\nMarketable equity securities are re...,[Table of Contents\nNVIDIA Corporation and Su...,Marketable equity securities are recorded as l...,[Table of Contents\nNVIDIA Corporation and Su...,Marketable equity securities are recorded as l...
1,What factors contributed to the decrease in ca...,\nAnswer:\n The decrease in cash provided by i...,[Table of Contents\nLiquidity and Capital Reso...,The decrease in cash provided by investing act...,[Table of Contents\nLiquidity and Capital Reso...,Lower marketable securities maturities and hig...
2,"""What investments fall under 'Investments in N...","\nAnswer:\n Based on the provided context, the...",[Table of Contents\nNVIDIA Corporation and Su...,The investments that fall under 'Investments i...,[Table of Contents\nNVIDIA Corporation and Su...,Investments in non-affiliated entities for NVI...
3,"""How are marketable equity securities with fai...","\nAnswer:\nBased on the provided context, mark...",[primarily investments in privately held compa...,Marketable equity securities with fair values ...,[primarily investments in privately held compa...,Marketable equity securities with fair values ...
4,"""What investments fall under 'Investments in N...","\nAnswer:\nBased on the provided context, 'Inv...",[Table of Contents\nNVIDIA Corporation and Su...,The investments that fall under 'Investments i...,[Table of Contents\nNVIDIA Corporation and Su...,NVIDIA Corp's investments in non-affiliated en...


In [31]:
# Score the baseline OpenAI pipeline: expose its columns under the names
# `evaluate` is given here ("answer" / "contexts").
ragas_eval_results_openai = evaluate(
    response_dataset.rename_column("answer_openai", "answer").rename_column(
        "contexts_openai", "contexts"
    ),
    metrics
)

# Per-row scores: reuse the saved CSV when it exists, otherwise create it from
# this run so the cell also works on a fresh checkout.
# NOTE(review): when the CSV exists the table shown may come from an earlier
# run, while `ragas_eval_results_openai` always holds the fresh aggregate.
openai_eval_csv = out_fp + 'ragas_eval_results_openai_df.csv'
if os.path.exists(openai_eval_csv):
    ragas_eval_results_openai_df = pd.read_csv(openai_eval_csv)
else:
    ragas_eval_results_openai_df = ragas_eval_results_openai.to_pandas()
    os.makedirs(out_fp, exist_ok=True)
    ragas_eval_results_openai_df.to_csv(openai_eval_csv, index=False)
ragas_eval_results_openai_df.head(3)

Evaluating: 100%|██████████| 50/50 [06:38<00:00,  7.97s/it]


Unnamed: 0,question,answer_mixtral,contexts_mixtral,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How are marketable equity securities recorded ...,\nAnswer:\nMarketable equity securities are re...,[Table of Contents\nNVIDIA Corporation and Su...,Marketable equity securities are recorded as l...,[Table of Contents\nNVIDIA Corporation and Su...,Marketable equity securities are recorded as l...,1.0,1.0,1.0,1.0,0.989971
1,What factors contributed to the decrease in ca...,\nAnswer:\n The decrease in cash provided by i...,[Table of Contents\nLiquidity and Capital Reso...,The decrease in cash provided by investing act...,[Table of Contents\nLiquidity and Capital Reso...,Lower marketable securities maturities and hig...,1.0,0.979134,1.0,0.916667,0.744975
2,"""What investments fall under 'Investments in N...","\nAnswer:\n Based on the provided context, the...",[Table of Contents\nNVIDIA Corporation and Su...,The investments that fall under 'Investments i...,[Table of Contents\nNVIDIA Corporation and Su...,Investments in non-affiliated entities for NVI...,1.0,0.946394,1.0,1.0,0.746483


In [27]:
# Score the baseline Mixtral pipeline: expose its columns under the names
# `evaluate` is given here ("answer" / "contexts").
ragas_eval_results_mixtral = evaluate(
    response_dataset.rename_column("answer_mixtral", "answer").rename_column(
        "contexts_mixtral", "contexts"
    ),
    metrics
)

# Per-row scores: reuse the saved CSV when it exists, otherwise create it from
# this run so the cell also works on a fresh checkout.
# NOTE(review): when the CSV exists the table shown may come from an earlier
# run, while `ragas_eval_results_mixtral` always holds the fresh aggregate.
mixtral_eval_csv = out_fp + 'ragas_eval_results_mixtral_df.csv'
if os.path.exists(mixtral_eval_csv):
    ragas_eval_results_mixtral_df = pd.read_csv(mixtral_eval_csv)
else:
    ragas_eval_results_mixtral_df = ragas_eval_results_mixtral.to_pandas()
    os.makedirs(out_fp, exist_ok=True)
    ragas_eval_results_mixtral_df.to_csv(mixtral_eval_csv, index=False)
ragas_eval_results_mixtral_df.head(3)

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How are marketable equity securities recorded ...,\nAnswer:\nMarketable equity securities are re...,['Table of Contents\nNVIDIA Corporation and S...,Marketable equity securities are recorded as l...,,1.0,1.0,1.0,0.732213
1,What factors contributed to the decrease in ca...,\nAnswer:\n The decrease in cash provided by i...,['Table of Contents\nLiquidity and Capital Res...,Lower marketable securities maturities and hig...,1.0,0.976411,1.0,0.916667,0.740503
2,"""What investments fall under 'Investments in N...","\nAnswer:\n Based on the provided context, the...",['Table of Contents\nNVIDIA Corporation and S...,Investments in non-affiliated entities for NVI...,1.0,0.945018,1.0,1.0,0.533431


In [32]:
# Retriever options considered
# ----------------------------
#
# Vector store-backed retriever (the baseline used so far)
#   A vector store retriever is a retriever that uses a vector store to
#   retrieve documents. It is a lightweight wrapper around the vector store
#   class to make it conform to the retriever interface. It uses the search
#   methods implemented by a vector store, like similarity search and MMR, to
#   query the texts in the vector store.
#
# MultiQueryRetriever (chosen)
#   This seems like the best general option, and a major upgrade from the
#   vector store-backed retriever.
#   The MultiQueryRetriever automates the process of prompt tuning by using an
#   LLM to generate multiple queries from different perspectives for a given
#   user input query. For each query, it retrieves a set of relevant documents
#   and takes the unique union across all queries to get a larger set of
#   potentially relevant documents. By generating multiple perspectives on the
#   same question, the MultiQueryRetriever might be able to overcome some of
#   the limitations of the distance-based retrieval and get a richer set of
#   results.
#
# Contextual Compression (not needed here)
#   I don't need to compress the data here.
#   Contextual compression is meant to fix this. The idea is simple: instead of
#   immediately returning retrieved documents as-is, you can compress them
#   using the context of the given query, so that only the relevant information
#   is returned. "Compressing" here refers to both compressing the contents of
#   an individual document and filtering out documents wholesale.
#
# Ensemble Retriever (interesting one)
#   The EnsembleRetriever takes a list of retrievers as input, ensembles the
#   results of their get_relevant_documents() methods, and reranks the results
#   based on the Reciprocal Rank Fusion algorithm.
#   By leveraging the strengths of different algorithms, the EnsembleRetriever
#   can achieve better performance than any single algorithm.

# Wrap the baseline retriever; Mixtral generates the alternative queries.
advanced_retriever = MultiQueryRetriever.from_llm(retriever=retriever, llm=mixtral_llm)

In [34]:
# Prompt pulled from the LangChain hub for retrieval QA.
retrieval_qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Chain that feeds the retrieved documents to Mixtral via that prompt.
advanced_document_chain = create_stuff_documents_chain(
    llm=mixtral_llm,
    prompt=retrieval_qa_prompt,
)

# Full pipeline: multi-query retrieval followed by the document chain.
advanced_retrieval_chain = create_retrieval_chain(
    retriever=advanced_retriever,
    combine_docs_chain=advanced_document_chain,
)

In [35]:
# Send every test question through the MultiQueryRetriever pipeline; this
# chain is invoked with an "input" key and answers under "answer".
multi_responses = [
    advanced_retrieval_chain.invoke({"input" : question})
    for question in tqdm(test_questions, total=len(test_questions))
]

# Split the raw responses into aligned answer and context lists.
multi_answers = [response["answer"] for response in multi_responses]
multi_contexts = [
    [context.page_content for context in response["context"]]
    for response in multi_responses
]

100%|██████████| 10/10 [02:12<00:00, 13.27s/it]


In [38]:
# Assemble the advanced pipeline's outputs; the columns already carry the
# "answer" / "contexts" names, so no renaming is needed before evaluation.
advanced_columns = dict(
    question=test_questions,
    answer=multi_answers,
    contexts=multi_contexts,
    ground_truth=test_groundtruths,
)
response_dataset_advanced_retrieval = Dataset.from_dict(advanced_columns)

# Tabular view for inspection in the notebook.
response_dataset_advanced_retrieval_df = pd.DataFrame(response_dataset_advanced_retrieval)
response_dataset_advanced_retrieval_df

Unnamed: 0,question,answer,contexts,ground_truth
0,How are marketable equity securities recorded ...,\nAssistant: Marketable equity securities are ...,[Table of Contents\nNVIDIA Corporation and Su...,Marketable equity securities are recorded as l...
1,What factors contributed to the decrease in ca...,\nAssistant: The decrease in cash provided by ...,[Table of Contents\nLiquidity and Capital Reso...,Lower marketable securities maturities and hig...
2,"""What investments fall under 'Investments in N...","\n\nAssistant: Based on the context provided, ...",[Table of Contents\nNVIDIA Corporation and Su...,Investments in non-affiliated entities for NVI...
3,"""How are marketable equity securities with fai...",\nAssistant: Marketable equity securities with...,[primarily investments in privately held compa...,Marketable equity securities with fair values ...
4,"""What investments fall under 'Investments in N...",\nAssistant: According to the provided context...,[Table of Contents\nNVIDIA Corporation and Su...,NVIDIA Corp's investments in non-affiliated en...
5,"""What industries benefit from NVIDIA computing...",\n\nAssistent: NVIDIA computing solutions bene...,[The Comput e & Networking segment is comprise...,"Researchers, developers, gamers, professional ..."
6,"""Which countries are being audited for taxes a...","\nAssistant: Based on the provided context, th...",[While we believe that we have adequately prov...,"Germany, India, Israel, and Taiwan are being a..."
7,"""What financial units are in the statements an...",\nAssistant: The financial statements provided...,[of the tax position. Our policy is to include...,The financial units mentioned in the statement...
8,"""What role do solution architects play in opti...",\n\nAssystem: Solution architects play a cruci...,"[Table of Contents\ntier-1 suppliers, and star...",Our solution architects work with CSPs to prov...
9,"""What are the restrictions on Covered Officers...",\n\nAssystem: According to the Compensation Re...,[COMPENSATION RECOVERY POLICY\nlaw or contract...,No Covered Officer shall be entitled to indemn...


In [39]:
# Score the MultiQueryRetriever pipeline with the same metrics as the baselines.
advanced_retrieval_results = evaluate(response_dataset_advanced_retrieval, metrics)

# Per-row scores: reuse the saved CSV when it exists, otherwise create it from
# this run so the cell also works on a fresh checkout.
# NOTE(review): when the CSV exists the table shown may come from an earlier
# run, while `advanced_retrieval_results` always holds the fresh aggregate.
advanced_eval_csv = out_fp + "advanced_retrieval_results_df.csv"
if os.path.exists(advanced_eval_csv):
    advanced_retrieval_results_df = pd.read_csv(advanced_eval_csv)
else:
    advanced_retrieval_results_df = advanced_retrieval_results.to_pandas()
    os.makedirs(out_fp, exist_ok=True)
    advanced_retrieval_results_df.to_csv(advanced_eval_csv, index=False)
advanced_retrieval_results_df.head(2)

Evaluating: 100%|██████████| 50/50 [02:20<00:00,  2.82s/it]


Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How are marketable equity securities recorded ...,\nAssistant: Marketable equity securities are ...,[Table of Contents\nNVIDIA Corporation and Su...,Marketable equity securities are recorded as l...,1.0,0.779195,1.0,0.420635,0.725374
1,What factors contributed to the decrease in ca...,\nAssistant: The decrease in cash provided by ...,[Table of Contents\nLiquidity and Capital Reso...,Lower marketable securities maturities and hig...,1.0,0.976363,1.0,0.9,0.989354


In [52]:
def _metric_frame(results, label):
    """Turn an evaluation result's (metric, score) items into a two-column frame."""
    return pd.DataFrame(list(results.items()), columns=["Metric", label])


openai_label = "Baseline OpenAI"
mixtral_label = "Baseline Mixtral"
multi_label = "MultiQueryRetriever with Stuffing Mixtral"

eval_openai = _metric_frame(ragas_eval_results_openai, openai_label)
eval_mixtral = _metric_frame(ragas_eval_results_mixtral, mixtral_label)
multi_mixtral = _metric_frame(advanced_retrieval_results, multi_label)

# One row per metric, one score column per pipeline; 1:1 validation guards
# against a metric appearing twice in any of the inputs.
df_merged = eval_openai.merge(
    eval_mixtral, on="Metric", how="inner", validate="1:1"
).merge(multi_mixtral, on="Metric", how="inner", validate="1:1")

# Pairwise score differences: (new column, minuend, subtrahend).
score_differences = [
    ("Multi vs OpenAI Baseline", multi_label, openai_label),
    ("Multi vs Mixtral Baseline", multi_label, mixtral_label),
    ("Baseline OpenAI vs. Baseline Mixtral", openai_label, mixtral_label),
]
for difference_column, minuend, subtrahend in score_differences:
    df_merged[difference_column] = df_merged[minuend] - df_merged[subtrahend]

# Best pipeline per metric, and how far the MultiQueryRetriever run trails it.
score_columns = [openai_label, mixtral_label, multi_label]
df_merged['winner'] = df_merged[score_columns].idxmax(axis=1)
df_merged["winner vs. MultiQueryRetriever with Stuffing Mixtral"] = df_merged.apply(
    lambda row: row[row["winner"]] - row[multi_label],
    axis=1
)

df_merged

Unnamed: 0,Metric,Baseline OpenAI,Baseline Mixtral,MultiQueryRetriever with Stuffing Mixtral,Multi vs OpenAI Baseline,Multi vs Mixtral Baseline,Baseline OpenAI vs. Baseline Mixtral,winner,winner vs. MultiQueryRetriever with Stuffing Mixtral
0,faithfulness,0.872222,0.903704,0.981481,0.109259,0.077778,-0.031481,MultiQueryRetriever with Stuffing Mixtral,0.0
1,answer_relevancy,0.938066,0.840912,0.877955,-0.060111,0.037043,0.097154,Baseline OpenAI,0.060111
2,context_recall,1.0,1.0,0.925,-0.075,-0.075,0.0,Baseline OpenAI,0.075
3,context_precision,0.975,0.975,0.894563,-0.080437,-0.080437,0.0,Baseline OpenAI,0.080437
4,answer_correctness,0.736165,0.736031,0.782601,0.046436,0.046571,0.000134,MultiQueryRetriever with Stuffing Mixtral,0.0
