In [1]:
# Build 🏗️

# Data: NVIDIA 10-K Filings
# Model: OpenAI text-embedding-3-small, GPT-3.5-turbo
# Tooling: LangChain or LlamaIndex (you choose)
# Vector Store: FAISS
# Additional Component: Add one of the following: 1) visibility with WandB OR 2) evaluation with RAGAS
# Ship 🚢

# Evaluate your answers to the following questions
# "Who is the E-VP, Operations - and how old are they?"
# "What is the gross carrying amount of Total Amortizable Intangible Assets for Jan 29, 2023?"
# Record <10 min loom video walkthrough
# $$ Extra Credit: Deploy to public URL on HF with Chainlit front end

In [2]:
import os
from operator import itemgetter

import faiss
import numpy as np
import pandas as pd
import pdfplumber
from datasets import Dataset
from dotenv import load_dotenv
from langchain import hub
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain.retrievers import MultiQueryRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.llms.huggingface_text_gen_inference import HuggingFaceTextGenInference
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas.testset.generator import TestsetGenerator
from tqdm import tqdm

from questions import question_list

# --- Notebook-wide configuration ---

# Directory prefix (trailing slash included): artifact paths further down are
# built by plain string concatenation onto this value.
out_fp = './data/'

# RAGAS metrics handed to every `evaluate(...)` call in this notebook.
metrics = [faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness]

# Callback list holding a single stdout-streaming handler.
callbacks = [StreamingStdOutCallbackHandler()]

# Kept as the last expression so the cell still displays load_dotenv()'s return value.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [23]:
def load_pdfs(pdf_source="https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/1cbe8fe7-e08a-46e3-8dcc-b429fc06c1a4.pdf"):
    """Load a PDF with ``PyPDFLoader`` and return its pages.

    Args:
        pdf_source: Location handed straight to ``PyPDFLoader``. Defaults to
            the NVIDIA 10-K filing URL this notebook analyses, so calling
            ``load_pdfs()`` with no argument behaves as before.

    Returns:
        Whatever ``PyPDFLoader(pdf_source).load()`` returns: a sequence of
        page objects exposing ``page_content`` and ``metadata``.
    """
    loader = PyPDFLoader(pdf_source)
    return loader.load()


pdf_pages = load_pdfs()
# Entire filing as one newline-joined string (one segment per page).
full_text = '\n'.join(page.page_content for page in pdf_pages)

In [4]:
pdf_pages[0].metadata

{'source': 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/1cbe8fe7-e08a-46e3-8dcc-b429fc06c1a4.pdf',
 'page': 0}

![Chunking Consideration1](./assets/midterm_1.PNG)

![Chunking Consideration2](./assets/midterm_2.PNG)

![Eyeballing Chunksize](./assets/midterm_3.PNG)

In [5]:
# --- Chunking ---
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=700)
documents = text_splitter.split_documents(pdf_pages)

print("Original Pages: ", len(pdf_pages))
print("Number page chunks: ", len(documents))

# --- Vector store: build once, then reuse the copy saved on disk ---
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
faiss_fn = 'nvidia_10k_faiss_index.bin'
faiss_path = out_fp + faiss_fn
if os.path.isdir(faiss_path):
    # NOTE(security): load_local unpickles data from disk, hence the explicit
    # opt-in flag. Only point this at an index this notebook wrote itself.
    vector_store = FAISS.load_local(faiss_path, embeddings, allow_dangerous_deserialization=True)
else:
    # First run (or fresh checkout): embed the chunks and persist the index so
    # later runs skip the embedding calls.
    vector_store = FAISS.from_documents(documents, embeddings)
    vector_store.save_local(faiss_path)
retriever = vector_store.as_retriever()

# --- LLMs ---
mixtral_llm = HuggingFaceTextGenInference(
    # The text-generation server address can be overridden through the
    # environment (e.g. via .env); the original host stays the default.
    inference_server_url=os.environ.get("MIXTRAL_TGI_URL", "http://KNEDDLALP001:8090/"),
    max_new_tokens=4000,
    temperature=0.01,
    repetition_penalty=1.03
)

openai_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# --- Prompt ---
template = """Answer the question based only on the following context. If you cannot answer the question with the context, respond with 'I don't know'. You'll get a big bonus and a potential promotion if you provide a high quality answer:

Context:
{context}

Question:
{question}
"""
prompt_template = ChatPromptTemplate.from_template(template)


def build_rag_chain(llm):
    """Assemble the baseline retrieval-augmented QA chain around ``llm``.

    Invoke the returned chain with ``{"question": "<user question>"}``.

    Args:
        llm: The language model that ``prompt_template`` is piped into.

    Returns:
        A chain whose output is a dict with two keys:
        ``"response"`` (the raw output of ``prompt_template | llm``) and
        ``"context"`` (the documents the retriever returned for the question).
    """
    return (
        # "question": taken from the input dict as-is.
        # "context" : the same question piped into the retriever.
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        # Re-assigns "context" to itself, so values are unchanged. The step is
        # kept because it places a Runnable between the two dict literals:
        # without it Python would evaluate `dict | dict` as a plain dict merge
        # rather than composing a chain.
        | RunnablePassthrough.assign(context=itemgetter("context"))
        # "response": prompt formatted with context + question, piped into the LLM.
        # "context" : forwarded so callers can inspect what was retrieved.
        | {"response": prompt_template | llm, "context": itemgetter("context")}
    )


retrieval_augmented_qa_chain_openai = build_rag_chain(openai_llm)
retrieval_augmented_qa_chain_mixtral = build_rag_chain(mixtral_llm)

Original Pages:  96
Number page chunks:  219


  warn_deprecated(



In [6]:
def collect_rag_answers(chain, questions):
    """Run each question through ``chain`` and gather answer, context and pages.

    Args:
        chain: A chain invoked as ``chain.invoke({"question": q})`` whose result
            has a ``"response"`` entry and a ``"context"`` entry (an iterable of
            documents exposing ``page_content`` and ``metadata['page']``).
        questions: Sequence of question strings.

    Returns:
        Dict keyed by the question's position (0-based) with, per question:
        ``question``, ``response`` (answer text), ``context`` (retrieved page
        contents joined by newlines) and ``page_numbers`` (set of page indices).
    """
    results = {}
    for i, question in tqdm(enumerate(questions), total=len(questions)):
        response = chain.invoke({"question": question})
        answer = response["response"]
        retrieved = response["context"]
        results[i] = {
            "question": question,
            # The OpenAI chain yields an object carrying the text in `.content`,
            # the Mixtral chain yields the text itself; accept both.
            "response": getattr(answer, "content", answer),
            "context": '\n'.join(document.page_content for document in retrieved),
            "page_numbers": {document.metadata['page'] for document in retrieved},
        }
    return results


openai_results = collect_rag_answers(retrieval_augmented_qa_chain_openai, question_list)
mixtral_results = collect_rag_answers(retrieval_augmented_qa_chain_mixtral, question_list)


100%|██████████| 10/10 [00:11<00:00,  1.14s/it]
100%|██████████| 10/10 [00:31<00:00,  3.11s/it]


In [7]:
# Both result dicts share one layout, so reshape them identically: one row per
# question, with each model's answer column renamed to "ground_truth".
response_to_ground_truth = {"response": "ground_truth"}
ground_truth_openai = pd.DataFrame(openai_results).T.rename(columns=response_to_ground_truth)
ground_truth_mixtral = pd.DataFrame(mixtral_results).T.rename(columns=response_to_ground_truth)

# Pair the two models' rows on the question text. Overlapping column names
# receive the _openai / _mixtral suffixes.
ground_truth = ground_truth_openai.merge(
    ground_truth_mixtral,
    how="inner",
    on="question",
    suffixes=['_openai', '_mixtral'],
    validate="1:1",
)

ground_truth_csv = out_fp + 'ground_truth.csv'
ground_truth.to_csv(ground_truth_csv, index=False)
# To reuse a saved run instead of regenerating:
# ground_truth = pd.read_csv(out_fp + 'ground_truth.csv')
ground_truth

Unnamed: 0,question,ground_truth_openai,context_openai,page_numbers_openai,ground_truth_mixtral,context_mixtral,page_numbers_mixtral
0,"Who is the E-VP, Operations - and how old are ...",Debora Shoquist is the Executive Vice Presiden...,"Table of Contents\nstrategy , planning, report...","{11, 12, 84}","\nAnswer:\nDebora Shoquist is the E-VP, Operat...","Table of Contents\nstrategy , planning, report...","{11, 12, 84}"
1,What is the gross carrying amount of Total Amo...,"$3,539 million",Table of Contents\nNVIDIA Corporation and Sub...,"{67, 68, 62, 63}",\nAnswer:\nThe gross carrying amount of Total ...,Table of Contents\nNVIDIA Corporation and Sub...,"{67, 68, 62, 63}"
2,How much revenue was generated outside of the ...,44%,Table of Contents\nNVIDIA Corporation and Sub...,"{40, 78, 39}",\nAnswer:\nRevenue from sales to customers out...,Table of Contents\nNVIDIA Corporation and Sub...,"{40, 78, 39}"
3,"As of February 16, 2024, what is the name and ...","As of February 16, 2024, the President and Chi...",Pursuant to the requirements of the Securities...,"{11, 84, 44}","\nAnswer:\nJen-Hsun Huang, 60 years old.",Pursuant to the requirements of the Securities...,"{11, 84, 44}"
4,What specific cybersecurity event is mentioned...,Ransomware attacks by organized criminal threa...,"Cyber-attacks, including ransomware attacks by...","{19, 20, 30}",\nAnswer:\n The specific cybersecurity event m...,"Cyber-attacks, including ransomware attacks by...","{19, 20, 30}"
5,What percentage of NVIDIA's workforce was tech...,"83% of NVIDIA's workforce was technical, and 4...",Table of Contents\nor generate enough renewabl...,"{10, 11}",\nAnswer:\nThe provided context does not inclu...,Table of Contents\nor generate enough renewabl...,"{10, 11}"
6,What are the risks associated with failure in ...,The risks associated with failure in estimatin...,for our products. Some of our competitors oper...,"{16, 12, 14, 15}",\nAnswer:\n The risks associated with failure ...,for our products. Some of our competitors oper...,"{16, 12, 14, 15}"
7,Describe one specific risk related to NVIDIA's...,One specific risk related to NVIDIA's internat...,export contr ol rules at any time and further ...,"{9, 26, 25}",\nAnswer:\nOne specific risk related to NVIDIA...,export contr ol rules at any time and further ...,"{9, 26, 25}"
8,"As of the end of fiscal year 2024, what percen...",I don't know.,Table of Contents\nor generate enough renewabl...,"{35, 10, 11}",\nAnswer:\nThe context does not provide inform...,Table of Contents\nor generate enough renewabl...,"{35, 10, 11}"
9,What is NVIDIA's strategy for managing intelle...,NVIDIA relies primarily on a combination of pa...,Table of Contents\nWe are working with several...,"{8, 54, 10, 6}",\nAnswer:\nNVIDIA primarily relies on a combin...,Table of Contents\nWe are working with several...,"{8, 54, 10, 6}"


In [8]:
"""
@dataclass
class TestsetGenerator:
    generator_llm: BaseRagasLLM
    critic_llm: BaseRagasLLM
    embeddings: BaseRagasEmbeddings
    docstore: DocumentStore

    @classmethod
    def with_openai(
        cls,
        generator_llm: str = "gpt-3.5-turbo-16k",
        critic_llm: str = "gpt-4",
        embeddings: str = "text-embedding-ada-002",
        docstore: t.Optional[DocumentStore] = None,
        run_config: t.Optional[RunConfig] = None,
        chunk_size: int = 1024    
"""

# Settings for the RAGAS synthetic test-set generator (generator LLM, critic
# LLM, embedding model and chunk size).
generator_settings = {
    "generator_llm": "gpt-3.5-turbo",
    "critic_llm": "gpt-4",
    "embeddings": "text-embedding-3-small",
    "chunk_size": 1024,
}
generator = TestsetGenerator.with_openai(**generator_settings)

# Share of each question "evolution" type among the generated items.
evolution_mix = {simple: 0.25, reasoning: 0.25, multi_context: 0.5}
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=10,
    distributions=evolution_mix,
)

Filename and doc_id are the same for all nodes.                   
Generating: 100%|██████████| 10/10 [00:16<00:00,  1.63s/it]


In [9]:
# Spot-check one randomly chosen generated test case.
idx = np.random.choice(len(testset.test_data))
sample = testset.test_data[idx]

print('idx: ', idx)
# Question, reference answer, number of contexts, then the contexts themselves.
for field in (sample.question, sample.ground_truth, len(sample.contexts), sample.contexts):
    print(field)

idx:  0
What was the outcome of the appeal filed by the plaintiffs in the United States Court of Appeals for the Ninth Circuit regarding the dismissal of the case against NVIDIA and certain NVIDIA executives?
The majority of a three-judge Ninth Circuit panel affirmed in part and reversed in part the district court’s dismissal of the case, with a third judge dissenting.
1
['complaint asserted that NVIDIA  and certain NVIDIA  executives violated Section 10(b) of the Securities Exchange Act of 1934, as amended, or the Exchange Act,\nand SEC Rule 10b-5, by making materially false or misleading statements related to channel inventory and the impact of cryptocurrency mining on GPU demand\nbetween May 10, 2017 and November 14, 2018 . Plaintif fs also alleged that the NVIDIA  executiv es who they named as defendants violated Section 20(a) of the\nExchange Act. Plaintif fs sought class certificatio n, an award of unspecified compensatory damages, an award of reasonable costs and expenses, inclu

In [10]:
# Tabular view of the generated test set, persisted next to the other artifacts.
testset_csv = out_fp + 'testset.csv'
testset_df = testset.to_pandas()
testset_df.to_csv(testset_csv, index=False)
# To reuse a saved test set instead of regenerating:
# testset_df = pd.read_csv(out_fp + 'testset.csv')
testset_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,episode_done
0,What was the outcome of the appeal filed by th...,[complaint asserted that NVIDIA and certain N...,The majority of a three-judge Ninth Circuit pa...,simple,True
1,How is anti-dilution calculated in terms of ne...,[been anti-dilutive15 40 21 \n(1) Calculate...,Calculated as net income divided by basic weig...,simple,True
2,What assets are in NVIDIA's financial statemen...,[Table of Contents\nNVIDIA Corporation and Su...,NVIDIA's financial statements include assets s...,reasoning,True
3,"""How have recent USG export restrictions impac...","[products, benefiting competitors that offer a...",Recent USG export restrictions have impacted N...,reasoning,True
4,"""What's the fair value of NVIDIA's equity secu...",[Table of Contents\nNVIDIA Corporation and Su...,The fair value of NVIDIA's equity securities o...,multi_context,True
5,"""How are marketable equity securities valued a...",[primarily investments in privately held compa...,Marketable equity securities are valued at fai...,multi_context,True
6,"""What countries are being audited for taxes an...",[While we believe that we have adequately prov...,The significant tax jurisdictions currently un...,multi_context,True
7,"""How does NVIDIA determine fair value of finan...",[Table of Contents\nNVIDIA Corporation and Su...,The fair values of our financial assets and li...,multi_context,True
8,"""What global training does the Deep Learning I...","[example, our solution architects work with CS...",The Deep Learning Institute offers in-person a...,multi_context,True
9,What are the primary sources of liquidity for ...,[Table of Contents\nLiquidity and Capital Reso...,The primary sources of liquidity for the compa...,simple,True


In [12]:
# Questions and reference answers from the generated test set, as plain lists.
test_questions = testset_df["question"].tolist()
test_groundtruths = testset_df["ground_truth"].tolist()

# Per-model accumulators: answer text and the list of retrieved chunk texts.
answers_mixtral, contexts_mixtral = [], []
answers_openai, contexts_openai = [], []

for question in tqdm(test_questions, total=len(test_questions)):
    # Mixtral chain: "response" is already the answer text.
    response_mixtral = retrieval_augmented_qa_chain_mixtral.invoke({"question": question})
    answers_mixtral.append(response_mixtral["response"])
    contexts_mixtral.append([doc.page_content for doc in response_mixtral["context"]])

    # OpenAI chain: the answer text sits in the response's `.content`.
    response_openai = retrieval_augmented_qa_chain_openai.invoke({"question": question})
    answers_openai.append(response_openai["response"].content)
    contexts_openai.append([doc.page_content for doc in response_openai["context"]])


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [01:50<00:00, 11.03s/it]


In [13]:
# Column order here is the column order of the resulting dataset / CSV.
response_columns = {
    "question": test_questions,
    "answer_mixtral": answers_mixtral,
    "contexts_mixtral": contexts_mixtral,
    "answer_openai": answers_openai,
    "contexts_openai": contexts_openai,
    "ground_truth": test_groundtruths,
}
response_dataset = Dataset.from_dict(response_columns)

response_dataset_df = pd.DataFrame(response_dataset)
response_dataset_csv = out_fp + "response_dataset.csv"
response_dataset_df.to_csv(response_dataset_csv, index=False)
# To rebuild the dataset from a saved CSV instead of re-querying the models:
# response_dataset_df = pd.read_csv(out_fp + "response_dataset.csv")
# response_dataset_df = response_dataset_df.astype(str)
# response_dataset = Dataset.from_pandas(response_dataset_df)
response_dataset_df.head()

Unnamed: 0,question,answer_mixtral,contexts_mixtral,answer_openai,contexts_openai,ground_truth
0,What was the outcome of the appeal filed by th...,"\nAnswer:\nOn August 25, 2023, a majority of a...",[complaint asserted that NVIDIA and certain N...,The outcome of the appeal filed by the plainti...,[complaint asserted that NVIDIA and certain N...,The majority of a three-judge Ninth Circuit pa...
1,How is anti-dilution calculated in terms of ne...,"\nAnswer:\n Based on the provided context, ant...",[been anti-dilutive15 40 21 \n(1) Calculate...,Anti-dilution is calculated as net income divi...,[been anti-dilutive15 40 21 \n(1) Calculate...,Calculated as net income divided by basic weig...
2,What assets are in NVIDIA's financial statemen...,\nAnswer:\nNVIDIA's financial statements inclu...,[Table of Contents\nNVIDIA Corporation and Su...,Assets in NVIDIA's financial statements includ...,[Table of Contents\nNVIDIA Corporation and Su...,NVIDIA's financial statements include assets s...
3,"""How have recent USG export restrictions impac...",\nAnswer:\n\nRecent USG export restrictions ha...,[supercomputing industries. These restrictions...,The recent USG export restrictions have impact...,[supercomputing industries. These restrictions...,Recent USG export restrictions have impacted N...
4,"""What's the fair value of NVIDIA's equity secu...",\nAnswer:\nThe fair value of NVIDIA's equity s...,[Table of Contents\napproximately 7 million sh...,The fair value of NVIDIA's equity securities o...,[Table of Contents\napproximately 7 million sh...,The fair value of NVIDIA's equity securities o...


In [15]:
# Score the baseline OpenAI chain. RAGAS is given "answer" / "contexts"
# columns, so the OpenAI-specific columns are renamed to those names first.
ragas_eval_results_openai = evaluate(
    response_dataset.rename_column("answer_openai", "answer").rename_column(
        "contexts_openai", "contexts"
    ),
    metrics
)
ragas_eval_results_openai_df = ragas_eval_results_openai.to_pandas()
# Persist the scores that were just computed. Previously the save was commented
# out and the frame was immediately replaced by a CSV from an earlier run, so
# the displayed table could disagree with `ragas_eval_results_openai` (and the
# cell failed outright when that file did not exist).
ragas_eval_results_openai_df.to_csv(out_fp + 'ragas_eval_results_openai_df.csv', index=False)
# To reuse a saved run instead of re-evaluating:
# ragas_eval_results_openai_df = pd.read_csv(out_fp + 'ragas_eval_results_openai_df.csv')
ragas_eval_results_openai_df.head(3)

Evaluating: 100%|██████████| 50/50 [00:59<00:00,  1.20s/it]


Unnamed: 0,question,answer_mixtral,contexts_mixtral,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What was the outcome of the appeal filed by th...,"\nAnswer:\nOn August 25, 2023, a majority of a...",[complaint asserted that NVIDIA and certain N...,The outcome of the appeal filed by the plainti...,[complaint asserted that NVIDIA and certain N...,The majority of a three-judge Ninth Circuit pa...,1.0,1.0,1.0,1.0,0.729155
1,How is anti-dilution calculated in terms of ne...,"\nAnswer:\n Based on the provided context, ant...",[been anti-dilutive15 40 21 \n(1) Calculate...,Anti-dilution is calculated as net income divi...,[been anti-dilutive15 40 21 \n(1) Calculate...,Calculated as net income divided by basic weig...,1.0,0.92485,1.0,0.833333,0.732961
2,What assets are in NVIDIA's financial statemen...,\nAnswer:\nNVIDIA's financial statements inclu...,[Table of Contents\nNVIDIA Corporation and Su...,Assets in NVIDIA's financial statements includ...,[Table of Contents\nNVIDIA Corporation and Su...,NVIDIA's financial statements include assets s...,1.0,0.94874,1.0,0.916667,0.628824


In [16]:
# Score the baseline Mixtral chain: expose its columns under the generic
# "answer" / "contexts" names before evaluating.
mixtral_eval_dataset = (
    response_dataset
    .rename_column("answer_mixtral", "answer")
    .rename_column("contexts_mixtral", "contexts")
)
ragas_eval_results_mixtral = evaluate(mixtral_eval_dataset, metrics)

ragas_eval_results_mixtral_df = ragas_eval_results_mixtral.to_pandas()
mixtral_eval_csv = out_fp + 'ragas_eval_results_mixtral_df.csv'
ragas_eval_results_mixtral_df.to_csv(mixtral_eval_csv, index=False)
# To reuse a saved run instead of re-evaluating:
# ragas_eval_results_mixtral_df = pd.read_csv(out_fp + 'ragas_eval_results_mixtral_df.csv')
ragas_eval_results_mixtral_df.head(3)

Evaluating: 100%|██████████| 50/50 [02:06<00:00,  2.54s/it]


Unnamed: 0,question,answer,contexts,answer_openai,contexts_openai,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What was the outcome of the appeal filed by th...,"\nAnswer:\nOn August 25, 2023, a majority of a...",[complaint asserted that NVIDIA and certain N...,The outcome of the appeal filed by the plainti...,[complaint asserted that NVIDIA and certain N...,The majority of a three-judge Ninth Circuit pa...,1.0,0.917517,1.0,1.0,0.599909
1,How is anti-dilution calculated in terms of ne...,"\nAnswer:\n Based on the provided context, ant...",[been anti-dilutive15 40 21 \n(1) Calculate...,Anti-dilution is calculated as net income divi...,[been anti-dilutive15 40 21 \n(1) Calculate...,Calculated as net income divided by basic weig...,1.0,0.919676,1.0,0.833333,0.726027
2,What assets are in NVIDIA's financial statemen...,\nAnswer:\nNVIDIA's financial statements inclu...,[Table of Contents\nNVIDIA Corporation and Su...,Assets in NVIDIA's financial statements includ...,[Table of Contents\nNVIDIA Corporation and Su...,NVIDIA's financial statements include assets s...,0.857143,0.946886,1.0,0.916667,0.30898


In [17]:
'''
Vector store-backed retriever

A vector store retriever is a retriever that uses a vector store to retrieve documents. It is a lightweight wrapper around the vector store class to make it conform to the retriever interface. It uses the search methods implemented by a vector store, like similarity search and MMR, to query the texts in the vector store.
'''

'''
This seems like the best general option, and a major upgrade from Vector store-backed retriever

The MultiQueryRetriever automates the process of prompt tuning by using an LLM to generate multiple queries from different perspectives for a given user input query. For each query, it retrieves a set of relevant documents and takes the unique union across all queries to get a larger set of potentially relevant documents. By generating multiple perspectives on the same question, the MultiQueryRetriever might be able to overcome some of the limitations of the distance-based retrieval and get a richer set of results.
'''

'''
I don't need to compress the data here

Contextual Compression - Contextual compression is meant to fix this. The idea is simple: instead of immediately returning retrieved documents as-is, you can compress them using the context of the given query, so that only the relevant information is returned. “Compressing” here refers to both compressing the contents of an individual document and filtering out documents wholesale.
'''

'''
Interesting one:

Ensemble Retriever

The EnsembleRetriever takes a list of retrievers as input and ensemble the results of their get_relevant_documents() methods and rerank the results based on the Reciprocal Rank Fusion algorithm.

By leveraging the strengths of different algorithms, the EnsembleRetriever can achieve better performance than any single algorithm.
'''
# Multi-query retriever layered on the base vector-store retriever, with
# Mixtral as the LLM that produces the alternative queries.
advanced_retriever = MultiQueryRetriever.from_llm(
    llm=mixtral_llm,
    retriever=retriever,
)

In [18]:
# Prompt pulled from the LangChain hub.
retrieval_qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Document ("stuff") chain built from the Mixtral LLM and the hub prompt.
advanced_document_chain = create_stuff_documents_chain(
    mixtral_llm,
    retrieval_qa_prompt,
)

# Full pipeline: multi-query retrieval feeding the document chain.
# Invoked below with {"input": ...}; its result is read via "answer" and "context".
advanced_retrieval_chain = create_retrieval_chain(
    advanced_retriever,
    advanced_document_chain,
)

In [19]:
# Answers and retrieved chunk texts from the multi-query pipeline, one entry
# per test question (same order as `test_questions`).
multi_answers, multi_contexts = [], []

for question in tqdm(test_questions, total=len(test_questions)):
    multi_response = advanced_retrieval_chain.invoke({"input": question})
    multi_answers.append(multi_response["answer"])
    multi_contexts.append([doc.page_content for doc in multi_response["context"]])

100%|██████████| 10/10 [02:13<00:00, 13.35s/it]


In [20]:
# Dataset for evaluating the multi-query pipeline, already using the generic
# "answer" / "contexts" column names.
response_dataset_advanced_retrieval = Dataset.from_dict({
    "question" : test_questions,
    "answer" : multi_answers,
    "contexts" : multi_contexts,
    "ground_truth" : test_groundtruths
})

response_dataset_advanced_retrieval_df = pd.DataFrame(response_dataset_advanced_retrieval)
# Save the responses that were just generated. Previously the save was
# commented out and the frame was replaced by a CSV from an earlier run, so the
# table shown here could differ from the dataset that is evaluated next (and
# the cell failed when that file did not exist).
response_dataset_advanced_retrieval_df.to_csv(out_fp + "response_dataset_advanced_retrieval_df.csv", index=False)
# To inspect a saved run instead:
# response_dataset_advanced_retrieval_df = pd.read_csv(out_fp + "response_dataset_advanced_retrieval_df.csv")
response_dataset_advanced_retrieval_df

Unnamed: 0,question,answer,contexts,ground_truth
0,What was the outcome of the appeal filed by th...,\n\nAssistant: The appeal was partially affirm...,[complaint asserted that NVIDIA and certain N...,The majority of a three-judge Ninth Circuit pa...
1,How is anti-dilution calculated in terms of ne...,\n\nAssystem: Anti-dilution is calculated by t...,[been anti-dilutive15 40 21 \n(1) Calculate...,Calculated as net income divided by basic weig...
2,What assets are in NVIDIA's financial statemen...,\n\nAssistant: NVIDIA's financial statements l...,[Table of Contents\nNVIDIA Corporation and Su...,NVIDIA's financial statements include assets s...
3,"""How have recent USG export restrictions impac...",\nAssistant: The recent USG export restriction...,"[products, benefiting competitors that offer a...",Recent USG export restrictions have impacted N...
4,"""What's the fair value of NVIDIA's equity secu...",\nAssistant: According to the provided context...,[Table of Contents\napproximately 7 million sh...,The fair value of NVIDIA's equity securities o...
5,"""How are marketable equity securities valued a...","\n\nAssistant: Marketable equity securities, w...",[primarily investments in privately held compa...,Marketable equity securities are valued at fai...
6,"""What countries are being audited for taxes an...",\nAssistant: According to the provided context...,[While we believe that we have adequately prov...,The significant tax jurisdictions currently un...
7,"""How does NVIDIA determine fair value of finan...",\nAssistant: NVIDIA determines the fair value ...,[Table of Contents\nNVIDIA Corporation and Su...,The fair values of our financial assets and li...
8,"""What global training does the Deep Learning I...",\n\nAssistant: The Deep Learning Institute (DL...,"[example, our solution architects work with CS...",The Deep Learning Institute offers in-person a...
9,What are the primary sources of liquidity for ...,\n\nAssistant: The primary sources of liquidit...,[Table of Contents\nLiquidity and Capital Reso...,The primary sources of liquidity for the compa...


In [21]:
# Score the multi-query pipeline with the same metric list as the baselines.
advanced_retrieval_results = evaluate(
    response_dataset_advanced_retrieval,
    metrics,
)

advanced_retrieval_results_df = advanced_retrieval_results.to_pandas()
advanced_results_csv = out_fp + "advanced_retrieval_results_df.csv"
advanced_retrieval_results_df.to_csv(advanced_results_csv, index=False)
# To reuse a saved run instead of re-evaluating:
# advanced_retrieval_results_df = pd.read_csv(out_fp + "advanced_retrieval_results_df.csv")
advanced_retrieval_results_df.head(2)

Evaluating:  62%|██████▏   | 31/50 [00:35<00:52,  2.75s/it]Task exception was never retrieved
future: <Task finished name='Task-2756' coro=<AsyncClient.aclose() done, defined at /home/william.wetzel/venvs/maven_midterm/lib/python3.10/site-packages/httpx/_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "/home/william.wetzel/venvs/maven_midterm/lib/python3.10/site-packages/httpx/_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "/home/william.wetzel/venvs/maven_midterm/lib/python3.10/site-packages/httpx/_transports/default.py", line 385, in aclose
    await self._pool.aclose()
  File "/home/william.wetzel/venvs/maven_midterm/lib/python3.10/site-packages/httpcore/_async/connection_pool.py", line 313, in aclose
    await self._close_connections(closing_connections)
  File "/home/william.wetzel/venvs/maven_midterm/lib/python3.10/site-packages/httpcore/_async/connection_pool.py", line 305, in _close_connectio

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What was the outcome of the appeal filed by th...,\n\nAssistant: The appeal was partially affirm...,[complaint asserted that NVIDIA and certain N...,The majority of a three-judge Ninth Circuit pa...,1.0,0.911314,1.0,1.0,0.52257
1,How is anti-dilution calculated in terms of ne...,\n\nAssystem: Anti-dilution is calculated by t...,[been anti-dilutive15 40 21 \n(1) Calculate...,Calculated as net income divided by basic weig...,,0.924823,1.0,0.755556,0.598197


In [22]:
# Column labels for the three evaluated pipelines, defined once so the
# arithmetic below cannot drift out of sync with the frame's headers.
openai_col = "Baseline OpenAI"
mixtral_col = "Baseline Mixtral"
multi_col = "MultiQueryRetriever with Stuffing Mixtral"

# One two-column (Metric, score) frame per pipeline, built from each RAGAS
# result's (metric name, value) items.
eval_openai = pd.DataFrame(
    list(ragas_eval_results_openai.items()), columns=["Metric", openai_col]
)
eval_mixtral = pd.DataFrame(
    list(ragas_eval_results_mixtral.items()), columns=["Metric", mixtral_col]
)
multi_mixtral = pd.DataFrame(
    list(advanced_retrieval_results.items()), columns=["Metric", multi_col]
)

# Line the three score columns up by metric name.
df_merged = pd.merge(
    eval_openai, eval_mixtral, on="Metric", how="inner", validate="1:1"
)
df_merged = pd.merge(df_merged, multi_mixtral, on="Metric", how="inner", validate="1:1")

# Pairwise score differences.
df_merged["Multi vs OpenAI Baseline"] = df_merged[multi_col] - df_merged[openai_col]
df_merged["Multi vs Mixtral Baseline"] = df_merged[multi_col] - df_merged[mixtral_col]
df_merged["Baseline OpenAI vs. Baseline Mixtral"] = (
    df_merged[openai_col] - df_merged[mixtral_col]
)

# Best-scoring pipeline per metric, and its margin over the multi-query pipeline.
df_merged["winner"] = df_merged[[openai_col, mixtral_col, multi_col]].idxmax(axis=1)
df_merged["winner vs. MultiQueryRetriever with Stuffing Mixtral"] = df_merged.apply(
    lambda row: row[row["winner"]] - row[multi_col],
    axis=1,
)

# Save the table that was just computed. Previously the save was commented out
# and `df_merged` was replaced by a CSV from an earlier run, which discarded
# everything computed above (and failed when the file did not exist).
df_merged.to_csv(out_fp + 'df_merged.csv', index=False)
# To reuse a saved comparison instead of recomputing:
# df_merged = pd.read_csv(out_fp + 'df_merged.csv')
df_merged

Unnamed: 0,Metric,Baseline OpenAI,Baseline Mixtral,MultiQueryRetriever with Stuffing Mixtral,Multi vs OpenAI Baseline,Multi vs Mixtral Baseline,Baseline OpenAI vs. Baseline Mixtral,winner,winner vs. MultiQueryRetriever with Stuffing Mixtral
0,faithfulness,1.0,0.985714,1.0,0.0,0.014286,0.014286,Baseline OpenAI,0.0
1,answer_relevancy,0.940292,0.928846,0.91587,-0.024422,-0.012976,0.011446,Baseline OpenAI,0.024422
2,context_recall,1.0,1.0,1.0,0.0,0.0,0.0,Baseline OpenAI,0.0
3,context_precision,0.897222,0.897222,0.853194,-0.044028,-0.044028,0.0,Baseline OpenAI,0.044028
4,answer_correctness,0.684928,0.551577,0.596732,-0.088196,0.045154,0.133351,Baseline OpenAI,0.088196
