In [1]:
!pip install -U -q langchain langchain-openai langchain_core langchain-community langchainhub openai langchain-qdrant

In [2]:
!pip install -qU ragas

In [3]:
!pip install -qU qdrant-client pymupdf pandas

In [34]:
!pip install -qU langchain_openai langchain_huggingface langchain_core==0.2.38 langchain langchain_community langchain-text-splitters

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llama-index-core 0.10.26 requires nltk<4.0.0,>=3.8.1, which is not installed.
llama-index-legacy 0.9.48 requires nltk<4.0.0,>=3.8.1, which is not installed.
langchain-experimental 0.0.52 requires langchain<0.2.0,>=0.1.8, but you have langchain 0.2.16 which is incompatible.
langchain-experimental 0.0.52 requires langchain-core<0.2.0,>=0.1.24, but you have langchain-core 0.2.38 which is incompatible.


In [1]:
import langchain_core
langchain_core.__version__

'0.2.38'

In [2]:
import os
import openai
from getpass import getpass

openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

## Task 1: Dealing with the Data
Below shows the methods I used to choose the best chunking strategy 

In [21]:
# load PDFs
from langchain_community.document_loaders import PyMuPDFLoader
import uuid

pdf_links = [
    "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf",
    "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"
]

documents = []
for pdf_link in pdf_links:
    loader = PyMuPDFLoader(pdf_link)
    loaded_docs = loader.load()
    documents.extend(loaded_docs)


In [89]:
len(documents)

137

In [90]:
# Created chunks by differet chunk size and by section
from langchain.text_splitter import RecursiveCharacterTextSplitter
def generate_chunk(CHUNK_SIZE,CHUNK_OVERLAP,CHUNK_BY_SECTION):
    if not CHUNK_BY_SECTION:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size = CHUNK_SIZE,
            chunk_overlap = CHUNK_OVERLAP,
            length_function = len,
        )
        return text_splitter.split_documents(documents)
    else:
        text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"], # Prioritize paragraph breaks, then line breaks
        chunk_size=CHUNK_SIZE, 
        chunk_overlap=CHUNK_OVERLAP
        )
        return text_splitter.split_documents(documents)

split_chunks_500 = generate_chunk(500,50,False)
print(len(split_chunks_500))

split_chunks_1000 = generate_chunk(1000,200,False)
print(len(split_chunks_1000))

split_chunks_by_section = generate_chunk(10000,50,True) #the numbers put here doesn't matter
print(len(split_chunks_by_section))



910
512
137


In [61]:
# Assign uuid for each chunk
import uuid

id_set = set()
for document in split_chunks_500:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["uuid"] = id

id_set = set()
for document in split_chunks_1000:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["uuid"] = id

id_set = set()
for document in split_chunks_by_section:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["uuid"] = id

In [74]:
# Let's start from create some sample questions for testing using chunk_size 500
import random
from langchain_openai import ChatOpenAI

qa_sample_documents = random.sample(split_chunks_500, 10)
print(len(qa_sample_documents))


qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)
question_generation_chain = qa_prompt_template | qa_chat_model

10


In [75]:
# Get Question-Context map - The ground truth!
import tqdm
def create_questions(documents, n_questions):
  questions = {}
  relevant_docs = {}
  for document in tqdm.tqdm(documents):
    questions_generated = question_generation_chain.invoke({"context": document.page_content, "n_questions": n_questions})
    for question in questions_generated.content.split("\n"):
      question_id = str(uuid.uuid4())
      questions[question_id] = "".join(question.split(".")[1:]).strip()
      relevant_docs[question_id] = [document.metadata["uuid"]]
  return questions, relevant_docs

qa_questions, qa_relevant_contexts = create_questions(qa_sample_documents, 2)

100%|██████████| 10/10 [00:14<00:00,  1.48s/it]


In [67]:
# Use Qdrant for VDB
from langchain_openai import OpenAIEmbeddings

EMBEDDING_MODEL = "text-embedding-3-small"

embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

def generate_vdb(chunks):
    LOCATION = ":memory:"
    COLLECTION_NAME = "legal data"
    VECTOR_SIZE = 1536

    qdrant_client = QdrantClient(LOCATION)

    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
    )

    qdrant_vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name=COLLECTION_NAME,
        embedding=embeddings,
    )

    qdrant_vector_store.add_documents(chunks)
    return qdrant_vector_store

In [68]:
qdrant_vector_store_500 = generate_vdb(split_chunks_500)
qdrant_vector_store_1000 = generate_vdb(split_chunks_1000)
qdrant_vector_store_by_section = generate_vdb(split_chunks_by_section)

In [82]:
# Create the evaluation function to calculate "is_hit" rate
import pandas as pd
def eva_chunk_size(vdb):
  retriever = vdb.as_retriever(k=5)
  questions = qa_questions
  relevant_docs = qa_relevant_contexts
  #corpus = {qa_item.metadata["uuid"] : qa_item.page_content for qa_item in qa_sample_documents}

  eval_results = []
  for id, question in tqdm.tqdm(questions.items()):
    retrieved_nodes = retriever.invoke(question)
    retrieved_ids = [node.metadata["uuid"] for node in retrieved_nodes]
    expected_id = relevant_docs[id][0]
    is_hit = expected_id in retrieved_ids
    eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})
  return pd.DataFrame(eval_results)["is_hit"].mean()


In [83]:
print("hit_freq for chunk size 500: ", eva_chunk_size(qdrant_vector_store_500))

100%|██████████| 20/20 [00:03<00:00,  5.39it/s]

hit_freq for chunk size 500:  0.8





In [86]:
# Do the same thing for chunk size 1000
qa_sample_documents = random.sample(split_chunks_1000, 10)
print(len(qa_sample_documents))

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)
question_generation_chain = qa_prompt_template | qa_chat_model

qa_questions, qa_relevant_contexts = create_questions(qa_sample_documents, 2)
print("hit_freq for chunk size 1000: ", eva_chunk_size(qdrant_vector_store_1000))

10


100%|██████████| 10/10 [00:14<00:00,  1.43s/it]
100%|██████████| 20/20 [00:03<00:00,  5.69it/s]

hit_freq for chunk size 1000:  0.85





In [88]:
# Do the same thing for chunk by section
qa_sample_documents = random.sample(split_chunks_by_section, 10)
print(len(qa_sample_documents))

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)
question_generation_chain = qa_prompt_template | qa_chat_model

qa_questions, qa_relevant_contexts = create_questions(qa_sample_documents, 2)
print("hit_freq for chunk by section: ", eva_chunk_size(qdrant_vector_store_by_section))

10


100%|██████████| 10/10 [00:17<00:00,  1.75s/it]
100%|██████████| 20/20 [00:03<00:00,  5.84it/s]

hit_freq for chunk by section:  0.6





## Task 3: Creating a Golden Test Data Set

In [5]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

num_qa_pairs = 20

testset = generator.generate_with_langchain_docs(split_chunks_750, num_qa_pairs, distributions)
testset_df = testset.to_pandas()
testset_df.to_csv("testset.csv")

embedding nodes:   0%|          | 0/1258 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/20 [00:00<?, ?it/s]

In [6]:
testset_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How can adversarial role-playing exercises hel...,"[Violent, or Hateful Content; \nObscene, Degra...",Adversarial role-playing exercises can help id...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
1,Who is Karen Levy and what is her role in disc...,"[•\nKaren Levy, Assistant Professor, Departmen...",Karen Levy is an Assistant Professor in the De...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
2,How should organizations establish minimum thr...,"[Dangerous, Violent, or Hateful \nContent; CBR...",Organizations should establish minimum thresho...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
3,What is the significance of algorithmic discri...,[healthcare clinical algorithms that are used ...,Algorithmic discrimination protections in heal...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
4,How can risks be re-evaluated when adapting GA...,[models. \nValue Chain and Component \nIntegra...,Risks can be re-evaluated when adapting GAI mo...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
5,How can information security be maintained for...,"[the attack chain, including informing attacke...",Information security for GAI models and system...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
6,How can label errors impact the stability and ...,"[For example, test datasets commonly used to b...",Label errors in test datasets used to benchmar...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
7,Why is transparency important in predictive po...,[NOTICE & \nEXPLANATION \nWHY THIS PRINCIPLE I...,Transparency is important in predictive polici...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
8,What are some innovative solutions provided by...,"[requirements on drivers, such as slowing down...",Innovative solutions provided by industry to m...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
9,What are the risks associated with model extra...,[MS-2.10-001 \nConduct AI red-teaming to asses...,Outputting of training data samples in AI red-...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True


In [7]:
from langchain.prompts import ChatPromptTemplate

template = """
Use the provide context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, response with "I don't know".

Question:
{question}

Context:
{context}
"""

prompt = ChatPromptTemplate.from_template(template)

In [14]:
from operator import itemgetter

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

primary_qa_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
retriever = qdrant_vector_store_1000.as_retriever()

retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

In [15]:
question = "What measures should be taken to review training data for CBRN information and intellectual property?"

result = retrieval_augmented_qa_chain.invoke({"question" : question})

print(result["response"].content)

To review training data for CBRN information and intellectual property, the following measures should be taken:

1. **Document Training Data Curation Policies**: Establish and document policies for the curation of training data in accordance with applicable laws and regulations.

2. **Establish Data Collection and Retention Policies**: Create policies that address the collection, retention, and minimum quality of data, considering risks such as the disclosure of inappropriate CBRN information, use of illegal or dangerous content, and potential biases in training data.

3. **Implement Data Review Measures**: Take reasonable measures to review training data specifically for CBRN information and intellectual property, and remove any data that is deemed inappropriate.

4. **Prevent and Flag Inappropriate Outputs**: Implement measures to prevent, flag, or take action in response to outputs that reproduce particular training data, such as plagiarized or trademarked content.

5. **Conduct Dil

In [17]:
test_questions = testset_df["question"].values.tolist()
test_groundtruths = testset_df["ground_truth"].values.tolist()

answers = []
contexts = []

for question in test_questions:
  response = retrieval_augmented_qa_chain.invoke({"question" : question})
  answers.append(response["response"].content)
  contexts.append([context.page_content for context in response["context"]])

from datasets import Dataset

response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

response_dataset[0]

{'question': 'How can adversarial role-playing exercises help identify anomalous failure modes related to information integrity in GAI systems?',
 'answer': "Adversarial role-playing exercises can help identify anomalous failure modes related to information integrity in GAI systems by conducting GAI red-teaming or chaos testing. These exercises simulate potential misuse scenarios and unintended outputs, allowing evaluators to observe how the system behaves under various adversarial conditions. This approach can reveal unforeseen issues that may not be apparent in controlled testing environments, thereby enhancing the understanding of the system's vulnerabilities and the integrity of the information it processes.",
 'contexts': ["understand potential misuse scenarios and unintended outputs. \nInformation Integrity; Information \nSecurity \nMS-4.2-002 \nEvaluate GAI system performance in real-world scenarios to observe its \nbehavior in practical environments and reveal issues that might

In [18]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

In [19]:
results

{'faithfulness': 0.8481, 'answer_relevancy': 0.9196, 'context_recall': 0.8833, 'context_precision': 0.8819, 'answer_correctness': 0.6919}

In [20]:
results_df = results.to_pandas()
results_df

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How can adversarial role-playing exercises hel...,[understand potential misuse scenarios and uni...,Adversarial role-playing exercises can help id...,Adversarial role-playing exercises can help id...,0.875,0.946942,1.0,1.0,0.816698
1,Who is Karen Levy and what is her role in disc...,"[•\nElana Zeide, Assistant Professor, Universi...",I don't know.,Karen Levy is an Assistant Professor in the De...,0.0,0.0,0.0,0.0,0.187297
2,How should organizations establish minimum thr...,[Content; Value Chain and \nComponent Integrat...,Organizations should establish minimum thresho...,Organizations should establish minimum thresho...,0.96875,0.925227,1.0,1.0,0.528853
3,What is the significance of algorithmic discri...,[•\nAn algorithm designed to identify patients...,The significance of algorithmic discrimination...,Algorithmic discrimination protections in heal...,0.2,0.999999,1.0,1.0,0.894475
4,How can risks be re-evaluated when adapting GA...,"[context, or may be more speculative and there...",Risks can be re-evaluated when adapting GAI mo...,Risks can be re-evaluated when adapting GAI mo...,1.0,0.962327,0.5,1.0,0.562881
5,How can information security be maintained for...,"[vulnerabilities in systems (hardware, softwar...",Information security for Generative Artificial...,Information security for GAI models and system...,1.0,0.921816,1.0,1.0,0.637209
6,How can label errors impact the stability and ...,"[training data, which may be too large for hum...",Label errors in test datasets used to benchmar...,Label errors in test datasets used to benchmar...,0.875,0.95296,1.0,1.0,0.563043
7,Why is transparency important in predictive po...,[NOTICE & \nEXPLANATION \nWHY THIS PRINCIPLE I...,Transparency is important in predictive polici...,Transparency is important in predictive polici...,1.0,1.0,0.666667,1.0,0.766444
8,What are some innovative solutions provided by...,[The National Highway Traffic Safety Administr...,Some innovative solutions provided by industry...,Innovative solutions provided by industry to m...,1.0,0.973194,1.0,1.0,0.920652
9,What are the risks associated with model extra...,"[decision making, policy and procedural update...",The risks associated with model extraction in ...,Outputting of training data samples in AI red-...,1.0,1.0,1.0,0.638889,0.520126


All that's left to do is call "evaluate" and away we go!

In [26]:
results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/95 [00:00<?, ?it/s]

In [27]:
results

{'faithfulness': 0.7118, 'answer_relevancy': 0.8747, 'context_recall': 0.6360, 'context_precision': 0.7032, 'answer_correctness': 0.5630}

In [30]:
results_df['faithfulness'].mean()

0.7118421052631579

In [29]:
results_df = results.to_pandas()
results_df

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How does the tendency to avoid inconsistency c...,The tendency to avoid inconsistency contribute...,[Five: Inconsistency-Avoidance Tendency\n[Peop...,The tendency to avoid inconsistency contribute...,1.0,0.943772,0.5,0.805556,0.320374
1,What are some of the challenges faced by start...,Some challenges faced by startups in establish...,[ied and determined. Sales calls get made. The...,"In a startup, it is easy for the code not to g...",1.0,0.991209,1.0,1.0,0.600231
2,What factors should be considered when decidin...,I don't know.,[including your formal education. So I will st...,The answer to given question is not present in...,0.0,0.0,1.0,0.0,0.195205
3,What should be valued when evaluating candidat...,"When evaluating candidates for a startup, it i...",[priate for your particular startup.\nWith a w...,The answer to given question is not present in...,1.0,0.974912,1.0,0.0,0.181087
4,What are the consequences of not raising enoug...,Not raising enough money risks the survival of...,[Here’s why you shouldn’t do that:\nWhat are t...,Not raising enough money risks the survival of...,1.0,0.991354,0.333333,0.833333,0.503574
5,How does Structured Procrastination suggest us...,Structured Procrastination suggests that inste...,[standing.)\nThe gist of Structured Procrastin...,Structured Procrastination suggests that inste...,1.0,0.957711,1.0,0.916667,0.996443
6,What analogy is used to describe the layers of...,The analogy used to describe the layers of ris...,[as if it’s an onion. Just like you peel an on...,The analogy used to describe the layers of ris...,1.0,1.0,1.0,0.75,0.891681
7,How can Structured Procrastination be used to ...,Structured Procrastination can be used to one'...,[standing.)\nThe gist of Structured Procrastin...,Structured Procrastination suggests that inste...,1.0,0.987979,0.5,0.805556,0.583912
8,How is the quality of a startup's product defi...,The quality of a startup's product in the tech...,[The quality of a startup’s pr\nproduct\noduct...,The quality of a startup's product in the tech...,1.0,0.993645,0.0,1.0,0.65102
9,What role can a campus computer lab play in he...,A campus computer lab can play a significant r...,[What should I do while I’m in school?\nI’m a ...,A campus computer lab can play a role in helpi...,0.75,0.979579,0.333333,0.75,0.877352


## Task : Testing OpenAI's Claim

Now that we've seen how our retriever can impact the performance of our RAG pipeline - let's see how changing our embedding model impacts performance.

####🏗️ Activity #1:

Please provide markdown, or code comments, to explain which each of the following steps are doing!

#### apply new embedding using model "text-embedding-3-small"

In [31]:
te3_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

#### set up vectordb using new embedding

In [32]:
qdrant_client.create_collection(
    collection_name=COLLECTION_NAME+"TE3",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

qdrant_vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=COLLECTION_NAME+"TE3",
    embedding=te3_embeddings,
)

qdrant_vector_store.add_documents(documents)

['99e63225c64c40faa4aa278d4e57bf1e',
 'a813b61eb9534e2fbf8a8885f86e6273',
 '4b89ea46dbb8445aa21c816723bb5b20',
 'fa5cebbc23554bf4a79a2a31e4973faa',
 'f4d7fce84de0401fac85961c34c46969',
 '4347ae0357244d4498d9f67257ef1c61',
 '8d3071d8a3724c97bdd2367318da45f6',
 '0468c6ae045b45c3b33a39b723c11383',
 '7882489e032c4dfab59ebc24bf45ee7d',
 'fbf52383b44d4b75a1145840bbd49df4',
 '29684c870c624e26b4661531350ac5b1',
 '45b17d68efc741ed9c2a7c657bb52130',
 '4421fab5cb1e4f81bdff46a6877cf58e',
 '0b29a81bf613467eb8d58512a52b6bb0',
 '2a6d3c1b7a8b4cbc8fe3fe78896be06f',
 '6c7d3888924847729c96780886db321e',
 '1183b0bd52074372b3e4790971f4f13f',
 '8487cf2f48a848d181b5bc73b19e5dca',
 'd33bfa12441d4793975e7a044df303f2',
 'f26f5afad04940f7be0e98da6d175851',
 '1a27d49c435540099895c2b02e1f3311',
 '37c6cde14b9b46598dea6d7bfa7c63e6',
 '74928a5ea14748bea73e28e844da3603',
 '895326ba0b354ccab3311cd1cd508eea',
 'b55ad28eb2a3456d96db2feef4de8700',
 '3eb1d7a976cc41578dad72b9be8a2583',
 'b9436a8ce698409bbcdfd6d8e4e8567a',
 

#### set up retriever

In [33]:
te3_retriever = qdrant_vector_store.as_retriever()

#### LCEL equivalent. It creates a document processing chain which is responsible for processing documents retrieved during the retrieval phase and combining them for further question-answering or summarization tasks

In [35]:
from langchain.chains.combine_documents import create_stuff_documents_chain

document_chain = create_stuff_documents_chain(primary_qa_llm, retrieval_qa_prompt)

#### LCEL equivalent. it creates a retrieval chain which can retrieve relevant documents from a document store based on a query and pass those documents into the previously created document chain for further processing or answering

In [36]:
from langchain.chains import create_retrieval_chain

te3_retrieval_chain = create_retrieval_chain(te3_retriever, document_chain)

#### loop thourgh the questions and get the corresponded answer and context for later evaluation

In [37]:
answers = []
contexts = []

for question in test_questions:
  response = te3_retrieval_chain.invoke({"input" : question})
  answers.append(response["answer"])
  contexts.append([context.page_content for context in response["context"]])

#### wrap the information in a Hugging Face dataset for use in the Ragas library

In [38]:
te3_response_dataset_advanced_retrieval = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

#### run evaluation

In [39]:
te3_advanced_retrieval_results = evaluate(te3_response_dataset_advanced_retrieval, metrics)

Evaluating:   0%|          | 0/95 [00:00<?, ?it/s]

#### get evaluation results

In [40]:
te3_advanced_retrieval_results

{'faithfulness': 0.7975, 'answer_relevancy': 0.9703, 'context_recall': 0.6228, 'context_precision': 0.6243, 'answer_correctness': 0.6278}

#### merge it with previous evaluation to see changes

In [41]:
df_baseline = pd.DataFrame(list(results.items()), columns=['Metric', 'ADA'])
df_comparison = pd.DataFrame(list(te3_advanced_retrieval_results.items()), columns=['Metric', 'TE3'])

df_merged = pd.merge(df_baseline, df_comparison, on='Metric')

df_merged['Baseline -> TE3'] = df_merged['TE3'] - df_merged['ADA']

df_merged

Unnamed: 0,Metric,ADA,TE3,Baseline -> TE3
0,faithfulness,0.711842,0.797475,0.085633
1,answer_relevancy,0.874734,0.970282,0.095548
2,context_recall,0.635965,0.622807,-0.013158
3,context_precision,0.703216,0.624269,-0.078947
4,answer_correctness,0.563029,0.627816,0.064787


####❓ Question #3:

Do you think, in your opinion, `text-embedding-3-small` is significantly better than `ada`?

yes I mean it increaed faithfulness, answer_relevancy, and answer_correctness. 
Faithfulness and answer_relevancy are tie to generation part, it means TE3 is better than ADA in terms of generating more relevant and correct answers.

Answer correctness measures whether the generated answer is factually/semantically accurate. TE3 shows a reasonable improvement too.

TE3 falls behind ADA in context recall and context precision, which might indicate that while TE3 generates better answers, it sacrifices some efficiency in retrieving the optimal context.

## Task 5: Selecting an Advanced Retriever and Evaluating

#### 🏗️ Activity #2

While the changes that occured due to modifying the embedding model were desirable - you're now tasked with improving `context_recall`, or `context_precision` (or both!).

You'll follow these steps:

1. Reason about this list of Advanced Retrieval methods:
  - [Contextual Compression (Reranker)](https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/contextual_compression/)
  - [MultiQueryRetriever](https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/MultiQueryRetriever/)
  - [Parent Document Retriever](https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/parent_document_retriever/)
2. Select the method you think will be the most performant.
3. Implement that method.
4. Create a LCEL chain that utlizes the new Retriever method.
5. Evaluate this LCEL and compare to the TE3 results.

> NOTE: We will spend more time in Session 14 diving into advanced retrieval methods, this activity is only to serve as a basic introduction to the idea of component-wise improvements and how they might impact metrics.

In [42]:
from IPython.display import display, Markdown

def pretty_print(message: str) -> None:
    display(Markdown(f"```markdown\n{message}\n```"))

In [43]:
question = test_questions[0]
print(question)
response = te3_retrieval_chain.invoke({"input" : question})
pretty_print([context.page_content for context in response["context"]])
pretty_print(response["answer"])

How does the tendency to avoid inconsistency contribute to people being reluctant to change?


```markdown
['Five: Inconsistency-Avoidance Tendency\n[People are] reluctant to change, which is a form of inconsistency\navoidance. We see this in all human habits, constructive and', 'less brain-blocked by its previous conclusions…\nOne corollary of Inconsistency-Avoidance Tendency is that a per-\nson making big sacriXces in the course of assuming a new identity', '[T]ending to be maintained in place by the anti-change tendency\nof the brain are one’s previous conclusions, human loyalties, repu-\ntational identity, commitments…', 'tational identity, commitments…\nIt is easy to see that a quickly reached conclusion, triggered by\nDoubt-Avoidance Tendency, when combined with a tendency to']
```

```markdown
The tendency to avoid inconsistency contributes to people's reluctance to change by creating a resistance to altering established habits, beliefs, and identities. This inconsistency-avoidance tendency leads individuals to cling to their previous conclusions, loyalties, and commitments, making it difficult for them to embrace new ideas or identities. As a result, even when faced with the need for change, people may find it challenging to let go of their past and adapt to new circumstances, as doing so would require them to confront and potentially disrupt their established sense of self and stability.
```

In [55]:
# from langchain.retrievers import ContextualCompressionRetriever
# from langchain.retrievers.document_compressors import LLMChainExtractor
# from langchain_openai import OpenAI

# primary_qa_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
# compressor = LLMChainExtractor.from_llm(primary_qa_llm )
# compression_retriever = ContextualCompressionRetriever(
#     base_compressor=compressor, base_retriever=te3_retriever
# )

# compressed_docs = compression_retriever.invoke(question)

In [51]:
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor


redundant_filter = EmbeddingsRedundantFilter(embeddings=te3_embeddings)
relevant_filter = EmbeddingsFilter(embeddings=te3_embeddings, similarity_threshold=0.5)
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[redundant_filter, relevant_filter]
)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor, base_retriever=te3_retriever
)

compressed_docs = compression_retriever.invoke(question)

In [52]:
pretty_print([context.page_content for context in compressed_docs])

```markdown
['Five: Inconsistency-Avoidance Tendency\n[People are] reluctant to change, which is a form of inconsistency\navoidance. We see this in all human habits, constructive and', 'less brain-blocked by its previous conclusions…\nOne corollary of Inconsistency-Avoidance Tendency is that a per-\nson making big sacriXces in the course of assuming a new identity', '[T]ending to be maintained in place by the anti-change tendency\nof the brain are one’s previous conclusions, human loyalties, repu-\ntational identity, commitments…']
```

In [55]:
re_ranker_retrieval_chain = create_retrieval_chain(compression_retriever, document_chain)

In [56]:
question = test_questions[0]
print(question)
response = re_ranker_retrieval_chain.invoke({"input" : question})
pretty_print([context.page_content for context in response["context"]])
pretty_print(response["answer"])

How does the tendency to avoid inconsistency contribute to people being reluctant to change?


```markdown
['Five: Inconsistency-Avoidance Tendency\n[People are] reluctant to change, which is a form of inconsistency\navoidance. We see this in all human habits, constructive and', 'less brain-blocked by its previous conclusions…\nOne corollary of Inconsistency-Avoidance Tendency is that a per-\nson making big sacriXces in the course of assuming a new identity', '[T]ending to be maintained in place by the anti-change tendency\nof the brain are one’s previous conclusions, human loyalties, repu-\ntational identity, commitments…']
```

```markdown
The tendency to avoid inconsistency contributes to people's reluctance to change by creating a resistance to altering established habits, beliefs, and identities. This avoidance is rooted in the brain's inclination to maintain previous conclusions and commitments, which can lead to a reluctance to embrace new ideas or identities. As individuals make significant sacrifices to adopt a new identity, they may find it challenging to let go of their past loyalties and reputational identities, further reinforcing their resistance to change.
```

In [57]:
answers = []
contexts = []

for question in test_questions:
  response = re_ranker_retrieval_chain.invoke({"input" : question})
  answers.append(response["answer"])
  contexts.append([context.page_content for context in response["context"]])

In [58]:
re_ranker_response_dataset_advanced_retrieval = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [59]:
re_ranker_advanced_retrieval_results = evaluate(re_ranker_response_dataset_advanced_retrieval, metrics)

Evaluating:   0%|          | 0/95 [00:00<?, ?it/s]

Exception raised in Job[24]: TimeoutError()


In [60]:
re_ranker_advanced_retrieval_results

{'faithfulness': 0.7130, 'answer_relevancy': 0.9190, 'context_recall': 0.5263, 'context_precision': 0.6287, 'answer_correctness': 0.6071}

In [61]:
df_baseline = pd.DataFrame(list(results.items()), columns=['Metric', 'ADA'])
df_t3 = pd.DataFrame(list(te3_advanced_retrieval_results.items()), columns=['Metric', 'TE3'])
df_re_ranker = pd.DataFrame(list(re_ranker_advanced_retrieval_results.items()), columns=['Metric', 'RERANKER'])


df_merged1 = pd.merge(df_baseline, df_t3, on='Metric')
df_merged2 = pd.merge(df_merged1, df_re_ranker, on='Metric')

df_merged2['Baseline -> TE3'] = df_merged2['TE3'] - df_merged2['ADA']
df_merged2['TE3 -> RERANKER'] = df_merged2['RERANKER'] - df_merged2['TE3']

df_merged2

Unnamed: 0,Metric,ADA,TE3,RERANKER,Baseline -> TE3,TE3 -> RERANKER
0,faithfulness,0.711842,0.797475,0.712991,0.085633,-0.084484
1,answer_relevancy,0.874734,0.970282,0.918967,0.095548,-0.051315
2,context_recall,0.635965,0.622807,0.526316,-0.013158,-0.096491
3,context_precision,0.703216,0.624269,0.628655,-0.078947,0.004386
4,answer_correctness,0.563029,0.627816,0.60705,0.064787,-0.020766


#### 🚧 BONUS CHALLENGE 🚧

> NOTE: Completing this challenge will provide full marks on the assignment, regardless of the complete of the notebook. You do not need to complete this in the notebook for full marks.

##### **MINIMUM REQUIREMENTS**:

1. Baseline `LCEL RAG` Application using `NAIVE RETRIEVAL`
2. Baseline Evaluation using `RAGAS METRICS`
  - [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html)
  - [Answer Relevancy](https://docs.ragas.io/en/stable/concepts/metrics/answer_relevance.html)
  - [Context Precision](https://docs.ragas.io/en/stable/concepts/metrics/context_precision.html)
  - [Context Recall](https://docs.ragas.io/en/stable/concepts/metrics/context_recall.html)
  - [Answer Correctness](https://docs.ragas.io/en/stable/concepts/metrics/answer_correctness.html)
3. Implement a `SEMANTIC CHUNKING STRATEGY`.
4. Create an `LCEL RAG` Application using `SEMANTIC CHUNKING` with `NAIVE RETRIEVAL`.
5. Compare and contrast results.

##### **SEMANTIC CHUNKING REQUIREMENTS**:

Chunk semantically similar (based on designed threshold) sentences, and then paragraphs, greedily, up to a maximum chunk size. Minimum chunk size is a single sentence.

Have fun!