# BONUS CHALLENGE USING SEMANTIC CHUNKING

In [1]:
!pip install -U -q langchain langchain-openai langchain_core langchain-community langchainhub openai langchain-qdrant
!pip install -qU ragas
!pip install -qU qdrant-client pymupdf pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
import openai
from getpass import getpass

# Ask for the key interactively (it is never written into the notebook) and
# store the same value in the OPENAI_API_KEY environment variable and on the
# `openai` module.
os.environ["OPENAI_API_KEY"] = openai.api_key = getpass("Please provide your OpenAI Key: ")

In [3]:
from operator import itemgetter
import pandas as pd

from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain_community.document_loaders import PyMuPDFLoader

from datasets import Dataset

from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness, context_recall, context_precision
from ragas.testset.evolutions import simple, reasoning, multi_context

from myutils.rag_pipeline_utils import SimpleTextSplitter, SemanticTextSplitter, VectorStore, AdvancedRetriever
from myutils.ragas_pipeline import RagasPipeline

  from .autonotebook import tqdm as notebook_tqdm


#### 🚧 BONUS CHALLENGE 🚧

> NOTE: Completing this challenge will provide full marks on the assignment, regardless of the completion of the rest of the notebook. You do not need to complete this in the notebook for full marks.

##### **MINIMUM REQUIREMENTS**:

1. Baseline `LCEL RAG` Application using `NAIVE RETRIEVAL`
2. Baseline Evaluation using `RAGAS METRICS`
  - [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html)
  - [Answer Relevancy](https://docs.ragas.io/en/stable/concepts/metrics/answer_relevance.html)
  - [Context Precision](https://docs.ragas.io/en/stable/concepts/metrics/context_precision.html)
  - [Context Recall](https://docs.ragas.io/en/stable/concepts/metrics/context_recall.html)
  - [Answer Correctness](https://docs.ragas.io/en/stable/concepts/metrics/answer_correctness.html)
3. Implement a `SEMANTIC CHUNKING STRATEGY`.
4. Create an `LCEL RAG` Application using `SEMANTIC CHUNKING` with `NAIVE RETRIEVAL`.
5. Compare and contrast results.

##### **SEMANTIC CHUNKING REQUIREMENTS**:

Chunk semantically similar (based on designed threshold) sentences, and then paragraphs, greedily, up to a maximum chunk size. Minimum chunk size is a single sentence.

Have fun!

## STEP 0 - Load the Corpus

In [4]:
# load docs
# URL of the source PDF; it is handed directly to the loader below.
PDF_LINK = "https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf"
loader = PyMuPDFLoader(PDF_LINK)
# `documents` is the corpus reused later by the RAGAS test-set pipeline and by
# both text splitters.
# NOTE(review): presumably one Document per PDF page -- confirm against the loader docs.
documents = loader.load()

In [5]:
# Sanity check on the load: number of documents returned (195 in the recorded run).
len(documents)

195

## STEP 1 - Set Up and Run the RAGAS Pipeline to Get Test Questions

#### Set up RAGAS Parameters

In [6]:
# --- Models ------------------------------------------------------------------
# Generator and critic chat models, plus the embedding model, handed to RagasPipeline.
ragas_generator_llm_model = "gpt-3.5-turbo"
ragas_critic_llm_model = "gpt-4o-mini"
ragas_openai_embeddings_model = "text-embedding-3-small"

# --- Chunking used inside the RAGAS pipeline ---------------------------------
ragas_chunk_size, ragas_chunk_overlap = 600, 50

# --- Test-set size and composition -------------------------------------------
# Lower the number of Q&A pairs if you run into rate limits.
ragas_number_of_qa_pairs = 20

# Desired share of each question type in the generated test set.
distributions = {simple: 0.5, multi_context: 0.4, reasoning: 0.1}

# --- Evaluation ---------------------------------------------------------------
# Metrics used for every RAG pipeline evaluated below.
ragas_metrics = [faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness]

# --- Persistence --------------------------------------------------------------
# CSV file that keeps the generated Q&A pairs on disk between runs.
ragas_testset_filename = "testset_from_ragas_run.csv"

# Set to True to regenerate the test set in this run. Generation is slow and
# costs money -- check the cost and time estimate below before enabling it.
generate_testset_now = False

#### Instantiate RAGAS Pipeline, Run Pipeline, Generate Test Questions
> NOTE: 🛑 Running this cell as presented will incur a charge of ~$3USD from OpenAI usage. Most of this cost is produced by the Synthetic Data Generation step. 🛑

Time to run the cell: approx. 15 minutes on my local machine

In [7]:
# Configure the RAGAS pipeline from the parameters defined above. The object is
# built unconditionally so that `ragas_pipeline` is always defined.
ragas_pipeline = RagasPipeline(
    generator_llm_model=ragas_generator_llm_model,
    critic_llm_model=ragas_critic_llm_model,
    embedding_model=ragas_openai_embeddings_model,
    number_of_qa_pairs=ragas_number_of_qa_pairs,
    chunk_size=ragas_chunk_size,
    chunk_overlap=ragas_chunk_overlap,
    documents=documents,
    distributions=distributions,
)

# Regenerate the synthetic test set only when explicitly requested: this is the
# slow, paid step. When the flag is off, nothing is generated or written here.
if generate_testset_now:
    testset_df = ragas_pipeline.generate_testset()
    # index=False keeps the row index out of the file, so reading the CSV back
    # does not yield a spurious "Unnamed: 0" column.
    # NOTE(review): assumes generate_testset() returns a pandas DataFrame -- confirm.
    testset_df.to_csv(ragas_testset_filename, index=False)

#### Load RAGAS Q&A from disk

In [8]:
# Read the persisted RAGAS test set and pull out, as plain Python lists, the
# two columns the RAG runs need.
test_df = pd.read_csv(ragas_testset_filename)
test_questions, test_groundtruths = (
    test_df[column].values.tolist() for column in ("question", "ground_truth")
)

## STEP 2 - Set Up and Run Different RAG Pipelines to be Compared

### 0 - Set Up Common Objects and Parameters

In [9]:
# Chunking parameters handed to the simple text splitter.
chunk_size, chunk_overlap = 200, 50

# Embedding model shared by both pipelines, and the vector size passed to the
# vector store. NOTE(review): 1536 must match the output dimension of the
# embedding model chosen here.
embeddings_dimension = 1536
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Chat model that writes the answers; temperature is set to 0.
primary_qa_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

In [10]:
# Prompt for the Q&A step: the model is told to answer from the supplied
# context and to say it doesn't know when the context is insufficient.
# `{question}` and `{context}` are the two inputs the retrieval chain fills in.
template = """
Use the provided context to answer the following question.
If you can't answer the question based on the context, say you don't know.

Question:
{question}

Context:
{context}
"""

# Wrap the raw string in a chat prompt template so it can be piped into the LLM.
prompt = ChatPromptTemplate.from_template(template=template)

#### REPEAT STEPS 1 through 9 BELOW FOR EACH CASE

1.  Load document(s) into lists of text
2.  Set up Text Splitter - note this will either be recursive text splitter or Semantic Chunking in this case
3.  Define embeddings - OpenAIEmbeddings()
4.  set up vector store, load docs and index
5.  set up chunk retriever
6.  Set up retrieval chain using LCEL
7.  run chain by looping over each question in test list
8.  save q, a, and c in HF dataset object
9.  run RAGAS eval using stated metrics

### 1 - Define A Helper Function

In [11]:
def run_and_eval_rag_pipeline(location, collection_name, embed_dim, text_splits, embeddings,
                              prompt, qa_llm, metrics, questions=None, ground_truths=None):
    """
    Build a RAG pipeline over ``text_splits``, answer every test question with
    it, and score the answers with RAGAS.

    Args:
        location: passed to ``VectorStore`` as ``location``.
        collection_name: passed to ``VectorStore`` as ``name``.
        embed_dim: passed to ``VectorStore`` as ``size``.
        text_splits: chunked documents to index in the vector store.
        embeddings: embedding model passed to ``VectorStore``.
        prompt: prompt template taking ``question`` and ``context`` inputs.
        qa_llm: chat model that produces the answer.
        metrics: RAGAS metrics handed to ``evaluate``.
        questions: optional list of questions to run. Defaults to the
            notebook-level ``test_questions``.
        ground_truths: optional list of reference answers aligned with
            ``questions``. Defaults to the notebook-level ``test_groundtruths``.

    Returns:
        tuple: ``(results, results_df)`` -- the object returned by ``evaluate``
        and the result of calling ``to_pandas()`` on it.

    Raises:
        ValueError: if ``questions`` and ``ground_truths`` differ in length.
    """
    # Fall back to the notebook globals so existing calls keep working, while
    # letting callers pass their own test set explicitly.
    questions = list(test_questions if questions is None else questions)
    ground_truths = list(test_groundtruths if ground_truths is None else ground_truths)

    # Fail before any indexing or LLM calls if the test set is misaligned.
    if len(questions) != len(ground_truths):
        raise ValueError(
            f"questions ({len(questions)}) and ground_truths ({len(ground_truths)}) "
            "must have the same length"
        )

    # vector store
    vs = VectorStore(location=location,
                     name=collection_name,
                     documents=text_splits,
                     size=embed_dim,
                     embedding=embeddings)

    qdvs = vs.set_up_vectorstore().qdrant_vector_store

    # retriever
    retriever = AdvancedRetriever(vectorstore=qdvs).set_up_simple_retriever()

    # q&a chain using LCEL: retrieve context for the question, then return both
    # the LLM response and the retrieved context so the context can be evaluated.
    retrieval_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        # NOTE(review): this step appears to re-assign "context" unchanged; kept as-is.
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | qa_llm, "context": itemgetter("context")}
    )

    # run RAG pipeline, one question at a time
    answers = []
    contexts = []

    for question in questions:
        response = retrieval_chain.invoke({"question": question})
        answers.append(response["response"].content)
        contexts.append([context.page_content for context in response["context"]])

    # Save RAG pipeline results to HF Dataset object
    response_dataset = Dataset.from_dict({
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truths,
    })

    # Run RAGAS Evaluation - using metrics
    results = evaluate(response_dataset, metrics)

    # save results to df
    results_df = results.to_pandas()

    return results, results_df

###  STEP 2a. Naive (aka SIMPLE) Retrieval Using LCEL Chain

In [12]:
# Baseline case: chunks from the simple text splitter, indexed in their own collection.
simple_collection_name = "PMarcaBlogs_simple"

simple_text_splitter = SimpleTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    documents=documents,
)
simple_text_splits = simple_text_splitter.split_text()

# Index the chunks, answer the test questions and score the answers with RAGAS.
simple_results, simple_results_df = run_and_eval_rag_pipeline(
    location=":memory:",
    collection_name=simple_collection_name,
    embed_dim=embeddings_dimension,
    text_splits=simple_text_splits,
    embeddings=embeddings,
    prompt=prompt,
    qa_llm=primary_qa_llm,
    metrics=ragas_metrics,
)

Evaluating: 100%|██████████| 100/100 [00:35<00:00,  2.82it/s]


###  STEP 2b. RAG Pipeline With Semantic Chunking 

In [13]:
# Semantic case: chunks from the semantic text splitter, indexed in their own collection.
sem_collection_name = "PMarcaBlogs_semantic"

sem_text_splitter = SemanticTextSplitter(
    llm_embeddings=embeddings,
    threshold_type="interquartile",
    documents=documents,
)
sem_text_splits = sem_text_splitter.split_text()

# Same helper and settings as the baseline run; only the text splits differ.
sem_results, sem_results_df = run_and_eval_rag_pipeline(
    location=":memory:",
    collection_name=sem_collection_name,
    embed_dim=embeddings_dimension,
    text_splits=sem_text_splits,
    embeddings=embeddings,
    prompt=prompt,
    qa_llm=primary_qa_llm,
    metrics=ragas_metrics,
)

loaded 195 to be split 
returning docs split into 329 chunks 


Evaluating: 100%|██████████| 100/100 [00:51<00:00,  1.95it/s]


### STEP 2c - Compare The Results

In [14]:
# One (metric, score) row per RAGAS metric for each pipeline.
df_baseline, df_comparison = (
    pd.DataFrame(list(scores.items()), columns=["Metric", label])
    for scores, label in (
        (simple_results, "SimpleChunking"),
        (sem_results, "SemanticChunking"),
    )
)

# Join on the metric name and add the change from baseline to semantic chunking.
df_merged = df_baseline.merge(df_comparison, on="Metric").assign(
    **{
        "Baseline -> SemanticChunking":
            lambda frame: frame["SemanticChunking"] - frame["SimpleChunking"]
    }
)

df_merged

Unnamed: 0,Metric,SimpleChunking,SemanticChunking,Baseline -> SemanticChunking
0,faithfulness,0.62445,0.8978,0.273349
1,answer_relevancy,0.739658,0.935363,0.195705
2,context_recall,0.681548,1.0,0.318452
3,context_precision,0.604167,0.658333,0.054167
4,answer_correctness,0.537187,0.624024,0.086836


## Analysis of Results

1.  The results with `Semantic Chunking` seem to be dramatically improved in `context_recall` (a `RETRIEVAL`-focused metric) and in `answer_relevancy`; the other retrieval metric, `context_precision`, improved only slightly.

2.  Even in measures like `faithfulness`, which primarily assess the generation part of the pipeline, the results seem quite improved.

3.  Given the other results, I would have expected `answer_correctness` to be higher.  It would be useful to dig into factual similarity and semantic similarity differences.