# RAG Technique Experiments

In [46]:
import os
import qdrant_client

from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core.extractors import QuestionsAnsweredExtractor
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core.schema import MetadataMode
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.llms.ollama import Ollama
import torch
from llama_index.core.llama_dataset import (
    LabelledRagDataset,
    CreatedBy,
    CreatedByType,
    LabelledRagDataExample,
)
from llama_index.core.llama_pack import download_llama_pack
import inspect
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.core.vector_stores.types import VectorStoreQuery
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.core.evaluation import RetrieverEvaluator
import pandas as pd
from llama_index.core.evaluation import QueryResponseDataset

In [2]:
# This import is required for asynchronous functions to work
import nest_asyncio

nest_asyncio.apply()

## Configuration

### Model Settings

In [3]:
LLM_MODEL = "mistral:latest"
LLM = Ollama(model=LLM_MODEL, request_timeout=36000.0)
Settings.llm = LLM
EMBED_MODEL = os.environ.get("EMBED_MODEL", "BAAI/bge-base-en-v1.5")

In [4]:
EMBED_MODEL = os.environ.get("EMBED_MODEL", "BAAI/bge-base-en-v1.5")
Settings.embed_model = FastEmbedEmbedding(model_name=EMBED_MODEL)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
RERANK_MODEL = SentenceTransformerRerank(top_n=2, model="BAAI/bge-reranker-base")

### Vector Store Settings

In [6]:
QDRANT_HOST = os.environ.get("QDRANT_HOST", "localhost")
QDRANT_PORT = os.environ.get("PORT", 6333)

In [7]:
QDRANT_CLIENT = qdrant_client.QdrantClient(
    # location=":memory:"
    host=QDRANT_HOST,
    port=QDRANT_PORT,
)
QDRANT_CLIENT.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='style_python')])

In [8]:
QDRANT_CLIENT_ASYNC = qdrant_client.AsyncQdrantClient(
    # location=":memory:"
    host=QDRANT_HOST,
    port=QDRANT_PORT,
)
await QDRANT_CLIENT_ASYNC.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='style_python')])

In [9]:
vector_store = QdrantVectorStore(
    client=QDRANT_CLIENT,
    aclient=QDRANT_CLIENT_ASYNC,
    enable_hybrid=True,
    collection_name="style_python"
)

Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
index = VectorStoreIndex.from_vector_store(vector_store)

In [11]:
query = "Should I always use parentheses around tuples?"

## Techniques

In [83]:
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine import TransformQueryEngine

In [85]:
hyde = HyDEQueryTransform(include_original=True)

In [93]:
from llama_index.core.indices.query.query_transform.base import (
    StepDecomposeQueryTransform,
)
from llama_index.core.query_engine import MultiStepQueryEngine

In [94]:
step_decompose_transform = StepDecomposeQueryTransform(llm=Settings.llm, verbose=True)

### Test Queries

In [12]:
query_engine_basic = index.as_query_engine(
    similarity_top_k=2,
    sparse_top_k=12,
    use_async=True
)

In [13]:
response = query_engine_basic.query(query)
response.response

" According to the provided style guide, using parentheses around single-item tuples is more visually obvious than using commas. However, it's not required and is a matter of preference. In return statements or conditional statements, parentheses should be used sparingly and only for implied line continuation or to indicate a tuple."

In [98]:
query_engine_hybrid = index.as_query_engine(
    similarity_top_k=2,
    sparse_top_k=12,
    vector_store_query_mode="hybrid",
    use_async=True
)

In [99]:
response = query_engine_hybrid.query(query)
response.response

" According to the provided context, it is recommended to use parentheses sparingly in Python, especially around tuples. They are only required when the tuple contains more than one type or a mix of types, or when used as return types from functions. It's generally fine to omit parentheses for single-type tuples, as they can be distinguished from lists by their trailing comma. However, using parentheses may make the tuple more visually obvious in some cases. Ultimately, the decision depends on personal coding style and adherence to established conventions within your project or organization."

In [17]:
query_engine_rerank = index.as_query_engine(
    similarity_top_k=2,
    sparse_top_k=12,
    vector_store_query_mode="hybrid",
    node_postprocessors=[RERANK_MODEL],
    use_async=True
)

In [18]:
response = query_engine_rerank.query(query)
response.response

' It is recommended in the provided style guide to use parentheses sparingly and only around tuples when it makes the code more visually obvious than using a comma as the separator, especially for a one-item tuple. However, the use of parentheses around multi-item tuples is not required. In fact, typed tuples are commonly used as the return type from a function without using parentheses.'

In [86]:
query_engine_hyde = TransformQueryEngine(query_engine_hybrid, hyde)

In [87]:
response = query_engine_hyde.query(query)
response.response

' According to the provided context, it is recommended to use parentheses sparingly and not in return statements or conditional statements unless for implied line continuation or to indicate a tuple. However, when defining a tuple with multiple types or elements of different types, parentheses are required. So, while not always necessary, using parentheses around tuples can be helpful for clarity and proper representation.'

In [103]:
query_engine_multistep = MultiStepQueryEngine(
    query_engine=query_engine_hybrid,
    query_transform=step_decompose_transform,
    index_summary="Used to answer questions about Python programming style and best practices",
)

In [105]:
response = query_engine_multistep.query(query)
response.response

[1;3;33m> Current query: Should I always use parentheses around tuples?
[0m[1;3;38;5;200m> New query:  Is it a common practice in Python to use parentheses around tuples for clarity and readability?
[0m[1;3;33m> Current query: Should I always use parentheses around tuples?
[0m[1;3;38;5;200m> New query:  Is it a best practice in Python to use parentheses around multi-item tuples for clarity and readability?
[0m[1;3;33m> Current query: Should I always use parentheses around tuples?
[0m[1;3;38;5;200m> New query:  Is it recommended in Python to use parentheses around multi-item tuples for clarity and readability, or is their use optional?
[0m

' In Python, using parentheses around single-item tuples is optional and largely a matter of personal preference or project style guidelines. For multi-item tuples or tuples with different types, parentheses are necessary to define the tuple type explicitly. However, their use is generally optional in return statements or conditional statements unless used for implied line continuation or to indicate a tuple. Ultimately, whether to use parentheses around tuples depends on your coding style and preferences.'

### Test 1 - Evaluate Correctness, Relevancy, Faithfulness, Context Similarity

In [19]:
rag_dataset = LabelledRagDataset.from_json("data/testsets/style_guide_testset.json")

In [None]:
RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack")

In [21]:
inspect.signature(RagEvaluatorPack)

<Signature (query_engine: llama_index.core.base.base_query_engine.BaseQueryEngine, rag_dataset: llama_index.core.llama_dataset.base.BaseLlamaDataset, judge_llm: Optional[llama_index.core.llms.llm.LLM] = None, embed_model: Optional[llama_index.core.base.embeddings.base.BaseEmbedding] = None, show_progress: bool = True, result_path: Optional[str] = None)>

In [24]:
rag_evaluator_basic = RagEvaluatorPack(
    query_engine=query_engine_basic,  # built with the same source Documents as the rag_dataset
    rag_dataset=rag_dataset,
    judge_llm=Settings.llm,
    embed_model=Settings.embed_model,
    show_progress=True
)

In [25]:
rag_evaluator_hybrid = RagEvaluatorPack(
    query_engine=query_engine_hybrid,  # built with the same source Documents as the rag_dataset
    rag_dataset=rag_dataset,
    judge_llm=Settings.llm,
    embed_model=Settings.embed_model,
    show_progress=True
)

In [26]:
rag_evaluator_rerank = RagEvaluatorPack(
    query_engine=query_engine_rerank,  # built with the same source Documents as the rag_dataset
    rag_dataset=rag_dataset,
    judge_llm=Settings.llm,
    embed_model=Settings.embed_model,
    show_progress=True
)

In [88]:
rag_evaluator_hyde = RagEvaluatorPack(
    query_engine=query_engine_hyde,  # built with the same source Documents as the rag_dataset
    rag_dataset=rag_dataset,
    judge_llm=Settings.llm,
    embed_model=Settings.embed_model,
    show_progress=True
)

In [106]:
rag_evaluator_multistep = RagEvaluatorPack(
    query_engine=query_engine_multistep,  # built with the same source Documents as the rag_dataset
    rag_dataset=rag_dataset,
    judge_llm=Settings.llm,
    embed_model=Settings.embed_model,
    show_progress=True
)

In [27]:
benchmark_df_basic = rag_evaluator_basic.run()

100%|█
100%|█
2it [00:11,  5.56s/it]
2it [00:11,  5.67s/it]
2it [00:12,  6.22s/it]
2it [00:09,  4.52s/it]
2it [00:12,  6.42s/it]
1it [00:06,  6.04s/it]


In [28]:
benchmark_df_hybrid = rag_evaluator_hybrid.run()

2it [00:12,  6.44s/it]
2it [00:11,  5.98s/it]
2it [00:11,  5.81s/it]
2it [00:11,  5.80s/it]
2it [00:12,  6.03s/it]
1it [00:06,  6.41s/it]


In [29]:
benchmark_df_rerank = rag_evaluator_rerank.run()

2it [00:13,  6.55s/it]
2it [00:11,  5.52s/it]
2it [00:21, 10.53s/it]
2it [00:13,  6.51s/it]
2it [00:10,  5.49s/it]
1it [00:05,  5.47s/it]


In [90]:
benchmark_df_hyde = rag_evaluator_hyde.run()

2it [00:10,  5.42s/it]
2it [00:11,  5.92s/it]
2it [00:13,  6.62s/it]
2it [00:17,  8.99s/it]
2it [00:12,  6.01s/it]
1it [00:05,  5.11s/it]


In [107]:
benchmark_df_multistep = rag_evaluator_multistep.run()

2it [00:13,  6.92s/it]
2it [00:10,  5.40s/it]
2it [00:11,  5.89s/it]
2it [00:22, 11.39s/it]
2it [00:11,  5.74s/it]
1it [00:05,  5.28s/it]


In [108]:
benchmark_results = [
    benchmark_df_basic,
    benchmark_df_hybrid,
    benchmark_df_rerank,
    benchmark_df_hyde,
    benchmark_df_multistep
]
for result_df in benchmark_results:
    print(result_df.head())
    print()

rag                            base_rag
metrics                                
mean_correctness_score         4.818182
mean_relevancy_score           0.909091
mean_faithfulness_score        0.909091
mean_context_similarity_score  0.846435

rag                            base_rag
metrics                                
mean_correctness_score         4.818182
mean_relevancy_score           1.000000
mean_faithfulness_score        0.909091
mean_context_similarity_score  0.846435

rag                            base_rag
metrics                                
mean_correctness_score         4.818182
mean_relevancy_score           0.909091
mean_faithfulness_score        1.000000
mean_context_similarity_score  0.846435

rag                            base_rag
metrics                                
mean_correctness_score         4.791667
mean_relevancy_score           0.916667
mean_faithfulness_score        0.833333
mean_context_similarity_score  0.847857

rag                            base_

### Test 2 - Evaluate Hit Rate and Maximum Marginal Relevance

In [31]:
index_async = VectorStoreIndex.from_vector_store(vector_store=vector_store, use_async=True)

In [32]:
retriever = index_async.as_retriever(
    similarity_top_k=2,
    sparse_top_k=12,
    use_async=True
)

In [33]:
retriever_hybrid = index_async.as_retriever(
    similarity_top_k=2,
    sparse_top_k=12,
    vector_store_query_mode="hybrid",
    use_async=True
)

In [35]:
retriever_rerank = index_async.as_retriever(
    similarity_top_k=2,
    sparse_top_k=12,
    vector_store_query_mode="hybrid",
    node_postprocessors=[RERANK_MODEL],
    use_async=True
)

In [75]:
# This notebook assumes you have already made this for your VectorStore
# If not, run this function on your VectorStore
def create_and_save_test_dataset():
    nodes = vector_store.get_nodes()
    qa_dataset = generate_question_context_pairs(
        nodes, llm=Settings.llm, num_questions_per_chunk=1
    )
    qa_dataset.save_json("qa_dataset.json")
    return qa_dataset
# qa_dataset=create_and_save_test_dataset()

qa_dataset = EmbeddingQAFinetuneDataset.from_json("qa_dataset.json")

In [70]:
metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

In [71]:
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    metrics, retriever=retriever, use_async=True
)

In [72]:
retriever_evaluator_hybrid = RetrieverEvaluator.from_metric_names(
    metrics, retriever=retriever_hybrid, use_async=True
)

In [73]:
retriever_evaluator_rerank = RetrieverEvaluator.from_metric_names(
    metrics, retriever=retriever_rerank, use_async=True
)

In [109]:
import copy
qa_dataset_hyde = copy.deepcopy(qa_dataset)

In [120]:
transformed_queries = {}
for query_id, query in qa_dataset_hyde.queries.items():
    transformed_query = hyde.run(query)
    merged_query_and_answer = " ".join(transformed_query.custom_embedding_strs)
    qa_dataset_hyde.queries[query_id] = merged_query_and_answer

In [155]:
for query_id, query in qa_dataset_hyde.queries.items():
    merged_query_and_answer = " ".join(query.custom_embedding_strs)
    qa_dataset_hyde.queries[query_id] = merged_query_and_answer

In [74]:
inspect.signature(retriever_evaluator.aevaluate)

<Signature (query: str, expected_ids: List[str], expected_texts: Optional[List[str]] = None, mode: llama_index.core.evaluation.retrieval.base.RetrievalEvalMode = <RetrievalEvalMode.TEXT: 'text'>, **kwargs: Any) -> llama_index.core.evaluation.retrieval.base.RetrievalEvalResult>

In [76]:
eval_results_basic = await retriever_evaluator.aevaluate_dataset(dataset=qa_dataset, workers=10, show_progress=True)

100%|█


In [77]:
eval_results_hybrid = await retriever_evaluator_hybrid.aevaluate_dataset(dataset=qa_dataset, workers=10, show_progress=True)

100%|█


In [78]:
eval_results_rerank = await retriever_evaluator_rerank.aevaluate_dataset(dataset=qa_dataset, workers=10, show_progress=True)

100%|█


In [156]:
eval_results_hyde = await retriever_evaluator_rerank.aevaluate_dataset(dataset=qa_dataset_hyde, workers=10, show_progress=True)


  0%| |[A
  1%| |[A
  1%| |[A
  2%| |[A
  3%| |[A
  7%| |[A
  8%| |[A
 10%| |[A
 11%| |[A
 12%| |[A
 14%|▏|[A
 14%|▏|[A
 15%|▏|[A
 17%|▏|[A
 18%|▏|[A
 18%|▏|[A
 20%|▏|[A
 20%|▏|[A
 21%|▏|[A
 22%|▏|[A
 24%|▏|[A
 25%|▏|[A
 26%|▎|[A
 27%|▎|[A
 28%|▎|[A
 31%|▎|[A
 33%|▎|[A
 33%|▎|[A
 35%|▎|[A
 37%|▎|[A
 39%|▍|[A
 39%|▍|[A
 40%|▍|[A
 41%|▍|[A
 42%|▍|[A
 44%|▍|[A
 46%|▍|[A
 46%|▍|[A
 47%|▍|[A
 49%|▍|[A
 50%|▍|[A
 52%|▌|[A
 52%|▌|[A
 53%|▌|[A
 56%|▌|[A
 56%|▌|[A
 58%|▌|[A
 58%|▌|[A
 59%|▌|[A
 59%|▌|[A
 60%|▌|[A
 63%|▋|[A
 63%|▋|[A
 65%|▋|[A
 66%|▋|[A
 67%|▋|[A
 68%|▋|[A
 70%|▋|[A
 71%|▋|[A
 72%|▋|[A
 73%|▋|[A
 75%|▋|[A
 75%|▊|[A
 77%|▊|[A
 78%|▊|[A
 79%|▊|[A
 82%|▊|[A
 84%|▊|[A
 85%|▊|[A
 86%|▊|[A
 88%|▉|[A
 90%|▉|[A
 90%|▉|[A
 92%|▉|[A
100%|█|[A


In [79]:
def display_results(name, eval_results):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    columns = {
        "retrievers": [name],
        **{k: [full_df[k].mean()] for k in metrics},
    }

    # if include_cohere_rerank:
    #     crr_relevancy = full_df["cohere_rerank_relevancy"].mean()
    #     columns.update({"cohere_rerank_relevancy": [crr_relevancy]})

    metric_df = pd.DataFrame(columns)

    return metric_df

In [80]:
display_results("Basic Retriever", eval_results_basic)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,Basic Retriever,0.830065,0.79085,0.415033,0.830065,0.79085,0.491204


In [81]:
display_results("Hybrid Retriever", eval_results_hybrid)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,Hybrid Retriever,0.836601,0.794118,0.418301,0.836601,0.794118,0.493732


In [82]:
display_results("Rerank Retriever", eval_results_rerank)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,Rerank Retriever,0.836601,0.794118,0.418301,0.836601,0.794118,0.493732


In [157]:
display_results("HyDE Retriever", eval_results_hyde)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,HyDE Retriever,0.69281,0.611111,0.346405,0.69281,0.611111,0.387819
