# RAG Technique Experiments

In [403]:
import os
import qdrant_client

from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core.extractors import QuestionsAnsweredExtractor
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core.schema import MetadataMode
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.llms.ollama import Ollama
import torch
from llama_index.core.llama_dataset import (
    LabelledRagDataset,
    CreatedBy,
    CreatedByType,
    LabelledRagDataExample,
)
from llama_index.core.llama_pack import download_llama_pack
import inspect
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.core.vector_stores.types import VectorStoreQuery
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.core.evaluation import RetrieverEvaluator
import pandas as pd

In [348]:
# Required for asynchronous functions to work in this notebook: the patch is
# applied once, up front, before any of the `use_async=True` engines,
# retrievers and evaluators defined further down are used.
import nest_asyncio

nest_asyncio.apply()

## Configuration

### Model Settings

In [285]:
# Ollama model used for answering and, via Settings.llm, as the judge LLM in
# the evaluation cells.
LLM_MODEL = "qwen2:0.5b"
# Request timeout in seconds (10 hours) so long evaluation runs do not abort.
LLM_REQUEST_TIMEOUT_S = 36000.0
LLM = Ollama(model=LLM_MODEL, request_timeout=LLM_REQUEST_TIMEOUT_S)
Settings.llm = LLM

In [286]:
# Embedding model name; override it with the EMBED_MODEL environment variable.
EMBED_MODEL = os.environ.get("EMBED_MODEL", "BAAI/bge-base-en-v1.5")
# Registered globally; the evaluation cells read it back from Settings.embed_model.
Settings.embed_model = FastEmbedEmbedding(model_name=EMBED_MODEL)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [288]:
# Reranking post-processor: BAAI/bge-reranker-base, configured with top_n=2.
RERANK_MODEL = SentenceTransformerRerank(top_n=2, model="BAAI/bge-reranker-base")

### Vector Store Settings

In [289]:
# Qdrant connection settings, overridable through the environment.
QDRANT_HOST = os.environ.get("QDRANT_HOST", "localhost")
# Prefer QDRANT_PORT (matching QDRANT_HOST); fall back to the generic PORT
# variable so existing setups keep working. Environment values are strings,
# so the result is coerced to int to keep the type stable.
QDRANT_PORT = int(os.environ.get("QDRANT_PORT", os.environ.get("PORT", 6333)))

In [290]:
# Synchronous Qdrant client for the host/port configured above.
# (Swap host/port for location=":memory:" to use an in-memory instance.)
QDRANT_CLIENT = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
# Show the collections the server currently holds.
QDRANT_CLIENT.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='style_python')])

In [291]:
QDRANT_CLIENT_ASYNC = qdrant_client.AsyncQdrantClient(
    # location=":memory:"
    host=QDRANT_HOST,
    port=QDRANT_PORT,
)
await QDRANT_CLIENT_ASYNC.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='style_python')])

In [292]:
# Vector store over the "style_python" collection. Both the sync and the async
# client are supplied, and hybrid search is enabled on the store.
vector_store = QdrantVectorStore(
    client=QDRANT_CLIENT,
    aclient=QDRANT_CLIENT_ASYNC,
    enable_hybrid=True,
    collection_name="style_python"
)

Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [293]:
# Build the index on top of the vector store configured above.
index = VectorStoreIndex.from_vector_store(vector_store)

In [282]:
# Sample question sent to each of the query engines below.
query = "Should I always use parentheses around tuples?"

### Test Queries

In [334]:
# Baseline engine: no vector_store_query_mode is passed, so the default mode applies.
# NOTE(review): sparse_top_k presumably has no effect outside hybrid/sparse
# query modes — confirm against the Qdrant vector store implementation.
query_engine_basic = index.as_query_engine(
    similarity_top_k=2,
    sparse_top_k=12,
    use_async=True
)

In [369]:
# Ask the baseline engine the sample question and show only the answer text.
response = query_engine_basic.query(query)
response.response

"No. In Python, tuples are enclosed in parentheses when you need them to be treated as a group of items that can't be modified after they have been assigned values. This is true for all types of expressions and variables used in Python. You should not use parentheses around tuples unless explicitly told otherwise by the interpreter or your application."

In [335]:
# Same top-k settings as the baseline engine, but the vector store is queried
# in "hybrid" mode.
query_engine_hybrid = index.as_query_engine(
    similarity_top_k=2,
    sparse_top_k=12,
    vector_store_query_mode="hybrid",
    use_async=True
)

In [375]:
# Ask the hybrid engine the same question and show only the answer text.
response = query_engine_hybrid.query(query)
response.response

'Yes, you should always use parentheses around tuples to indicate that a tuple is being treated as a collection of objects in the same order. This helps prevent confusion between tuple elements and other types of variable that can be combined into a single object within a tuple. By doing so, you avoid using multiple types for one object, which is often not desirable when working with collections like tuples.'

In [336]:
# Hybrid engine with the reranker applied as a node post-processor.
# RERANK_MODEL is the reranker defined in the model settings section, so this
# cell does not depend on any name left over from an earlier kernel session.
query_engine_rerank = index.as_query_engine(
    similarity_top_k=2,
    sparse_top_k=12,
    vector_store_query_mode="hybrid",
    node_postprocessors=[RERANK_MODEL],
    use_async=True,
)

In [374]:
# Ask the reranking engine the same question and show only the answer text.
response = query_engine_rerank.query(query)
response.response

'Yes, it is fine to use parentheses around tuples when using Python syntax in this context. The context also mentions that typed lists can only contain objects of a single type, so it is acceptable to use parentheses around these types as well.\n\nHowever, for more complex or nested types within tuple, it might be more efficient and readable to use parentheses around these elements.'

In [337]:
# (name, engine) pairs for the three query engine configurations above.
query_engines = list(
    zip(
        ("basic", "hybrid", "rerank"),
        (query_engine_basic, query_engine_hybrid, query_engine_rerank),
    )
)

### Test 1 - Evaluate Correctness, Relevancy, Faithfulness, Context Similarity

In [376]:
# Load the labelled RAG test set from its JSON file.
rag_dataset = LabelledRagDataset.from_json("data/testsets/style_guide_testset.json")

In [None]:
# Download the evaluator pack into ./pack; the returned object is instantiated
# once per query engine below.
RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack")

In [378]:
# Show the constructor parameters accepted by the pack.
inspect.signature(RagEvaluatorPack)

<Signature (query_engine: llama_index.core.base.base_query_engine.BaseQueryEngine, rag_dataset: llama_index.core.llama_dataset.base.BaseLlamaDataset, judge_llm: Optional[llama_index.core.llms.llm.LLM] = None, embed_model: Optional[llama_index.core.base.embeddings.base.BaseEmbedding] = None, show_progress: bool = True, result_path: Optional[str] = None)>

In [379]:
# Evaluator for the baseline engine. The engine passed in must be
# query_engine_basic: no plain `query_engine` variable exists in this notebook.
rag_evaluator_basic = RagEvaluatorPack(
    query_engine=query_engine_basic,  # built with the same source Documents as the rag_dataset
    rag_dataset=rag_dataset,
    judge_llm=Settings.llm,
    embed_model=Settings.embed_model,
    show_progress=True,
)

In [380]:
# Evaluator for the hybrid engine; same dataset, judge LLM and embedding model
# as the other evaluators.
rag_evaluator_hybrid = RagEvaluatorPack(
    query_engine=query_engine_hybrid,  # built with the same source Documents as the rag_dataset
    rag_dataset=rag_dataset,
    judge_llm=Settings.llm,
    embed_model=Settings.embed_model,
    show_progress=True
)

In [381]:
# Evaluator for the reranking engine; same dataset, judge LLM and embedding
# model as the other evaluators.
rag_evaluator_rerank = RagEvaluatorPack(
    query_engine=query_engine_rerank,  # built with the same source Documents as the rag_dataset
    rag_dataset=rag_dataset,
    judge_llm=Settings.llm,
    embed_model=Settings.embed_model,
    show_progress=True
)

In [349]:
# Run the evaluation for the baseline engine; the result is printed as a table below.
benchmark_df_basic = rag_evaluator_basic.run()

2it [00:01,  1.06it/s]
2it [00:02,  1.25s/it]
2it [00:02,  1.23s/it]
2it [00:02,  1.16s/it]
2it [00:02,  1.23s/it]
1it [00:01,  1.34s/it]


In [351]:
# Run the evaluation for the hybrid engine; the result is printed as a table below.
benchmark_df_hybrid = rag_evaluator_hybrid.run()

2it [00:02,  1.19s/it]
2it [00:02,  1.27s/it]
2it [00:03,  1.59s/it]
2it [00:02,  1.26s/it]
2it [00:02,  1.26s/it]
1it [00:01,  1.25s/it]


In [353]:
# Run the evaluation for the reranking engine; the result is printed as a table below.
benchmark_df_rerank = rag_evaluator_rerank.run()

2it [00:02,  1.05s/it]
2it [00:02,  1.22s/it]
2it [00:02,  1.13s/it]
2it [00:02,  1.08s/it]
2it [00:02,  1.26s/it]
1it [00:01,  1.35s/it]


In [382]:
# Print each benchmark table under the name of the engine that produced it.
# Every table uses the same "base_rag" column header, so without a label the
# three printed results cannot be told apart.
benchmark_results = [benchmark_df_basic, benchmark_df_hybrid, benchmark_df_rerank]
benchmark_labels = ["basic", "hybrid", "rerank"]
for label, result_df in zip(benchmark_labels, benchmark_results):
    print(f"--- {label} ---")
    print(result_df.head())
    print()

rag                            base_rag
metrics                                
mean_correctness_score         2.363636
mean_relevancy_score           0.545455
mean_faithfulness_score        0.454545
mean_context_similarity_score  0.831865

rag                            base_rag
metrics                                
mean_correctness_score         2.909091
mean_relevancy_score           0.545455
mean_faithfulness_score        0.272727
mean_context_similarity_score  0.831865

rag                            base_rag
metrics                                
mean_correctness_score         2.916667
mean_relevancy_score           0.333333
mean_faithfulness_score        0.333333
mean_context_similarity_score  0.837198



### Test 2 - Evaluate Hit Rate and Mean Reciprocal Rank (MRR)

In [384]:
# Second index over the same vector store, created with use_async=True; the
# retrievers for the retrieval evaluation are built from it.
index_async = VectorStoreIndex.from_vector_store(vector_store=vector_store, use_async=True)

In [385]:
# Baseline retriever: same top-k settings as the query engines, default query mode.
retriever = index_async.as_retriever(
    similarity_top_k=2,
    sparse_top_k=12,
    use_async=True
)

In [386]:
# Hybrid retriever: same top-k settings, vector store queried in "hybrid" mode.
retriever_hybrid = index_async.as_retriever(
    similarity_top_k=2,
    sparse_top_k=12,
    vector_store_query_mode="hybrid",
    use_async=True
)

In [387]:
# Hybrid retriever intended to be combined with the reranker. RERANK_MODEL is
# the reranker defined in the model settings section.
# NOTE(review): the retriever appears to ignore node_postprocessors (the hybrid
# and rerank retrieval metrics in this notebook come out identical) — confirm,
# and apply the reranker at evaluation time if so.
retriever_rerank = index_async.as_retriever(
    similarity_top_k=2,
    sparse_top_k=12,
    vector_store_query_mode="hybrid",
    node_postprocessors=[RERANK_MODEL],
    use_async=True,
)

In [393]:
# Question/context pairs used by the retrieval evaluation below. Generating
# them uses the LLM, so the dataset is cached on disk and reloaded when present.
QA_DATASET_PATH = "qa_dataset.json"


def create_and_save_test_dataset(path=QA_DATASET_PATH):
    """Generate a question/context dataset from the vector store and save it.

    One question is generated per node returned by ``vector_store.get_nodes()``,
    using ``Settings.llm``.

    Args:
        path: Destination JSON file. Defaults to ``QA_DATASET_PATH``.

    Returns:
        The generated dataset (also written to ``path``).
    """
    nodes = vector_store.get_nodes()
    qa_dataset = generate_question_context_pairs(
        nodes, llm=Settings.llm, num_questions_per_chunk=1
    )
    qa_dataset.save_json(path)
    return qa_dataset


# Define qa_dataset on every run: reuse the saved file if it exists, otherwise
# generate it once. The evaluation cells below require this variable.
if os.path.exists(QA_DATASET_PATH):
    qa_dataset = EmbeddingQAFinetuneDataset.from_json(QA_DATASET_PATH)
else:
    qa_dataset = create_and_save_test_dataset()

In [395]:
# Retrieval metrics computed by every evaluator below and averaged by display_results.
metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]
# Alternative, smaller set: metrics = ["mrr", "hit_rate"]

In [396]:
# Evaluator for the baseline retriever, scoring the metrics listed above.
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    metrics, retriever=retriever, use_async=True
)

In [397]:
# Evaluator for the hybrid retriever, scoring the metrics listed above.
retriever_evaluator_hybrid = RetrieverEvaluator.from_metric_names(
    metrics, retriever=retriever_hybrid, use_async=True
)

In [398]:
# Evaluator for the hybrid retriever with reranking. The reranker is attached
# to the evaluator itself so that it is applied to the retrieved nodes before
# the metrics are computed; otherwise this run just repeats the hybrid one.
retriever_evaluator_rerank = RetrieverEvaluator.from_metric_names(
    metrics,
    retriever=retriever_rerank,
    node_postprocessors=[RERANK_MODEL],
    use_async=True,
)

In [399]:
# Show the parameters accepted by the evaluator's single-query async method.
inspect.signature(retriever_evaluator.aevaluate)

<Signature (query: str, expected_ids: List[str], expected_texts: Optional[List[str]] = None, mode: llama_index.core.evaluation.retrieval.base.RetrievalEvalMode = <RetrievalEvalMode.TEXT: 'text'>, **kwargs: Any) -> llama_index.core.evaluation.retrieval.base.RetrievalEvalResult>

In [400]:
# Evaluate the baseline retriever over the whole QA dataset (workers=10).
eval_results_basic = await retriever_evaluator.aevaluate_dataset(dataset=qa_dataset, workers=10, show_progress=True)

100%|█


In [401]:
# Evaluate the hybrid retriever over the whole QA dataset (workers=10).
eval_results_hybrid = await retriever_evaluator_hybrid.aevaluate_dataset(dataset=qa_dataset, workers=10, show_progress=True)

100%|█


In [402]:
# Evaluate the rerank retriever over the whole QA dataset (workers=10).
eval_results_rerank = await retriever_evaluator_rerank.aevaluate_dataset(dataset=qa_dataset, workers=10, show_progress=True)

100%|█


In [406]:
def display_results(name, eval_results, metric_names=None):
    """Summarise retrieval evaluation results as a one-row DataFrame.

    Args:
        name: Label stored in the ``retrievers`` column.
        eval_results: Iterable of result objects exposing ``metric_vals_dict``,
            a mapping of metric name to score.
        metric_names: Metrics to average, in column order. Defaults to every
            metric present in the results, so the function does not depend on
            a notebook-level ``metrics`` variable.

    Returns:
        A DataFrame with a ``retrievers`` column followed by one column per
        metric, each holding that metric's mean over all results.
    """
    full_df = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    # Fall back to the metrics actually recorded in the results.
    if metric_names is None:
        metric_names = list(full_df.columns)

    summary = {"retrievers": [name]}
    summary.update({metric: [full_df[metric].mean()] for metric in metric_names})
    return pd.DataFrame(summary)

In [410]:
# Mean of each metric for the baseline retriever.
display_results("Basic Retriever", eval_results_basic)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,Basic Retriever,0.281046,0.25817,0.140523,0.281046,0.25817,0.161969


In [408]:
# Mean of each metric for the hybrid retriever.
display_results("Hybrid Retriever", eval_results_hybrid)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,Hybrid Retriever,0.27451,0.254902,0.137255,0.27451,0.254902,0.159441


In [409]:
# Mean of each metric for the rerank retriever.
display_results("Rerank Retriever", eval_results_rerank)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,Rerank Retriever,0.27451,0.254902,0.137255,0.27451,0.254902,0.159441
