# Contextual retrieval with Nova Lite and llama-index
In this notebook, you will learn how to improve retrieval quality by enriching the chunks in your vector store with document-level context (contextual retrieval), using Amazon Bedrock and the Nova family of models. We will use LlamaIndex ingestion pipelines to orchestrate and automate the workflow.

### 1) Setup
* Install python modules
* Import required classes/functions
* Set all static variables
* Define custom classes/functions required
* Initialise clients (AWS/llama-index)

In [None]:
# Upgrade the installer tooling before installing anything else.
%pip install -q --upgrade pip
%pip install -q --upgrade packaging
# LlamaIndex core plus the integrations imported in the next cell:
# Bedrock embeddings, Bedrock Converse LLM, ColBERT reranker, web page reader
# and BM25 retriever.
%pip install -q llama-index-core
%pip install -q llama-index-embeddings-bedrock
%pip install -q llama-index-llms-bedrock-converse
%pip install -q llama_index_postprocessor_colbert_rerank
%pip install -q llama-index-readers-web
%pip install -q llama-index-retrievers-bm25

In [None]:
import boto3
import copy
import html2text
import nest_asyncio
import pandas as pd
import requests
import Stemmer
from typing import List

from botocore.config import Config
from llama_index.core import Settings, VectorStoreIndex, QueryBundle
from llama_index.core.evaluation import (
    RetrieverEvaluator,
    generate_question_context_pairs
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TransformComponent, NodeWithScore
from llama_index.embeddings.bedrock import BedrockEmbedding
from llama_index.llms.bedrock_converse import BedrockConverse
from llama_index.postprocessor.colbert_rerank import ColbertRerank
from llama_index.readers.web import SimpleWebPageReader
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever

# --- AWS / Amazon Bedrock -------------------------------------------------
AWS_REGION = "us-east-1"
# Model used to embed chunks and queries.
BEDROCK_EMBEDDING_MODEL = "cohere.embed-english-v3"
# Model used for context generation and for generating evaluation questions.
BEDROCK_TEXT_GENERATION_MODEL = "us.amazon.nova-lite-v1:0"
BEDROCK_MAX_TOKENS = 5000
BEDROCK_TEMPERATURE = 0.0
# Passed to botocore as retries["max_attempts"] for the Bedrock runtime client.
BEDROCK_BOTOCORE_MAX_RETRIES = 20

# --- Chunking -------------------------------------------------------------
LLAMA_INDEX_CHUNK_SIZE = 512
LLAMA_INDEX_CHUNK_OVERLAP = 51  # roughly 10% of the chunk size

# --- Source documents to ingest -------------------------------------------
LLAMA_INDEX_INGESTION_DOCUMENTS = [
    "https://www.aboutamazon.com/news/company-news/amazon-ceo-andy-jassy-2022-letter-to-shareholders",
    "https://www.aboutamazon.com/news/company-news/ceo-andy-jassys-2023-letter-to-shareholders",
]

# --- Retrieval and evaluation ---------------------------------------------
LLAMA_INDEX_RETRIEVAL_EVALUATION_METRICS = [
    "hit_rate",
    "mrr",
    "recall",
]
LLAMA_INDEX_RETRIEVAL_TOP_K = 5
# Used as both the model and the tokenizer name of the ColBERT reranker.
LLAMA_INDEX_RERANKER = "colbert-ir/colbertv2.0"
LLAMA_INDEX_SAMPLE_DATA_NUM_QUESTIONS_PER_CHUNK = 2

class LlamaIndexEmbeddingBM25RerankerRetriever(BaseRetriever):
    """
    A hybrid retriever that combines vector-based and BM25 retrieval with reranking.

    This retriever implements a multi-stage retrieval process:
    1. Retrieves nodes using a vector-based retriever (semantic search)
    2. Retrieves nodes using BM25 (lexical search)
    3. Merges both result sets, keeping one entry per node id
    4. Reranks the merged candidates using a ColBERT reranker

    The final output is a reranked list of nodes that leverages both semantic and lexical matching,
    potentially providing better search results than either method alone.

    Attributes:
        _vector_retriever (VectorIndexRetriever): Retriever for vector/embedding-based search
        bm25_retriever (BM25Retriever): Retriever for BM25 lexical search
        reranker (ColbertRerank): Reranker to score and sort the merged results

    Example:
        retriever = LlamaIndexEmbeddingBM25RerankerRetriever(
            vector_retriever=vector_retriever,
            bm25_retriever=bm25_retriever,
            reranker=reranker
        )
        results = retriever.retrieve(query)
    """

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        bm25_retriever: BM25Retriever,
        reranker: ColbertRerank,
    ) -> None:
        """Store the two underlying retrievers and the reranker.

        Args:
            vector_retriever: Retriever used for the semantic (embedding) search.
            bm25_retriever: Retriever used for the lexical (BM25) search.
            reranker: Postprocessor applied to the merged candidate list.
        """

        self._vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        self.reranker = reranker

        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve, merge, de-duplicate and rerank nodes for a query.

        Args:
            query_bundle: The query to run against both retrievers.

        Returns:
            The reranker's output for the merged candidate list.
        """

        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        bm25_nodes = self.bm25_retriever.retrieve(query_bundle)

        # Both retrievers can return the same chunk. Keep only the first
        # occurrence of each node id so a single chunk cannot occupy several
        # of the reranker's output slots. A fresh list is built so the list
        # returned by the vector retriever is not mutated.
        seen_node_ids = set()
        candidate_nodes = []
        for candidate in [*vector_nodes, *bm25_nodes]:
            node_id = candidate.node.node_id
            if node_id in seen_node_ids:
                continue
            seen_node_ids.add(node_id)
            candidate_nodes.append(candidate)

        return self.reranker.postprocess_nodes(candidate_nodes, query_bundle)

class LlamaIndexContextualEnrichment(TransformComponent):
    """
    A transformation component that enriches document nodes with contextual information using LLM.

    This class processes nodes from document chunks (typically from Amazon shareholder letters)
    and adds contextual metadata by analyzing each chunk within its full document context.

    The enrichment process, per node:
    1. Reads the node's SOURCE relationship; its node id is used as the URL of the full document
    2. Downloads that URL (once per distinct URL within a call) and converts the HTML to plain text
    3. Uses the LLM configured in Settings to describe the chunk within the complete document
    4. Stores the LLM answer under the 'context' metadata key of a deep copy of the node

    Note:
        - Requires an active internet connection to fetch source documents
        - Depends on external libraries: requests, html2text
        - Every node must have a SOURCE relationship whose node id is a fetchable URL
        - Uses LLM configured in Settings for context generation
    """

    def __call__(self, nodes, **kwargs):
        """Return enriched deep copies of ``nodes``; the inputs are not modified.

        Args:
            nodes: Nodes (document chunks) to enrich.
            **kwargs: Accepted for compatibility with the transform interface; unused.

        Returns:
            A new list with one enriched copy per input node, in input order.

        Raises:
            ValueError: If a node has no SOURCE relationship, so the document
                it belongs to cannot be determined.
            requests.HTTPError: If fetching a source document returns an
                HTTP error status.
        """

        # URL -> plain-text page content, so each document is downloaded once per call.
        web_page_content = {}
        node_total = len(nodes)
        nodes_new = []

        for node_current_index, node in enumerate(nodes, start=1):
            print(f"enriching node: {node.node_id} ({node_current_index})/{node_total}")
            new_node = copy.deepcopy(node)

            # Resolve the source per node. Failing loudly here avoids silently
            # reusing the previous node's document when the relationship is missing.
            source = new_node.source_node
            if source is None:
                raise ValueError(
                    f"node {new_node.node_id} has no SOURCE relationship; "
                    "cannot determine the document it was split from"
                )
            source_url = source.node_id

            if source_url not in web_page_content:
                # A timeout prevents one unresponsive host from hanging the pipeline.
                response = requests.get(source_url, timeout=30)
                # Do not feed an HTTP error page to the LLM as "the document".
                response.raise_for_status()
                web_page_content[source_url] = html2text.html2text(response.text)

            whole_document = web_page_content[source_url]
            prompt = f"""
                ## Here is the source document:
                <document>
                {whole_document}
                </document>
                ## Here is the chunk we want to situate within the whole document
                <chunk>
                {new_node.text}
                </chunk>
                ## Your role
                You are a financial document analysis specialist with expertise in annual shareholder letters,
                particularly those from Amazon.
                ## Your Task
                Your task is to analyze the <chunk> from an Amazon shareholder
                letter written by Amazon CEO and enhance its searchability and context.
                Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. 
                Answer only with the succinct context and nothing else.
            """
            new_node.metadata['context'] = Settings.llm.complete(prompt).text
            nodes_new.append(new_node)

        return nodes_new


def llama_index_evaluation_print_results(name, eval_results, metrics=None):
    """
    Summarise LlamaIndex retrieval evaluation results as a one-row DataFrame.

    Args:
        name (str): The name of the retriever being evaluated.
        eval_results (list): Evaluation result objects; each must expose a
            ``metric_vals_dict`` mapping of metric name to value.
        metrics (list, optional): Metric names to include in the summary.
            Defaults to LLAMA_INDEX_RETRIEVAL_EVALUATION_METRICS.

    Returns:
        pandas.DataFrame: A single-row DataFrame containing:
            - 'retrievers': Name of the retriever
            - One column per requested metric holding its mean across all
              evaluation results

    Raises:
        KeyError: If a requested metric is absent from the evaluation results
            (this includes the case where ``eval_results`` is empty).

    Example:
        >>> eval_results = [result1, result2, result3]  # evaluation results
        >>> df = llama_index_evaluation_print_results("MyRetriever", eval_results)
    """

    # Resolve the default at call time: avoids sharing a mutable default and
    # picks up the constant's current value if it is redefined later.
    if metrics is None:
        metrics = LLAMA_INDEX_RETRIEVAL_EVALUATION_METRICS

    full_df = pd.DataFrame(
        [eval_result.metric_vals_dict for eval_result in eval_results]
    )

    return pd.DataFrame({
        "retrievers": [name],
        # Use the caller-supplied metric list rather than the global constant.
        **{metric: [full_df[metric].mean()] for metric in metrics},
    })

# Bedrock runtime client shared by the embedding model and the LLM below.
# The botocore retry budget is raised to BEDROCK_BOTOCORE_MAX_RETRIES attempts.
aws_client_bedrock_runtime = boto3.client(
    service_name="bedrock-runtime",
    region_name=AWS_REGION,
    config=Config(
        region_name=AWS_REGION,
        retries={"max_attempts": BEDROCK_BOTOCORE_MAX_RETRIES},
    ),
)

# Register the embedding model as the LlamaIndex global default.
Settings.embed_model = BedrockEmbedding(
    model_id=BEDROCK_EMBEDDING_MODEL,
    client=aws_client_bedrock_runtime,
)

# Register the text-generation model as the LlamaIndex global default LLM.
Settings.llm = BedrockConverse(
    model=BEDROCK_TEXT_GENERATION_MODEL,
    temperature=BEDROCK_TEMPERATURE,
    max_tokens=BEDROCK_MAX_TOKENS,
    client=aws_client_bedrock_runtime,
)

# Reranker that keeps the best LLAMA_INDEX_RETRIEVAL_TOP_K candidates.
colbert_reranker = ColbertRerank(
    model=LLAMA_INDEX_RERANKER,
    tokenizer=LLAMA_INDEX_RERANKER,
    top_n=LLAMA_INDEX_RETRIEVAL_TOP_K,
    keep_retrieval_score=True,
)

### 2) Scrape content
* Define web crawler, and specify documents to ingest
* Split documents into chunks (to fit within embedding model constraints)

In [None]:
# Download every configured URL, letting the reader convert HTML to text.
llama_index_documents = SimpleWebPageReader(html_to_text=True).load_data(
    LLAMA_INDEX_INGESTION_DOCUMENTS
)
print(f"number of documents loaded: {len(llama_index_documents)}")

# Split the documents into chunks using the configured size and overlap.
llama_index_document_splitter = SentenceSplitter(
    chunk_overlap=LLAMA_INDEX_CHUNK_OVERLAP,
    chunk_size=LLAMA_INDEX_CHUNK_SIZE,
)
llama_index_documents_split = llama_index_document_splitter(llama_index_documents)
print(f"documents split into chunks: {len(llama_index_documents_split)}")

### 3a) Create retriever-original (baseline) from content
* create an ingestion pipeline
* execute the pipeline
* save the cache of the pipeline to disk
* move the pipeline output into an in memory index
* create a retriever to retrieve documents from the index

In [None]:
# Baseline pipeline: the only transformation is the embedding model.
llama_index_pipeline = IngestionPipeline(transformations=[Settings.embed_model])

llama_index_nodes = llama_index_pipeline.run(
    show_progress=True,
    documents=llama_index_documents_split,
)
print(f"chunks ingested into index: {len(llama_index_nodes)}")

# Persist the pipeline state to a local directory.
llama_index_pipeline.persist("sam_test_pipeline_cache")

# In-memory vector index over the pipeline output, plus a top-k retriever on it.
llama_index_index = VectorStoreIndex(nodes=llama_index_nodes)
llama_index_retriever = llama_index_index.as_retriever(
    similarity_top_k=LLAMA_INDEX_RETRIEVAL_TOP_K,
)

### 3b) Create retriever-enriched from content
* create an ingestion pipeline
* execute the pipeline
* save the cache of the pipeline to disk
* move the pipeline output into an in memory index
* create a retriever to retrieve documents from the index

In [None]:
# Enriched pipeline: contextual enrichment runs first, then the embedding model.
llama_index_pipeline_enriched = IngestionPipeline(
    transformations=[LlamaIndexContextualEnrichment(), Settings.embed_model],
)

llama_index_nodes_enriched = llama_index_pipeline_enriched.run(
    show_progress=True,
    documents=llama_index_documents_split,
)
print(f"chunks ingested into index (enriched): {len(llama_index_nodes_enriched)}")

# Persist the pipeline state to a local directory.
llama_index_pipeline_enriched.persist("sam_test_pipeline_cache_enriched")

# In-memory vector index over the enriched output, plus a top-k retriever on it.
llama_index_index_enriched = VectorStoreIndex(nodes=llama_index_nodes_enriched)
llama_index_retriever_enriched = llama_index_index_enriched.as_retriever(
    similarity_top_k=LLAMA_INDEX_RETRIEVAL_TOP_K,
)

### 3c) Create retriever-enriched-bm25 from content
* using enriched index, create a BM25 retriever

In [None]:
# Lexical (BM25) retriever built from the enriched index, using English stemming.
llama_index_retriever_bm25 = BM25Retriever.from_defaults(
    index=llama_index_index_enriched,
    language="english",
    stemmer=Stemmer.Stemmer("english"),
    similarity_top_k=LLAMA_INDEX_RETRIEVAL_TOP_K,
)

### 3d) Create retriever-enriched-bm25-with-reranker from content
* using both the enriched retriever AND the BM25 retriever, create a hybrid retriever that also adds re-ranking capabilities

In [None]:
# Hybrid retriever: enriched vector search + BM25, reranked with ColBERT.
llama_index_embedding_bm25_retriever_reranker = LlamaIndexEmbeddingBM25RerankerRetriever(
    vector_retriever=llama_index_retriever_enriched,
    bm25_retriever=llama_index_retriever_bm25,
    reranker=colbert_reranker,
)

### 4) Generate Testing Data
* setup asyncio for async processing
* iterate over split documents, and generate "n" sample questions based on the content

In [None]:
# Patch asyncio so code that starts its own event loop can run inside the
# notebook's already-running loop.
nest_asyncio.apply()

# Build the evaluation dataset: the LLM is asked to write
# LLAMA_INDEX_SAMPLE_DATA_NUM_QUESTIONS_PER_CHUNK questions for each chunk.
# {context_str} and {num_questions_per_chunk} are template placeholders
# (presumably filled in per chunk by generate_question_context_pairs).
llama_index_dataset_qa = generate_question_context_pairs(
    nodes=llama_index_documents_split,
    num_questions_per_chunk=LLAMA_INDEX_SAMPLE_DATA_NUM_QUESTIONS_PER_CHUNK,
    qa_generate_prompt_tmpl="""
        ##Context information
        ---------------------
        {context_str}
        ---------------------

        ##Your role
        You are an expert financial analyst specialized in creating evaluation questions for RAG systems, 
        with deep knowledge of how to evaluate document retrieval systems using a given Amazon shareholders letter context.

        ##Task
        A web-crawler retrieved the provided context information from the aboutamazon.com website, which hosts Amazon shareholders letters.
        Your task is to analyze this context and setup {num_questions_per_chunk} questions for an upcoming evaluation. 
        Restrict the questions to the content from the provided context information.

        ##Output Requirements
        - Generate exactly {num_questions_per_chunk} questions
        - Each question must be answerable solely from the provided context
        - Questions must be self-contained without requiring external knowledge
        - Present only the questions with no additional text
        - Number each question
        - Each question should be on a new line
        - Generate questions based solely on the actual content of the shareholders letter, ignoring any website navigation elements, headers, 
        footers, or menu items. Focus only on substantive information from the letter itself.
    """,
    llm=Settings.llm
)

### 5a) Evaluate retriever-original

In [None]:
# Evaluate the baseline (non-enriched) vector retriever on the generated
# question/context pairs using the configured metrics.
llama_index_evaluator = RetrieverEvaluator.from_metric_names(
    retriever=llama_index_retriever,
    metric_names=LLAMA_INDEX_RETRIEVAL_EVALUATION_METRICS
)
# Top-level await: this cell is meant to run in a notebook, not as a plain script.
results = await llama_index_evaluator.aevaluate_dataset(llama_index_dataset_qa)

### 5b) Evaluate retriever-enriched

In [None]:
# Evaluate the vector retriever built on the context-enriched index, using the
# same dataset and metrics as the baseline.
llama_index_evaluator_enriched = RetrieverEvaluator.from_metric_names(
    retriever=llama_index_retriever_enriched,
    metric_names=LLAMA_INDEX_RETRIEVAL_EVALUATION_METRICS
)
# Top-level await: this cell is meant to run in a notebook, not as a plain script.
results_enriched = await llama_index_evaluator_enriched.aevaluate_dataset(llama_index_dataset_qa)

### 5c) Evaluate retriever-enriched-bm25

In [None]:
# Evaluate the BM25 retriever (built from the enriched index), using the same
# dataset and metrics as the baseline.
llama_index_evaluator_bm25 = RetrieverEvaluator.from_metric_names(
    retriever=llama_index_retriever_bm25,
    metric_names=LLAMA_INDEX_RETRIEVAL_EVALUATION_METRICS
)
# Top-level await: this cell is meant to run in a notebook, not as a plain script.
results_bm25 = await llama_index_evaluator_bm25.aevaluate_dataset(llama_index_dataset_qa)

### 5d) Evaluate retriever-enriched-bm25-with-reranker

In [None]:
# Evaluate the hybrid retriever (enriched vector search + BM25 + ColBERT
# reranking), using the same dataset and metrics as the baseline.
llama_index_evaluator_hybrid_search = RetrieverEvaluator.from_metric_names(
    retriever=llama_index_embedding_bm25_retriever_reranker,
    metric_names=LLAMA_INDEX_RETRIEVAL_EVALUATION_METRICS
)
# Top-level await: this cell is meant to run in a notebook, not as a plain script.
results_hybrid = await llama_index_evaluator_hybrid_search.aevaluate_dataset(llama_index_dataset_qa)

### 6) Print Results
* using all the evaluations generated earlier, combine into a single dataframe and print

In [None]:
# One summary row per retriever, stacked into a single comparison table.
pd.concat(
    [
        llama_index_evaluation_print_results(retriever_label, retriever_results)
        for retriever_label, retriever_results in (
            ("retriever-original", results),
            ("retriever-enriched", results_enriched),
            ("retriever-enriched-bm25", results_bm25),
            ("retriever-enriched-hybrid", results_hybrid),
        )
    ],
    axis=0,
    ignore_index=True,
)