In [1]:
from llama_parse import LlamaParse
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.response.pprint_utils import pprint_response
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.postprocessor import SimilarityPostprocessor

from dotenv import load_dotenv
import os

# Pull variables from a .env file into the process environment
load_dotenv()

# Credentials; each one is None when its variable is not set
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
LLAMAPARSER_API_KEY = os.getenv("LLAMAPARSER_API_KEY")

# LlamaParse client configured to return the parsed document as markdown
loader = LlamaParse(
    api_key=LLAMAPARSER_API_KEY,  # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type="markdown",  # "markdown" and "text" are available
    verbose=True,
)

In [2]:
# Patch asyncio via nest_asyncio before any document parsing is triggered.
# NOTE(review): presumably needed because loader.load_data() drives async code
# while the notebook kernel's own event loop is already running — confirm.
import nest_asyncio

nest_asyncio.apply()

In [3]:
# OpenAI embedding client used for chunk embeddings
embedding_model = OpenAIEmbedding(
    model="text-embedding-3-small",
    api_key=OPENAI_API_KEY,
)

In [4]:
# Send the PDF through the LlamaParse loader and keep the parsed documents
print("Uploading documents")
documents = loader.load_data(file_path="CAG_Research_Paper.pdf")
print("Successfully uploaded")

Uploading documents
Started parsing the file under job_id c3b412c2-30af-4326-83cc-6d4d9de04747
Successfully uploaded


In [5]:
# Replace each parsed document's metadata with a single "name" entry (the source file)
for document in documents:
    document.metadata = {"name": "CAG_Research_Paper.pdf"}

In [6]:
# Split the parsed documents into sentence-based chunks, carrying metadata into each node
splitter = SentenceSplitter(include_metadata=True)
nodes = splitter.get_nodes_from_documents(documents)

In [7]:
# Embed every non-empty chunk (text plus all metadata) and attach the vector to its node.
# The texts are sent through the embedding model's batch API instead of one request
# per node, which cuts the number of round trips to the embedding service.
nodes_to_embed = [node for node in nodes if node.text != ""]
node_embeddings = embedding_model.get_text_embedding_batch(
    [node.get_content(metadata_mode="all") for node in nodes_to_embed]
)
# The batch call returns one embedding per input text, in input order.
for node, node_embedding in zip(nodes_to_embed, node_embeddings):
    node.embedding = node_embedding

In [8]:
# Echo the full node list for inspection.
# NOTE(review): each node's repr includes its entire embedding vector, so this
# output is very large; consider clearing it before sharing the notebook.
nodes

[TextNode(id_='48f46bc5-82bd-4d61-a573-9b21d425c8e7', embedding=[0.00684560788795352, 0.020683323964476585, 0.0540723092854023, 0.013784443959593773, -0.005430538207292557, -0.006925517693161964, 0.014077446423470974, 0.004584826063364744, -0.038489896804094315, 0.034867316484451294, 0.013285007327795029, -0.05614996701478958, 0.012372703291475773, 0.022614479064941406, -0.004784600343555212, -0.018006345257163048, -0.007278452627360821, -0.0630754828453064, -0.01240599900484085, 0.004002150148153305, 0.017287157475948334, -0.01756684109568596, 0.006669140420854092, 0.008850011974573135, -0.026330284774303436, -0.04813234880566597, 0.024625541642308235, 0.046267785131931305, 0.021322602406144142, -0.022587841376662254, 0.019071809947490692, -0.03180410712957382, -0.060252003371715546, -0.019244948402047157, -0.013498100452125072, 0.07287775725126266, 0.016381513327360153, 0.0020293763373047113, 0.010574732907116413, 0.018965262919664383, -0.007771229837089777, -0.020536823198199272, -0

In [9]:
# Report how many chunks (nodes) the splitter produced
print(f"Document split into {len(nodes)} chunks.\n" )

Document split into 7 chunks.



In [10]:
# Peek at the raw text of the first chunk
nodes[0].text

'# Don’t Do RAG: When Cache-Augmented Generation is All You Need for Knowledge Tasks\n\nBrian J Chan∗ Hen-Hsen Huang Chao-Ting Chen∗ Insititue of Information Science Jui-Hung Cheng∗ Academia Sinica\n\narXiv:2412.15605v1 [cs.CL] 20 Dec 2024 Department of Computer Science National Chengchi University Taipei, Taiwan\n\nhhhuang@iis.sinica.edu.tw {110703065,110703038,110703007}@nccu.edu.tw\n\n# Abstract\n\nRetrieval-augmented generation (RAG) has gained traction as a powerful approach for enhancing language models by integrating external knowledge sources. However, RAG introduces challenges such as retrieval latency, potential errors in document selection, and increased system complexity. With the advent of large language models (LLMs) featuring significantly extended context windows, this paper proposes an alternative paradigm, cache-augmented generation (CAG) that bypasses real-time retrieval. Our method involves preloading all relevant resources, especially when the documents or knowledg

In [29]:
# Set the embedding model created earlier as the embed model on LlamaIndex's global Settings.
# NOTE(review): the execution counter jumps from In [10] to In [29] at this cell;
# verify the notebook still works with Restart Kernel -> Run All.
from llama_index.core import Settings
Settings.embed_model = embedding_model

In [30]:
from llama_index.core.indices.vector_store.base import VectorStoreIndex, StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

from qdrant_client import QdrantClient

# Qdrant connection settings are read from the environment (populated from .env by
# load_dotenv() in the first cell) so that no credential is stored in the notebook.
# SECURITY: an API key was previously hardcoded in this cell; treat that key as
# leaked and rotate it in the Qdrant Cloud console.
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
if not QDRANT_URL or not QDRANT_API_KEY:
    raise RuntimeError(
        "Set QDRANT_URL and QDRANT_API_KEY in the environment (or .env) before running this cell."
    )

qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

print(qdrant_client.get_collections())

# Back the index with the "cag_research_paper" collection on the Qdrant server
vector_store = QdrantVectorStore(client=qdrant_client, collection_name="cag_research_paper")
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Building the index from `nodes` writes them into the vector store. The index returned
# here is used directly; the earlier second assignment that immediately overwrote it was
# redundant. To reconnect to an already-populated collection without re-ingesting, use
# VectorStoreIndex.from_vector_store(vector_store=vector_store) instead.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
)

collections=[CollectionDescription(name='cag_research_paper')]


In [31]:

# List the collections currently present on the Qdrant server
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='cag_research_paper')]


In [32]:
# Show the attribute dict of the first collection description returned by the server
vars(qdrant_client.get_collections().collections[0])

{'name': 'cag_research_paper'}

In [33]:
# Echo the vector store's repr to inspect its configuration
vector_store

QdrantVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, collection_name='cag_research_paper', url=None, api_key=None, batch_size=64, parallel=1, max_retries=3, client_kwargs={}, enable_hybrid=False, index_doc_id=True, fastembed_sparse_model=None, text_key='text')

In [34]:
# Echo the index object to confirm it exists (only its default repr is displayed)
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x131f8438260>

In [35]:
from llama_index.core.response_synthesizers import ResponseMode, get_response_synthesizer
from llama_index.core.vector_stores.types import VectorStoreQueryMode
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.postprocessor import SimilarityPostprocessor

# Retrieval pipeline: vector retriever -> similarity cutoff -> compact response synthesis
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=15,
    vector_store_query_mode=VectorStoreQueryMode.DEFAULT,
    verbose=True,
)
postprocessor = SimilarityPostprocessor(similarity_cutoff=0.3)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[postprocessor],
    response_synthesizer=get_response_synthesizer(response_mode=ResponseMode.COMPACT),
)

In [36]:
# Ask a question, then show the synthesized answer together with its source nodes
query = "What is CAG?"
print(query)

results = query_engine.query(query)
pprint_response(results, show_source=True)
print(results)

What is CAG?
Final Response: CAG stands for cache-augmented generation. It is a
methodology proposed in the research paper that leverages long-context
large language models (LLMs) to preload all relevant documents in
advance and precompute key-value (KV) caches. This approach eliminates
the need for real-time retrieval during inference, reducing retrieval
latency, minimizing errors, and simplifying system architecture while
ensuring high-quality responses.
______________________________________________________________________
Source Node 1/6
Node ID: 48f46bc5-82bd-4d61-a573-9b21d425c8e7
Similarity: 0.44678912
Text: # Don’t Do RAG: When Cache-Augmented Generation is All You Need
for Knowledge Tasks  Brian J Chan∗ Hen-Hsen Huang Chao-Ting Chen∗
Insititue of Information Science Jui-Hung Cheng∗ Academia Sinica
arXiv:2412.15605v1 [cs.CL] 20 Dec 2024 Department of Computer Science
National Chengchi University Taipei, Taiwan  hhhuang@iis.sinica.edu.tw
{110703065...
___________________________