In [5]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from qdrant_client import models
from qdrant_client import QdrantClient
from tqdm.auto import tqdm


In [1]:
from datasets import load_dataset

# Step 1: Load the SQuAD dataset
dataset = load_dataset("squad")

# Step 2: Extract unique contexts from the dataset
data = [item["context"] for item in dataset["train"]]
texts = list(set(data))

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 190948.72 examples/s]
Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 165987.60 examples/s]


In [3]:
def batch_iterate(lst, batch_size):
    for i in range(0, len(lst), batch_size):
        yield lst[i : i + batch_size]

In [6]:
class EmbedData:

    def __init__(self, 
                 embed_model_name="nomic-ai/nomic-embed-text-v1.5",
                 batch_size=32):
        
        self.embed_model_name = embed_model_name
        self.embed_model = self._load_embed_model()
        self.batch_size = batch_size
        self.embeddings = []

    def _load_embed_model(self):
        embed_model = HuggingFaceEmbedding(model_name=self.embed_model_name,
                                           trust_remote_code=True,
                                           cache_folder='./hf_cache')
        return embed_model
    
    def generate_embedding(self, context):
        return self.embed_model.get_text_embedding_batch(context)
    
    def embed(self, contexts):
        self.contexts = contexts
        
        for batch_context in tqdm(batch_iterate(contexts, self.batch_size),
                                  total=len(contexts)//self.batch_size,
                                  desc="Embedding data in batches"):
                                  
            batch_embeddings = self.generate_embedding(batch_context)
            
            self.embeddings.extend(batch_embeddings)

In [8]:
batch_size = 32

embeddata = EmbedData(batch_size=batch_size)

embeddata.embed(texts)

!!!!!!!!!!!!megablocks not available, using torch.matmul instead
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
<All keys matched successfully>
Embedding data in batches:  22%|██▏       | 132/590 [13:32:51<47:00:22, 369.48s/it]    


KeyboardInterrupt: 

In [9]:
import pickle
batch_size = 32

embeddata = EmbedData(batch_size=batch_size)

with open('embeddings_and_contexts.pkl', 'rb') as f:
    a, b = pickle.load(f)

embeddata.embeddings = a[:]
embeddata.contexts = b[:]

<All keys matched successfully>


In [31]:
# embeddata.contexts[0], embeddata.embeddings[0]

In [None]:
class QdrantVDB:

    def __init__(self, collection_name, vector_dim=768, batch_size=512):
        self.collection_name = collection_name
        self.batch_size = batch_size
        self.vector_dim = vector_dim

    def define_client(self):
        self.client = QdrantClient(url="http://localhost:6333",
                                   prefer_grpc=True)
        
    def create_collection(self):
        
        if not self.client.collection_exists(collection_name=self.collection_name):

            self.client.create_collection(collection_name=self.collection_name,
                                          
                                          vectors_config=models.VectorParams(
                                                              size=self.vector_dim,
                                                              distance=models.Distance.DOT,
                                                              on_disk=True),
                                          
                                          optimizers_config=models.OptimizersConfigDiff(
                                                                            default_segment_number=5,
                                                                            indexing_threshold=0)
                                         )
    
    def ingest_data(self, embeddata):
    
        for batch_context, batch_embeddings in tqdm(zip(batch_iterate(embeddata.contexts, self.batch_size), 
                                                        batch_iterate(embeddata.embeddings, self.batch_size)), 
                                                    total=len(embeddata.contexts)//self.batch_size, 
                                                    desc="Ingesting in batches"):
        
            self.client.upload_collection(collection_name=self.collection_name,
                                        vectors=batch_embeddings,
                                        payload=[{"context": context} for context in batch_context])

        self.client.update_collection(collection_name=self.collection_name,
                                    optimizer_config=models.OptimizersConfigDiff(indexing_threshold=20000),
                                    quantization_config=models.BinaryQuantization(
                                                        binary=models.BinaryQuantizationConfig(always_ram=True)),
                                         )
        

In [24]:
database = QdrantVDB("squad_collection")
database.define_client()
database.create_collection()
database.ingest_data(embeddata)

Ingesting in batches: 37it [00:07,  4.67it/s]                        


In [None]:
import time

class Retriever:

    def __init__(self, vector_db, embeddata):
        
        self.vector_db = vector_db
        self.embeddata = embeddata
    
    def search(self, query):
        query_embedding = self.embeddata.embed_model.get_query_embedding(query)
        
        # Start the timer
        start_time = time.time()
        
        result = self.vector_db.client.search(
            collection_name=self.vector_db.collection_name,
            
            query_vector=query_embedding,
            
            search_params=models.SearchParams(
                quantization=models.QuantizationSearchParams(
                    ignore=False,
                    rescore=True,
                    oversampling=2.0,
                )
            ),
            
            timeout=1000,
        )
        
        # End the timer
        end_time = time.time()
        elapsed_time = end_time - start_time

        print(f"Execution time for the search: {elapsed_time:.4f} seconds")

        return result

In [26]:
from llama_index.llms.ollama import Ollama

class RAG:

    def __init__(self,
                 retriever,
                 llm_name="llama3.2:1b"):
        
        self.llm_name = llm_name
        self.llm = self._setup_llm()
        self.retriever = retriever
        self.qa_prompt_tmpl_str = """Context information is below.
                                     ---------------------
                                     {context}
                                     ---------------------
                                     
                                     Given the context information above I want you
                                     to think step by step to answer the query in a
                                     crisp manner, incase case you don't know the
                                     answer say 'I don't know!'
                                     
                                     ---------------------
                                     Query: {query}
                                     ---------------------
                                     Answer: """
    
    def _setup_llm(self):
        return Ollama(model=self.llm_name)
    
    def generate_context(self, query):
    
        result = self.retriever.search(query)
        context = [dict(data) for data in result]
        combined_prompt = []

        for entry in context:
            context = entry["payload"]["context"]

            combined_prompt.append(context)

        return "\n\n---\n\n".join(combined_prompt)
    
    def query(self, query):
        context = self.generate_context(query=query)
        
        prompt = self.qa_prompt_tmpl_str.format(context=context,
                                                query=query)
        
        response = self.llm.complete(prompt)
        
        return dict(response)['text']

In [27]:
retriever = Retriever(database, embeddata)

rag = RAG(retriever)

In [30]:
query = """The premium and VIP services in Airports
           are reserved for which type of passengers?"""

answer = rag.query(query)

  result = self.vector_db.client.search(


Execution time for the search: 0.4293 seconds


ReadTimeout: timed out

In [None]:
from IPython.display import Markdown, display

display(Markdown(str(answer)))