Setup and Configuration : Weaviate Connection with Huggingface

For making use of weaviate, check this link (https://docs.weaviate.io/weaviate/model-providers/huggingface/embeddings)

In [1]:
import os
import weaviate
# from langchain_community.vectorstores import Weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure
import os

# Best practice: store your credentials in environment variables
weaviate_url = os.environ["WEAVIATE_URL"]
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

# Recommended: save sensitive data as environment variables
huggingface_key = os.getenv("HUGGINGFACE_APIKEY")
headers = {
    "X-HuggingFace-Api-Key": huggingface_key,
}

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,                       # `weaviate_url`: your Weaviate URL
    auth_credentials=Auth.api_key(weaviate_api_key),      # `weaviate_key`: your Weaviate API key
    headers=headers
)

# Work with Weaviate
client.is_ready()

# client.close()

True

In [2]:
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 

Data Ingestion

<pre style="font-size: 12px;">
PDFs   → Text Extraction   →  Chunking   →    Embedding     →   Weaviate Storage
  ↓           ↓                 ↓              ↓                    ↓
[PDF1,2,3] → [Text1,2,3] → [Chunk1,2,3...] → [Vector1,2,3...] → Collection
</pre>

Document Processing : PDF Loading

In [3]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

loader = loader = PyPDFDirectoryLoader(
    path = "./pdfs/",
    glob = "**/[!.]*.pdf",
    mode = "page",
    headers = None,
    extraction_mode = "plain",
    # extraction_kwargs = None,
)
docs = loader.load()
docs


[Document(metadata={'producer': 'Acrobat Distiller 20.0 (Windows); modified using iText® Core 8.0.2 (AGPL version) ©2000-2023 Apryse Group NV', 'creator': 'FrameMaker 16.0.1', 'creationdate': '2024-08-22T09:07:13+00:00', 'author': 'Sebastian Raschka', 'moddate': '2025-01-17T02:08:30-05:00', 'title': 'Build a Large Language Model (From Scratch)', 'source': 'pdfs\\Build_a_Large_Language_Model.pdf', 'total_pages': 370, 'page': 0, 'page_label': 'Build a Large Language Model (From Scratch)'}, page_content='MANNING\nSebastian Raschka\nFROMSCRATCH\nBUILD A'),
 Document(metadata={'producer': 'Acrobat Distiller 20.0 (Windows); modified using iText® Core 8.0.2 (AGPL version) ©2000-2023 Apryse Group NV', 'creator': 'FrameMaker 16.0.1', 'creationdate': '2024-08-22T09:07:13+00:00', 'author': 'Sebastian Raschka', 'moddate': '2025-01-17T02:08:30-05:00', 'title': 'Build a Large Language Model (From Scratch)', 'source': 'pdfs\\Build_a_Large_Language_Model.pdf', 'total_pages': 370, 'page': 1, 'page_labe

Document Processing : Text Chunking

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
chunks = text_splitter.split_documents(docs)
len(chunks)

931

Vector Database setup : Weaviate (Collection creation and Schema definition)

In [5]:
# Delete the collection if it exists
client.collections.delete("Chatbot")

# Create the collection with the new API
client.collections.create(
                name="Chatbot",
                properties=[
                    weaviate.classes.config.Property(
                        name="content", data_type=weaviate.classes.config.DataType.TEXT
                    ),
                    weaviate.classes.config.Property(
                        name="source", data_type=weaviate.classes.config.DataType.TEXT
                    ),
                ],
                vector_config=[
                    Configure.Vectors.text2vec_huggingface(
                        name="content_vector",
                        source_properties=["content"],
                        model="sentence-transformers/all-MiniLM-L6-v2", #Embeddings model
                        wait_for_model=True,
                        use_cache=True,
                    )
                ],
            )

print("Created Document collection")


Created Document collection


Custom Vectorstore implementation (As we are using version 4)

In [9]:
from langchain.schema import BaseRetriever, Document
from langchain_community.vectorstores import VectorStore
from typing import List, Optional, Any
from pydantic import Field

class WeaviateV4Retriever(BaseRetriever):
    """Custom retriever for Weaviate v4"""
    vectorstore: Any = Field(description="The vectorstore to use")
    search_kwargs: dict = Field(default_factory=dict, description="Search kwargs")

# class WeaviateV4Retriever(BaseRetriever):
#     def __init__(self, vectorstore, search_kwargs):
#         self.vectorstore = vectorstore
#         self.search_kwargs = search_kwargs
    
    def _get_relevant_documents(self, query: str) -> List[Document]:  # Changed method name
        k = self.search_kwargs.get("k", 4)
        return self.vectorstore.similarity_search(query, k)

class WeaviateV4VectorStore(VectorStore):
    def __init__(self, client, collection_name):
        self.client = client
        self.collection_name = collection_name
        self.collection = client.collections.get(collection_name)
    
    @classmethod
    def from_texts(cls, texts, embedding=None, metadatas=None, **kwargs):
        """Required method for VectorStore inheritance"""
        client = kwargs.get("client")
        collection_name = kwargs.get("collection_name")
        
        if not client or not collection_name:
            raise ValueError("client and collection_name are required")
        
        vectorstore = cls(client, collection_name)
        
        # Create documents from texts
        documents = []
        for i, text in enumerate(texts):
            metadata = metadatas[i] if metadatas else {}
            doc = Document(page_content=text, metadata=metadata)
            documents.append(doc)
        
        vectorstore.add_documents(documents)
        return vectorstore
    
    def add_documents(self, documents):
        """Add documents to Weaviate"""
        data_objects = []
        for doc in documents:
            data_obj = {
                "content": doc.page_content,
                "source": doc.metadata.get("source", "unknown"),
            }
            data_objects.append(data_obj)
        
        self.collection.data.insert_many(data_objects)
        return self
    
    def similarity_search(self, query, k=4, **kwargs):
        """Search for similar documents"""
        response = self.collection.query.near_text(
            query=query,
            limit=k,
            return_properties=["content", "source"]
        )
        
        documents = []
        for obj in response.objects:
            doc = Document(
                page_content=obj.properties.get("content", ""),
                metadata={"source": obj.properties.get("source", "")}
            )
            documents.append(doc)
        
        return documents
    
    def as_retriever(self, search_kwargs=None):
        """Return as LangChain retriever"""
        return WeaviateV4Retriever(
            vectorstore=self,
            search_kwargs=search_kwargs or {}
        )

# Usage
vectordb = WeaviateV4VectorStore(client, "Chatbot")
vectordb.add_documents(chunks)
query = "What is Causal Attention?"
vectordb.similarity_search(query, k=20)


[Document(metadata={'source': 'pdfs\\Build_a_Large_Language_Model.pdf'}, page_content='score to all other tokens.\nFigure 6.12 The causal attention \nmechanism, where the attention scores \nbetween input tokens are displayed in a \nmatrix format. The empty cells indicate \nmasked positions due to the causal attention \nmask, preventing tokens from attending to \nfuture tokens. The values in the cells \nrepresent attention scores; the last token, \ntime, is the only one that computes \nattention scores for all preceding tokens.\nLicensed to Gowtham Arulmozhi <arulmozg@oregonstate.edu>'),
 Document(metadata={'source': 'pdfs\\Build_a_Large_Language_Model.pdf'}, page_content='with trainable weights.\nIn this section, we extended\nthe self-attention mechanism\nwith a causal mask and\ndropout mask.\nIn the next section, we\nextend causal attention\nto multi-head attention.\nFigure 3.23 Here’s what we’ve done so far. We began with a simplified attention mechanism, added trainable \nweights, a

Prompt Template

In [10]:
from langchain.prompts import ChatPromptTemplate

system_prompt = (
    "Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Use three sentence maximum and keep the answer concise. "
    "Context: {context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

Query Processing

<pre style="font-size: 12px;">
User Question → Query Embedding → Vector Search → Top-k Docs → DocuFormat → Prompt Template →  LLM    →   Answer
     ↓               ↓               ↓              ↓          ↓            ↓                   ↓           ↓
"What is ML?" → [0.1,0.3,...] → Similarity → [Doc1,2,3] → "doc1\ndoc2..." → "Use context..." → "ML is..." → "Final answer"
</pre>



In [None]:
# Use with LangChain chains
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_groq import ChatGroq

llm = ChatGroq(model="mistral-saba-24b")


retriever = vectordb.as_retriever()

question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)

query = "What is Causal Attention?"
chain.invoke({"input": query})


{'input': 'What is Causal Attention?',
 'context': [Document(metadata={'source': 'pdfs\\Build_a_Large_Language_Model.pdf'}, page_content='score to all other tokens.\nFigure 6.12 The causal attention \nmechanism, where the attention scores \nbetween input tokens are displayed in a \nmatrix format. The empty cells indicate \nmasked positions due to the causal attention \nmask, preventing tokens from attending to \nfuture tokens. The values in the cells \nrepresent attention scores; the last token, \ntime, is the only one that computes \nattention scores for all preceding tokens.\nLicensed to Gowtham Arulmozhi <arulmozg@oregonstate.edu>'),
  Document(metadata={'source': 'pdfs\\Build_a_Large_Language_Model.pdf'}, page_content='score to all other tokens.\nFigure 6.12 The causal attention \nmechanism, where the attention scores \nbetween input tokens are displayed in a \nmatrix format. The empty cells indicate \nmasked positions due to the causal attention \nmask, preventing tokens from atte

In [14]:
query = "Explain masked multi-head attention??"
vectordb.similarity_search(query, k=20)

retriever = vectordb.as_retriever()

question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)

chain.invoke({"input": query})

{'input': 'Explain masked multi-head attention??',
 'context': [Document(metadata={'source': 'pdfs\\Build_a_Large_Language_Model.pdf'}, page_content='ever, the key insight is that when we renormalize the attention weights after masking,\nLicensed to Gowtham Arulmozhi <arulmozg@oregonstate.edu>'),
  Document(metadata={'source': 'pdfs\\Build_a_Large_Language_Model.pdf'}, page_content='ever, the key insight is that when we renormalize the attention weights after masking,\nLicensed to Gowtham Arulmozhi <arulmozg@oregonstate.edu>'),
  Document(metadata={'source': 'pdfs\\Build_a_Large_Language_Model.pdf'}, page_content='with trainable weights.\nIn this section, we extended\nthe self-attention mechanism\nwith a causal mask and\ndropout mask.\nIn the next section, we\nextend causal attention\nto multi-head attention.\nFigure 3.23 Here’s what we’ve done so far. We began with a simplified attention mechanism, added trainable \nweights, and then added a causal attention mask. Next, we will extend