<a href="https://colab.research.google.com/github/tarunku/open_llm/blob/main/HF_Expert_Knowledge_Worker_04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qU --upgrade fsspec==2025.3.0

In [None]:
# Quietly install the notebook's third-party dependencies.
# NOTE(review): `faiss-gpu-cu11` looks like a CUDA 11 build — confirm it matches the runtime's CUDA version.
!pip install -q torch transformers accelerate bitsandbytes langchain sentence-transformers faiss-gpu-cu11 openpyxl pacmap datasets langchain-community ragatouille

## Very Important -  Restart Session

In [None]:
# Imports
import os
import glob
import torch

from google.colab import userdata
from google.colab import drive

from huggingface_hub import login

from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, AutoModelForSpeechSeq2Seq, pipeline
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

## Knowledge Base

In [None]:
# Mount Google Drive so files under /content/drive become reachable from this runtime
drive.mount(mountpoint="/content/drive")

In [None]:
# Authenticate with the Hugging Face Hub using the HF_TOKEN entry from Colab's user data
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

## **Overall Workflow**
1. **Load and chunk markdown documents** from a specified directory.
2. **Convert text chunks into numerical embeddings** using a pre-trained model.
3. **Store/retrieve embeddings in a FAISS vector database** for efficient similarity search.


1. **Load files**  
   - Expands the glob pattern `raw_knowledge_base_path` into sub-folders and reads every file in them that matches the file filter (e.g. `**/*.md`).
   - Uses `DirectoryLoader` from **LangChain** to load these documents.
   - Attaches metadata (`doc_type`) to identify which subfolder a document belongs to.

2. **Split documents into chunks**  
   - Uses `RecursiveCharacterTextSplitter` to break documents into chunks of ~1000 characters.
   - Overlaps 100 characters between chunks to preserve context.
   - Uses **Markdown-specific separators** (e.g., headings `#`, code blocks `` ``` ``).
   - Returns the chunked documents.

In [None]:
# Splits documents into smaller chunks for embedding and retrieval.

def chunking(raw_knowledge_base_path , filter):
  """Load documents from the knowledge-base folders and split them into chunks.

  Args:
    raw_knowledge_base_path: Glob pattern; every directory it matches is
      treated as one knowledge-base folder (e.g. ``".../knowledge-base/*"``).
    filter: Glob pattern, relative to each folder, selecting the files to
      load (e.g. ``"**/*.md"``). The name shadows the builtin but is kept
      so existing callers are unaffected.

  Returns:
    The list of document chunks produced by the text splitter. Each chunk's
    metadata carries ``doc_type`` (name of the folder it came from) and,
    because ``add_start_index=True``, its start offset in the source document.
  """
  text_loader_kwargs = {'encoding': 'utf-8'}

  # glob may also match plain files, which DirectoryLoader rejects; keep
  # directories only. Sorting makes the document order reproducible.
  folders = sorted(
      path for path in glob.glob(raw_knowledge_base_path) if os.path.isdir(path)
  )

  documents = []
  for folder in folders:
      # normpath drops a trailing slash so basename never comes back empty
      doc_type = os.path.basename(os.path.normpath(folder))
      loader = DirectoryLoader(folder, glob=filter, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
      for doc in loader.load():
          doc.metadata["doc_type"] = doc_type
          documents.append(doc)

  # Hierarchical list of separators tailored for splitting Markdown documents,
  # taken from LangChain's MarkdownTextSplitter class. Several entries are
  # regular expressions (e.g. "#{1,6}" means one to six '#' characters).
  MARKDOWN_SEPARATORS = [
      "\n#{1,6} ",
      "```\n",
      "\n\\*\\*\\*+\n",
      "\n---+\n",
      "\n___+\n",
      "\n\n",
      "\n",
      " ",
      "",
  ]

  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=1000,  # Maximum number of characters in a chunk (chosen arbitrarily)
      chunk_overlap=100,  # Number of characters shared between consecutive chunks
      add_start_index=True,  # Record each chunk's start index in its metadata
      strip_whitespace=True,  # Strip leading/trailing whitespace from every chunk
      separators=MARKDOWN_SEPARATORS,
      # The separators above are regex patterns; without this flag they are
      # escaped and matched literally, so the heading/rule patterns never fire.
      is_separator_regex=True,
  )

  return text_splitter.split_documents(documents)


## **Embedding and Storing in a Vector Database**
### **Purpose:**  
Converts text chunks into vector embeddings and stores them for similarity search.

### **Process:**
1. **Define Embedding Model**  
   - Uses `HuggingFaceEmbeddings` with the `thenlper/gte-small` model.
   - Runs embeddings on **GPU (`cuda`)**.
   - Normalizes embeddings for **cosine similarity**.

2. **Check for Existing Vector Database**  
   - If the FAISS vector database directory (`VECTOR_DB_PATH`) exists, it loads it.
   - Otherwise, it:
     - Calls `chunking(RAW_KB_PATH, RAW_KB_DOC_FILTER)` to get document chunks.
     - Converts chunks into embeddings.
     - Stores the vector database locally.

In [None]:
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

VECTOR_DB_PATH = "/content/drive/MyDrive/__tmp/llms/knowledge_vector_db"  # Path to store the database
RAW_KB_PATH = "/content/drive/MyDrive/__tmp/llms/knowledge-base/*"  # Path to folder where files are stored
RAW_KB_DOC_FILTER = "**/*.md"  # Only Markdown files are loaded from each knowledge-base folder

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

# Reuse the persisted index when one exists; otherwise build it from the raw
# knowledge base and persist it for later sessions.
if os.path.exists(VECTOR_DB_PATH):
    print("Loading existing vector database...")
    # The docstore is stored with pickle, hence the explicit opt-in; only load
    # indexes this notebook created. The distance strategy is passed again so a
    # reloaded store is configured like a freshly built one.
    KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
        VECTOR_DB_PATH,
        embedding_model,
        allow_dangerous_deserialization=True,
        distance_strategy=DistanceStrategy.COSINE,
    )
else:
    print("Creating new vector database...")
    KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
        chunking(RAW_KB_PATH, RAW_KB_DOC_FILTER),
        embedding_model,
        distance_strategy=DistanceStrategy.COSINE,
    )
    print("Saving vector database to Google Drive...")
    KNOWLEDGE_VECTOR_DATABASE.save_local(VECTOR_DB_PATH)
    print("Vector database saved successfully!")


- This code **embeds a user query** and retrieves the **most relevant documents** using FAISS.
- **How it works**:
  1. Converts the query into a vector embedding.
  2. Searches for the **top 5 closest matches** in the vector database.
  3. Returns the **most relevant document chunks**.
  4. Measures **retrieval speed**.
  5. Prints metadata of retrieved documents.
- **Use Cases**:
  - AI-powered **search and recommendation systems**.
  - **Chatbots** that provide information based on stored knowledge.
  - **Automated document retrieval** in knowledge bases.


In [None]:
# Embed a user query in the same space and time the similarity search
import time

user_query = "Who is Lancaster?"

# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; wall-clock time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=5)
end_time = time.perf_counter()

retrieval_time = end_time - start_time  # Elapsed seconds spent in the search call

print(f"Retrieval time: {retrieval_time:.4f} seconds")

# One metadata dict per retrieved chunk, one per line
print('\n'.join(str(doc.metadata) for doc in retrieved_docs))

- Creates a **text-generation pipeline** for **efficient model inference**.
- **Key Parameters:**
  - **`task="text-generation"`** → Specifies the type of model.
  - **`do_sample=True`** → Uses random sampling instead of greedy decoding.
  - **`temperature=0.2`** → Low value makes responses more deterministic.
  - **`repetition_penalty=1.1`** → Prevents repetition by penalizing repeated words.
  - **`return_full_text=False`** → Only returns generated text (excludes input prompt).
  - **`max_new_tokens=500`** → Limits generation length to 500 tokens.
  
This function **loads a quantized text-generation model** using **Hugging Face Transformers** and **bitsandbytes (bnb) for efficient inference**.  

It sets up a **causal language model (CLM)** with optimized **4-bit quantization** to reduce memory usage while maintaining accuracy. The function **returns a text-generation pipeline and tokenizer**, ready to generate text.

- **Quantization** reduces memory and speeds up inference, especially for large models.
- **Key settings:**
  - **`load_in_4bit=True`** → Loads the model in **4-bit** mode, significantly reducing VRAM usage.
  - **`bnb_4bit_use_double_quant=True`** → Uses **double quantization** for better compression.
  - **`bnb_4bit_quant_type="nf4"`** → Uses **NF4 (4-bit NormalFloat)**, a 4-bit data type designed for normally distributed model weights.
  - **`bnb_4bit_compute_dtype=torch.bfloat16`** → Runs computations in **bfloat16**, which has a wider dynamic range than float16 and is therefore less prone to overflow.

🚀 **Why 4-bit quantization?**  
- Reduces memory usage (helps run large models on consumer GPUs).  
- Allows **faster inference** without major accuracy loss.

In [None]:

def load_pipeline(READER_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"):
  """Load a 4-bit quantized causal LM and wrap it in a text-generation pipeline.

  Args:
    READER_MODEL_NAME: Hugging Face model identifier of the reader LLM.

  Returns:
    A ``(pipeline, tokenizer)`` tuple: the text-generation pipeline built
    around the quantized model, and the tokenizer loaded for the same model.
  """
  # Imported locally under an alias: the notebook rebinds the global name
  # `pipeline` to the object returned by this function, so looking the factory
  # up through that global would break on a second call.
  from transformers import pipeline as build_pipeline

  bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
  )
  model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, quantization_config=bnb_config)
  tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

  text_generator = build_pipeline(
      model=model,
      tokenizer=tokenizer,
      task="text-generation",
      do_sample=True,
      temperature=0.2,
      repetition_penalty=1.1,
      return_full_text=False,  # Return only the newly generated text, not the prompt
      max_new_tokens=500,
  )
  return text_generator, tokenizer

In [None]:
# Reader model selection. Alternatives that can be swapped in:
#   "HuggingFaceH4/zephyr-7b-beta"
#   "meta-llama/Meta-Llama-3.1-8B-Instruct"
pipeline, tokenizer = load_pipeline(READER_MODEL_NAME="Qwen/Qwen2-7B-Instruct")


In [None]:
# Chat messages for the RAG prompt; `{context}` and `{question}` are
# placeholders filled in later with str.format.
prompt_in_chat_format = [
    dict(
        role="system",
        content="\n".join(
            [
                "Using the information contained in the context,",
                "give a comprehensive answer to the question.",
                "Respond only to the question asked, response should be concise and relevant to the question.",
                "Provide the number of the source document when relevant.",
                "If the answer cannot be found from the context, then do not give an answer, but say i dont know.",
            ]
        ),
    ),
    dict(
        role="user",
        content="\n".join(
            [
                "Context:",
                "{context}",
                "---",
                "Now here is the question you need to answer.",
                "",
                "Question: {question}",
            ]
        ),
    ),
]

# Render the messages with the reader model's chat template, as a string
# (not token ids) that ends with the generation prompt.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format,
    tokenize=False,
    add_generation_prompt=True,
)

The `answer_with_rag` function **answers a user query using a Retrieval-Augmented Generation (RAG) pipeline** by:  

1. **Retrieving relevant documents** from a **FAISS vector database**.  
2. **Optionally reranking** the retrieved documents using a **pretrained reranker model** (e.g. ColBERT loaded through RAGatouille).  
3. **Building a final prompt** by including the retrieved documents.  
4. **Generating an answer** using a **text-generation model (LLM)**.  

### **Key Steps**
1. **Retrieve Documents** → Finds `num_retrieved_docs` most relevant documents using FAISS.  
2. **Rerank (Optional)** → If a `reranker` model is provided, it refines the ranking.  
3. **Format the Prompt** → Constructs a final prompt using retrieved documents.  
4. **Generate Answer** → Passes the prompt to the **LLM** for response generation.  
5. **Return** → The function outputs the **generated answer** and the **final set of documents used**.  


In [None]:
from ragatouille import RAGPretrainedModel
from typing import Optional, List, Tuple
from transformers import AutoTokenizer, Pipeline
from tqdm.notebook import tqdm
from tqdm.notebook import tqdm
import pandas as pd
from datasets import Dataset
import matplotlib.pyplot as plt
from langchain.docstore.document import Document as LangchainDocument

def answer_with_rag(
    question: str,
    llm: Pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 5,
    num_docs_final: int = 5,
) -> Tuple[str, List[str]]:
    """Answer ``question`` from documents retrieved out of ``knowledge_index``.

    Args:
        question: The user question.
        llm: Text-generation pipeline used to produce the answer.
        knowledge_index: Vector store queried with a similarity search.
        reranker: Optional reranker; when given, the retrieved texts are
            reordered by it before the prompt is built.
        num_retrieved_docs: Number of documents requested from the index.
        num_docs_final: Maximum number of documents placed in the prompt.

    Returns:
        A tuple ``(answer, relevant_docs)`` where ``answer`` is the generated
        text and ``relevant_docs`` is the list of document texts that were
        included in the prompt.
    """
    # Gather documents with retriever
    print("=> Retrieving documents...")
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in retrieved]  # Keep only the text

    # Optionally rerank results; skipped when nothing was retrieved
    if reranker is not None and relevant_docs:
        print("=> Reranking documents...")
        # Never ask the reranker for more results than there are candidates
        top_k = min(num_docs_final, len(relevant_docs))
        reranked = reranker.rerank(question, relevant_docs, k=top_k)
        relevant_docs = [doc["content"] for doc in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt; documents are newline-separated so one document's
    # text does not run straight into the next "Document N:::" header
    context = "\nExtracted documents:\n"
    context += "\n".join(f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs))

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate the answer
    print("=> Generating answer...")
    answer = llm(final_prompt)[0]["generated_text"]

    return answer, relevant_docs

In [None]:
# In-domain question answered through retrieval, ColBERT reranking and generation
question = "what Emily Carter was doing during 2019-2021?"

RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

answer, relevant_docs = answer_with_rag(
    question=question,
    llm=pipeline,
    knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
    reranker=RERANKER,
)


In [None]:
# Display the generated answer as the cell's output
answer

In [None]:
# Question about the capital of India, run through the same RAG call
question = "what is the capital city of India?"

# Reuse the reranker if an earlier cell already loaded it; loading the ColBERT
# checkpoint a second time only costs time and memory.
if "RERANKER" not in globals():
    RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

answer, relevant_docs = answer_with_rag(question, pipeline, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER)


In [None]:
# Display the generated answer as the cell's output
answer