In [1]:
import fitz
import json
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from qdrant_client import QdrantClient, models
from qdrant_client.models import PointStruct
from together import Together
import PIL.Image
import os
import google.generativeai as genai
from pdf2image import convert_from_path
import time
import uuid  # Import the uuid module

# Ensure you have these environment variables set
TOGETHER_AI_API_KEY = "64880c44ef37384040dc253c954ed2f190c0e4702c3e80745e5eb78221f47376"
GOOGLE_API_KEY = "AIzaSyBe7hdWbsCf6kQmyoMAUXbOlr7p8v1Tjhk"

# Qdrant Configuration
QDRANT_PATH = "qdrant_data"  # Path to store Qdrant data locally
QDRANT_COLLECTION_NAME = "rag_chatbot_collection"

def load_prompt(prompt_file: Path) -> str:
    """Loads the prompt from the given text file."""
    try:
        with open(prompt_file, "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        print(f"Error: Prompt file not found at {prompt_file}. Using default prompt.")
        return ""
    except Exception as e:
        print(f"Error loading prompt from {prompt_file}: {e}. Using default prompt.")
        return ""

def extract_text_from_pdf(pdf_path: Path) -> dict:
    """
    Extracts text from a PDF file, page by page, using fitz.
    If fitz extraction fails or extracts less than 20 words, it falls back to Gemini Vision API.

    Args:
      pdf_path: Path to the input PDF file.

    Returns:
      A dictionary where keys are page numbers (starting from 1) and values are the
      extracted text from that page.
    """
    try:
        pdf_document = fitz.open(pdf_path)
        page_text = {}
        for page_number in range(pdf_document.page_count):
            page = pdf_document[page_number]
            text = page.get_text()
            page_text[page_number + 1] = text
        pdf_document.close()

        total_words = sum(len(text.split()) for text in page_text.values())
        if total_words >= 20:
            return page_text
        else:
            print(f"fitz extracted less than 20 words ({total_words}). Falling back to Gemini Vision API.")
    except Exception as e:
        print(f"An error occurred during PDF extraction with fitz: {e}. Falling back to Gemini Vision API.")

    # Fallback to Gemini Vision API
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel(model_name="gemini-1.5-pro")
        images = convert_from_path(pdf_path)
        gemini_page_text = {}
        pdf_name = os.path.splitext(os.path.basename(str(pdf_path)))[0]
        output_dir = "GeminiVisionResult"
        os.makedirs(output_dir, exist_ok=True)

        prompt_file_path = Path("prompt.txt")
        prompt = load_prompt(prompt_file_path)

        if not images:
            raise FileNotFoundError(f"Could not convert the PDF to images")

        for i, img in enumerate(images):
            page_number = i + 1
            output_file_path = os.path.join(output_dir, f"{pdf_name}_{page_number}.txt")

            try:
                response = model.generate_content([prompt, img], generation_config={"max_output_tokens": 4096})
                response.resolve()
                gemini_page_text[page_number] = response.text
                print(f"Gemini processed page {page_number}")
            except Exception as page_err:
                print(f"Error processing page {page_number} with Gemini: {page_err}")
                gemini_page_text[page_number] = f"Error: An error occurred during Gemini processing of page {page_number}: {page_err}"
        return gemini_page_text

    except FileNotFoundError as e:
        print(f"Error: Could not find file: {e}")
        return {}
    except Exception as e:
        print(f"Error during Gemini Vision API processing: {e}")
        return {}

def semantic_chunking(text_dict: dict, chunk_size: int = 500, chunk_overlap: int = 50):
    """
    Chunks the extracted text semantically.

    Args:
      text_dict: Dictionary of page-wise extracted text.
      chunk_size: Maximum size of each chunk.
      chunk_overlap: Number of overlapping characters between chunks.

    Returns:
      A list of text chunks.
    """
    all_text = "\n".join(text_dict.values())
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
    )
    chunks = text_splitter.split_text(all_text)
    return chunks

def embed_and_upsert_to_qdrant(chunks: list[str], qdrant_client, collection_name: str):
    """
    Embeds the text chunks using HuggingFace embeddings and upserts them to Qdrant.
    """
    embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2") # Choose an appropriate model

    batch_size = 32
    points = []
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        # Generate UUIDs for point IDs
        ids = [uuid.uuid4().hex for _ in range(len(batch_chunks))]
        embeds = embeddings.embed_documents(batch_chunks)
        metadatas = [{"text": text} for text in batch_chunks]
        for id, embed, metadata in zip(ids, embeds, metadatas):
            points.append(PointStruct(id=id, vector=embed, payload=metadata))

    qdrant_client.upsert(collection_name=collection_name, points=points, wait=True)
    print(f"Upserted {len(chunks)} chunks to Qdrant.")

def generate_response(query: str, context: str, model_name: str = "meta-llama/Llama-3-8b-chat-hf"):
    """
    Generates a response using Together AI's API with the `together` library.

    Args:
      query: The user's question.
      context: Retrieved relevant text from Qdrant.
      model_name: The name of the Together AI model to use.

    Returns:
      The LLM's response.
    """
    if not TOGETHER_AI_API_KEY:
        raise ValueError("TOGETHER_AI_API_KEY environment variable not set.")

    client = Together(api_key=TOGETHER_AI_API_KEY)

    prompt = f"Context:\n{context}\n\nQuestion: {query}"

    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=512,
        temperature=0.7
    )

    return response.choices[0].message.content

def query_qdrant(query: str, qdrant_client, collection_name: str, top_k: int = 5):
    """
    Queries Qdrant for relevant chunks.

    Args:
      query: The user's question.
      qdrant_client: The Qdrant client.
      collection_name: The name of the Qdrant collection.
      top_k: Number of relevant chunks to retrieve.

    Returns:
      A string containing the concatenated text of the top_k chunks.
    """
    embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
    query_vector = embeddings.embed_query(query)

    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
        with_payload=True
    )
    context = "\n\n".join([hit.payload["text"] for hit in search_result])
    return context

class RAGChatbot:
    def __init__(self):
        self.qdrant_client = QdrantClient(path=QDRANT_PATH)
        self.collection_name = QDRANT_COLLECTION_NAME
        print(f"Creating Qdrant collection '{self.collection_name}'...")
        try:
            self.qdrant_client.recreate_collection(
                collection_name=self.collection_name,
                vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE),
            )
        except Exception as e:
            raise Exception(f"Error creating Qdrant collection: {e}")

    def ingest_pdfs(self, pdf_paths: list[Path]):
        """Ingests a list of PDFs, chunks them, and uploads to Qdrant."""
        for pdf_path in pdf_paths:
            print(f"Processing PDF: {pdf_path}")
            extracted_text = extract_text_from_pdf(pdf_path)
            if extracted_text:
                chunks = semantic_chunking(extracted_text)
                embed_and_upsert_to_qdrant(chunks, self.qdrant_client, self.collection_name)
            else:
                print(f"No text extracted from the PDF: {pdf_path}")

    def query(self, query: str):
        """Queries the chatbot with a user's question."""
        context = query_qdrant(query, self.qdrant_client, self.collection_name)
        if not context:
            return "No relevant information found in the document."
        response = generate_response(query, context)
        return response


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pdf_files = [
    Path("7. UNIMECH AEROSPACE_Price band.pdf")
]

chatbot = RAGChatbot()

chatbot.ingest_pdfs(pdf_files)
print("Qdrant setup and ready for querying.")

  self.qdrant_client.recreate_collection(


Creating Qdrant collection 'rag_chatbot_collection'...
Processing PDF: 7. UNIMECH AEROSPACE_Price band.pdf


  embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2") # Choose an appropriate model
2025-01-09 20:03:52.110342: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736433232.124018   13905 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736433232.128061   13905 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-09 20:03:52.143252: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Upserted 198 chunks to Qdrant.
Qdrant setup and ready for querying.


In [3]:
print("=" * 50)
# Start an interactive chat session
while True:
    user_query = input("You: ")
    if user_query.lower() == "exit":
        break
    response = chatbot.query(user_query)
    print(f"User: {user_query}")
    print(f"Chatbot: {response}")
    print("=" * 50)

User: What is IPO?
Chatbot: IPO stands for Initial Public Offering, which is the first public offering of a company's stock or equity shares to the general public.
User: 
Chatbot: It appears that you are asking a general question about the Initial Public Offer (IPO) of Unimech Aerospace and Manufacturing Limited, an engineering solutions company. Please feel free to clarify or ask your specific question, and I'll do my best to assist you.


KeyboardInterrupt: Interrupted by user