In [2]:
import fitz
import json
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from together import Together
import PIL.Image
import os
import google.generativeai as genai
from pdf2image import convert_from_path
import time

# Credentials are read from the environment so that secrets never live in the
# source file or in notebook history. Export these before running:
#   TOGETHER_AI_API_KEY, PINECONE_API_KEY, GOOGLE_API_KEY
# A missing variable yields "" so the existing `if not <KEY>` guards trigger.
# NOTE: any key that was previously hard-coded here must be treated as
# compromised and rotated with the respective provider.
TOGETHER_AI_API_KEY = os.environ.get("TOGETHER_AI_API_KEY", "")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_ENVIRONMENT = "us-east-1"
PINECONE_BASE_INDEX_NAME = "rag-chatbot-index"
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")

def load_prompt(prompt_file: Path) -> str:
    """Read a prompt from a UTF-8 text file.

    Args:
      prompt_file: Location of the text file holding the prompt.

    Returns:
      The file contents with leading/trailing whitespace removed, or an empty
      string when the file is missing or cannot be read (a message is printed
      in either case).
    """
    try:
        with open(prompt_file, mode="r", encoding="utf-8") as handle:
            raw_prompt = handle.read()
        return raw_prompt.strip()
    except FileNotFoundError:
        # Reported separately from other failures so the message names the cause.
        print(f"Error: Prompt file not found at {prompt_file}. Using default prompt.")
    except Exception as exc:
        print(f"Error loading prompt from {prompt_file}: {exc}. Using default prompt.")
    # Both failure paths fall through to the same empty result.
    return ""

# Prompt sent to Gemini when prompt.txt is missing or empty, so the model is
# never called with an empty text part.
_DEFAULT_VISION_PROMPT = (
    "Extract all of the text from this page image and return it as plain text."
)


def _extract_with_fitz(pdf_path: Path) -> dict:
    """Return {page_number: text} for every page of the PDF using fitz."""
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as pdf_document:
        return {
            page_number: page.get_text()
            for page_number, page in enumerate(pdf_document, start=1)
        }


def _extract_with_gemini(pdf_path: Path) -> dict:
    """Return {page_number: text} by sending each rendered page image to Gemini."""
    if not GOOGLE_API_KEY:
        raise ValueError("GOOGLE_API_KEY environment variable not set.")

    genai.configure(api_key=GOOGLE_API_KEY)
    model = genai.GenerativeModel(model_name="gemini-1.5-pro")
    images = convert_from_path(pdf_path)

    # NOTE(review): nothing is written to this directory by this function; the
    # creation is kept only to preserve the existing on-disk side effect.
    os.makedirs("GeminiVisionResult", exist_ok=True)

    # load_prompt returns "" on failure; fall back to a usable instruction.
    prompt = load_prompt(Path("prompt.txt")) or _DEFAULT_VISION_PROMPT

    if not images:
        raise FileNotFoundError("Could not convert the PDF to images")

    gemini_page_text = {}
    for page_number, img in enumerate(images, start=1):
        try:
            response = model.generate_content(
                [prompt, img], generation_config={"max_output_tokens": 4096}
            )
            response.resolve()
            gemini_page_text[page_number] = response.text
            print(f"Gemini processed page {page_number}")
        except Exception as page_err:
            # Best effort: report and skip the page. The error text is NOT
            # stored as page content, otherwise it would be chunked and
            # embedded alongside real document text.
            print(f"Error processing page {page_number} with Gemini: {page_err}")
    return gemini_page_text


def extract_text_from_pdf(pdf_path: Path, min_words: int = 20) -> dict:
    """
    Extracts text from a PDF file, page by page, using fitz.
    If fitz extraction fails or extracts fewer than `min_words` words in total,
    it falls back to the Gemini Vision API on rendered page images.

    Args:
      pdf_path: Path to the input PDF file.
      min_words: Minimum total word count (across all pages) for the fitz
        result to be accepted. Defaults to 20.

    Returns:
      A dictionary where keys are page numbers (starting from 1) and values are the
      extracted text from that page. Pages that Gemini fails to process are
      omitted. An empty dictionary is returned when the fallback itself fails.
    """
    try:
        page_text = _extract_with_fitz(pdf_path)
        total_words = sum(len(text.split()) for text in page_text.values())
        if total_words >= min_words:
            return page_text
        print(f"fitz extracted less than {min_words} words ({total_words}). Falling back to Gemini Vision API.")
    except Exception as e:
        print(f"An error occurred during PDF extraction with fitz: {e}. Falling back to Gemini Vision API.")

    # Fallback to Gemini Vision API
    try:
        return _extract_with_gemini(pdf_path)
    except FileNotFoundError as e:
        print(f"Error: Could not find file: {e}")
        return {}
    except Exception as e:
        print(f"Error during Gemini Vision API processing: {e}")
        return {}

def semantic_chunking(text_dict: dict, chunk_size: int = 500, chunk_overlap: int = 50):
    """
    Split page-wise text into overlapping chunks.

    The page texts are concatenated with newlines (in the dictionary's
    iteration order) and split with a RecursiveCharacterTextSplitter that
    tries paragraph breaks first, then line breaks, then spaces, then
    individual characters.

    Args:
      text_dict: Dictionary of page-wise extracted text.
      chunk_size: Maximum size of each chunk, measured with len().
      chunk_overlap: Number of overlapping characters between chunks.

    Returns:
      A list of text chunks.
    """
    combined_text = "\n".join(text_dict.values())

    # Separators are tried in order, from coarsest to finest.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", " ", ""],
        "length_function": len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_text(combined_text)

def embed_and_upsert_to_pinecone(chunks: list[str], index, id_prefix: "str | None" = None):
    """
    Embeds the text chunks using HuggingFace embeddings and upserts them to Pinecone.

    Vector IDs have the form ``chunk-<id_prefix>-<position>``. Without a
    prefix, every document would produce the same IDs (chunk-0-0, chunk-0-1,
    ...) and each newly ingested document would overwrite the previous one's
    vectors in the index.

    Args:
      chunks: Text chunks to embed, in document order.
      index: Pinecone index object exposing ``upsert(vectors=...)``.
      id_prefix: Optional namespace for the generated vector IDs. When omitted,
        a fingerprint of the chunk contents is used, so re-ingesting the same
        chunks overwrites them while different documents get distinct IDs.
    """
    # Local import keeps this block self-contained (hashlib is stdlib).
    import hashlib

    if not chunks:
        # Nothing to do; avoid loading the embedding model for no work.
        print(f"Upserted {len(chunks)} chunks to Pinecone.")
        return

    if id_prefix is None:
        digest = hashlib.sha256()
        for chunk in chunks:
            digest.update(chunk.encode("utf-8", errors="replace"))
            # Separator so ["ab", "c"] and ["a", "bc"] fingerprint differently.
            digest.update(b"\x00")
        id_prefix = digest.hexdigest()[:16]

    embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2") # Choose an appropriate model

    batch_size = 32
    for start in range(0, len(chunks), batch_size):
        batch_chunks = chunks[start:start + batch_size]
        embeds = embeddings.embed_documents(batch_chunks)
        # Each vector is (id, values, metadata); the chunk text is kept in
        # metadata so it can be returned at query time.
        to_upsert = [
            (f"chunk-{id_prefix}-{start + offset}", vector, {"text": text})
            for offset, (vector, text) in enumerate(zip(embeds, batch_chunks))
        ]
        index.upsert(vectors=to_upsert)
    print(f"Upserted {len(chunks)} chunks to Pinecone.")

def generate_response(query: str, context: str, model_name: str = "meta-llama/Llama-3-8b-chat-hf"):
    """
    Ask a Together AI chat model to answer a question given retrieved context.

    The context and the question are combined into a single user message.

    Args:
      query: The user's question.
      context: Retrieved relevant text from Pinecone.
      model_name: The name of the Together AI model to use.

    Returns:
      The text content of the first completion choice.

    Raises:
      ValueError: If TOGETHER_AI_API_KEY is empty.
    """
    if not TOGETHER_AI_API_KEY:
        raise ValueError("TOGETHER_AI_API_KEY environment variable not set.")

    together_client = Together(api_key=TOGETHER_AI_API_KEY)

    user_message = {
        "role": "user",
        "content": f"Context:\n{context}\n\nQuestion: {query}",
    }

    completion = together_client.chat.completions.create(
        model=model_name,
        messages=[user_message],
        max_tokens=512,
        temperature=0.7,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

def query_pinecone(query: str, index, top_k: int = 5):
    """
    Queries Pinecone for relevant chunks.

    Args:
      query: The user's question.
      index: The Pinecone index to query.
      top_k: Number of relevant chunks to retrieve.

    Returns:
      A string containing the text of the matched chunks joined by blank
      lines. Matches without a non-empty "text" metadata entry are skipped;
      the result is an empty string when nothing usable is found.
    """
    # Loading the embedding model is expensive, so it is created once and
    # reused across calls instead of being rebuilt for every query.
    embeddings = getattr(query_pinecone, "_embeddings", None)
    if embeddings is None:
        embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
        query_pinecone._embeddings = embeddings

    query_vector = embeddings.embed_query(query)

    results = index.query(vector=query_vector, top_k=top_k, include_values=False, include_metadata=True)

    texts = []
    for match in results.matches:
        # The index may hold vectors that were upserted without text metadata.
        metadata = match.metadata or {}
        text = metadata.get("text")
        if text:
            texts.append(text)
    return "\n\n".join(texts)

class RAGChatbot:
    """Retrieval-augmented chatbot backed by an existing Pinecone index.

    Attributes are set in __init__:
      pc: The Pinecone client.
      pinecone_index_name: Name of the index this chatbot is bound to.
      index: Handle to that index, used for both ingestion and querying.
    """

    def __init__(self, existing_index_name="rag-chatbot-index-1736437058"):
        """Connect to an existing Pinecone index.

        Args:
          existing_index_name: Name of the Pinecone index to use.

        Raises:
          ValueError: If PINECONE_API_KEY is empty.
          RuntimeError: If obtaining the index handle fails; the underlying
            error is attached as the cause.
        """
        if not PINECONE_API_KEY:
            raise ValueError("Pinecone API key must be set.")
        self.pc = Pinecone(api_key=PINECONE_API_KEY)
        self.pinecone_index_name = existing_index_name
        print(f"Connecting to existing Pinecone index '{self.pinecone_index_name}'...")
        try:
            self.index = self.pc.Index(self.pinecone_index_name)
        except Exception as e:
            # Chain explicitly so the original traceback is kept as the cause.
            raise RuntimeError(
                f"Error connecting to Pinecone index '{self.pinecone_index_name}': {e}"
            ) from e

    def ingest_pdfs(self, pdf_paths: list[Path]):
        """Ingests a list of PDFs, chunks them, and uploads to Pinecone.

        PDFs that yield no text are reported and skipped.
        """
        for pdf_path in pdf_paths:
            print(f"Processing PDF: {pdf_path}")
            extracted_text = extract_text_from_pdf(pdf_path)
            if not extracted_text:
                print(f"No text extracted from the PDF: {pdf_path}")
                continue
            chunks = semantic_chunking(extracted_text)
            embed_and_upsert_to_pinecone(chunks, self.index)

    def query(self, query: str):
        """Queries the chatbot with a user's question.

        Returns:
          The LLM's answer, or a fixed "no relevant information" message when
          retrieval returns no context.
        """
        context = query_pinecone(query, self.index)
        if not context:
            return "No relevant information found in the document."
        return generate_response(query, context)

  from tqdm.autonotebook import tqdm


In [3]:
# Bind to the already-populated index by name; no PDFs are ingested in this cell.
chatbot = RAGChatbot(existing_index_name="rag-chatbot-index-1736437058")

# Ask a single question against the existing index and print the exchange.
user_query = "What are the main points of this document?"
response = chatbot.query(user_query)
print(f"Question: {user_query}")
print(f"Answer: {response}")

Connecting to existing Pinecone index 'rag-chatbot-index-1736437058'...


  embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
2025-01-10 01:16:31.649609: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736451991.664677    9676 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736451991.668704    9676 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-10 01:16:31.683868: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Question: What are the main points of this document?
Answer: The main points of this document appear to be:

1. Compliance with listing and legal requirements: The document mentions compliance with listing and other legal requirements relating to financial statements.
2. Financial statements: The document discusses the preparation and presentation of financial statements, including a summary of significant accounting policies and the basis of preparation and presentation.
3. Management discussion and analysis: The document includes a management discussion and analysis of the financial condition and results of operations.
4. Related party transactions: The document requires the disclosure of related party transactions and significant related party transactions.
5. Audit and internal control: The document mentions management letters/letters of internal control weaknesses issued by statutory auditors, internal audit reports relating to internal control weaknesses, and the appointment, rem