In [4]:
import hashlib
import os
import time
from pathlib import Path
from typing import List, Dict

import google.generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from pdf2image import convert_from_path
from pinecone import Pinecone
from PyPDF2 import PdfReader
from together import Together

# --- Configuration ---
# Secrets are read from the environment so they never live in source control
# or in exported notebooks. An unset variable yields "", which the existing
# `if not <KEY>` guards in this file treat as "missing".
# NOTE(review): any key that was previously committed in this file should be
# considered leaked and rotated with the provider.
TOGETHER_AI_API_KEY = os.environ.get("TOGETHER_AI_API_KEY", "")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_ENVIRONMENT = "us-east-1"
PINECONE_BASE_INDEX_NAME = "rag-chatbot-index"
# Documents are sharded across indexes named "<base>-1" .. "<base>-N".
NUM_PINECONE_INDEXES = 5
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
LLM_MODEL_NAME = "meta-llama/Llama-3-8b-chat-hf"
# Chunk size/overlap are measured in characters (the splitter uses `len`).
CHUNK_SIZE = 300
CHUNK_OVERLAP = 50
MAX_SUMMARY_TOKENS = 350
MAX_RESPONSE_TOKENS = 512
SUMMARY_TEMPERATURE = 0.4
RESPONSE_TEMPERATURE = 0.4

# --- Helper Functions ---
def load_prompt(prompt_file: Path) -> str:
    """Read a prompt template from a UTF-8 text file.

    Args:
        prompt_file: Location of the prompt file.

    Returns:
        The file contents with surrounding whitespace removed, or an empty
        string when the file is missing or cannot be read (the problem is
        reported on stdout rather than raised).
    """
    try:
        with open(prompt_file, mode="r", encoding="utf-8") as handle:
            contents = handle.read()
    except FileNotFoundError:
        print(f"Error: Prompt file not found at {prompt_file}. Using default prompt.")
        return ""
    except Exception as exc:
        print(f"Error loading prompt from {prompt_file}: {exc}. Using default prompt.")
        return ""
    return contents.strip()


def extract_text_from_pdf(pdf_path: Path) -> Dict[int, str]:
    """Extract per-page text from a PDF, using PyPDF2 first and Gemini Vision as fallback.

    PyPDF2 is tried first. If it raises, or if the whole document yields fewer
    than 20 words (typical for scanned PDFs), every page is rendered to an
    image and sent to the Gemini model together with the prompt loaded from
    ``prompt.txt``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Mapping of 1-based page number to extracted text. Pages that Gemini
        fails to process are omitted (the failure is printed) so that error
        messages never end up being chunked and indexed as document content.
        An empty dict is returned when the fallback itself fails.
    """
    try:
        pdf_reader = PdfReader(str(pdf_path))
        # Defensive `or ""`: a page with no extractable text must not break
        # the word count below.
        page_text = {
            page_number: page.extract_text() or ""
            for page_number, page in enumerate(pdf_reader.pages, start=1)
        }

        total_words = sum(len(text.split()) for text in page_text.values())
        if total_words >= 20:
            return page_text
        print(f"PyPDF2 extracted less than 20 words ({total_words}). Falling back to Gemini Vision API.")
    except Exception as e:
        print(f"An error occurred during PDF extraction with PyPDF2: {e}. Falling back to Gemini Vision API.")

    # Fallback to Gemini Vision API
    try:
        images = convert_from_path(pdf_path)
        if not images:
            raise FileNotFoundError("Could not convert the PDF to images")

        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel(model_name="gemini-1.5-pro")

        # Kept from the original implementation: the output directory is
        # created, although this function does not write anything into it.
        os.makedirs("GeminiVisionResult", exist_ok=True)

        prompt = load_prompt(Path("prompt.txt"))

        gemini_page_text = {}
        for page_number, img in enumerate(images, start=1):
            try:
                response = model.generate_content([prompt, img], generation_config={"max_output_tokens": 4096})
                response.resolve()
                gemini_page_text[page_number] = response.text
                print(f"Gemini processed page {page_number}")
            except Exception as page_err:
                # Skip the page instead of storing the error message as text.
                print(f"Error processing page {page_number} with Gemini: {page_err}")
        return gemini_page_text

    except FileNotFoundError as e:
        print(f"Error: Could not find file: {e}")
        return {}
    except Exception as e:
        print(f"Error during Gemini Vision API processing: {e}")
        return {}


def semantic_chunking(text_dict: Dict[int, str], chunk_size: int = CHUNK_SIZE, chunk_overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split the extracted page texts into overlapping chunks.

    All page texts are joined with newlines (in the dict's iteration order)
    and handed to a ``RecursiveCharacterTextSplitter`` that prefers paragraph
    breaks, then line breaks, then spaces, and measures length with ``len``.

    Args:
        text_dict: Mapping of page number to page text.
        chunk_size: Chunk size passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        The list of text chunks produced by the splitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", " ", ""],
        "length_function": len,
    }
    combined_text = "\n".join(text_dict.values())
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_text(combined_text)


def summarize_chunks(chunks: List[str], prompt: str, pdf_name: str, model_name: str = LLM_MODEL_NAME) -> List[str]:
    """Summarize each text chunk with Together AI, printing progress as it goes.

    Args:
        chunks: Text chunks to summarize.
        prompt: Instruction text placed before each chunk in the request.
        pdf_name: Document name, used only in progress and error messages.
        model_name: Together AI chat model to call.

    Returns:
        One entry per input chunk, in order. An entry is the model's summary,
        or the original chunk when the request fails or the model returns an
        empty summary, so downstream embedding never receives ``None``/blank
        text.

    Raises:
        ValueError: If ``TOGETHER_AI_API_KEY`` is empty.
    """
    if not TOGETHER_AI_API_KEY:
        raise ValueError("TOGETHER_AI_API_KEY environment variable not set.")

    client = Together(api_key=TOGETHER_AI_API_KEY)
    summarized_chunks = []
    total_chunks = len(chunks)

    print(f"\nProcessing {pdf_name}")
    print(f"Total chunks to process: {total_chunks}")

    for i, chunk in enumerate(chunks):
        summary = None
        try:
            prompt_with_chunk = f"{prompt}\n\nChunk: {chunk}"
            response = client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt_with_chunk}],
                max_tokens=MAX_SUMMARY_TOKENS,
                temperature=SUMMARY_TEMPERATURE,
            )
            summary = response.choices[0].message.content
        except Exception as e:
            print(f"\nError summarizing chunk {i} of {pdf_name}: {e}")

        # Fall back to the raw chunk on failure or on an empty/None summary.
        has_summary = bool(summary and summary.strip())
        summarized_chunks.append(summary if has_summary else chunk)

        # Progress update
        progress = (i + 1) / total_chunks * 100
        print(f"\r{pdf_name} - Progress: {progress:.1f}% ({i + 1}/{total_chunks} chunks)", end="", flush=True)

        # Pause after every request, including failed ones: an error is often
        # a rate-limit response, and retrying immediately would make it worse.
        time.sleep(0.5)

    print(f"\n✓ Completed processing {pdf_name}\n")
    return summarized_chunks


def embed_and_upsert_to_pinecone(chunks: List[str], index, document_name: str):
    """Embed text chunks and upsert them into a Pinecone index in batches of 32.

    Each vector is sent as an ``(id, embedding, metadata)`` tuple. The id has
    the form ``chunk-<document_name>-<batch_start>-<position_in_batch>`` and
    the metadata carries the chunk text and the document name.

    Args:
        chunks: Text chunks to embed.
        index: Pinecone index handle exposing ``upsert(vectors=...)``.
        document_name: Name stored in the ids and metadata of every vector.
    """
    embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    batch_size = 32
    total = len(chunks)

    for start in range(0, total, batch_size):
        batch = chunks[start: start + batch_size]
        vectors = embedder.embed_documents(batch)
        records = [
            (
                f"chunk-{document_name}-{start}-{offset}",
                vector,
                {"text": text, "document": document_name},
            )
            for offset, (text, vector) in enumerate(zip(batch, vectors))
        ]
        index.upsert(vectors=records)

    print(f"Upserted {total} chunks to Pinecone for document {document_name}.")


def generate_response(query: str, context: str, model_name: str = LLM_MODEL_NAME) -> str:
    """Ask the Together AI chat model to answer a question from the given context.

    Args:
        query: The user's question.
        context: Retrieved text placed ahead of the question in the prompt.
        model_name: Together AI chat model to call.

    Returns:
        The model's reply, or a fixed apology message when the API call fails
        (the error is printed, not raised).

    Raises:
        ValueError: If ``TOGETHER_AI_API_KEY`` is empty.
    """
    if not TOGETHER_AI_API_KEY:
        raise ValueError("TOGETHER_AI_API_KEY environment variable not set.")

    together_client = Together(api_key=TOGETHER_AI_API_KEY)
    user_message = {
        "role": "user",
        "content": f"Context:\n{context}\n\nQuestion: {query}",
    }

    try:
        completion = together_client.chat.completions.create(
            model=model_name,
            messages=[user_message],
            max_tokens=MAX_RESPONSE_TOKENS,
            temperature=RESPONSE_TEMPERATURE,
        )
        answer = completion.choices[0].message.content
    except Exception as exc:
        print(f"Error generating response: {exc}")
        answer = "I apologize, but I encountered an error while generating the response. Please try again."
    return answer


def query_pinecone(query: str, index, top_k: int = 15) -> str:
    """Retrieve context for a query from one Pinecone index.

    The query is embedded and the ``top_k`` nearest vectors are fetched. Only
    matches whose stored document name appears in the query text are kept;
    the comparison is case-insensitive on both sides, so a document stored as
    "Foo_RHP" matches a query that mentions "foo_rhp" or "Foo_RHP".

    Args:
        query: The user's question.
        index: Pinecone index handle exposing ``query(...)``.
        top_k: Number of nearest neighbours to request.

    Returns:
        The texts of the kept matches joined by blank lines, or an empty
        string when nothing matches.
    """
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    query_vector = embeddings.embed_query(query)

    results = index.query(vector=query_vector, top_k=top_k, include_values=False, include_metadata=True)

    lowered_query = query.lower()
    relevant_texts = []
    for match in results.matches:
        # Tolerate records with missing metadata instead of raising KeyError.
        metadata = match.metadata or {}
        document = str(metadata.get("document", "")).lower()
        text = metadata.get("text")
        if document and text and document in lowered_query:
            relevant_texts.append(text)
    return "\n\n".join(relevant_texts)


# --- RAG Chatbot Class ---
class RAGChatbot:
    """Retrieval-augmented chatbot backed by several existing Pinecone indexes.

    Documents are sharded across indexes named ``<base_index_name>-1`` ..
    ``<base_index_name>-<num_indexes>`` by hashing the document name. Queries
    search every index, because the question text alone does not identify
    which shard holds the relevant document.
    """

    def __init__(self, base_index_name: str, num_indexes: int):
        """Connect to the existing Pinecone indexes.

        Args:
            base_index_name: Common prefix of the index names.
            num_indexes: Number of indexes to connect to (numbered from 1).

        Raises:
            ValueError: If ``PINECONE_API_KEY`` is empty.
            Exception: If a handle for any index cannot be obtained; the
                original error is chained as the cause.
        """
        if not PINECONE_API_KEY:
            raise ValueError("Pinecone API key must be set.")
        self.pc = Pinecone(api_key=PINECONE_API_KEY)
        self.base_index_name = base_index_name
        self.num_indexes = num_indexes
        self.indexes = {}
        for i in range(1, num_indexes + 1):
            index_name = f"{self.base_index_name}-{i}"
            print(f"Connecting to existing Pinecone index '{index_name}'...")
            try:
                self.indexes[i] = self.pc.Index(index_name)
            except Exception as e:
                raise Exception(f"Error connecting to Pinecone index '{index_name}': {e}") from e

    def _get_index_for_document(self, document_name: str) -> int:
        """Map a document name to an index number in ``1..num_indexes``.

        Uses SHA-256 of the name, so the mapping is stable across runs.
        """
        hashed_name = int(hashlib.sha256(document_name.encode()).hexdigest(), 16)
        return (hashed_name % self.num_indexes) + 1

    def ingest_pdfs(self, pdf_paths: List[Path]):
        """Extract, chunk, summarize, embed and upload each PDF.

        A failure in one PDF is printed and does not stop the remaining ones.

        Args:
            pdf_paths: Paths of the PDF files to ingest.
        """
        prompt = load_prompt(Path("prompt.txt"))

        total_pdfs = len(pdf_paths)
        print(f"\nStarting processing of {total_pdfs} PDFs...")

        for pdf_index, pdf_path in enumerate(pdf_paths, 1):
            # File name without extension; also used in vector ids/metadata.
            pdf_name = os.path.splitext(os.path.basename(str(pdf_path)))[0]
            print(f"\n[{pdf_index}/{total_pdfs}] Processing PDF: {pdf_name}")

            try:
                print(f"Extracting text from {pdf_name}...")
                extracted_text = extract_text_from_pdf(pdf_path)

                if not extracted_text:
                    print(f"⚠ No text extracted from PDF: {pdf_name}")
                    continue

                print(f"Chunking text from {pdf_name}...")
                chunks = semantic_chunking(extracted_text)
                print(f"Generated {len(chunks)} chunks for {pdf_name}")

                summarized_chunks = summarize_chunks(chunks, prompt, pdf_name)

                print(f"Embedding and upserting chunks for {pdf_name}...")
                index_id = self._get_index_for_document(pdf_name)
                embed_and_upsert_to_pinecone(summarized_chunks, self.indexes[index_id], pdf_name)

                print(f"✓ Successfully completed processing {pdf_name}")
                print("-" * 80)

            except Exception as e:
                print(f"❌ Error processing {pdf_name}: {e}")

        print(f"\n✓ Completed processing all {total_pdfs} PDFs")

    def query(self, query: str) -> str:
        """Answer a user's question from the indexed documents.

        Every connected index is searched. Hashing the question text to pick
        a single index (as was done before) does not work: ingestion shards
        by document name, so the question's hash almost never points at the
        shard that holds the document being asked about.

        Args:
            query: The user's question.

        Returns:
            The generated answer, or a fixed "no relevant information"
            message when no index returns any context.
        """
        contexts = [
            query_pinecone(query, self.indexes[index_id])
            for index_id in sorted(self.indexes)
        ]
        context = "\n\n".join(part for part in contexts if part)
        if not context:
            return "No relevant information found in the document."
        return generate_response(query, context)


# --- Main Execution ---
if __name__ == "__main__":
    # Directory that holds the PDFs to ingest.
    pdfs_directory = Path(r"/teamspace/studios/this_studio/hdfc-securities/Documents/RHP_Documents")

    # Create the directory if it doesn't exist
    os.makedirs(pdfs_directory, exist_ok=True)

    # Collect the PDFs in a stable (sorted) order so runs are reproducible.
    pdf_paths = sorted(pdfs_directory.glob("*.pdf"))
    if not pdf_paths:
        raise FileNotFoundError(f"No PDF files found in the directory {pdfs_directory}. Add PDF files to that directory and run again.")
    print(f"PDFs found in the directory: {[os.path.basename(str(file)) for file in pdf_paths]}")

    # Initialize the RAG chatbot against the existing Pinecone indexes
    chatbot = RAGChatbot(base_index_name=PINECONE_BASE_INDEX_NAME, num_indexes=NUM_PINECONE_INDEXES)

    # Ingest the PDFs
    chatbot.ingest_pdfs(pdf_paths)

    # Interactive question loop; leave with "exit"/"quit", Ctrl-D or Ctrl-C.
    while True:
        try:
            user_query = input("Question: ").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            break
        if not user_query:
            # Nothing to ask; prompt again instead of sending an empty query.
            continue
        if user_query.lower() in {"exit", "quit"}:
            break
        start_time = time.time()
        response = chatbot.query(user_query)
        end_time = time.time()
        print(f"Answer: {response}")
        print(f"Time taken: {end_time - start_time:.4f} seconds")

PDFs found in the directory: ['Abha Power and Steel_RHP.pdf', 'Aditya Ultra Steel_RHP.pdf', 'Aeron Composite_RHP.pdf', 'Apex Ecotech_RHP.pdf', 'Arkade Developers_RHP.pdf', 'Baazar Style Retail_RHP.pdf', 'Blackbuck_RHP.pdf', 'Boss Packaging Solutions_RHP.pdf', 'C2C Advanced Systems_RHP.pdf', 'Concord Enviro Systems Limited_RHP.pdf', 'DAM Capital Advisors Limited_RHP.pdf', 'Danish Power_RHP.pdf', 'Deccan Transcon Leasing_RHP.pdf', 'Dhanlaxmi Crop Science_RHP.pdf', 'Diffusion Engineers Ltd_RHP.pdf', 'Divyadhan Recycling Industries_RHP.pdf', 'ECO Mobility_RHP.pdf', 'Emerald Tyre Manufacturers_RHP.pdf', 'Enviro Infra Engineers_RHP.pdf', 'Envirotech Systems_RHP.pdf', 'Excellent Wires and Packaging_RHP.pdf', 'Forge Auto International_RHP.pdf', 'Freshara Agro Exports_RHP.pdf', 'Gajanand International_RHP.pdf', 'Gala Precision Engineering_RHP.pdf', 'Ganesh Infraworld_RHP.pdf', 'Garuda Construction and Engineering_RHP.pdf', 'Godavari Biorefineries_RHP.pdf', 'HVAX Technologies_RHP.pdf', 'Identica

Chunking text from Abha Power and Steel_RHP...
Generated 3990 chunks for Abha Power and Steel_RHP

Processing Abha Power and Steel_RHP
Total chunks to process: 3990
Abha Power and Steel_RHP - Progress: 16.1% (643/3990 chunks)

KeyboardInterrupt: 