In [4]:
!pip install pinecone-client
!pip install tf-keras
!pip install pymupdf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [5]:
import json
import os
from pathlib import Path

import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from together import Together

# Credentials are read from the environment so that no secret is stored in the
# notebook. Set TOGETHER_AI_API_KEY and PINECONE_API_KEY before starting the kernel.
# Any key that was ever saved in a copy of this notebook should be treated as
# leaked and rotated with the provider.
TOGETHER_AI_API_KEY = os.environ.get("TOGETHER_AI_API_KEY", "")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")

# Non-secret settings keep their previous values as defaults but can be overridden.
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", "us-east-1")  # passed as the ServerlessSpec region
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME", "rag-chatbot-index")

def extract_text_from_pdf(pdf_path: Path) -> dict:
    """
    Extracts text from a PDF file, page by page, using fitz.

    Args:
      pdf_path: Path to the input PDF file.

    Returns:
      A dictionary where keys are page numbers (starting from 1) and values are the
      extracted text from that page.
      Returns an empty dictionary if the document has no pages, or if opening the
      PDF or extracting any page raises (the error is printed, not re-raised).
    """
    try:
        # The context manager closes the document on every exit path, including
        # when get_text() raises part-way through the pages.
        with fitz.open(pdf_path) as pdf_document:
            return {
                page_number: page.get_text()
                for page_number, page in enumerate(pdf_document, start=1)  # pages are numbered from 1
            }
    except Exception as e:
        # Deliberately best-effort: callers treat {} as "nothing extracted".
        print(f"An error occurred during PDF extraction with fitz: {e}")
        return {}

def semantic_chunking(text_dict: dict, chunk_size: int = 500, chunk_overlap: int = 50):
    """
    Split the extracted document text into overlapping chunks.

    The page texts are joined with a single newline, in the dictionary's
    iteration order, and the combined string is passed to a
    RecursiveCharacterTextSplitter configured with the separators
    "\\n\\n", "\\n", " " and "" and with len as the length function.

    Args:
      text_dict: Mapping of page number to the text extracted from that page.
      chunk_size: Value forwarded to the splitter as chunk_size.
      chunk_overlap: Value forwarded to the splitter as chunk_overlap.

    Returns:
      Whatever the splitter's split_text returns for the combined text.
    """
    combined_text = "\n".join(text_dict.values())
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", " ", ""],
        "length_function": len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_text(combined_text)

def embed_and_upsert_to_pinecone(chunks: list[str], id_prefix: str = "chunk"):
    """
    Embeds the text chunks using HuggingFace embeddings and upserts them to Pinecone.

    Vectors are written in batches of 32. Each vector ID has the form
    "<id_prefix>-<batch start offset>-<position in batch>", and the chunk text is
    stored in the vector's metadata under the "text" key.

    Args:
      chunks: Text chunks to embed and store. An empty list is a no-op apart
        from the summary message; no model is loaded and no client is created.
      id_prefix: Prefix for the generated vector IDs. The default reproduces the
        historical IDs ("chunk-0-0", ...). Because IDs depend only on the prefix
        and the chunk position, ingesting a second document with the same prefix
        reuses the same IDs; pass a per-document prefix to keep documents apart.
    """
    if chunks:
        embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2") # Choose an appropriate model

        pc = Pinecone(api_key=PINECONE_API_KEY)
        index = pc.Index(PINECONE_INDEX_NAME)

        batch_size = 32
        for i in range(0, len(chunks), batch_size):
            batch_chunks = chunks[i:i + batch_size]
            embeds = embeddings.embed_documents(batch_chunks)
            # One (id, vector, metadata) tuple per chunk in this batch.
            to_upsert = [
                (f"{id_prefix}-{i}-{j}", vector, {"text": text})
                for j, (vector, text) in enumerate(zip(embeds, batch_chunks))
            ]
            index.upsert(vectors=to_upsert)
    print(f"Upserted {len(chunks)} chunks to Pinecone.")

def generate_response(query: str, context: str, model_name: str = "meta-llama/Llama-3-8b-chat-hf"):
    """
    Ask a Together AI chat model to answer a question given retrieved context.

    The context and the question are combined into a single user message and
    sent through the `together` client with max_tokens=512 and temperature=0.7.

    Args:
      query: The user's question.
      context: Text to place ahead of the question in the prompt.
      model_name: Identifier of the Together AI model to call.

    Returns:
      The content of the first choice's message in the completion.

    Raises:
      ValueError: If TOGETHER_AI_API_KEY is empty.
    """
    if not TOGETHER_AI_API_KEY:
        raise ValueError("TOGETHER_AI_API_KEY environment variable not set.")

    together_client = Together(api_key=TOGETHER_AI_API_KEY)

    # Single-turn conversation: the context and question travel in one user message.
    user_message = {
        "role": "user",
        "content": f"Context:\n{context}\n\nQuestion: {query}",
    }
    request_options = {
        "model": model_name,
        "messages": [user_message],
        "max_tokens": 512,
        "temperature": 0.7,
    }
    completion = together_client.chat.completions.create(**request_options)

    first_choice = completion.choices[0]
    return first_choice.message.content

# Embedding models keyed by model name, so repeated queries reuse one loaded model.
_QUERY_EMBEDDERS: dict = {}


def _get_query_embedder(model_name: str):
    """Return the cached HuggingFaceEmbeddings for model_name, creating it on first use."""
    embedder = _QUERY_EMBEDDERS.get(model_name)
    if embedder is None:
        embedder = HuggingFaceEmbeddings(model_name=model_name)
        _QUERY_EMBEDDERS[model_name] = embedder
    return embedder


def query_pinecone(query: str, top_k: int = 5):
    """
    Queries Pinecone for relevant chunks.

    Args:
      query: The user's question.
      top_k: Number of relevant chunks to retrieve.

    Returns:
      A string containing the text of the returned matches joined by blank
      lines. Matches that carry no metadata, or whose metadata has no non-empty
      "text" entry, are skipped; the result is "" when nothing usable comes back.
    """
    # Constructing the embedding model is the expensive step; reuse it across calls.
    embeddings = _get_query_embedder("all-mpnet-base-v2")
    query_vector = embeddings.embed_query(query)

    pc = Pinecone(api_key=PINECONE_API_KEY)
    index = pc.Index(PINECONE_INDEX_NAME)

    results = index.query(vector=query_vector, top_k=top_k, include_values=False, include_metadata=True)

    texts = []
    for match in results.matches:
        # Vectors written by other tools may lack metadata or the "text" field.
        text = (match.metadata or {}).get("text")
        if text:
            texts.append(text)
    return "\n\n".join(texts)

class RAGChatbot:
    """
    Retrieval-augmented chatbot backed by a Pinecone index and a Together AI model.

    Construction makes sure the index named by PINECONE_INDEX_NAME exists and
    stores a handle to it in self.index. ingest_pdf() and query() delegate to
    the module-level helper functions.
    """

    def __init__(self):
        """
        Connect to the configured Pinecone index, creating it first if it is missing.

        Raises:
          ValueError: If PINECONE_API_KEY is empty.
          Exception: If listing, creating, or connecting to the index fails. The
            underlying error is included in the message and chained as the cause.
        """
        if not PINECONE_API_KEY:
            raise ValueError("Pinecone API key must be set.")
        try:
            self.pc = Pinecone(api_key=PINECONE_API_KEY)
            # list_indexes() yields index descriptions rather than plain strings,
            # so membership has to be tested against the list of names.
            existing_names = self.pc.list_indexes().names()
            if PINECONE_INDEX_NAME in existing_names:
                print(f"Pinecone index '{PINECONE_INDEX_NAME}' already exists. Connecting to it.")
            else:
                print(f"Creating Pinecone index '{PINECONE_INDEX_NAME}'...")
                try:
                    self.pc.create_index(
                        name=PINECONE_INDEX_NAME,
                        dimension=768,  # Dimension of all-mpnet-base-v2 embeddings
                        metric="cosine",
                        spec=ServerlessSpec(
                            cloud="aws",
                            region=PINECONE_ENVIRONMENT
                        )
                    )
                except Exception as e:
                    # The index can appear between the check and the create call
                    # (e.g. another session created it); that is not an error.
                    if "already exists" not in str(e):
                        raise
                    print(f"Pinecone index '{PINECONE_INDEX_NAME}' already exists. Connecting to it.")
            self.index = self.pc.Index(PINECONE_INDEX_NAME)

        except Exception as e:
            # Chain the original error so its traceback is preserved.
            raise Exception(f"Error connecting to or creating Pinecone index: {e}") from e

    def ingest_pdf(self, pdf_path: Path):
        """
        Ingests the PDF, chunks it, and uploads to Pinecone.

        Args:
          pdf_path: Path to the PDF file to ingest.

        Prints a message and uploads nothing when no text could be extracted.
        """
        extracted_text = extract_text_from_pdf(pdf_path)
        if extracted_text:
            chunks = semantic_chunking(extracted_text)
            embed_and_upsert_to_pinecone(chunks)
        else:
            print("No text extracted from the PDF.")

    def query(self, query: str):
        """
        Queries the chatbot with a user's question.

        Args:
          query: The user's question.

        Returns:
          The generated answer, or a fixed "no relevant information" message
          when retrieval returns no context.
        """
        context = query_pinecone(query)
        if not context:
            return "No relevant information found in the document."
        response = generate_response(query, context)
        return response

In [6]:
pdf_file_path = Path("rbi.pdf")

chatbot = RAGChatbot()

# Ingestion runs every time this cell is executed. Comment the next line out to
# skip re-embedding the PDF once the index has been populated.
chatbot.ingest_pdf(pdf_file_path)
print("Pinecone setup and ready for querying.")

print("=" * 50)
# Start an interactive chat session; type "exit" to leave the loop.
while True:
    try:
        user_query = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or an interrupted kernel ends the session quietly.
        break
    if user_query.lower() == "exit":
        break
    if not user_query:
        # Nothing to ask; avoid a pointless retrieval and LLM call.
        continue
    response = chatbot.query(user_query)
    print(f"User: {user_query}")
    print(f"Chatbot: {response}")
    print("=" * 50)

Creating Pinecone index 'rag-chatbot-index'...
Pinecone index 'rag-chatbot-index' already exists. Connecting to it.
Upserted 28 chunks to Pinecone.
Pinecone setup and ready for querying.
User: Tell more about funding from this document.
Chatbot: According to the document, the funding for a Debt Resolution Scheme (DRS) should be provided upfront and fully cover the required settlement amounts. This means that the government should provide a detailed budgetary provision to fully fund the scheme before it is announced.

In other words, the government should commit to providing the necessary funds upfront to settle the debts of the impacted borrowers, rather than announcing the scheme and then trying to find the funds later. This approach is intended to ensure that the scheme is fully funded and can be implemented effectively.

Additionally, the document states that lenders should be funded by the government to fully cover the required settlement amounts, implying that the government will 