## Initial Setup and Dependencies
Install necessary Python libraries:

In [None]:
# Install required packages: Gradio (web UI), sentence-transformers (text embeddings),
# PyPDF2 (PDF text extraction) and NumPy
!pip install gradio sentence-transformers PyPDF2 numpy

Collecting gradio
  Downloading gradio-5.5.0-py3-none-any.whl.metadata (16 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from grad

In [None]:
# Pin the pre-1.0 OpenAI client: the QA class below calls openai.ChatCompletion.create,
# an interface that openai>=1.0 no longer provides
!pip install openai==0.28



## Import Required Modules

Import all essential libraries and modules, including SentenceTransformer for embedding and Gradio for the user interface.

In [None]:
# Import necessary modules
import os
from typing import List, Dict
from sentence_transformers import SentenceTransformer, util
import openai
from PyPDF2 import PdfReader
import numpy as np
import gradio as gr

  from tqdm.autonotebook import tqdm, trange


## Set Up the OpenAI API

Configure the OpenAI API key for accessing the GPT-3.5 model.

In [None]:
# Set OpenAI API key
# Read the key from the OPENAI_API_KEY environment variable so the secret never
# has to be typed into (and saved with) the notebook. When the variable is not
# set, the original blank placeholder is kept.
openai.api_key = os.environ.get("OPENAI_API_KEY", " ")

## Build the Chatbot Class

### Initialization (`__init__` method):
Initialize the SentenceTransformer for document embedding and set up storage for documents and their sources.

### PDF Loading (`load_pdf` method):
Extract text from PDFs and create chunks to prepare the content for embedding.

### Text Chunking (`create_chunks` method):
Split extracted text into manageable pieces (e.g., 500 characters) for processing.

### Question Answering (`answer_question` method):
Embed user queries and find the most relevant chunks from the loaded documents. Use OpenAI’s ChatCompletion API to generate responses based on the context.

In [None]:
class MultiPDFQuestionAnswering:
    """Retrieval-augmented question answering over one or more PDF files.

    Text extracted from each PDF is split into chunks, every chunk is embedded
    with a SentenceTransformer model, and questions are answered by sending the
    most similar chunks to the OpenAI chat completion API as context.

    Attributes are kept index-aligned: ``documents[i]`` is the text of chunk
    ``i``, row ``i`` of ``document_embeddings`` is its embedding, and
    ``pdf_sources[i]`` is the file name the chunk came from.
    """

    def __init__(self):
        """Initialize the base QA system without loading any PDFs"""
        self.retriever = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        # Initialize empty storage for multiple PDFs
        self.documents = []
        self.document_embeddings = None
        self.pdf_sources = {}  # Track which chunks came from which PDFs

    def load_pdf(self, pdf_path: str) -> None:
        """Load a single PDF, chunk its text and add the chunks to the index.

        Only the chunks of the new PDF are embedded; their embeddings are
        appended to the ones already stored. The instance state is updated only
        after embedding succeeded, so a failure leaves the index unchanged.

        Args:
            pdf_path: Path of the PDF file to load.

        Errors while reading or embedding are reported with ``print`` and not
        re-raised, so loading a directory continues with the next file.
        """
        pdf_name = os.path.basename(pdf_path)
        try:
            print(f"Loading PDF: {pdf_name}...")
            pdf_text = self.extract_text_from_pdf(pdf_path)
            new_chunks = self.create_chunks(pdf_text)

            if not new_chunks:
                # e.g. a scanned PDF without a text layer: nothing to index.
                print(f"No extractable text found in {pdf_name}; skipping.")
                return

            # Embed only the new chunks instead of re-encoding the whole corpus.
            new_embeddings = np.asarray(self.retriever.encode(new_chunks))
            if self.document_embeddings is None:
                combined_embeddings = new_embeddings
            else:
                combined_embeddings = np.vstack(
                    [self.document_embeddings, new_embeddings]
                )

            # Commit: everything that can fail has already run, so documents,
            # sources and embeddings always stay index-aligned.
            start_idx = len(self.documents)
            for offset in range(len(new_chunks)):
                self.pdf_sources[start_idx + offset] = pdf_name
            self.documents.extend(new_chunks)
            self.document_embeddings = combined_embeddings

            print(f"Successfully loaded {pdf_name}")
            print(f"Total chunks in system: {len(self.documents)}")
        except Exception as e:
            print(f"Error loading PDF {pdf_name}: {str(e)}")

    def load_multiple_pdfs(self, pdf_directory: str) -> None:
        """Load all PDFs from a directory.

        Files are matched by a case-insensitive ``.pdf`` suffix and loaded in
        sorted order so that chunk indices are reproducible between runs.

        Args:
            pdf_directory: Directory that contains the PDF files.
        """
        try:
            pdf_files = sorted(
                f for f in os.listdir(pdf_directory) if f.lower().endswith('.pdf')
            )
            print(f"Found {len(pdf_files)} PDF files in directory")

            for pdf_file in pdf_files:
                self.load_pdf(os.path.join(pdf_directory, pdf_file))
        except Exception as e:
            print(f"Error loading PDFs from directory: {str(e)}")

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract the text of every page of a PDF file.

        Args:
            file_path: Path of the PDF file.

        Returns:
            The page texts joined by newlines, with surrounding whitespace
            stripped. Pages that yield no text contribute an empty string.
        """
        pdf_reader = PdfReader(file_path)
        # `or ""` guards against a page whose extraction yields None.
        page_texts = [(page.extract_text() or "") for page in pdf_reader.pages]
        return "\n".join(page_texts).strip()

    def create_chunks(self, text: str, chunk_size: int = 500) -> List[str]:
        """Split text into chunks of roughly ``chunk_size`` characters.

        The text is split on ``'.'`` and the resulting sentences are packed
        greedily into chunks. A single sentence longer than ``chunk_size`` is
        kept whole as its own chunk.

        Args:
            text: Text to split.
            chunk_size: Soft upper bound for the length of a chunk.

        Returns:
            The list of chunks; an empty list when the text contains no
            non-blank sentence (empty, whitespace-only or dots-only input).
        """
        chunks = []
        current_chunk = ""

        for sentence in text.split('.'):
            if not sentence.strip():
                continue

            if len(current_chunk) + len(sentence) < chunk_size:
                current_chunk += sentence.strip() + ". "
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence.strip() + ". "

        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    def retrieve_relevant_chunks(self, query: str, top_k: int = 3) -> Dict[str, List]:
        """Retrieve the chunks most similar to the query, with their sources.

        Args:
            query: The user question.
            top_k: Maximum number of chunks to return.

        Returns:
            A dict with two parallel lists: ``'text'`` (chunk texts, most
            similar first) and ``'sources'`` (the PDF file name of each chunk).
            Both lists are empty when nothing has been indexed yet.
        """
        relevant_info = {
            'text': [],
            'sources': []
        }

        if not self.documents or self.document_embeddings is None:
            return relevant_info

        query_embedding = self.retriever.encode(query)
        similarities = util.cos_sim(query_embedding, self.document_embeddings)[0]

        # Plain Python ints are used as list indices / dict keys below.
        top_k_indices = similarities.argsort(descending=True)[:top_k].cpu().tolist()

        for idx in top_k_indices:
            relevant_info['text'].append(self.documents[idx])
            relevant_info['sources'].append(self.pdf_sources[idx])

        return relevant_info

    def generate_response(self, query: str, context: Dict[str, List]) -> str:
        """Generate a detailed response using the OpenAI chat completion API.

        Args:
            query: The user question.
            context: Result of ``retrieve_relevant_chunks``; only the
                ``'text'`` entries are sent to the model.

        Returns:
            The model's answer, or an error message string when the API call
            fails (exceptions are not propagated).
        """
        combined_context = " ".join(context['text'])

        system_prompt = """You are a helpful expert assistant that provides detailed, comprehensive answers based on the given context.
        Your responses should:
        1. Be thorough and well-explained
        2. Include all relevant details from the context
        3. Be structured in a clear, readable format
        4. Use proper paragraphs and formatting
        5. Maintain accuracy and relevance to the question

        If the information is not available in the context, respond with:
        "Sorry, I cannot assist you with that."
        """

        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"Using the following context, please provide a detailed answer to the question.\n\nContext: {combined_context}\n\nQuestion: {query}"}
                ],
                temperature=0.4,
                max_tokens=500
            )

            answer = response['choices'][0]['message']['content']
            return answer

        except Exception as e:
            return f"An error occurred while generating the response: {str(e)}"

    def answer_question(self, question: str) -> str:
        """Answer a question from the loaded documents.

        Args:
            question: The user question.

        Returns:
            The generated answer, or a human-readable message when no documents
            are loaded, the question is blank, nothing relevant was retrieved,
            or an error occurred.
        """
        try:
            if not self.documents:
                return "No documents have been loaded into the system. Please load some PDFs first."

            # Do not spend an embedding pass and an API call on a blank question.
            if not question or not question.strip():
                return "Please enter a question."

            context = self.retrieve_relevant_chunks(question)

            if not context['text']:
                return "Sorry, I cannot assist you with that."

            return self.generate_response(question, context)

        except Exception as e:
            return f"An error occurred while processing your question: {str(e)}"

## Develop the User Interface

Use Gradio to build a simple yet functional web-based interface where users can input their queries.

In [None]:
# Define Gradio interface
def main(pdf_directory: str = "/content/sample_data/Data"):
    """Build the QA system, index the PDFs and launch the Gradio interface.

    Args:
        pdf_directory: Directory containing the PDFs to index. Defaults to the
            Colab sample location used by this notebook; pass another path to
            run the assistant on a different document set.
    """
    # Instantiate the QA system
    qa_system = MultiPDFQuestionAnswering()

    # Load all PDFs from the specified directory
    qa_system.load_multiple_pdfs(pdf_directory)

    # Define the function that will be used by the Gradio interface
    def gradio_interface(question):
        # Return the answer generated by the QA system
        return qa_system.answer_question(question)

    # Create and launch the Gradio interface
    gr.Interface(
        fn=gradio_interface,
        inputs="text",
        outputs="text",
        title="Banking Assistance"
    ).launch()

# Entry point: run main() only when this code is executed directly
# (i.e. when __name__ is "__main__"), not when it is imported as a module
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Found 1 PDF files in directory
Loading PDF: bank-services-agreement-privacy-notice.pdf...
Successfully loaded bank-services-agreement-privacy-notice.pdf
Total chunks in system: 571
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7c15b92cc6ddf6708f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
