In [3]:
! pip install PyPDF2
! pip install chromadb
! pip install tavily

Collecting tavily
  Downloading tavily-1.1.0.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tavily
  Building wheel for tavily (setup.py) ... [?25l[?25hdone
  Created wheel for tavily: filename=tavily-1.1.0-py3-none-any.whl size=6128 sha256=a7d8b92aa0500e2a08ba825bef877ec5a3a1b08e95bcac8321fac79dd431708f
  Stored in directory: /root/.cache/pip/wheels/a7/67/b7/9aec4851724de28ac2bc34ff10b042af43d7f2dd1552e5906e
Successfully built tavily
Installing collected packages: tavily
Successfully installed tavily-1.1.0


In [2]:
from openai import OpenAI
import gradio as gr
import os
import PyPDF2
import chromadb
from dotenv import load_dotenv
import requests
from urllib.parse import urlparse
import re

In [4]:
os.environ["OPENAI_API_KEY"] = "" #Add your OPENAI API Key
load_dotenv()
from openai import OpenAI
client = OpenAI(api_key="")

In [5]:
# Initialize Tavily client with error handling
try:
    from tavily import TavilyClient
    tavily_client = TavilyClient(api_key="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")
    TAVILY_AVAILABLE = True
except ImportError:
    TAVILY_AVAILABLE = False
    print("‚ö†Ô∏è  Tavily not installed. Web search functionality will be disabled.")
    print("üí° Run: pip install tavily-python")

In [6]:
# Initialize ChromaDB
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="Physics-PDFs")

In [7]:
def extract_text_from_pdf(pdf_path):
    """Extract text from PDF file"""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text

In [8]:
def chunk_text(text, chunk_size=1000):
    """Split text into chunks"""
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

In [9]:
def store_pdf_in_db(pdf_path):
    """Extract, chunk and store PDF in ChromaDB"""
    try:
        text = extract_text_from_pdf(pdf_path)
        chunks = chunk_text(text)

        # Clear existing data
        collection.delete(where={"source": {"$ne": ""}})  # Delete all documents

        # Add chunks to ChromaDB
        for i, chunk in enumerate(chunks):
            collection.add(
                documents=[chunk],
                metadatas=[{"source": pdf_path, "chunk_id": i}],
                ids=[f"chunk_{i}_{os.path.basename(pdf_path)}"]
            )
        return f"‚úÖ Successfully loaded {len(chunks)} chunks from {pdf_path}"
    except Exception as e:
        return f"‚ùå Error: {str(e)}"

In [10]:
def get_relevant_context(query, n_results=3):
    """Retrieve relevant context from ChromaDB"""
    try:
        results = collection.query(
            query_texts=[query],
            n_results=n_results
        )
        return "\n\n".join(results['documents'][0]) if results['documents'] else ""
    except Exception as e:
        print(f"ChromaDB query error: {e}")
        return ""

In [11]:
def is_pdf_context_relevant(pdf_context, query):
    """Check if PDF context is actually relevant to the query"""
    if not pdf_context or len(pdf_context.strip()) < 50:
        return False

    # Check for common irrelevant patterns in PDF context
    irrelevant_patterns = [
        "table of contents", "copyright", "abstract", "references",
        "appendix", "index", "acknowledg", "preface"
    ]

    pdf_lower = pdf_context.lower()
    query_lower = query.lower()

    # If PDF context contains many irrelevant sections, it's likely not useful
    irrelevant_count = sum(1 for pattern in irrelevant_patterns if pattern in pdf_lower)
    if irrelevant_count > 2:
        return False

    # Check if query terms actually appear in the context
    query_terms = query_lower.split()
    relevant_terms = sum(1 for term in query_terms if len(term) > 3 and term in pdf_lower)

    # If less than 30% of meaningful query terms are in context, consider it irrelevant
    meaningful_terms = [term for term in query_terms if len(term) > 3]
    if meaningful_terms and (relevant_terms / len(meaningful_terms)) < 0.3:
        return False

    return True

In [12]:
def search_with_tavily(query):
    """Search using Tavily and get content from first result"""
    if not TAVILY_AVAILABLE:
        return "Web search unavailable. Please install tavily-python: pip install tavily-python"

    try:
        # Search for the query
        search_response = tavily_client.search(
            query=query,
            search_depth="basic",
            max_results=3
        )

        if not search_response.get('results'):
            return "No search results found."

        # Get the first result
        first_result = search_response['results'][0]
        first_url = first_result.get('url', '')

        # Use Tavily's content if available
        content = first_result.get('content', '')

        if not content:
            content = f"Title: {first_result.get('title', 'N/A')}\nURL: {first_url}"

        # Check if it's a PDF
        if first_url.lower().endswith('.pdf'):
            return f"üìÑ PDF Source: {first_url}\n\nThis appears to be a PDF document. Please visit the link to view the PDF: {first_url}"

        return f"üîç Search Result from: {first_url}\n\n{content}"

    except Exception as e:
        return f"‚ùå Search error: {str(e)}"

In [13]:
def chat_with_physics_bot(message, history):
    # First try to get relevant context from PDF
    pdf_context = get_relevant_context(message)

    # Check if PDF context is actually relevant
    use_pdf = is_pdf_context_relevant(pdf_context, message)

    system_prompt = """You are a friendly Physics Teacher Bot. Follow these rules:
    1. Use the provided context to answer questions accurately
    2. If the context doesn't fully answer the question, use your knowledge to provide a complete answer
    3. Be educational and clear in your explanations"""

    if use_pdf:
        # Use PDF context
        user_content = f"Context from PDF:\n{pdf_context}\n\nQuestion: {message}"
        source_note = "üìö Answer based on uploaded PDF"
    else:
        # Use web search
        search_results = search_with_tavily(message)
        user_content = f"Search Results:\n{search_results}\n\nQuestion: {message}"
        source_note = "üîç Answer based on web search"

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content}
            ],
            temperature=0.2
        )

        bot_response = response.choices[0].message.content
        return f"{source_note}\n\n{bot_response}"

    except Exception as e:
        return f"‚ùå Error generating response: {str(e)}"

In [15]:
def load_pdf_and_chat(pdf_file, message, history):
    # First store the PDF
    result_msg = store_pdf_in_db(pdf_file.name)

    # Then get response
    response = chat_with_physics_bot(message, history)

    return result_msg, response

In [16]:
def launch_app():
    with gr.Blocks() as demo:
        gr.Markdown("# ‚öõÔ∏è Physics Teacher chatbot with PDF RAG")

        with gr.Row():
            with gr.Column():
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                load_btn = gr.Button("Load PDF into Knowledge Base")
                load_status = gr.Textbox(label="Load Status", interactive=False)

                gr.Markdown("""
                ### How it works:
                1. **Upload a PDF** - Answers will prioritize PDF content ONLY when relevant
                2. **Ask questions** - If PDF doesn't have relevant answers, web search will be used
                3. **Clear indicators** - Sources are clearly marked (üìö PDF or üîç Web)
                """)

            with gr.Column():
                chat_interface = gr.ChatInterface(
                    fn=chat_with_physics_bot,
                    title="Chat with Physics Teacher Bot",
                    description="Ask questions about the loaded PDF or related to Physics concepts. The bot will use PDF content only when relevant, otherwise use web search."
                )

        # Load PDF when button clicked
        load_btn.click(
            fn=lambda pdf: store_pdf_in_db(pdf.name) if pdf else "Please upload a PDF first",
            inputs=[pdf_input],
            outputs=[load_status]
        )

    demo.launch()

In [17]:
import requests
import os

def download_pdf_from_github(url, filename="PhysicsTeacher.pdf"):
    response = requests.get(url)
    response.raise_for_status()  # ensure no errors
    with open(filename, "wb") as f:
        f.write(response.content)
    return filename

In [18]:
if __name__ == "__main__":
    # Correct GitHub raw URL
    github_pdf_url = "https://github.com/sumit1311singh/PhysicsTeacher-CurrentElectricity-Bot/raw/main/Current_Electricity.pdf"

    # Download PDF
    pdf_file = download_pdf_from_github(github_pdf_url, "Physics-PDFs")
    print(f"Downloaded PDF: {pdf_file}")

    # Store in DB
    store_pdf_in_db(pdf_file)

    # Launch app
    launch_app()

Downloaded PDF: Physics-PDFs


/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 79.3M/79.3M [00:00<00:00, 92.4MiB/s]
  self.chatbot = Chatbot(


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5eb52757e590cadfe9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
