In [3]:
# notebooks/model_exploration.py

import os
import nltk
from PyPDF2 import PdfReader
from google import generativeai as genai

nltk.download("punkt")

GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
if not GOOGLE_API_KEY:
    raise ValueError("Set GOOGLE_API_KEY in your environment variables before running.")
genai.configure(api_key=GOOGLE_API_KEY)

model = genai.GenerativeModel("gemini-1.5-flash")

def load_pdf_text(pdf_path):
    """
    Extract and return all text from a PDF file.
    
    Args:
        pdf_path (str): Path to the PDF file.
    
    Returns:
        str: Full text extracted from the PDF.
    """
    reader = PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        text = page.extract_text()
        if text:
            full_text += text + "\n"
    return full_text

def chunk_text(text, max_len=500):
    """
    Split text into sentence-based chunks up to a maximum character length.

    Args:
        text (str): Input text.
        max_len (int): Maximum character length for each chunk.

    Returns:
        list: List of text chunks.
    """
    from nltk.tokenize import sent_tokenize

    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sent in sentences:
        if len(current_chunk) + len(sent) <= max_len:
            current_chunk += " " + sent
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sent
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

def answer_question(context, question):
    """
    Answer a question using Gemini based on a given context.

    Args:
        context (str): Context text to search for the answer.
        question (str): Question to be answered.

    Returns:
        str: Answer generated by Gemini, or None if no answer is found.
    """
    prompt = f"""
You are a helpful assistant. Use the context below to answer the question.

Context:
{context}

Question: {question}
Answer:
"""
    response = model.generate_content(prompt)
    return response.text.strip() if response and response.text else None

if __name__ == "__main__":
    """
    Load text from a sample PDF, chunk it, and attempt to answer a sample question.
    """
    pdf_file = "../data/sample_document.pdf"
    text = load_pdf_text(pdf_file)
    print(f"Loaded text length: {len(text)}")

    chunks = chunk_text(text)
    print(f"Split into {len(chunks)} chunks")

    question = "What is the deadline for submission?"
    for chunk in chunks:
        answer = answer_question(chunk, question)
        if answer and answer.lower() not in ["", "no answer", "not found"]:
            print(f"Answer: {answer}")
            break


[nltk_data] Downloading package punkt to C:\Users\Dell Precision
[nltk_data]     3560\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: '../data/sample_document.pdf'