In [1]:
import os
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

def extract_text_from_pdf(pdf_path, tesseract_cmd=None, min_text_chars=30, dpi=300):
    """
    Extract text from a PDF using a two-step process:
    1. Direct text extraction via pdfplumber (page by page).
    2. Fallback to OCR (using pytesseract) if a page is empty or has too few chars.

    :param pdf_path: Path to the PDF file
    :param tesseract_cmd: (Optional) Path to Tesseract executable if not on PATH
    :param min_text_chars: Minimum threshold for text length to consider "valid"
    :param dpi: Resolution used when converting PDF pages to images for OCR
    :return: Extracted text as a single string
    """
    # If Tesseract is not on PATH, specify its full path
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    all_extracted_text = []

    # Open the PDF with pdfplumber
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        for page_number in range(total_pages):
            page_text = pdf.pages[page_number].extract_text() or ""
            page_text = page_text.strip()

            # If there's enough direct text, use it
            if len(page_text) >= min_text_chars:
                all_extracted_text.append(f"Page {page_number+1}:\n{page_text}\n")
            else:
                # Fallback: OCR
                images = convert_from_path(pdf_path, dpi=dpi, 
                                           first_page=page_number+1, 
                                           last_page=page_number+1)
                if images:
                    ocr_text = pytesseract.image_to_string(images[0]) or ""
                    ocr_text = ocr_text.strip()
                    if ocr_text:
                        all_extracted_text.append(f"Page {page_number+1} (OCR):\n{ocr_text}\n")
                    else:
                        # If OCR also returns empty, note it
                        all_extracted_text.append(f"Page {page_number+1}:\n[No text found]\n")

    # Combine everything
    return "\n".join(all_extracted_text)

def process_multiple_pdfs(pdf_folder, output_folder, tesseract_cmd=None):
    """
    Process multiple PDFs from pdf_folder, extracting text (with OCR fallback),
    and save the text to output_folder in .txt files.
    
    :param pdf_folder: Folder containing PDF files
    :param output_folder: Folder to save the extracted .txt files
    :param tesseract_cmd: Optional path to Tesseract executable
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(pdf_folder):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            extracted_text = extract_text_from_pdf(pdf_path, tesseract_cmd=tesseract_cmd)
            
            # Save to a .txt file (same base name as the PDF)
            txt_filename = filename.rsplit(".pdf", 1)[0] + ".txt"
            txt_path = os.path.join(output_folder, txt_filename)
            with open(txt_path, "w", encoding="utf-8") as f:
                f.write(extracted_text)

            print(f"Extracted text saved to: {txt_path}")

if __name__ == "__main__":
    # Example usage:
    pdf_input_dir = "resources/ATEP CC&Rs"  # your PDF folder
    output_text_dir = "output/pdf"         # where to store the .txt files
    
    # If Tesseract is not on PATH, specify its full location:
    # e.g., tesseract_cmd = "C:/Program Files/Tesseract-OCR/tesseract.exe"
    tesseract_cmd = None

    process_multiple_pdfs(pdf_input_dir, output_text_dir, tesseract_cmd)


Extracted text saved to: output/pdf/IRVLIB-#1439777-v1-RECORDED___Supplemental_Declaration_of_CC&R_s_re_ATEP_(Doc_#2019000073116).PDF.txt
Extracted text saved to: output/pdf/IRVLIB-#1412730-v1-RECORDED___CC&Rs_(Doc_#2018000255183).PDF.txt


In [2]:
import os
from typing import List

import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS


def extract_documents_from_pdf(pdf_path: str,
                               tesseract_cmd: str = None,
                               min_text_chars: int = 30,
                               dpi: int = 300) -> List[Document]:
    """
    Extracts text from a PDF file page by page.
    Uses direct extraction via pdfplumber; if a page returns too little text,
    it falls back to OCR via pytesseract.
    
    Args:
        pdf_path (str): Path to the PDF file.
        tesseract_cmd (str, optional): Full path to Tesseract executable if not on PATH.
        min_text_chars (int, optional): Minimum number of characters to consider the page valid.
        dpi (int, optional): Resolution for converting PDF pages to images for OCR.
    
    Returns:
        List[Document]: A list of Document objects (one per page) with metadata.
    """
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    documents = []
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        for page_number in range(total_pages):
            page = pdf.pages[page_number]
            page_text = page.extract_text() or ""
            page_text = page_text.strip()

            # If extracted text is too short, fallback to OCR
            if len(page_text) < min_text_chars:
                images = convert_from_path(pdf_path, dpi=dpi,
                                           first_page=page_number + 1,
                                           last_page=page_number + 1)
                if images:
                    ocr_text = pytesseract.image_to_string(images[0]) or ""
                    ocr_text = ocr_text.strip()
                    page_text = ocr_text

            # If still empty, note it (or you could choose to skip this page)
            if not page_text:
                page_text = "[No text found]"

            metadata = {"source": os.path.basename(pdf_path), "page": page_number + 1}
            documents.append(Document(page_content=page_text, metadata=metadata))
    return documents


def build_pdf_vectorstore(pdf_directory: str,
                          tesseract_cmd: str = None,
                          min_text_chars: int = 30,
                          dpi: int = 300,
                          chunk_size: int = 500,
                          chunk_overlap: int = 100) -> FAISS:
    """
    Processes all PDF files in a given directory by extracting text from each page
    (with an OCR fallback) and builds a FAISS vector store from text chunks.
    
    Args:
        pdf_directory (str): Folder containing PDF files.
        tesseract_cmd (str, optional): Path to Tesseract executable (if needed).
        min_text_chars (int, optional): Minimum characters required for direct text extraction.
        dpi (int, optional): DPI for image conversion for OCR.
        chunk_size (int, optional): Maximum number of characters per chunk.
        chunk_overlap (int, optional): Overlap between chunks.
    
    Returns:
        FAISS: A FAISS vector store containing the embedded document chunks.
    """
    all_documents = []
    # Process each PDF in the directory
    for filename in os.listdir(pdf_directory):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_directory, filename)
            docs = extract_documents_from_pdf(pdf_path,
                                              tesseract_cmd=tesseract_cmd,
                                              min_text_chars=min_text_chars,
                                              dpi=dpi)
            all_documents.extend(docs)
    
    # Split the extracted documents into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
    docs_chunks = text_splitter.split_documents(all_documents)
    # Filter out any empty chunks
    docs_chunks = [doc for doc in docs_chunks if doc.page_content.strip()]
    
    if not docs_chunks:
        raise ValueError("No valid text chunks found to build the vector store. "
                         "Check your OCR output or text splitter settings.")
    
    # Compute embeddings using a free, small model and build the FAISS vector store
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(docs_chunks, embeddings)
    
    return vectorstore


if __name__ == "__main__":
    # Set the directory containing your PDF files
    pdf_directory = "resources/pdf"
    
    # Optionally, specify the full path to the Tesseract executable if it is not in your PATH.
    tesseract_cmd = None  # e.g., "C:/Program Files/Tesseract-OCR/tesseract.exe"
    
    # Build the vector store directly from PDFs
    vector_store = build_pdf_vectorstore(pdf_directory,
                                         tesseract_cmd=tesseract_cmd,
                                         min_text_chars=30,
                                         dpi=300,
                                         chunk_size=500,
                                         chunk_overlap=100)
    
    print("Vector store successfully built. The following document chunks are stored:")
    for doc in vector_store.docstore._dict.values():
        source = doc.metadata.get("source", "Unknown")
        page = doc.metadata.get("page", "Unknown")
        print(f"Document from {source} - Page {page}")


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


Vector store successfully built. The following document chunks are stored:
Document from IRVLIB-#1439777-v1-RECORDED___Supplemental_Declaration_of_CC&R_s_re_ATEP_(Doc_#2019000073116).PDF - Page 1
Document from IRVLIB-#1439777-v1-RECORDED___Supplemental_Declaration_of_CC&R_s_re_ATEP_(Doc_#2019000073116).PDF - Page 1
Document from IRVLIB-#1439777-v1-RECORDED___Supplemental_Declaration_of_CC&R_s_re_ATEP_(Doc_#2019000073116).PDF - Page 2
Document from IRVLIB-#1439777-v1-RECORDED___Supplemental_Declaration_of_CC&R_s_re_ATEP_(Doc_#2019000073116).PDF - Page 3
Document from IRVLIB-#1439777-v1-RECORDED___Supplemental_Declaration_of_CC&R_s_re_ATEP_(Doc_#2019000073116).PDF - Page 3
Document from IRVLIB-#1439777-v1-RECORDED___Supplemental_Declaration_of_CC&R_s_re_ATEP_(Doc_#2019000073116).PDF - Page 3
Document from IRVLIB-#1439777-v1-RECORDED___Supplemental_Declaration_of_CC&R_s_re_ATEP_(Doc_#2019000073116).PDF - Page 3
Document from IRVLIB-#1439777-v1-RECORDED___Supplemental_Declaration_of_CC&R_s

In [1]:
import subprocess
from langchain.docstore.document import Document

# -----------------------------------------------
# Step 1: Query Handling & Document Retrieval
# -----------------------------------------------
def retrieve_context(vectorstore, query: str, k: int = 3) -> str:
    """
    Retrieve the top k similar document chunks from the vector store,
    assembling them into a single context string that includes
    source filenames and page numbers.
    
    Args:
        vectorstore: The FAISS vector store.
        query (str): The user's query.
        k (int): Number of top matching chunks to retrieve.
    
    Returns:
        str: An assembled context string.
    """
    # Similarity search automatically encodes the query.
    retrieved_docs = vectorstore.similarity_search(query, k=k)
    
    context_parts = []
    for doc in retrieved_docs:
        source = doc.metadata.get("source", "Unknown source")
        page = doc.metadata.get("page", "Unknown page")
        text = doc.page_content.strip()
        context_parts.append(f"Source: {source} - Page: {page}\n{text}")
    
    # Separate chunks clearly
    context = "\n\n".join(context_parts)
    return context

# -----------------------------------------------
# Step 2: Answer Generation using DeepSeek R1:8B Locally
# -----------------------------------------------
def call_deepseek_local(prompt: str,
                        model: str = "deepseek-r1:8b",
                        temperature: float = 0.0,
                        max_tokens: int = 512) -> str:
    """
    Calls the locally installed DeepSeek R1:8B model via the Ollama CLI.
    
    Args:
        prompt (str): The prompt to feed into the model.
        model (str): The model identifier.
        temperature (float): Sampling temperature.
        max_tokens (int): Maximum tokens to generate.
    
    Returns:
        str: The raw output from the model.
    """
    # Build the command for the local call.
    command = [
        "ollama", "run", model,
        "--temperature", str(temperature),
        "--max-tokens", str(max_tokens)
    ]
    
    # Call the model locally (this blocks until completion).
    result = subprocess.run(command, input=prompt, capture_output=True, text=True)
    
    if result.returncode != 0:
        raise Exception("Local model execution failed: " + result.stderr)
    
    return result.stdout.strip()

def answer_query(vectorstore, query: str, k: int = 3) -> dict:
    """
    Given a user query, retrieve context from the vector store, construct
    a prompt, and generate an answer with the local DeepSeek R1:8B model.
    The model is instructed to include both the answer and the references.
    
    Args:
        vectorstore: The FAISS vector store.
        query (str): The user query.
        k (int): Number of retrieved chunks to use as context.
    
    Returns:
        dict: A dictionary with two keys:
              "answer": The answer text.
              "references": The document names and pages referenced.
    """
    # Retrieve context (includes source file names and page numbers).
    context = retrieve_context(vectorstore, query, k=k)
    
    # Construct the prompt with explicit instructions.
    prompt = (
        "You are a helpful assistant. Answer the query only using the following context. "
        "Do not include any information not present in the context. Include references to the source "
        "(i.e., file name and page number) in your answer.\n\n"
        f"Context:\n{context}\n\n"
        f"Query: {query}\n\n"
        "Please respond using the following format exactly:\n"
        "Answer: <your answer here>\n"
        "References: <list of source file names and page numbers, separated by commas>\n"
    )
    
    # Call the local DeepSeek model.
    output = call_deepseek_local(prompt)
    
    # Parse the output expecting the following format:
    # Answer: <text>
    # References: <text>
    answer_lines = []
    references_lines = []
    current_section = None
    for line in output.splitlines():
        if line.startswith("Answer:"):
            current_section = "answer"
            answer_lines.append(line[len("Answer:"):].strip())
        elif line.startswith("References:"):
            current_section = "references"
            references_lines.append(line[len("References:"):].strip())
        else:
            if current_section == "answer":
                answer_lines.append(line.strip())
            elif current_section == "references":
                references_lines.append(line.strip())
    
    answer_text = "\n".join(answer_lines).strip()
    references_text = "\n".join(references_lines).strip()
    
    return {"answer": answer_text, "references": references_text}

# -----------------------------------------------
# Integration Example (Final Output)
# -----------------------------------------------
if __name__ == "__main__":
    # Assume your FAISS vector store (vector_store) has been built already.
    # For example:
    # from your_vectorstore_module import build_pdf_vectorstore
    # pdf_directory = "resources/pdf"
    # vector_store = build_pdf_vectorstore(pdf_directory, ...)
    
    # Get the user query.
    query = input("Enter your query: ")
    
    try:
        result = answer_query(vector_store, query, k=3)
        print("\n--- Final Answer ---")
        print(result["answer"])
        print("\n--- References ---")
        print(result["references"])
    except Exception as e:
        print(f"An error occurred: {e}")


An error occurred: name 'vector_store' is not defined
