In [23]:
# Import libraries

import os
import PyPDF2
import re
import numpy as np
import openai
from typing import List, Dict, Any, Tuple
import logging
import time

In [2]:
# Set up logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [3]:
# Read pdf files from a directory and extract text using PyPDF2
# Define the class for reading PDF files

class SimplePDFReader:
    """Class for reading PDF files from a directory."""
    
    def read_pdf(self, pdf_path: str) -> str:
        """
        Extract text from a single PDF file.
        
        Args:
            pdf_path: Path to the PDF file
            
        Returns:
            Extracted text from the PDF
        """
        if not os.path.exists(pdf_path):
            logger.error(f"PDF file not found: {pdf_path}")
            return ""
        
        try:
            text = ""
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                num_pages = len(pdf_reader.pages)
                
                logger.info(f"Reading {pdf_path} with {num_pages} pages")
                
                for page_num in range(num_pages):
                    page = pdf_reader.pages[page_num]
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
                    
                    # Log progress for large PDFs
                    if num_pages > 20 and (page_num + 1) % 10 == 0:
                        logger.info(f"Progress: {page_num + 1}/{num_pages} pages processed")
            
            # Clean the text
            text = self._clean_text(text)
            logger.info(f"Successfully extracted {len(text)} characters from {pdf_path}")
            return text
        except Exception as e:
            logger.error(f"Error reading PDF {pdf_path}: {str(e)}")
            return ""
    
    def _clean_text(self, text: str) -> str:
        """Clean the extracted text."""
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
    
    def read_pdfs_from_directory(self, directory_path: str) -> Dict[str, str]:
        """
        Read all PDFs from a directory.
        
        Args:
            directory_path: Path to directory containing PDFs
            
        Returns:
            Dictionary mapping filenames to their extracted text content
        """
        if not os.path.isdir(directory_path):
            logger.error(f"Directory not found: {directory_path}")
            return {}
        
        pdf_contents = {}
        pdf_files = [f for f in os.listdir(directory_path) if f.lower().endswith('.pdf')]
        
        if not pdf_files:
            logger.warning(f"No PDF files found in {directory_path}")
            return {}
        
        logger.info(f"Found {len(pdf_files)} PDF files in {directory_path}")
        
        for i, pdf_file in enumerate(pdf_files):
            pdf_path = os.path.join(directory_path, pdf_file)
            logger.info(f"Processing PDF {i+1}/{len(pdf_files)}: {pdf_file}")
            
            text = self.read_pdf(pdf_path)
            if text:
                pdf_contents[pdf_file] = text
            else:
                logger.warning(f"No text was extracted from {pdf_file}")
        
        successful = sum(1 for text in pdf_contents.values() if text)
        logger.info(f"Successfully processed {successful} out of {len(pdf_files)} PDF files")
        
        return pdf_contents
    
    def save_extracted_text(self, pdf_contents: Dict[str, str], output_dir: str) -> None:
        """
        Save extracted text from PDFs to individual text files.
        
        Args:
            pdf_contents: Dictionary mapping filenames to their extracted text
            output_dir: Directory where text files will be saved
        """
        if not pdf_contents:
            logger.warning("No PDF contents to save")
            return
        
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        for pdf_file, text in pdf_contents.items():
            # Create an output filename by replacing .pdf extension with .txt
            output_filename = os.path.splitext(pdf_file)[0] + ".txt"
            output_path = os.path.join(output_dir, output_filename)
            
            try:
                with open(output_path, 'w', encoding='utf-8') as file:
                    file.write(text)
                logger.info(f"Saved extracted text to {output_path}")
            except Exception as e:
                logger.error(f"Error saving text for {pdf_file}: {str(e)}")

In [4]:
# Run the PDF reader

reader = SimplePDFReader()
pdf_directory = r"\Users\yyy04\Downloads\New folder\data"

pdf_contents = reader.read_pdfs_from_directory(pdf_directory)

# Print a summary of what was read
for filename, content in pdf_contents.items():
    print(f"{filename}: {len(content)} characters, sample: {content[:100]}...")

2025-05-07 10:07:03,160 - INFO - Found 34 PDF files in \Users\yyy04\Downloads\New folder\data
2025-05-07 10:07:03,160 - INFO - Processing PDF 1/34: III100(A).pdf
2025-05-07 10:07:03,175 - INFO - Reading \Users\yyy04\Downloads\New folder\data\III100(A).pdf with 16 pages
2025-05-07 10:07:03,384 - INFO - Successfully extracted 61630 characters from \Users\yyy04\Downloads\New folder\data\III100(A).pdf
2025-05-07 10:07:03,385 - INFO - Processing PDF 2/34: III100(B).pdf
2025-05-07 10:07:03,397 - INFO - Reading \Users\yyy04\Downloads\New folder\data\III100(B).pdf with 1 pages
2025-05-07 10:07:03,408 - INFO - Successfully extracted 1816 characters from \Users\yyy04\Downloads\New folder\data\III100(B).pdf
2025-05-07 10:07:03,409 - INFO - Processing PDF 3/34: III100(C).pdf
2025-05-07 10:07:03,419 - INFO - Reading \Users\yyy04\Downloads\New folder\data\III100(C).pdf with 1 pages
2025-05-07 10:07:03,430 - INFO - Successfully extracted 2123 characters from \Users\yyy04\Downloads\New folder\data\III

III100(A).pdf: 61630 characters, sample: University Policies [ Section III: Academic Affairs ](https://policies.umd.edu/academic-affairs) Pol...
III100(B).pdf: 1816 characters, sample: University Policies [ Section III: Academic Affairs ](https://policies.umd.edu/academic-affairs) Pol...
III100(C).pdf: 2123 characters, sample: University Policies [ Section III: Academic Affairs ](https://policies.umd.edu/academic-affairs) Pol...
III100.pdf: 10081 characters, sample: USM Bylaws, Policies and Procedures of the Board of Regents III-1.00-1 III-1.00 - POLICY ON ACADEMIC...
III110.pdf: 14588 characters, sample: USM Bylaws, Policies and Procedures of the Board of Regents UNIVERSITY SYSTEM OF MARYLAND III-1.10 -...
III110A.pdf: 99318 characters, sample: III-1.10(A) UNIVERSITY OF MARYLAND POLICY AND PROCEDURES CONCERNING SCHOLARLY MISCONDUCT (Approved b...
III111.pdf: 7868 characters, sample: USM Bylaws, Policies and Procedures of the Board of Regents III‐1.11 POLICY ON CONFLICTS OF INTEREST...

In [17]:
import uuid
from typing import List, Dict, Any, Optional

# Required libraries for embeddings and ChromaDB
import openai
import chromadb
from chromadb.utils import embedding_functions

api_key = "***"

In [25]:
# Create the pipeline

class PDFEmbeddingPipeline:
    """Pipeline for reading PDFs, generating embeddings, and storing in ChromaDB."""
    
    def __init__(
        self,
        openai_api_key: str,
        chroma_db_path: str = "./chroma_db",
        collection_name: str = "pdf_embeddings",
        embedding_model: str = "text-embedding-ada-002",
        chunk_size: int = 1000,
        chunk_overlap: int = 200
    ):
        """
        Initialize the PDF to Embeddings pipeline.
        
        Args:
            openai_api_key: OpenAI API key for embeddings
            chroma_db_path: Directory to store ChromaDB
            collection_name: Name of the ChromaDB collection
            embedding_model: OpenAI embedding model name
            chunk_size: Size of text chunks
            chunk_overlap: Overlap between chunks
        """
        self.openai_api_key = openai_api_key
        openai.api_key = openai_api_key
        self.embedding_model = embedding_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        
        # Initialize ChromaDB
        self.chroma_client = chromadb.PersistentClient(path=chroma_db_path)
        
        # Create or get the collection
        openai_ef = embedding_functions.OpenAIEmbeddingFunction(
            api_key=openai_api_key,
            model_name=embedding_model
        )
        
    
        try:
            self.collection = self.chroma_client.get_collection(
                name=collection_name,
                embedding_function=openai_ef
            )
            logger.info(f"Connected to existing collection: {collection_name}")
        except ValueError:
            self.collection = self.chroma_client.create_collection(
                name=collection_name,
                embedding_function=openai_ef
            )
            logger.info(f"Created new collection: {collection_name}")
    
    def read_pdf(self, pdf_path: str) -> str:
        """
        Extract text from a PDF file.
        
        Args:
            pdf_path: Path to the PDF file
            
        Returns:
            Extracted text from the PDF
        """
        if not os.path.exists(pdf_path):
            logger.error(f"PDF file not found: {pdf_path}")
            return ""
        
        try:
            text = ""
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                num_pages = len(pdf_reader.pages)
                
                logger.info(f"Reading {pdf_path} with {num_pages} pages")
                
                for page_num in range(num_pages):
                    page = pdf_reader.pages[page_num]
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
                    
                    # Log progress for large PDFs
                    if num_pages > 20 and (page_num + 1) % 10 == 0:
                        logger.info(f"Progress: {page_num + 1}/{num_pages} pages processed")
            
            # Clean the text
            text = self._clean_text(text)
            logger.info(f"Successfully extracted {len(text)} characters from {pdf_path}")
            return text
        except Exception as e:
            logger.error(f"Error reading PDF {pdf_path}: {str(e)}")
            return ""
    
    def _clean_text(self, text: str) -> str:
        """Clean the extracted text."""
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
    
    def chunk_text(self, text: str, metadata: Dict[str, str]) -> List[Dict[str, Any]]:
        """
        Split text into chunks with specified overlap and add metadata.
        
        Args:
            text: Text to chunk
            metadata: Metadata for the document (e.g., filename, path)
            
        Returns:
            List of dictionaries with text chunks and metadata
        """
        if not text:
            return []
        
        chunks = []
        start = 0
        chunk_id = 0
        
        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            
            # If we're not at the end of the text, try to break at a logical point
            if end < len(text):
                # Look for good breakpoints: paragraph, sentence, or word
                paragraph_break = text.rfind('\n', start, end)
                sentence_break = text.rfind('. ', start, end)
                space_break = text.rfind(' ', start, end)
                
                if paragraph_break != -1 and paragraph_break > start + self.chunk_size // 2:
                    end = paragraph_break + 1
                elif sentence_break != -1 and sentence_break > start + self.chunk_size // 2:
                    end = sentence_break + 2
                elif space_break != -1:
                    end = space_break + 1
            
            chunk_text = text[start:end].strip()
            if chunk_text:
                # Create a unique ID for this chunk
                unique_id = f"{metadata.get('filename', 'doc')}_{chunk_id}"
                
                # Create chunk with metadata
                chunk = {
                    "id": unique_id,
                    "text": chunk_text,
                    "metadata": {
                        **metadata,
                        "chunk_id": chunk_id,
                        "start_char": start,
                        "end_char": end
                    }
                }
                chunks.append(chunk)
                chunk_id += 1
            
            # Move the start position, accounting for overlap
            start = end - self.chunk_overlap if end - self.chunk_overlap > start else end
        
        logger.info(f"Created {len(chunks)} chunks from text")
        return chunks
    
    def add_to_chroma(self, chunks: List[Dict[str, Any]]) -> None:
        """
        Add chunks to ChromaDB.
        
        Args:
            chunks: List of dictionaries with text chunks and metadata
        """
        if not chunks:
            logger.warning("No chunks to add to ChromaDB")
            return
        
        batch_size = 20  # Process in batches to avoid rate limits
        total_added = 0
        
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i+batch_size]
            
            try:
                # Extract data for ChromaDB
                ids = [chunk["id"] for chunk in batch]
                texts = [chunk["text"] for chunk in batch]
                metadatas = [chunk["metadata"] for chunk in batch]
                
                # Add documents to ChromaDB
                self.collection.add(
                    ids=ids,
                    documents=texts,
                    metadatas=metadatas
                )
                
                total_added += len(batch)
                logger.info(f"Added batch {i//batch_size + 1}/{(len(chunks)-1)//batch_size + 1} to ChromaDB")
                
            except Exception as e:
                logger.error(f"Error adding batch to ChromaDB: {str(e)}")
            
            # Add a small delay to avoid rate limiting
            if i + batch_size < len(chunks):
                time.sleep(0.5)
        
        logger.info(f"Successfully added {total_added} chunks to ChromaDB")
    
    def process_pdf(self, pdf_path: str) -> int:
        """
        Process a PDF file: read, chunk, and add to ChromaDB.
        
        Args:
            pdf_path: Path to the PDF file
            
        Returns:
            Number of chunks added to ChromaDB
        """
        # Extract filename from path
        filename = os.path.basename(pdf_path)
        
        # Create metadata for this PDF
        metadata = {
            "filename": filename,
            "source_path": pdf_path,
            "processed_date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        
        # Read PDF
        text = self.read_pdf(pdf_path)
        if not text:
            logger.warning(f"No text extracted from {pdf_path}")
            return 0
        
        # Chunk the text
        chunks = self.chunk_text(text, metadata)
        if not chunks:
            logger.warning(f"No chunks created from {pdf_path}")
            return 0
        
        # Add chunks to ChromaDB
        self.add_to_chroma(chunks)
        
        return len(chunks)
    
    def process_directory(self, directory_path: str) -> Dict[str, int]:
        """
        Process all PDFs in a directory.
        
        Args:
            directory_path: Path to directory containing PDFs
            
        Returns:
            Dictionary mapping filenames to number of chunks added
        """
        if not os.path.isdir(directory_path):
            logger.error(f"Directory not found: {directory_path}")
            return {}
        
        results = {}
        pdf_files = [f for f in os.listdir(directory_path) if f.lower().endswith('.pdf')]
        
        if not pdf_files:
            logger.warning(f"No PDF files found in {directory_path}")
            return {}
        
        logger.info(f"Found {len(pdf_files)} PDF files in {directory_path}")
        
        for i, pdf_file in enumerate(pdf_files):
            pdf_path = os.path.join(directory_path, pdf_file)
            logger.info(f"Processing PDF {i+1}/{len(pdf_files)}: {pdf_file}")
            
            chunks_added = self.process_pdf(pdf_path)
            results[pdf_file] = chunks_added
        
        return results
    
    def query(self, query_text: str, n_results: int = 5) -> Dict[str, Any]:
        """
        Query the ChromaDB for similar chunks.
        
        Args:
            query_text: Text to search for
            n_results: Number of results to return
            
        Returns:
            Dictionary with query results
        """
        try:
            results = self.collection.query(
                query_texts=[query_text],
                n_results=n_results
            )
            return results
        except Exception as e:
            logger.error(f"Error querying ChromaDB: {str(e)}")
            return {}

In [26]:
pipeline = PDFEmbeddingPipeline(
    openai_api_key=api_key,
    chroma_db_path="./chroma_db",
    chunk_size=1000,
    chunk_overlap=200
)

# Process an entire directory
results = pipeline.process_directory("./data")
print(f"Processed {len(results)} PDF files")

# Query the database
similar_chunks = pipeline.query("What is machine learning?", n_results=3)

2025-05-07 10:36:06,095 - INFO - Connected to existing collection: pdf_embeddings
2025-05-07 10:36:06,096 - INFO - Found 34 PDF files in ./data
2025-05-07 10:36:06,096 - INFO - Processing PDF 1/34: III100(A).pdf
2025-05-07 10:36:06,109 - INFO - Reading ./data\III100(A).pdf with 16 pages
2025-05-07 10:36:06,284 - INFO - Successfully extracted 61630 characters from ./data\III100(A).pdf
2025-05-07 10:36:06,285 - INFO - Created 91 chunks from text
2025-05-07 10:36:07,085 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-05-07 10:36:07,229 - INFO - Added batch 1/5 to ChromaDB
2025-05-07 10:36:08,406 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-05-07 10:36:08,573 - INFO - Added batch 2/5 to ChromaDB
2025-05-07 10:36:09,257 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-05-07 10:36:09,303 - INFO - Added batch 3/5 to ChromaDB
2025-05-07 10:36:10,864 - INFO - HTTP Request: P

Processed 34 PDF files
