In [5]:
import pypdf
import os
from typing import List, Dict
import re

In [6]:
class DocumentParser:
    """Extract, clean, chunk and persist the text of every PDF in a directory.

    Attributes:
        pdf_directory: Directory that is scanned for PDF files.
        output_directory: Directory that receives one text file per chunk.
            It is created on construction if it does not exist.
        documents: Mapping of PDF filename -> list of text chunks, filled in
            by ``process_documents``.
    """

    def __init__(self, pdf_directory: str, output_directory: str):
        """Store the input/output locations and create the output directory.

        Args:
            pdf_directory: Directory containing the PDF files to process.
            output_directory: Directory the chunk files are written to.
        """
        self.pdf_directory = pdf_directory
        self.output_directory = output_directory
        self.documents = {}

        # Ensure the output directory exists
        os.makedirs(self.output_directory, exist_ok=True)

    def read_pdf(self, file_path: str) -> str:
        """Extract the text of every page of a PDF file.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The text of all pages, each page followed by a newline.
        """
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            # Collect per-page text and join once instead of growing a string
            # in a loop. ``or ''`` is defensive: a page for which the
            # extractor yields nothing contributes an empty string.
            page_texts = [
                (page.extract_text() or '') + '\n'
                for page in pdf_reader.pages
            ]
        return ''.join(page_texts)

    def clean_text(self, text: str) -> str:
        """Normalize extracted text to single-space-separated plain text.

        Characters other than word characters, whitespace and ``. , ! ? -``
        are removed, then every run of whitespace (including newlines) is
        collapsed to one space and the result is stripped.

        Args:
            text: Raw text to clean.

        Returns:
            The cleaned text.
        """
        # Remove special characters first: dropping a standalone symbol
        # (e.g. the "&" in "a & b") leaves two adjacent spaces behind, which
        # the whitespace pass below must still be able to collapse.
        text = re.sub(r'[^\w\s\.,!?-]', '', text)
        # Collapse all whitespace runs (spaces, tabs, newlines) to one space
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def split_into_chunks(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text on whitespace into chunks of at most ``chunk_size`` characters.

        Words are never broken. A single word longer than ``chunk_size``
        becomes a chunk of its own (and therefore exceeds the limit).

        Args:
            text: Text to split.
            chunk_size: Maximum length of a chunk, counting the single spaces
                that join its words.

        Returns:
            The list of chunks in order; empty when ``text`` has no words.
            No chunk is ever an empty string.
        """
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_size = 0  # always equals len(' '.join(current_chunk))

        for word in text.split():
            # A joining space is only needed when the chunk already has a word
            separator = 1 if current_chunk else 0
            projected_size = current_size + separator + len(word)
            # Only flush a non-empty chunk, so an oversized first word does
            # not produce a leading empty chunk.
            if current_chunk and projected_size > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)
                current_size = projected_size

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def save_chunks(self, filename: str, chunks: List[str]):
        """Write each chunk to its own UTF-8 text file in the output directory.

        Files are named ``<filename without extension>_chunk_<n>.txt`` with
        ``n`` starting at 1. Existing files with the same name are overwritten.

        Args:
            filename: Name of the source document; its extension is dropped.
            chunks: Chunks to write, in order.
        """
        base_filename = os.path.splitext(filename)[0]
        for i, chunk in enumerate(chunks):
            chunk_file_path = os.path.join(
                self.output_directory, f"{base_filename}_chunk_{i + 1}.txt")
            with open(chunk_file_path, 'w', encoding='utf-8') as f:
                f.write(chunk)

    def process_documents(self) -> Dict[str, List[str]]:
        """Read, clean, chunk and save every PDF found in ``pdf_directory``.

        Files are recognized by a ``.pdf`` extension (case-insensitive) and
        processed in sorted filename order so results are reproducible
        regardless of the order ``os.listdir`` happens to return.

        Returns:
            ``self.documents``: mapping of PDF filename -> list of chunks.
        """
        for filename in sorted(os.listdir(self.pdf_directory)):
            if not filename.lower().endswith('.pdf'):
                continue
            file_path = os.path.join(self.pdf_directory, filename)
            # Extract text
            raw_text = self.read_pdf(file_path)
            # Clean text
            cleaned_text = self.clean_text(raw_text)
            # Split into chunks
            chunks = self.split_into_chunks(cleaned_text)
            # Save chunks to output directory
            self.save_chunks(filename, chunks)
            # Store with filename as key
            self.documents[filename] = chunks

        return self.documents




In [7]:
if __name__ == "__main__":
    # Paths for input and output directories
    pdf_directory = '../pdfs'
    output_directory = '../parsed_content'

    # Parse every PDF in the input directory; chunk files are written to the
    # output directory as a side effect of process_documents().
    parser = DocumentParser(pdf_directory, output_directory)
    documents = parser.process_documents()

    # Print some statistics
    print("\nDocument Processing Summary:")
    for doc_name, chunks in documents.items():
        print(f"\nDocument: {doc_name}")
        print(f"Number of chunks: {len(chunks)}")
        if not chunks:
            # A document with no extractable text yields an empty chunk list;
            # the average and the sample below would otherwise raise
            # ZeroDivisionError / IndexError.
            print("No text extracted; skipping chunk statistics.")
            continue
        average_length = sum(len(chunk) for chunk in chunks) / len(chunks)
        print(f"Average chunk length: {average_length:.2f} characters")
        print(f"Sample chunk: {chunks[0][:200]}...")


Document Processing Summary:

Document: goog-10-k-2023 (1).pdf
Number of chunks: 337
Average chunk length: 996.39 characters
Sample chunk: UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 ___________________________________________ FORM 10-K ___________________________________________ Mark One ANNUAL REPORT PURSUAN...

Document: tsla-20231231-gen.pdf
Number of chunks: 434
Average chunk length: 994.44 characters
Sample chunk: UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-K Mark One x ANNUAL REPORT PURSUANT TO SECTION 13 OR 15d OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ende...

Document: uber-10-k-2023.pdf
Number of chunks: 683
Average chunk length: 995.01 characters
Sample chunk: UNITED STATESSECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549____________________________________________ FORM 10-K ____________________________________________ Mark One ANNUAL REPORT PURSUAN...
