### 1. 采用 PDF splitting 的方法，分析整个文件夹的 PDFs

In [3]:
import PyPDF2
import json
from pathlib import Path
import os
from typing import Dict, Any, List
from tqdm import tqdm

class PDFPageExtractor:
    """Extract per-page text from a PDF and persist the result as JSON."""

    def extract_pdf_pages(self, pdf_path: str) -> Dict[str, Any]:
        """Extract text from a PDF file page by page.

        Args:
            pdf_path: Path to the PDF file; it is resolved to an absolute path.

        Returns:
            A dict ``{"title": <file stem>, "pages": [...]}`` where each page
            entry holds a 1-based ``"page_number"`` and the stripped
            ``"content"`` text. A page whose extraction fails is still listed,
            with empty content and an extra ``"error"`` message.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the path is not a file or lacks a ``.pdf`` suffix.
            Exception: If opening or reading the PDF fails (including a PDF
                with zero pages); the original error is attached as
                ``__cause__``.
        """
        pdf_path = Path(pdf_path).resolve()

        # Validation errors are raised as-is, outside the wrapping try below.
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at path: {pdf_path}")

        if not pdf_path.is_file():
            raise ValueError(f"Path exists but is not a file: {pdf_path}")

        if pdf_path.suffix.lower() != '.pdf':
            raise ValueError(f"File is not a PDF (extension is {pdf_path.suffix}): {pdf_path}")

        try:
            result = {
                "title": pdf_path.stem,
                "pages": []
            }

            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                total_pages = len(reader.pages)

                if total_pages == 0:
                    raise ValueError(f"PDF file is empty: {pdf_path}")

                print(f"\nExtracting pages from: {pdf_path.name}")
                print(f"Total pages: {total_pages}")

                for page_num in tqdm(range(total_pages), desc="Processing pages"):
                    page_number = page_num + 1
                    # Best effort per page: one unreadable page must not
                    # abort the whole document, so record the error and go on.
                    try:
                        page_text = reader.pages[page_num].extract_text()
                        result["pages"].append({
                            "page_number": page_number,
                            "content": page_text.strip() if page_text else ""
                        })
                    except Exception as e:
                        print(f"\nWarning: Error extracting text from page {page_number}: {str(e)}")
                        result["pages"].append({
                            "page_number": page_number,
                            "content": "",
                            "error": str(e)
                        })

            return result

        except Exception as e:
            # Chain explicitly so the root cause stays on the traceback.
            raise Exception(f"Error reading PDF {pdf_path}: {str(e)}") from e

    def save_to_json(self, data: Dict[str, Any], output_path: str) -> None:
        """Save the extracted data to a UTF-8 JSON file.

        Missing parent directories of ``output_path`` are created first.

        Args:
            data: JSON-serializable data to write.
            output_path: Destination file path.

        Raises:
            Exception: If writing or serializing fails; the original error is
                attached as ``__cause__``.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"Successfully saved JSON output to: {output_path}")
        except Exception as e:
            raise Exception(f"Error saving JSON file: {str(e)}") from e

def get_pdf_files(directory: str) -> List[Path]:
    """Return the PDF files directly inside ``directory``, sorted by path.

    The ``.pdf`` extension is matched case-insensitively so the result is the
    same on case-sensitive file systems (``report.PDF`` is included), which
    agrees with the suffix check in ``PDFPageExtractor.extract_pdf_pages``.
    Only regular files are returned; sub-directories are not searched.

    Args:
        directory: Directory to scan.

    Returns:
        Sorted list of matching paths; empty if ``directory`` is not an
        existing directory.
    """
    directory_path = Path(directory)
    # iterdir() raises on a missing path, whereas callers expect an empty list.
    if not directory_path.is_dir():
        return []
    return sorted(
        entry
        for entry in directory_path.iterdir()
        if entry.is_file() and entry.name.lower().endswith('.pdf')
    )

def process_pdf(pdf_path: str, output_path: str) -> Dict[str, Any]:
    """Extract one PDF's pages and write them to a JSON file.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the JSON file to write.

    Returns:
        The extracted data on success. Any error raised while extracting or
        saving is printed and ``None`` is returned instead (despite the
        declared return type).
    """
    pdf_extractor = PDFPageExtractor()

    try:
        extracted = pdf_extractor.extract_pdf_pages(pdf_path)
        pdf_extractor.save_to_json(extracted, output_path)
    except Exception as exc:
        # Failures are reported to the caller through the None return value.
        print(f"Error processing PDF {pdf_path}: {str(exc)}")
        return None
    else:
        return extracted

def process_directory(directory_path: str) -> Dict[str, Any]:
    """Convert every PDF in a directory to JSON and collect statistics.

    JSON files are written to a ``pdf_json_output`` sub-directory of
    ``directory_path`` as ``<pdf stem>_pages.json``, and the statistics are
    saved next to them as ``processing_stats.json``. When no PDFs are found,
    nothing is written.

    Args:
        directory_path: Directory containing the PDF files.

    Returns:
        A dict with ``"total_pdfs"``, ``"processed"``, ``"failed"`` and a
        ``"files"`` list holding one status entry per PDF.
    """
    pdf_files = get_pdf_files(directory_path)
    total_pdfs = len(pdf_files)

    stats = {
        "total_pdfs": total_pdfs,
        "processed": 0,
        "failed": 0,
        "files": []
    }

    # Nothing to do: return the zeroed stats without creating any output.
    if not pdf_files:
        print(f"No PDF files found in directory: {directory_path}")
        return stats

    print(f"\nFound {total_pdfs} PDF files to process")

    output_dir = Path(directory_path) / "pdf_json_output"
    output_dir.mkdir(exist_ok=True)

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        json_path = output_dir / f"{pdf_file.stem}_pages.json"

        try:
            outcome = process_pdf(str(pdf_file), str(json_path))
            if not outcome:
                # process_pdf returns None when it has already reported an error.
                stats["failed"] += 1
                record = {
                    "pdf_name": pdf_file.name,
                    "status": "failed"
                }
            else:
                stats["processed"] += 1
                record = {
                    "pdf_name": pdf_file.name,
                    "json_name": json_path.name,
                    "pages": len(outcome["pages"]),
                    "status": "success"
                }
            stats["files"].append(record)
        except Exception as exc:
            print(f"\nError processing {pdf_file.name}: {str(exc)}")
            stats["failed"] += 1
            stats["files"].append({
                "pdf_name": pdf_file.name,
                "status": "failed",
                "error": str(exc)
            })

    # Persist the run summary alongside the per-PDF JSON files.
    stats_path = output_dir / "processing_stats.json"
    with stats_path.open('w', encoding='utf-8') as stats_file:
        json.dump(stats, stats_file, ensure_ascii=False, indent=2)

    return stats


In [4]:

def main(directory_path: str = r"D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share") -> None:
    """Batch-process a directory of PDFs and print a summary.

    Args:
        directory_path: Directory containing the PDF files. Defaults to the
            location this script was originally written for.

    Any exception is caught and printed rather than propagated.
    """
    try:
        print("Starting batch PDF processing...")
        stats = process_directory(directory_path)

        print("\nProcessing Summary:")
        print(f"Total PDFs found: {stats['total_pdfs']}")
        print(f"Successfully processed: {stats['processed']}")
        print(f"Failed: {stats['failed']}")
        # process_directory writes the stats file only when it found PDFs,
        # so do not claim it was saved otherwise.
        if stats['total_pdfs'] > 0:
            print(f"\nDetailed processing stats saved to: {Path(directory_path) / 'pdf_json_output' / 'processing_stats.json'}")

    except Exception as e:
        print(f"Error in main: {str(e)}")


if __name__ == "__main__":
    main()

Starting batch PDF processing...

Found 10 PDF files to process


Processing PDFs:   0%|          | 0/10 [00:00<?, ?it/s]


Extracting pages from: Breastfeeding telephone triage and ad... (Z-Library).pdf
Total pages: 140


Processing pages: 100%|██████████| 140/140 [00:04<00:00, 29.14it/s]
Processing PDFs:  10%|█         | 1/10 [00:05<00:45,  5.06s/it]

Successfully saved JSON output to: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\Breastfeeding telephone triage and ad... (Z-Library)_pages.json

Extracting pages from: Bright Futures Guidelines for Health... (Z-Library).pdf
Total pages: 1459


Processing pages: 100%|██████████| 1459/1459 [01:28<00:00, 16.58it/s]
Processing PDFs:  20%|██        | 2/10 [01:33<07:13, 54.19s/it]

Successfully saved JSON output to: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\Bright Futures Guidelines for Health... (Z-Library)_pages.json

Extracting pages from: Caring for Your Baby and Young Child  Birth to Age 5 (Tanya Altmann American Academy of Pediatrics) (Z-Library).pdf
Total pages: 962


Processing pages: 100%|██████████| 962/962 [01:55<00:00,  8.36it/s]
Processing PDFs:  30%|███       | 3/10 [03:30<09:39, 82.85s/it]

Successfully saved JSON output to: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\Caring for Your Baby and Young Child  Birth to Age 5 (Tanya Altmann American Academy of Pediatrics) (Z-Library)_pages.json

Extracting pages from: Managing Infectious Diseases in Child Care and Schools A Quick Reference Guide (Susan S. Aronson, Timothy R. Shope) (Z-Library).pdf
Total pages: 268


Processing pages: 100%|██████████| 268/268 [00:30<00:00,  8.76it/s]
Processing PDFs:  40%|████      | 4/10 [04:01<06:14, 62.34s/it]

Successfully saved JSON output to: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\Managing Infectious Diseases in Child Care and Schools A Quick Reference Guide (Susan S. Aronson, Timothy R. Shope) (Z-Library)_pages.json

Extracting pages from: Nelson Textbook of Pediatrics, 2-Volume (Robert M. Kliegman MD (Editor) etc.) (Z-Library).pdf
Total pages: 2304


Processing pages: 100%|██████████| 2304/2304 [04:09<00:00,  9.25it/s]
Processing PDFs:  50%|█████     | 5/10 [08:12<10:52, 130.42s/it]

Successfully saved JSON output to: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\Nelson Textbook of Pediatrics, 2-Volume (Robert M. Kliegman MD (Editor) etc.) (Z-Library)_pages.json

Extracting pages from: Pediatric Dermatology A Quick Referen... (Z-Library).pdf
Total pages: 850


Processing pages: 100%|██████████| 850/850 [08:07<00:00,  1.75it/s]
Processing PDFs:  60%|██████    | 6/10 [16:20<16:47, 251.80s/it]

Successfully saved JSON output to: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\Pediatric Dermatology A Quick Referen... (Z-Library)_pages.json

Extracting pages from: Pediatric Orthopaedics and Sports Inj... (Z-Library).pdf
Total pages: 800


Processing pages: 100%|██████████| 800/800 [00:19<00:00, 41.72it/s]
Processing PDFs:  70%|███████   | 7/10 [16:40<08:48, 176.19s/it]

Successfully saved JSON output to: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\Pediatric Orthopaedics and Sports Inj... (Z-Library)_pages.json

Extracting pages from: Quick Reference Guide to Pediatric Ca... (Z-Library).pdf
Total pages: 1244


Processing pages: 100%|██████████| 1244/1244 [01:44<00:00, 11.95it/s]
Processing PDFs:  80%|████████  | 8/10 [18:26<05:07, 153.79s/it]

Successfully saved JSON output to: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\Quick Reference Guide to Pediatric Ca... (Z-Library)_pages.json

Extracting pages from: The Harriet Lane Handbook 22nd Editio... (Z-Library).pdf
Total pages: 1304


Processing pages: 100%|██████████| 1304/1304 [00:45<00:00, 28.75it/s]
Processing PDFs:  90%|█████████ | 9/10 [19:13<02:00, 120.32s/it]

Successfully saved JSON output to: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\The Harriet Lane Handbook 22nd Editio... (Z-Library)_pages.json

Extracting pages from: You-ology A Puberty Guide for EVERY B... (Z-Library).pdf
Total pages: 226


Processing pages: 100%|██████████| 226/226 [00:10<00:00, 21.22it/s]
Processing PDFs: 100%|██████████| 10/10 [19:23<00:00, 116.39s/it]

Successfully saved JSON output to: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\You-ology A Puberty Guide for EVERY B... (Z-Library)_pages.json

Processing Summary:
Total PDFs found: 10
Successfully processed: 10
Failed: 0

Detailed processing stats saved to: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\processing_stats.json



