In [1]:
!pip install PyMuPDF
!pip install python-doctr
!pip install tf2onnx



In [4]:
import fitz
import json
from pathlib import Path
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import warnings

import os

# Hide all CUDA devices from this process so OCR runs on CPU.
# A `!export ...` shell escape only affects a short-lived subshell and never
# reaches the Python process, so the variable is set through os.environ.
# NOTE(review): if the deep-learning backend initializes CUDA at import time,
# this assignment has to move above the doctr imports - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

def _ocr_page_to_text(page) -> str:
    """Flatten one OCR page result into a plain-text string.

    Every word is followed by a single space and every block by a newline;
    leading and trailing whitespace is stripped from the final page string.

    Args:
      page: An OCR page object exposing ``blocks`` -> ``lines`` -> ``words``,
        where each word has a ``value`` attribute.

    Returns:
      The text of the page.
    """
    block_texts = []
    for block in page.blocks:
        words = [word.value for line in block.lines for word in line.words]
        block_texts.append("".join(f"{value} " for value in words) + "\n")
    return "".join(block_texts).strip()


def extract_text_from_pdf(pdf_path: Path, min_words_per_page: int = 10) -> dict:
    """
    Extracts text from a PDF file, page by page, using fitz first and doctr
    OCR as a fallback.

    The fitz result is used only when every page yields non-blank text and
    the average number of words per page is at least ``min_words_per_page``.
    Otherwise (a blank page was found, or the average is too low) the whole
    document is re-processed with doctr.

    Args:
      pdf_path: Path to the input PDF file.
      min_words_per_page: Minimum average word count per page required to
        accept the fitz extraction. Defaults to 10.

    Returns:
      A dictionary where keys are page numbers (starting from 1) and values are the
      extracted text from that page.
      Returns an empty dictionary if the document has no pages or if any
      exception is raised during extraction (the error is printed).
    """
    try:
        page_text = {}
        total_words = 0
        needs_ocr = False

        # The context manager closes the document on every exit path,
        # including the early return and exceptions raised by get_text().
        with fitz.open(pdf_path) as pdf_document:
            if pdf_document.page_count == 0:
                return {}  # nothing to extract from an empty document

            for page_number in range(pdf_document.page_count):
                text = pdf_document[page_number].get_text()
                if not text.strip():
                    # A blank page suggests a scanned document; stop and use OCR.
                    needs_ocr = True
                    break
                total_words += len(text.split())
                page_text[page_number + 1] = text  # Page numbers start from 1

        if (
            not needs_ocr
            and page_text
            and total_words / len(page_text) >= min_words_per_page
        ):
            return page_text

        # If a Scanned PDF
        print("Its a Scanned PDF, running OCR.")

        doc = DocumentFile.from_pdf(str(pdf_path))
        model = ocr_predictor(pretrained=True)  # The OCR model is created on each call
        result = model(doc)

        return {
            page_index: _ocr_page_to_text(page)
            for page_index, page in enumerate(result.pages, start=1)
        }

    except Exception as e:
        # Top-level boundary: callers rely on an empty dict to signal failure.
        print(f"An error occurred: {e}")
        return {}


def save_to_json(data: dict, output_path: Path):
    """
    Writes the extracted PDF text to disk as indented, UTF-8 encoded JSON.

    Non-ASCII characters are written as-is rather than escaped. Any exception
    raised while opening or writing the file is printed and not re-raised.

    Args:
      data: The dictionary containing the extracted text.
      output_path: The path where to save the json file.
    """
    dump_options = {"indent": 4, "ensure_ascii": False}
    try:
        with open(output_path, mode="w", encoding="utf-8") as sink:
            json.dump(obj=data, fp=sink, **dump_options)
    except Exception as err:
        print(f"Error saving to JSON: {err}")


if __name__ == '__main__':
    # Input PDF and destination JSON file; replace with real paths before running.
    pdf_file_path = Path("test.pdf")
    output_json_path = Path("output.json")

    extracted_text = extract_text_from_pdf(pdf_file_path)

    # An empty result is reported as a failure; otherwise the text is written out.
    if not extracted_text:
        print("Failed to extract text from the PDF or PDF is empty.")
    else:
        save_to_json(extracted_text, output_json_path)
        print(f"Successfully extracted text from '{pdf_file_path}' and saved to '{output_json_path}'.")

Its a Scanned PDF, running OCR.




Successfully extracted text from 'test.pdf' and saved to 'output.json'.
