In [1]:
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import os
import pytesseract
from PIL import Image
from docx import Document

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    The document is opened with ``fitz.open`` and iterated page by page;
    each page's ``get_text("text")`` result is followed by a newline. A
    document that yields no pages produces the empty string.
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [page.get_text("text") + "\n" for page in pdf]
    return "".join(page_texts)

def extract_images_from_pdf(pdf_path, output_folder="images", dpi=500, poppler_path=None):
    """Render each page of a PDF to a PNG file and return the file paths.

    Note that this rasterises whole pages via ``convert_from_path``; it does
    not pull out images embedded in the PDF.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Directory that receives the PNG files. It is created,
            including missing parent directories, when it does not exist.
        dpi: Rendering resolution handed to ``convert_from_path``.
        poppler_path: Directory holding the Poppler binaries. When omitted,
            the ``POPPLER_PATH`` environment variable is used; if that is
            unset too, the install location this notebook was written
            against is used, so existing callers behave as before.

    Returns:
        list[str]: One path per rendered page, named ``page_<n>.png`` with
        ``n`` starting at 1, in the order the pages were returned.
    """
    if poppler_path is None:
        poppler_path = os.environ.get(
            "POPPLER_PATH", r"D:\poppler-24.08.0\Library\bin"
        )

    # exist_ok=True replaces a separate exists() check, which could race with
    # another process creating the directory in between.
    os.makedirs(output_folder, exist_ok=True)

    rendered_pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)

    image_paths = []
    for page_number, page_image in enumerate(rendered_pages, start=1):
        image_path = os.path.join(output_folder, f"page_{page_number}.png")
        page_image.save(image_path, "PNG")
        image_paths.append(image_path)

    return image_paths

def preprocess_image(image_path, threshold=140):
    """Load an image file and binarise it for OCR.

    The image is converted to grayscale (mode ``"L"``); every pixel value
    below ``threshold`` then becomes 0 and every other value becomes 255.

    Args:
        image_path: Path of the image file to load.
        threshold: Grayscale cut-off separating black from white.

    Returns:
        The thresholded grayscale image.
    """
    # convert() yields a new image, so the source file can be closed as soon
    # as the conversion is done instead of staying open until garbage
    # collection.
    with Image.open(image_path) as source:
        grayscale = source.convert("L")

    return grayscale.point(lambda value: 0 if value < threshold else 255)

def extract_text_from_image(image_path, tesseract_cmd=None):
    """Run Tesseract OCR on an image file and return the recognised text.

    The image is first passed through ``preprocess_image`` and then handed to
    ``pytesseract.image_to_string`` with English as the language.

    Args:
        image_path: Path of the image file to read.
        tesseract_cmd: Path of the Tesseract executable. When omitted, the
            ``TESSERACT_CMD`` environment variable is used; if that is unset
            too, the install location this notebook was written against is
            used.

    Returns:
        str: The text returned by Tesseract.
    """
    if tesseract_cmd is None:
        # Single backslashes: the path is a raw string, so doubling them
        # would put literal double separators into the path.
        tesseract_cmd = os.environ.get("TESSERACT_CMD", r"D:\tesseract\tesseract.exe")
    # This is a module-wide pytesseract setting, not a per-call option.
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    # preprocess_image opens the file itself, so no separate Image.open()
    # is needed here.
    prepared = preprocess_image(image_path)

    # OCR Engine Mode 3, Page Segmentation Mode 6.
    ocr_config = r"--oem 3 --psm 6"
    return pytesseract.image_to_string(prepared, config=ocr_config, lang="eng")

def save_text_to_word(text, output_path="extracted_text.docx"):
    """Write ``text`` as a single paragraph of a new Word document.

    Control characters below U+0020 other than tab, line feed and carriage
    return are removed before writing, because they are not legal in XML and
    the document cannot be built with them present.
    NOTE(review): OCR output is a likely source of such characters (for
    example a trailing form feed) - confirm against the installed Tesseract.

    Args:
        text: The text to store.
        output_path: Destination ``.docx`` path.
    """
    # Translation table that deletes the XML-illegal C0 control characters.
    xml_illegal = {
        code: None for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
    }

    document = Document()
    document.add_paragraph(text.translate(xml_illegal))
    document.save(output_path)
    print(f"Text saved to {output_path}")
    
def main(pdf_path="pages.pdf", output_path="extracted_text.docx"):
    """Extract embedded and OCR text from a PDF and save both to a Word file.

    The text layer of the PDF comes first, followed by one labelled section
    of OCR text per rendered page image.

    NOTE(review): the directory listing recorded later in this notebook shows
    ``page.pdf`` rather than ``pages.pdf`` - confirm the default file name.

    Args:
        pdf_path: PDF to process.
        output_path: Destination ``.docx`` file.
    """
    sections = [extract_text_from_pdf(pdf_path)]

    for image_path in extract_images_from_pdf(pdf_path):
        ocr_text = extract_text_from_image(image_path)
        sections.append(f"\nExtracted Text from {image_path}:\n{ocr_text}\n")

    save_text_to_word("".join(sections), output_path)

# Run the pipeline only when this code is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Text saved to extracted_text.docx


In [3]:
# Show the entries of the current working directory as this cell's output.
os.listdir()

['.git', 'images', 'page.pdf', 'README.md', 'script.ipynb']