# In [3]: (Jupyter notebook cell marker left over from the export)
import os
import shutil
from PyPDF2 import PdfReader
from PIL import Image
import pytesseract

# Tell pytesseract which Tesseract executable to run. This is an absolute,
# user-specific Windows path, so it must be adjusted on any other machine.
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\gbray\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
from docx import Document

def has_selectable_text(pdf_path):
    """Return the selectable text of a PDF, or an empty string if there is none.

    Despite the name, the result is the extracted text itself rather than a
    bool; callers rely on its truthiness to decide whether text was found.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, with a blank line between consecutive pages
        and leading/trailing whitespace stripped.
    """
    with open(pdf_path, 'rb') as f:
        reader = PdfReader(f)
        # Extract while the file is still open; the reader pulls page data
        # from the underlying stream.
        page_texts = [page.extract_text() for page in reader.pages]
    # Joining once avoids repeated string concatenation in the page loop.
    return "\n\n".join(page_texts).strip()

def ocr_image(image_path, lang='fra'):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to open.
        lang: Language code passed to Tesseract. Defaults to ``'fra'``, the
            value this function previously hard-coded.

    Returns:
        The text produced by ``pytesseract.image_to_string``.
    """
    # The context manager closes the underlying file as soon as OCR is done
    # instead of leaving the handle open until garbage collection.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image, lang=lang)

def extract_docx_text(docx_path):
    """Return the text of a DOCX file, one line per entry in ``paragraphs``."""
    document = Document(docx_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

def _write_text_output(source_path, folderok_path, text_content):
    """Write *text_content* as UTF-8 to ``<folderok_path>/<source stem>.txt``."""
    stem = os.path.splitext(os.path.basename(source_path))[0]
    with open(os.path.join(folderok_path, stem + ".txt"), 'w', encoding='utf-8') as f:
        f.write(text_content)


def _ocr_scanned_pdf(pdf_path):
    """OCR the images embedded in each page of a PDF and return the combined text."""
    import io  # local import: only this fallback path needs it

    page_texts = []
    with open(pdf_path, 'rb') as f:
        reader = PdfReader(f)
        for page in reader.pages:
            # NOTE(review): relies on the ``images`` attribute of PyPDF2 page
            # objects - confirm the installed PyPDF2 release provides it.
            image_texts = []
            for embedded in page.images:
                with Image.open(io.BytesIO(embedded.data)) as image:
                    image_texts.append(pytesseract.image_to_string(image, lang='fra'))
            page_texts.append("\n".join(image_texts))
    # Blank line between pages, mirroring the layout of the selectable-text path.
    return "\n\n".join(page_texts).strip()


def process_document(file_path, folderok_path):
    """Convert one document to text in *folderok_path*, based on its extension.

    * ``.pdf``: write the selectable text if there is any; otherwise OCR the
      images embedded in the pages.
    * ``.txt``: move the file into *folderok_path* unchanged.
    * ``.jpg`` / ``.jpeg`` / ``.png``: OCR the image.
    * ``.docx``: write the paragraph text.

    Files with any other extension are left untouched. Except for ``.txt``,
    the result is written to ``<original name without extension>.txt``, so the
    source file stays where it is.

    Args:
        file_path: Path of the document to process.
        folderok_path: Destination folder; created if it does not exist.
    """
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.pdf':
        # Pillow cannot open PDF files, so a PDF without selectable text is
        # OCR'd through its embedded page images rather than via ocr_image().
        text_content = has_selectable_text(file_path) or _ocr_scanned_pdf(file_path)
    elif file_ext in ('.jpg', '.jpeg', '.png'):
        text_content = ocr_image(file_path)
    elif file_ext == '.docx':
        text_content = extract_docx_text(file_path)
    elif file_ext != '.txt':
        return  # unsupported type: nothing to do

    # An empty folder name means "current directory"; makedirs('') would raise.
    if folderok_path:
        os.makedirs(folderok_path, exist_ok=True)

    if file_ext == '.txt':
        # Already plain text: relocate it as-is.
        shutil.move(file_path, os.path.join(folderok_path, os.path.basename(file_path)))
    else:
        _write_text_output(file_path, folderok_path, text_content)

if __name__ == '__main__':
    # Machine-specific folders: documents are read from input_folder and the
    # resulting text files are placed in output_folder.
    input_folder = "C:\\Users\\gbray\\Desktop\\python\\ocr\\PasteDocsHere"
    output_folder = "C:\\Users\\gbray\\Desktop\\python\\ocr\\OutputAfterOCR"

    # Create the destination up front so the first write or move cannot fail
    # just because the folder is missing.
    os.makedirs(output_folder, exist_ok=True)

    for entry_name in os.listdir(input_folder):
        source_path = os.path.join(input_folder, entry_name)
        # listdir also returns sub-directories; only regular files are documents.
        if not os.path.isfile(source_path):
            continue
        process_document(source_path, output_folder)
