In [1]:
import pytesseract
import pdf2image
from PIL import Image
from pdf2image import convert_from_path
import os
from pathlib import Path
import shutil


def convert_pdf_to_text(pdf_path, output_path):
    try:
        temp_image_dir = 'temp/' + pdf_path.stem
        os.makedirs(temp_image_dir, exist_ok=True)
        images = convert_from_path(pdf_path, output_folder=temp_image_dir, fmt="jpeg", dpi=200)
        with open(output_path, 'w') as f:
            for image in images:
                text = pytesseract.image_to_string(image)
                f.write(text)

        return True

    except Exception as e:
        print(f"Error converting {pdf_path}: {str(e)}")
        return False

def main():
    input_dir = '../web_scraping/arxiv_pdfs'
    output_dir = 'pdf_ocr'
    temp_dir = 'temp'

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Create temp dir to store temp jpeg images
    os.makedirs(temp_dir, exist_ok=True)
    
    # Get all PDF files
    pdf_files = list(Path(input_dir).glob('*.pdf'))
    
    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return
    
    print(f"Found {len(pdf_files)} PDF files to convert")
    
    successful = 0
    failed = 0

    # Convert each PDF
    for i, pdf_path in enumerate(pdf_files, 1):
        # Create output filename (replace .pdf with .txt)
        output_filename = pdf_path.stem + '.txt'
        output_path = os.path.join(output_dir, output_filename)
        
        print(f"[{i}/{len(pdf_files)}] Converting {pdf_path.name}...", end='\n')

        if convert_pdf_to_text(pdf_path, output_path):
            print("✓ Success")
            successful += 1
        else:
            print("✗ Failed")
            failed += 1

    shutil.rmtree(temp_dir)

if __name__ == "__main__":
    main()

Found 200 PDF files to convert
[1/200] Converting 2510.25613.pdf...
✓ Success
[2/200] Converting 2510.25439.pdf...
✓ Success
[3/200] Converting 2510.25377.pdf...
✓ Success
[4/200] Converting 2510.25559.pdf...
✓ Success
[5/200] Converting 2510.25565.pdf...
✓ Success
[6/200] Converting 2510.25767.pdf...
✓ Success
[7/200] Converting 2510.25570.pdf...
✓ Success
[8/200] Converting 2510.25376.pdf...
✓ Success
[9/200] Converting 2510.25438.pdf...
✓ Success
[10/200] Converting 2510.25638.pdf...
✓ Success
[11/200] Converting 2510.25764.pdf...
✓ Success
[12/200] Converting 2510.25407.pdf...
✓ Success
[13/200] Converting 2510.25349.pdf...
✓ Success
[14/200] Converting 2510.25605.pdf...
✓ Success
[15/200] Converting 2510.25615.pdf...
✓ Success
[16/200] Converting 2510.25601.pdf...
✓ Success
[17/200] Converting 2510.25629.pdf...
✓ Success
[18/200] Converting 2510.25365.pdf...
✓ Success
[19/200] Converting 2510.25359.pdf...
✓ Success
[20/200] Converting 2510.25563.pdf...
✓ Success
[21/200] Convertin