In [None]:
# this script was used to filter which documents were OCR'd and which were bare scans

import os
import fitz
import shutil
import pandas as pd

# Directories to scan for PDFs. For each entry, the loop further down uses the
# directory's basename as the speaker name and creates an output folder named
# "<directory>OCR" to hold the fully OCR'd copies and the status spreadsheet.
pdf_directories = [
# ...directories...
]

def is_page_text(page):
    """Return True if *page* yields any non-whitespace extractable text.

    Args:
        page: an object exposing ``get_text()`` that returns the page's text
            as a string (a PyMuPDF page in this script).

    Returns:
        bool: False when the extracted text is empty or whitespace only.
    """
    # Strip first: a page whose text layer holds nothing but spaces or
    # newlines carries no real OCR content and must not count as text.
    return bool(page.get_text().strip())

def is_fully_ocrd(pdf_path):
    """Return True if every page of the PDF at *pdf_path* has extractable text.

    Args:
        pdf_path: path of the PDF file to open with ``fitz.open``.

    Returns:
        bool: True when ``is_page_text`` holds for every page; False as soon
        as one page has no text, or when the file cannot be processed.
    """
    try:
        # The context manager closes the document on every exit path,
        # including the early stop at the first page without text.
        with fitz.open(pdf_path) as pdf_document:
            return all(is_page_text(page) for page in pdf_document)
    except Exception as e:
        # Deliberately best-effort: an unreadable file is reported and
        # treated as not OCR'd so one bad PDF does not abort the whole run.
        print(f"error processing {pdf_path}: {str(e)}")
        return False

# Running list of [file name, status] pairs across every directory processed.
all_ocr_results = []

for pdf_directory in pdf_directories:
    # Normalise first: a trailing separator would otherwise give an empty
    # speaker name and put the output folder inside the tree being walked.
    source_dir = os.path.normpath(pdf_directory)
    speaker_name = os.path.basename(source_dir)

    # appends "OCR" to the end of the original folder name and makes a new one
    new_folder = source_dir + 'OCR'
    os.makedirs(new_folder, exist_ok=True)

    # Results for this directory only, so each speaker's spreadsheet lists
    # just that speaker's files and matches the percentage written next to it.
    directory_results = []
    fully_ocrd_count = 0

    # OCR check on each PDF in the directory
    for root, _, files in os.walk(source_dir):
        for file in files:
            # Case-insensitive match so ".PDF" files are not skipped.
            if not file.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(root, file)
            fully_ocrd = is_fully_ocrd(pdf_path)
            status = "Fully OCR'd" if fully_ocrd else "Not Fully OCR'd"
            print(f'{file}: {status}')
            directory_results.append([file, status])

            if fully_ocrd:
                fully_ocrd_count += 1
                # duplicate fully OCR'd PDF into the new folder
                # NOTE: the output folder is flat, so same-named PDFs from
                # different subfolders overwrite one another here.
                dest_path = os.path.join(new_folder, file)
                shutil.copy(pdf_path, dest_path)

    all_ocr_results.extend(directory_results)
    total_count = len(directory_results)

    # calculate the percentage of fully OCR'd files; a directory without any
    # PDFs reports 0 instead of raising ZeroDivisionError
    fully_ocrd_percentage = (
        (fully_ocrd_count / total_count) * 100 if total_count else 0.0
    )

    # create dataframe from the OCR results and saves it as an excel file
    df = pd.DataFrame(directory_results, columns=["File", "OCR Status"])

    excel_file_name = f"{speaker_name}_OCR_Status.xlsx"
    excel_file_path = os.path.join(new_folder, excel_file_name)

    # Single pass: write the table to a sheet named "Main" and add the summary
    # cells before saving, rather than saving and reopening in append mode.
    with pd.ExcelWriter(excel_file_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name="Main", index=False)
        main_sheet = writer.sheets["Main"]
        main_sheet.cell(row=1, column=3, value="Fully OCR'd Percentage")
        main_sheet.cell(row=2, column=3, value=fully_ocrd_percentage)
