#### 1. Setup - choose input and output directories, number N PDFs to extract each execution

In [9]:
import os
import shutil
import textract
import time

# CHANGE: Path to the directory containing PDFs
input_directory = "../Comments Received After Field Review" # Adjust this path to the input folder containing PDFs to convert.
embedded_output_directory = f"./textractOutputs/{input_directory[3:]}_embeddedText"  # Adjust this path to where you want to save the text files.
image_output_directory = f"./textractOutputs/{input_directory[3:]}_images" # adjust this path to where you want to save scanned image-based files.

# Create output directory if it doesn't exist
os.makedirs(embedded_output_directory, exist_ok=True)
os.makedirs(image_output_directory, exist_ok=True)

curr_index = 0 # track how many you've processed so far
n = 1000 # Set n to desired number of files per execution of below cell.
files = [file for file in os.listdir(input_directory) if file.endswith('.pdf')]

#### 2. Extract N PDFs from input_directory -- this will cycle through entire input_directory N files at a time.

In [13]:
# n = n # change this and run it before below if you want to edit starting point

In [21]:
# Function to check if it's embedded text-based PDF, or image/errors
def is_text_based(pdf_path):
    try: #hm
        text = extract_text(pdf_path)
        return bool(text.strip())
    except Exception as e:
        return False

num_embedded, num_images, errors = 0, 0, 0

start_time = time.time()

if curr_index + n < len(files):
    n_files = files[curr_index : curr_index + n] # move n files along
else:
    n_files = files[curr_index :] # finish

for filename in n_files:
    if filename.endswith(".pdf"):  # Process only PDF files (skip any .txt files)
        pdf_path = os.path.join(input_directory, filename)

        if is_text_based(pdf_path):
            # Extract text using pdfminer
            text = extract_text(pdf_path)
            output_file_path = os.path.join(embedded_output_directory, f"{os.path.splitext(filename)[0]}_embedded.txt")
            try: #line 
                # Extract text from the PDF
                text = textract.process(pdf_path)
                
                # Decode bytes to string
                text_str = text.decode('utf-8')
                
                # Define output file path
                output_filename = f"{os.path.splitext(filename)[0]}.txt"
                output_path = os.path.join(embedded_output_directory, output_filename)
                
                # Write the extracted text to a text file
                with open(output_path, "w", encoding="utf-8") as text_file:
                    text_file.write(text_str)
                num_embedded += 1
        
            except Exception as e:
                errors += 1
        
        else: 
            try:
                output_file_path = os.path.join(image_output_directory, f"{os.path.splitext(filename)[0]}_image.pdf")
                shutil.move(pdf_path, output_file_path)
                num_images += 1
            except:
                errors += 1
curr_index += n

end_time = time.time()
runtime = end_time - start_time
print(f"Processed {num_embedded} embedded PDFs and {num_images} scanned PDFs \n Total runtime: {runtime:.2f} seconds \n {errors} Errors")

Processed 0 embedded PDFs and 798 scanned PDFs 
 Total runtime: 2.54 seconds 
 1 Errors
