In [56]:
import os
import json
import logging
import multiprocessing
import fitz  # PyMuPDF library
import re

In [57]:
# Set up logging: records at INFO level and above go to a log file
# (relative path, so it is created in the current working directory)
logging.basicConfig(
    filename='text_extraction.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [58]:
# Input and output folders.
# The defaults are the original locations; set the PDF_INPUT_DIR and/or
# PDF_OUTPUT_DIR environment variables to run the notebook elsewhere
# without editing this cell. When only the input folder is overridden,
# the output folder defaults to an "Output" folder inside it.
input_dir = os.environ.get("PDF_INPUT_DIR", "C:/Users/tinot/Downloads/PyMuPDF")
output_dir = os.environ.get("PDF_OUTPUT_DIR", f"{input_dir}/Output")

In [59]:
# Create output directory if it doesn't exist.
# exist_ok=True replaces the separate existence check, so there is no
# check-then-create race and the cell can be re-run safely.
os.makedirs(output_dir, exist_ok=True)

In [60]:
# Define function to extract text from a single PDF
def extract_text_from_pdf(pdf_path):
    """Extract and clean the text of every page of one PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages concatenated in page order and passed through
        ``clean_text``, or ``None`` if anything fails (the error is logged).
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, so handles are not leaked while processing many files
        with fitz.open(pdf_path) as doc:
            # Join once instead of growing a string page by page
            text = "".join(page.get_text() for page in doc)

        # Clean the extracted text and return it
        return clean_text(text)

    except Exception as e:
        # Best-effort: log the error and signal failure with None so the
        # caller can skip this file and carry on with the others
        logging.error(f"Error extracting text from {pdf_path}: {e}")
        return None

In [61]:
# Define function to clean the extracted text
def clean_text(text):
    """Normalise extracted text to tokens separated by single spaces.

    Every character that is neither a word character nor whitespace is
    dropped, each run of whitespace is collapsed to one space, and leading
    and trailing whitespace is removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Drop punctuation and other symbols first; the gaps they leave are
    # collapsed by the whitespace pass that follows
    without_symbols = re.sub(r'[^\w\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.strip()

In [62]:
# Define function to write cleaned text to JSON Lines file
def write_text_to_jsonl(text, jsonl_path):
    """Append ``text`` to ``jsonl_path`` as one JSON-encoded line.

    The file is opened in append mode, so lines already present are kept.
    Any failure is logged and swallowed; the function always returns None.

    Args:
        text: The value to serialise with ``json.dumps``.
        jsonl_path: Path of the JSON Lines file to append to.
    """
    try:
        with open(jsonl_path, 'a') as out_file:
            # Serialise inside the with-block, after the file has been opened
            record = json.dumps(text)
            out_file.write(record + '\n')
    except Exception as e:
        # Log the error; the caller is not interrupted
        logging.error(f"Error writing text to {jsonl_path}: {e}")
    return None

# Loop through each PDF in the input directory
for filename in os.listdir(input_dir):
    # Split off the real extension so only the suffix is examined and
    # replaced; str.replace(".pdf", ...) would also rewrite a ".pdf" that
    # appears earlier in the name
    stem, ext = os.path.splitext(filename)

    # Compare case-insensitively so files such as "REPORT.PDF" are not
    # silently skipped
    if ext.lower() != ".pdf":
        continue

    # Construct the full file paths
    pdf_path = os.path.join(input_dir, filename)
    jsonl_path = os.path.join(output_dir, stem + ".jsonl")

    # Extract the text from the PDF (None signals a logged failure)
    extracted_text = extract_text_from_pdf(pdf_path)

    # Write the cleaned text to the JSON Lines file
    if extracted_text is not None:
        write_text_to_jsonl(extracted_text, jsonl_path)