Jupyter Notebook <b>Notary</b> contains three scripts that perform the following transformations:

- JPG to PNG with `Pillow`,
- PNG text to TXT with `opencv-python` (`cv2`) and `pytesseract`,
- and PDF text to TXT with `pdf2image` and `pytesseract`.

This tool was initially designed to retrieve valuable information from images, such as screenshots of scanned books.

In [None]:
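# !pip install Pillow pytesseract pdf2image opencv-python
# Note: pytesseract also needs the Tesseract OCR engine, and pdf2image needs Poppler, installed on the system.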

In [None]:
# JPG TO PNG
from PIL import Image
import os

# Function to convert a single image from JPEG to PNG
def convert_to_png(input_path, output_path):
    """Convert a single image file to PNG.

    The image at ``input_path`` is opened with Pillow and re-saved at
    ``output_path`` in PNG format. Errors are reported on stdout instead of
    being raised, so a batch caller can keep going after a bad file.

    Args:
        input_path: Path of the image to read.
        output_path: Path of the PNG file to write.

    Returns:
        True when the PNG was written, False when the conversion failed.
    """
    try:
        # The context manager releases the source file handle as soon as the
        # image has been saved, rather than leaving it open until garbage
        # collection (which matters when converting many files in a row).
        with Image.open(input_path) as img:
            img.save(output_path, 'PNG')
    except Exception as e:
        # Deliberately broad: this is a best-effort batch step and any failure
        # (missing file, unsupported mode, unwritable target) is only reported.
        print(f"Failed to convert {input_path}: {str(e)}")
        return False
    print(f"Converted: {input_path} -> {output_path}")
    return True

# Function to convert all JPEG images in a directory to PNG
def convert_all_jpeg_to_png(input_dir, output_dir):
    """Convert every JPEG found directly inside ``input_dir`` to PNG.

    Each ``*.jpg`` / ``*.jpeg`` file (matched case-insensitively) is converted
    with ``convert_to_png`` and written to ``output_dir`` under the same base
    name with a ``.png`` extension. Sub-directories are not traversed.

    Args:
        input_dir: Directory to scan for JPEG files.
        output_dir: Directory that receives the PNG files; created (including
            missing parents) when it does not exist yet.
    """
    # exist_ok=True avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the conversion log appears in a predictable order.
    for filename in sorted(os.listdir(input_dir)):
        # Lower-case the name first: cameras and scanners often emit ".JPG".
        if not filename.lower().endswith(('.jpg', '.jpeg')):
            continue
        input_path = os.path.join(input_dir, filename)
        output_filename = os.path.splitext(filename)[0] + '.png'
        output_path = os.path.join(output_dir, output_filename)
        convert_to_png(input_path, output_path)

if __name__ == "__main__":
    # Placeholder locations: point both at real directories before running
    # this cell (the first is scanned for JPEGs, the second receives the PNGs).
    input_directory, output_directory = "/absolute/path/to/file", "/absolute/path/"
    convert_all_jpeg_to_png(input_dir=input_directory, output_dir=output_directory)

In [None]:
# PNG TEXT FOLDER TO TXT
import cv2
import pytesseract
import os

def main(folder_path="/absolute/path/to/folder", output_path="output.txt"):
    """OCR every PNG in a folder and merge all text files into one.

    For each ``*.png`` file directly inside ``folder_path`` the recognized
    text is written next to it as ``<name>.txt``. Afterwards every ``.txt``
    file in the folder is concatenated, in sorted path order, into
    ``output_path``.

    Args:
        folder_path: Directory containing the PNG images.
        output_path: File that receives the merged text. A relative path is
            resolved against the current working directory.
    """
    # Restrict OCR to PNG files. For any other name, swapping ".png" for
    # ".txt" would leave the name unchanged and the OCR text would overwrite
    # the source file itself.
    png_names = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(".png")
    )
    for png_name in png_names:
        image = cv2.imread(os.path.join(folder_path, png_name))
        if image is None:
            # cv2.imread reports an unreadable file by returning None
            # instead of raising, so check before handing it to the OCR step.
            print(f"Skipped unreadable image: {png_name}")
            continue
        text = pytesseract.image_to_string(image)
        # splitext only touches the real extension, unlike str.replace,
        # which would also rewrite ".png" occurring in the middle of a name.
        txt_name = os.path.splitext(png_name)[0] + ".txt"
        # Explicit UTF-8: OCR output routinely contains characters that the
        # platform default encoding cannot represent.
        with open(os.path.join(folder_path, txt_name), "w", encoding="utf-8") as f:
            f.write(text)

    # Collect every .txt file in the folder, leaving out the merged output
    # itself in case it lives in the same folder (e.g. from an earlier run).
    output_abs = os.path.abspath(output_path)
    txt_files = [
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith(".txt")
    ]
    sorted_list = sorted(
        path for path in txt_files if os.path.abspath(path) != output_abs
    )
    print(sorted_list)

    # The with-block guarantees the merged file is closed even if reading
    # one of the parts fails.
    with open(output_path, "w", encoding="utf-8") as output_file:
        for txt_file in sorted_list:
            # errors="replace" keeps the merge going when a pre-existing
            # text file was not written as UTF-8.
            with open(txt_file, "r", encoding="utf-8", errors="replace") as f:
                output_file.write(f.read())

# Run the OCR-and-merge step only when this cell/script is executed directly,
# not when the code is imported from elsewhere.
if __name__ == "__main__":
    main()

In [2]:
# PDF TO TXT
import pytesseract
from pdf2image import convert_from_path

def pdf_to_txt(pdf_path, output_path, dpi=300):
    """Convert a PDF file to a TXT file via OCR.

    The PDF is rendered to images with ``pdf2image``, each image is passed
    through ``pytesseract``, and the recognized text of all images is written
    to ``output_path`` in order.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the text file to write.
        dpi: Rendering resolution handed to ``convert_from_path``.
    """
    # Convert the PDF file to images.
    images = convert_from_path(pdf_path, dpi=dpi, fmt="PNG")
    # OCR everything before opening the output, so a failure during
    # recognition does not leave a truncated text file behind.
    texts = [pytesseract.image_to_string(img) for img in images]
    # Explicit UTF-8: OCR output routinely contains characters that the
    # platform default encoding cannot represent.
    with open(output_path, "w", encoding="utf-8") as file:
        file.writelines(texts)

if __name__ == "__main__":
    # Placeholder paths: replace with the real PDF to read and the text file
    # to write before running this cell.
    pdf_path, output_path = "/absolute/path/to/file.pdf", "/path/to/file.txt"
    pdf_to_txt(pdf_path=pdf_path, output_path=output_path)