In [1]:
from collections import defaultdict
import os

import base64
from openai import OpenAI
from PIL import Image, UnidentifiedImageError
import pytesseract
import tqdm

In [2]:
# Inputs are read from, and results written back to, the same data folder.
data_dir = "./data"
input_dir = output_dir = data_dir

In [None]:
# Load the files and see how many we have of each type.
# `filenames` maps a lowercase type key -- the image format reported by PIL,
# or the file extension when PIL cannot identify the file -- to the list of
# matching file names (names only, relative to `input_dir`).
filenames = defaultdict(list)

# Sort so the processing order is the same on every run.
for fn in sorted(os.listdir(input_dir)):

    fp = os.path.join(input_dir, fn)

    # Sub-directories are not inputs, and Image.open would raise an OSError
    # (not UnidentifiedImageError) on them.
    if not os.path.isfile(fp):
        continue

    mpo_frame = None
    try:
        # The context manager closes the file handle once we are done with it
        with Image.open(fp) as img:
            ext = img.format.lower()

            # Pull the pixel data into memory so the source file can be closed
            # before it is overwritten below.
            if ext == "mpo":
                mpo_frame = img.copy()

    except UnidentifiedImageError:
        # Not an image: fall back to the file extension, lower-cased so the
        # keys are consistent with the PIL format keys (e.g. "HTML" -> "html")
        ext = os.path.splitext(fn)[1][1:].lower()

    # Convert any mpos to jpegs (in place, keeping the file name)
    if mpo_frame is not None:
        mpo_frame.save(fp, "jpeg")
        # The file on disk is a JPEG now, so record it as one; otherwise the
        # image-processing cell would skip it until the notebook is re-run.
        ext = "jpeg"

    filenames[ext].append(fn)

print({k: len(v) for k, v in filenames.items()})

In [4]:
# Create the openai client.
# The key is read from the OPENAI_API_KEY environment variable so that no
# secret is stored in (or committed with) the notebook.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )

client = OpenAI(api_key=api_key)

In [None]:
# Process the images: OCR each JPEG, ask the model to clean up the OCR text,
# and save the cleaned text next to the data.
for fn in tqdm.tqdm(filenames["jpeg"]):

    # Get the string from the image; the context manager closes the file
    # handle so handles do not pile up over a long batch.
    with Image.open(os.path.join(input_dir, fn)) as img:
        raw_str = pytesseract.image_to_string(img)

    # Unscramble the string
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        store=True,
        messages=[
            {
                "role": "user",
                "content": f"Please unscramble the following text, which contains a recipe. Output the result as plain text. Please don't say anything other than giving the output: {raw_str}",
            }
        ],
    )
    # The message content may be None; write an empty file rather than crash.
    parsed_str = completion.choices[0].message.content or ""

    # Save the result
    # NOTE(review): this writes plain text under a .docx name, which is not a
    # real Word document -- confirm whether .txt (or a proper docx writer) is
    # intended. An image and an html file with the same stem also share this
    # output path.
    fp = f"{output_dir}/{os.path.splitext(fn)[0]}.docx"
    # Explicit UTF-8: recipe text may contain characters that the platform
    # default encoding cannot represent.
    with open(fp, "w", encoding="utf-8") as f:
        f.write(parsed_str)

In [None]:
# Process the html files: send each page to the model to extract the recipe,
# and save the extracted text next to the data.
for fn in tqdm.tqdm(filenames["html"]):

    # Read the raw html. Decode as UTF-8 explicitly (instead of the platform
    # default) and replace undecodable bytes so one odd page does not abort
    # the whole batch.
    fp = os.path.join(input_dir, fn)
    with open(fp, "r", encoding="utf-8", errors="replace") as f:
        raw_str = f.read()

    # Extract the recipe from the html
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        store=True,
        messages=[
            {
                "role": "user",
                "content": f"Parse the following html for a recipe, and output the result as plain text. Please don't say anything other than giving the output: {raw_str}",
            }
        ],
    )
    # The message content may be None; write an empty file rather than crash.
    parsed_str = completion.choices[0].message.content or ""

    # Save the result
    # NOTE(review): this writes plain text under a .docx name, which is not a
    # real Word document -- confirm whether .txt (or a proper docx writer) is
    # intended. An image and an html file with the same stem also share this
    # output path.
    fp = f"{output_dir}/{os.path.splitext(fn)[0]}.docx"
    # Explicit UTF-8: recipe text may contain characters that the platform
    # default encoding cannot represent.
    with open(fp, "w", encoding="utf-8") as f:
        f.write(parsed_str)