In [1]:
from PIL import Image
import base64
import io

def image_to_base64(image_path):
    """Re-encode an image as PNG and return the PNG bytes as a base64 string.

    Args:
        image_path: Location of the image; passed straight to ``Image.open``.

    Returns:
        str: The PNG-encoded image, base64-encoded and decoded as UTF-8 text.

    Raises:
        Any exception raised by ``Image.open`` or ``Image.save`` (for example
        when the file is missing or is not a readable image).
    """
    # Pillow can decode these colour modes (CMYK is common for print-oriented
    # JPEGs) but its PNG writer rejects them with "cannot write mode ... as
    # PNG", so they are converted to RGB first. Every other mode is saved
    # exactly as opened.
    png_unsupported_modes = ("CMYK", "YCbCr")

    with Image.open(image_path) as img:
        if img.mode in png_unsupported_modes:
            png_source = img.convert("RGB")
        else:
            png_source = img

        # Encode into memory rather than a temporary file.
        buffered = io.BytesIO()
        png_source.save(buffered, format="PNG")

    # b64encode returns bytes; decode so the caller gets a plain str.
    return base64.b64encode(buffered.getvalue()).decode('utf-8')



In [None]:
# Example 1: Single Image Input
# Replace with your image path.
image_path = "examples/image.png"

# Encode the image once; the chat request in the following cell sends it.
base64_image = image_to_base64(image_path)

In [3]:
import ollama

# Ask the vision model for the title and author shown on the cover image
# that was encoded in the previous cell.
user_message = {
    "role": "user",
    "content": "The image is a book cover. Output should be in this format - <Name of the Book>: <Name of the Author>. Do not output anything else",
    "images": [base64_image],
}
response = ollama.chat(
    model="x/llama3.2-vision:latest",
    messages=[user_message],
)

# The reply text is stored under message -> content; trim surrounding whitespace.
cleaned_text = response["message"]["content"].strip()
print(cleaned_text)


The Secret History: Donna Tartt.


In [3]:
# Example 2: Generating the Author's Full Name

# Replace with your image path.
image_path = "examples/image_2.png"
base64_image = image_to_base64(image_path)

# Same request as Example 1, except the prompt asks for the author's full name.
user_message = {
    "role": "user",
    "content": "The image is a book cover. Output should be in this format - <Name of the Book>: <Full Name of the Author>. Do not output anything else",
    "images": [base64_image],
}
response = ollama.chat(
    model="x/llama3.2-vision:latest",
    messages=[user_message],
)

# The reply text is stored under message -> content; trim surrounding whitespace.
cleaned_text = response["message"]["content"].strip()
print(cleaned_text)


Norwegian Wood: Haruki Murakami.


In [5]:
# Example 3: Multiple Books

# Replace with your image path.
image_path = "examples/murakami_multiple.png"
base64_image = image_to_base64(image_path)

# One image holding several covers; the prompt asks for one line per book.
user_message = {
    "role": "user",
    "content": "The image contains multiple book covers. Output all the book covers in this format - <Name of the Book>: <Full Name of the Author>. Do not output anything else",
    "images": [base64_image],
}
response = ollama.chat(
    model="x/llama3.2-vision:latest",
    messages=[user_message],
)

# The reply text is stored under message -> content; trim surrounding whitespace.
cleaned_text = response["message"]["content"].strip()
print(cleaned_text)


Norwegian Wood: Haruki Murakami
Kafka on the Shore: Haruki Murakami
Men Without Women: Haruki Murakami
Sputnik Sweetheart: Haruki Murakami
South of the Border, West of the Sun: Haruki Murakami
A Wild Sheep Chase: Haruki Murakami
Birthday Stories: Haruki Murakami
Underground: Haruki Murakami
After Dark: Haruki Murakami
After the Quake: Haruki Murakami
The Elephant Vanishes: Haruki Murakami


In [8]:
# Example 4: Stack of Books

# Replace with your image path.
image_path = "examples/murakami_stack.jpg"
base64_image = image_to_base64(image_path)

# The image shows a stack of books rather than front covers; the prompt asks
# for every title in the stack.
user_message = {
    "role": "user",
    "content": "The image contains a stack of books. Output all the book names in this format - <Name of the Book>: <Full Name of the Author>. Do not output anything else",
    "images": [base64_image],
}
response = ollama.chat(
    model="x/llama3.2-vision:latest",
    messages=[user_message],
)

# The reply text is stored under message -> content; trim surrounding whitespace.
cleaned_text = response["message"]["content"].strip()
print(cleaned_text)


* Norwegian Wood: Haruki Murakami
* Sputnik Sweetheart: Haruki Murakami
* After Dark: Haruki Murakami
* Dance, Dance, Dance: Haruki Murakami
* Kafka on the Shore: Haruki Murakami
* Hear the Wind Sing: Haruki Murakami
* A Wild Sheep Chase: Haruki Murakami
* Blind Willow, Sleeping Woman: Haruki Murakami
* After the Quake: Haruki Murakami
* The Wind-Up Bird Chronicle: Haruki Murakami
