In [None]:
from dotenv import load_dotenv

# Load environment variables from a local .env file (if one is present).
# The API keys read with os.getenv() further down are expected to come from there.
load_dotenv()

## Working with PDFs as images

### Approach 2: Use the entire page!

How do we mentally split PDFs? We usually think of them as a set of pages. We can mirror that when indexing a PDF: treat each page as an image and embed the entire page!

<img src="data/imgs/manual_bosch_WGG254Z0GR_36_of_56.jpg" width="200px" />
<img src="data/imgs/manual_bosch_WGG254Z0GR_37_of_56.jpg" width="200px" />
<img src="data/imgs/manual_bosch_WGG254Z0GR_38_of_56.jpg" width="200px" />

In [None]:
import weaviate
import os

# Map each request header that carries a model-provider API key onto the
# environment variable holding that key.
api_key_env_vars = {
    "X-Cohere-Api-Key": "COHERE_API_KEY",
    "X-Anthropic-Api-Key": "ANTHROPIC_API_KEY",
}

# Fail fast with a readable message: os.getenv() returns None for an unset
# variable, which would otherwise be passed on as an invalid header value and
# only surface later as an obscure connection or authentication error.
missing_env_vars = [name for name in api_key_env_vars.values() if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Add them to your .env file and re-run the notebook."
    )

# Launch an embedded Weaviate instance pinned to version 1.32.0 and connect to it.
client = weaviate.connect_to_embedded(
    version="1.32.0",
    headers={header: os.environ[name] for header, name in api_key_env_vars.items()},
    environment_variables={
        "LOG_LEVEL": "error",                   # Reduce amount of logs
        "ENABLE_API_BASED_MODULES": "true",     # Enable API-based modules like multi2vec-cohere
    }
)

In [2]:
# Drop any existing "Pages" collection so that the create() call in the next
# cell starts from a clean slate when the notebook is re-run.
client.collections.delete("Pages")

In [None]:
from weaviate.classes.config import Property, DataType, Configure, Tokenization

# Define the "Pages" collection: one object per PDF page, stored as an image.
# The create() call is the cell's last expression, so the new collection is echoed.
client.collections.create(
    name="Pages",
    properties=[
        Property(
            name="document_title",
            data_type=DataType.TEXT,
        ),
        # BLOB property: the import cell stores the base64-encoded page image here
        Property(
            name="page_image",
            data_type=DataType.BLOB,
        ),
        # NOTE(review): FIELD tokenization presumably indexes the whole filename as a
        # single token (exact matching) — confirm against the Weaviate tokenization docs
        Property(
            name="filename",
            data_type=DataType.TEXT,
            tokenization=Tokenization.FIELD
        ),
    ],
    vector_config=[
        # Add `Configure.Vectors.multi2vec_cohere` vector to the collection with:
        # name: "default", source properties: ["page_image"], and model: "embed-v4.0"
        # BEGIN_SOLUTION
        Configure.Vectors.multi2vec_cohere(
            name="default",
            image_fields=["page_image"],
            model="embed-v4.0"
        )
        # END_SOLUTION
    ]
)

<weaviate.collections.collection.sync.Collection at 0x10419b7c0>

In [4]:
# Keep a handle to the "Pages" collection; the import and query cells below use it.
pages = client.collections.get("Pages")

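Load pre-computed vectors & metadata

The page embeddings were computed ahead of time. We load them together with the metadata file that lists the image path belonging to each embedding.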

In [5]:
import numpy as np
import json

# The metadata JSON contains an "image_paths" list that is indexed in parallel
# with the rows of the embeddings array. Read it as UTF-8 explicitly so the
# result does not depend on the platform's default text encoding.
with open("data/embeddings/embeddings_metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Pre-computed page-image vectors, one row per page image.
embeddings = np.load("data/embeddings/image_embeddings.npy")

# The import loop pairs embeddings[i] with metadata["image_paths"][i]; catch a
# mismatch here rather than importing pages with the wrong (or missing) vector.
if len(metadata["image_paths"]) != len(embeddings):
    raise ValueError(
        f"Metadata lists {len(metadata['image_paths'])} image paths "
        f"but {len(embeddings)} embeddings were loaded"
    )

Import data

In [None]:
from tqdm import tqdm
from pathlib import Path
import base64

# Import every page: embeddings[i] belongs to the image at metadata["image_paths"][i].
with pages.batch.fixed_size(batch_size=100) as batch:
    # enumerate() has no length, so pass `total` to let tqdm draw a real progress bar
    for i, embedding in tqdm(enumerate(embeddings), total=len(embeddings)):
        filepath = Path(metadata["image_paths"][i])
        image = filepath.read_bytes()
        # The image is stored in the BLOB property as a base64-encoded string
        base64_image = base64.b64encode(image).decode('utf-8')
        obj = {
            "document_title": "Bosch WGG254Z0GR Manual",
            "page_image": base64_image,
            "filename": filepath.name
        }

        # Add object to batch for import with (batch.add_object())
        # This time, manually provide the vector with `{"default": embedding}`
        # BEGIN_SOLUTION
        batch.add_object(
            properties=obj,
            vector={"default": embedding}
        )
        # END_SOLUTION

# Per-object errors do not raise during a batch import; they are collected on
# the batch instead, so report them explicitly rather than failing silently.
failed_objects = pages.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

56it [00:00, 775.87it/s]


In [None]:
from weaviate.classes.generate import GenerativeConfig, GenerativeParameters

# Grouped task: a single prompt that is answered using all retrieved objects together.
# The page images themselves are passed to the generative model via `image_properties`.
prompt = GenerativeParameters.grouped_task(
    prompt="How do I clean the drain pump? Please describe the steps in detail based on the provided images.",
    image_properties=["page_image"],  # Property containing images in Weaviate
)


# Retrieval + generation in one call; `response` is inspected in the following cells.
response = pages.generate.near_text(
    # Try a RAG query with:
    # query (what to search for): "How to clean the drain pump" and
    # limit (how many objects to fetch): 3
    # grouped_task (prompt): prompt defined above
    # BEGIN_SOLUTION
    query="How to clean the drain pump",
    limit=3,
    grouped_task=prompt,
    # END_SOLUTION
    # Runtime definition of what generative AI model provider to use
    generative_provider=GenerativeConfig.anthropic()
)

In [9]:
# print() (rather than a bare expression) so that newlines in the answer are rendered
print(response.generative.text)

Based on the images provided, here are the detailed steps to clean the drain pump:

1. Ensure the drain pump is empty. Refer to page 36 for instructions on emptying the drain pump.

2. Unscrew the pump cap carefully, as water may remain in the drain pump.

3. Remove the filter insert from the pump housing. It may be stuck due to dirt particles, so loosen and remove it gently.

4. Clean the interior of the pump, the threads on the pump cap, and the pump housing.

5. The pump cap consists of two parts that can be separated for thorough cleaning.

6. Ensure the impeller in the drain pump can rotate freely.

7. Insert the pump cap, making sure all parts are correctly assembled.

8. Screw the pump cap back in as far as it will go. The handle on the pump cap must be in a vertical position.

9. Insert and close the service flap.

For regular maintenance:
- Clean the drain pump at least once a year.
- Also clean if you notice any faults like blockages or rattling noises.
- Always disconnect th

In [10]:
# Show which page images were retrieved as context for the generated answer
for page in response.objects:
    filename = page.properties["filename"]
    print(f"Filename: {filename}")

Filename: manual_bosch_WGG254Z0GR_38_of_56.jpg
Filename: manual_bosch_WGG254Z0GR_36_of_56.jpg
Filename: manual_bosch_WGG254Z0GR_39_of_56.jpg


In [11]:
# Close the client connection once we are done with the notebook.
# NOTE(review): the error-level log lines printed after this call ("context canceled",
# "cannot find peer") look like shutdown noise from the embedded instance — confirm.
client.close()

{"build_git_commit":"7cebee0421","build_go_version":"go1.24.5","build_image_tag":"HEAD","build_wv_version":"1.32.0","error":"context canceled","level":"error","msg":"replication engine failed to start after FSM caught up","time":"2025-07-16T17:02:16+01:00"}
{"build_git_commit":"7cebee0421","build_go_version":"go1.24.5","build_image_tag":"HEAD","build_wv_version":"1.32.0","error":"cannot find peer","level":"error","msg":"transferring leadership","time":"2025-07-16T17:02:16+01:00"}
