In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably the credentials read by utils.connect_to_weaviate() — confirm.
load_dotenv()

## Working with PDFs as images

### Approach 2: Use the entire page!

How do we mentally split PDFs? We usually think of them as a set of pages. We can do the same when indexing them, by embedding each entire page as an image!

<img src="data/imgs/hai_ai_index_report_2025_chapter_2_34_of_80.jpg" width="200px" />
<img src="data/imgs/hai_ai_index_report_2025_chapter_2_58_of_80.jpg" width="200px" />
<img src="data/imgs/hai_ai_index_report_2025_chapter_2_69_of_80.jpg" width="200px" />

In [None]:
import utils

# Helper function to connect to Weaviate
# (defined in the local `utils` module; the connection details are not visible in this notebook)
client = utils.connect_to_weaviate()

In [None]:
# Drop any existing "Pages" collection (and all of its objects) so that the
# create step that follows starts from a clean slate when the notebook is re-run.
client.collections.delete("Pages")

In [None]:
from weaviate.classes.config import Property, DataType, Configure, Tokenization

# Create the "Pages" collection: one object per PDF page, holding the page
# image itself plus a little metadata to identify where it came from.
client.collections.create(
    name="Pages",
    properties=[
        Property(
            name="document_title",
            data_type=DataType.TEXT,
        ),
        Property(
            name="page_image",
            # BLOB properties hold base64-encoded binary data (here: the page image)
            data_type=DataType.BLOB,
        ),
        Property(
            name="filename",
            data_type=DataType.TEXT,
            # FIELD tokenization indexes the whole value as a single token,
            # so a filename is only matched as an exact, complete string
            tokenization=Tokenization.FIELD
        ),
    ],
    vector_config=[
        # Add `Configure.Vectors.multi2vec_cohere` vector to the collection with:
        # name: "default", source properties (the `image_fields` argument): ["page_image"],
        # and model: "embed-v4.0"
        # ADD YOUR CODE HERE
    ]
)

In [None]:
# Handle to the "Pages" collection; the import and the RAG query both go through it.
pages = client.collections.get("Pages")

Load pre-computed vectors & metadata

In [None]:
import numpy as np
import json

# Metadata saved alongside the embeddings; `metadata["image_paths"]` lists the
# page image file for each embedding row. JSON is UTF-8 by specification, so
# read it explicitly as UTF-8 rather than relying on the platform default.
with open("data/embeddings/hai_embeddings_metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Pre-computed page embeddings: row i belongs to metadata["image_paths"][i].
embeddings = np.load("data/embeddings/hai_image_embeddings.npy")

# Fail early, with a clear message, if some embedding has no matching image
# path; otherwise the import loop would stop halfway with an IndexError.
assert len(embeddings) <= len(metadata["image_paths"]), (
    f"{len(embeddings)} embeddings but only "
    f"{len(metadata['image_paths'])} image paths in the metadata file"
)

Import data

In [None]:
from tqdm import tqdm
from pathlib import Path
import base64

# Import every page into Weaviate. Each object carries the base64-encoded page
# image and its metadata; the vector is supplied from the pre-computed
# embeddings instead of being generated at import time.
with pages.batch.fixed_size(batch_size=100) as batch:
    # Wrap `embeddings` itself (not the enumerate iterator) so tqdm can read
    # its length and render a real progress bar with a total and an ETA.
    for i, embedding in enumerate(tqdm(embeddings)):
        filepath = Path(metadata["image_paths"][i])
        image = filepath.read_bytes()
        # BLOB properties expect the binary data as a base64 string
        base64_image = base64.b64encode(image).decode('utf-8')
        obj = {
            "document_title": "Stanford HAI Report 2025",
            "page_image": base64_image,
            "filename": filepath.name
        }

        # Add object to batch for import with (batch.add_object())
        # This time, manually provide the vector with `{"default": embedding}`
        # ADD YOUR CODE HERE

# Per-object batch errors are collected rather than raised, so surface them
# here instead of discovering an incomplete collection at query time.
failed_objects = pages.batch.failed_objects
if failed_objects:
    print(f"Failed to import {len(failed_objects)} objects")
    print(f"First error: {failed_objects[0].message}")

In [None]:
from weaviate.classes.generate import GenerativeConfig, GenerativeParameters

# Grouped task: a single prompt that is answered once over the whole set of
# retrieved objects. `image_properties` names the property whose images are
# passed to the generative model together with the prompt.
prompt = GenerativeParameters.grouped_task(
    prompt="What advances has there been in autonomous driving in the last few years?",
    image_properties=["page_image"],  # Property containing images in Weaviate
)


response = pages.generate.near_text(
    # Try a RAG query with:
    # query (what to search for): a phrase matching the prompt's topic,
    #   e.g. "advances in autonomous driving" and
    # limit (how many objects to fetch): 3
    # grouped_task (prompt): prompt defined above
    # ADD YOUR CODE HERE
    # Runtime definition of what generative AI model provider to use
    generative_provider=GenerativeConfig.anthropic(
        model="claude-3-5-haiku-latest"
    )
)

In [None]:
# Show the text the generative model produced for this query.
print(response.generative.text)

In [None]:
# List the page images that were retrieved and handed to the generative model.
for retrieved_page in response.objects:
    source_filename = retrieved_page.properties["filename"]
    print(f"Filename: {source_filename}")

In [None]:
# Release the connection to Weaviate now that the notebook is done with it.
client.close()

<div style="max-width: 90%; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; color: white; text-align: center; box-shadow: 0 5px 15px rgba(0,0,0,0.2); margin: 30px 0 15px 0; box-sizing: border-box;">
    <h1 style="font-size: 2em; margin: 0; text-shadow: 1px 1px 3px rgba(0,0,0,0.3);">🎉 Congratulations!</h1>
    <p style="font-size: 1.1em; margin: 10px 0;">You've successfully built a PDF-driven RAG system with Weaviate</p>
    <hr style="border: none; height: 1px; background: rgba(255,255,255,0.3); margin: 15px 0;"/>
    <p style="margin: 10px 0; font-size: 1em;">
        🚀 <strong>Ready to take your RAG skills to the next level?</strong>
    </p>    
</div>

<div style="max-width: 90%; background: linear-gradient(90deg, #11998e 0%, #38ef7d 100%); padding: 20px; border-radius: 10px; text-align: center; margin: 15px 0; box-sizing: border-box;">
    <div style="max-width: 95%; background: rgba(255,255,255,0.2); padding: 15px; border-radius: 6px;">
        <h3 style="margin: 0 0 10px 0; color: #2c3e50; font-size: 1.1em;">📚 Continue Learning</h3>
        <p style="color: #2c3e50; font-size: 0.9em; margin-bottom: 10px;">
            Expand your knowledge with free courses and comprehensive documentation<br/><br/>
            <a href="https://weaviate.io/developers/academy" style="color: #2c3e50; text-decoration: underline; font-weight: bold;">🎓 Weaviate Academy</a> | 
            <a href="https://weaviate.io/developers/weaviate" style="color: #2c3e50; text-decoration: underline; font-weight: bold;">📖 Documentation</a> | 
            <a href="https://weaviate.io/community/events" style="color: #2c3e50; text-decoration: underline; font-weight: bold;">🎙️ Workshops</a>
        </p>
        <br/>
        <div style="text-align: left; display: inline-block;">
            <p style="color: #2c3e50; font-weight: bold; margin: 5px 0; font-size: 0.9em;">
            🎙️ <strong>Upcoming workshops:</strong><br/><br/>
            • <a href="https://luma.com/ws-2025-10-02?utm_source=weaviate_eventspage" style="color: #2c3e50; text-decoration: underline; font-weight: bold;">Oct 2: Intro to building AI-native applications with Weaviate</a><br>
            • <a href="https://luma.com/ws-2025-10-07?utm_source=weaviate_events" style="color: #2c3e50; text-decoration: underline; font-weight: bold;">Oct 7: Building Intelligent Chatbots with Pydantic AI and Weaviate</a><br>
            </p>
        </div>
    </div>
</div>

<div style="max-width: 90%; background: linear-gradient(90deg, #ffeaa7 0%, #fab1a0 100%); padding: 20px; border-radius: 10px; text-align: center; margin: 15px 0; box-sizing: border-box;">
    <div style="max-width: 95%; background: rgba(255,255,255,0.2); padding: 15px; border-radius: 6px;">
        <h3 style="margin: 0 0 10px 0; color: #2c3e50; font-size: 1.1em;">🌟 Join the Community</h3>
        <p style="color: #2c3e50; font-size: 0.9em; margin-bottom: 10px;">
            Connect with other developers and get help<br/><br/>
            <a href="https://weaviate.io/slack" style="color: #2c3e50; text-decoration: underline; font-weight: bold;">💬 Slack</a> | 
            <a href="https://forum.weaviate.io/" style="color: #2c3e50; text-decoration: underline; font-weight: bold;">💻 Forum</a> | 
            <a href="https://newsletter.weaviate.io" style="color: #2c3e50; text-decoration: underline; font-weight: bold;">📧 Newsletter</a>
        </p>
        <p style="color: #2c3e50; font-size: 0.9em; margin-bottom: 10px;">
        Meet & share • Ask questions • AI news<br>
        </p>
    </div>
</div>

<div style="max-width: 90%; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 15px; border-radius: 10px; color: white; text-align: center; box-shadow: 0 5px 15px rgba(0,0,0,0.2); margin: 15px 0; box-sizing: border-box;">
    <p style="margin: 0; font-size: 1em; opacity: 0.9;">
        Thank you for building with Weaviate! 🙏
    </p>
    <p style="margin: 5px 0 0 0; font-size: 0.9em; opacity: 0.8;">
        <em>Happy vector searching!</em> ✨
    </p>
</div>