In [1]:
import weaviate, os

# Connect to the local instance deployed with Docker Compose
client = weaviate.connect_to_local(
    headers={
        "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY"),
    }
)

client.is_ready()

True

In [2]:
from weaviate.classes.config import Configure, Property, DataType, Multi2VecField

client.collections.delete("MoviesMM")

client.collections.create(
    name="MoviesMM",  # The name of the collection ('NV' for named vectors)
    properties=[ # optional
        Property(name="title", data_type=DataType.TEXT),
        Property(name="overview", data_type=DataType.TEXT),
        Property(name="rating", data_type=DataType.NUMBER),
        Property(name="release_date", data_type=DataType.DATE),
        Property(name="tmdb_id", data_type=DataType.INT),
        Property(name="poster_url", data_type=DataType.TEXT),
        Property(name="poster", data_type=DataType.BLOB),
    ],
    # Define & configure the vectorizer module
    vectorizer_config=[
        # Vectorize the movie title
        Configure.NamedVectors.text2vec_openai(
            name="title", source_properties=["title"]
        ),
        # Vectorize the movie overview (summary)
        Configure.NamedVectors.text2vec_openai(
            name="overview", source_properties=["overview"]
        ),
        Configure.NamedVectors.multi2vec_clip(
            name="poster",
            image_fields=[
                Multi2VecField(name="poster", weight=0.9)
            ],
            text_fields=[
                Multi2VecField(name="title", weight=0.1)
            ],
        )
    ],
    # Define the generative module
    generative_config=Configure.Generative.openai("gpt-4"),
)

<weaviate.collections.collection.Collection at 0x1301cbf50>

## Load data

In [3]:
import requests, json
import pandas as pd

data_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json"
data_resp = requests.get(data_url)
df = pd.DataFrame(data_resp.json())

### Example of loading images from the Internet
> We won't use it for import for this project, as that could get flagged by tmdb servers as an attack

In [4]:
import base64, requests

def url_to_base64(url):
    image_response = requests.get(url)
    content = image_response.content
    return base64.b64encode(content).decode("utf-8")

url_to_base64("https://image.tmdb.org/t/p/w600_and_h900_bestv2/1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg")

'/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAUFBQUFBQUGBgUICAcICAsKCQkKCxEMDQwNDBEaEBMQEBMQGhcbFhUWGxcpIBwcICkvJyUnLzkzMzlHREddXX0BBQUFBQUFBQYGBQgIBwgICwoJCQoLEQwNDA0MERoQExAQExAaFxsWFRYbFykgHBwgKS8nJScvOTMzOUdER11dff/CABEIA4QCWAMBIgACEQEDEQH/xAAdAAAABwEBAQAAAAAAAAAAAAAAAQIDBAUGBwgJ/9oACAEBAAAAAPO5hLjaXFohWmczKQYAIAAAAAAAAAAAAAAGQMgDIAAAAAAyA6CZggaQip0GLqwZAyABkYIAAAAAAAAAyBkAAAAAAAAZAAAdBUpClIB1EheMaAAMgAAAADIAAGQAAAAAAMgDBAAAAAGQAG/Uba1oJNajQ4GGAAAAAAAAAAAAAAAAAZAAAAAAAwQBgEAOkso0mXafIKyemqM6AZGCAABkAAADBAGQMEAAAAAAAAAAAYIADoKHJNWlxxksnaOZ4gZAwAQAAAu/X/iYgAAAAADBD0bmuLmQAAMgAAAAAOgIfaS2+kkUlCQAAAAAABkDtvpR8wQAACAMAjIH7R5l56AAAAMEZAAAyHQEqcYCTeZYxqAYABAA+66djzSr0Zf3PevN9D5t+jflrhXtLxb0/pHpD58e4OPX/mn0rVehfK3noAyAAAAAAMAh0RlxxpAUCRlK4AAyAA9g4rzp7f8AP3ovnXlvS/RrxL6Y8MfTDhnlL0xQ0/nf6g/Mj6+fLON9C/nHnvcfIfPQMgAAAAAADBH0FklpJREWd0GJSADIyMfXzx6zvcb7D+QDdv8ASj5m/UTxQv175Ur/AFr8yz+oPzI+pfyq9d23i0e0+Y+egAAAAAAAAAY3pGgg4y3B1z/JkgAAAx9OvnjkwPqT838rb/Sj5g/SfE+FPqJyPzz68+bR/UH5kfUb5Xdx9KfPse0+YeegAAAADBAAAAb

In [5]:
from datetime import datetime, timezone

# test top 5 items
for i, movie in enumerate(df.head(5).itertuples(index=False)):
    poster_path = f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
    poster = url_to_base64(poster_path)

    print(movie.title)
    print(poster_path)
    print(poster, "\n")

Edward Scissorhands
https://image.tmdb.org/t/p/w600_and_h900_bestv2/1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg
/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAUFBQUFBQUGBgUICAcICAsKCQkKCxEMDQwNDBEaEBMQEBMQGhcbFhUWGxcpIBwcICkvJyUnLzkzMzlHREddXX0BBQUFBQUFBQYGBQgIBwgICwoJCQoLEQwNDA0MERoQExAQExAaFxsWFRYbFykgHBwgKS8nJScvOTMzOUdER11dff/CABEIA4QCWAMBIgACEQEDEQH/xAAdAAAABwEBAQAAAAAAAAAAAAAAAQIDBAUGBwgJ/9oACAEBAAAAAPO5hLjaXFohWmczKQYAIAAAAAAAAAAAAAAGQMgDIAAAAAAyA6CZggaQip0GLqwZAyABkYIAAAAAAAAAyBkAAAAAAAAZAAAdBUpClIB1EheMaAAMgAAAADIAAGQAAAAAAMgDBAAAAAGQAG/Uba1oJNajQ4GGAAAAAAAAAAAAAAAAAZAAAAAAAwQBgEAOkso0mXafIKyemqM6AZGCAABkAAADBAGQMEAAAAAAAAAAAYIADoKHJNWlxxksnaOZ4gZAwAQAAAu/X/iYgAAAAADBD0bmuLmQAAMgAAAAAOgIfaS2+kkUlCQAAAAAABkDtvpR8wQAACAMAjIH7R5l56AAAAMEZAAAyHQEqcYCTeZYxqAYABAA+66djzSr0Zf3PevN9D5t+jflrhXtLxb0/pHpD58e4OPX/mn0rVehfK3noAyAAAAAAMAh0RlxxpAUCRlK4AAyAA9g4rzp7f8AP3ovnXlvS/RrxL6Y8MfTDhnlL0xQ0/nf6g/Mj6+fLON9C/nHnvcfIfPQMgAAAAAADBH0FklpJREWd0GJSADIyMfXzx6zvcb7D+QDdv8ASj5m/UTxQv175Ur/AFr8yz+oPzI+pfyq9d23i0e0+Y+egAAA

### Load poster images from a local folder

In [6]:
import base64

# Helper function to convert a file to base64 representation
def toBase64(path):
    with open(path, 'rb') as file:
        return base64.b64encode(file.read()).decode('utf-8')
    
toBase64("./posters/162_poster.jpg")

'/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAUFBQUFBQUGBgUICAcICAsKCQkKCxEMDQwNDBEaEBMQEBMQGhcbFhUWGxcpIBwcICkvJyUnLzkzMzlHREddXX0BBQUFBQUFBQYGBQgIBwgICwoJCQoLEQwNDA0MERoQExAQExAaFxsWFRYbFykgHBwgKS8nJScvOTMzOUdER11dff/CABEIAu4B9AMBIgACEQEDEQH/xAA1AAAABgMBAAAAAAAAAAAAAAAAAgMEBQYBBwgJAQADAQEBAQAAAAAAAAAAAAAAAQIDBAUG/9oADAMBAAIQAxAAAADncFz7vnGyTM0C5M1kENNM5aqTWTgoB008zsAAlgAAAAAAAAAAAMhgAAAAAAAAAAAMhgAAAAGcZwAAyGAAAAAADIYAAAZwAAAAAAAAAAAAABkYAbByYvvecYjlpnaxEVAPhEOIex0uz8e9LRl4ji6QAJYGcAAAAAAAAAAAAAAAAAAAAAAAAAAAAGcAAAAAAAAAAMhgAAAAAAAAAAAAAAyGAAGcAAAAGw08j6Dy5iBWJnogfKqC4ACBEhUeDonYIDk3GcBMAAM4ADOAAAC4IAAAAYNqp7Jhw57BsBgZAYCyQDAyGA4QDGc5AgBgmt5aW7PDgsAAAAAAAADIYGcBnGcAAAAAAAAAAAGwU1kff8xfBM47EKDucgoDFQstb5NmAA4OkAAAM4AAABMQ/oWGnGlOYA/1H6Y6hCu02uaTD0J0yhDB035xekPDYemHml2FpUJnm3p50Gvm1F16Ho3p/autAmNf1IgW0+wOIA65vnF/aAcFAAABkMDOAAAAAAAAAAAADOAAGQAAC+YOT3/NwMGy0IfOAIzknhMLfaabOqGWSjfL7QAJecAAAAGfSfzY2ME5WekOUQm+v+FZsLDrTuTjQOg4bY9ADYc3z72QHBXZPn76kBzb0VxN0EHC2d9EDpjWm19VBoy31fU4dd8gdu8zBA9ncw9UBwCAAAA

In [7]:
from datetime import datetime, timezone
from pathlib import Path

# test top 5 items
for i, movie in enumerate(df.head(5).itertuples(index=False)):
    poster_path = f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
    posterb64 = toBase64(f"./posters/{movie.id}_poster.jpg")

    print(movie.title)
    print(poster_path)
    print(posterb64, "\n")

Edward Scissorhands
https://image.tmdb.org/t/p/w600_and_h900_bestv2/1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg
/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAUFBQUFBQUGBgUICAcICAsKCQkKCxEMDQwNDBEaEBMQEBMQGhcbFhUWGxcpIBwcICkvJyUnLzkzMzlHREddXX0BBQUFBQUFBQYGBQgIBwgICwoJCQoLEQwNDA0MERoQExAQExAaFxsWFRYbFykgHBwgKS8nJScvOTMzOUdER11dff/CABEIAu4B9AMBIgACEQEDEQH/xAA1AAAABgMBAAAAAAAAAAAAAAAAAgMEBQYBBwgJAQADAQEBAQAAAAAAAAAAAAAAAQIDBAUG/9oADAMBAAIQAxAAAADncFz7vnGyTM0C5M1kENNM5aqTWTgoB008zsAAlgAAAAAAAAAAAMhgAAAAAAAAAAAMhgAAAAGcZwAAyGAAAAAADIYAAAZwAAAAAAAAAAAAABkYAbByYvvecYjlpnaxEVAPhEOIex0uz8e9LRl4ji6QAJYGcAAAAAAAAAAAAAAAAAAAAAAAAAAAAGcAAAAAAAAAAMhgAAAAAAAAAAAAAAyGAAGcAAAAGw08j6Dy5iBWJnogfKqC4ACBEhUeDonYIDk3GcBMAAM4ADOAAAC4IAAAAYNqp7Jhw57BsBgZAYCyQDAyGA4QDGc5AgBgmt5aW7PDgsAAAAAAAADIYGcBnGcAAAAAAAAAAAGwU1kff8xfBM47EKDucgoDFQstb5NmAA4OkAAAM4AAABMQ/oWGnGlOYA/1H6Y6hCu02uaTD0J0yhDB035xekPDYemHml2FpUJnm3p50Gvm1F16Ho3p/autAmNf1IgW0+wOIA65vnF/aAcFAAABkMDOAAAAAAAAAAAADOAAGQAAC+YOT3/NwMGy0IfOAIzknhMLfaabOqGWSjfL7QAJecAAAAGfSfzY2ME5

### Insert with Batch

In [8]:
from datetime import datetime, timezone
from weaviate.util import generate_uuid5

movies = client.collections.get("MoviesMM")

with movies.batch.fixed_size(batch_size=10) as batch:
    for i, movie in enumerate(df.itertuples(index=False)):

        # Convert a JSON date to `datetime` and add time zone information
        release_date = datetime.strptime(movie.release_date, "%Y-%m-%d").replace(
            tzinfo=timezone.utc
        )

        # poster_path = f"https://image.tmdb.org/t/p/w300_and_h450_bestv2//{movie.poster_path}"
        poster_path = f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
        posterb64 = toBase64(f"./posters/{movie.id}_poster.jpg")

        movie_obj = {
            "title": movie.title,
            "overview": movie.overview,
            "rating": movie.vote_average,
            "release_date": release_date,
            "tmdb_id": movie.id, # https://www.themoviedb.org/movie/{tmdb_id}
            "poster_path": poster_path,
            "poster": posterb64
        }
        print(i, movie.title)

        batch.add_object(
            properties=movie_obj,
            uuid=generate_uuid5(movie.id)
        )

0 Edward Scissorhands
1 GoodFellas
2 Home Alone
3 Back to the Future Part III
4 Pretty Woman
5 The Godfather Part III
6 Die Hard 2
7 Total Recall
8 Ghost
9 Misery
10 Dances with Wolves
11 The Hunt for Red October
12 Predator 2
13 Tremors
14 Rocky V
15 Gremlins 2: The New Batch
16 Awakenings
17 Kindergarten Cop
18 La Femme Nikita
19 RoboCop 2
20 The Silence of the Lambs
21 Terminator 2: Judgment Day
22 Beauty and the Beast
23 Hook
24 The Addams Family
25 Point Break
26 Cape Fear
27 Thelma & Louise
28 Robin Hood: Prince of Thieves
29 The Naked Gun 2½: The Smell of Fear
30 Hot Shots!
31 My Girl
32 JFK
33 Boyz n the Hood
34 The Last Boy Scout
35 Barton Fink
36 Child's Play 3
37 Delicatessen
38 The Doors
39 Backdraft
40 Reservoir Dogs
41 Aladdin
42 Home Alone 2: Lost in New York
43 Batman Returns
44 Alien³
45 Bram Stoker's Dracula
46 Unforgiven
47 Basic Instinct
48 A Few Good Men
49 Scent of a Woman
50 Porco Rosso
51 Sister Act
52 Army of Darkness
53 The Last of the Mohicans
54 The Bodyguar

### Check for batch errors

In [58]:
# Check for failed objects
if len(movies.batch.failed_objects) > 0:
    print(f"Failed to import {len(movies.batch.failed_objects)} objects")
    for failed in movies.batch.failed_objects:
        print(f"e.g. Failed to import object with error: {failed.message}")

In [60]:
movies.aggregate.over_all()

AggregateReturn(properties={}, total_count=680)