In [None]:
import weaviate, os

# Connect to a cloud instance of Weaviate (with WCS)
# client = weaviate.connect_to_wcs(
#     cluster_url=os.getenv("WORKSHOP_DEMO_URL"),
#     auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WORKSHOP_DEMO_KEY_ADMIN")),
#     headers={
#         "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY"), # Replace with your inference API key
#         "X-Cohere-Api-Key": os.getenv("COHERE_API_KEY"), # Replace with your inference API key
#     }
# )

# Connect to the local Weaviate instance (the one started with Docker Compose).
# The OpenAI API key is read from the environment and passed along as a request header.
inference_headers = {"X-OpenAI-Api-Key": os.environ.get("OPENAI_API_KEY")}
client = weaviate.connect_to_local(headers=inference_headers)

# Last expression of the cell: displays the result of the readiness check
client.is_ready()

## Create a multi-vector collection with Named Vectors

In [None]:
from weaviate.classes.config import Configure, Property, DataType

# Drop any existing "Movies" collection before (re)creating it
client.collections.delete("Movies")

client.collections.create(
    name="Movies",
    # Optional explicit schema: one Property per (name, data type) pair
    properties=[
        Property(name=prop_name, data_type=prop_type)
        for prop_name, prop_type in (
            ("title", DataType.TEXT),
            ("overview", DataType.TEXT),
            ("rating", DataType.NUMBER),
            ("release_date", DataType.DATE),
            ("tmdb_id", DataType.INT),
            ("poster_url", DataType.TEXT),
        )
    ],
    # Named vectors: "title" is built from the movie title only and
    # "overview" from the movie overview (summary) only
    vectorizer_config=[
        Configure.NamedVectors.text2vec_openai(
            name=text_field, source_properties=[text_field]
        )
        for text_field in ("title", "overview")
    ],
    # Generative module configuration
    generative_config=Configure.Generative.openai("gpt-4"),
)

## Load data

In [None]:
import requests, json
import pandas as pd

# Download the movie dataset (JSON) and load it into a DataFrame.
data_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
data_resp = requests.get(data_url, timeout=30)
# Stop here with a clear HTTP error rather than a confusing JSON/DataFrame error below.
data_resp.raise_for_status()
df = pd.DataFrame(data_resp.json())

### Quick preview of the data

In [None]:
# Display the first row of the DataFrame to inspect the available fields
df.iloc[0]

In [None]:
from datetime import datetime, timezone

# Build the property dict for the first few movies only: enough to sanity-check
# the field mapping without printing the whole dataset into the cell output.
for movie in df.head(5).itertuples(index=False):

    # Convert a JSON date to `datetime` and add time zone information
    release_date = datetime.strptime(movie.release_date, "%Y-%m-%d").replace(
        tzinfo=timezone.utc
    )

    movie_obj = {
        "title": movie.title,
        "overview": movie.overview,
        "rating": movie.vote_average,
        "release_date": release_date,
        "tmdb_id": movie.id, # https://www.themoviedb.org/movie/{tmdb_id}
        "poster_url": f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
    }

    print(movie_obj)

### Insert with Batch

In [None]:
from datetime import datetime, timezone
from weaviate.util import generate_uuid5

movies = client.collections.get("Movies")

# Loop-invariant prefix of the poster image URL
poster_base_url = "https://image.tmdb.org/t/p/w600_and_h900_bestv2"

# Queue every movie through the fixed-size batcher; per-object failures are
# inspected afterwards via `movies.batch.failed_objects`.
with movies.batch.fixed_size() as batch:
    for movie in df.itertuples(index=False):

        # Convert a JSON date to `datetime` and add time zone information
        release_date = datetime.strptime(movie.release_date, "%Y-%m-%d").replace(
            tzinfo=timezone.utc
        )

        movie_obj = {
            "title": movie.title,
            "overview": movie.overview,
            "rating": movie.vote_average,
            "release_date": release_date,
            "tmdb_id": movie.id, # https://www.themoviedb.org/movie/{tmdb_id}
            "poster_url": f"{poster_base_url}{movie.poster_path}"
        }

        # The UUID is derived from the TMDB id via generate_uuid5
        batch.add_object(
            properties=movie_obj,
            uuid=generate_uuid5(movie.id)
        )

### Check for batch errors

In [None]:
# Report every object the batch import could not store, with its error message
failed_objects = movies.batch.failed_objects
num_failed = len(failed_objects)
if num_failed > 0:
    print(f"Failed to import {num_failed} objects")
    for failed in failed_objects:
        print(f"e.g. Failed to import object with error: {failed.message}")

### Check object count

In [None]:
# Aggregate over the entire collection (used here to check the object count)
movies.aggregate.over_all()

## Show vectors

In [None]:
# Fetch a single object together with its vectors and print each named vector
response = movies.query.fetch_objects(limit=1, include_vector=True)
first_object_vectors = response.objects[0].vector
for vector_name in ("title", "overview"):
    print(first_object_vectors[vector_name])