In [None]:
import os
from dotenv import load_dotenv, find_dotenv
# Load settings from the local .env file into the process environment
_ = load_dotenv(find_dotenv())
# Google AI Studio API key; this is None when the variable is not set
AI_STUDIO_KEY = os.environ.get("GOOGLE_AI_STUDIO_KEY")

In [None]:
# Prints a gcloud access token into this cell's output. Treat it as a secret:
# clear the output before saving or sharing the notebook.
# NOTE(review): no cell shown here reads the token; presumably it is meant for
# the Vertex AI setup - confirm, or drop this cell.
!gcloud auth print-access-token

In [None]:
import weaviate, os

# Open a client against the local Weaviate instance (deployed with Docker Compose),
# passing the Google API key along as a request header.
client = weaviate.connect_to_local(headers={"X-Google-Api-Key": AI_STUDIO_KEY})

# Last expression of the cell: displays whether the instance reports ready
client.is_ready()

In [None]:
# Display the metadata reported by the connected Weaviate instance
client.get_meta()

## Create a multi-vector collection with Named Vectors

In [None]:
from weaviate.classes.config import Configure, Property, DataType

# Remove any previous "Movies" collection so this cell can be re-run
client.collections.delete("Movies")

# Create the collection with one named vector per text field.
# Both named vectors use identical Google embedding settings; they differ only
# in the single source property each one is built from.
client.collections.create(
    name="Movies",
    properties=[  # optional
        Property(name="title", data_type=DataType.TEXT),
        Property(name="overview", data_type=DataType.TEXT),
        Property(name="rating", data_type=DataType.NUMBER),
        Property(name="release_date", data_type=DataType.DATE),
        Property(name="tmdb_id", data_type=DataType.INT),
        Property(name="poster_url", data_type=DataType.TEXT),
    ],
    # Vectorizer module configuration: a "title" vector and an "overview" vector
    vectorizer_config=[
        Configure.NamedVectors.text2vec_palm(
            name=text_field,
            source_properties=[text_field],
            model_id="text-embedding-004",
            api_endpoint="generativelanguage.googleapis.com",
            project_id="devrel-projects",
        )
        for text_field in ("title", "overview")
    ],
)

In [None]:
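# ## VERTEX AI variant (disabled) - uncomment this cell to create "Movies" with Vertex AI embeddings instead of the AI Studio setup above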

# from weaviate.classes.config import Configure, Property, DataType

# client.collections.delete("Movies")

# client.collections.create(
#     name="Movies",  # The name of the collection ('NV' for named vectors)
#     properties=[ # optional
#         Property(name="title", data_type=DataType.TEXT),
#         Property(name="overview", data_type=DataType.TEXT),
#         Property(name="rating", data_type=DataType.NUMBER),
#         Property(name="release_date", data_type=DataType.DATE),
#         Property(name="tmdb_id", data_type=DataType.INT),
#         Property(name="poster_url", data_type=DataType.TEXT),
#     ],
#     # Define & configure the vectorizer module
#     vectorizer_config=[
#         # Vectorize the movie title
#         Configure.NamedVectors.text2vec_palm(
#             name="title",
#             source_properties=["title"],

#             project_id="devrel-projects",
#             model_id="text-embedding-preview-0409"
#         ),
#         # Vectorize the movie overview (summary)
#         Configure.NamedVectors.text2vec_palm(
#             name="overview",
#             source_properties=["overview"],

#             project_id="devrel-projects",
#             model_id="text-embedding-preview-0409"
#         ),
#     ],
#     # Define the generative module
#     # generative_config=Configure.Generative.palm("gpt-4"),
# )

## Load data

In [None]:
import pandas as pd

# Load the movie dataset from the local JSON file into a DataFrame
df = pd.read_json("./data/movies_data_1990_2024.json")
# Preview the first rows
df.head()

In [None]:
from datetime import datetime, timezone

# Dry run: build the property dict for every movie and print it.
# Nothing is sent to Weaviate in this cell.
for i, movie in enumerate(df.itertuples(index=False)):

    # Parse the "YYYY-MM-DD" release date string, then tag it as UTC
    parsed = datetime.strptime(movie.release_date, "%Y-%m-%d")
    release_date = parsed.replace(tzinfo=timezone.utc)

    movie_obj = dict(
        title=movie.title,
        overview=movie.overview,
        rating=movie.vote_average,
        release_date=release_date,
        tmdb_id=movie.id,  # https://www.themoviedb.org/movie/{tmdb_id}
        # Poster URL uses the w600_and_h900_bestv2 rendition
        poster_url=f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}",
    )

    print(movie_obj)

### Insert with Batch

In [None]:
from datetime import datetime, timezone
from weaviate.util import generate_uuid5

# Upper bound on how many movies this cell imports
MAX_MOVIES = 100

# Number of movies handed to the batch (stays 0 if the DataFrame is empty)
counter = 0

movies = client.collections.get("Movies")

# Objects are queued inside the rate-limited batch context manager
with movies.batch.rate_limit(50) as batch:
    # head() bounds the iteration up front, so no manual counter/break is needed;
    # enumerate(start=1) keeps `counter` equal to the number of objects added.
    for counter, movie in enumerate(
        df.head(MAX_MOVIES).itertuples(index=False), start=1
    ):
        print(f"Adding: {movie.title}")

        # Convert a JSON date to `datetime` and add time zone information
        release_date = datetime.strptime(movie.release_date, "%Y-%m-%d").replace(
            tzinfo=timezone.utc
        )

        movie_obj = {
            "title": movie.title,
            "overview": movie.overview,
            "rating": movie.vote_average,
            "release_date": release_date,
            "tmdb_id": movie.id,  # https://www.themoviedb.org/movie/{tmdb_id}
            "poster_url": f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}",
        }

        # The object's UUID is derived from the TMDB id via generate_uuid5
        batch.add_object(
            properties=movie_obj,
            uuid=generate_uuid5(movie.id),
        )

### Check for batch errors

In [None]:
# Report any objects the batch import could not store
failed_objects = movies.batch.failed_objects
if len(failed_objects) == 0:
    print("no errors")
else:
    print(f"Failed to import {len(failed_objects)} objects")
    for failed in failed_objects:
        print(f"e.g. Failed to import object with error: {failed.message}")

### Check object count

In [None]:
# Aggregate over the whole collection; the displayed result is used to check the object count
movies.aggregate.over_all()

## Show vectors

In [None]:
# Fetch a single object together with its vectors
response = movies.query.fetch_objects(limit=1, include_vector=True)

# Print each named vector of that object, one line per vector
first_vectors = response.objects[0].vector
for label, vector_name in (("Title:   ", "title"), ("Overview:", "overview")):
    print(label, first_vectors[vector_name])

In [None]:
# Close the Weaviate client connection now that the notebook is done with it
client.close()