## Multimodal Scenario

### Vectorize images with Vertex AI
If you have a project configured with Vertex AI, you can use `multimodalembedding@001` - a multimodal model that can vectorize text, images and video.

### Vectorize images with CLIP
Otherwise, you can use a CLIP model.<br/>
CLIP can only be used with a local deployment using Docker Compose.

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())  # read the local .env file

# Google credentials come from the environment; os.getenv returns None
# for a variable that is not set.
AI_STUDIO_KEY = os.getenv("GOOGLE_AI_STUDIO_KEY")

VERTEX_AI_KEY = os.getenv("GOOGLE_VERTEX_AI_KEY")
VERTEX_AI_PROJECT = os.getenv("GOOGLE_VERTEX_AI_PROJECT")

# Treat an unset variable and a blank entry the same way: neither is a usable key.
if not VERTEX_AI_KEY:
    print("VERTEX_AI_KEY is missing. Follow the CLIP path for this exercise.")
    print("Or configure a Google Vertex AI account and set GOOGLE_VERTEX_AI_KEY in your .env file.")

In [None]:
# VERTEX_AI_KEY = "ya29.a0AXooCgskulIrPZyuV..."

## Connect with Vertex AI key
If you have a project configured with Vertex AI, you can use `multimodalembedding@001` - a multimodal model that can vectorize text, images and video.

In [None]:
import weaviate, os

# Request headers that pass the Google API keys along with the connection
google_headers = {
    # "X-Google-Vertex-Api-Key": VERTEX_AI_KEY,
    "X-Google-Api-Key": VERTEX_AI_KEY,
    "X-Google-Studio-Api-Key": AI_STUDIO_KEY,
}

# Local instance deployed with Docker Compose
client = weaviate.connect_to_local(headers=google_headers)

# Alternative: a Weaviate Cloud instance - needs 1.24.14 or newer // 1.25.1 or newer
# client = weaviate.connect_to_wcs(
#     cluster_url=os.getenv("WORKSHOP_DEMO_URL"),
#     auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WORKSHOP_DEMO_KEY_ADMIN")),
#     headers=google_headers,
# )

# Kept as the last expression so the readiness flag shows up as the cell output
client.is_ready()

## Connect with CLIP

In [None]:
import weaviate

# Local instance deployed with Docker Compose; only the AI Studio key is sent
studio_headers = {"X-Google-Studio-Api-Key": AI_STUDIO_KEY}
client = weaviate.connect_to_local(headers=studio_headers)

## Create a new collection

In [None]:
from weaviate.classes.config import Configure, Property, DataType, Multi2VecField

# Drop any previous copy of the collection before re-creating it
client.collections.delete("MoviesMM")

# Properties of a movie object (declaring them is optional)
movie_properties = [
    Property(name="title", data_type=DataType.TEXT),
    Property(name="overview", data_type=DataType.TEXT),
    Property(name="rating", data_type=DataType.NUMBER),
    Property(name="release_date", data_type=DataType.DATE),
    Property(name="tmdb_id", data_type=DataType.INT),
    Property(name="poster_url", data_type=DataType.TEXT),
    Property(name="poster", data_type=DataType.BLOB),
]

# Named vector "content": vectorize the movie title and summary
content_vector = Configure.NamedVectors.text2vec_palm(
    name="content",
    source_properties=["title", "overview"],
    model_id="text-embedding-004",
    api_endpoint="generativelanguage.googleapis.com",
    project_id="devrel-projects",
)

# Named vector "poster": vectorize the movie poster (image) with CLIP
poster_vector = Configure.NamedVectors.multi2vec_clip(
    name="poster",
    image_fields=["poster"],
    # image_fields=[
    #     Multi2VecField(name="poster", weight=0.9)
    # ],
    # text_fields=[
    #     Multi2VecField(name="title", weight=0.1)
    # ],
)

# Vertex AI alternative for the "poster" vector
# poster_vector = Configure.NamedVectors.multi2vec_palm(
#     name="poster",
#     image_fields=["poster"],
#     text_fields=["title"],
#     location="us-central1",
#     model_id="multimodalembedding@001",
#     project_id="devrel-projects"
# )

# Define the collection together with its vectorizer modules
client.collections.create(
    name="MoviesMM",
    properties=movie_properties,
    vectorizer_config=[content_vector, poster_vector],
)

## Load data

In [None]:
import pandas as pd

# Alternative data file:
# movies_json = "./data/movies_data_1990_2024.json"
movies_json = "./data/movies_data_small.json"

df = pd.read_json(movies_json)
df.head()

### Example of loading images from the Internet
> We won't use this approach for the import in this project, as downloading many posters in bulk could get flagged by the TMDB servers as an attack.

In [None]:
import base64, requests

def url_to_base64(url, timeout=10):
    """Download the resource at *url* and return it as a base64 string.

    Args:
        url: Address of the file to download (e.g. a poster image).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body, base64-encoded and decoded to a UTF-8 ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    image_response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of encoding an error page as if it
    # were the image.
    image_response.raise_for_status()
    return base64.b64encode(image_response.content).decode("utf-8")

# Try the helper on a single image URL; the encoded string is the cell output
url_to_base64("https://image.tmdb.org/t/p/w600_and_h900_bestv2/1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg")

In [None]:
from datetime import datetime, timezone

# Try the download helper on the first 3 movies
for movie in df.head(3).itertuples(index=False):
    # Full poster address = image host prefix + the movie's poster_path
    poster_path = f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
    poster = url_to_base64(poster_path)

    print(movie.title)
    print(poster_path)
    print(poster, "\n")

### Load poster images from a local folder

In [None]:
import base64

# Helper function to convert a file to base64 representation
def toBase64(path):
    """Read the file at *path* in binary mode and return it base64-encoded.

    The result is a ``str`` (UTF-8 decoded base64 text), not ``bytes``.
    """
    with open(path, mode="rb") as image_file:
        raw_bytes = image_file.read()

    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")
    
# Try the helper on one local poster file; the encoded string is the cell output
toBase64("./posters/162_poster.jpg")

In [None]:
from datetime import datetime, timezone
from pathlib import Path

# Try the local-file helper on the first 3 movies
for movie in df.head(3).itertuples(index=False):
    poster_path = f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
    # Poster files are looked up in ./posters as "<movie id>_poster.jpg"
    posterb64 = toBase64(f"./posters/{movie.id}_poster.jpg")

    print(movie.title)
    print(poster_path)
    print(posterb64, "\n")

### Insert with Batch

In [None]:
from datetime import datetime, timezone
from weaviate.util import generate_uuid5

movies = client.collections.get("MoviesMM")

# Import through the rate-limited batcher; objects are sent as the batch fills
with movies.batch.rate_limit(100) as batch:

    # Load the first 200 movie objects
    for i, movie in enumerate(df.head(200).itertuples(index=False)):
        print(i, movie.title)

        # Convert a JSON date to `datetime` and add time zone information
        release_date = datetime.strptime(movie.release_date, "%Y-%m-%d").replace(
            tzinfo=timezone.utc
        )

        poster_url = f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
        posterb64 = toBase64(f"./posters/{movie.id}_poster.jpg")

        movie_obj = {
            "title": movie.title,
            "overview": movie.overview,
            "rating": movie.vote_average,
            "release_date": release_date,
            "tmdb_id": movie.id,  # https://www.themoviedb.org/movie/{tmdb_id}
            # Key matches the `poster_url` property declared for the collection
            "poster_url": poster_url,
            "poster": posterb64,
        }

        # A UUID derived from the movie id is passed explicitly for each object
        batch.add_object(
            properties=movie_obj,
            uuid=generate_uuid5(movie.id),
        )



### Check for batch errors

In [None]:
# Report any objects the batch import could not store
failed_objects = movies.batch.failed_objects

if not failed_objects:
    print("No errors")
else:
    print(f"Failed to import {len(failed_objects)} objects")
    for failed in failed_objects:
        print(f"e.g. Failed to import object with error: {failed.message}")

In [None]:
# Aggregate over the whole collection; the result is shown as the cell output
# (presumably includes the object count - confirm against the client docs)
movies.aggregate.over_all()

In [None]:
# Close the client connection once the notebook is done with it
client.close()