In [4]:
import weaviate, os

# Option 1: a cloud instance of Weaviate (WCS).
# Uncomment this block (and comment out Option 2) to use it instead.
# client = weaviate.connect_to_wcs(
#     cluster_url=os.getenv("WORKSHOP_DEMO_URL"),
#     auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WORKSHOP_DEMO_KEY_ADMIN")),
#     headers={
#         "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY"), # Replace with your inference API key
#         "X-Cohere-Api-Key": os.getenv("COHERE_API_KEY"), # Replace with your inference API key
#     }
# )

# Option 2: the local instance deployed with Docker Compose.
# The OpenAI key is read from the environment and passed along in the
# connection headers (it is None when OPENAI_API_KEY is not set).
inference_headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")}
client = weaviate.connect_to_local(headers=inference_headers)

# Last expression of the cell: displays whether the instance reports ready.
client.is_ready()

True

## Create a multi-vector collection with Named Vectors

In [64]:
from weaviate.classes.config import Configure, Property, DataType

# Remove any existing "Movies" collection before (re-)creating it below.
client.collections.delete("Movies")

# Properties stored on every movie object (declaring them is optional).
movie_properties = [
    Property(name="title", data_type=DataType.TEXT),
    Property(name="overview", data_type=DataType.TEXT),
    Property(name="rating", data_type=DataType.NUMBER),
    Property(name="release_date", data_type=DataType.DATE),
    Property(name="tmdb_id", data_type=DataType.INT),
    Property(name="poster_url", data_type=DataType.TEXT),
]

# Two named vectors per object, each built from a single source property.
named_vectors = [
    # Vector for the movie title
    Configure.NamedVectors.text2vec_openai(
        name="title", source_properties=["title"]
    ),
    # Vector for the movie overview (summary)
    Configure.NamedVectors.text2vec_openai(
        name="overview", source_properties=["overview"]
    ),
]

# Last expression of the cell: creates the collection and displays it.
client.collections.create(
    name="Movies",
    properties=movie_properties,
    vectorizer_config=named_vectors,
    # Generative module used by this collection
    generative_config=Configure.Generative.openai("gpt-4"),
)

<weaviate.collections.collection.Collection at 0x11ef91710>

## Load data

In [65]:
import requests, json
import pandas as pd

# Download the movie dataset as JSON and load it into a DataFrame.
data_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json"

# A timeout keeps the cell from hanging forever if the server never answers.
data_resp = requests.get(data_url, timeout=30)

# Fail right here with a clear HTTP error (404, 5xx, ...) instead of a
# confusing JSON decoding error on an error page further down.
data_resp.raise_for_status()

df = pd.DataFrame(data_resp.json())

### Quick preview of the data

In [66]:
# Display the first record as a Series to see which fields each movie carries.
df.iloc[0]

backdrop_path                         /3Nn5BOM1EVw1IYrv6MsbOS6N1Ol.jpg
genre_ids                                              [14, 18, 10749]
id                                                                 162
original_language                                                   en
original_title                                     Edward Scissorhands
overview             A small suburban town receives a visit from a ...
popularity                                                      45.694
poster_path                           /1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg
release_date                                                1990-12-07
title                                              Edward Scissorhands
video                                                            False
vote_average                                                       7.7
vote_count                                                       12305
Name: 0, dtype: object

In [67]:
from datetime import datetime, timezone

# Number of records to print: enough to check the mapping by eye without
# flooding the cell output with every record in the DataFrame.
PREVIEW_ROWS = 3

# Dry run of the import mapping: build the object that will be stored for each
# of the first few rows and print it. Nothing is sent to Weaviate here.
for movie in df.head(PREVIEW_ROWS).itertuples(index=False):

    # Convert a JSON date to `datetime` and add time zone information
    release_date = datetime.strptime(movie.release_date, "%Y-%m-%d").replace(
        tzinfo=timezone.utc
    )

    movie_obj = {
        "title": movie.title,
        "overview": movie.overview,
        "rating": movie.vote_average,
        "release_date": release_date,
        "tmdb_id": movie.id, # https://www.themoviedb.org/movie/{tmdb_id}
        # `poster_path` is appended directly to the image base URL
        "poster_url": f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
    }

    print(movie_obj)

{'title': 'Edward Scissorhands', 'overview': 'A small suburban town receives a visit from a castaway unfinished science experiment named Edward.', 'rating': 7.7, 'release_date': datetime.datetime(1990, 12, 7, 0, 0, tzinfo=datetime.timezone.utc), 'tmdb_id': 162, 'poster_url': 'https://image.tmdb.org/t/p/w600_and_h900_bestv2/1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg'}
{'title': 'GoodFellas', 'overview': 'The true story of Henry Hill, a half-Irish, half-Sicilian Brooklyn kid who is adopted by neighbourhood gangsters at an early age and climbs the ranks of a Mafia family under the guidance of Jimmy Conway.', 'rating': 8.5, 'release_date': datetime.datetime(1990, 9, 12, 0, 0, tzinfo=datetime.timezone.utc), 'tmdb_id': 769, 'poster_url': 'https://image.tmdb.org/t/p/w600_and_h900_bestv2/aKuFiU82s5ISJpGZp7YkIr3kCUd.jpg'}
{'title': 'Home Alone', 'overview': "Eight-year-old Kevin McCallister makes the most of the situation after his family unwittingly leaves him behind when they go on Christmas vacation. B

### Insert with Batch

In [68]:
from datetime import datetime, timezone
from weaviate.util import generate_uuid5

# Handle to the "Movies" collection; later cells reuse `movies`.
movies = client.collections.get("Movies")

# Add every row of the DataFrame to the collection through a fixed-size batch.
with movies.batch.fixed_size() as batch:
    for movie in df.itertuples(index=False):

        # Convert a JSON date to `datetime` and add time zone information
        release_date = datetime.strptime(movie.release_date, "%Y-%m-%d").replace(
            tzinfo=timezone.utc
        )

        movie_obj = {
            "title": movie.title,
            "overview": movie.overview,
            "rating": movie.vote_average,
            "release_date": release_date,
            "tmdb_id": movie.id, # https://www.themoviedb.org/movie/{tmdb_id}
            # `poster_path` is appended directly to the image base URL
            "poster_url": f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
        }

        batch.add_object(
            properties=movie_obj,
            # UUID derived from the TMDB id, so an object can later be
            # looked up with generate_uuid5(<tmdb id>).
            uuid=generate_uuid5(movie.id)
        )

### Check for batch errors

In [69]:
# Report the objects the batch import could not store, if any.
failed_objects = movies.batch.failed_objects
failure_count = len(failed_objects)

if failure_count > 0:
    print(f"Failed to import {failure_count} objects")
    # One line per failure, carrying the error message recorded for it
    for failure in failed_objects:
        print(f"e.g. Failed to import object with error: {failure.message}")

### Check object count

In [70]:
# Aggregate over the whole collection; the displayed result includes `total_count`.
movies.aggregate.over_all()

AggregateReturn(properties={}, total_count=680)

## Show vectors

In [71]:
# Fetch a single object together with its vectors.
response = movies.query.fetch_objects(limit=1, include_vector=True)
first_movie = response.objects[0]

# Print each named vector defined on the collection ("title", then "overview").
for vector_name in ("title", "overview"):
    print(first_movie.vector[vector_name])

[-0.02900262549519539, -0.022994374856352806, 0.004147927742451429, -0.048591889441013336, 0.0062514725141227245, 0.000705427082721144, -0.03144799545407295, -0.01752515882253647, -0.009801204316318035, -0.03428778424859047, -0.0021561335306614637, 0.012700152583420277, 0.026136545464396477, -0.017564600333571434, 0.024203913286328316, 0.00413478072732687, 0.025965632870793343, -0.012240001931786537, 0.018879316747188568, -0.03465590253472328, -0.007027154788374901, 0.0036943508312106133, -0.009367348626255989, -0.024072442203760147, 0.009702601470053196, 0.003671343205496669, 0.006044405046850443, -0.032473474740982056, 0.005101096350699663, -0.01186530850827694, 0.01668374054133892, -0.02152189426124096, -0.006557144224643707, -0.010241634212434292, -0.014422429725527763, -0.015776587650179863, -0.008170957677066326, -0.02074621245265007, 0.00976176280528307, -0.002880870597437024, 0.0058274767361581326, -0.007796263322234154, -0.021916309371590614, -0.027056846767663956, -0.02766161

In [72]:
# 162 is the TMDB id of the first record in the dataset ("Edward Scissorhands").
# The import derived each object's UUID from its TMDB id, so the same call
# reproduces that object's UUID here.
movie_uuid = generate_uuid5(162)
response = movies.query.fetch_object_by_id(movie_uuid, include_vector=True)

# `response.vector` is keyed by vector name (response.vector.keys() lists them).
for vector_name in ("title", "overview"):
    print(response.vector[vector_name])


[-0.020382605493068695, -0.03982871398329735, -0.008249446749687195, -0.03167567029595375, 0.0047513507306575775, 0.02065804786980152, -0.02433517947793007, -0.028370384126901627, -0.005002690479159355, -0.019597601145505905, 0.007161456160247326, 0.015865379944443703, -0.0009700674563646317, -0.008414710871875286, -0.0011310280533507466, 0.004417378921061754, 0.034512709826231, 8.916852493712213e-06, 0.015163006260991096, -0.022255603224039078, -0.0031865036580711603, 0.0023481312673538923, -0.0019625143613666296, -0.031372688710689545, 0.0026029138825833797, -0.000575843034312129, 0.015452219173312187, -0.03690904378890991, -0.005405522417277098, -0.01990058459341526, 0.023825613781809807, -0.028122488409280777, -0.018881455063819885, -0.01868864707648754, -0.027075814083218575, -0.015741432085633278, -0.007009963970631361, -0.01585160754621029, 0.007140798028558493, -0.000244883936829865, 0.013668741099536419, -0.022324463352560997, -0.017118634656071663, -0.02352263033390045, -0.01