In [6]:
import os
from dotenv import load_dotenv, find_dotenv
# Read the local .env file, then pick up the Google AI Studio key from the environment
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)
# None when GOOGLE_AI_STUDIO_KEY is not set
AI_STUDIO_KEY = os.environ.get("GOOGLE_AI_STUDIO_KEY")

In [None]:
# Shell escape: asks the gcloud CLI to print an access token into the cell output.
# NOTE(review): the printed token is a credential - clear this cell's output before sharing or committing the notebook.
!gcloud auth print-access-token

In [7]:
import weaviate, os

# Connect to the local instance deployed with Docker Compose.
# Fail fast when the key is missing: os.getenv() yields None for an unset
# variable, and the header below would otherwise be built with a None value.
if not AI_STUDIO_KEY:
    raise RuntimeError(
        "GOOGLE_AI_STUDIO_KEY is not set; add it to your .env file or environment"
    )

client = weaviate.connect_to_local(
    headers={
        # Forwarded with every request so the server-side Google module can authenticate
        "X-Google-Api-Key": AI_STUDIO_KEY,
    }
)

# Displayed as the cell result: True once the instance reports ready
client.is_ready()

  client = weaviate.connect_to_local(


True

In [8]:
# Fetch the server metadata (hostname, enabled modules, ...) and display it
server_meta = client.get_meta()
server_meta

{'hostname': 'http://[::]:8080',
 'modules': {'generative-palm': {'documentationHref': 'https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts',
   'name': 'Generative Search - Google PaLM'},
  'multi2vec-clip': {'clip_model': {'_commit_hash': None,
    '_name_or_path': '/root/.cache/torch/sentence_transformers/sentence-transformers_clip-ViT-B-32/0_CLIPModel',
    'add_cross_attention': False,
    'architectures': ['CLIPModel'],
    'bad_words_ids': None,
    'begin_suppress_tokens': None,
    'bos_token_id': None,
    'chunk_size_feed_forward': 0,
    'cross_attention_hidden_size': None,
    'decoder_start_token_id': None,
    'diversity_penalty': 0,
    'do_sample': False,
    'early_stopping': False,
    'encoder_no_repeat_ngram_size': 0,
    'eos_token_id': None,
    'exponential_decay_length_penalty': None,
    'finetuning_task': None,
    'forced_bos_token_id': None,
    'forced_eos_token_id': None,
    'id2label': {'0': 'LABEL_0', '1': 'LABEL_1'},
    'init

## Create a multi-vector collection with Named Vectors

In [10]:
from weaviate.classes.config import Configure, Property, DataType

# Drop any previous "Movies" collection so this cell can be re-run from scratch
client.collections.delete("Movies")

# Scalar properties stored with every movie object (optional)
movie_properties = [
    Property(name=prop_name, data_type=prop_type)
    for prop_name, prop_type in (
        ("title", DataType.TEXT),
        ("overview", DataType.TEXT),
        ("rating", DataType.NUMBER),
        ("release_date", DataType.DATE),
        ("tmdb_id", DataType.INT),
        ("poster_url", DataType.TEXT),
    )
]

# One named vector per text field; each vector is built from its own
# property only (title -> "title", overview/summary -> "overview").
named_vectors = [
    Configure.NamedVectors.text2vec_palm(
        name=field,
        source_properties=[field],
        model_id="text-embedding-004",
        api_endpoint="generativelanguage.googleapis.com",
        project_id="devrel-projects",
    )
    for field in ("title", "overview")
]

# Create the collection; the returned collection object is the cell result
client.collections.create(
    name="Movies",
    properties=movie_properties,
    vectorizer_config=named_vectors,
)

<weaviate.collections.collection.Collection at 0x114bdaf10>

In [5]:
# ## VERTEX AI (disabled alternative): same "Movies" schema, but without the AI Studio api_endpoint override

# from weaviate.classes.config import Configure, Property, DataType

# client.collections.delete("Movies")

# client.collections.create(
#     name="Movies",  # The name of the collection ('NV' for named vectors)
#     properties=[ # optional
#         Property(name="title", data_type=DataType.TEXT),
#         Property(name="overview", data_type=DataType.TEXT),
#         Property(name="rating", data_type=DataType.NUMBER),
#         Property(name="release_date", data_type=DataType.DATE),
#         Property(name="tmdb_id", data_type=DataType.INT),
#         Property(name="poster_url", data_type=DataType.TEXT),
#     ],
#     # Define & configure the vectorizer module
#     vectorizer_config=[
#         # Vectorize the movie title
#         Configure.NamedVectors.text2vec_palm(
#             name="title",
#             source_properties=["title"],

#             project_id="devrel-projects",
#             model_id="text-embedding-preview-0409"
#         ),
#         # Vectorize the movie overview (summary)
#         Configure.NamedVectors.text2vec_palm(
#             name="overview",
#             source_properties=["overview"],

#             project_id="devrel-projects",
#             model_id="text-embedding-preview-0409"
#         ),
#     ],
#     # Define the generative module
#     # generative_config=Configure.Generative.palm("gpt-4"),
# )

<weaviate.collections.collection.Collection at 0x12c46fed0>

## Load data

In [20]:
import pandas as pd

# Read the movie dataset into a DataFrame and preview the first rows
movies_json_path = "./data/movies_data_1990_2024.json"
df = pd.read_json(movies_json_path)
df.head()

Unnamed: 0,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,/3Nn5BOM1EVw1IYrv6MsbOS6N1Ol.jpg,"[14, 18, 10749]",162,en,Edward Scissorhands,A small suburban town receives a visit from a ...,45.694,/1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg,1990-12-07,Edward Scissorhands,False,7.7,12305
1,/sw7mordbZxgITU877yTpZCud90M.jpg,"[18, 80]",769,en,GoodFellas,"The true story of Henry Hill, a half-Irish, ha...",57.228,/aKuFiU82s5ISJpGZp7YkIr3kCUd.jpg,1990-09-12,GoodFellas,False,8.5,12106
2,/6uLhSLXzB1ooJ3522ydrBZ2Hh0W.jpg,"[35, 10751]",771,en,Home Alone,Eight-year-old Kevin McCallister makes the mos...,3.538,/onTSipZ8R3bliBdKfPtsDuHTdlL.jpg,1990-11-16,Home Alone,False,7.4,10599
3,/vKp3NvqBkcjHkCHSGi6EbcP7g4J.jpg,"[12, 35, 878]",196,en,Back to the Future Part III,The final installment of the Back to the Futur...,28.896,/crzoVQnMzIrRfHtQw0tLBirNfVg.jpg,1990-05-25,Back to the Future Part III,False,7.5,9918
4,/3tuWpnCTe14zZZPt6sI1W9ByOXx.jpg,"[35, 10749]",114,en,Pretty Woman,When a millionaire wheeler-dealer enters a bus...,97.953,/hVHUfT801LQATGd26VPzhorIYza.jpg,1990-03-23,Pretty Woman,False,7.5,7671


In [13]:
from datetime import datetime, timezone

# Dry run: build the insert payload for the first few movies and print it,
# so the field mapping can be checked without printing the whole dataset.
PREVIEW_ROWS = 5

for movie in df.head(PREVIEW_ROWS).itertuples(index=False):

    # Convert the "YYYY-MM-DD" date string to `datetime` and add time zone information (UTC)
    release_date = datetime.strptime(movie.release_date, "%Y-%m-%d").replace(
        tzinfo=timezone.utc
    )

    movie_obj = {
        "title": movie.title,
        "overview": movie.overview,
        "rating": movie.vote_average,
        "release_date": release_date,
        "tmdb_id": movie.id, # https://www.themoviedb.org/movie/{tmdb_id}
        # Smaller poster variant, kept for reference:
        # "poster_url": f"https://image.tmdb.org/t/p/w300_and_h450_bestv2//{movie.poster_path}"
        "poster_url": f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
    }

    print(movie_obj)

{'title': 'Edward Scissorhands', 'overview': 'A small suburban town receives a visit from a castaway unfinished science experiment named Edward.', 'rating': 7.7, 'release_date': datetime.datetime(1990, 12, 7, 0, 0, tzinfo=datetime.timezone.utc), 'tmdb_id': 162, 'poster_url': 'https://image.tmdb.org/t/p/w600_and_h900_bestv2/1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg'}
{'title': 'GoodFellas', 'overview': 'The true story of Henry Hill, a half-Irish, half-Sicilian Brooklyn kid who is adopted by neighbourhood gangsters at an early age and climbs the ranks of a Mafia family under the guidance of Jimmy Conway.', 'rating': 8.5, 'release_date': datetime.datetime(1990, 9, 12, 0, 0, tzinfo=datetime.timezone.utc), 'tmdb_id': 769, 'poster_url': 'https://image.tmdb.org/t/p/w600_and_h900_bestv2/aKuFiU82s5ISJpGZp7YkIr3kCUd.jpg'}
{'title': 'Home Alone', 'overview': "Eight-year-old Kevin McCallister makes the most of the situation after his family unwittingly leaves him behind when they go on Christmas vacation. B

### Insert with Batch

In [14]:
from datetime import datetime, timezone
from weaviate.util import generate_uuid5

# Upper bound on how many movies this demo imports
MAX_MOVIES = 100

# Number of movies queued so far; stays 0 when the DataFrame is empty
counter = 0

movies = client.collections.get("Movies")
# NOTE(review): 50 is presumably the client's requests-per-minute limit - confirm against the Weaviate docs
with movies.batch.rate_limit(50) as batch:
    # head() caps the import up front, replacing the manual counter/break;
    # enumerate(start=1) keeps `counter` equal to the number of queued movies.
    for counter, movie in enumerate(
        df.head(MAX_MOVIES).itertuples(index=False), start=1
    ):
        print(f"Adding: {movie.title}")

        # Convert the "YYYY-MM-DD" date string to `datetime` and add time zone information (UTC)
        release_date = datetime.strptime(movie.release_date, "%Y-%m-%d").replace(
            tzinfo=timezone.utc
        )

        movie_obj = {
            "title": movie.title,
            "overview": movie.overview,
            "rating": movie.vote_average,
            "release_date": release_date,
            "tmdb_id": movie.id, # https://www.themoviedb.org/movie/{tmdb_id}
            # Smaller poster variant, kept for reference:
            # "poster_url": f"https://image.tmdb.org/t/p/w300_and_h450_bestv2//{movie.poster_path}"
            "poster_url": f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
        }

        # The UUID is derived from the TMDB id, so the same movie always maps to the same UUID
        batch.add_object(
            properties=movie_obj,
            uuid=generate_uuid5(movie.id)
        )

### Check for batch errors

In [16]:
# Report every object the batch import recorded as failed
failed_objects = movies.batch.failed_objects
if not failed_objects:
    print("no errors")
else:
    print(f"Failed to import {len(failed_objects)} objects")
    for failure in failed_objects:
        print(f"e.g. Failed to import object with error: {failure.message}")

no errors


### Check object count

In [17]:
# Aggregate over the whole collection; the displayed result includes the total object count
collection_stats = movies.aggregate.over_all()
collection_stats

AggregateReturn(properties={}, total_count=100)

## Show vectors

In [19]:
# Fetch a single object together with its vectors and print each named vector
response = movies.query.fetch_objects(limit=1, include_vector=True)
named_vectors = response.objects[0].vector
for label, vector_name in (("Title:   ", "title"), ("Overview:", "overview")):
    print(label, named_vectors[vector_name])

Title:    [-0.03288900479674339, -0.015490553341805935, -0.008414356037974358, 0.008939075283706188, 0.03340843692421913, 0.009543688036501408, -0.004420144017785788, -0.0044501591473817825, -0.02668086253106594, 0.05094952881336212, -0.004626845475286245, -0.02242645062506199, 0.10951125621795654, 0.02134266123175621, -0.038802627474069595, -0.049608152359724045, 0.010832327418029308, 0.05397211015224457, -0.09020847827196121, 0.004810565151274204, 0.04851146787405014, -0.019263427704572678, 0.012544473633170128, -0.0024711452424526215, -0.029073620215058327, 0.03631407022476196, -0.03263832628726959, 0.009506462141871452, -0.009402542375028133, 0.008104495704174042, 0.026238620281219482, 0.06721949577331543, 0.0518953800201416, -0.05423256754875183, 0.024582834914326668, -0.001719817635603249, -0.03197462856769562, 0.019139355048537254, 0.04255462810397148, -0.03413970023393631, -0.11023177951574326, 0.002852730918675661, -0.010251485742628574, 0.04499778151512146, -0.023707961663603

In [None]:
# Close the client that was created with weaviate.connect_to_local at the top of the notebook
client.close()