## Weaviate 1.31 Enablement session

### Key features

- Shard movement between nodes
- MUVERA encoding algorithm for multi-vector embeddings
- Vectorizer changes
- HNSW snapshotting

### Connect to Weaviate

In [1]:
import dotenv

# Read the .env file into the process environment; the trailing expression
# shows load_dotenv's boolean return value as the cell output.
env_loaded = dotenv.load_dotenv(verbose=True)
env_loaded

True

In [2]:
import weaviate
import os

# Map each request header expected by a model provider integration to the
# environment variable that holds its API key.
api_key_env_vars = {
    "X-Cohere-Api-Key": "COHERE_API_KEY",
    "X-JinaAI-Api-Key": "JINAAI_API_KEY",
}

# Only forward keys that are actually set: os.getenv() returns None for a
# missing variable, and a None header value is not a valid header.
provider_headers = {
    header_name: os.environ[env_var]
    for header_name, env_var in api_key_env_vars.items()
    if os.environ.get(env_var)
}

missing_env_vars = [
    env_var for env_var in api_key_env_vars.values() if not os.environ.get(env_var)
]
if missing_env_vars:
    # Warn instead of failing: sections that do not use the missing provider
    # can still be run.
    print(f"Warning: API key environment variable(s) not set: {', '.join(missing_env_vars)}")

client = weaviate.connect_to_local(headers=provider_headers)

In [3]:
# Fetch the server metadata and show the version it reports.
server_meta = client.get_meta()
server_meta["version"]

'1.31.0-rc.0'

Check the cluster setup:

In [4]:
# List the cluster nodes (called without arguments, so no per-collection
# shard details are requested).
cluster_nodes = client.cluster.nodes()
cluster_nodes

[Node(git_hash='a6ede0c', name='node1', shards=None, stats=None, status='HEALTHY', version='1.31.0-rc.0'),
 Node(git_hash='a6ede0c', name='node2', shards=None, stats=None, status='HEALTHY', version='1.31.0-rc.0'),
 Node(git_hash='a6ede0c', name='node3', shards=None, stats=None, status='HEALTHY', version='1.31.0-rc.0')]

Pre-load some data:

In [None]:
import time

from weaviate.classes.config import Configure, Property, DataType

collection_name = "TempCollection"

# How long to wait for the nodes API to report the inserted objects, and how
# often to poll it.
MAX_WAIT_SECONDS = 300
POLL_INTERVAL_SECONDS = 10

# Remove any existing collection of the same name before creating it again.
client.collections.delete(collection_name)

client.collections.create(
    collection_name,
    properties=[
        Property(name="title", data_type=DataType.TEXT),
        Property(name="body", data_type=DataType.TEXT),
    ],
    vectorizer_config=[
        Configure.NamedVectors.text2vec_cohere(
            name="default",
            source_properties=["title", "body"],
        ),
    ],
    replication_config=Configure.replication(factor=2),  # Note - this is just a demo - do NOT use an RF of 2 in production - use an odd number of 3 or above
    sharding_config=Configure.sharding(
        desired_count=5  # To demonstrate sharding, we set it to an arbitrary high number (for our dataset size, anyway)
    )
)

c = client.collections.get(collection_name)

objects = [
    {"title": "Howl's Moving Castle", "body": "A fantasy novel by Diana Wynne Jones."},
    {"title": "The Hobbit", "body": "A fantasy novel by J.R.R. Tolkien."},
    {"title": "The Hitchhiker's Guide to the Galaxy", "body": "A science fiction novel by Douglas Adams."},
    {"title": "The Great Gatsby", "body": "A novel by F. Scott Fitzgerald."},
    {"title": "1984", "body": "A dystopian novel by George Orwell."},
    {"title": "To Kill a Mockingbird", "body": "A novel by Harper Lee."},
    {"title": "Pride and Prejudice", "body": "A novel by Jane Austen."},
    {"title": "The Catcher in the Rye", "body": "A novel by J.D. Salinger."},
    {"title": "The Lord of the Rings", "body": "A fantasy novel by J.R.R. Tolkien."},
    {"title": "Brave New World", "body": "A dystopian novel by Aldous Huxley."},
    {"title": "Fahrenheit 451", "body": "A dystopian novel by Ray Bradbury."},
    {"title": "The Picture of Dorian Gray", "body": "A novel by Oscar Wilde."},
]

c.data.insert_many(objects)

start_time = time.time()
print("Waiting for object count to update...")

while True:
    nodes = client.cluster.nodes(collection=collection_name, output="verbose")

    # Look at every shard on every node. With 12 objects spread over 5 shards,
    # any single shard may legitimately stay empty, so checking only the first
    # shard of the first node could wait forever.
    populated = next(
        (
            (node, shard)
            for node in nodes
            for shard in (node.shards or [])
            if shard.object_count != 0
        ),
        None,
    )
    if populated is not None:
        n, s = populated
        print(f"On node {n.name} and shard {s.name} - obj count: {s.object_count}")
        break

    # Give up with a clear error rather than hanging the notebook indefinitely.
    if time.time() - start_time >= MAX_WAIT_SECONDS:
        raise TimeoutError(
            f"No shard of {collection_name} reported any objects within {MAX_WAIT_SECONDS} seconds"
        )

    time.sleep(POLL_INTERVAL_SECONDS)
    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time:.1f}s")

finish_time = time.time()
print(f"Time taken for obj count to update: {finish_time - start_time} seconds")

In [None]:
# Show how many objects the collection currently holds.
object_total = len(c)
object_total

## Move shards

In [None]:
from IPython.display import Image, display

# Display width shared by the shard diagrams in this section.
img_w = 600

# First diagram of the shard-movement walkthrough.
shards_diagram_1 = Image('./assets/shards-1.png', width=img_w)
display(shards_diagram_1)

In [None]:
# Second diagram of the shard-movement walkthrough.
shards_diagram_2 = Image('./assets/shards-2.png', width=img_w)
display(shards_diagram_2)

In [None]:
# Third diagram of the shard-movement walkthrough.
shards_diagram_3 = Image('./assets/shards-3.png', width=img_w)
display(shards_diagram_3)

In [None]:
# Fourth diagram of the shard-movement walkthrough.
shards_diagram_4 = Image('./assets/shards-4.png', width=img_w)
display(shards_diagram_4)

In [None]:
# Fetch per-node shard details for the collection; `nodes_response` is reused
# by the shard-selection cell that follows.
nodes_response = client.cluster.nodes(collection=collection_name, output="verbose")

# Print one header line per node, then one line per shard it hosts.
for node in nodes_response:
    shard_total = len(node.shards)
    print(f"\nNode {node.name} has {shard_total} shards")
    for shard in node.shards:
        shard_name, shard_objects = shard.name, shard.object_count
        print(f"Shard {shard_name} has {shard_objects} objects from {collection_name}")

In [None]:
# Pick a shard that is hosted on node1 but not on node2.
# NOTE(review): presumably a shard can only be moved to a node that does not
# already hold a replica of it - confirm against the replication docs.

# Look the nodes up by name rather than by list position, so the selection
# matches the node names used in the move request regardless of response order.
nodes_by_name = {node.name: node for node in nodes_response}
source_node = nodes_by_name["node1"]
destination_node = nodes_by_name["node2"]

# Computed once: the destination's shard names do not change during the search.
destination_shard_names = {shard.name for shard in destination_node.shards}

candidate_shard = next(
    (
        shard
        for shard in source_node.shards
        if shard.name not in destination_shard_names
    ),
    None,
)

# Fail with an explicit message instead of an AttributeError on None below.
if candidate_shard is None:
    raise RuntimeError(
        f"Every shard on {source_node.name} is already present on {destination_node.name}; "
        "there is no shard to move"
    )

print(f"Candidate shard to move: {candidate_shard.name}")

SHARD_ID = candidate_shard.name

In [None]:
import requests
import json

# Shard move parameters
SOURCE_NODE = "node1"
DESTINATION_NODE = "node2"

# Replication endpoint of the local Weaviate instance, and a cap on how long a
# single HTTP call may block the notebook.
REPLICATION_URL = "http://localhost:8080/v1/replication/replicate"
REQUEST_TIMEOUT_SECONDS = 30

# Create the request payload
payload = {
    "sourceNodeName": SOURCE_NODE,  # use the constant rather than a second hard-coded "node1"
    "destinationNodeName": DESTINATION_NODE,
    "collectionId": collection_name,
    "shardId": SHARD_ID,
    "transferType": "MOVE"  # Use "MOVE" to relocate the shard, or "COPY" to replicate it
}

# Set up the headers
headers = {
    "Content-Type": "application/json"
}

# Make the API request
response = requests.post(
    REPLICATION_URL,
    headers=headers,
    data=json.dumps(payload),
    timeout=REQUEST_TIMEOUT_SECONDS,
)
# Stop here on an HTTP error; otherwise the status lookup below would be sent
# for a non-existent operation.
response.raise_for_status()

operation_id = response.json().get("id")
if operation_id is None:
    raise RuntimeError(f"Replication request returned no operation id: {response.text}")

# Check for the status of the operation
response = requests.get(
    f"{REPLICATION_URL}/{operation_id}",
    headers=headers,
    timeout=REQUEST_TIMEOUT_SECONDS,
)
response.raise_for_status()

# Decode the body once and reuse it for both prints.
operation_status = response.json()
print(operation_status)
print("Shard move operation status:")
print(f'Status: {operation_status["status"]["state"]}')

In [None]:
# Fifth diagram of the shard-movement walkthrough.
shards_diagram_5 = Image('./assets/shards-5.png', width=img_w)
display(shards_diagram_5)

Check the status to see if it's finished:

In [None]:
# Query the replication operation started earlier again to see its current state.
response = requests.get(
    f"http://localhost:8080/v1/replication/replicate/{operation_id}",
    headers=headers,
    timeout=30,  # do not let a stalled request hang the notebook
)
# Surface HTTP errors directly instead of failing on a missing key below.
response.raise_for_status()

# Decode the body once and reuse it for both prints.
operation_status = response.json()
print(operation_status)
print("Shard move operation status:")
print(f'Status: {operation_status["status"]["state"]}')

In [None]:
# Fetch the per-node shard layout again and print it.
current_nodes = client.cluster.nodes(collection=collection_name, output="verbose")

for node in current_nodes:
    shard_total = len(node.shards)
    print(f"\nNode {node.name} has {shard_total} shards")
    for shard in node.shards:
        shard_name, shard_objects = shard.name, shard.object_count
        print(f"Shard {shard_name} has {shard_objects} objects from {collection_name}")

## MUVERA

Note: make sure the Weaviate Python client is installed from this branch:

`uv pip install git+https://github.com/weaviate/weaviate-python-client.git@dev/1.31`

In [None]:
from IPython.display import Image, display

# Display width used for the multi-vector diagrams (rebinds the earlier value).
img_w = 800

mv_diagram_1 = Image('./assets/mv_explained1.png', width=img_w)
display(mv_diagram_1)

In [None]:
from IPython.display import Image, display

# Same display width as the previous multi-vector diagram.
img_w = 800

mv_diagram_2 = Image('./assets/mv_explained2.png', width=img_w)
display(mv_diagram_2)

In [None]:
from weaviate.classes.config import Configure, Property, DataType

collection_name = "TempCollection"

# Remove any existing collection of the same name before creating it again.
client.collections.delete(collection_name)

# Both properties are plain text fields.
text_properties = [
    Property(name=field_name, data_type=DataType.TEXT)
    for field_name in ("title", "body")
]

# HNSW index with multi-vector support switched on, using the default settings.
# NOTE(review): no encoding is configured here, so this may not actually
# enable MUVERA - confirm against the client docs for this section.
multi_vector_index = Configure.VectorIndex.hnsw(
    multi_vector=Configure.VectorIndex.MultiVector.multi_vector()
)

# A single named vector, "custom_mv", built from title and body.
colbert_vector = Configure.NamedVectors.text2colbert_jinaai(
    name="custom_mv",
    source_properties=["title", "body"],
    vector_index_config=multi_vector_index,
)

client.collections.create(
    collection_name,
    properties=text_properties,
    vectorizer_config=[colbert_vector],
    replication_config=Configure.replication(factor=3),
)

In [None]:
# Three sample books as (title, body) pairs, expanded into the property dicts
# that the import cell consumes.
objects = [
    {"title": book_title, "body": book_body}
    for book_title, book_body in (
        ("Howl's Moving Castle", "A fantasy novel by Diana Wynne Jones."),
        ("The Hobbit", "A fantasy novel by J.R.R. Tolkien."),
        ("The Hitchhiker's Guide to the Galaxy", "A science fiction novel by Douglas Adams."),
    )
]

In [None]:
c = client.collections.get(collection_name)

# Import the objects through a fixed-size batch; the context manager sends
# whatever is still queued when the block exits.
with c.batch.fixed_size(200) as batch:
    for obj in objects:
        batch.add_object(
            properties=obj,
        )

# Objects that fail during a batch import are collected rather than raised,
# so report them explicitly instead of letting the later query silently
# return fewer results.
failed_objects = c.batch.failed_objects
if failed_objects:
    print(f"{len(failed_objects)} object(s) failed to import. First failure: {failed_objects[0]}")

In [None]:
# Run a near_text search with a free-text query and print the properties of
# each returned object.
query_text = "a bunch of friends travelling around unseen places"
r = c.query.near_text(query=query_text)

for found in r.objects:
    print(found.properties)

## Vectorizer changes

Note: make sure the Weaviate Python client is installed from this branch:

`uv pip install git+https://github.com/weaviate/weaviate-python-client.git@1.31/support-adding-vectors`

In [5]:
from weaviate.classes.config import Configure, Property, DataType

collection_name = "TempCollection"

# Remove any existing collection of the same name before creating it again.
client.collections.delete(collection_name)

# Both properties are plain text fields.
text_properties = [
    Property(name=field_name, data_type=DataType.TEXT)
    for field_name in ("title", "body")
]

# Two Cohere named vectors: "default" over title and body, "new_title" over
# the title only.
named_vectors = [
    Configure.NamedVectors.text2vec_cohere(
        name=vector_name,
        source_properties=source_fields,
    )
    for vector_name, source_fields in (
        ("default", ["title", "body"]),
        ("new_title", ["title"]),
    )
]

client.collections.create(
    collection_name,
    properties=text_properties,
    vectorizer_config=named_vectors,
)

/workspaces/weaviate_enablement_1_31/.venv/lib/python3.11/site-packages/weaviate/collections/classes/config.py:1950: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  for cls_field in self.model_fields:


<weaviate.collections.collection.sync.Collection at 0xffff8ca443d0>

In [6]:
c = client.collections.get(collection_name)

# Read the collection config and list the names of its configured vectors.
collection_config = c.config.get()
cc = collection_config.vector_config.keys()

print(cc)

dict_keys(['default', 'new_title'])


In [7]:
# Define a third named vector, "body_only", over the body property and add it
# to the existing collection.
body_only_vector = Configure.NamedVectors.text2vec_cohere(
    name="body_only",
    source_properties=["body"],
)

c.config.add_vector(vector_config=body_only_vector)

In [8]:
c = client.collections.get(collection_name)

# List the configured vector names again, after add_vector has been called.
collection_config = c.config.get()
cc = collection_config.vector_config.keys()

print(cc)

dict_keys(['body_only', 'default', 'new_title'])


In [24]:
c = client.collections.get(collection_name)

# Insert the sample objects; the returned batch result is the cell output.
# NOTE(review): `objects` comes from an earlier cell. The saved output lists
# 12 UUIDs, while the most recent definition above has 3 entries - confirm
# which list is intended on a fresh Run All.
insert_result = c.data.insert_many(objects)
insert_result

BatchObjectReturn(_all_responses=[UUID('35822527-fc71-4a4c-a479-63314737188d'), UUID('d9ac0256-9501-4b24-858f-08aff9e4cde0'), UUID('0fbc5184-0939-4905-9459-410e2b10005d'), UUID('b336ead0-966d-4874-92df-e6a8104922f4'), UUID('98803a7b-b18f-44dd-b575-d241d520b2bb'), UUID('b12411ec-8177-4d9c-b231-10d8e99359b4'), UUID('13e0bc53-046c-4a7c-955c-d2c4cd5e38db'), UUID('7d6aa81b-7bcb-49ba-8b32-4fa05393f873'), UUID('ae75c2f6-91da-44ff-b15a-4946f67f1613'), UUID('a9673f60-516e-416c-856e-24607546a1ea'), UUID('9e7ce9e1-902f-485c-8704-0ad64805cd0e'), UUID('b51367ee-b66f-4471-b181-af741ecfea2e')], elapsed_seconds=0.3197033405303955, errors={}, uuids={0: UUID('35822527-fc71-4a4c-a479-63314737188d'), 1: UUID('d9ac0256-9501-4b24-858f-08aff9e4cde0'), 2: UUID('0fbc5184-0939-4905-9459-410e2b10005d'), 3: UUID('b336ead0-966d-4874-92df-e6a8104922f4'), 4: UUID('98803a7b-b18f-44dd-b575-d241d520b2bb'), 5: UUID('b12411ec-8177-4d9c-b231-10d8e99359b4'), 6: UUID('13e0bc53-046c-4a7c-955c-d2c4cd5e38db'), 7: UUID('7d6aa81

In [13]:
# Fetch a single object together with its vectors.
r = c.query.fetch_objects(limit=1, include_vector=True)

# For each named vector on that object, print its name and first three values.
first_object = r.objects[0]
for vector_name, vector_values in first_object.vector.items():
    print(vector_name)
    print(vector_values[:3])

default
[-0.02838134765625, 0.0147247314453125, -0.0009617805480957031]
new_title
[0.00046539306640625, 0.0167388916015625, 0.0046539306640625]
body_only
[-0.034698486328125, 0.016387939453125, -0.00408172607421875]


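### The order matters

Objects inserted before a vector is added do not get that vector. Here the objects are inserted first and `body_only` is added afterwards, so the fetched object at the end only has the `default` and `new_title` vectors.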

In [14]:
from weaviate.classes.config import Configure, Property, DataType

collection_name = "TempCollection"

# Remove any existing collection of the same name before creating it again.
client.collections.delete(collection_name)

title_property = Property(name="title", data_type=DataType.TEXT)
body_property = Property(name="body", data_type=DataType.TEXT)

# "default" is built from title and body; "new_title" from the title only.
default_vector = Configure.NamedVectors.text2vec_cohere(
    name="default",
    source_properties=["title", "body"],
)
title_vector = Configure.NamedVectors.text2vec_cohere(
    name="new_title",
    source_properties=["title"],
)

client.collections.create(
    collection_name,
    properties=[title_property, body_property],
    vectorizer_config=[default_vector, title_vector],
)

/workspaces/weaviate_enablement_1_31/.venv/lib/python3.11/site-packages/weaviate/collections/classes/config.py:1950: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  for cls_field in self.model_fields:


<weaviate.collections.collection.sync.Collection at 0xffff68d41d50>

In [None]:
c = client.collections.get(collection_name)

# Insert the sample objects before the extra vector is added; the returned
# batch result is the cell output.
# NOTE(review): `objects` comes from an earlier cell - the saved output lists
# 12 UUIDs, so confirm which list is in scope on a fresh Run All.
insert_result = c.data.insert_many(objects)
insert_result

BatchObjectReturn(_all_responses=[UUID('1fafb5b0-521b-4605-bcfd-f17872cae841'), UUID('ff23d516-9e44-411b-84cc-c506ef34befb'), UUID('e64523fb-c6b4-4d0a-b23b-ad39e0abcffe'), UUID('ab9bd7e9-abf3-4d9d-90fd-ed83072262ac'), UUID('b919b6d5-cdb9-4ade-b83b-d09c44bb0c04'), UUID('101447da-de68-4801-82e8-707b87976112'), UUID('c718489e-3b84-4d1c-a5cb-9d596ddf72b8'), UUID('2dad3af8-286a-4ddc-b5e7-98ac7e8ca5cd'), UUID('64ef1772-f089-4751-a519-7fec8feea325'), UUID('290b926f-9ecb-46c8-b00e-07750a73f748'), UUID('a70861ee-597f-40b7-8383-1246e5154425'), UUID('b391f3d6-c082-4f28-8e09-8a783421634a')], elapsed_seconds=0.18357324600219727, errors={}, uuids={0: UUID('1fafb5b0-521b-4605-bcfd-f17872cae841'), 1: UUID('ff23d516-9e44-411b-84cc-c506ef34befb'), 2: UUID('e64523fb-c6b4-4d0a-b23b-ad39e0abcffe'), 3: UUID('ab9bd7e9-abf3-4d9d-90fd-ed83072262ac'), 4: UUID('b919b6d5-cdb9-4ade-b83b-d09c44bb0c04'), 5: UUID('101447da-de68-4801-82e8-707b87976112'), 6: UUID('c718489e-3b84-4d1c-a5cb-9d596ddf72b8'), 7: UUID('2dad3a

In [16]:
# Add the "body_only" named vector now that the objects are already in the
# collection.
late_body_vector = Configure.NamedVectors.text2vec_cohere(
    name="body_only",
    source_properties=["body"],
)

c.config.add_vector(vector_config=late_body_vector)

In [17]:
# Fetch a single object together with its vectors.
r = c.query.fetch_objects(limit=1, include_vector=True)

# Print the name and first three values of every vector the object carries.
first_object = r.objects[0]
for vector_name, vector_values in first_object.vector.items():
    print(vector_name)
    print(vector_values[:3])

default
[-0.009490966796875, 0.0374755859375, -0.0396728515625]
new_title
[0.0052490234375, 0.035430908203125, 0.0004413127899169922]
