# Collection setup and data load

## Get keys and urls

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_KEY = os.getenv("WEAVIATE_KEY")
JINAAI_API_KEY = os.getenv("JINAAI_API_KEY")

print(WEAVIATE_URL)
print(WEAVIATE_KEY)
print(JINAAI_API_KEY)

if(WEAVIATE_URL == "UPDATE_ME_WEAVIATE_URL"):
    raise Exception("Please update .env and Restart the notebook (see Restart button, next to Run All)")

https://vm7vt6fttjqqoh2riw9mvg.c0.europe-west3.gcp.weaviate.cloud
6VxSJTcUZ7GY0l986XYFTt3mkEMeIm7F0pNc
jina_0dbcfcbf17fe431aa4e4ebb2d7e31a6caI4V-9CeQnNfQcpnqys_hQzR4f3y


## Connect to Weaviate

You need to pass in your OpenAI key, which will be used to vectorise your data.

In [37]:
import weaviate
from weaviate.classes.init import Auth
# from weaviate.classes.init import AdditionalConfig, Timeout

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_KEY),

    headers = {
        "X-JinaAI-Api-Key": JINAAI_API_KEY,
    },
)

client.is_ready()

True

## Create a collection with a vectorizer

* [Weaviate Docs - collection creation and configuration](https://weaviate.io/developers/weaviate/manage-data/collections)
* ~~ [OpenAI integrated embedding models](https://weaviate.io/developers/weaviate/model-providers/openai/embeddings) ~~
* https://weaviate.io/developers/weaviate/model-providers/jinaai/embeddings

Examples of other embedding models:
* [Cohere](https://weaviate.io/developers/weaviate/model-providers/cohere/embeddings)
* [HuggingFace ðŸ¤—](https://weaviate.io/developers/weaviate/model-providers/huggingface/embeddings)
* [Ollama (self-hosted)](https://weaviate.io/developers/weaviate/model-providers/ollama/embeddings)

In [13]:
from weaviate.classes.config import Configure

if client.collections.exists("Jeopardy"):
    client.collections.delete("Jeopardy")

# Create a collection - with OpenAI vectorizer
client.collections.create(
    name="Jeopardy",
    vectorizer_config=[
        Configure.NamedVectors.text2vec_jinaai(
            name="title_text_vector",
            source_properties=['title', 'text'],
            model="jina-embeddings-v3",
        )
    ]
)

<weaviate.collections.collection.sync.Collection at 0x79b589b23e90>

## Import data
### Sample Data

In [14]:
import json

with open("./jeopardy_tiny.json") as file:
    data_10 = json.load(file)

print(json.dumps(data_10[0:2], indent=2))

[
  {
    "Category": "SCIENCE",
    "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
    "Answer": "Liver"
  },
  {
    "Category": "ANIMALS",
    "Question": "It's the only living mammal in the order Proboseidea",
    "Answer": "Elephant"
  }
]


### Insert Many

> `insert_many` is only used for inserting small batches of data - must complete within the timeout.

[Weaviate Docs - insert many](https://weaviate.io/developers/weaviate/manage-data/import)

In [16]:
# Insert data

# TODO: get Jeopardy collection
# TODO: insert data_10

print(len(data_10))
jeopardy = client.collections.get("Jeopardy")
jeopardy.data.insert_many(data_10)

10


BatchObjectReturn(_all_responses=[UUID('ea5fa22f-9fdb-445d-b63d-1840f4bf8a0d'), UUID('b4780ddc-0e1c-468f-88c9-c2a49ee0060a'), UUID('03a77530-418b-49af-92a4-bd44f9016be9'), UUID('f29e76b8-4ea1-4f22-90d4-077e279008a1'), UUID('031dd656-327f-4355-a575-cee75f689af3'), UUID('06d40782-85a0-4c76-8e9f-921fac539f1f'), UUID('3b85e4a8-649c-47d3-9438-5ffa3691bebb'), UUID('4eb88341-5ea9-42c8-92e0-84c5dacc5b63'), UUID('3233e458-e2c8-4b84-b4b6-7dec8b261d7c'), UUID('67b64db9-1445-40ad-b15a-6253e2f9f907')], elapsed_seconds=1.2704830169677734, errors={}, uuids={0: UUID('ea5fa22f-9fdb-445d-b63d-1840f4bf8a0d'), 1: UUID('b4780ddc-0e1c-468f-88c9-c2a49ee0060a'), 2: UUID('03a77530-418b-49af-92a4-bd44f9016be9'), 3: UUID('f29e76b8-4ea1-4f22-90d4-077e279008a1'), 4: UUID('031dd656-327f-4355-a575-cee75f689af3'), 5: UUID('06d40782-85a0-4c76-8e9f-921fac539f1f'), 6: UUID('3b85e4a8-649c-47d3-9438-5ffa3691bebb'), 7: UUID('4eb88341-5ea9-42c8-92e0-84c5dacc5b63'), 8: UUID('3233e458-e2c8-4b84-b4b6-7dec8b261d7c'), 9: UUID('6

### Data preview

In [17]:
# TODO: fetch 4 objects
# response = jeopardy

# Show data preview
jeopardy = client.collections.get("Jeopardy")
response = jeopardy.query.fetch_objects(limit=4)

for item in response.objects:
    print(item.uuid, item.properties)

031dd656-327f-4355-a575-cee75f689af3 {'answer': 'the diamondback rattler', 'question': 'Heaviest of all poisonous snakes is this North American rattlesnake', 'category': 'ANIMALS'}
03a77530-418b-49af-92a4-bd44f9016be9 {'answer': 'the nose or snout', 'question': 'The gavial looks very much like a crocodile except for this bodily feature', 'category': 'ANIMALS'}
06d40782-85a0-4c76-8e9f-921fac539f1f {'answer': 'species', 'question': "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification", 'category': 'SCIENCE'}
3233e458-e2c8-4b84-b4b6-7dec8b261d7c {'answer': 'the atmosphere', 'question': 'Changes in the tropospheric layer of this are what gives us weather', 'category': 'SCIENCE'}


In [19]:
# Show data preview - with vectors
response = jeopardy.query.fetch_objects(
    limit=4,
    include_vector=True       # TODO: add include_vectors
)

for item in response.objects:
    print(item.properties)
    print(item.vector, '\n')

{'answer': 'the diamondback rattler', 'question': 'Heaviest of all poisonous snakes is this North American rattlesnake', 'category': 'ANIMALS'}
{'title_text_vector': [0.1481798142194748, -0.14293478429317474, 0.1506766974925995, 0.09442616254091263, 0.03379036858677864, -0.043801456689834595, 0.013204239308834076, 0.09323267638683319, -0.037268731743097305, 0.0012786220759153366, 0.008860794827342033, 0.11489588767290115, -0.13880471885204315, -0.022163765504956245, -0.08210663497447968, 0.06733733415603638, -0.053612325340509415, 0.1067613959312439, -0.13293156027793884, -0.09231401234865189, -0.03749643638730049, 0.07492220401763916, -0.0694730281829834, 0.09556467086076736, -0.02660495974123478, 0.004988859407603741, 0.029112637042999268, 0.0467282272875309, -0.08346499502658844, 0.017474256455898285, 0.1327902227640152, 0.016182629391551018, 0.026007238775491714, -0.07911508530378342, -0.028429526835680008, 0.031421076506376266, -0.023308169096708298, -0.07432545721530914, -0.04972

### Super quick query example

In [20]:
# TODO: add near text query, search for African animals with limit 2
# response = jeopardy.query
response = jeopardy.query.near_text("african animals", limit=2)

for item in response.objects:
    print(item.properties)

{'answer': 'Liver', 'question': 'This organ removes excess glucose from the blood & stores it as glycogen', 'category': 'SCIENCE'}
{'answer': 'Antelope', 'question': 'Weighing around a ton, the eland is the largest species of this animal in Africa', 'category': 'ANIMALS'}


## A bit bigger example - 10k objects

### Load data

In [21]:
import json

with open("./wiki-10k.json") as file:
    data_10k = json.load(file)

print(json.dumps(data_10k[0:2], indent=2))

[
  {
    "text": "At this point in the siege, Lee's army had strengthened the Petersburg line. They dug breastworks out of rifle pits. At night, with pick and shovel, they then turned the breastworks into  deep trenches. Pointed stakes turned outwards were designed to break up any frontal attacks. The area between the two lines became a no man's land. The summer that year was hot and dry. Streams and springs were quickly drying up causing a water shortage on both sides. The siege was quickly becoming a stalemate.",
    "title": "Siege of Petersburg",
    "url": "https://simple.wikipedia.org/wiki/Siege%20of%20Petersburg",
    "wiki_id": "20231101.simple_550339_9"
  },
  {
    "text": "1944  Holocaust: Anne Frank and her family are placed on the last transport train from the Westerbork transit camp to Auschwitz.",
    "title": "September 3",
    "url": "https://simple.wikipedia.org/wiki/September%203",
    "wiki_id": "20231101.simple_8532_17"
  }
]


### Create a collection with Named Vectors and SourceProperties

In [26]:
from weaviate.classes.config import Configure, Property, DataType

def create_wiki_collection():
    if client.collections.exists("Wiki"):
        client.collections.delete("Wiki")

    # Create a collection with JinaAI vectorizer and defined source properties
    client.collections.create(
        name="Wiki",

        # NOTE: using NamedVectors here
        vectorizer_config=[
            Configure.NamedVectors.text2vec_jinaai(
                name="title_text_vector",
                source_properties=['title', 'text'],
                model="jina-embeddings-v3",
            )
        ],

        # Example: how to define property schema (Optional)
        properties=[  
            Property(name="title", data_type=DataType.TEXT),
            Property(name="text", data_type=DataType.TEXT),
        #     Property(name="url", data_type=DataType.TEXT),
        #     Property(name="wiki_id", data_type=DataType.TEXT),
        ],
    )

create_wiki_collection()

### Import data - 10k objects with Batch

Batch speeds up the import process by grouping objects to be added in bigger batch groups.

Batch creates an internal buffer to collect objects to be added.<br>
Each time the buffer count reaches `batch_size`, batch sends the new objects to Weaviate.

Types of batch:
* `dynamic` - let batch calculate the optimal batch_size based on detected latency
* `fixed_size` - provide a fixed batch_size
* `rate_limit` - limit the number of requests (per minute), useful for working with models with a rate limit

### Take 1 â€“ import sample 100

In [27]:
from tqdm import tqdm

sample_100 = data_10k[0:100]

wiki = client.collections.get("Wiki")

# TODO: setup dynamic batch
# loop through the sample_100 data
# add each object to the batch
with wiki.batch. dynamic() as batch: # fixed_size(10) as batch: 
    for item in tqdm(sample_100):
        batch.add_object(item)

print(f"Wiki count: {len(wiki)}")

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [00:00<00:00, 7772.27it/s]




Wiki count: 100


In [28]:
# check for errors
if(len(wiki.batch.failed_objects)>0):
    print("Import complete with errors")
    for err in wiki.batch.failed_objects:
        print(err)
else:
    print("Import complete with no errors")

Import complete with no errors


### Take 2 â€“ import sample 100 â€“ with UUID

To avoid inserting duplicates, you can generate a UUID based on the whole object or a unique property.

In [29]:
from weaviate.util import generate_uuid5

print(generate_uuid5("This UUID is always the same"))
print(generate_uuid5("This UUID is always the same"))
print(generate_uuid5("This UUID is always the same"))
print("====================================")

print(generate_uuid5("This UUID is different"))
print(generate_uuid5("This UUID is different"))
print("====================================")

obj1 = { "title": "this is an object", "count": 1 }
obj2 = { "title": "this is an object", "count": 2 }
print(generate_uuid5(obj1))
print(generate_uuid5(obj2))


8d3441c0-c1d1-5859-8a5e-efce9e7d3bd8
8d3441c0-c1d1-5859-8a5e-efce9e7d3bd8
8d3441c0-c1d1-5859-8a5e-efce9e7d3bd8
09f975a6-0e62-565a-982e-e6ce148eac86
09f975a6-0e62-565a-982e-e6ce148eac86
c3c3ad32-fa65-5944-a021-415f8fda02af
4d0b77d3-4862-59bc-bf9f-9fe2b9bf89f0


In [30]:
# recreate the collection to start again
create_wiki_collection()

> Rerun the import script multiple times.

> Starting from the second run, the script should finish a lot faster, and the wiki count shouldn't increase.

In [31]:
from tqdm import tqdm
from weaviate.util import generate_uuid5

sample_100 = data_10k[0:100]

wiki = client.collections.get("Wiki")

with wiki.batch.dynamic() as batch: # fixed_size(batch_size=20, concurrent_requests=2) as batch:
    for item in tqdm(sample_100):
        # TODO: generate an id from item["wiki_id"]
        id = generate_uuid5(item["wiki_id"]) # why not use wiki id directly

        #batch.add_object(
        #    item,
        #    # TODO: provide the new id here 
        #    # uuid=
        # )

        batch.add_object(
            item,
            uuid=id
        )

print(f"Wiki count: {len(wiki)}")

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [00:00<00:00, 7463.97it/s]




Wiki count: 100


### Take 2 - import the rest of the data - but break if multiple errors

In [32]:
from tqdm import tqdm
from weaviate.util import generate_uuid5

wiki = client.collections.get("Wiki")

with wiki.batch.dynamic() as batch: # fixed_size(batch_size=2000, concurrent_requests=2) as batch:
    for item in tqdm(data_10k):
        id = generate_uuid5(item["wiki_id"])
        batch.add_object(item, uuid=id)

        # Check number of errors while running
        if(batch.number_errors > 10):
            print("Errors during batch import")
            break

  0%|          | 0/10000 [00:00<?, ?it/s]

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 10000/10000 [00:24<00:00, 410.46it/s]


### Check for errors

In [33]:
if(len(wiki.batch.failed_objects)>0):
    print("Import complete with errors")
    for err in wiki.batch.failed_objects:
        print(err)
else:
    print("Import complete with no errors")

Import complete with no errors


## Bonus - iterate through all collection data

The client has a built-in function that allows you to iterate through all collection data.

In [34]:
wiki = client.collections.get("Wiki")

counter = 100

for item in wiki.iterator():
    print(item.properties)

    if (counter == 0): break
    
    counter -= 1

{'text': "On October 31, 2000 Stankonia was released. It entered the Billboard 200 at number two after selling over 530,000 copies in its first week. Outkast's first greatest hits album Big Boi and Dre Present... Outkast was released on December 4, 2001.", 'title': 'Outkast', 'wiki_id': '20231101.simple_431000_4', 'url': 'https://simple.wikipedia.org/wiki/Outkast'}
{'text': '3rd Class Cities - When a city incorporates, it becomes a 3rd class city. To incorporate, a city must generally have at least 300 people living there.', 'title': 'List of locations in Kansas', 'wiki_id': '20231101.simple_300080_2', 'url': 'https://simple.wikipedia.org/wiki/List%20of%20locations%20in%20Kansas'}
{'text': "Seventh Son (1987) is an alternate history and fantasy book written by Orson Scott Card. It is the first book in Card's The Tales of Alvin Maker series about Alvin Miller, the seventh son of a seventh son, who therefore has special powers. It was nominated for both the Hugo Award for Best Novel and 

You can also get `vector embeddings`, by using `include_vector`.

In [35]:
counter = 10

for item in wiki.iterator(include_vector=True):
    print(item.properties)
    print(item.vector)

    if (counter == 0): break
    
    counter -= 1

{'text': "On October 31, 2000 Stankonia was released. It entered the Billboard 200 at number two after selling over 530,000 copies in its first week. Outkast's first greatest hits album Big Boi and Dre Present... Outkast was released on December 4, 2001.", 'title': 'Outkast', 'wiki_id': '20231101.simple_431000_4', 'url': 'https://simple.wikipedia.org/wiki/Outkast'}
{'title_text_vector': [0.19073133170604706, -0.16962647438049316, -0.07584690302610397, 0.03890860453248024, 0.047373414039611816, -0.03285321965813637, -0.05774905905127525, 0.0377737358212471, -0.011109917424619198, 0.0235916655510664, -0.01037643663585186, -0.0037099772598594427, -0.04558759182691574, -0.07412955164909363, -0.10759143531322479, -0.1133028045296669, -0.081611268222332, 0.11026512086391449, -0.011728725396096706, 0.06259267032146454, 0.01896992325782776, 0.07643471658229828, -0.006767057813704014, 0.07199272513389587, -0.030070427805185318, 0.06433407962322235, 0.09133080393075943, -0.0671435073018074, -0.1

## Close the client

In [36]:
client.close()