In [None]:
!pip install weaviate-client

## Load data
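Download the dataset file (`wiki_simple_100k.parquet`) from [Google Drive](https://drive.google.com/file/d/1W8nBPZA2j1_6AGnw2BAe6ydXLNXzuDq2/view?usp=share_link) and place it in the same directory as this notebook; the loading cell below reads it from `./wiki_simple_100k.parquet`.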

In [38]:
import os
import weaviate

# The Cohere API key is read from the COHERE_API_KEY environment variable and
# sent along with every request as an extra header.
cohere_headers = {"X-Cohere-Api-Key": os.getenv("COHERE_API_KEY")}

client = weaviate.Client(
    url="https://cohere-wiki-demo.weaviate.network",
    additional_headers=cohere_headers,
)

# Last expression of the cell: displays the readiness check result
client.is_ready()

True

In [23]:
# Start from a clean slate: delete the existing schema
# (note, this will delete all your weaviate data)
client.schema.delete_all()

COHERE_MODULE = "text2vec-cohere"


def _metadata_property(name, data_type):
    """Describe a property that is stored but skipped by the vectorizer module."""
    return {
        "name": name,
        "dataType": [data_type],
        "moduleConfig": {COHERE_MODULE: {"skip": True}},
    }


# The article body is the only property with "skip" set to False;
# its property name is not included in the vectorized text.
body_property = {
    "name": "text",
    "dataType": ["text"],
    "description": "Article body",
    "moduleConfig": {
        COHERE_MODULE: {
            "skip": False,
            "vectorizePropertyName": False,
        },
    },
}

article_schema = {
    "class": "Article",
    "description": "Wiki Article",
    "vectorizer": COHERE_MODULE,
    "moduleConfig": {
        COHERE_MODULE: {
            "model": "multilingual-22-12",
            "truncate": "RIGHT",
        },
    },
    "vectorIndexConfig": {"distance": "dot"},
    "properties": [
        body_property,
        _metadata_property("title", "string"),
        _metadata_property("url", "string"),
        _metadata_property("wiki_id", "int"),
        _metadata_property("views", "number"),
    ],
}

# Register the "Article" class with Weaviate
client.schema.create_class(article_schema)

print("The schema has been created")

The schema has been created


In [3]:
import pandas as pd

# Load the wiki dataset from the local parquet file. The import cell further down
# reads the columns "text", "title", "url", "views", "wiki_id" and "emb" from it.
wiki_parquet_path = './wiki_simple_100k.parquet'
df = pd.read_parquet(wiki_parquet_path)

In [39]:
### Step 1 - configure Weaviate Batch, which optimizes CRUD operations in bulk
batch_settings = {
    "batch_size": 100,     # starting batch size of 100
    "dynamic": True,       # dynamically increase/decrease based on performance
    "timeout_retries": 3,  # add timeout retries if something goes wrong
}

# Last expression of the cell: its return value is displayed as the cell output
client.batch.configure(**batch_settings)

<weaviate.batch.crud_batch.Batch at 0x7fb0486ef3d0>

In [49]:
data = df[:100_000]  # cap the import at 100k objects

# Columns copied verbatim into each Weaviate object's properties
property_columns = ("text", "title", "url", "views", "wiki_id")
total = len(data)
counter = 0

with client.batch as batch:
    for _, row in data.iterrows():
        # refresh the in-place progress line every 100 objects
        if counter % 100 == 0:
            print(f"Import {counter} / {total} ", end="\r")

        properties = {column: row[column] for column in property_columns}

        # The value of the "emb" column is passed along as the object's vector;
        # the third positional argument is left as None, as before.
        batch.add_data_object(properties, "Article", None, row["emb"])
        counter += 1
    print(f"Import {counter} / {total}")

print("Import complete")

Import 100000 / 100000
Import complete


In [42]:
# Sanity check after the import: ask Weaviate how many "Article" objects it holds
count_query = client.query.aggregate("Article").with_fields("meta { count }")
result = count_query.do()

article_aggregate = result["data"]["Aggregate"]["Article"]
print("Object count: ", article_aggregate)

Object count:  [{'meta': {'count': 100000}}]


In [31]:
def semantic_serch(query, limit=5):
    """Run a nearText semantic search against the "Article" class.

    Uses the module-level Weaviate ``client``.

    Args:
        query: Text to search for; sent as the single nearText concept.
        limit: Maximum number of articles to request. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        The value found at ``response['data']['Get']['Article']``. The query
        requests the ``text``, ``title``, ``url`` and ``views`` properties
        plus ``_additional { distance }`` for each article.

    Raises:
        KeyError: If the response does not contain the
            ``data -> Get -> Article`` path.
    """
    near_text = {"concepts": [query]}

    properties = [
        "text", "title", "url", "views",
        "_additional {distance}"
    ]

    response = (
        client.query
        .get("Article", properties)
        .with_near_text(near_text)
        .with_limit(limit)
        .do()
    )

    return response['data']['Get']['Article']


def print_result(result):
    """Print each search hit: a coloured heading, the underlined URL, then the text.

    Args:
        result: Iterable of dicts, each with the keys 'title', 'views',
            'url', 'text' and '_additional' (which holds 'distance').
    """
    for hit in result:
        title, views = hit['title'], hit['views']
        distance = hit['_additional']['distance']

        # ANSI escapes: 95 = bright magenta heading, 4 = underline, 0 = reset
        print(f"\033[95m{title} ({views}) {distance}\033[0m")
        print(f"\033[4m{hit['url']}\033[0m")
        print(hit['text'])
        print()

In [25]:
# Send the user's query to Weaviate ...
search_text = "musical instruments"
query_result = semantic_serch(search_text)

# ... and show the matching articles
print_result(query_result)

[95mClassical music (339.3004150390625) -146.06982[0m
[4mhttps://simple.wikipedia.org/wiki?curid=5022[0m
Classical music can be for instruments or for the voice. The symphony orchestra is the most common group of instruments for the playing of classical music. It has four families of instruments: the string instruments which include the violins, violas, cellos and piano, the woodwind instruments which include flutes, oboes,clarinets and bassoons together with related instruments of different sizes, the brass instruments: trumpet, trombone, tuba and French horn, and percussion instruments which nearly always includes timpani as well as many other possible instruments which are hit or shaken. This is very different from a typical rock band which has a drummer, a guitarist, one or two singers and an electric bass and keyboard. Instruments that play classical music are not normally amplified electronically.

[95mScotland (396.1304016113281) -145.73114[0m
[4mhttps://simple.wikipedia.