In [None]:
!pip install weaviate-client

## Load data
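Download the dataset from [Google Drive](https://drive.google.com/file/d/1W8nBPZA2j1_6AGnw2BAe6ydXLNXzuDq2/view?usp=share_link) and save it as `wiki_simple_100k.parquet` in the same directory as this notebook (that is the path the loading cell below reads).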

In [2]:
import os
import weaviate

# Read the Cohere key up front and fail with a clear message when it is missing;
# os.getenv would otherwise hand a silent None to the request headers.
cohere_api_key = os.getenv("COHERE_API_KEY")
if not cohere_api_key:
    raise RuntimeError(
        "COHERE_API_KEY environment variable is not set; "
        "it is required for the X-Cohere-Api-Key header"
    )

# Connect to the hosted demo instance, forwarding the Cohere key as a header.
client = weaviate.Client(
    url="https://cohere-wiki-demo.weaviate.network",
    additional_headers={
        "X-Cohere-Api-Key": cohere_api_key
    }
)

# Last expression of the cell: displays whether the instance reports ready.
client.is_ready()

True

In [23]:
# Start from an empty schema.
# WARNING: delete_all() removes everything, including all of your Weaviate data.
client.schema.delete_all()

# Only `text` is marked for vectorization (skip=False); the property name
# itself is not included in the vectorized input.
text_property = {
    "name": "text",
    "dataType": ["text"],
    "description": "Article body",
    "moduleConfig": {
        "text2vec-cohere": {
            "skip": False,
            "vectorizePropertyName": False,
        }
    },
}

# The remaining fields are stored with skip=True for the vectorizer module.
# A fresh dict is built for every property so nothing is shared between them.
metadata_properties = [
    {
        "name": name,
        "dataType": [data_type],
        "moduleConfig": {"text2vec-cohere": {"skip": True}},
    }
    for name, data_type in [
        ("title", "string"),
        ("url", "string"),
        ("wiki_id", "int"),
        ("views", "number"),
    ]
]

article_schema = {
    "class": "Article",
    "description": "Wiki Article",
    "vectorizer": "text2vec-cohere",
    "moduleConfig": {
        "text2vec-cohere": {"model": "multilingual-22-12", "truncate": "RIGHT"},
    },
    "vectorIndexConfig": {"distance": "dot"},
    "properties": [text_property, *metadata_properties],
}

# Register the class with Weaviate.
client.schema.create_class(article_schema)

print("The schema has been created")

The schema has been created


In [3]:
import pandas as pd
# Load the downloaded Wikipedia sample from the notebook's directory.
# Later cells read the columns text, title, url, views, wiki_id and emb from it.
df = pd.read_parquet("./wiki_simple_100k.parquet")

In [17]:
### Step 1 - set up Weaviate Batch, which optimizes CRUD operations in bulk
# batch_size=100     -> starting batch size
# dynamic=True       -> batch size is increased/decreased based on performance
# timeout_retries=3  -> retry on timeouts if something goes wrong

client.batch.configure(batch_size=100, dynamic=True, timeout_retries=3)

<weaviate.batch.crud_batch.Batch at 0x7fb088f22a60>

In [None]:
# Import only the first 20,000 rows of the dataset.
small = df[:20000]
total = len(small)

# Columns copied verbatim into each Weaviate object.
article_fields = ("text", "title", "url", "views", "wiki_id")

counter = 0

# The batch context manager sends whatever is still queued when the block exits.
with client.batch as batch:
    # iterrows() is kept on purpose so each row is read exactly as before.
    for _, row in small.iterrows():

        # progress message every 100 objects
        if not counter % 100:
            print(f"Import {counter} / {total} ")

        article = {field: row[field] for field in article_fields}

        # The precomputed embedding from the "emb" column is sent as the object's vector.
        batch.add_data_object(article, "Article", vector=row["emb"])
        counter += 1

print("Import complete")

Import 0 / 20000 
Import 100 / 20000 
Import 200 / 20000 
Import 300 / 20000 
Import 400 / 20000 
Import 500 / 20000 
Import 600 / 20000 
Import 700 / 20000 
Import 800 / 20000 
Import 900 / 20000 
Import 1000 / 20000 
Import 1100 / 20000 
Import 1200 / 20000 
Import 1300 / 20000 
Import 1400 / 20000 
Import 1500 / 20000 
Import 1600 / 20000 
Import 1700 / 20000 
Import 1800 / 20000 
Import 1900 / 20000 
Import 2000 / 20000 
Import 2100 / 20000 
Import 2200 / 20000 
Import 2300 / 20000 
Import 2400 / 20000 
Import 2500 / 20000 
Import 2600 / 20000 
Import 2700 / 20000 
Import 2800 / 20000 
Import 2900 / 20000 
Import 3000 / 20000 
Import 3100 / 20000 
Import 3200 / 20000 
Import 3300 / 20000 
Import 3400 / 20000 
Import 3500 / 20000 
Import 3600 / 20000 
Import 3700 / 20000 
Import 3800 / 20000 
Import 3900 / 20000 
Import 4000 / 20000 
Import 4100 / 20000 
Import 4200 / 20000 
Import 4300 / 20000 
Import 4400 / 20000 
Import 4500 / 20000 
Import 4600 / 20000 
Import 4700 / 20000 
Impo

In [21]:
def semantic_serch(query, limit=5):
    """Run a nearText (semantic) search against the "Article" class.

    Args:
        query: Text whose meaning should be matched; sent as the single
            nearText concept.
        limit: Maximum number of hits to request (defaults to 5, the value
            that used to be hard-coded).

    Returns:
        The value stored under ``response["data"]["Get"]["Article"]``. The
        query asks for ``text``, ``title``, ``url``, ``views`` and
        ``_additional {distance}`` on every hit. No distance threshold is
        applied.

    Raises:
        RuntimeError: If the response carries an ``errors`` entry, so the
            server-side message is surfaced instead of an opaque KeyError
            (or a None result) further down.
    """
    near_text = {"concepts": [query]}

    properties = [
        "text", "title", "url", "views",
        "_additional {distance}",
    ]

    response = (
        client.query
        .get("Article", properties)
        .with_near_text(near_text)
        .with_limit(limit)
        .do()
    )

    # GraphQL failures are reported inside the payload rather than raised.
    if "errors" in response:
        raise RuntimeError(f"Weaviate query failed: {response['errors']}")

    return response["data"]["Get"]["Article"]


# Correctly spelled alias; the original (misspelled) name stays so existing calls keep working.
semantic_search = semantic_serch
  
def print_result(result):
    """Pretty-print search hits to stdout.

    For every hit this writes a heading line with the title, the view count
    and the distance (wrapped in the ANSI code 95 colour sequence), the URL
    (wrapped in the ANSI underline sequence), the text, and then an empty line.

    Args:
        result: Iterable of dicts with the keys ``title``, ``views``, ``url``,
            ``text`` and ``_additional`` (itself holding ``distance``).
    """
    heading_template = "\033[95m{} ({}) {}\033[0m"
    link_template = "\033[4m{}\033[0m"

    for hit in result:
        heading = heading_template.format(
            hit["title"], hit["views"], hit["_additional"]["distance"]
        )
        print(heading)
        print(link_template.format(hit["url"]))
        # Text followed by one blank separator line.
        print(hit["text"], end="\n\n")

In [22]:
# Send the user query to Weaviate and keep the hits for inspection
query_result = semantic_serch(query="musical instruments")

# Display the hits
print_result(result=query_result)

[95mHarmonium (950.3016357421875) -144.7552[0m
[4mhttps://simple.wikipedia.org/wiki?curid=305100[0m
similar instruments. In Vienna, Anton Haeckl constructed the physharmonica, a keyboard instrument filled with free reeds. John Green invented the seraphine, which produced music when air was blown over metallic reeds. Such instruments are now museum pieces.

[95mHarmonium (950.3016357421875) -142.79364[0m
[4mhttps://simple.wikipedia.org/wiki?curid=305100[0m
A harmonium, also called a "melodeon", "reed organ" or "pump organ", is a keyboard instrument that is a lot like an organ. It makes sound by blowing air through reeds, which are tuned to different pitches to make musical notes.

[95mHarmonium (950.3016357421875) -139.97475[0m
[4mhttps://simple.wikipedia.org/wiki?curid=305100[0m
It was first invented by in 1840 in France, who patented his Harmonium in Paris on August 9, 1840. Harmonium with a Swarmandal (a small, harp-like instrument, similar to Zither and Autoharp) was pro