# Dependencies

In [1]:
#!pip install "weaviate-client==3.*"

# Configuration

In [2]:
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

True

In [3]:
import weaviate
import json
import os

client = weaviate.Client(
    url = os.environ["WEAVIATE_URL"],  
    auth_client_secret=weaviate.AuthApiKey(api_key=os.environ["WEAVIATE_API_KEY"]),
    additional_headers = {
        "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]
    }
)

client.is_ready()

True

# Schema

In [4]:
# resetting the schema. CAUTION: This will delete your collection 
if client.schema.exists("MyCollection"):
    client.schema.delete_class("MyCollection")

schema = {
    "class": "MyCollection",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {},
        "generative-openai": {}, # Only relevant for generative searches
    },
    "properties" : [
        {
            "name" : "content",
            "dataType" : ["text"],
        }
        
    ]
}

client.schema.create_class(schema)

print("Successfully created the schema.")

Successfully created the schema.


# Import the Data

In [5]:
data = [
    "私の名前は鈴木(Suzuki)です。趣味は野球です。", # My name is Suzuki. My hobby is baseball.
    "私の名前は佐藤(Sato)です。趣味はサッカーです。", # My name is Sato. My hobby is soccer.
    "私の名前は田中(Tanaka)です。趣味はテニスです。" # My name is Tanaka. My hobby is tennis.
]

client.batch.configure(batch_size=10)  # Configure batch

# Batch import all objects
# (Yes, batch import is an overkill for 3 objects, but it is recommended for large volumes of data)with client.batch as batch:
with client.batch as batch:
    for item in data:
        properties = {
         "content": item
        }

        # the call that performs data insert
        batch.add_data_object(
            class_name="MyCollection",
            data_object=properties,
        )

print("Data import complete")

Data import complete


Quick check to see if all objects are in.
Let's use [meta count](https://weaviate.io/developers/weaviate/search/aggregate#retrieve-a-meta-property).

In [6]:
# Check number of objects
response = (
    client.query
    .aggregate("MyCollection")
    .with_meta_count()
    .do()
)

print(response)

{'data': {'Aggregate': {'MyCollection': [{'meta': {'count': 3}}]}}}


# Queries

## Semantic search (nearVector)

In [13]:
response = (
    client.query
    .get("MyCollection", ["content"])
    .with_near_text({"concepts": ["バトミントン"]})
    .with_limit(2)
    .do()
)

print(json.dumps(response, indent=4))


{
    "data": {
        "Get": {
            "MyCollection": [
                {
                    "content": "\u79c1\u306e\u540d\u524d\u306f\u9234\u6728(Suzuki)\u3067\u3059\u3002\u8da3\u5473\u306f\u91ce\u7403\u3067\u3059\u3002"
                },
                {
                    "content": "\u79c1\u306e\u540d\u524d\u306f\u7530\u4e2d(Tanaka)\u3067\u3059\u3002\u8da3\u5473\u306f\u30c6\u30cb\u30b9\u3067\u3059\u3002"
                }
            ]
        }
    }
}


In [14]:
response = (
    client.query
    .get("MyCollection", ["content"])
    .with_near_text({"concepts": ["バトミントン"]})
    .with_limit(2)
    .do()
)

print(json.dumps(response, indent=4, ensure_ascii=False))


{
    "data": {
        "Get": {
            "MyCollection": [
                {
                    "content": "私の名前は鈴木(Suzuki)です。趣味は野球です。"
                },
                {
                    "content": "私の名前は田中(Tanaka)です。趣味はテニスです。"
                }
            ]
        }
    }
}


## Semantic search with filter

In [9]:
response = (
    client.query
    .get("MyCollection", ["content"])
    .with_near_text({"concepts": ["バトミントン"]})
        .with_where({
        "path": ["content"],
        "operator": "Like",
        "valueText": "*鈴木*"
    })
    .with_limit(2)
    .do()
)

print(json.dumps(response, indent=4, ensure_ascii=False))

{
    "data": {
        "Get": {
            "MyCollection": [
                {
                    "content": "私の名前は鈴木(Suzuki)です。趣味は野球です。"
                }
            ]
        }
    }
}


## Generative search

In [10]:
response = (
    client.query
    .get("MyCollection", ["content"])
    .with_near_text({"concepts": ["バトミントン"]})
    .with_generate(single_prompt="{content}。私の名前の読み方は何ですか？ ")
    .with_limit(1)
    .do()
)

print(json.dumps(response, indent=4, ensure_ascii=False))

{
    "data": {
        "Get": {
            "MyCollection": [
                {
                    "_additional": {
                        "generate": {
                            "error": null,
                            "singleResult": "鈴木(Suzuki)の読み方は「すずき」です。"
                        }
                    },
                    "content": "私の名前は鈴木(Suzuki)です。趣味は野球です。"
                }
            ]
        }
    }
}


## Hybrid search search 

In [11]:
response = (
    client.query
    .get("MyCollection", ["content"])
    .with_hybrid("田中", alpha=0.5)
    .do()
)

print(json.dumps(response, indent=4, ensure_ascii=False))

response = (
    client.query
    .get("MyCollection", ["content"])
    .with_hybrid("田中", alpha=1)
    .do()
)

print(json.dumps(response, indent=4, ensure_ascii=False))


response = (
    client.query
    .get("MyCollection", ["content"])
    .with_hybrid("田中", alpha=0)
    .do()
)

print(json.dumps(response, indent=4, ensure_ascii=False))


{
    "data": {
        "Get": {
            "MyCollection": [
                {
                    "content": "私の名前は田中(Tanaka)です。趣味はテニスです。"
                },
                {
                    "content": "私の名前は佐藤(Sato)です。趣味はサッカーです。"
                },
                {
                    "content": "私の名前は鈴木(Suzuki)です。趣味は野球です。"
                }
            ]
        }
    }
}
{
    "data": {
        "Get": {
            "MyCollection": [
                {
                    "content": "私の名前は田中(Tanaka)です。趣味はテニスです。"
                },
                {
                    "content": "私の名前は佐藤(Sato)です。趣味はサッカーです。"
                },
                {
                    "content": "私の名前は鈴木(Suzuki)です。趣味は野球です。"
                }
            ]
        }
    }
}
{
    "data": {
        "Get": {
            "MyCollection": []
        }
    }
}


## BM25 search (keyword-based)

In [12]:
# doesn't work at this time
response = (
  client.query
    .get("MyCollection", ["content"])
    .with_bm25({"query": "田中", "properties": ["content"]})
    .do()
)


print(json.dumps(response, indent=4, ensure_ascii=False))


AttributeError: 'dict' object has no attribute 'replace'