Skip to content

Commit

Permalink
test: Add more sparse test cases (milvus-io#33916)
Browse files Browse the repository at this point in the history
issue: milvus-io#31483

Signed-off-by: elstic <hao.wang@zilliz.com>
  • Loading branch information
elstic committed Jun 19, 2024
1 parent 6d5747c commit 1216a4b
Show file tree
Hide file tree
Showing 8 changed files with 433 additions and 23 deletions.
8 changes: 8 additions & 0 deletions tests/python_client/base/client_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,11 @@ def init_collection_general(self, prefix="test", insert_data=False, nb=ct.defaul
if is_binary:
default_schema = cf.gen_default_binary_collection_schema(auto_id=auto_id, dim=dim,
primary_field=primary_field)
if vector_data_type == ct.sparse_vector:
default_schema = cf.gen_default_sparse_schema(auto_id=auto_id, primary_field=primary_field,
enable_dynamic_field=enable_dynamic_field,
with_json=with_json,
multiple_dim_array=multiple_dim_array)
if is_all_data_type:
default_schema = cf.gen_collection_schema_all_datatype(auto_id=auto_id, dim=dim,
primary_field=primary_field,
Expand Down Expand Up @@ -289,6 +294,9 @@ def init_collection_general(self, prefix="test", insert_data=False, nb=ct.defaul
# This condition will be removed after auto index feature
if is_binary:
collection_w.create_index(ct.default_binary_vec_field_name, ct.default_bin_flat_index)
elif vector_data_type == ct.sparse_vector:
for vector_name in vector_name_list:
collection_w.create_index(vector_name, ct.default_sparse_inverted_index)
else:
if len(multiple_dim_array) == 0 or is_all_data_type == False:
vector_name_list.append(ct.default_float_vec_field_name)
Expand Down
58 changes: 38 additions & 20 deletions tests/python_client/common/common_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@ def gen_double_field(name=ct.default_double_field_name, is_primary=False, descri

def gen_float_vec_field(name=ct.default_float_vec_field_name, is_primary=False, dim=ct.default_dim,
description=ct.default_desc, vector_data_type="FLOAT_VECTOR", **kwargs):
if vector_data_type == "SPARSE_FLOAT_VECTOR":
dtype = DataType.SPARSE_FLOAT_VECTOR
float_vec_field, _ = ApiFieldSchemaWrapper().init_field_schema(name=name, dtype=dtype,
description=description,
is_primary=is_primary, **kwargs)
return float_vec_field
if vector_data_type == "FLOAT_VECTOR":
dtype = DataType.FLOAT_VECTOR
elif vector_data_type == "FLOAT16_VECTOR":
Expand Down Expand Up @@ -358,9 +364,14 @@ def gen_collection_schema_all_datatype(description=ct.default_desc,
else:
multiple_dim_array.insert(0, dim)
for i in range(len(multiple_dim_array)):
fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.all_float_vector_types[i%3]}",
if ct.all_float_vector_types[i%3] != ct.sparse_vector:
fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.all_float_vector_types[i%3]}",
dim=multiple_dim_array[i],
vector_data_type=ct.all_float_vector_types[i%3]))
else:
# A sparse vector field must not specify a dim parameter
fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.sparse_vector}",
vector_data_type=ct.sparse_vector))

schema, _ = ApiCollectionSchemaWrapper().init_collection_schema(fields=fields, description=description,
primary_field=primary_field, auto_id=auto_id,
Expand All @@ -384,8 +395,17 @@ def gen_default_binary_collection_schema(description=ct.default_desc, primary_fi


def gen_default_sparse_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name,
auto_id=False, **kwargs):
auto_id=False, with_json=False, multiple_dim_array=[], **kwargs):

fields = [gen_int64_field(), gen_float_field(), gen_string_field(), gen_sparse_vec_field()]
if with_json:
fields.insert(-1, gen_json_field())

if len(multiple_dim_array) != 0:
for i in range(len(multiple_dim_array)):
vec_name = ct.default_sparse_vec_field_name + "_" + str(i)
vec_field = gen_sparse_vec_field(name=vec_name)
fields.append(vec_field)
sparse_schema, _ = ApiCollectionSchemaWrapper().init_collection_schema(fields=fields, description=description,
primary_field=primary_field,
auto_id=auto_id, **kwargs)
Expand Down Expand Up @@ -418,7 +438,7 @@ def gen_vectors(nb, dim, vector_data_type="FLOAT_VECTOR"):
vectors = gen_fp16_vectors(nb, dim)[1]
elif vector_data_type == "BFLOAT16_VECTOR":
vectors = gen_bf16_vectors(nb, dim)[1]
elif vector_data_type == "SPARSE_VECTOR":
elif vector_data_type == "SPARSE_FLOAT_VECTOR":
vectors = gen_sparse_vectors(nb, dim)

if dim > 1:
Expand Down Expand Up @@ -508,10 +528,10 @@ def gen_general_default_list_data(nb=ct.default_nb, dim=ct.default_dim, start=0,
index = 2
del insert_list[index]
if len(multiple_dim_array) != 0:
if len(multiple_vector_field_name) != len(multiple_dim_array):
log.error("multiple vector feature is enabled, please input the vector field name list "
"not including the default vector field")
assert len(multiple_vector_field_name) == len(multiple_dim_array)
# if len(multiple_vector_field_name) != len(multiple_dim_array):
# log.error("multiple vector feature is enabled, please input the vector field name list "
# "not including the default vector field")
# assert len(multiple_vector_field_name) == len(multiple_dim_array)
for i in range(len(multiple_dim_array)):
new_float_vec_values = gen_vectors(nb, multiple_dim_array[i], vector_data_type=vector_data_type)
insert_list.append(new_float_vec_values)
Expand Down Expand Up @@ -832,7 +852,7 @@ def gen_default_list_sparse_data(nb=ct.default_nb, dim=ct.default_dim, start=0,
string_values = [str(i) for i in range(start, start + nb)]
json_values = [{"number": i, "string": str(i), "bool": bool(i), "list": [j for j in range(0, i)]}
for i in range(start, start + nb)]
sparse_vec_values = gen_vectors(nb, dim, vector_data_type="SPARSE_VECTOR")
sparse_vec_values = gen_vectors(nb, dim, vector_data_type="SPARSE_FLOAT_VECTOR")
if with_json:
data = [int_values, float_values, string_values, json_values, sparse_vec_values]
else:
Expand Down Expand Up @@ -1772,7 +1792,7 @@ def insert_data(collection_w, nb=ct.default_nb, is_binary=False, is_all_data_typ
multiple_vector_field_name=vector_name_list,
vector_data_type=vector_data_type,
auto_id=auto_id, primary_field=primary_field)
elif vector_data_type == "FLOAT16_VECTOR" or "BFLOAT16_VECTOR":
elif vector_data_type in ct.all_float_vector_types:
default_data = gen_general_default_list_data(nb // num, dim=dim, start=start, with_json=with_json,
random_primary_key=random_primary_key,
multiple_dim_array=multiple_dim_array,
Expand Down Expand Up @@ -1972,14 +1992,10 @@ def extract_vector_field_name_list(collection_w):
fields = schema_dict.get('fields')
vector_name_list = []
for field in fields:
if str(field['type']) in ["101", "102", "103"]:
if field['name'] != ct.default_float_vec_field_name:
vector_name_list.append(field['name'])

for field in fields:
if str(field['type']) == 'DataType.FLOAT_VECTOR' \
or str(field['type']) == 'DataType.FLOAT16_VECTOR' \
or str(field['type']) == 'DataType.BFLOAT16_VECTOR':
if field['type'] == DataType.FLOAT_VECTOR \
or field['type'] == DataType.FLOAT16_VECTOR \
or field['type'] == DataType.BFLOAT16_VECTOR \
or field['type'] == DataType.SPARSE_FLOAT_VECTOR:
if field['name'] != ct.default_float_vec_field_name:
vector_name_list.append(field['name'])

Expand Down Expand Up @@ -2120,11 +2136,13 @@ def gen_vectors_based_on_vector_type(num, dim, vector_data_type):
fp16_vectors: the bytes used for insert
return: raw_vectors and fp16_vectors
"""
if vector_data_type == "FLOAT_VECTOR":
if vector_data_type == ct.float_type:
vectors = [[random.random() for _ in range(dim)] for _ in range(num)]
elif vector_data_type == "FLOAT16_VECTOR":
elif vector_data_type == ct.float16_type:
vectors = gen_fp16_vectors(num, dim)[1]
elif vector_data_type == "BFLOAT16_VECTOR":
elif vector_data_type == ct.bfloat16_type:
vectors = gen_bf16_vectors(num, dim)[1]
elif vector_data_type == ct.sparse_vector:
vectors = gen_sparse_vectors(num, dim)

return vectors
3 changes: 2 additions & 1 deletion tests/python_client/common/common_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@
float_type = "FLOAT_VECTOR"
float16_type = "FLOAT16_VECTOR"
bfloat16_type = "BFLOAT16_VECTOR"
all_float_vector_types = [float_type, float16_type, bfloat16_type]
sparse_vector = "SPARSE_FLOAT_VECTOR"
all_float_vector_types = [float16_type, bfloat16_type, sparse_vector]
default_sparse_vec_field_name = "sparse_vector"
default_partition_name = "_default"
default_resource_group_name = '__default_resource_group'
Expand Down
19 changes: 19 additions & 0 deletions tests/python_client/testcases/test_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
max_vector_field_num = ct.max_vector_field_num
SPARSE_FLOAT_VECTOR_data_type = "SPARSE_FLOAT_VECTOR"


class TestCollectionParams(TestcaseBase):
Expand Down Expand Up @@ -1047,6 +1048,24 @@ def test_create_collection_over_maximum_vector_and_all_fields(self):
error = {ct.err_code: 65535, ct.err_msg: "maximum field's number should be limited to 64"}
self.collection_wrap.init_collection(c_name, schema=schema, check_task=CheckTasks.err_res, check_items=error)

@pytest.mark.tags(CaseLabel.L2)
def test_collection_multi_sparse_vectors(self):
    """
    target: Test multiple sparse vectors in a collection
    method: create 2 sparse vectors in a collection
    expected: successful creation of a collection
    """
    # 1. connect
    self._connect()
    # 2. create collection with multiple sparse vector fields.
    # NOTE: the vector data type must be ct.sparse_vector ("SPARSE_FLOAT_VECTOR");
    # the original referenced ct.sparse_vector_data_type / sparse_vector_data_type,
    # neither of which is defined, so the test raised AttributeError/NameError
    # before ever exercising collection creation.
    c_name = cf.gen_unique_str(prefix)
    fields = [cf.gen_int64_field(is_primary=True), cf.gen_float_field(),
              cf.gen_float_vec_field(vector_data_type=ct.sparse_vector),
              cf.gen_float_vec_field(name="tmp", vector_data_type=ct.sparse_vector)]
    schema = cf.gen_collection_schema(fields=fields)
    self.collection_wrap.init_collection(c_name, schema=schema,
                                         check_task=CheckTasks.check_collection_property,
                                         check_items={exp_name: c_name, exp_schema: schema})


class TestCollectionOperation(TestcaseBase):
"""
Expand Down
43 changes: 43 additions & 0 deletions tests/python_client/testcases/test_index.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import random
from time import sleep

import numpy as np
import pytest
import copy

Expand Down Expand Up @@ -1442,6 +1444,47 @@ def test_alter_index_invalid(self):
check_items={ct.err_code: 1,
ct.err_msg: f"<'int' object has no attribute 'items'"})

@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("metric_type", ["L2", "COSINE", " ", "invalid"])
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_invalid_sparse_metric_type(self, metric_type, index):
    """
    target: unsupported metric_type create index
    method: unsupported metric_type creates an index
    expected: raise exception
    """
    # Prepare a sparse-vector collection with some data in it.
    collection_name = cf.gen_unique_str(prefix)
    sparse_schema = cf.gen_default_sparse_schema()
    collection_w = self.init_collection_wrap(name=collection_name, schema=sparse_schema)
    collection_w.insert(data=cf.gen_default_list_sparse_data())
    # Sparse indexes support IP only; any other metric type must be rejected.
    index_params = {"index_type": index,
                    "metric_type": metric_type,
                    "params": cf.get_index_params_params(index)}
    expected_error = {ct.err_code: 65535,
                     ct.err_msg: "only IP is the supported metric type for sparse index"}
    self.index_wrap.init_index(collection_w.collection, ct.default_sparse_vec_field_name,
                               index_params,
                               check_task=CheckTasks.err_res,
                               check_items=expected_error)

@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("ratio", [-0.5, 1, 3])
@pytest.mark.parametrize("index ", ct.all_index_types[9:11])
def test_invalid_sparse_ratio(self, ratio, index):
    """
    target: index creation for unsupported ratio parameter
    method: indexing of unsupported ratio parameters
    expected: raise exception
    """
    # Build a sparse collection and load it with default data.
    collection_name = cf.gen_unique_str(prefix)
    collection_w = self.init_collection_wrap(name=collection_name,
                                             schema=cf.gen_default_sparse_schema())
    collection_w.insert(data=cf.gen_default_list_sparse_data())
    # drop_ratio_build must lie in [0, 1); each parametrized value is outside it.
    index_params = {"index_type": index,
                    "metric_type": "IP",
                    "params": {"drop_ratio_build": ratio}}
    expected_error = {ct.err_code: 1100,
                      ct.err_msg: f"invalid drop_ratio_build: {ratio}, must be in range [0, 1): invalid parameter[expected=valid index params"}
    self.index_wrap.init_index(collection_w.collection, ct.default_sparse_vec_field_name,
                               index_params,
                               check_task=CheckTasks.err_res,
                               check_items=expected_error)


@pytest.mark.tags(CaseLabel.GPU)
class TestNewIndexAsync(TestcaseBase):
Expand Down
43 changes: 43 additions & 0 deletions tests/python_client/testcases/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -1348,6 +1348,25 @@ def test_insert_with_nan_value(self):
error = {ct.err_code: 65535, ct.err_msg: "value '+Inf' is not a number or infinity"}
collection_w.insert(data=data, check_task=CheckTasks.err_res, check_items=error)

@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index ", ct.all_index_types[9:11])
@pytest.mark.parametrize("invalid_vector_type ", ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
def test_invalid_sparse_vector_data(self, index, invalid_vector_type):
    """
    target: insert illegal data type
    method: insert illegal data type
    expected: raise exception
    """
    # Create a collection whose vector field is sparse.
    collection_name = cf.gen_unique_str(prefix)
    collection_w = self.init_collection_wrap(name=collection_name,
                                             schema=cf.gen_default_sparse_schema())
    row_count = 100
    # Take the scalar columns only (drop the trailing sparse column), then
    # substitute a dense vector column in its place.
    rows = cf.gen_default_list_sparse_data(nb=row_count)[:-1]
    dense_column = cf.gen_vectors(row_count, dim=128, vector_data_type=invalid_vector_type)
    rows.append(dense_column)
    # Dense data in a sparse field must be rejected at insert time.
    expected_error = {ct.err_code: 1,
                      ct.err_msg: 'input must be a sparse matrix in supported format'}
    collection_w.insert(data=rows, check_task=CheckTasks.err_res, check_items=expected_error)


class TestInsertInvalidBinary(TestcaseBase):
"""
Expand Down Expand Up @@ -1872,6 +1891,30 @@ def test_upsert_dataframe_using_default_value(self):
collection_w.upsert(df)
assert collection_w.num_entities == ct.default_nb

@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index ", ct.all_index_types[9:11])
def test_upsert_sparse_data(self, index):
    """
    target: multiple upserts and counts(*)
    method: multiple upserts and counts(*)
    expected: number of data entries normal
    """
    # Create a sparse collection and seed it via upsert.
    collection_name = cf.gen_unique_str(prefix)
    collection_w = self.init_collection_wrap(name=collection_name,
                                             schema=cf.gen_default_sparse_schema())
    rows = cf.gen_default_list_sparse_data(nb=ct.default_nb)
    collection_w.upsert(data=rows)
    assert collection_w.num_entities == ct.default_nb
    # Index and load so that count(*) queries can run.
    index_params = {"index_type": index,
                    "metric_type": "IP",
                    "params": cf.get_index_params_params(index)}
    collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
    collection_w.load()
    # Repeated upserts of the same primary keys must not grow the row count.
    for _ in range(5):
        collection_w.upsert(data=rows)
    collection_w.query(expr=f'{ct.default_int64_field_name} >= 0',
                       output_fields=[ct.default_count_output],
                       check_task=CheckTasks.check_query_results,
                       check_items={"exp_res": [{"count(*)": ct.default_nb}]})


class TestUpsertInvalid(TestcaseBase):
""" Invalid test case of Upsert interface """
Expand Down
31 changes: 31 additions & 0 deletions tests/python_client/testcases/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -3691,6 +3691,37 @@ def test_count_expression_comparative(self):
check_task=CheckTasks.check_query_results,
check_items={exp_res: [{count: res}]})

@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_counts_expression_sparse_vectors(self, index):
    """
    target: test count with expr
    method: count with expr
    expected: verify count
    """
    self._connect()
    # Build, fill, index and load a sparse-vector collection.
    collection_name = cf.gen_unique_str(prefix)
    collection_w, _ = self.collection_wrap.init_collection(collection_name,
                                                           schema=cf.gen_default_sparse_schema())
    collection_w.insert(cf.gen_default_list_sparse_data())
    index_params = {"index_type": index,
                    "metric_type": "IP",
                    "params": cf.get_index_params_params(index)}
    collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
    collection_w.load()
    # count(*) over the whole collection.
    collection_w.query(expr=default_expr, output_fields=[count],
                       check_task=CheckTasks.check_query_results,
                       check_items={exp_res: [{count: ct.default_nb}]})
    # count(*) restricted by a compound scalar filter.
    filter_expr = "int64 > 50 && int64 < 100 && float < 75"
    collection_w.query(expr=filter_expr, output_fields=[count],
                       check_task=CheckTasks.check_query_results,
                       check_items={exp_res: [{count: 24}]})
    # Paged retrieval through the query iterator covers every row.
    batch_size = 100
    collection_w.query_iterator(batch_size=batch_size, expr=default_expr,
                                check_task=CheckTasks.check_query_iterator,
                                check_items={"count": ct.default_nb,
                                             "batch_size": batch_size})


class TestQueryIterator(TestcaseBase):
"""
Expand Down
Loading

0 comments on commit 1216a4b

Please sign in to comment.