Skip to content

Commit

Permalink
test: Update tests for range search and add test for query with dup ids (milvus-io#34057)
Browse files Browse the repository at this point in the history

related issue: milvus-io#33883

Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>
  • Loading branch information
yanliang567 authored and yellow-shine committed Jul 2, 2024
1 parent b42c31a commit d6cf547
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 49 deletions.
35 changes: 35 additions & 0 deletions tests/python_client/testcases/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -2275,6 +2275,41 @@ def test_query_dup_ids_dup_term_array(self):
collection_w.query(term_expr, output_fields=["*"], check_items=CheckTasks.check_query_results,
check_task={exp_res: res})

@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("with_growing", [True])
def test_query_to_get_latest_entity_with_dup_ids(self, with_growing):
    """
    target: test query to get latest entity with duplicate primary keys
    method: 1.create collection and insert dup primary key = 0
            2.query with expr=dup_id
    expected: return the latest entity; verify the result is same as dedup entities
    """
    collection_w = self.init_collection_general(prefix, dim=16, is_flush=False, insert_data=False,
                                                is_index=False, vector_data_type=ct.float_type,
                                                with_json=False)[0]
    batch_size = 50
    num_batches = 10
    for batch in range(num_batches):
        frame = cf.gen_default_dataframe_data(dim=16, nb=batch_size, start=batch * batch_size,
                                              with_json=False)
        # every row in this batch is written with the same primary key value
        frame[ct.default_int64_field_name] = batch
        collection_w.insert(frame)
        # re-insert the final row of the batch so it carries the freshest timestamp
        collection_w.insert(frame.iloc[-1:])

    # NOTE(review): with_growing is parametrized with [True] only, so this flush
    # branch is currently never exercised — confirm whether the sealed path is intended.
    if not with_growing:
        collection_w.flush()
    collection_w.create_index(ct.default_float_vec_field_name, index_params=ct.default_index)
    collection_w.load()

    # querying a duplicated pk must return exactly the most recently written entity
    dup_expr = f'{ct.default_int64_field_name} == 0'
    res = collection_w.query(expr=dup_expr,
                             output_fields=[ct.default_int64_field_name, ct.default_float_field_name])[0]
    assert len(res) == 1 and res[0][ct.default_float_field_name] == (batch_size - 1) * 1.0

    # querying the whole pk range must return one entity per distinct pk (dedup)
    all_expr = f'{ct.default_int64_field_name} >= 0'
    res = collection_w.query(expr=all_expr,
                             output_fields=[ct.default_int64_field_name, ct.default_float_field_name])[0]
    assert len(res) == num_batches

@pytest.mark.tags(CaseLabel.L0)
def test_query_after_index(self):
"""
Expand Down
98 changes: 49 additions & 49 deletions tests/python_client/testcases/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -6962,72 +6962,72 @@ def enable_dynamic_field(self, request):
"""
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("vector_data_type", ct.all_dense_vector_types)
@pytest.mark.parametrize("with_growing", [False, True])
def test_range_search_default(self, index_type, metric, vector_data_type, with_growing):
    """
    target: verify the range search returns correct results
    method: 1. create collection, insert 10k vectors
            2. search with topk=1000 on an exact (FLAT) index to get ground-truth distances
            3. range search using the 30th-330th distances as range_filter/radius
            4. verify the range search results match the plain search results in that range
    """
    collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
                                                vector_data_type=vector_data_type, with_json=False)[0]
    nb = 1000
    rounds = 10
    for i in range(rounds):
        data = cf.gen_general_default_list_data(nb=nb, auto_id=True, vector_data_type=vector_data_type,
                                                with_json=False, start=i * nb)
        collection_w.insert(data)

    collection_w.flush()
    # build an exact FLAT index first so the topk distances serve as ground truth
    _index_params = {"index_type": "FLAT", "metric_type": metric, "params": {}}
    collection_w.create_index(ct.default_float_vec_field_name, index_params=_index_params)
    collection_w.load()

    if with_growing is True:
        # add some growing segments
        for j in range(rounds // 2):
            data = cf.gen_general_default_list_data(nb=nb, auto_id=True, vector_data_type=vector_data_type,
                                                    with_json=False, start=(rounds + j) * nb)
            collection_w.insert(data)

    search_params = {"params": {}}
    nq = 1
    search_vectors = cf.gen_vectors(nq, ct.default_dim, vector_data_type=vector_data_type)
    search_res = collection_w.search(search_vectors, default_search_field,
                                     search_params, limit=1000)[0]
    assert len(search_res[0].ids) == 1000
    log.debug(f"search topk=1000 returns {len(search_res[0].ids)}")
    # pick the 30th..330th neighbors as the expected window for the range search
    check_topk = 300
    check_from = 30
    ids = search_res[0].ids[check_from:check_from + check_topk]
    radius = search_res[0].distances[check_from + check_topk]
    range_filter = search_res[0].distances[check_from]

    # rebuild the collection with test target index
    collection_w.release()
    collection_w.indexes[0].drop()
    _index_params = {"index_type": index_type, "metric_type": metric,
                     "params": cf.get_index_params_params(index_type)}
    collection_w.create_index(ct.default_float_vec_field_name, index_params=_index_params)
    collection_w.load()

    params = cf.get_search_params_params(index_type)
    params.update({"radius": radius, "range_filter": range_filter})
    if index_type == "HNSW":
        # widen ef so HNSW can reach the full expected window
        params.update({"ef": check_topk + 100})
    if index_type == "IVF_PQ":
        params.update({"max_empty_result_buckets": 100})
    range_search_params = {"params": params}
    range_res = collection_w.search(search_vectors, default_search_field,
                                    range_search_params, limit=check_topk)[0]
    range_ids = range_res[0].ids
    log.debug(f"range search radius={radius}, range_filter={range_filter}, range results num: {len(range_ids)}")
    # hit rate = overlap between ground-truth window ids and range search ids
    hit_rate = round(len(set(ids).intersection(set(range_ids))) / len(set(ids)), 2)
    log.debug(f"{vector_data_type} range search results {index_type} {metric} with_growing {with_growing} hit_rate: {hit_rate}")
    assert hit_rate >= 0.2  # issue #32630 to improve the accuracy

@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("range_filter", [1000, 1000.0])
Expand Down

0 comments on commit d6cf547

Please sign in to comment.