Fix handling of many graph nodes at the same location

When nodes have the same location, their distance to each other is 0. And, their distance to all their neighbors is the same. In the case with few maximum neighbors this can cause problems because the nodes end up linking to each other and getting cut off from the rest of the graph. We fix this by introducing a secondary "tie break" distance between nodes at the same location (based on on-disk distance). That's used to create a secondary ordering for equivalent nodes and allows the prune function to then properly cut away equivalent nodes so that your neighbor list doesn't get full of them. Note with SBQ, equivalence classes are actually not /so/ rare especially with low dimension counts.
timescale · Aug 6, 2024 · e698c59 · e698c59
1 parent 4f46878
commit e698c59
Show file tree

Hide file tree

Showing 3 changed files with 174 additions and 12 deletions.
diff --git a/pgvectorscale/src/access_method/build.rs b/pgvectorscale/src/access_method/build.rs
@@ -284,6 +284,7 @@ fn finalize_index_build<S: Storage>(
                     if neighbors.len() > state.graph.get_meta_page().get_num_neighbors() as _ {
                         //OPT: get rid of this clone
                         prune_neighbors = state.graph.prune_neighbors(
+                            index_pointer,
                             neighbors.clone(),
                             storage,
                             &mut write_stats.prune_stats,
@@ -835,4 +836,101 @@ pub mod tests {
 
         Ok(())
     }
+
+    #[pg_test]
+    pub unsafe fn test_index_small_accuracy() -> spi::Result<()> {
+        /* Test for the creation of connected graphs when the number of dimensions is small as is the
+        number of neighborss */
+        /* small num_neighbors is especially challenging for making sure no nodes get disconnected */
+        let index_options = "num_neighbors=10, search_list_size=10";
+        let expected_cnt = 300;
+        let dimensions = 2;
+
+        Spi::run(&format!(
+            "CREATE TABLE test_data (
+                id int,
+                embedding vector ({dimensions})
+            );
+
+            select setseed(0.5);
+           -- generate 300 vectors
+            INSERT INTO test_data (id, embedding)
+            SELECT
+                *
+            FROM (
+                SELECT
+                    i % {expected_cnt},
+                    ('[' || array_to_string(array_agg(random()), ',', '0') || ']')::vector AS embedding
+                FROM
+                    generate_series(1, {dimensions} * {expected_cnt}) i
+                GROUP BY
+                    i % {expected_cnt}) g;
+
+            CREATE INDEX idx_diskann_bq ON test_data USING diskann (embedding) WITH ({index_options});
+
+
+            SET enable_seqscan = 0;
+            -- perform index scans on the vectors
+            SELECT
+                *
+            FROM
+                test_data
+            ORDER BY
+                embedding <=> (
+                    SELECT
+                        ('[' || array_to_string(array_agg(random()), ',', '0') || ']')::vector AS embedding
+            FROM generate_series(1, {dimensions}));"))?;
+
+        let test_vec: Option<Vec<f32>> = Spi::get_one(&format!(
+            "SELECT('{{' || array_to_string(array_agg(1.0), ',', '0') || '}}')::real[] AS embedding
+    FROM generate_series(1, {dimensions})"
+        ))?;
+
+        let cnt: Option<i64> = Spi::get_one_with_args(
+                &format!(
+                    "
+            SET enable_seqscan = 0;
+            SET enable_indexscan = 1;
+            SET diskann.query_search_list_size = 2;
+            WITH cte as (select * from test_data order by embedding <=> $1::vector) SELECT count(*) from cte;
+            ",
+                ),
+                vec![(
+                    pgrx::PgOid::Custom(pgrx::pg_sys::FLOAT4ARRAYOID),
+                    test_vec.clone().into_datum(),
+                )],
+            )?;
+
+        if cnt.unwrap() != expected_cnt {
+            /* better debugging */
+            let id: Option<i64> = Spi::get_one_with_args(
+                &format!(
+                    "
+            SET enable_seqscan = 0;
+            SET enable_indexscan = 1;
+            SET diskann.query_search_list_size = 2;
+            WITH cte as (select id from test_data EXCEPT (select id from test_data order by embedding <=> $1::vector)) SELECT id from cte limit 1;
+            ",
+                ),
+                vec![(
+                    pgrx::PgOid::Custom(pgrx::pg_sys::FLOAT4ARRAYOID),
+                    test_vec.clone().into_datum(),
+                )],
+            )?;
+
+            assert!(
+                cnt.unwrap() == expected_cnt,
+                "initial count is {} id is {}",
+                cnt.unwrap(),
+                id.unwrap()
+            );
+        }
+
+        assert!(
+            cnt.unwrap() == expected_cnt,
+            "initial count is {}",
+            cnt.unwrap()
+        );
+        Ok(())
+    }
 }
diff --git a/pgvectorscale/src/access_method/graph.rs b/pgvectorscale/src/access_method/graph.rs
@@ -18,11 +18,18 @@ use super::{meta_page::MetaPage, neighbor_with_distance::NeighborWithDistance};
 pub struct ListSearchNeighbor<PD> {
     pub index_pointer: IndexPointer,
     distance: f32,
+    distance_tie_break: usize, /* only used if distance = 0. This ensures a consistent order of results when distance = 0 */
     private_data: PD,
 }
 
 impl<PD> PartialOrd for ListSearchNeighbor<PD> {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        if self.distance == 0.0 && other.distance == 0.0 {
+            /* this logic should be consistent with what's used during pruning */
+            return self
+                .distance_tie_break
+                .partial_cmp(&other.distance_tie_break);
+        }
         self.distance.partial_cmp(&other.distance)
     }
 }
@@ -49,6 +56,7 @@ impl<PD> ListSearchNeighbor<PD> {
             index_pointer,
             private_data,
             distance,
+            distance_tie_break: 0,
         }
     }
 
@@ -62,6 +70,7 @@ pub struct ListSearchResult<QDM, PD> {
     visited: Vec<ListSearchNeighbor<PD>>,
     inserted: HashSet<ItemPointer>,
     pub sdm: Option<QDM>,
+    tie_break_item_pointer: Option<ItemPointer>, /* This records the item pointer of the query. It's used for tie-breaking when the distance = 0 */
     pub stats: GreedySearchStats,
 }
 
@@ -72,20 +81,23 @@ impl<QDM, PD> ListSearchResult<QDM, PD> {
             visited: vec![],
             inserted: HashSet::new(),
             sdm: None,
+            tie_break_item_pointer: None,
             stats: GreedySearchStats::new(),
         }
     }
 
     fn new<S: Storage<QueryDistanceMeasure = QDM, LSNPrivateData = PD>>(
         init_ids: Vec<ItemPointer>,
         sdm: S::QueryDistanceMeasure,
+        tie_break_item_pointer: Option<ItemPointer>,
         search_list_size: usize,
         meta_page: &MetaPage,
         gns: &GraphNeighborStore,
         storage: &S,
     ) -> Self {
         let neigbors = meta_page.get_num_neighbors() as usize;
         let mut res = Self {
+            tie_break_item_pointer,
             candidates: BinaryHeap::with_capacity(search_list_size * neigbors),
             visited: Vec::with_capacity(search_list_size * 2),
             //candidate_storage: Vec::with_capacity(search_list_size * neigbors),
@@ -107,8 +119,15 @@ impl<QDM, PD> ListSearchResult<QDM, PD> {
     }
 
     /// Internal function
-    pub fn insert_neighbor(&mut self, n: ListSearchNeighbor<PD>) {
+    pub fn insert_neighbor(&mut self, mut n: ListSearchNeighbor<PD>) {
         self.stats.record_candidate();
+        if n.distance == 0.0 {
+            /* record the tie break if distance is 0 */
+            if let Some(tie_break_item_pointer) = self.tie_break_item_pointer {
+                let d = tie_break_item_pointer.ip_distance(n.index_pointer);
+                n.distance_tie_break = d;
+            }
+        }
         self.candidates.push(Reverse(n));
     }
 
@@ -213,7 +232,7 @@ impl<'a> Graph<'a> {
 
         let (pruned, new_neighbors) =
             if candidates.len() > self.neighbor_store.max_neighbors(self.get_meta_page()) {
-                let new_list = self.prune_neighbors(candidates, storage, stats);
+                let new_list = self.prune_neighbors(neighbors_of, candidates, storage, stats);
                 (true, new_list)
             } else {
                 (false, candidates)
@@ -248,6 +267,7 @@ impl<'a> Graph<'a> {
     /// the returned ListSearchResult elements. It shouldn't be used with self.greedy_search_iterate
     fn greedy_search_for_build<S: Storage>(
         &self,
+        index_pointer: IndexPointer,
         query: PgVector,
         meta_page: &MetaPage,
         storage: &S,
@@ -264,6 +284,7 @@ impl<'a> Graph<'a> {
         let mut l = ListSearchResult::new(
             init_ids.unwrap(),
             dm,
+            Some(index_pointer),
             search_list_size,
             meta_page,
             self.get_neighbor_store(),
@@ -293,6 +314,7 @@ impl<'a> Graph<'a> {
         ListSearchResult::new(
             init_ids.unwrap(),
             dm,
+            None,
             search_list_size,
             &self.meta_page,
             self.get_neighbor_store(),
@@ -331,6 +353,7 @@ impl<'a> Graph<'a> {
     /// if we save the factors or the distances and add incrementally. Not sure.
     pub fn prune_neighbors<S: Storage>(
         &self,
+        neighbors_of: ItemPointer,
         mut candidates: Vec<NeighborWithDistance>,
         storage: &S,
         stats: &mut PruneNeighborStats,
@@ -419,17 +442,43 @@ impl<'a> Graph<'a> {
                     debug_assert!(distance_between_candidate_and_existing_neighbor >= 0.0);
 
                     //factor is high if the candidate is closer to an existing neighbor than the point it's being considered for
-                    let factor =
-                        if distance_between_candidate_and_existing_neighbor < 0.0 + f32::EPSILON {
-                            if distance_between_candidate_and_point < 0.0 + f32::EPSILON {
-                                1.0
+                    let factor = if distance_between_candidate_and_existing_neighbor
+                        < 0.0 + f32::EPSILON
+                    {
+                        if distance_between_candidate_and_point < 0.0 + f32::EPSILON {
+                            /* Both distances are 0. This is a special and interesting case because the neighbors of all
+                            other nodes will be the same on both nodes. This, in turn means, that we would have the same
+                            nieghbors on both. But, if the num_neighbors is small, we risk creating a unconnected subgraph all
+                            pointing to each other (all nodes at the same point). For this reason, we create a consistent way
+                            to rank the distance even at the same point (by using the item-pointer distance), and determine
+                            the factor according to this new distance. This means that some 0-distance node will be pruned at alpha=1.0
+
+                            Note: with sbq these equivalence relations are actually not uncommon */
+                            let ip_distance_between_candidate_and_point = candidate_neighbor
+                                .get_index_pointer_to_neighbor()
+                                .ip_distance(neighbors_of);
+
+                            let ip_distance_between_candidate_and_existing_neighbor =
+                                candidate_neighbor
+                                    .get_index_pointer_to_neighbor()
+                                    .ip_distance(existing_neighbor.get_index_pointer_to_neighbor());
+
+                            if ip_distance_between_candidate_and_point
+                                <= ip_distance_between_candidate_and_existing_neighbor
+                            {
+                                0.99
                             } else {
-                                f64::MAX
+                                /* make sure this gets pruned for alpha=1.0 to make room for other nodes at higher distances */
+                                1.01
                             }
                         } else {
-                            distance_between_candidate_and_point as f64
-                                / distance_between_candidate_and_existing_neighbor as f64
-                        };
+                            f64::MAX
+                        }
+                    } else {
+                        distance_between_candidate_and_point as f64
+                            / distance_between_candidate_and_existing_neighbor as f64
+                    };
+
                     max_factors[j] = max_factors[j].max(factor)
                 }
             }
@@ -466,8 +515,13 @@ impl<'a> Graph<'a> {
         let meta_page = self.get_meta_page();
 
         //TODO: make configurable?
-        let v =
-            self.greedy_search_for_build(vec, meta_page, storage, &mut stats.greedy_search_stats);
+        let v = self.greedy_search_for_build(
+            index_pointer,
+            vec,
+            meta_page,
+            storage,
+            &mut stats.greedy_search_stats,
+        );
 
         let (_, neighbor_list) = self.add_neighbors(
             storage,

diff --git a/pgvectorscale/src/util/mod.rs b/pgvectorscale/src/util/mod.rs
@@ -121,6 +121,16 @@ impl ItemPointer {
             len: len as _,
         }
     }
+
+    pub fn ip_distance(self, other: Self) -> usize {
+        /* distance measure based on on-disk distance */
+        /* The two abs() give better results than taking one abs() at the end. Not quite sure why but I think
+         * It creates more links within the equivalence class  */
+        let block_diff = (self.block_number as isize - other.block_number as isize).abs() as usize;
+        let offset_diff = (self.offset as isize - other.offset as isize).abs() as usize;
+        debug_assert!(offset_diff < pgrx::pg_sys::MaxOffsetNumber as _);
+        (block_diff * (pgrx::pg_sys::MaxOffsetNumber as usize) + offset_diff) as usize
+    }
 }
 
 pub type IndexPointer = ItemPointer;