Skip to content

Commit

Permalink
Fix handling of many graph nodes at the same location
Browse files Browse the repository at this point in the history
When nodes have the same location, their distance to each other is 0.
And, their distance to all their neighbors is the same. In the case
with few maximum neighbors this can cause problems because the nodes
end up linking to each other and getting cut off from the rest of
the graph.

We fix this by introducing a secondary "tie break" distance between
nodes at the same location (based on on-disk distance). That's
used to create a secondary ordering for equivalent nodes and allows
the prune function to then properly cut away equivalent nodes so
that your neighbor list doesn't get full of them.

Note with SBQ, equivalence classes are actually not /so/ rare especially
with low dimension counts.
  • Loading branch information
cevian committed Aug 6, 2024
1 parent 4f46878 commit e698c59
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 12 deletions.
98 changes: 98 additions & 0 deletions pgvectorscale/src/access_method/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ fn finalize_index_build<S: Storage>(
if neighbors.len() > state.graph.get_meta_page().get_num_neighbors() as _ {
//OPT: get rid of this clone
prune_neighbors = state.graph.prune_neighbors(
index_pointer,
neighbors.clone(),
storage,
&mut write_stats.prune_stats,
Expand Down Expand Up @@ -835,4 +836,101 @@ pub mod tests {

Ok(())
}

#[pg_test]
pub unsafe fn test_index_small_accuracy() -> spi::Result<()> {
/* Test for the creation of connected graphs when the number of dimensions is small as is the
number of neighborss */
/* small num_neighbors is especially challenging for making sure no nodes get disconnected */
let index_options = "num_neighbors=10, search_list_size=10";
let expected_cnt = 300;
let dimensions = 2;

Spi::run(&format!(
"CREATE TABLE test_data (
id int,
embedding vector ({dimensions})
);
select setseed(0.5);
-- generate 300 vectors
INSERT INTO test_data (id, embedding)
SELECT
*
FROM (
SELECT
i % {expected_cnt},
('[' || array_to_string(array_agg(random()), ',', '0') || ']')::vector AS embedding
FROM
generate_series(1, {dimensions} * {expected_cnt}) i
GROUP BY
i % {expected_cnt}) g;
CREATE INDEX idx_diskann_bq ON test_data USING diskann (embedding) WITH ({index_options});
SET enable_seqscan = 0;
-- perform index scans on the vectors
SELECT
*
FROM
test_data
ORDER BY
embedding <=> (
SELECT
('[' || array_to_string(array_agg(random()), ',', '0') || ']')::vector AS embedding
FROM generate_series(1, {dimensions}));"))?;

let test_vec: Option<Vec<f32>> = Spi::get_one(&format!(
"SELECT('{{' || array_to_string(array_agg(1.0), ',', '0') || '}}')::real[] AS embedding
FROM generate_series(1, {dimensions})"
))?;

let cnt: Option<i64> = Spi::get_one_with_args(
&format!(
"
SET enable_seqscan = 0;
SET enable_indexscan = 1;
SET diskann.query_search_list_size = 2;
WITH cte as (select * from test_data order by embedding <=> $1::vector) SELECT count(*) from cte;
",
),
vec![(
pgrx::PgOid::Custom(pgrx::pg_sys::FLOAT4ARRAYOID),
test_vec.clone().into_datum(),
)],
)?;

if cnt.unwrap() != expected_cnt {
/* better debugging */
let id: Option<i64> = Spi::get_one_with_args(
&format!(
"
SET enable_seqscan = 0;
SET enable_indexscan = 1;
SET diskann.query_search_list_size = 2;
WITH cte as (select id from test_data EXCEPT (select id from test_data order by embedding <=> $1::vector)) SELECT id from cte limit 1;
",
),
vec![(
pgrx::PgOid::Custom(pgrx::pg_sys::FLOAT4ARRAYOID),
test_vec.clone().into_datum(),
)],
)?;

assert!(
cnt.unwrap() == expected_cnt,
"initial count is {} id is {}",
cnt.unwrap(),
id.unwrap()
);
}

assert!(
cnt.unwrap() == expected_cnt,
"initial count is {}",
cnt.unwrap()
);
Ok(())
}
}
78 changes: 66 additions & 12 deletions pgvectorscale/src/access_method/graph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,18 @@ use super::{meta_page::MetaPage, neighbor_with_distance::NeighborWithDistance};
pub struct ListSearchNeighbor<PD> {
pub index_pointer: IndexPointer,
distance: f32,
distance_tie_break: usize, /* only used if distance = 0. This ensures a consistent order of results when distance = 0 */
private_data: PD,
}

impl<PD> PartialOrd for ListSearchNeighbor<PD> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
if self.distance == 0.0 && other.distance == 0.0 {
/* this logic should be consistent with what's used during pruning */
return self
.distance_tie_break
.partial_cmp(&other.distance_tie_break);
}
self.distance.partial_cmp(&other.distance)
}
}
Expand All @@ -49,6 +56,7 @@ impl<PD> ListSearchNeighbor<PD> {
index_pointer,
private_data,
distance,
distance_tie_break: 0,
}
}

Expand All @@ -62,6 +70,7 @@ pub struct ListSearchResult<QDM, PD> {
visited: Vec<ListSearchNeighbor<PD>>,
inserted: HashSet<ItemPointer>,
pub sdm: Option<QDM>,
tie_break_item_pointer: Option<ItemPointer>, /* This records the item pointer of the query. It's used for tie-breaking when the distance = 0 */
pub stats: GreedySearchStats,
}

Expand All @@ -72,20 +81,23 @@ impl<QDM, PD> ListSearchResult<QDM, PD> {
visited: vec![],
inserted: HashSet::new(),
sdm: None,
tie_break_item_pointer: None,
stats: GreedySearchStats::new(),
}
}

fn new<S: Storage<QueryDistanceMeasure = QDM, LSNPrivateData = PD>>(
init_ids: Vec<ItemPointer>,
sdm: S::QueryDistanceMeasure,
tie_break_item_pointer: Option<ItemPointer>,
search_list_size: usize,
meta_page: &MetaPage,
gns: &GraphNeighborStore,
storage: &S,
) -> Self {
let neigbors = meta_page.get_num_neighbors() as usize;
let mut res = Self {
tie_break_item_pointer,
candidates: BinaryHeap::with_capacity(search_list_size * neigbors),
visited: Vec::with_capacity(search_list_size * 2),
//candidate_storage: Vec::with_capacity(search_list_size * neigbors),
Expand All @@ -107,8 +119,15 @@ impl<QDM, PD> ListSearchResult<QDM, PD> {
}

/// Internal function
pub fn insert_neighbor(&mut self, n: ListSearchNeighbor<PD>) {
pub fn insert_neighbor(&mut self, mut n: ListSearchNeighbor<PD>) {
self.stats.record_candidate();
if n.distance == 0.0 {
/* record the tie break if distance is 0 */
if let Some(tie_break_item_pointer) = self.tie_break_item_pointer {
let d = tie_break_item_pointer.ip_distance(n.index_pointer);
n.distance_tie_break = d;
}
}
self.candidates.push(Reverse(n));
}

Expand Down Expand Up @@ -213,7 +232,7 @@ impl<'a> Graph<'a> {

let (pruned, new_neighbors) =
if candidates.len() > self.neighbor_store.max_neighbors(self.get_meta_page()) {
let new_list = self.prune_neighbors(candidates, storage, stats);
let new_list = self.prune_neighbors(neighbors_of, candidates, storage, stats);
(true, new_list)
} else {
(false, candidates)
Expand Down Expand Up @@ -248,6 +267,7 @@ impl<'a> Graph<'a> {
/// the returned ListSearchResult elements. It shouldn't be used with self.greedy_search_iterate
fn greedy_search_for_build<S: Storage>(
&self,
index_pointer: IndexPointer,
query: PgVector,
meta_page: &MetaPage,
storage: &S,
Expand All @@ -264,6 +284,7 @@ impl<'a> Graph<'a> {
let mut l = ListSearchResult::new(
init_ids.unwrap(),
dm,
Some(index_pointer),
search_list_size,
meta_page,
self.get_neighbor_store(),
Expand Down Expand Up @@ -293,6 +314,7 @@ impl<'a> Graph<'a> {
ListSearchResult::new(
init_ids.unwrap(),
dm,
None,
search_list_size,
&self.meta_page,
self.get_neighbor_store(),
Expand Down Expand Up @@ -331,6 +353,7 @@ impl<'a> Graph<'a> {
/// if we save the factors or the distances and add incrementally. Not sure.
pub fn prune_neighbors<S: Storage>(
&self,
neighbors_of: ItemPointer,
mut candidates: Vec<NeighborWithDistance>,
storage: &S,
stats: &mut PruneNeighborStats,
Expand Down Expand Up @@ -419,17 +442,43 @@ impl<'a> Graph<'a> {
debug_assert!(distance_between_candidate_and_existing_neighbor >= 0.0);

//factor is high if the candidate is closer to an existing neighbor than the point it's being considered for
let factor =
if distance_between_candidate_and_existing_neighbor < 0.0 + f32::EPSILON {
if distance_between_candidate_and_point < 0.0 + f32::EPSILON {
1.0
let factor = if distance_between_candidate_and_existing_neighbor
< 0.0 + f32::EPSILON
{
if distance_between_candidate_and_point < 0.0 + f32::EPSILON {
/* Both distances are 0. This is a special and interesting case because the neighbors of all
other nodes will be the same on both nodes. This, in turn means, that we would have the same
nieghbors on both. But, if the num_neighbors is small, we risk creating a unconnected subgraph all
pointing to each other (all nodes at the same point). For this reason, we create a consistent way
to rank the distance even at the same point (by using the item-pointer distance), and determine
the factor according to this new distance. This means that some 0-distance node will be pruned at alpha=1.0
Note: with sbq these equivalence relations are actually not uncommon */
let ip_distance_between_candidate_and_point = candidate_neighbor
.get_index_pointer_to_neighbor()
.ip_distance(neighbors_of);

let ip_distance_between_candidate_and_existing_neighbor =
candidate_neighbor
.get_index_pointer_to_neighbor()
.ip_distance(existing_neighbor.get_index_pointer_to_neighbor());

if ip_distance_between_candidate_and_point
<= ip_distance_between_candidate_and_existing_neighbor
{
0.99
} else {
f64::MAX
/* make sure this gets pruned for alpha=1.0 to make room for other nodes at higher distances */
1.01
}
} else {
distance_between_candidate_and_point as f64
/ distance_between_candidate_and_existing_neighbor as f64
};
f64::MAX
}
} else {
distance_between_candidate_and_point as f64
/ distance_between_candidate_and_existing_neighbor as f64
};

max_factors[j] = max_factors[j].max(factor)
}
}
Expand Down Expand Up @@ -466,8 +515,13 @@ impl<'a> Graph<'a> {
let meta_page = self.get_meta_page();

//TODO: make configurable?
let v =
self.greedy_search_for_build(vec, meta_page, storage, &mut stats.greedy_search_stats);
let v = self.greedy_search_for_build(
index_pointer,
vec,
meta_page,
storage,
&mut stats.greedy_search_stats,
);

let (_, neighbor_list) = self.add_neighbors(
storage,
Expand Down
10 changes: 10 additions & 0 deletions pgvectorscale/src/util/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,16 @@ impl ItemPointer {
len: len as _,
}
}

pub fn ip_distance(self, other: Self) -> usize {
/* distance measure based on on-disk distance */
/* The two abs() give better results than taking one abs() at the end. Not quite sure why but I think
* It creates more links within the equivalence class */
let block_diff = (self.block_number as isize - other.block_number as isize).abs() as usize;
let offset_diff = (self.offset as isize - other.offset as isize).abs() as usize;
debug_assert!(offset_diff < pgrx::pg_sys::MaxOffsetNumber as _);
(block_diff * (pgrx::pg_sys::MaxOffsetNumber as usize) + offset_diff) as usize
}
}

pub type IndexPointer = ItemPointer;
Expand Down

0 comments on commit e698c59

Please sign in to comment.