Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix handling of many graph nodes at the same location #111

Merged
merged 1 commit into from
Aug 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions pgvectorscale/src/access_method/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ fn finalize_index_build<S: Storage>(
if neighbors.len() > state.graph.get_meta_page().get_num_neighbors() as _ {
//OPT: get rid of this clone
prune_neighbors = state.graph.prune_neighbors(
index_pointer,
neighbors.clone(),
storage,
&mut write_stats.prune_stats,
Expand Down Expand Up @@ -835,4 +836,101 @@ pub mod tests {

Ok(())
}

#[pg_test]
pub unsafe fn test_index_small_accuracy() -> spi::Result<()> {
/* Test for the creation of connected graphs when the number of dimensions is small as is the
number of neighborss */
/* small num_neighbors is especially challenging for making sure no nodes get disconnected */
let index_options = "num_neighbors=10, search_list_size=10";
let expected_cnt = 300;
let dimensions = 2;

Spi::run(&format!(
"CREATE TABLE test_data (
id int,
embedding vector ({dimensions})
);

select setseed(0.5);
-- generate 300 vectors
INSERT INTO test_data (id, embedding)
SELECT
*
FROM (
SELECT
i % {expected_cnt},
('[' || array_to_string(array_agg(random()), ',', '0') || ']')::vector AS embedding
FROM
generate_series(1, {dimensions} * {expected_cnt}) i
GROUP BY
i % {expected_cnt}) g;

CREATE INDEX idx_diskann_bq ON test_data USING diskann (embedding) WITH ({index_options});


SET enable_seqscan = 0;
-- perform index scans on the vectors
SELECT
*
FROM
test_data
ORDER BY
embedding <=> (
SELECT
('[' || array_to_string(array_agg(random()), ',', '0') || ']')::vector AS embedding
FROM generate_series(1, {dimensions}));"))?;

let test_vec: Option<Vec<f32>> = Spi::get_one(&format!(
"SELECT('{{' || array_to_string(array_agg(1.0), ',', '0') || '}}')::real[] AS embedding
FROM generate_series(1, {dimensions})"
))?;

let cnt: Option<i64> = Spi::get_one_with_args(
&format!(
"
SET enable_seqscan = 0;
SET enable_indexscan = 1;
SET diskann.query_search_list_size = 2;
WITH cte as (select * from test_data order by embedding <=> $1::vector) SELECT count(*) from cte;
",
),
vec![(
pgrx::PgOid::Custom(pgrx::pg_sys::FLOAT4ARRAYOID),
test_vec.clone().into_datum(),
)],
)?;

if cnt.unwrap() != expected_cnt {
/* better debugging */
let id: Option<i64> = Spi::get_one_with_args(
&format!(
"
SET enable_seqscan = 0;
SET enable_indexscan = 1;
SET diskann.query_search_list_size = 2;
WITH cte as (select id from test_data EXCEPT (select id from test_data order by embedding <=> $1::vector)) SELECT id from cte limit 1;
",
),
vec![(
pgrx::PgOid::Custom(pgrx::pg_sys::FLOAT4ARRAYOID),
test_vec.clone().into_datum(),
)],
)?;

assert!(
cnt.unwrap() == expected_cnt,
"initial count is {} id is {}",
cnt.unwrap(),
id.unwrap()
);
}

assert!(
cnt.unwrap() == expected_cnt,
"initial count is {}",
cnt.unwrap()
);
Ok(())
}
}
78 changes: 66 additions & 12 deletions pgvectorscale/src/access_method/graph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,18 @@ use super::{meta_page::MetaPage, neighbor_with_distance::NeighborWithDistance};
pub struct ListSearchNeighbor<PD> {
pub index_pointer: IndexPointer,
distance: f32,
distance_tie_break: usize, /* only used if distance = 0. This ensures a consistent order of results when distance = 0 */
private_data: PD,
}

impl<PD> PartialOrd for ListSearchNeighbor<PD> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
if self.distance == 0.0 && other.distance == 0.0 {
/* this logic should be consistent with what's used during pruning */
return self
.distance_tie_break
.partial_cmp(&other.distance_tie_break);
}
self.distance.partial_cmp(&other.distance)
}
}
Expand All @@ -49,6 +56,7 @@ impl<PD> ListSearchNeighbor<PD> {
index_pointer,
private_data,
distance,
distance_tie_break: 0,
}
}

Expand All @@ -62,6 +70,7 @@ pub struct ListSearchResult<QDM, PD> {
visited: Vec<ListSearchNeighbor<PD>>,
inserted: HashSet<ItemPointer>,
pub sdm: Option<QDM>,
tie_break_item_pointer: Option<ItemPointer>, /* This records the item pointer of the query. It's used for tie-breaking when the distance = 0 */
pub stats: GreedySearchStats,
}

Expand All @@ -72,20 +81,23 @@ impl<QDM, PD> ListSearchResult<QDM, PD> {
visited: vec![],
inserted: HashSet::new(),
sdm: None,
tie_break_item_pointer: None,
stats: GreedySearchStats::new(),
}
}

fn new<S: Storage<QueryDistanceMeasure = QDM, LSNPrivateData = PD>>(
init_ids: Vec<ItemPointer>,
sdm: S::QueryDistanceMeasure,
tie_break_item_pointer: Option<ItemPointer>,
search_list_size: usize,
meta_page: &MetaPage,
gns: &GraphNeighborStore,
storage: &S,
) -> Self {
let neigbors = meta_page.get_num_neighbors() as usize;
let mut res = Self {
tie_break_item_pointer,
candidates: BinaryHeap::with_capacity(search_list_size * neigbors),
visited: Vec::with_capacity(search_list_size * 2),
//candidate_storage: Vec::with_capacity(search_list_size * neigbors),
Expand All @@ -107,8 +119,15 @@ impl<QDM, PD> ListSearchResult<QDM, PD> {
}

/// Internal function
pub fn insert_neighbor(&mut self, n: ListSearchNeighbor<PD>) {
pub fn insert_neighbor(&mut self, mut n: ListSearchNeighbor<PD>) {
self.stats.record_candidate();
if n.distance == 0.0 {
/* record the tie break if distance is 0 */
if let Some(tie_break_item_pointer) = self.tie_break_item_pointer {
let d = tie_break_item_pointer.ip_distance(n.index_pointer);
n.distance_tie_break = d;
}
}
self.candidates.push(Reverse(n));
}

Expand Down Expand Up @@ -213,7 +232,7 @@ impl<'a> Graph<'a> {

let (pruned, new_neighbors) =
if candidates.len() > self.neighbor_store.max_neighbors(self.get_meta_page()) {
let new_list = self.prune_neighbors(candidates, storage, stats);
let new_list = self.prune_neighbors(neighbors_of, candidates, storage, stats);
(true, new_list)
} else {
(false, candidates)
Expand Down Expand Up @@ -248,6 +267,7 @@ impl<'a> Graph<'a> {
/// the returned ListSearchResult elements. It shouldn't be used with self.greedy_search_iterate
fn greedy_search_for_build<S: Storage>(
&self,
index_pointer: IndexPointer,
query: PgVector,
meta_page: &MetaPage,
storage: &S,
Expand All @@ -264,6 +284,7 @@ impl<'a> Graph<'a> {
let mut l = ListSearchResult::new(
init_ids.unwrap(),
dm,
Some(index_pointer),
search_list_size,
meta_page,
self.get_neighbor_store(),
Expand Down Expand Up @@ -293,6 +314,7 @@ impl<'a> Graph<'a> {
ListSearchResult::new(
init_ids.unwrap(),
dm,
None,
search_list_size,
&self.meta_page,
self.get_neighbor_store(),
Expand Down Expand Up @@ -331,6 +353,7 @@ impl<'a> Graph<'a> {
/// if we save the factors or the distances and add incrementally. Not sure.
pub fn prune_neighbors<S: Storage>(
&self,
neighbors_of: ItemPointer,
mut candidates: Vec<NeighborWithDistance>,
storage: &S,
stats: &mut PruneNeighborStats,
Expand Down Expand Up @@ -419,17 +442,43 @@ impl<'a> Graph<'a> {
debug_assert!(distance_between_candidate_and_existing_neighbor >= 0.0);

//factor is high if the candidate is closer to an existing neighbor than the point it's being considered for
let factor =
if distance_between_candidate_and_existing_neighbor < 0.0 + f32::EPSILON {
if distance_between_candidate_and_point < 0.0 + f32::EPSILON {
1.0
let factor = if distance_between_candidate_and_existing_neighbor
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: the prune_neighborgs functions is fairly big. Maybe you can abstract the factor calculation away to its own function. That way you can add docs specific to the factor calculation. Also, you can use guard clauses to remove some of the ifs.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm going to ignore this for now. I think we may refactor this later anyway.

< 0.0 + f32::EPSILON
{
if distance_between_candidate_and_point < 0.0 + f32::EPSILON {
/* Both distances are 0. This is a special and interesting case because the neighbors of all
other nodes will be the same on both nodes. This, in turn means, that we would have the same
nieghbors on both. But, if the num_neighbors is small, we risk creating a unconnected subgraph all
pointing to each other (all nodes at the same point). For this reason, we create a consistent way
to rank the distance even at the same point (by using the item-pointer distance), and determine
the factor according to this new distance. This means that some 0-distance node will be pruned at alpha=1.0

Note: with sbq these equivalence relations are actually not uncommon */
let ip_distance_between_candidate_and_point = candidate_neighbor
.get_index_pointer_to_neighbor()
.ip_distance(neighbors_of);

let ip_distance_between_candidate_and_existing_neighbor =
candidate_neighbor
.get_index_pointer_to_neighbor()
.ip_distance(existing_neighbor.get_index_pointer_to_neighbor());

if ip_distance_between_candidate_and_point
<= ip_distance_between_candidate_and_existing_neighbor
{
0.99
} else {
f64::MAX
/* make sure this gets pruned for alpha=1.0 to make room for other nodes at higher distances */
1.01
}
} else {
distance_between_candidate_and_point as f64
/ distance_between_candidate_and_existing_neighbor as f64
};
f64::MAX
}
} else {
distance_between_candidate_and_point as f64
/ distance_between_candidate_and_existing_neighbor as f64
};

max_factors[j] = max_factors[j].max(factor)
}
}
Expand Down Expand Up @@ -466,8 +515,13 @@ impl<'a> Graph<'a> {
let meta_page = self.get_meta_page();

//TODO: make configurable?
let v =
self.greedy_search_for_build(vec, meta_page, storage, &mut stats.greedy_search_stats);
let v = self.greedy_search_for_build(
index_pointer,
vec,
meta_page,
storage,
&mut stats.greedy_search_stats,
);

let (_, neighbor_list) = self.add_neighbors(
storage,
Expand Down
10 changes: 10 additions & 0 deletions pgvectorscale/src/util/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,16 @@ impl ItemPointer {
len: len as _,
}
}

pub fn ip_distance(self, other: Self) -> usize {
cevian marked this conversation as resolved.
Show resolved Hide resolved
/* distance measure based on on-disk distance */
/* The two abs() give better results than taking one abs() at the end. Not quite sure why but I think
* It creates more links within the equivalence class */
let block_diff = (self.block_number as isize - other.block_number as isize).abs() as usize;
let offset_diff = (self.offset as isize - other.offset as isize).abs() as usize;
debug_assert!(offset_diff < pgrx::pg_sys::MaxOffsetNumber as _);
(block_diff * (pgrx::pg_sys::MaxOffsetNumber as usize) + offset_diff) as usize
}
}

pub type IndexPointer = ItemPointer;
Expand Down