From e3e0d69f0572a925647b6794d4c99c0c8c9a15bc Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Mon, 26 Mar 2018 19:08:22 +0300 Subject: [PATCH] rebalancer: introduce pinned bucket concept into rebalancer algo Pinned bucket is the bucket, that can not be sent out of its replicaset. Taking pinned buckets into account changes rebalancer algorithm, since now on some replicasets the perfect balance can not be reached. Iterative algorithm is used to learn the best balance in a cluster. On each step it calculates perfect bucket count for each replicaset. If this count can not be satisfied due to pinned buckets, the algorithm does best effort to get the perfect balance. This is done via ignoring of replicasets disbalanced via pinning, and their pinned buckets. After that a new balance is calculated. And it can happen, that it can not be satisfied too. It is possible, because ignoring of pinned buckets in overpopulated replicasets leads to decrease of perfect bucket count in other replicasets, and a new values can become less that their pinned bucket count. Part of #71 --- test/unit/rebalancer.result | 333 +++++++++++++++++++++++++++++++++- test/unit/rebalancer.test.lua | 79 ++++++++ vshard/consts.lua | 1 + vshard/replicaset.lua | 102 ++++++++--- vshard/storage/init.lua | 22 ++- 5 files changed, 500 insertions(+), 37 deletions(-) diff --git a/test/unit/rebalancer.result b/test/unit/rebalancer.result index af909eba..ff7fcfc6 100644 --- a/test/unit/rebalancer.result +++ b/test/unit/rebalancer.result @@ -369,7 +369,8 @@ _bucket:replace{3, consts.BUCKET.SENT} ... get_state() --- -- 2 +- bucket_active_count: 2 + bucket_pinned_count: 0 ... _bucket:replace{1, consts.BUCKET.RECEIVING} --- @@ -643,6 +644,336 @@ replicasets ethalon_bucket_count: 34 bucket_count: 25 ... +-- +-- gh-71: allow to pin buckets. A pinned bucket can not be sent +-- out of its replicaset even to satisfy perfect balance. +-- +-- For this case the rebalancer does best effort balance. The +-- perfect balance here is unrechable, since on each replicaset +-- too many buckets are pinned. +test_run:cmd("setopt delimiter ';'") +--- +- true +... +replicasets = { + uuid1 = {bucket_count = 33, pinned_count = 26, weight = 1}, + uuid2 = {bucket_count = 33, pinned_count = 24, weight = 1}, + uuid3 = {bucket_count = 34, pinned_count = 30, weight = 1}, + uuid4 = {bucket_count = 0, weight = 1}, +}; +--- +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... +calc_ethalon(replicasets, 100) +--- +... +replicasets +--- +- uuid4: + weight: 1 + ethalon_bucket_count: 20 + bucket_count: 0 + uuid1: + bucket_count: 33 + ethalon_bucket_count: 26 + ignore_disbalance: true + weight: 1 + pinned_count: 26 + uuid3: + bucket_count: 34 + ethalon_bucket_count: 30 + ignore_disbalance: true + weight: 1 + pinned_count: 30 + uuid2: + bucket_count: 33 + ethalon_bucket_count: 24 + ignore_disbalance: true + weight: 1 + pinned_count: 24 +... +calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING) +--- +- 100 +... +replicasets +--- +- uuid4: + needed: 20 + weight: 1 + ethalon_bucket_count: 20 + bucket_count: 0 + uuid1: + bucket_count: 33 + ethalon_bucket_count: 26 + ignore_disbalance: true + needed: -7 + weight: 1 + pinned_count: 26 + uuid3: + bucket_count: 34 + ethalon_bucket_count: 30 + ignore_disbalance: true + needed: -4 + weight: 1 + pinned_count: 30 + uuid2: + bucket_count: 33 + ethalon_bucket_count: 24 + ignore_disbalance: true + needed: -9 + weight: 1 + pinned_count: 24 +... +-- +-- Here the disbalance is ok for the replicaset with uuid1 only - +-- other replicasets have pinned buckets too, but not enough to +-- break the balance: buckets are moved ok to uuid4. +-- +test_run:cmd("setopt delimiter ';'") +--- +- true +... +replicasets = { + uuid1 = {bucket_count = 33, pinned_count = 30, weight = 1}, + uuid2 = {bucket_count = 33, pinned_count = 10, weight = 1}, + uuid3 = {bucket_count = 34, pinned_count = 15, weight = 1}, + uuid4 = {bucket_count = 0, weight = 1}, +}; +--- +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... +calc_ethalon(replicasets, 100) +--- +... +replicasets +--- +- uuid4: + weight: 1 + ethalon_bucket_count: 23 + bucket_count: 0 + uuid1: + bucket_count: 33 + ethalon_bucket_count: 30 + ignore_disbalance: true + weight: 1 + pinned_count: 30 + uuid3: + ethalon_bucket_count: 23 + bucket_count: 34 + pinned_count: 15 + weight: 1 + uuid2: + ethalon_bucket_count: 24 + bucket_count: 33 + pinned_count: 10 + weight: 1 +... +calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING) +--- +- 100 +... +replicasets +--- +- uuid4: + needed: 23 + weight: 1 + ethalon_bucket_count: 23 + bucket_count: 0 + uuid1: + bucket_count: 33 + ethalon_bucket_count: 30 + ignore_disbalance: true + needed: -3 + weight: 1 + pinned_count: 30 + uuid3: + bucket_count: 34 + ethalon_bucket_count: 23 + needed: -11 + weight: 1 + pinned_count: 15 + uuid2: + bucket_count: 33 + ethalon_bucket_count: 24 + needed: -9 + weight: 1 + pinned_count: 10 +... +-- +-- Non-locked replicaset with any pinned bucket count can receive +-- more buckets, if the rebalancer decides it is the best balance. +-- +test_run:cmd("setopt delimiter ';'") +--- +- true +... +replicasets = { + uuid1 = {bucket_count = 30, pinned_count = 25, weight = 0}, + uuid2 = {bucket_count = 25, pinned_count = 25, weight = 1}, + uuid3 = {bucket_count = 25, pinned_count = 25, weight = 1}, + uuid4 = {bucket_count = 20, weight = 0}, +}; +--- +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... +calc_ethalon(replicasets, 100) +--- +... +replicasets +--- +- uuid4: + weight: 0 + ethalon_bucket_count: 0 + bucket_count: 20 + uuid1: + bucket_count: 30 + ethalon_bucket_count: 25 + ignore_disbalance: true + weight: 0 + pinned_count: 25 + uuid3: + ethalon_bucket_count: 37 + bucket_count: 25 + pinned_count: 25 + weight: 1 + uuid2: + ethalon_bucket_count: 38 + bucket_count: 25 + pinned_count: 25 + weight: 1 +... +calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING) +--- +- inf +... +replicasets +--- +- uuid4: + needed: -20 + weight: 0 + ethalon_bucket_count: 0 + bucket_count: 20 + uuid1: + bucket_count: 30 + ethalon_bucket_count: 25 + ignore_disbalance: true + needed: -5 + weight: 0 + pinned_count: 25 + uuid3: + bucket_count: 25 + ethalon_bucket_count: 37 + needed: 12 + weight: 1 + pinned_count: 25 + uuid2: + bucket_count: 25 + ethalon_bucket_count: 38 + needed: 13 + weight: 1 + pinned_count: 25 +... +-- +-- Check that the rebalancer can calculate a complex case, when a +-- perfect balance is learned in several steps of the algorithm. +-- Here on the first step it is calculated, that each replicaset +-- must contain 25 buckets. But UUID1 can not satisfy it, so it +-- is ignored. On the next step there are 100 - 30 pinned buckets +-- from UUID1 = 70 buckets, and 3 replicasets. A new perfect +-- balance is 23-23-24. But it can not be satisfied too - UUID2 +-- has 25 pinned buckets, so it is ignored. On the third step +-- there are 70 - 25 = 45 buckets and 2 replicasets. A new perfect +-- balance is 22-23. But it is unreachable too, because UUID3 has +-- 24 pinned buckets. So only UUID4 is not ignored, and it +-- receives all non-pinned buckets: 45 - 24 = 21. +-- +test_run:cmd("setopt delimiter ';'") +--- +- true +... +replicasets = { + uuid1 = {bucket_count = 33, pinned_count = 30, weight = 1}, + uuid2 = {bucket_count = 33, pinned_count = 25, weight = 1}, + uuid3 = {bucket_count = 34, pinned_count = 24, weight = 1}, + uuid4 = {bucket_count = 0, weight = 1}, +}; +--- +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... +calc_ethalon(replicasets, 100) +--- +... +replicasets +--- +- uuid4: + weight: 1 + ethalon_bucket_count: 21 + bucket_count: 0 + uuid1: + bucket_count: 33 + ethalon_bucket_count: 30 + ignore_disbalance: true + weight: 1 + pinned_count: 30 + uuid3: + bucket_count: 34 + ethalon_bucket_count: 24 + ignore_disbalance: true + weight: 1 + pinned_count: 24 + uuid2: + bucket_count: 33 + ethalon_bucket_count: 25 + ignore_disbalance: true + weight: 1 + pinned_count: 25 +... +calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING) +--- +- 100 +... +replicasets +--- +- uuid4: + needed: 21 + weight: 1 + ethalon_bucket_count: 21 + bucket_count: 0 + uuid1: + bucket_count: 33 + ethalon_bucket_count: 30 + ignore_disbalance: true + needed: -3 + weight: 1 + pinned_count: 30 + uuid3: + bucket_count: 34 + ethalon_bucket_count: 24 + ignore_disbalance: true + needed: -10 + weight: 1 + pinned_count: 24 + uuid2: + bucket_count: 33 + ethalon_bucket_count: 25 + ignore_disbalance: true + needed: -8 + weight: 1 + pinned_count: 25 +... _bucket:drop() --- ... diff --git a/test/unit/rebalancer.test.lua b/test/unit/rebalancer.test.lua index d22116bf..71411bb1 100644 --- a/test/unit/rebalancer.test.lua +++ b/test/unit/rebalancer.test.lua @@ -162,4 +162,83 @@ test_run:cmd("setopt delimiter ''"); calc_ethalon(replicasets, 100) replicasets +-- +-- gh-71: allow to pin buckets. A pinned bucket can not be sent +-- out of its replicaset even to satisfy perfect balance. +-- +-- For this case the rebalancer does best effort balance. The +-- perfect balance here is unrechable, since on each replicaset +-- too many buckets are pinned. +test_run:cmd("setopt delimiter ';'") +replicasets = { + uuid1 = {bucket_count = 33, pinned_count = 26, weight = 1}, + uuid2 = {bucket_count = 33, pinned_count = 24, weight = 1}, + uuid3 = {bucket_count = 34, pinned_count = 30, weight = 1}, + uuid4 = {bucket_count = 0, weight = 1}, +}; +test_run:cmd("setopt delimiter ''"); +calc_ethalon(replicasets, 100) +replicasets +calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING) +replicasets +-- +-- Here the disbalance is ok for the replicaset with uuid1 only - +-- other replicasets have pinned buckets too, but not enough to +-- break the balance: buckets are moved ok to uuid4. +-- +test_run:cmd("setopt delimiter ';'") +replicasets = { + uuid1 = {bucket_count = 33, pinned_count = 30, weight = 1}, + uuid2 = {bucket_count = 33, pinned_count = 10, weight = 1}, + uuid3 = {bucket_count = 34, pinned_count = 15, weight = 1}, + uuid4 = {bucket_count = 0, weight = 1}, +}; +test_run:cmd("setopt delimiter ''"); +calc_ethalon(replicasets, 100) +replicasets +calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING) +replicasets +-- +-- Non-locked replicaset with any pinned bucket count can receive +-- more buckets, if the rebalancer decides it is the best balance. +-- +test_run:cmd("setopt delimiter ';'") +replicasets = { + uuid1 = {bucket_count = 30, pinned_count = 25, weight = 0}, + uuid2 = {bucket_count = 25, pinned_count = 25, weight = 1}, + uuid3 = {bucket_count = 25, pinned_count = 25, weight = 1}, + uuid4 = {bucket_count = 20, weight = 0}, +}; +test_run:cmd("setopt delimiter ''"); +calc_ethalon(replicasets, 100) +replicasets +calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING) +replicasets +-- +-- Check that the rebalancer can calculate a complex case, when a +-- perfect balance is learned in several steps of the algorithm. +-- Here on the first step it is calculated, that each replicaset +-- must contain 25 buckets. But UUID1 can not satisfy it, so it +-- is ignored. On the next step there are 100 - 30 pinned buckets +-- from UUID1 = 70 buckets, and 3 replicasets. A new perfect +-- balance is 23-23-24. But it can not be satisfied too - UUID2 +-- has 25 pinned buckets, so it is ignored. On the third step +-- there are 70 - 25 = 45 buckets and 2 replicasets. A new perfect +-- balance is 22-23. But it is unreachable too, because UUID3 has +-- 24 pinned buckets. So only UUID4 is not ignored, and it +-- receives all non-pinned buckets: 45 - 24 = 21. +-- +test_run:cmd("setopt delimiter ';'") +replicasets = { + uuid1 = {bucket_count = 33, pinned_count = 30, weight = 1}, + uuid2 = {bucket_count = 33, pinned_count = 25, weight = 1}, + uuid3 = {bucket_count = 34, pinned_count = 24, weight = 1}, + uuid4 = {bucket_count = 0, weight = 1}, +}; +test_run:cmd("setopt delimiter ''"); +calc_ethalon(replicasets, 100) +replicasets +calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING) +replicasets + _bucket:drop() diff --git a/vshard/consts.lua b/vshard/consts.lua index 1a0e4ab7..db0daf46 100644 --- a/vshard/consts.lua +++ b/vshard/consts.lua @@ -2,6 +2,7 @@ return { -- Bucket FSM BUCKET = { ACTIVE = 'active', + PINNED = 'pinned', SENDING = 'sending', SENT = 'sent', RECEIVING = 'receiving', diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua index 724b1b39..ecb077b1 100644 --- a/vshard/replicaset.lua +++ b/vshard/replicaset.lua @@ -396,42 +396,90 @@ local replica_mt = { -- -- Calculate for each replicaset its ethalon bucket count. +-- Iterative algorithm is used to learn the best balance in a +-- cluster. On each step it calculates perfect bucket count for +-- each replicaset. If this count can not be satisfied due to +-- pinned buckets, the algorithm does best effort to get the +-- perfect balance. This is done via ignoring of replicasets +-- disbalanced via pinning, and their pinned buckets. After that a +-- new balance is calculated. And it can happen, that it can not +-- be satisfied too. It is possible, because ignoring of pinned +-- buckets in overpopulated replicasets leads to decrease of +-- perfect bucket count in other replicasets, and a new values can +-- become less that their pinned bucket count. +-- +-- On each step the algorithm either is finished, or ignores at +-- least one new overpopulated replicaset, so it has complexity +-- O(N), where N - replicaset count. -- local function cluster_calculate_ethalon_balance(replicasets, bucket_count) + local is_balance_found = false local weight_sum = 0 + local step_count = 0 + local replicaset_count = 0 for _, replicaset in pairs(replicasets) do weight_sum = weight_sum + replicaset.weight + replicaset_count = replicaset_count + 1 end - assert(weight_sum > 0) - local bucket_per_weight = bucket_count / weight_sum - local buckets_calculated = 0 - for _, replicaset in pairs(replicasets) do - replicaset.ethalon_bucket_count = - math.ceil(replicaset.weight * bucket_per_weight) - buckets_calculated = - buckets_calculated + replicaset.ethalon_bucket_count - end - if buckets_calculated == bucket_count then - return - end - -- A situation is possible, when bucket_per_weight is not - -- integer. Lets spread this disbalance over cluster to - -- make for any replicaset pair - -- |replicaset_1 - replicaset_2| <= 1 - this difference is - -- admissible. - local buckets_rest = buckets_calculated - bucket_count - for _, replicaset in pairs(replicasets) do - local ceil = math.ceil(replicaset.weight * bucket_per_weight) - local floor = math.floor(replicaset.weight * bucket_per_weight) - if replicaset.ethalon_bucket_count > 0 and ceil ~= floor then - replicaset.ethalon_bucket_count = replicaset.ethalon_bucket_count - 1 - buckets_rest = buckets_rest - 1 - if buckets_rest == 0 then - return + while not is_balance_found do + step_count = step_count + 1 + assert(weight_sum > 0) + local bucket_per_weight = bucket_count / weight_sum + local buckets_calculated = 0 + for _, replicaset in pairs(replicasets) do + if not replicaset.ignore_disbalance then + replicaset.ethalon_bucket_count = + math.ceil(replicaset.weight * bucket_per_weight) + buckets_calculated = + buckets_calculated + replicaset.ethalon_bucket_count end end + local buckets_rest = buckets_calculated - bucket_count + is_balance_found = true + for _, replicaset in pairs(replicasets) do + if not replicaset.ignore_disbalance then + -- A situation is possible, when bucket_per_weight + -- is not integer. Lets spread this disbalance + -- over the cluster. + if buckets_rest > 0 then + local n = replicaset.weight * bucket_per_weight + local ceil = math.ceil(n) + local floor = math.floor(n) + if replicaset.ethalon_bucket_count > 0 and ceil ~= floor then + replicaset.ethalon_bucket_count = + replicaset.ethalon_bucket_count - 1 + buckets_rest = buckets_rest - 1 + end + end + -- + -- Search for incorrigible disbalance due to + -- pinned buckets. + -- + local pinned = replicaset.pinned_count + if pinned and replicaset.ethalon_bucket_count < pinned then + -- This replicaset can not send out enough + -- buckets to reach a balance. So do the best + -- effort balance by sending from the + -- replicaset though non-pinned buckets. This + -- replicaset and its pinned buckets does not + -- participate in the next steps of balance + -- calculation. + is_balance_found = false + bucket_count = bucket_count - replicaset.pinned_count + replicaset.ethalon_bucket_count = replicaset.pinned_count + replicaset.ignore_disbalance = true + weight_sum = weight_sum - replicaset.weight + end + end + end + assert(buckets_rest == 0) + if step_count > replicaset_count then + -- This can happed only because of a bug in this + -- algorithm. But it occupies 100% of transaction + -- thread, so check step count explicitly. + return error('PANIC: the rebalancer is broken') + end end - assert(buckets_rest == 0) end -- diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index fe8a40fb..99fbf7b3 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -1213,19 +1213,20 @@ local function rebalancer_download_states() local total_bucket_locked_count = 0 local total_bucket_active_count = 0 for uuid, replicaset in pairs(M.replicasets) do - local bucket_active_count = + local state = replicaset:callrw('vshard.storage.rebalancer_request_state', {}) - if bucket_active_count == nil then + if state == nil then return end + local bucket_count = state.bucket_active_count + + state.bucket_pinned_count if replicaset.lock then - total_bucket_locked_count = - total_bucket_locked_count + bucket_active_count + total_bucket_locked_count = total_bucket_locked_count + bucket_count else - total_bucket_active_count = - total_bucket_active_count + bucket_active_count - replicasets[uuid] = {bucket_count = bucket_active_count, - weight = replicaset.weight} + total_bucket_active_count = total_bucket_active_count + bucket_count + replicasets[uuid] = {bucket_count = bucket_count, + weight = replicaset.weight, + pinned_count = state.bucket_pinned_count} end end local sum = total_bucket_active_count + total_bucket_locked_count @@ -1325,7 +1326,10 @@ local function rebalancer_request_state() return end local bucket_count = _bucket:count() - return status_index:count({consts.BUCKET.ACTIVE}) + return { + bucket_active_count = status_index:count({consts.BUCKET.ACTIVE}), + bucket_pinned_count = status_index:count({consts.BUCKET.PINNED}), + } end --