Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 35 additions & 36 deletions ydb/apps/dstool/lib/dstool_cmd_cluster_workload_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,8 @@ def add_options(p):
p.add_argument('--enable-hard-switch-piles', action='store_true', help='Enable hard switch pile with setting PRIMARY')
p.add_argument('--enable-disconnect-piles', action='store_true', help='Enable disconnect pile')
p.add_argument('--fixed-pile-for-disconnect', type=int, help='Pile to disconnect')


def fetch_start_time_map(base_config):
    """Build a map from node id to that node's reported start time.

    One ``sysinfo`` request is issued per distinct ``NodeId`` found among
    ``base_config.PDisk``.

    Returns:
        A dict ``{node_id: int(StartTime)}``, or ``None`` as soon as a reply
        does not contain exactly one entry or its entry lacks ``StartTime``.
    """
    node_ids = {pdisk.NodeId for pdisk in base_config.PDisk}
    result = {}
    for node_id in node_ids:
        reply = common.fetch_json_info('sysinfo', [node_id])
        # Exactly one entry is expected for a single-node query.
        if len(reply) != 1:
            return None
        reply_node_id, info = reply.popitem()
        assert reply_node_id == node_id
        if 'StartTime' not in info:
            return None
        result[node_id] = int(info['StartTime'])
    return result
p.add_argument('--weight-restarts', type=float, default=1.0, help='weight for restart action')
p.add_argument('--weight-kill-tablets', type=float, default=1.0, help='weight for kill tablets action')


def make_pdisk_key_config(pdisk_keys, node_id):
Expand Down Expand Up @@ -97,8 +85,6 @@ def do(args):
base_config = common.fetch_base_config()
vslot_map = common.build_vslot_map(base_config)
node_fqdn_map = common.build_node_fqdn_map(base_config)
if args.enable_pdisk_encryption_keys_changes or not args.disable_restarts:
start_time_map = fetch_start_time_map(base_config)
except Exception:
if config_retries is None:
config_retries = 3
Expand All @@ -108,7 +94,21 @@ def do(args):
config_retries -= 1
continue

tablets = common.fetch_json_info('tabletinfo') if args.enable_kill_tablets or args.enable_kill_blob_depot else {}
if args.enable_kill_tablets or args.enable_kill_blob_depot:
tablets = {
int(tablet['TabletId']) : tablet
for tablet in common.fetch('viewer/json/tabletinfo', dict(enums=1)).get('TabletStateInfo', [])
}
else:
tablets = {}
sysinfo = {
int(node['NodeId']): node
for node in common.fetch('viewer/json/sysinfo', dict(fields_required=-1, enums=1), cache=False).get('SystemStateInfo', [])
}
start_time_map = {
int(node['NodeId']): int(node['StartTime'])
for node in sysinfo.values()
}

config_retries = None

Expand Down Expand Up @@ -187,11 +187,10 @@ def match(x):
return True

def do_restart(node_id):
host = node_fqdn_map[node_id]
node = sysinfo[node_id]
if args.enable_pdisk_encryption_keys_changes:
update_pdisk_key_config(node_fqdn_map, pdisk_keys, node_id)
subprocess.call(['ssh', host, 'sudo', 'killall', '-%s' % args.kill_signal, 'kikimr'])
subprocess.call(['ssh', host, 'sudo', 'killall', '-%s' % args.kill_signal, 'ydbd'])
subprocess.call(['ssh', node['Host'], 'sudo', 'kill', '-%s' % args.kill_signal, node['PID']])
if args.enable_pdisk_encryption_keys_changes:
remove_old_pdisk_keys(pdisk_keys, pdisk_key_versions, node_id)

Expand Down Expand Up @@ -316,9 +315,9 @@ def do_connect_pile(pile_id, pile_id_to_hosts):
possible_actions = []

if args.enable_kill_tablets:
possible_actions.append(('kill tablet', (do_kill_tablet,)))
possible_actions.append((args.weight_kill_tablets, 'kill tablet', (do_kill_tablet,)))
if args.enable_kill_blob_depot:
possible_actions.append(('kill blob depot', (do_kill_blob_depot,)))
possible_actions.append((1.0, 'kill blob depot', (do_kill_blob_depot,)))

evicts = []
wipes = []
Expand Down Expand Up @@ -361,19 +360,19 @@ def pick(v):
action[0](*action[1:])

if evicts:
possible_actions.append(('evict', (pick, evicts)))
possible_actions.append((1.0, 'evict', (pick, evicts)))
if wipes:
possible_actions.append(('wipe', (pick, wipes)))
possible_actions.append((1.0, 'wipe', (pick, wipes)))
if readonlies:
possible_actions.append(('readonly', (pick, readonlies)))
possible_actions.append((1.0, 'readonly', (pick, readonlies)))
if unreadonlies:
possible_actions.append(('un-readonly', (pick, unreadonlies)))
possible_actions.append((1.0, 'un-readonly', (pick, unreadonlies)))
if pdisk_restarts:
possible_actions.append(('restart-pdisk', (pick, pdisk_restarts)))
possible_actions.append((1.0, 'restart-pdisk', (pick, pdisk_restarts)))
if make_pdisks_readonly:
possible_actions.append(('make-pdisks-readonly', (pick, make_pdisks_readonly)))
possible_actions.append((1.0, 'make-pdisks-readonly', (pick, make_pdisks_readonly)))
if make_pdisks_not_readonly:
possible_actions.append(('make-pdisks-not-readonly', (pick, make_pdisks_not_readonly)))
possible_actions.append((1.0, 'make-pdisks-not-readonly', (pick, make_pdisks_not_readonly)))

restarts = []

Expand All @@ -385,12 +384,12 @@ def pick(v):
nodes_to_restart = nodes_to_restart[:node_count//2]
for node_id in nodes_to_restart:
if args.enable_pdisk_encryption_keys_changes:
possible_actions.append(('add new pdisk key to node with id: %d' % node_id, (do_add_pdisk_key, node_id)))
possible_actions.append((1.0, 'add new pdisk key to node with id: %d' % node_id, (do_add_pdisk_key, node_id)))
if not args.disable_restarts:
restarts.append(('restart node with id: %d' % node_id, (do_restart, node_id)))

if restarts:
possible_actions.append(('restart', (pick, restarts)))
possible_actions.append((args.weight_restarts, 'restart', (pick, restarts)))

has_pile_operations = args.enable_soft_switch_piles or args.enable_hard_switch_piles or args.enable_disconnect_piles
if has_pile_operations:
Expand Down Expand Up @@ -418,14 +417,14 @@ def pick(v):
can_hard_switch = (len(synchronized_piles) + len(promoted_piles) > 0)

if args.enable_soft_switch_piles and can_soft_switch:
possible_actions.append(('soft-switch-pile', (do_soft_switch_pile, random.choice(synchronized_piles))))
possible_actions.append((1.0, 'soft-switch-pile', (do_soft_switch_pile, random.choice(synchronized_piles))))
if args.enable_hard_switch_piles and can_hard_switch:
possible_actions.append(('hard-switch-pile', (do_hard_switch_pile, random.choice(promoted_piles + synchronized_piles), [primary_pile] + promoted_piles + synchronized_piles)))
possible_actions.append((1.0, 'hard-switch-pile', (do_hard_switch_pile, random.choice(promoted_piles + synchronized_piles), [primary_pile] + promoted_piles + synchronized_piles)))
if len(disconnected_piles) > 0:
possible_actions.append(('connect-pile', (do_connect_pile, random.choice(disconnected_piles), pile_id_to_endpoints)))
possible_actions.append((1.0, 'connect-pile', (do_connect_pile, random.choice(disconnected_piles), pile_id_to_endpoints)))
if args.enable_disconnect_piles and len(synchronized_piles) > 0:
pile_to_disconnect = args.fixed_pile_for_disconnect if args.fixed_pile_for_disconnect is not None else random.choice([primary_pile] + synchronized_piles)
possible_actions.append(('disconnect-pile', (do_disconnect_pile, pile_to_disconnect, pile_id_to_endpoints)))
possible_actions.append((1.0, 'disconnect-pile', (do_disconnect_pile, pile_to_disconnect, pile_id_to_endpoints)))

if not possible_actions:
common.print_if_not_quiet(args, 'Waiting for the next round...', file=sys.stdout)
Expand All @@ -434,7 +433,7 @@ def pick(v):

################################################################################################################

action_name, action = random.choice(possible_actions)
(_, action_name, action), = random.choices(possible_actions, weights=[w for w, _, _ in possible_actions])
print('%s %s' % (action_name, datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S')))

try:
Expand Down
1 change: 1 addition & 0 deletions ydb/core/protos/node_whiteboard.proto
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ message TSystemStateInfo {
optional uint64 NetworkWriteThroughput = 42;
optional uint32 RealNumberOfCpus = 43; // number of cpus without cgroups limitations
repeated TSystemThreadInfo Threads = 44;
optional uint64 PID = 45;
}

message TEvSystemStateRequest {
Expand Down
2 changes: 2 additions & 0 deletions ydb/core/tablet/node_whiteboard.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <ydb/core/util/tuples.h>

#include <util/string/split.h>
#include <util/system/getpid.h>
#include <contrib/libs/protobuf/src/google/protobuf/util/message_differencer.h>

using namespace NActors;
Expand Down Expand Up @@ -65,6 +66,7 @@ class TNodeWhiteboardService : public TActorBootstrapped<TNodeWhiteboardService>
}

SystemStateInfo.SetStartTime(ctx.Now().MilliSeconds());
SystemStateInfo.SetPID(GetPID());
ctx.Send(ctx.SelfID, new TEvPrivate::TEvUpdateRuntimeStats());

auto utils = NKikimr::GetServiceCounters(NKikimr::AppData()->Counters, "utils");
Expand Down
Loading