diff --git a/ydb/apps/dstool/lib/dstool_cmd_cluster_workload_run.py b/ydb/apps/dstool/lib/dstool_cmd_cluster_workload_run.py index c453988726d1..67c5bf6c69b0 100644 --- a/ydb/apps/dstool/lib/dstool_cmd_cluster_workload_run.py +++ b/ydb/apps/dstool/lib/dstool_cmd_cluster_workload_run.py @@ -28,20 +28,8 @@ def add_options(p): p.add_argument('--enable-hard-switch-piles', action='store_true', help='Enable hard switch pile with setting PRIMARY') p.add_argument('--enable-disconnect-piles', action='store_true', help='Enable disconnect pile') p.add_argument('--fixed-pile-for-disconnect', type=int, help='Pile to disconnect') - - -def fetch_start_time_map(base_config): - start_time_map = {} - for node_id in {pdisk.NodeId for pdisk in base_config.PDisk}: - r = common.fetch_json_info('sysinfo', [node_id]) - if len(r) != 1: - return None - k, v = r.popitem() - assert k == node_id - if 'StartTime' not in v: - return None - start_time_map[node_id] = int(v['StartTime']) - return start_time_map + p.add_argument('--weight-restarts', type=float, default=1.0, help='weight for restart action') + p.add_argument('--weight-kill-tablets', type=float, default=1.0, help='weight for kill tablets action') def make_pdisk_key_config(pdisk_keys, node_id): @@ -97,8 +85,6 @@ def do(args): base_config = common.fetch_base_config() vslot_map = common.build_vslot_map(base_config) node_fqdn_map = common.build_node_fqdn_map(base_config) - if args.enable_pdisk_encryption_keys_changes or not args.disable_restarts: - start_time_map = fetch_start_time_map(base_config) except Exception: if config_retries is None: config_retries = 3 @@ -108,7 +94,21 @@ def do(args): config_retries -= 1 continue - tablets = common.fetch_json_info('tabletinfo') if args.enable_kill_tablets or args.enable_kill_blob_depot else {} + if args.enable_kill_tablets or args.enable_kill_blob_depot: + tablets = { + int(tablet['TabletId']) : tablet + for tablet in common.fetch('viewer/json/tabletinfo', dict(enums=1)).get('TabletStateInfo', []) + } + else: + tablets = {} + sysinfo = { + int(node['NodeId']): node + for node in common.fetch('viewer/json/sysinfo', dict(fields_required=-1, enums=1), cache=False).get('SystemStateInfo', []) + } + start_time_map = { + int(node['NodeId']): int(node['StartTime']) + for node in sysinfo.values() + } config_retries = None @@ -187,11 +187,10 @@ def match(x): return True def do_restart(node_id): - host = node_fqdn_map[node_id] + node = sysinfo[node_id] if args.enable_pdisk_encryption_keys_changes: update_pdisk_key_config(node_fqdn_map, pdisk_keys, node_id) - subprocess.call(['ssh', host, 'sudo', 'killall', '-%s' % args.kill_signal, 'kikimr']) - subprocess.call(['ssh', host, 'sudo', 'killall', '-%s' % args.kill_signal, 'ydbd']) + subprocess.call(['ssh', node['Host'], 'sudo', 'kill', '-%s' % args.kill_signal, node['PID']]) if args.enable_pdisk_encryption_keys_changes: remove_old_pdisk_keys(pdisk_keys, pdisk_key_versions, node_id) @@ -316,9 +315,9 @@ def do_connect_pile(pile_id, pile_id_to_hosts): possible_actions = [] if args.enable_kill_tablets: - possible_actions.append(('kill tablet', (do_kill_tablet,))) + possible_actions.append((args.weight_kill_tablets, 'kill tablet', (do_kill_tablet,))) if args.enable_kill_blob_depot: - possible_actions.append(('kill blob depot', (do_kill_blob_depot,))) + possible_actions.append((1.0, 'kill blob depot', (do_kill_blob_depot,))) evicts = [] wipes = [] @@ -361,19 +360,19 @@ def pick(v): action[0](*action[1:]) if evicts: - possible_actions.append(('evict', (pick, evicts))) + possible_actions.append((1.0, 'evict', (pick, evicts))) if wipes: - possible_actions.append(('wipe', (pick, wipes))) + possible_actions.append((1.0, 'wipe', (pick, wipes))) if readonlies: - possible_actions.append(('readonly', (pick, readonlies))) + possible_actions.append((1.0, 'readonly', (pick, readonlies))) if unreadonlies: - possible_actions.append(('un-readonly', (pick, unreadonlies))) + possible_actions.append((1.0, 'un-readonly', (pick, unreadonlies))) if pdisk_restarts: - possible_actions.append(('restart-pdisk', (pick, pdisk_restarts))) + possible_actions.append((1.0, 'restart-pdisk', (pick, pdisk_restarts))) if make_pdisks_readonly: - possible_actions.append(('make-pdisks-readonly', (pick, make_pdisks_readonly))) + possible_actions.append((1.0, 'make-pdisks-readonly', (pick, make_pdisks_readonly))) if make_pdisks_not_readonly: - possible_actions.append(('make-pdisks-not-readonly', (pick, make_pdisks_not_readonly))) + possible_actions.append((1.0, 'make-pdisks-not-readonly', (pick, make_pdisks_not_readonly))) restarts = [] @@ -385,12 +384,12 @@ def pick(v): nodes_to_restart = nodes_to_restart[:node_count//2] for node_id in nodes_to_restart: if args.enable_pdisk_encryption_keys_changes: - possible_actions.append(('add new pdisk key to node with id: %d' % node_id, (do_add_pdisk_key, node_id))) + possible_actions.append((1.0, 'add new pdisk key to node with id: %d' % node_id, (do_add_pdisk_key, node_id))) if not args.disable_restarts: restarts.append(('restart node with id: %d' % node_id, (do_restart, node_id))) if restarts: - possible_actions.append(('restart', (pick, restarts))) + possible_actions.append((args.weight_restarts, 'restart', (pick, restarts))) has_pile_operations = args.enable_soft_switch_piles or args.enable_hard_switch_piles or args.enable_disconnect_piles if has_pile_operations: @@ -418,14 +417,14 @@ def pick(v): can_hard_switch = (len(synchronized_piles) + len(promoted_piles) > 0) if args.enable_soft_switch_piles and can_soft_switch: - possible_actions.append(('soft-switch-pile', (do_soft_switch_pile, random.choice(synchronized_piles)))) + possible_actions.append((1.0, 'soft-switch-pile', (do_soft_switch_pile, random.choice(synchronized_piles)))) if args.enable_hard_switch_piles and can_hard_switch: - possible_actions.append(('hard-switch-pile', (do_hard_switch_pile, random.choice(promoted_piles + synchronized_piles), [primary_pile] + promoted_piles + synchronized_piles))) + possible_actions.append((1.0, 'hard-switch-pile', (do_hard_switch_pile, random.choice(promoted_piles + synchronized_piles), [primary_pile] + promoted_piles + synchronized_piles))) if len(disconnected_piles) > 0: - possible_actions.append(('connect-pile', (do_connect_pile, random.choice(disconnected_piles), pile_id_to_endpoints))) + possible_actions.append((1.0, 'connect-pile', (do_connect_pile, random.choice(disconnected_piles), pile_id_to_endpoints))) if args.enable_disconnect_piles and len(synchronized_piles) > 0: pile_to_disconnect = args.fixed_pile_for_disconnect if args.fixed_pile_for_disconnect is not None else random.choice([primary_pile] + synchronized_piles) - possible_actions.append(('disconnect-pile', (do_disconnect_pile, pile_to_disconnect, pile_id_to_endpoints))) + possible_actions.append((1.0, 'disconnect-pile', (do_disconnect_pile, pile_to_disconnect, pile_id_to_endpoints))) if not possible_actions: common.print_if_not_quiet(args, 'Waiting for the next round...', file=sys.stdout) @@ -434,7 +433,7 @@ def pick(v): ################################################################################################################ - action_name, action = random.choice(possible_actions) + (_, action_name, action), = random.choices(possible_actions, weights=[w for w, _, _ in possible_actions]) print('%s %s' % (action_name, datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S'))) try: diff --git a/ydb/core/protos/node_whiteboard.proto b/ydb/core/protos/node_whiteboard.proto index bf1686978236..ca8c9dadb6bc 100644 --- a/ydb/core/protos/node_whiteboard.proto +++ b/ydb/core/protos/node_whiteboard.proto @@ -367,6 +367,7 @@ message TSystemStateInfo { optional uint64 NetworkWriteThroughput = 42; optional uint32 RealNumberOfCpus = 43; // number of cpus without cgroups limitations repeated TSystemThreadInfo Threads = 44; + optional uint64 PID = 45; } message TEvSystemStateRequest { diff --git a/ydb/core/tablet/node_whiteboard.cpp b/ydb/core/tablet/node_whiteboard.cpp index 16328eaa0581..65e88302d519 100644 --- a/ydb/core/tablet/node_whiteboard.cpp +++ b/ydb/core/tablet/node_whiteboard.cpp @@ -16,6 +16,7 @@ #include #include +#include #include using namespace NActors; @@ -65,6 +66,7 @@ class TNodeWhiteboardService : public TActorBootstrapped } SystemStateInfo.SetStartTime(ctx.Now().MilliSeconds()); + SystemStateInfo.SetPID(GetPID()); ctx.Send(ctx.SelfID, new TEvPrivate::TEvUpdateRuntimeStats()); auto utils = NKikimr::GetServiceCounters(NKikimr::AppData()->Counters, "utils");