diff --git a/ocaml/idl/datamodel_host.ml b/ocaml/idl/datamodel_host.ml
index 3d208bef45f..ef129ef2351 100644
--- a/ocaml/idl/datamodel_host.ml
+++ b/ocaml/idl/datamodel_host.ml
@@ -411,6 +411,17 @@ let host_query_ha = call ~flags:[`Session]
       ~allowed_roles:_R_POOL_OP
       ()
 
+  (* Host.prepare_for_poweroff *)
+
+  let prepare_for_poweroff = call
+      ~name:"prepare_for_poweroff"
+      ~in_product_since:rel_kolkata
+      ~doc:"Performs the necessary actions before host shutdown or reboot."
+      ~params:[Ref _host, "host", "The Host that is about to reboot or shutdown"]
+      ~allowed_roles:_R_LOCAL_ROOT_ONLY
+      ~hide_from_docs:true
+      ()
+
   (* Host.power_on *)
 
   let power_on = call
@@ -1267,6 +1278,7 @@ let host_query_ha = call ~flags:[`Session]
         enable;
         shutdown;
         reboot;
+        prepare_for_poweroff;
         dmesg;
         dmesg_clear;
         get_log;
diff --git a/ocaml/xapi-client/jbuild b/ocaml/xapi-client/jbuild
index 0596579a7f8..a806cd0d149 100644
--- a/ocaml/xapi-client/jbuild
+++ b/ocaml/xapi-client/jbuild
@@ -20,6 +20,8 @@ let () = Printf.ksprintf Jbuild_plugin.V1.send {|
 ((name xapi_client)
  (public_name xapi-client)
  (libraries (
+  mtime
+  mtime.clock.os
   xapi-types
   xapi-stdext-date
  ))
diff --git a/ocaml/xapi-client/tasks.ml b/ocaml/xapi-client/tasks.ml
index bf3336c0a09..53b8058f2b9 100644
--- a/ocaml/xapi-client/tasks.ml
+++ b/ocaml/xapi-client/tasks.ml
@@ -14,35 +14,69 @@
 
 open Client
 
+module D = Debug.Make(struct let name = "tasks" end)
+
 module TaskSet = Set.Make(struct type t = API.ref_task let compare = compare end)
 
 (* Return once none of the tasks have a `pending status. *)
-let wait_for_all ~rpc ~session_id ~tasks =
+let wait_for_all_inner ~rpc ~session_id ?all_timeout ~tasks =
   let classes = List.map (fun task -> Printf.sprintf "task/%s" (Ref.string_of task)) tasks in
+  let timeout_span = match all_timeout with
+    | Some t -> Some (t *. Mtime.s_to_ns |> Int64.of_float |> Mtime.Span.of_uint64_ns)
+    | None -> None in
+  let timer = Mtime_clock.counter () in
   let timeout = 5.0 in
   let rec wait ~token ~task_set =
-    if TaskSet.is_empty task_set then ()
-    else begin
-      let open Event_types in
-      let event_from_rpc = Client.Event.from ~rpc ~session_id ~classes ~token ~timeout in
-      let event_from = Event_types.event_from_of_rpc event_from_rpc in
-      let records = List.map Event_helper.record_of_event event_from.events in
-      (* If any records indicate that a task is no longer pending, remove that task from the set. *)
-      let pending_task_set = List.fold_left (fun task_set' record ->
-          match record with
-          | Event_helper.Task (t, Some t_rec) ->
-            if (TaskSet.mem t task_set') && (t_rec.API.task_status <> `pending) then
-              TaskSet.remove t task_set'
-            else
-              task_set'
-          | _ -> task_set') task_set records in
-      wait ~token:(event_from.Event_types.token) ~task_set:pending_task_set
-    end
+    if TaskSet.is_empty task_set then true
+    else match timeout_span with
+      | Some span when Mtime.Span.compare (Mtime_clock.count timer) span > 0 ->
+        let tasks = TaskSet.elements task_set in
+        let tasks_str = tasks |> List.map Ref.really_pretty_and_small |> String.concat "," in
+        D.info "Waiting for tasks timed out on %s" tasks_str;
+        false
+      | _ ->
+        let open Event_types in
+        let event_from_rpc = Client.Event.from ~rpc ~session_id ~classes ~token ~timeout in
+        let event_from = Event_types.event_from_of_rpc event_from_rpc in
+        let records = List.map Event_helper.record_of_event event_from.events in
+        (* If any records indicate that a task is no longer pending, remove that task from the set. *)
+        let pending_task_set = List.fold_left (fun task_set' record ->
+            match record with
+            | Event_helper.Task (t, Some t_rec) ->
+              if (TaskSet.mem t task_set') && (t_rec.API.task_status <> `pending) then
+                TaskSet.remove t task_set'
+              else
+                task_set'
+            | _ -> task_set') task_set records in
+        wait ~token:(event_from.Event_types.token) ~task_set:pending_task_set
   in
   let token = "" in
   let task_set = List.fold_left (fun task_set' task -> TaskSet.add task task_set') TaskSet.empty tasks in
   wait ~token ~task_set
 
+let wait_for_all ~rpc ~session_id ~tasks =
+  wait_for_all_inner ~rpc ~session_id ?all_timeout:None ~tasks |> ignore
+
+let with_tasks_destroy ~rpc ~session_id ~timeout ~tasks =
+  let wait_or_cancel () =
+    D.info "Waiting for %d tasks, timeout: %.3fs" (List.length tasks) timeout;
+    if not (wait_for_all_inner ~rpc ~session_id ~all_timeout:timeout ~tasks) then begin
+      D.info "Canceling tasks";
+      List.iter (fun task ->
+          if Client.Task.get_status ~rpc ~session_id ~self:task = `pending then
+            Client.Task.cancel ~rpc ~session_id ~task) tasks;
+      (* cancellation is not immediate: give it a reasonable chance to take effect *)
+      ignore (wait_for_all_inner ~rpc ~session_id ~all_timeout:60. ~tasks);
+      false
+    end else true
+  in
+
+  let destroy_all () =
+    List.iter (fun task ->
+        (* the db gc thread in xapi may already have removed the task from the tasks table *)
+        D.log_and_ignore_exn (fun () -> Client.Task.destroy ~rpc ~session_id ~self:task)) tasks
+  in
+  Xapi_stdext_pervasives.Pervasiveext.finally wait_or_cancel destroy_all
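
The timeout handling in wait_for_all_inner above converts a float of seconds into an Mtime.Span.t and compares it against a monotonic counter, so the deadline is immune to wall-clock adjustments (e.g. NTP steps). A minimal standalone sketch of the same pattern, assuming mtime >= 1.0 and the mtime.clock.os library added in the jbuild above; poll is a hypothetical callback:

    (* Run [poll] repeatedly until it returns [Some _] or until [timeout]
       seconds, measured on the monotonic clock, have elapsed. *)
    let with_deadline ~timeout poll =
      let span = timeout *. Mtime.s_to_ns |> Int64.of_float |> Mtime.Span.of_uint64_ns in
      let counter = Mtime_clock.counter () in
      let rec loop () =
        if Mtime.Span.compare (Mtime_clock.count counter) span > 0 then
          None (* deadline exceeded *)
        else
          match poll () with
          | Some _ as r -> r
          | None -> loop ()
      in
      loop ()
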
diff --git a/ocaml/xapi-client/tasks.mli b/ocaml/xapi-client/tasks.mli
index c878b58719b..5b99aebd833 100644
--- a/ocaml/xapi-client/tasks.mli
+++ b/ocaml/xapi-client/tasks.mli
@@ -15,3 +15,8 @@
 (** [wait_for_all ~rpc ~session_id ~tasks] returns when all of [tasks] are in some non-pending state. *)
 val wait_for_all : rpc:(Rpc.call -> Rpc.response) -> session_id:API.ref_session -> tasks:API.ref_task list -> unit
+
+(** [with_tasks_destroy ~rpc ~session_id ~timeout ~tasks] is like [wait_for_all], except that once
+    [timeout] has elapsed it cancels the tasks that are still pending and returns [false].
+    In all cases it destroys all of [tasks] before returning. *)
+val with_tasks_destroy : rpc:(Rpc.call -> Rpc.response) -> session_id:API.ref_session -> timeout:float -> tasks:API.ref_task list -> bool
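
A hypothetical caller of the new interface, shutting down a batch of VMs with a five-minute budget (the wrapper itself is illustrative and not part of this change; the Client.Async style mirrors vm_evacuation.ml below):

    (* Returns [true] iff every shutdown completed within the deadline. *)
    let shutdown_batch ~rpc ~session_id vms =
      let open Client in
      let tasks =
        List.map (fun vm -> Client.Async.VM.clean_shutdown ~rpc ~session_id ~vm) vms
      in
      (* Cancels any stragglers after 300s and destroys every task either way. *)
      Tasks.with_tasks_destroy ~rpc ~session_id ~timeout:300. ~tasks
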
diff --git a/ocaml/xapi/cli_frontend.ml b/ocaml/xapi/cli_frontend.ml
index 8ea48bdd6fc..99bda955b81 100644
--- a/ocaml/xapi/cli_frontend.ml
+++ b/ocaml/xapi/cli_frontend.ml
@@ -574,6 +574,15 @@ let rec cmdtable_data : (string*cmd_spec) list =
     flags=[Host_selectors];
   };
 
+  "host-prepare-for-poweroff",
+  {
+    reqd=[];
+    optn=[];
+    help="Perform the necessary actions before host shutdown or reboot.";
+    implementation=No_fd Cli_operations.host_prepare_for_poweroff;
+    flags=[Hidden];
+  };
+
   "host-dmesg",
   {
     reqd=[];
diff --git a/ocaml/xapi/cli_operations.ml b/ocaml/xapi/cli_operations.ml
index 3d5e9ce3c5b..94446b9c8f4 100644
--- a/ocaml/xapi/cli_operations.ml
+++ b/ocaml/xapi/cli_operations.ml
@@ -4131,6 +4131,11 @@ let host_reboot printer rpc session_id params =
 let host_power_on printer rpc session_id params =
   ignore(do_host_op rpc session_id (fun _ host -> Client.Host.power_on rpc session_id (host.getref ())) params [])
 
+let host_prepare_for_poweroff _printer rpc session_id params =
+  let uuid = List.assoc "uuid" params in
+  let host = Client.Host.get_by_uuid ~rpc ~session_id ~uuid in
+  Client.Host.prepare_for_poweroff ~rpc ~session_id ~host
+
 let host_dmesg printer rpc session_id params =
   let op _ host =
     let dmesg = Client.Host.dmesg rpc session_id (host.getref ()) in
diff --git a/ocaml/xapi/message_forwarding.ml b/ocaml/xapi/message_forwarding.ml
index ef9201a9aff..24eb8bcc8b8 100644
--- a/ocaml/xapi/message_forwarding.ml
+++ b/ocaml/xapi/message_forwarding.ml
@@ -2279,6 +2279,12 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct
         do_op_on ~local_fn ~__context ~host (fun session_id rpc -> Client.Host.reboot rpc session_id host)
       )
 
+    (* This is only called by systemd during shutdown, when xapi-domains.service is stopped *)
+    let prepare_for_poweroff ~__context ~host =
+      info "Host.prepare_for_poweroff: host = '%s'" (host_uuid ~__context host);
+      let local_fn = Local.Host.prepare_for_poweroff ~host in
+      do_op_on ~local_fn ~__context ~host (fun session_id rpc -> Client.Host.prepare_for_poweroff rpc session_id host)
+
     let power_on ~__context ~host =
       info "Host.power_on: host = '%s'" (host_uuid ~__context host);
       with_host_operation ~__context ~self:host ~doc:"Host.power_on" ~op:`power_on
diff --git a/ocaml/xapi/vm_evacuation.ml b/ocaml/xapi/vm_evacuation.ml
new file mode 100644
index 00000000000..6cd62498f95
--- /dev/null
+++ b/ocaml/xapi/vm_evacuation.ml
@@ -0,0 +1,113 @@
+module D = Debug.Make(struct let name = "xapi" end)
+open D
+
+let estimate_evacuate_timeout ~__context ~host =
+  let mref = Db.Host.get_metrics ~__context ~self:host in
+  let metrics = Db.Host_metrics.get_record ~__context ~self:mref in
+  let memory_used = Int64.sub metrics.API.host_metrics_memory_total metrics.API.host_metrics_memory_free in
+  (* Conservative estimate based on a 1000Mbps link; counting the memory used by
+     Dom0 (which is not going to be transferred) adds an intentional surplus *)
+  let t = (Int64.to_float memory_used) *. 8. /. (1000. *. 1024. *. 1024.) in
+  max 240. t
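
To make the arithmetic concrete: the estimate assumes roughly 1 Gbit/s of migration bandwidth, so a host with 32 GiB of memory in use gets 32 × 2^30 × 8 / (1000 × 1024 × 1024) ≈ 262 s, while anything under about 29.3 GiB hits the 240 s floor. A self-contained check of those illustrative numbers:

    let () =
      let gib = 1024. *. 1024. *. 1024. in
      let estimate bytes = max 240. (bytes *. 8. /. (1000. *. 1024. *. 1024.)) in
      assert (estimate (32. *. gib) = 262.144); (* bandwidth-bound *)
      assert (estimate (8. *. gib) = 240.)      (* floor applies *)
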
+
+(* Returns a pair of lists: the first contains the control domains, the second the regular VMs *)
+let get_resident_vms ~__context ~self =
+  let my_resident_vms = Db.Host.get_resident_VMs ~__context ~self in
+  List.partition (fun vm -> Db.VM.get_is_control_domain ~__context ~self:vm) my_resident_vms
+
+let ensure_no_vms ~__context ~rpc ~session_id ~evacuate_timeout =
+  let open Client in
+
+  let is_running vm =
+    Db.VM.get_power_state ~__context ~self:vm = `Running
+  in
+
+  let host = Helpers.get_localhost ~__context in
+  let self_managed_poweroff vm =
+    let result = Db.VM.get_other_config ~__context ~self:vm
+                 |> List.mem_assoc "auto_poweroff" in
+    if result then
+      debug "Skipping running VM %s: it has self-managed poweroff" (Db.VM.get_name_label ~__context ~self:vm);
+    result
+  in
+  let get_running_domains () =
+    get_resident_vms ~__context ~self:host |> snd
+    |> List.filter (fun vm -> is_running vm && not (self_managed_poweroff vm))
+  in
+
+  let cancel_vm_tasks self =
+    Db.VM.get_current_operations ~__context ~self
+    |> List.rev_map fst
+    |> List.rev_map Ref.of_string
+    |> List.iter (fun (task : [`task] Ref.t) ->
+        let name = Db.VM.get_name_label ~__context ~self in
+        debug "Canceling operation on VM %s" name;
+        log_and_ignore_exn (fun () -> Client.Task.cancel ~rpc ~session_id ~task))
+  in
+
+  let evacuate () =
+    TaskHelper.exn_if_cancelling ~__context; (* First check whether _we_ have been cancelled *)
+    info "Requesting evacuation of host";
+    let timeout = if evacuate_timeout > 0. then evacuate_timeout
+      else estimate_evacuate_timeout ~__context ~host in
+    let tasks = [ Client.Async.Host.evacuate ~rpc ~session_id ~host ] in
+    if not (Tasks.with_tasks_destroy ~rpc ~session_id ~timeout ~tasks) then begin
+      get_running_domains ()
+      |> List.iter cancel_vm_tasks
+    end
+  in
+
+  let clean_shutdown vms =
+    TaskHelper.exn_if_cancelling ~__context; (* First check whether _we_ have been cancelled *)
+    let tasks =
+      vms
+      |> List.filter (fun vm ->
+          List.mem `clean_shutdown (Client.VM.get_allowed_operations ~rpc ~session_id ~self:vm))
+      |> List.map (fun vm ->
+          let name_label = Client.VM.get_name_label ~rpc ~session_id ~self:vm in
+          debug "Requesting clean shutdown of VM: %s" name_label;
+          Client.Async.VM.clean_shutdown ~rpc ~session_id ~vm) in
+    Tasks.with_tasks_destroy ~rpc ~session_id ~timeout:60. ~tasks |> ignore
+  in
+
+  let hard_shutdown vms =
+    TaskHelper.exn_if_cancelling ~__context; (* First check whether _we_ have been cancelled *)
+    let tasks =
+      vms
+      |> List.map (fun vm ->
+          let name_label = Client.VM.get_name_label ~rpc ~session_id ~self:vm in
+          debug "Requesting hard shutdown of VM: %s" name_label;
+          Client.Async.VM.hard_shutdown ~rpc ~session_id ~vm) in
+    (* no timeout: we need the VMs to be off *)
+    Tasks.wait_for_all ~rpc ~session_id ~tasks;
+    vms
+    |> List.filter is_running
+    |> List.iter (fun vm ->
+        let name_label = Client.VM.get_name_label ~rpc ~session_id ~self:vm in
+        info "Failed to perform hard shutdown of VM: %s" name_label)
+  in
+
+  let shutdown vms =
+    log_and_ignore_exn (fun () -> clean_shutdown vms);
+    (* We can unplug the PBD if a VM is suspended or halted, but not if
+       it is running or paused, i.e. "live" *)
+    vms
+    |> List.filter (fun self -> Xapi_vm_lifecycle.is_live ~__context ~self)
+    |> hard_shutdown
+  in
+
+  log_and_ignore_exn (fun () ->
+      Client.Host.get_vms_which_prevent_evacuation ~rpc ~session_id ~self:host
+      |> Xapi_stdext_std.Listext.List.filter_map (fun (vm, _) ->
+          if self_managed_poweroff vm then None
+          else Some vm)
+      |> shutdown;
+
+      evacuate ());
+
+  log_and_ignore_exn (fun () -> get_running_domains () |> shutdown)
+
+let ensure_no_vms ~__context ~evacuate_timeout =
+  Helpers.call_api_functions ~__context (fun rpc session_id ->
+      ensure_no_vms ~__context ~rpc ~session_id ~evacuate_timeout)
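
The shutdown strategy above is a two-phase escalation: attempt a polite shutdown under a deadline, re-check which VMs are still live, and only then force the remainder. Stripped of the xapi specifics, the shape is the following (all names hypothetical):

    let escalate ~clean ~hard ~still_alive vms =
      (* Phase 1: best-effort polite pass; a failure here must not stop phase 2. *)
      (try clean vms
       with e -> Printf.eprintf "clean pass failed: %s\n" (Printexc.to_string e));
      (* Phase 2: re-check state, because phase 1 may have partially succeeded. *)
      match List.filter still_alive vms with
      | [] -> ()
      | survivors -> hard survivors
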
"live" *) + vms + |> List.filter (fun self -> Xapi_vm_lifecycle.is_live ~__context ~self) + |> hard_shutdown + in + + log_and_ignore_exn (fun () -> + Client.Host.get_vms_which_prevent_evacuation ~rpc ~session_id ~self:host + |> Xapi_stdext_std.Listext.List.filter_map (fun (vm, _) -> + if self_managed_poweroff vm then None + else Some vm) + |> shutdown; + + evacuate ()); + + log_and_ignore_exn (fun () -> get_running_domains () |> shutdown) + +let ensure_no_vms ~__context ~evacuate_timeout = + Helpers.call_api_functions ~__context (fun rpc session_id -> + ensure_no_vms ~__context ~rpc ~session_id ~evacuate_timeout) + diff --git a/ocaml/xapi/xapi_ha.ml b/ocaml/xapi/xapi_ha.ml index c12d2f788e4..8755b782dda 100644 --- a/ocaml/xapi/xapi_ha.ml +++ b/ocaml/xapi/xapi_ha.ml @@ -1511,6 +1511,21 @@ let enable __context heartbeat_srs configuration = notice the invalid state and disable its HA *) raise exn +let assert_have_statefile_access ~__context ~host = + let pool = Helpers.get_pool ~__context in + if Db.Pool.get_ha_enabled ~__context ~self:pool then begin + let liveset = query_liveset () in + let me = + Hashtbl.find + liveset.Xha_interface.LiveSetInformation.hosts + liveset.Xha_interface.LiveSetInformation.local_host_id + in + if (not me.Xha_interface.LiveSetInformation.Host.state_file_access) || + me.Xha_interface.LiveSetInformation.Host.state_file_corrupted + then raise (Api_errors.Server_error(Api_errors.ha_lost_statefile, [])) + end + +let before_clean_shutdown_or_reboot_precheck = assert_have_statefile_access let before_clean_shutdown_or_reboot ~__context ~host = let pool = Helpers.get_pool ~__context in @@ -1527,13 +1542,7 @@ let before_clean_shutdown_or_reboot ~__context ~host = then we lose it and ha_set_excluded fails, manually fence ourselves. *) (* Safe early abort if we don't have statefile access *) - let liveset = query_liveset () in - let me = Hashtbl.find liveset.Xha_interface.LiveSetInformation.hosts - liveset.Xha_interface.LiveSetInformation.local_host_id in - if false - || not(me.Xha_interface.LiveSetInformation.Host.state_file_access) - || me.Xha_interface.LiveSetInformation.Host.state_file_corrupted - then raise (Api_errors.Server_error(Api_errors.ha_lost_statefile, [])); + assert_have_statefile_access ~__context ~host; (* From this point we will fence ourselves if any unexpected error occurs *) begin try @@ -1567,5 +1576,19 @@ let before_clean_shutdown_or_reboot ~__context ~host = info "Still waiting to reboot after %.2f seconds" (Unix.gettimeofday () -. start) done end; - List.iter Static_vdis.detach_only (Static_vdis.list()) + + (* We must do this before attempting to detach the VDI holding the redo log, + otherwise we would either get an error later or hang. + + Note that Xha_metadata_vdi is a VDI with reason = ha_metadata_vdi_reason and type=`redo_log: + type=`metadata is for DR *) + debug "About to close active redo logs"; + Redo_log.with_active_redo_logs (Redo_log.shutdown); + + (* We cannot call ha_release_resources because we want to keep HA armed on reboot *) + debug "About to detach static VDIs"; + + List.iter (Static_vdis.detach_only) (Static_vdis.list ()); + + debug "Detached static VDIs" end diff --git a/ocaml/xapi/xapi_ha.mli b/ocaml/xapi/xapi_ha.mli index 32d0a579c42..d54a4e584cc 100644 --- a/ocaml/xapi/xapi_ha.mli +++ b/ocaml/xapi/xapi_ha.mli @@ -114,3 +114,5 @@ val before_clean_shutdown_or_reboot : __context:Context.t -> host:'a -> unit (** Called before shutting down or rebooting a host (called by the host.shutdown, host.reboot API functions). 
diff --git a/ocaml/xapi/xapi_host.ml b/ocaml/xapi/xapi_host.ml
index 2ae1d3ca42b..597eee3d15f 100644
--- a/ocaml/xapi/xapi_host.ml
+++ b/ocaml/xapi/xapi_host.ml
@@ -497,12 +497,29 @@ let enable ~__context ~host =
   then Helpers.call_api_functions ~__context (fun rpc session_id -> Client.Client.Pool.ha_schedule_plan_recomputation rpc session_id)
 end
 
-let shutdown_and_reboot_common ~__context ~host label description operation cmd =
-  if Db.Host.get_enabled ~__context ~self:host
-  then raise (Api_errors.Server_error (Api_errors.host_not_disabled, []));
+let prepare_for_poweroff_precheck ~__context ~host =
+  Xapi_host_helpers.assert_host_disabled ~__context ~host
+
+let prepare_for_poweroff ~__context ~host =
+  (* Do not run assert_host_disabled here; continue even if the host is
+     enabled: the host is already shutting down when this function gets called *)
+
+  let i_am_master = Pool_role.is_master () in
+  if i_am_master then
+    (* We are the master and we are about to shut down HA and the redo log:
+       prevent slaves from sending (DB) requests.
+       If we are a slave, we cannot stop the request thread yet
+       because we might need it when unplugging the PBDs
+    *)
+    Remote_requests.stop_request_thread();
+
+  Vm_evacuation.ensure_no_vms ~__context ~evacuate_timeout:0.;
 
   Xapi_ha.before_clean_shutdown_or_reboot ~__context ~host;
-  Remote_requests.stop_request_thread();
+  Xapi_pbd.unplug_all_pbds ~__context;
+
+  if not i_am_master then
+    Remote_requests.stop_request_thread();
 
   (* Push the Host RRD to the master. Note there are no VMs running here so we don't have to worry about them. *)
   if not(Pool_role.is_master ())
@@ -513,7 +530,22 @@
   (* This prevents anyone actually re-enabling us until after reboot *)
   Localdb.put Constants.host_disabled_until_reboot "true";
   (* This helps us distinguish between an HA fence and a reboot *)
-  Localdb.put Constants.host_restarted_cleanly "true";
+  Localdb.put Constants.host_restarted_cleanly "true"
+
+let shutdown_and_reboot_common ~__context ~host label description operation cmd =
+  (* The actual shutdown actions are performed asynchronously, in a call to
+     prepare_for_poweroff, so the API user will not be notified of any errors
+     that happen during that operation.
+     Therefore we make an additional call here to the prechecks of every
+     operation that gets called from prepare_for_poweroff, either directly or
+     indirectly, to fail early and ensure that a suitable error is returned to
+     the XenAPI user. *)
+  let shutdown_precheck () =
+    prepare_for_poweroff_precheck ~__context ~host;
+    Xapi_ha.before_clean_shutdown_or_reboot_precheck ~__context ~host
+  in
+  shutdown_precheck ();
+  (* This tells the master that the shutdown is still ongoing: it can be used
+     to continue masking other operations even after this call returns. *)
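
The precheck/perform split used here generalises: each step of the asynchronous shutdown exposes a cheap, side-effect-free check so that the synchronous API call can reject the request up front. Reduced to its essence (types and names are hypothetical):

    type 'ctx step = {
      precheck : 'ctx -> unit; (* raises an API error; must not change state *)
      perform : 'ctx -> unit;  (* the real work, run later, asynchronously *)
    }

    let run_steps steps ctx =
      List.iter (fun s -> s.precheck ctx) steps; (* fail early, before any work *)
      List.iter (fun s -> s.perform ctx) steps
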
diff --git a/ocaml/xapi/xapi_host.mli b/ocaml/xapi/xapi_host.mli
index 309ebca18cf..8b2db986cc0 100644
--- a/ocaml/xapi/xapi_host.mli
+++ b/ocaml/xapi/xapi_host.mli
@@ -54,6 +54,7 @@ val restart_agent : __context:'a -> host:'b -> unit
 val shutdown_agent : __context:'a -> unit
 val disable : __context:Context.t -> host:[ `host ] Ref.t -> unit
 val enable : __context:Context.t -> host:[ `host ] Ref.t -> unit
+val prepare_for_poweroff : __context:Context.t -> host:[ `host ] Ref.t -> unit
 val shutdown : __context:Context.t -> host:[ `host ] Ref.t -> unit
 val reboot : __context:Context.t -> host:[ `host ] Ref.t -> unit
 val power_on : __context:Context.t -> host:[ `host ] Ref.t -> unit
diff --git a/ocaml/xapi/xapi_host_helpers.ml b/ocaml/xapi/xapi_host_helpers.ml
index 362cf1c7161..2f16e48bcc8 100644
--- a/ocaml/xapi/xapi_host_helpers.ml
+++ b/ocaml/xapi/xapi_host_helpers.ml
@@ -185,6 +185,10 @@ let mark_host_as_dead ~__context ~host ~reason =
       Xapi_hooks.host_post_declare_dead ~__context ~host ~reason
   )
 
+let assert_host_disabled ~__context ~host =
+  if Db.Host.get_enabled ~__context ~self:host
+  then raise (Api_errors.Server_error (Api_errors.host_not_disabled, []))
+
 (* Toggled by an explicit Host.disable call to prevent a master restart making us bounce back *)
 let user_requested_host_disable = ref false
diff --git a/ocaml/xapi/xapi_host_helpers.mli b/ocaml/xapi/xapi_host_helpers.mli
index 98aa6fa1d07..02f259cb1ee 100644
--- a/ocaml/xapi/xapi_host_helpers.mli
+++ b/ocaml/xapi/xapi_host_helpers.mli
@@ -30,6 +30,13 @@ val assert_operation_valid :
     {- Shutdown and Reboot are only allowed if the host is disabled}
     }*)
 
+val assert_host_disabled :
+  __context:Context.t ->
+  host:API.ref_host ->
+  unit
+(** [assert_host_disabled ~__context ~host] raises an API error
+    host_not_disabled if the host is not disabled. *)
+
 val update_allowed_operations :
   __context:Context.t ->
   self:API.ref_host ->
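
With the assertion factored out of shutdown_and_reboot_common, any disruptive host operation can reuse it as a one-line guard; for example (the function below is hypothetical):

    let apply_firmware_update ~__context ~host =
      (* Refuse to proceed unless the operator has disabled the host first. *)
      Xapi_host_helpers.assert_host_disabled ~__context ~host;
      (* ... the disruptive work would go here ... *)
      ()
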
diff --git a/ocaml/xapi/xapi_pbd.ml b/ocaml/xapi/xapi_pbd.ml
index e056c1384bf..a75dc942a1c 100644
--- a/ocaml/xapi/xapi_pbd.ml
+++ b/ocaml/xapi/xapi_pbd.ml
@@ -153,7 +153,7 @@ let unplug ~__context ~self =
       if Db.Pool.get_ha_enabled ~__context ~self:pool then begin
         let statefiles = Db.Pool.get_ha_statefiles ~__context ~self:pool in
         let statefile_srs = List.map (fun self -> Db.VDI.get_SR ~__context ~self:(Ref.of_string self)) statefiles in
-        if List.mem sr statefile_srs
+        if List.mem sr statefile_srs && not (Xha_scripts.can_unplug_statefile_pbd ())
         then raise (Api_errors.Server_error(Api_errors.ha_is_enabled, []))
       end;
 
@@ -204,3 +204,23 @@ let set_device_config ~__context ~self ~value =
   (* Only allowed from the SM plugin *)
   assert_no_srmaster_key value;
   Db.PBD.set_device_config ~__context ~self ~value
+
+let get_locally_attached ~__context =
+  let host = Helpers.get_localhost ~__context in
+  Db.PBD.get_refs_where ~__context
+    ~expr:(Db_filter_types.(
+        And(
+          Eq (Field "host", Literal (Ref.string_of host)),
+          Eq (Field "currently_attached", Literal "true"))))
+
+let unplug_all_pbds ~__context =
+  info "Unplugging all SRs plugged on local host";
+  (* best-effort unplug of all PBDs *)
+  get_locally_attached ~__context
+  |> List.iter (fun pbd ->
+      log_and_ignore_exn (fun () ->
+          TaskHelper.exn_if_cancelling ~__context;
+          let uuid = Db.PBD.get_uuid ~__context ~self:pbd in
+          debug "Unplugging PBD %s" uuid;
+          unplug ~__context ~self:pbd));
+  debug "Finished unplug_all_pbds"
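
get_locally_attached pushes the filtering into the database layer instead of listing every PBD and filtering in OCaml. The same Db_filter_types expression style works for other tables; for instance, a sketch of the analogous query for locally resident, running VMs (untested; field names as used elsewhere in xapi):

    let local_running_vms ~__context =
      let host = Helpers.get_localhost ~__context in
      Db.VM.get_refs_where ~__context
        ~expr:Db_filter_types.(
            And(
              Eq (Field "resident_on", Literal (Ref.string_of host)),
              Eq (Field "power_state", Literal "Running")))
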
diff --git a/ocaml/xapi/xha_scripts.ml b/ocaml/xapi/xha_scripts.ml
index 9216a62bf78..c240cd44693 100644
--- a/ocaml/xapi/xha_scripts.ml
+++ b/ocaml/xapi/xha_scripts.ml
@@ -58,3 +58,20 @@ let call_script ?log_successful_output script args =
     warn "%s %s returned %s (%s)" script' (String.concat " " args) (Xha_errno.to_string code) (Xha_errno.to_description_string code);
     raise (Xha_error code)
+
+(** Internal helper used during host shutdown: determines whether it is safe
+    to unplug the PBD holding the statefile. *)
+let can_unplug_statefile_pbd () =
+  (* During shutdown we perform a soft emergency HA disable, so HA will still appear
+     to be armed in the localdb and we cannot use that to decide whether unplugging is safe.
+     However, during shutdown we also stop the HA daemon, so querying the liveset should fail with "daemon is not present". *)
+  match call_script ~log_successful_output:false ha_query_liveset [] with
+  | exception Xha_error Xha_errno.Mtc_exit_daemon_is_not_present ->
+    info "HA daemon not running: safe to unplug statefile PBD";
+    true
+  | exception e ->
+    info "Caught exception querying liveset; assuming it is not safe to unplug: %s" (ExnHelper.string_of_exn e);
+    false
+  | _ ->
+    info "HA daemon still running or in unknown state: assuming it is not safe to unplug";
+    false
diff --git a/scripts/xapi-domains.service b/scripts/xapi-domains.service
index d89811b5bf7..e9ec14f8321 100644
--- a/scripts/xapi-domains.service
+++ b/scripts/xapi-domains.service
@@ -8,7 +8,7 @@
 Type=oneshot
 RemainAfterExit=yes
 EnvironmentFile=@INVENTORY@
 ExecStart=@BINDIR@/xapi-autostart-vms
-ExecStop=/bin/sh -c "/opt/xensource/libexec/shutdown $INSTALLATION_UUID || /opt/xensource/libexec/shutdown --force $INSTALLATION_UUID"
+ExecStop=/bin/sh -c "/opt/xensource/bin/xe host-prepare-for-poweroff uuid=$INSTALLATION_UUID"
 ExecStop=/opt/xensource/bin/xe host-emergency-ha-disable force=true soft=true
 
 # Generous 24hr timeout corresponding to the max evacuation time of a host
diff --git a/xapi-client.opam b/xapi-client.opam
index 670989afd2e..b012ee5c96a 100644
--- a/xapi-client.opam
+++ b/xapi-client.opam
@@ -9,6 +9,7 @@ build: [[
   "jbuilder" "build" "-p" name
 ]]
 depends: [
   "jbuilder" {build & >= "1.0+beta11"}
+  "mtime"
   "xapi-datamodel"
   "xapi-types"
   "xapi-stdext-date"
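
can_unplug_statefile_pbd relies on OCaml's match ... with exception form (available since 4.02) to branch on the one expected failure, any unexpected failure, and success within a single match, avoiding a nested try ... with. A minimal illustration of the construct:

    let classify f =
      match f () with
      | exception Not_found -> `Expected_failure (* the error we rely on *)
      | exception e -> `Unexpected (Printexc.to_string e)
      | result -> `Ok result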