Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions ocaml/idl/datamodel_host.ml
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,17 @@ let host_query_ha = call ~flags:[`Session]
~allowed_roles:_R_POOL_OP
()

(* Host.prepare_for_poweroff *)

let prepare_for_poweroff = call
~name:"prepare_for_poweroff"
~in_product_since:rel_kolkata
~doc:"Performs the necessary actions before host shutdown or reboot."
~params:[Ref _host, "host", "The Host that is about to reboot or shutdown"]
~allowed_roles:_R_LOCAL_ROOT_ONLY
~hide_from_docs:true
()

(* Host.power_on *)

let power_on = call
Expand Down Expand Up @@ -1267,6 +1278,7 @@ let host_query_ha = call ~flags:[`Session]
enable;
shutdown;
reboot;
prepare_for_poweroff;
dmesg;
dmesg_clear;
get_log;
Expand Down
2 changes: 2 additions & 0 deletions ocaml/xapi-client/jbuild
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ let () = Printf.ksprintf Jbuild_plugin.V1.send {|
((name xapi_client)
(public_name xapi-client)
(libraries (
mtime
mtime.clock.os
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you need to update the opam file as well?

xapi-types
xapi-stdext-date
))
Expand Down
70 changes: 52 additions & 18 deletions ocaml/xapi-client/tasks.ml
Original file line number Diff line number Diff line change
Expand Up @@ -14,35 +14,69 @@

open Client

module D = Debug.Make(struct let name = "tasks" end)

module TaskSet = Set.Make(struct type t = API.ref_task let compare = compare end)

(* Return once none of the tasks have a `pending status. *)
let wait_for_all ~rpc ~session_id ~tasks =
let wait_for_all_inner ~rpc ~session_id ?all_timeout ~tasks =
let classes = List.map
(fun task -> Printf.sprintf "task/%s" (Ref.string_of task))
tasks
in
let timeout_span = match all_timeout with
| Some t -> Some (t *. Mtime.s_to_ns |> Int64.of_float |> Mtime.Span.of_uint64_ns)
| None -> None in
let timer = Mtime_clock.counter () in
let timeout = 5.0 in
let rec wait ~token ~task_set =
if TaskSet.is_empty task_set then ()
else begin
let open Event_types in
let event_from_rpc = Client.Event.from ~rpc ~session_id ~classes ~token ~timeout in
let event_from = Event_types.event_from_of_rpc event_from_rpc in
let records = List.map Event_helper.record_of_event event_from.events in
(* If any records indicate that a task is no longer pending, remove that task from the set. *)
let pending_task_set = List.fold_left (fun task_set' record ->
match record with
| Event_helper.Task (t, Some t_rec) ->
if (TaskSet.mem t task_set') && (t_rec.API.task_status <> `pending) then
TaskSet.remove t task_set'
else
task_set'
| _ -> task_set') task_set records in
wait ~token:(event_from.Event_types.token) ~task_set:pending_task_set
end
if TaskSet.is_empty task_set then true
else match timeout_span with
| Some span when Mtime.Span.compare (Mtime_clock.count timer) span > 0 ->
let tasks = TaskSet.elements task_set in
let tasks_str = tasks |> List.map Ref.really_pretty_and_small |> String.concat "," in
D.info "Waiting for tasks timed out on %s" tasks_str;
false
| _ ->
let open Event_types in
let event_from_rpc = Client.Event.from ~rpc ~session_id ~classes ~token ~timeout in
let event_from = Event_types.event_from_of_rpc event_from_rpc in
let records = List.map Event_helper.record_of_event event_from.events in
(* If any records indicate that a task is no longer pending, remove that task from the set. *)
let pending_task_set = List.fold_left (fun task_set' record ->
match record with
| Event_helper.Task (t, Some t_rec) ->
if (TaskSet.mem t task_set') && (t_rec.API.task_status <> `pending) then
TaskSet.remove t task_set'
else
task_set'
| _ -> task_set') task_set records in
wait ~token:(event_from.Event_types.token) ~task_set:pending_task_set
in
let token = "" in
let task_set = List.fold_left (fun task_set' task -> TaskSet.add task task_set') TaskSet.empty tasks in
wait ~token ~task_set

let wait_for_all ~rpc ~session_id ~tasks =
wait_for_all_inner ~rpc ~session_id ?all_timeout:None ~tasks |> ignore

let with_tasks_destroy ~rpc ~session_id ~timeout ~tasks =
let wait_or_cancel () =
D.info "Waiting for %d tasks, timeout: %.3fs" (List.length tasks) timeout;
if not (wait_for_all_inner ~rpc ~session_id ~all_timeout:timeout ~tasks) then begin
D.info "Canceling tasks";
List.iter (fun task ->
if Client.Task.get_status ~rpc ~session_id ~self:task = `pending then
Client.Task.cancel ~rpc ~session_id ~task) tasks;
(* cancel is not immediate, give it a reasonable chance to take effect *)
wait_for_all_inner ~rpc ~session_id ~all_timeout:60. ~tasks;
false
end else true
in

let destroy_all () =
List.iter (fun task ->
(* db gc thread in xapi may delete task from tasks table *)
D.log_and_ignore_exn (fun () -> Client.Task.destroy ~rpc ~session_id ~self:task)) tasks
in
Xapi_stdext_pervasives.Pervasiveext.finally wait_or_cancel destroy_all
5 changes: 5 additions & 0 deletions ocaml/xapi-client/tasks.mli
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,8 @@
(** [wait_for_all ~rpc ~session_id ~tasks] returns when all of [tasks]
are in some non-pending state. *)
val wait_for_all : rpc:(Rpc.call -> Rpc.response) -> session_id:API.ref_session -> tasks:API.ref_task list -> unit

(** [with_tasks_destroy ~rpc ~session_id ~timeout ~tasks] is like [wait_for_all] except after [timeout] has elapsed
it will cancel pending tasks and return false.
Finally it will destroy all tasks *)
val with_tasks_destroy: rpc:(Rpc.call -> Rpc.response) -> session_id:API.ref_session -> timeout:float -> tasks:API.ref_task list -> bool
9 changes: 9 additions & 0 deletions ocaml/xapi/cli_frontend.ml
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,15 @@ let rec cmdtable_data : (string*cmd_spec) list =
flags=[Host_selectors];
};

"host-prepare-for-poweroff",
{
reqd=[];
optn=[];
help="Perform the necessary actions before host shutdown or reboot.";
implementation=No_fd Cli_operations.host_prepare_for_poweroff;
flags=[Hidden];
};

"host-dmesg",
{
reqd=[];
Expand Down
5 changes: 5 additions & 0 deletions ocaml/xapi/cli_operations.ml
Original file line number Diff line number Diff line change
Expand Up @@ -4131,6 +4131,11 @@ let host_reboot printer rpc session_id params =
let host_power_on printer rpc session_id params =
ignore(do_host_op rpc session_id (fun _ host -> Client.Host.power_on rpc session_id (host.getref ())) params [])

let host_prepare_for_poweroff _printer rpc session_id params =
let uuid = List.assoc "uuid" params in
let host = Client.Host.get_by_uuid ~rpc ~session_id ~uuid in
Client.Host.prepare_for_poweroff ~rpc ~session_id ~host

let host_dmesg printer rpc session_id params =
let op _ host =
let dmesg = Client.Host.dmesg rpc session_id (host.getref ()) in
Expand Down
6 changes: 6 additions & 0 deletions ocaml/xapi/message_forwarding.ml
Original file line number Diff line number Diff line change
Expand Up @@ -2279,6 +2279,12 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct
do_op_on ~local_fn ~__context ~host (fun session_id rpc -> Client.Host.reboot rpc session_id host)
)

(* This is only be called by systemd during shutdown when xapi-domains.service is stopped *)
let prepare_for_poweroff ~__context ~host =
info "Host.prepare_for_poweroff: host = '%s'" (host_uuid ~__context host);
let local_fn = Local.Host.prepare_for_poweroff ~host in
do_op_on ~local_fn ~__context ~host (fun session_id rpc -> Client.Host.prepare_for_poweroff rpc session_id host)

let power_on ~__context ~host =
info "Host.power_on: host = '%s'" (host_uuid ~__context host);
with_host_operation ~__context ~self:host ~doc:"Host.power_on" ~op:`power_on
Expand Down
113 changes: 113 additions & 0 deletions ocaml/xapi/vm_evacuation.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
module D=Debug.Make(struct let name="xapi" end)
open D

let estimate_evacuate_timeout ~__context ~host =
let mref = Db.Host.get_metrics ~__context ~self:host in
let metrics = Db.Host_metrics.get_record ~__context ~self:mref in
let memory_used = Int64.sub metrics.API.host_metrics_memory_total metrics.API.host_metrics_memory_free in
(* Conservative estimation based on 1000Mbps link, and the memory usage of
Dom0 (which is not going to be transferred) is an intentional surplus *)
let t = ((Int64.to_float memory_used) *. 8. /. (1000. *. 1024. *. 1024.)) in
max 240. t

(* Returns a tuple of lists: The first containing the control domains, and the second containing the regular VMs *)
let get_resident_vms ~__context ~self =
let my_resident_vms = Db.Host.get_resident_VMs ~__context ~self in
List.partition (fun vm -> Db.VM.get_is_control_domain ~__context ~self:vm) my_resident_vms

let ensure_no_vms ~__context ~rpc ~session_id ~evacuate_timeout =
let open Client in

let is_running vm =
Db.VM.get_power_state ~__context ~self:vm = `Running
in

let host = Helpers.get_localhost ~__context in
let self_managed_poweroff vm =
let result = Db.VM.get_other_config ~__context ~self:vm
|> List.mem_assoc "auto_poweroff" in
if result then
debug "Skip running VM %s: has self-managed poweroff" (Db.VM.get_name_label ~__context ~self:vm);
result
in
let get_running_domains () =
get_resident_vms ~__context ~self:host |> snd
|> List.filter (fun vm -> is_running vm && not (self_managed_poweroff vm))
in

let cancel_vm_tasks self =
Db.VM.get_current_operations ~__context ~self
|> List.rev_map fst
|> List.rev_map Ref.of_string
|> List.iter (fun (task:[`task] Ref.t) ->
let name = Db.VM.get_name_label ~__context ~self in
debug "Canceling operation on VM %s" name;
log_and_ignore_exn (fun () -> Client.Task.cancel ~rpc ~session_id ~task))
in

let evacuate () =
TaskHelper.exn_if_cancelling ~__context; (* First check if _we_ have been cancelled *)
info "Requesting evacuation of host";
let timeout = if evacuate_timeout > 0. then evacuate_timeout
else estimate_evacuate_timeout ~__context ~host in
let tasks = [ Client.Async.Host.evacuate ~rpc ~session_id ~host ] in
if not (Tasks.with_tasks_destroy ~rpc ~session_id ~timeout ~tasks) then begin
get_running_domains ()
|> List.iter cancel_vm_tasks
end
in

let clean_shutdown vms =
TaskHelper.exn_if_cancelling ~__context; (* First check if _we_ have been cancelled *)
let tasks =
vms
|> List.filter (fun vm ->
List.mem `clean_shutdown (Client.VM.get_allowed_operations ~rpc ~session_id ~self:vm))
|> List.map (fun vm ->
let name_label = Client.VM.get_name_label ~rpc ~session_id ~self:vm in
debug "Requesting clean shutdown of VM: %s" name_label;
Client.Async.VM.clean_shutdown ~rpc ~session_id ~vm) in
Tasks.with_tasks_destroy ~rpc ~session_id ~timeout:60. ~tasks |> ignore
in

let hard_shutdown vms =
TaskHelper.exn_if_cancelling ~__context; (* First check if _we_ have been cancelled *)
let tasks =
vms
|> List.map (fun vm ->
let name_label = Client.VM.get_name_label ~rpc ~session_id ~self:vm in
debug "Requesting hard shutdown of VM: %s" name_label;
Client.Async.VM.hard_shutdown ~rpc ~session_id ~vm) in
(* no timeout: we need the VMs to be off *)
Tasks.wait_for_all ~rpc ~session_id ~tasks;
vms
|> List.filter is_running
|> List.iter (fun vm ->
let name_label = Client.VM.get_name_label ~rpc ~session_id ~self:vm in
info "Failure performing hard shutdown of VM: %s" name_label)
in

let shutdown vms =
log_and_ignore_exn (fun () -> clean_shutdown vms);
(* We can unplug the PBD if a VM is suspended or halted, but not if
* it is running or paused, i.e. "live" *)
vms
|> List.filter (fun self -> Xapi_vm_lifecycle.is_live ~__context ~self)
|> hard_shutdown
in

log_and_ignore_exn (fun () ->
Client.Host.get_vms_which_prevent_evacuation ~rpc ~session_id ~self:host
|> Xapi_stdext_std.Listext.List.filter_map (fun (vm, _) ->
if self_managed_poweroff vm then None
else Some vm)
|> shutdown;

evacuate ());

log_and_ignore_exn (fun () -> get_running_domains () |> shutdown)

let ensure_no_vms ~__context ~evacuate_timeout =
Helpers.call_api_functions ~__context (fun rpc session_id ->
ensure_no_vms ~__context ~rpc ~session_id ~evacuate_timeout)

39 changes: 31 additions & 8 deletions ocaml/xapi/xapi_ha.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1511,6 +1511,21 @@ let enable __context heartbeat_srs configuration =
notice the invalid state and disable its HA *)
raise exn

let assert_have_statefile_access ~__context ~host =
let pool = Helpers.get_pool ~__context in
if Db.Pool.get_ha_enabled ~__context ~self:pool then begin
let liveset = query_liveset () in
let me =
Hashtbl.find
liveset.Xha_interface.LiveSetInformation.hosts
liveset.Xha_interface.LiveSetInformation.local_host_id
in
if (not me.Xha_interface.LiveSetInformation.Host.state_file_access) ||
me.Xha_interface.LiveSetInformation.Host.state_file_corrupted
then raise (Api_errors.Server_error(Api_errors.ha_lost_statefile, []))
end

let before_clean_shutdown_or_reboot_precheck = assert_have_statefile_access

let before_clean_shutdown_or_reboot ~__context ~host =
let pool = Helpers.get_pool ~__context in
Expand All @@ -1527,13 +1542,7 @@ let before_clean_shutdown_or_reboot ~__context ~host =
then we lose it and ha_set_excluded fails, manually fence ourselves. *)

(* Safe early abort if we don't have statefile access *)
let liveset = query_liveset () in
let me = Hashtbl.find liveset.Xha_interface.LiveSetInformation.hosts
liveset.Xha_interface.LiveSetInformation.local_host_id in
if false
|| not(me.Xha_interface.LiveSetInformation.Host.state_file_access)
|| me.Xha_interface.LiveSetInformation.Host.state_file_corrupted
then raise (Api_errors.Server_error(Api_errors.ha_lost_statefile, []));
assert_have_statefile_access ~__context ~host;

(* From this point we will fence ourselves if any unexpected error occurs *)
begin try
Expand Down Expand Up @@ -1567,5 +1576,19 @@ let before_clean_shutdown_or_reboot ~__context ~host =
info "Still waiting to reboot after %.2f seconds" (Unix.gettimeofday () -. start)
done
end;
List.iter Static_vdis.detach_only (Static_vdis.list())

(* We must do this before attempting to detach the VDI holding the redo log,
otherwise we would either get an error later or hang.

Note that Xha_metadata_vdi is a VDI with reason = ha_metadata_vdi_reason and type=`redo_log:
type=`metadata is for DR *)
debug "About to close active redo logs";
Redo_log.with_active_redo_logs (Redo_log.shutdown);

(* We cannot call ha_release_resources because we want to keep HA armed on reboot *)
debug "About to detach static VDIs";

List.iter (Static_vdis.detach_only) (Static_vdis.list ());

debug "Detached static VDIs"
end
2 changes: 2 additions & 0 deletions ocaml/xapi/xapi_ha.mli
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,5 @@ val before_clean_shutdown_or_reboot : __context:Context.t -> host:'a -> unit
(** Called before shutting down or rebooting a host
(called by the host.shutdown, host.reboot API functions). *)

val before_clean_shutdown_or_reboot_precheck : __context:Context.t -> host:'a -> unit
(** Only runs the checks performed by [before_clean_shutdown_or_reboot]. *)
Loading