From d823e045d9ebba2372172031b2fbc414bb645433 Mon Sep 17 00:00:00 2001 From: Steven Woods Date: Thu, 22 May 2025 16:58:03 +0100 Subject: [PATCH 001/111] Improve the xapi_observer debug logs by adding more context Signed-off-by: Steven Woods --- ocaml/xapi/xapi_observer.ml | 66 ++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/ocaml/xapi/xapi_observer.ml b/ocaml/xapi/xapi_observer.ml index 404c4496f29..0dd53865307 100644 --- a/ocaml/xapi/xapi_observer.ml +++ b/ocaml/xapi/xapi_observer.ml @@ -61,56 +61,56 @@ end module Observer : ObserverInterface = struct let create ~__context ~uuid ~name_label ~attributes ~endpoints ~enabled = - debug "Observer.create %s" uuid ; + debug "xapi Observer.create %s" uuid ; Tracing.TracerProvider.create ~uuid ~name_label ~attributes ~endpoints ~enabled let destroy ~__context ~uuid = - debug "Observer.destroy %s" uuid ; + debug "xapi Observer.destroy %s" uuid ; Tracing.TracerProvider.destroy ~uuid let set_enabled ~__context ~uuid ~enabled = - debug "Observer.set_enabled %s" uuid ; + debug "xapi Observer.set_enabled %s" uuid ; Tracing.TracerProvider.set ~uuid ~enabled () let set_attributes ~__context ~uuid ~attributes = - debug "Observer.set_attributes %s" uuid ; + debug "xapi Observer.set_attributes %s" uuid ; Tracing.TracerProvider.set ~uuid ~attributes () let set_endpoints ~__context ~uuid ~endpoints = - debug "Observer.set_endpoints %s" uuid ; + debug "xapi Observer.set_endpoints %s" uuid ; Tracing.TracerProvider.set ~uuid ~endpoints () let init ~__context = - debug "Observer.init" ; + debug "xapi Observer.init" ; ignore @@ Tracing_export.main () let set_trace_log_dir ~__context ~dir = - debug "Observer.set_trace_log_dir" ; + debug "xapi Observer.set_trace_log_dir" ; Tracing_export.Destination.File.set_trace_log_dir dir let set_export_interval ~__context ~interval = - debug "Observer.set_export_interval" ; + debug "xapi Observer.set_export_interval" ; Tracing_export.set_export_interval interval let set_max_spans ~__context ~spans = - debug "Observer.set_max_spans" ; + debug "xapi Observer.set_max_spans" ; Tracing.Spans.set_max_spans spans let set_max_traces ~__context ~traces = - debug "Observer.set_max_traces" ; + debug "xapi Observer.set_max_traces" ; Tracing.Spans.set_max_traces traces let set_max_file_size ~__context ~file_size = - debug "Observer.set_max_file_size" ; + debug "xapi Observer.set_max_file_size" ; Tracing_export.Destination.File.set_max_file_size file_size let set_host_id ~__context ~host_id = - debug "Observer.set_host_id" ; + debug "xapi Observer.set_host_id" ; Tracing_export.set_host_id host_id let set_compress_tracing_files ~__context ~enabled = - debug "Observer.set_compress_tracing_files" ; + debug "xapi Observer.set_compress_tracing_files" ; Tracing_export.Destination.File.set_compress_tracing_files enabled end @@ -142,79 +142,79 @@ module Xapi_cluster = struct module Observer = struct let create ~__context ~uuid ~name_label ~attributes ~endpoints ~enabled = - debug "Observer.create %s" uuid ; + debug "xapi_cluster Observer.create %s" uuid ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.create dbg uuid name_label attributes endpoints enabled let destroy ~__context ~uuid = - debug "Observer.destroy %s" uuid ; + debug "xapi_cluster Observer.destroy %s" uuid ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.destroy dbg uuid let set_enabled 
~__context ~uuid ~enabled = - debug "Observer.set_enabled %s" uuid ; + debug "xapi_cluster Observer.set_enabled %s" uuid ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.set_enabled dbg uuid enabled let set_attributes ~__context ~uuid ~attributes = - debug "Observer.set_attributes %s" uuid ; + debug "xapi_cluster Observer.set_attributes %s" uuid ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.set_attributes dbg uuid attributes let set_endpoints ~__context ~uuid ~endpoints = - debug "Observer.set_endpoints %s" uuid ; + debug "xapi_cluster Observer.set_endpoints %s" uuid ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.set_endpoints dbg uuid endpoints let init ~__context = - debug "Observer.init" ; + debug "xapi_cluster Observer.init" ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.init dbg let set_trace_log_dir ~__context ~dir = - debug "Observer.set_trace_log_dir" ; + debug "xapi_cluster Observer.set_trace_log_dir" ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.set_trace_log_dir dbg dir let set_export_interval ~__context ~interval = - debug "Observer.set_export_interval" ; + debug "xapi_cluster Observer.set_export_interval" ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.set_export_interval dbg interval let set_max_spans ~__context ~spans = - debug "Observer.set_max_spans" ; + debug "xapi_cluster Observer.set_max_spans" ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.set_max_spans dbg spans let set_max_traces ~__context ~traces = - debug "Observer.set_max_traces" ; + debug "xapi_cluster Observer.set_max_traces" ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.set_max_traces dbg traces let set_max_file_size ~__context ~file_size = - debug "Observer.set_max_file_size" ; + debug "xapi_cluster Observer.set_max_file_size" ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.set_max_file_size dbg file_size let set_host_id ~__context ~host_id = - debug "Observer.set_host_id" ; + debug "xapi_cluster Observer.set_host_id" ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.set_host_id dbg host_id let set_compress_tracing_files ~__context ~enabled = - debug "Observer.set_compress_tracing_files" ; + debug "xapi_cluster Observer.set_compress_tracing_files" ; let module S = (val local_client ~__context : XAPI_CLUSTER) in let dbg = Context.string_of_task __context in S.Observer.set_compress_tracing_files dbg enabled @@ -331,28 +331,40 @@ module Dom0ObserverConfig (ObserverComponent : OBSERVER_COMPONENT) : let create ~__context ~uuid ~name_label:_ ~attributes:_ ~endpoints:_ ~enabled:_ = + debug "%s config Observer.create" (to_string ObserverComponent.component) ; let observer = Db.Observer.get_by_uuid ~__context ~uuid in update_config ~__context ~observer ~uuid - let destroy ~__context ~uuid = remove_config ~uuid + let destroy ~__context ~uuid = + debug "%s config Observer.destroy" 
(to_string ObserverComponent.component) ; + remove_config ~uuid let set_enabled ~__context ~uuid ~enabled:_ = + debug "%s config Observer.set_enabled" + (to_string ObserverComponent.component) ; let observer = Db.Observer.get_by_uuid ~__context ~uuid in update_config ~__context ~observer ~uuid let set_attributes ~__context ~uuid ~attributes:_ = + debug "%s config Observer.set_attributes" + (to_string ObserverComponent.component) ; let observer = Db.Observer.get_by_uuid ~__context ~uuid in update_config ~__context ~observer ~uuid let set_endpoints ~__context ~uuid ~endpoints:_ = + debug "%s config Observer.set_endpoints" + (to_string ObserverComponent.component) ; let observer = Db.Observer.get_by_uuid ~__context ~uuid in update_config ~__context ~observer ~uuid let init ~__context = + debug "%s config Observer.init" (to_string ObserverComponent.component) ; let observer_all = Db.Observer.get_all ~__context in update_all_configs ~__context ~observer_all let set_trace_log_dir ~__context ~dir:_ = + debug "%s config Observer.set_trace_log_dir" + (to_string ObserverComponent.component) ; let observer_all = Db.Observer.get_all ~__context in update_all_configs ~__context ~observer_all From 13042fd0580fb314ae785aba1b762a8f7a241e6b Mon Sep 17 00:00:00 2001 From: Steven Woods Date: Tue, 27 May 2025 10:44:33 +0100 Subject: [PATCH 002/111] Reduce code duplication by using a common Observer Interface Besides the errors, Xapi_cluster and Xenopsd use the exact same Observer RPC definitions. Add a new Observer error (as the unique errors for cluster/xenops are not applicable to the Observer functions anyway) and use common code to remove this duplication. Signed-off-by: Steven Woods --- ocaml/xapi-idl/cluster/cluster_interface.ml | 77 +--------- ocaml/xapi-idl/lib/dune | 2 +- ocaml/xapi-idl/lib/observer_helpers.ml | 150 ++++++++++++++++++++ ocaml/xapi-idl/xen/xenops_interface.ml | 77 +--------- 4 files changed, 153 insertions(+), 153 deletions(-) create mode 100644 ocaml/xapi-idl/lib/observer_helpers.ml diff --git a/ocaml/xapi-idl/cluster/cluster_interface.ml b/ocaml/xapi-idl/cluster/cluster_interface.ml index a39fc0a2ae9..d537cf0f99e 100644 --- a/ocaml/xapi-idl/cluster/cluster_interface.ml +++ b/ocaml/xapi-idl/cluster/cluster_interface.ml @@ -384,80 +384,5 @@ module LocalAPI (R : RPC) = struct (debug_info_p @-> timeout_p @-> returning result_p err) end - module Observer = struct - open TypeCombinators - - let endpoints_p = Param.mk ~name:"endpoints" (list Types.string) - - let bool_p = Param.mk ~name:"bool" Types.bool - - let uuid_p = Param.mk ~name:"uuid" Types.string - - let name_label_p = Param.mk ~name:"name_label" Types.string - - let dict_p = Param.mk ~name:"dict" dict - - let string_p = Param.mk ~name:"string" Types.string - - let int_p = Param.mk ~name:"int" Types.int - - let float_p = Param.mk ~name:"float" Types.float - - let create = - declare "Observer.create" [] - (debug_info_p - @-> uuid_p - @-> name_label_p - @-> dict_p - @-> endpoints_p - @-> bool_p - @-> returning unit_p err - ) - - let destroy = - declare "Observer.destroy" [] - (debug_info_p @-> uuid_p @-> returning unit_p err) - - let set_enabled = - declare "Observer.set_enabled" [] - (debug_info_p @-> uuid_p @-> bool_p @-> returning unit_p err) - - let set_attributes = - declare "Observer.set_attributes" [] - (debug_info_p @-> uuid_p @-> dict_p @-> returning unit_p err) - - let set_endpoints = - declare "Observer.set_endpoints" [] - (debug_info_p @-> uuid_p @-> endpoints_p @-> returning unit_p err) - - let init = declare 
"Observer.init" [] (debug_info_p @-> returning unit_p err) - - let set_trace_log_dir = - declare "Observer.set_trace_log_dir" [] - (debug_info_p @-> string_p @-> returning unit_p err) - - let set_export_interval = - declare "Observer.set_export_interval" [] - (debug_info_p @-> float_p @-> returning unit_p err) - - let set_host_id = - declare "Observer.set_host_id" [] - (debug_info_p @-> string_p @-> returning unit_p err) - - let set_max_traces = - declare "Observer.set_max_traces" [] - (debug_info_p @-> int_p @-> returning unit_p err) - - let set_max_spans = - declare "Observer.set_max_spans" [] - (debug_info_p @-> int_p @-> returning unit_p err) - - let set_max_file_size = - declare "Observer.set_max_file_size" [] - (debug_info_p @-> int_p @-> returning unit_p err) - - let set_compress_tracing_files = - declare "Observer.set_compress_tracing_files" [] - (debug_info_p @-> bool_p @-> returning unit_p err) - end + module Observer = Observer_helpers.ObserverAPI (R) end diff --git a/ocaml/xapi-idl/lib/dune b/ocaml/xapi-idl/lib/dune index 8f0d7ca27de..f0f1f4ce588 100644 --- a/ocaml/xapi-idl/lib/dune +++ b/ocaml/xapi-idl/lib/dune @@ -40,7 +40,7 @@ (wrapped false) (preprocess (per_module - ((pps ppx_deriving_rpc) Xcp_channel Xcp_channel_protocol TypeCombinators) + ((pps ppx_deriving_rpc) Xcp_channel Xcp_channel_protocol TypeCombinators Observer_helpers) ((pps ppx_sexp_conv ppx_deriving_rpc) Xcp_pci)))) (library diff --git a/ocaml/xapi-idl/lib/observer_helpers.ml b/ocaml/xapi-idl/lib/observer_helpers.ml new file mode 100644 index 00000000000..959a666b1c4 --- /dev/null +++ b/ocaml/xapi-idl/lib/observer_helpers.ml @@ -0,0 +1,150 @@ +(* + * Copyright (c) Cloud Software Group + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + *) + +open Rpc +open Idl + +module D = Debug.Make (struct let name = "observer_interface" end) + +open D + +module Errors = struct + type error = + | Internal_error of string + | Unimplemented of string + | Unknown_error + [@@default Unknown_error] [@@deriving rpcty] +end + +exception Observer_error of Errors.error + +let err = + let open Error in + { + def= Errors.error + ; raiser= + (fun e -> + let exn = Observer_error e in + error "%s (%s)" (Printexc.to_string exn) __LOC__ ; + raise exn + ) + ; matcher= + (function + | Observer_error e as exn -> + error "%s (%s)" (Printexc.to_string exn) __LOC__ ; + Some e + | exn -> + error "%s (%s)" (Printexc.to_string exn) __LOC__ ; + Some (Internal_error (Printexc.to_string exn)) + ) + } + +(** An uninterpreted string associated with the operation. 
*) +type debug_info = string [@@deriving rpcty] + +module ObserverAPI (R : RPC) = struct + open R + open TypeCombinators + + let description = + let open Interface in + { + name= "Observer" + ; namespace= None + ; description= + [ + "This interface is used to create, update and destroy Observers to \ + control the use of tracing in different xapi components" + ] + ; version= (1, 0, 0) + } + + let implementation = implement description + + let dbg_p = Param.mk ~name:"dbg" Types.string + + let unit_p = Param.mk ~name:"unit" Types.unit + + let endpoints_p = Param.mk ~name:"endpoints" (list Types.string) + + let bool_p = Param.mk ~name:"bool" Types.bool + + let uuid_p = Param.mk ~name:"uuid" Types.string + + let name_label_p = Param.mk ~name:"name_label" Types.string + + let dict_p = Param.mk ~name:"dict" dict + + let string_p = Param.mk ~name:"string" Types.string + + let int_p = Param.mk ~name:"int" Types.int + + let float_p = Param.mk ~name:"float" Types.float + + let create = + declare "Observer.create" [] + (dbg_p + @-> uuid_p + @-> name_label_p + @-> dict_p + @-> endpoints_p + @-> bool_p + @-> returning unit_p err + ) + + let destroy = + declare "Observer.destroy" [] (dbg_p @-> uuid_p @-> returning unit_p err) + + let set_enabled = + declare "Observer.set_enabled" [] + (dbg_p @-> uuid_p @-> bool_p @-> returning unit_p err) + + let set_attributes = + declare "Observer.set_attributes" [] + (dbg_p @-> uuid_p @-> dict_p @-> returning unit_p err) + + let set_endpoints = + declare "Observer.set_endpoints" [] + (dbg_p @-> uuid_p @-> endpoints_p @-> returning unit_p err) + + let init = declare "Observer.init" [] (dbg_p @-> returning unit_p err) + + let set_trace_log_dir = + declare "Observer.set_trace_log_dir" [] + (dbg_p @-> string_p @-> returning unit_p err) + + let set_export_interval = + declare "Observer.set_export_interval" [] + (dbg_p @-> float_p @-> returning unit_p err) + + let set_max_spans = + declare "Observer.set_max_spans" [] + (dbg_p @-> int_p @-> returning unit_p err) + + let set_max_traces = + declare "Observer.set_max_traces" [] + (dbg_p @-> int_p @-> returning unit_p err) + + let set_max_file_size = + declare "Observer.set_max_file_size" [] + (dbg_p @-> int_p @-> returning unit_p err) + + let set_host_id = + declare "Observer.set_host_id" [] + (dbg_p @-> string_p @-> returning unit_p err) + + let set_compress_tracing_files = + declare "Observer.set_compress_tracing_files" [] + (dbg_p @-> bool_p @-> returning unit_p err) +end diff --git a/ocaml/xapi-idl/xen/xenops_interface.ml b/ocaml/xapi-idl/xen/xenops_interface.ml index 4c9da479a78..85ac0665450 100644 --- a/ocaml/xapi-idl/xen/xenops_interface.ml +++ b/ocaml/xapi-idl/xen/xenops_interface.ml @@ -1152,80 +1152,5 @@ module XenopsAPI (R : RPC) = struct (debug_info_p @-> unit_p @-> returning unit_p err) end - module Observer = struct - open TypeCombinators - - let endpoints_p = Param.mk ~name:"endpoints" (list Types.string) - - let bool_p = Param.mk ~name:"bool" Types.bool - - let uuid_p = Param.mk ~name:"uuid" Types.string - - let name_label_p = Param.mk ~name:"name_label" Types.string - - let dict_p = Param.mk ~name:"dict" dict - - let string_p = Param.mk ~name:"string" Types.string - - let int_p = Param.mk ~name:"int" Types.int - - let float_p = Param.mk ~name:"float" Types.float - - let create = - declare "Observer.create" [] - (debug_info_p - @-> uuid_p - @-> name_label_p - @-> dict_p - @-> endpoints_p - @-> bool_p - @-> returning unit_p err - ) - - let destroy = - declare "Observer.destroy" [] - (debug_info_p @-> uuid_p @-> 
returning unit_p err) - - let set_enabled = - declare "Observer.set_enabled" [] - (debug_info_p @-> uuid_p @-> bool_p @-> returning unit_p err) - - let set_attributes = - declare "Observer.set_attributes" [] - (debug_info_p @-> uuid_p @-> dict_p @-> returning unit_p err) - - let set_endpoints = - declare "Observer.set_endpoints" [] - (debug_info_p @-> uuid_p @-> endpoints_p @-> returning unit_p err) - - let init = declare "Observer.init" [] (debug_info_p @-> returning unit_p err) - - let set_trace_log_dir = - declare "Observer.set_trace_log_dir" [] - (debug_info_p @-> string_p @-> returning unit_p err) - - let set_export_interval = - declare "Observer.set_export_interval" [] - (debug_info_p @-> float_p @-> returning unit_p err) - - let set_host_id = - declare "Observer.set_host_id" [] - (debug_info_p @-> string_p @-> returning unit_p err) - - let set_max_traces = - declare "Observer.set_max_traces" [] - (debug_info_p @-> int_p @-> returning unit_p err) - - let set_max_spans = - declare "Observer.set_max_spans" [] - (debug_info_p @-> int_p @-> returning unit_p err) - - let set_max_file_size = - declare "Observer.set_max_file_size" [] - (debug_info_p @-> int_p @-> returning unit_p err) - - let set_compress_tracing_files = - declare "Observer.set_compress_tracing_files" [] - (debug_info_p @-> bool_p @-> returning unit_p err) - end + module Observer = Observer_helpers.ObserverAPI (R) end From 4ab193a0615fcc1727cf188d921bf6a6be046ba3 Mon Sep 17 00:00:00 2001 From: Steven Woods Date: Tue, 27 May 2025 11:20:03 +0100 Subject: [PATCH 003/111] CA-409431: Use an Observer forwarder for xapi-storage-script Currently, xapi-storage-script uses the presence/absence of a smapi observer config file to determine whether it should create traces. This only happens on startup which means smapiv3 traces will often not be created when they should be. This commit updates the Smapi Observer forwarder to use an RPC client to send messages to xapi-storage-script, updating it on any relevant changes to the Observer. 
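As a condensed sketch of the forwarding shape this patch introduces (the complete version is in the xapi_observer.ml hunk further down), the snippet below relies on the repo-internal modules shown in the diff (SMObserverConfig, Context, Observer_helpers.Client), so it is illustrative rather than standalone: each Observer call still updates the dom0 config file as before, and is then forwarded over RPC so xapi-storage-script sees the change at runtime rather than only at startup.

```ocaml
(* Condensed from the xapi_observer.ml hunk below; not a standalone module.
   SMObserverConfig, Context and Observer_helpers.Client are defined in the
   patch itself. *)
module SMObserver = struct
  include SMObserverConfig
  open Observer_helpers

  let set_enabled ~__context ~uuid ~enabled =
    (* keep the on-disk observer config in sync, as before *)
    SMObserverConfig.set_enabled ~__context ~uuid ~enabled ;
    (* then notify the running xapi-storage-script daemon over RPC *)
    let dbg = Context.string_of_task __context in
    Client.set_enabled dbg uuid enabled
end
```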
Signed-off-by: Steven Woods --- ocaml/xapi-idl/lib/dune | 2 +- ocaml/xapi-idl/lib/observer_helpers.ml | 114 ++++++++++++++++++++++++ ocaml/xapi-idl/lib/observer_skeleton.ml | 48 ++++++++++ ocaml/xapi-storage-script/dune | 1 + ocaml/xapi-storage-script/main.ml | 38 ++++---- ocaml/xapi/xapi_observer.ml | 26 +++++- quality-gate.sh | 2 +- 7 files changed, 213 insertions(+), 18 deletions(-) create mode 100644 ocaml/xapi-idl/lib/observer_skeleton.ml diff --git a/ocaml/xapi-idl/lib/dune b/ocaml/xapi-idl/lib/dune index f0f1f4ce588..4f29504a97a 100644 --- a/ocaml/xapi-idl/lib/dune +++ b/ocaml/xapi-idl/lib/dune @@ -40,7 +40,7 @@ (wrapped false) (preprocess (per_module - ((pps ppx_deriving_rpc) Xcp_channel Xcp_channel_protocol TypeCombinators Observer_helpers) + ((pps ppx_deriving_rpc) Xcp_channel Xcp_channel_protocol TypeCombinators Observer_helpers Observer_skeleton) ((pps ppx_sexp_conv ppx_deriving_rpc) Xcp_pci)))) (library diff --git a/ocaml/xapi-idl/lib/observer_helpers.ml b/ocaml/xapi-idl/lib/observer_helpers.ml index 959a666b1c4..125ba101722 100644 --- a/ocaml/xapi-idl/lib/observer_helpers.ml +++ b/ocaml/xapi-idl/lib/observer_helpers.ml @@ -19,6 +19,16 @@ module D = Debug.Make (struct let name = "observer_interface" end) open D +let service_name = "observer" + +let queue_name = Xcp_service.common_prefix ^ service_name + +let default_sockets_dir = "/var/lib/xcp" + +let default_path = Filename.concat default_sockets_dir service_name + +let uri () = "file:" ^ default_path + module Errors = struct type error = | Internal_error of string @@ -148,3 +158,107 @@ module ObserverAPI (R : RPC) = struct declare "Observer.set_compress_tracing_files" [] (dbg_p @-> bool_p @-> returning unit_p err) end + +module type Server_impl = sig + type context = unit + + val create : + context + -> dbg:debug_info + -> uuid:string + -> name_label:string + -> attributes:(string * string) list + -> endpoints:string list + -> enabled:bool + -> unit + + val destroy : context -> dbg:debug_info -> uuid:string -> unit + + val set_enabled : + context -> dbg:debug_info -> uuid:string -> enabled:bool -> unit + + val set_attributes : + context + -> dbg:debug_info + -> uuid:string + -> attributes:(string * string) list + -> unit + + val set_endpoints : + context -> dbg:debug_info -> uuid:string -> endpoints:string list -> unit + + val init : context -> dbg:debug_info -> unit + + val set_trace_log_dir : context -> dbg:debug_info -> dir:string -> unit + + val set_export_interval : context -> dbg:debug_info -> interval:float -> unit + + val set_max_spans : context -> dbg:debug_info -> spans:int -> unit + + val set_max_traces : context -> dbg:debug_info -> traces:int -> unit + + val set_max_file_size : context -> dbg:debug_info -> file_size:int -> unit + + val set_host_id : context -> dbg:debug_info -> host_id:string -> unit + + val set_compress_tracing_files : + context -> dbg:debug_info -> enabled:bool -> unit +end + +module Server (Impl : Server_impl) () = struct + module S = ObserverAPI (Idl.Exn.GenServer ()) + + let _ = + S.create (fun dbg uuid name_label attributes endpoints enabled -> + Impl.create () ~dbg ~uuid ~name_label ~attributes ~endpoints ~enabled + ) ; + S.destroy (fun dbg uuid -> Impl.destroy () ~dbg ~uuid) ; + S.set_enabled (fun dbg uuid enabled -> + Impl.set_enabled () ~dbg ~uuid ~enabled + ) ; + S.set_attributes (fun dbg uuid attributes -> + Impl.set_attributes () ~dbg ~uuid ~attributes + ) ; + S.set_endpoints (fun dbg uuid endpoints -> + Impl.set_endpoints () ~dbg ~uuid ~endpoints + ) ; + S.init (fun dbg -> 
Impl.init () ~dbg) ; + S.set_trace_log_dir (fun dbg dir -> Impl.set_trace_log_dir () ~dbg ~dir) ; + S.set_export_interval (fun dbg interval -> + Impl.set_export_interval () ~dbg ~interval + ) ; + S.set_max_spans (fun dbg spans -> Impl.set_max_spans () ~dbg ~spans) ; + S.set_max_traces (fun dbg traces -> Impl.set_max_traces () ~dbg ~traces) ; + S.set_max_file_size (fun dbg file_size -> + Impl.set_max_file_size () ~dbg ~file_size + ) ; + S.set_host_id (fun dbg host_id -> Impl.set_host_id () ~dbg ~host_id) ; + S.set_compress_tracing_files (fun dbg enabled -> + Impl.set_compress_tracing_files () ~dbg ~enabled + ) + + (* Bind all *) + let process call = Idl.Exn.server S.implementation call +end + +let rec retry_econnrefused f = + try f () with + | Unix.Unix_error (Unix.ECONNREFUSED, "connect", _) -> + (* debug "Caught ECONNREFUSED; retrying in 5s"; *) + Thread.delay 5. ; retry_econnrefused f + | e -> + (* error "Caught %s: does the observer service need restarting?" + (Printexc.to_string e); *) + raise e + +module Client = ObserverAPI (Idl.Exn.GenClient (struct + open Xcp_client + + let rpc call = + retry_econnrefused (fun () -> + if !use_switch then + json_switch_rpc queue_name call + else + xml_http_rpc ~srcstr:(get_user_agent ()) ~dststr:queue_name uri call + ) +end)) diff --git a/ocaml/xapi-idl/lib/observer_skeleton.ml b/ocaml/xapi-idl/lib/observer_skeleton.ml new file mode 100644 index 00000000000..8cf5e2f5221 --- /dev/null +++ b/ocaml/xapi-idl/lib/observer_skeleton.ml @@ -0,0 +1,48 @@ +(* + * Copyright (c) Cloud Software Group + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ *) +[@@@ocaml.warning "-27"] + +let u x = raise Observer_helpers.(Observer_error (Errors.Unimplemented x)) + +module Observer = struct + type context = unit + + let create ctx ~dbg ~uuid ~name_label ~attributes ~endpoints ~enabled = + u "Observer.create" + + let destroy ctx ~dbg ~uuid = u "Observer.destroy" + + let set_enabled ctx ~dbg ~uuid ~enabled = u "Observer.set_enabled" + + let set_attributes ctx ~dbg ~uuid ~attributes = u "Observer.set_attributes" + + let set_endpoints ctx ~dbg ~uuid ~endpoints = u "Observer.set_endpoints" + + let init ctx ~dbg = u "Observer.init" + + let set_trace_log_dir ctx ~dbg ~dir = u "Observer.set_trace_log_dir" + + let set_export_interval ctx ~dbg ~interval = u "Observer.set_export_interval" + + let set_max_spans ctx ~dbg ~spans = u "Observer.set_max_spans" + + let set_max_traces ctx ~dbg ~traces = u "Observer.set_max_traces" + + let set_max_file_size ctx ~dbg ~file_size = u "Observer.set_max_file_size" + + let set_host_id ctx ~dbg ~host_id = u "Observer.set_host_id" + + let set_compress_tracing_files ctx ~dbg ~enabled = + u "Observer.set_compress_tracing_files" +end diff --git a/ocaml/xapi-storage-script/dune b/ocaml/xapi-storage-script/dune index e1391aed2ca..f917e426ca5 100644 --- a/ocaml/xapi-storage-script/dune +++ b/ocaml/xapi-storage-script/dune @@ -41,6 +41,7 @@ sexplib sexplib0 uri + threads.posix xapi-backtrace xapi-consts xapi-consts.xapi_version diff --git a/ocaml/xapi-storage-script/main.ml b/ocaml/xapi-storage-script/main.ml index 1b15a17f46e..e04a93203b3 100644 --- a/ocaml/xapi-storage-script/main.ml +++ b/ocaml/xapi-storage-script/main.ml @@ -411,19 +411,6 @@ let observer_config_dir = in dir // component // "enabled" -(** Determine if SM API observation is enabled from the - filesystem. Ordinarily, determining if a component is enabled - would consist of querying the 'components' field of an observer - from the xapi database. *) -let observer_is_component_enabled () = - let is_enabled () = - let is_config_file path = Filename.check_suffix path ".observer.conf" in - let* files = Sys.readdir observer_config_dir in - Lwt.return (List.exists is_config_file files) - in - let* result = Deferred.try_with is_enabled in - Lwt.return (Option.value (Result.to_option result) ~default:false) - (** Call the script named after the RPC method in the [script_dir] directory. The arguments (not the whole JSON-RPC call) are passed as JSON to its stdin, and stdout is returned. 
In case of a non-zero exit code, @@ -2247,6 +2234,19 @@ let register_exn_pretty_printers () = assert false ) +module XapiStorageScript : Observer_helpers.Server_impl = struct + include Observer_skeleton.Observer + + let create _context ~dbg:_ ~uuid:_ ~name_label:_ ~attributes:_ ~endpoints:_ + ~enabled = + config.use_observer <- enabled + + let destroy _context ~dbg:_ ~uuid:_ = config.use_observer <- false + + let set_enabled _context ~dbg:_ ~uuid:_ ~enabled = + config.use_observer <- enabled +end + let () = register_exn_pretty_printers () ; let root_dir = ref "/var/lib/xapi/storage-scripts" in @@ -2293,9 +2293,17 @@ let () = Logs.set_reporter (lwt_reporter ()) ; Logs.set_level ~all:true (Some Logs.Info) ; + + let module S = Observer_helpers.Server (XapiStorageScript) () in + let s = + Xcp_service.make ~path:Observer_helpers.default_path + ~queue_name:Observer_helpers.queue_name ~rpc_fn:S.process () + in + let (_ : Thread.t) = + Thread.create (fun () -> Xcp_service.serve_forever s) () + in + let main = - let* observer_enabled = observer_is_component_enabled () in - config.use_observer <- observer_enabled ; if !self_test_only then self_test ~root_dir:!root_dir else diff --git a/ocaml/xapi/xapi_observer.ml b/ocaml/xapi/xapi_observer.ml index 0dd53865307..62d3ea4359c 100644 --- a/ocaml/xapi/xapi_observer.ml +++ b/ocaml/xapi/xapi_observer.ml @@ -383,6 +383,30 @@ end module SMObserverConfig = Dom0ObserverConfig (struct let component = SMApi end) +module SMObserver = struct + include SMObserverConfig + open Observer_helpers + + let create ~__context ~uuid ~name_label ~attributes ~endpoints ~enabled = + debug "SMObserver Observer.create %s" uuid ; + SMObserverConfig.create ~__context ~uuid ~name_label ~attributes ~endpoints + ~enabled ; + let dbg = Context.string_of_task __context in + Client.create dbg uuid name_label attributes endpoints enabled + + let destroy ~__context ~uuid = + debug "SMObserver Observer.destroy %s" uuid ; + SMObserverConfig.destroy ~__context ~uuid ; + let dbg = Context.string_of_task __context in + Client.destroy dbg uuid + + let set_enabled ~__context ~uuid ~enabled = + debug "SMObserver Observer.set_enabled %s" uuid ; + SMObserverConfig.set_enabled ~__context ~uuid ~enabled ; + let dbg = Context.string_of_task __context in + Client.set_enabled dbg uuid enabled +end + let get_forwarder c = let module Forwarder = ( val match c with @@ -393,7 +417,7 @@ let get_forwarder c = | Xapi_clusterd -> (module Xapi_cluster.Observer) | SMApi -> - (module SMObserverConfig) + (module SMObserver) : ObserverInterface ) in diff --git a/quality-gate.sh b/quality-gate.sh index f6540cb2a1f..6455846d21b 100755 --- a/quality-gate.sh +++ b/quality-gate.sh @@ -25,7 +25,7 @@ verify-cert () { } mli-files () { - N=467 + N=469 X="ocaml/tests" X+="|ocaml/quicktest" X+="|ocaml/message-switch/core_test" From 0b9f900cbc06a1402c95f59cfc6eca97a9f4e3b7 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Mon, 9 Jun 2025 10:26:45 +0100 Subject: [PATCH 004/111] xapi: Move cpu_info keys to xapi-consts from xapi_globs to be used across modules Signed-off-by: Andrii Sultanov --- ocaml/tests/test_xapi_xenops.ml | 4 ++-- ocaml/tests/test_xenopsd_metadata.ml | 4 ++-- ocaml/xapi-consts/constants.ml | 14 ++++++++++++++ ocaml/xapi/cpuid_helpers.ml | 23 +++++++++++------------ ocaml/xapi/create_misc.ml | 12 ++++++------ ocaml/xapi/xapi_globs.ml | 14 -------------- ocaml/xapi/xapi_xenops.ml | 8 ++++---- 7 files changed, 39 insertions(+), 40 deletions(-) diff --git a/ocaml/tests/test_xapi_xenops.ml 
b/ocaml/tests/test_xapi_xenops.ml index 551c7d0d90f..e1f1bf048e2 100644 --- a/ocaml/tests/test_xapi_xenops.ml +++ b/ocaml/tests/test_xapi_xenops.ml @@ -62,8 +62,8 @@ let test_xapi_restart_inner () = in let flags = [ - (Xapi_globs.cpu_info_vendor_key, "AuthenticAMD") - ; (Xapi_globs.cpu_info_features_key, "deadbeef-deadbeef") + (Constants.cpu_info_vendor_key, "AuthenticAMD") + ; (Constants.cpu_info_features_key, "deadbeef-deadbeef") ] in let add_flags vm = diff --git a/ocaml/tests/test_xenopsd_metadata.ml b/ocaml/tests/test_xenopsd_metadata.ml index c052de228fa..14362e73b68 100644 --- a/ocaml/tests/test_xenopsd_metadata.ml +++ b/ocaml/tests/test_xenopsd_metadata.ml @@ -38,8 +38,8 @@ let load_vm_config __context conf = in let flags = [ - (Xapi_globs.cpu_info_vendor_key, "AuthenticAMD") - ; (Xapi_globs.cpu_info_features_key, "deadbeef-deadbeef") + (Constants.cpu_info_vendor_key, "AuthenticAMD") + ; (Constants.cpu_info_features_key, "deadbeef-deadbeef") ] in Db.VM.set_last_boot_CPU_flags ~__context ~self ~value:flags ; diff --git a/ocaml/xapi-consts/constants.ml b/ocaml/xapi-consts/constants.ml index 185f9669a7c..07481abc54d 100644 --- a/ocaml/xapi-consts/constants.ml +++ b/ocaml/xapi-consts/constants.ml @@ -177,6 +177,20 @@ let hvm_boot_params_order = "order" let hvm_default_boot_order = "cd" +(** Keys for different CPUID policies in {Host,Pool}.cpu_info *) + +let cpu_info_vendor_key = "vendor" + +let cpu_info_features_key = "features" + +let cpu_info_features_pv_key = "features_pv" + +let cpu_info_features_hvm_key = "features_hvm" + +let cpu_info_features_pv_host_key = "features_pv_host" + +let cpu_info_features_hvm_host_key = "features_hvm_host" + (* Key we put in VM.other_config when we upgrade a VM from Zurich/Geneva to Rio *) let vm_upgrade_time = "upgraded at" diff --git a/ocaml/xapi/cpuid_helpers.ml b/ocaml/xapi/cpuid_helpers.ml index 1bf6731efad..d001b7ab73f 100644 --- a/ocaml/xapi/cpuid_helpers.ml +++ b/ocaml/xapi/cpuid_helpers.ml @@ -12,8 +12,6 @@ * GNU Lesser General Public License for more details. 
*) -open Xapi_globs - module D = Debug.Make (struct let name = "cpuid_helpers" end) open D @@ -24,20 +22,19 @@ let features_t t = (Xenops_interface.CPU_policy.of_string t) Xenops_interface.CPU_policy.to_string -let features = - Map_check.(field Xapi_globs.cpu_info_features_key (features_t `vm)) +let features = Map_check.(field Constants.cpu_info_features_key (features_t `vm)) let features_pv = - Map_check.(field Xapi_globs.cpu_info_features_pv_key (features_t `host)) + Map_check.(field Constants.cpu_info_features_pv_key (features_t `host)) let features_hvm = - Map_check.(field Xapi_globs.cpu_info_features_hvm_key (features_t `host)) + Map_check.(field Constants.cpu_info_features_hvm_key (features_t `host)) let features_pv_host = - Map_check.(field Xapi_globs.cpu_info_features_pv_host_key (features_t `host)) + Map_check.(field Constants.cpu_info_features_pv_host_key (features_t `host)) let features_hvm_host = - Map_check.(field Xapi_globs.cpu_info_features_hvm_host_key (features_t `host)) + Map_check.(field Constants.cpu_info_features_hvm_host_key (features_t `host)) let cpu_count = Map_check.(field "cpu_count" int) @@ -55,7 +52,7 @@ let get_flags_for_vm ~__context domain_type cpu_info = | `pv -> features_pv in - let vendor = List.assoc cpu_info_vendor_key cpu_info in + let vendor = List.assoc Constants.cpu_info_vendor_key cpu_info in let migration = Map_check.getf features_field cpu_info in (vendor, migration) @@ -124,16 +121,18 @@ let assert_vm_is_compatible ~__context ~vm ~host = get_host_compatibility_info ~__context ~domain_type ~host () in let vm_cpu_info = vm_rec.API.vM_last_boot_CPU_flags in - if List.mem_assoc cpu_info_vendor_key vm_cpu_info then ( + if List.mem_assoc Constants.cpu_info_vendor_key vm_cpu_info then ( (* Check the VM was last booted on a CPU with the same vendor as this host's CPU. *) - let vm_cpu_vendor = List.assoc cpu_info_vendor_key vm_cpu_info in + let vm_cpu_vendor = + List.assoc Constants.cpu_info_vendor_key vm_cpu_info + in debug "VM last booted on CPU of vendor %s; host CPUs are of vendor %s" vm_cpu_vendor host_cpu_vendor ; if vm_cpu_vendor <> host_cpu_vendor then fail "VM last booted on a host which had a CPU from a different vendor." ) ; - if List.mem_assoc cpu_info_features_key vm_cpu_info then ( + if List.mem_assoc Constants.cpu_info_features_key vm_cpu_info then ( (* Check the VM was last booted on a CPU whose features are a subset of the features of this host's CPU. 
*) let vm_cpu_features = Map_check.getf features vm_cpu_info in debug diff --git a/ocaml/xapi/create_misc.ml b/ocaml/xapi/create_misc.ml index cd0a97b4115..cd3412156cd 100644 --- a/ocaml/xapi/create_misc.ml +++ b/ocaml/xapi/create_misc.ml @@ -579,16 +579,16 @@ let create_host_cpu ~__context host_info = ; ("model", cpu_info.model) ; ("stepping", cpu_info.stepping) ; ("flags", cpu_info.flags) - ; ( Xapi_globs.cpu_info_features_pv_key + ; ( Constants.cpu_info_features_pv_key , CPU_policy.to_string cpu_info.features_pv ) - ; ( Xapi_globs.cpu_info_features_hvm_key + ; ( Constants.cpu_info_features_hvm_key , CPU_policy.to_string cpu_info.features_hvm ) - ; ( Xapi_globs.cpu_info_features_hvm_host_key + ; ( Constants.cpu_info_features_hvm_host_key , CPU_policy.to_string cpu_info.features_hvm_host ) - ; ( Xapi_globs.cpu_info_features_pv_host_key + ; ( Constants.cpu_info_features_pv_host_key , CPU_policy.to_string cpu_info.features_pv_host ) ] @@ -698,8 +698,8 @@ let create_pool_cpuinfo ~__context = ("vendor", "") ; ("socket_count", "0") ; ("cpu_count", "0") - ; (Xapi_globs.cpu_info_features_pv_host_key, "") - ; (Xapi_globs.cpu_info_features_hvm_host_key, "") + ; (Constants.cpu_info_features_pv_host_key, "") + ; (Constants.cpu_info_features_hvm_host_key, "") ] in let pool_cpuinfo = List.fold_left merge zero all_host_cpus in diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index 22908a496b1..e3957deea71 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -631,20 +631,6 @@ let auth_type_PAM = "PAM" let event_hook_auth_on_xapi_initialize_succeeded = ref false -(** {2 CPUID feature masking} *) - -let cpu_info_vendor_key = "vendor" - -let cpu_info_features_key = "features" - -let cpu_info_features_pv_key = "features_pv" - -let cpu_info_features_hvm_key = "features_hvm" - -let cpu_info_features_pv_host_key = "features_pv_host" - -let cpu_info_features_hvm_host_key = "features_hvm_host" - (** Metrics *) let metrics_root = "/dev/shm/metrics" diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index 2f0add74368..e9f5174f90b 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -1213,7 +1213,7 @@ module MD = struct if not (List.mem_assoc Vm_platform.featureset platformdata) then let featureset = match - List.assoc_opt Xapi_globs.cpu_info_features_key + List.assoc_opt Constants.cpu_info_features_key vm.API.vM_last_boot_CPU_flags with | _ when vm.API.vM_power_state <> `Suspended -> @@ -2418,12 +2418,12 @@ let update_vm ~__context id = state.Vm.featureset ; let vendor = Db.Host.get_cpu_info ~__context ~self:localhost - |> List.assoc Xapi_globs.cpu_info_vendor_key + |> List.assoc Constants.cpu_info_vendor_key in let value = [ - (Xapi_globs.cpu_info_vendor_key, vendor) - ; (Xapi_globs.cpu_info_features_key, state.Vm.featureset) + (Constants.cpu_info_vendor_key, vendor) + ; (Constants.cpu_info_features_key, state.Vm.featureset) ] in Db.VM.set_last_boot_CPU_flags ~__context ~self ~value From c23f1a3d702dad90d15062cccac7f5b8f0aac6a3 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Mon, 9 Jun 2025 10:27:30 +0100 Subject: [PATCH 005/111] xapi-cli-server: Fix host-get-cpu-features and add pool-get-cpu-features Host.cpu_info list no longer contains a value associated with a "features" key, but the CLI implementation was hardcoded to expect it. Instead use the cpu_info_features keys from xapi-consts. Add the pool version of the command. Additionally document their output format. 
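A minimal, self-contained sketch of the filtering approach the new CLI code takes (cpu_info_features_of in the diff below): only the feature keys actually present in cpu_info are returned, so the printed table simply omits featuresets a host does not expose. The sample key/value strings here are invented purely for illustration.

```ocaml
(* Sketch of the key-filtering helper: keep only the feature keys present in
   cpu_info, preserving the requested key order. Sample values are made up. *)
let cpu_info_features_of feature_keys cpu_info =
  List.filter_map
    (fun key ->
      Option.map (fun features -> (key, features)) (List.assoc_opt key cpu_info)
    )
    feature_keys

let () =
  let cpu_info =
    [("vendor", "GenuineIntel"); ("features_hvm", "17cbfbff-f7fa3223")]
  in
  cpu_info_features_of ["features_pv"; "features_hvm"] cpu_info
  |> List.iter (fun (key, features) -> Printf.printf "%s: %s\n" key features)
(* prints only "features_hvm: 17cbfbff-f7fa3223"; "features_pv" is omitted *)
```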
Signed-off-by: Andrii Sultanov --- ocaml/xapi-cli-server/cli_frontend.ml | 18 +++++++++++-- ocaml/xapi-cli-server/cli_operations.ml | 35 +++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/ocaml/xapi-cli-server/cli_frontend.ml b/ocaml/xapi-cli-server/cli_frontend.ml index 57861e95001..34d107ef87a 100644 --- a/ocaml/xapi-cli-server/cli_frontend.ml +++ b/ocaml/xapi-cli-server/cli_frontend.ml @@ -535,6 +535,18 @@ let rec cmdtable_data : (string * cmd_spec) list = ; flags= [] } ) + ; ( "pool-get-cpu-features" + , { + reqd= [] + ; optn= [] + ; help= + {|Prints a hexadecimal representation of the pool's physical-CPU + features for PV and HVM VMs. These are combinations of all the + hosts' policies and are used when starting new VMs in a pool.|} + ; implementation= No_fd Cli_operations.pool_get_cpu_features + ; flags= [] + } + ) ; ( "host-is-in-emergency-mode" , { reqd= [] @@ -1018,8 +1030,10 @@ let rec cmdtable_data : (string * cmd_spec) list = reqd= [] ; optn= ["uuid"] ; help= - "Prints a hexadecimal representation of the host's physical-CPU \ - features." + {|Prints a hexadecimal representation of the host's physical-CPU + features for PV and HVM VMs. features_{hvm,pv} are "maximum" + featuresets the host will accept during migrations, and + features_{hvm,pv}_host will be used to start new VMs.|} ; implementation= No_fd Cli_operations.host_get_cpu_features ; flags= [] } diff --git a/ocaml/xapi-cli-server/cli_operations.ml b/ocaml/xapi-cli-server/cli_operations.ml index 25e4c84ce79..c2e67cf3764 100644 --- a/ocaml/xapi-cli-server/cli_operations.ml +++ b/ocaml/xapi-cli-server/cli_operations.ml @@ -6799,6 +6799,28 @@ let pool_get_guest_secureboot_readiness printer rpc session_id params = (Record_util.pool_guest_secureboot_readiness_to_string result) ) +let cpu_info_features_of feature_keys cpu_info = + let ( let* ) = Option.bind in + List.filter_map + (fun key -> + let* features = List.assoc_opt key cpu_info in + Some (key, features) + ) + feature_keys + +let pool_get_cpu_features printer rpc session_id params = + let pool = get_pool_with_default rpc session_id params "uuid" in + let cpu_info = Client.Pool.get_cpu_info ~rpc ~session_id ~self:pool in + + let feature_keys = + [ + Constants.cpu_info_features_pv_host_key + ; Constants.cpu_info_features_hvm_host_key + ] + in + let features = cpu_info_features_of feature_keys cpu_info in + printer (Cli_printer.PTable [features]) + let pool_sync_bundle fd _printer rpc session_id params = let filename_opt = List.assoc_opt "filename" params in match filename_opt with @@ -6968,8 +6990,17 @@ let host_get_cpu_features printer rpc session_id params = get_host_from_session rpc session_id in let cpu_info = Client.Host.get_cpu_info ~rpc ~session_id ~self:host in - let features = List.assoc "features" cpu_info in - printer (Cli_printer.PMsg features) + + let feature_keys = + [ + Constants.cpu_info_features_pv_key + ; Constants.cpu_info_features_hvm_key + ; Constants.cpu_info_features_pv_host_key + ; Constants.cpu_info_features_hvm_host_key + ] + in + let features = cpu_info_features_of feature_keys cpu_info in + printer (Cli_printer.PTable [features]) let host_enable_display printer rpc session_id params = let host = From 11ab8a681bf9aa0a959e297f3c4b07c2849a72b6 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Tue, 10 Jun 2025 16:47:53 +0100 Subject: [PATCH 006/111] unixext: Add a raise_with_preserved_backtrace function Signed-off-by: Andrii Sultanov --- ocaml/libs/xapi-stdext/lib/xapi-stdext-unix/unixext.ml | 5 +++++ 
ocaml/libs/xapi-stdext/lib/xapi-stdext-unix/unixext.mli | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-unix/unixext.ml b/ocaml/libs/xapi-stdext/lib/xapi-stdext-unix/unixext.ml index 32a9f5119ab..893a7e4d9bc 100644 --- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-unix/unixext.ml +++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-unix/unixext.ml @@ -17,6 +17,11 @@ exception Unix_error of int let _exit = Unix._exit +let raise_with_preserved_backtrace exn f = + let bt = Printexc.get_raw_backtrace () in + f () ; + Printexc.raise_with_backtrace exn bt + (** remove a file, but doesn't raise an exception if the file is already removed *) let unlink_safe file = try Unix.unlink file with (* Unix.Unix_error (Unix.ENOENT, _ , _)*) _ -> () diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-unix/unixext.mli b/ocaml/libs/xapi-stdext/lib/xapi-stdext-unix/unixext.mli index 047935b475c..3db652bd2a3 100644 --- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-unix/unixext.mli +++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-unix/unixext.mli @@ -15,6 +15,10 @@ val _exit : int -> unit +val raise_with_preserved_backtrace : exn -> (unit -> unit) -> 'b +(** A wrapper that preserves the backtrace (otherwise erased by calling + formatting functions, for example) *) + val unlink_safe : string -> unit val mkdir_safe : string -> Unix.file_perm -> unit From 929eefd11f6354056e4e7cd2c37d46675444e62d Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Tue, 10 Jun 2025 11:36:36 +0100 Subject: [PATCH 007/111] xapi_vgpu_type: Don't pollute the logs with non-critical errors on every toolstack restart on hosts without Nvidia GPUs, xapi complains about a non-existent directory: xapi: [error||0 |dbsync (update_env) R:733fc2551767|xapi_vgpu_type] Failed to create NVidia compat config_file: Sys_error("/usr/share/nvidia/vgx: No such file or directory") Handle the directory's absence without propagating the error. Signed-off-by: Andrii Sultanov --- ocaml/xapi/xapi_vgpu_type.ml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi/xapi_vgpu_type.ml b/ocaml/xapi/xapi_vgpu_type.ml index f7d5e1eb408..aae64cef195 100644 --- a/ocaml/xapi/xapi_vgpu_type.ml +++ b/ocaml/xapi/xapi_vgpu_type.ml @@ -1033,7 +1033,9 @@ module Nvidia_compat = struct read_configs ac tl ) in - let conf_files = Array.to_list (Sys.readdir conf_dir) in + let conf_files = + try Array.to_list (Sys.readdir conf_dir) with Sys_error _ -> [] + in debug "Reading NVIDIA vGPU config files %s/{%s}" conf_dir (String.concat ", " conf_files) ; read_configs [] From a4b992c586c0623170336936e555b028cca7234b Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Tue, 10 Jun 2025 14:26:22 +0100 Subject: [PATCH 008/111] networkd: Add ENOENT to the list of expected errors in Sysfs.read_one_line Otherwise this quite frequently logs something like: ``` xcp-networkd: [error||22 |dbsync (update_env)|network_utils] Error in read one line of file: /sys/class/net/eth0/device/sriov_totalvfs, exception Unix.Unix_error(Unix.ENOENT, "open", "/sys/class/net/eth0/device/sriov_totalvfs") Backtrace ... 
``` Signed-off-by: Andrii Sultanov --- ocaml/networkd/lib/network_utils.ml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocaml/networkd/lib/network_utils.ml b/ocaml/networkd/lib/network_utils.ml index 1c8c8cd1a27..846c517c82e 100644 --- a/ocaml/networkd/lib/network_utils.ml +++ b/ocaml/networkd/lib/network_utils.ml @@ -162,7 +162,8 @@ module Sysfs = struct with | End_of_file -> "" - | Unix.Unix_error (Unix.EINVAL, _, _) -> + | Unix.Unix_error (Unix.EINVAL, _, _) | Unix.Unix_error (Unix.ENOENT, _, _) + -> (* The device is not yet up *) raise (Network_error (Read_error file)) | exn -> From 3f0e977e94449d9dd1aa4939ae5ab22549c99c6b Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Tue, 10 Jun 2025 15:55:36 +0100 Subject: [PATCH 009/111] xenguestHelper: Don't dump errors on End_of_file non_debug_receive will dump an error after reading the last bits of the header, which is expected and handled by the caller appropriately: ``` xenopsd-xc: [error||67 |Async.VM.resume R:beac7be348f1|xenguesthelper] Memory F 6019464 KiB S 0 KiB T 8183 MiB <--- dumping error xenopsd-xc: [debug||67 |Async.VM.resume R:beac7be348f1|mig64] Finished emu-manager result processing <---- End_of_file expected and handled ``` Don't pollute the logs and instead just log the same info with 'debug' when the error is End_of_file. Signed-off-by: Andrii Sultanov --- ocaml/xenopsd/xc/xenguestHelper.ml | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/ocaml/xenopsd/xc/xenguestHelper.ml b/ocaml/xenopsd/xc/xenguestHelper.ml index b76fec51c25..06a28d92f33 100644 --- a/ocaml/xenopsd/xc/xenguestHelper.ml +++ b/ocaml/xenopsd/xc/xenguestHelper.ml @@ -200,13 +200,14 @@ let rec non_debug_receive ?(debug_callback = fun s -> debug "%s" s) cnx = (* Dump memory statistics on failure *) let non_debug_receive ?debug_callback cnx = - let debug_memory () = + let debug_memory log_type = Xenctrl.with_intf (fun xc -> let open Memory in let open Int64 in let open Xenctrl in let p = Xenctrl.physinfo xc in - error "Memory F %Ld KiB S %Ld KiB T %Ld MiB" + (match log_type with Syslog.Debug -> debug | _ -> error) + "Memory F %Ld KiB S %Ld KiB T %Ld MiB" (p.free_pages |> of_nativeint |> kib_of_pages) (p.scrub_pages |> of_nativeint |> kib_of_pages) (p.total_pages |> of_nativeint |> mib_of_pages_free) @@ -215,10 +216,18 @@ let non_debug_receive ?debug_callback cnx = try match non_debug_receive ?debug_callback cnx with | Error y as x -> - error "Received: %s" y ; debug_memory () ; x + error "Received: %s" y ; debug_memory Syslog.Err ; x | x -> x - with e -> debug_memory () ; raise e + with + | End_of_file as e -> + Unixext.raise_with_preserved_backtrace e (fun () -> + debug_memory Syslog.Debug + ) + | e -> + Unixext.raise_with_preserved_backtrace e (fun () -> + debug_memory Syslog.Err + ) (** For the simple case where we just want the successful result, return it. If we get an error message (or suspend) then throw an exception. 
*) From 3b7278f85df7fdd8a566ae65fce9c01c294fe876 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Wed, 11 Jun 2025 14:59:36 +0100 Subject: [PATCH 010/111] xe-cli completion: Remove "" from completion suggestions While one could potentially filter for this "value", I don't think it's that useful and adds noise to the completions, like here: ``` $ xe vm-list resident-on= 64c11cad-2c52-4dea-aea6-5fae0e720699 \ 7f566729-0ee7-47c4-853d-2c5f3a195ad4 ``` Signed-off-by: Andrii Sultanov --- ocaml/xe-cli/bash-completion | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ocaml/xe-cli/bash-completion b/ocaml/xe-cli/bash-completion index 8120df874f3..5a1c535ad7f 100644 --- a/ocaml/xe-cli/bash-completion +++ b/ocaml/xe-cli/bash-completion @@ -755,6 +755,10 @@ __add_completion() local description_cmd="$2" local max_cmd_length="$3" + if [ "$word" = "" ]; then + return 0 + fi + COMPLETION_SUGGESTIONS=$((COMPLETION_SUGGESTIONS+1)) __xe_debug "\t$word" From 4e5012f4cb74947910abbcac016c21df380c367f Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Wed, 11 Jun 2025 15:01:22 +0100 Subject: [PATCH 011/111] xe-cli completion: Fix debug logging log needs to be moved one line below the first assignment into the "description" variable, otherwise it's always going to be an empty string Signed-off-by: Andrii Sultanov --- ocaml/xe-cli/bash-completion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xe-cli/bash-completion b/ocaml/xe-cli/bash-completion index 5a1c535ad7f..b65c81d4b58 100644 --- a/ocaml/xe-cli/bash-completion +++ b/ocaml/xe-cli/bash-completion @@ -772,8 +772,8 @@ __add_completion() COMPREPLY+=( $(printf '%s%q' "$description" "$word") ) else if [[ $SHOW_DESCRIPTION == 1 ]]; then - __xe_debug "\t showing command description - '$description'" description=" - $(eval $description_cmd$word)" + __xe_debug "\t showing command description - '$description'" fi # Right-pad the command with spaces before the help string COMPREPLY+=( $(printf "%-${max_cmd_length}q %s" "$word" "$description") ) From 0ba2168675f2788250c502e97873657b78f16209 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Wed, 11 Jun 2025 15:02:18 +0100 Subject: [PATCH 012/111] xe-cli completion: Eliminate duplicate suggestions early on We used to rely on Bash's completion removing duplicate entries from the suggestions, but processing them in the first place is unnecessary (and will slow down completion since there's usually an 'xe' command run for each entry in the wordlist). 
Signed-off-by: Andrii Sultanov --- ocaml/xe-cli/bash-completion | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocaml/xe-cli/bash-completion b/ocaml/xe-cli/bash-completion index b65c81d4b58..2f0c09776e2 100644 --- a/ocaml/xe-cli/bash-completion +++ b/ocaml/xe-cli/bash-completion @@ -784,7 +784,8 @@ __preprocess_suggestions() { wordlist=$( echo "$1" | \ sed -re 's/(^|[^\])((\\\\)*),,*/\1\2\n/g' -e 's/\\,/,/g' -e 's/\\\\/\\/g' | \ - sed -e 's/ *$//') + sed -e 's/ *$//' | \ + sort -u ) local IFS=$'\n' for word in $wordlist; do if [[ "$word" =~ ^$prefix.* ]]; then From 32a772a28e675e93dae11d4c959c01d46b51e7d9 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Wed, 11 Jun 2025 15:05:02 +0100 Subject: [PATCH 013/111] xe-cli completion: Improve completion for 'resident-on=' and 'affinity=' Provide a helpful description for some parameters of 'xe vm-list', compare before: ``` $ xe vm-list resident-on= 64c11cad-2c52-4dea-aea6-5fae0e720699 7f566729-0ee7-47c4-853d-2c5f3a195ad4 ``` with after: ``` $ xe vm-list resident-on= 64c11cad-2c52-4dea-aea6-5fae0e720699 - hpmc30 7f566729-0ee7-47c4-853d-2c5f3a195ad4 - hpmc29 ``` Signed-off-by: Andrii Sultanov --- ocaml/xe-cli/bash-completion | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/ocaml/xe-cli/bash-completion b/ocaml/xe-cli/bash-completion index 2f0c09776e2..f7be3e3fbce 100644 --- a/ocaml/xe-cli/bash-completion +++ b/ocaml/xe-cli/bash-completion @@ -588,7 +588,21 @@ _xe() __xe_debug "fst is '$fst', snd is '$snd'" if [[ "$snd" == "list" || "$fst" == "vm" ]]; then IFS=$'\n,' - set_completions_for_names "${fst}-list" "$param" "$value" + + # Try to provide a helpful "description" to the suggestions + case "$param" in + resident-on | affinity) + SHOW_DESCRIPTION=1 + class="host" + ;; + *) + ;; + esac + + local name_label_cmd="$xe ${class}-list params=name-label 2>/dev/null --minimal uuid=" + __xe_debug "description class is '$class'" + + set_completions_for_names "${fst}-list" "$param" "$value" "$name_label_cmd" return 0 fi fi From 2a8e55b6c2c36a82ba8a2975a0da06c68aa2be84 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Wed, 11 Jun 2025 15:07:45 +0100 Subject: [PATCH 014/111] xe-cli completion: Handle suggestions for 'suspend-{VDI,SR}-uuid' No completion was provided before, and it's handled properly now: ``` $ xe vm-list suspend-SR-uuid= 08906228-cbf6-dad4-720d-e581df11510a - SR1 37b734f0-e594-0e48-2114-cd063241dd36 - SR2 ``` Signed-off-by: Andrii Sultanov --- ocaml/xe-cli/bash-completion | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/ocaml/xe-cli/bash-completion b/ocaml/xe-cli/bash-completion index f7be3e3fbce..4126708f91b 100644 --- a/ocaml/xe-cli/bash-completion +++ b/ocaml/xe-cli/bash-completion @@ -566,11 +566,18 @@ _xe() else all="--all" fi - if [[ "$fst" == "into-vdi" || "$fst" == "base-vdi" || "$fst" == "vdi-from" || "$fst" == "vdi-to" ]]; then + + case "$fst" in + into-vdi | base-vdi | vdi-from | vdi-to | suspend-VDI) class=vdi - else + ;; + suspend-SR) + class=sr + ;; + *) class="$fst" - fi + ;; + esac # Show corresponding name labels for each UUID SHOW_DESCRIPTION=1 From ff8f112473832e8e4c74202b226e507b977d669b Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Wed, 11 Jun 2025 08:57:53 +0100 Subject: [PATCH 015/111] xapi/helpers: Note that get_localhost can fail while the database is starting up Otherwise, errors like this can be a little bit confusing in the logs: ``` [dispatch:session.login_with_password |backtrace] Raised 
Db_exn.Read_missing_uuid("host", "", "236acc01-0f95-4af1-8b35-f5a2fb51c354") ``` Signed-off-by: Andrii Sultanov --- ocaml/xapi/helpers.ml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ocaml/xapi/helpers.ml b/ocaml/xapi/helpers.ml index 75199a62fa9..318ddfecf8d 100644 --- a/ocaml/xapi/helpers.ml +++ b/ocaml/xapi/helpers.ml @@ -171,8 +171,13 @@ let get_localhost ~__context = match localhost_ref = Ref.null with | false -> localhost_ref - | true -> - get_localhost_uncached ~__context + | true -> ( + try get_localhost_uncached ~__context + with Db_exn.Read_missing_uuid (_, _, _) as e -> + Unixext.raise_with_preserved_backtrace e (fun () -> + warn "The database has not fully come up yet, so localhost is missing" + ) + ) (* Determine the gateway and DNS PIFs: * If one of the PIFs with IP has other_config:defaultroute=true, then From e544d857d872960efe45b25a4b225791873be7d7 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Wed, 11 Jun 2025 08:59:23 +0100 Subject: [PATCH 016/111] xapi_host: missing UEFI certificates warrant a warning, not an error It is a XenServer-specific expectation that these certificates should always be present on the host (they are not provided on XCP-ng by default, for example, due to licensing restrictions). The error log is not followed by any exception, and the missing UEFI certificates do not interrupt any operation, they just mean the host is set up differently (which can be verified by the clients with appropriate API calls like pool-get-guest-secureboot-readiness): ``` xapi: [error||Sync UEFI certificates on host with XAPI db |xapi_host] check_valid_uefi_certs: missing KEK.auth in /var/lib/varstored xapi: [error||Sync UEFI certificates on host with XAPI db |xapi_host] check_valid_uefi_certs: missing db.auth in /var/lib/varstored ``` These warrant a warning instead of an error log. Signed-off-by: Andrii Sultanov --- ocaml/xapi/xapi_host.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xapi/xapi_host.ml b/ocaml/xapi/xapi_host.ml index 405733baa78..92297c2251f 100644 --- a/ocaml/xapi/xapi_host.ml +++ b/ocaml/xapi/xapi_host.ml @@ -2769,7 +2769,7 @@ let write_uefi_certificates_to_disk ~__context ~host = ["KEK.auth"; "db.auth"] |> List.iter (fun cert -> let log_of found = - (if found then info else error) + (if found then info else warn) "check_valid_uefi_certs: %s %s in %s" (if found then "found" else "missing") cert path From 6fe7c9e31d9866d85fe3b1368f80551195d13059 Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Wed, 11 Jun 2025 06:49:45 +0000 Subject: [PATCH 017/111] CA-412164: XSI-1901: uid-info does not support `:` in gecos from https://en.wikipedia.org/wiki/Passwd#Password_file uid_info as following format username:password:uid:gid:gecos:homedir:shell Regarding gecos, it is recommended as follows Typically, this is a set of comma-separated values including the user's full name and contact details. However, this information comes form AD and user may mis-configure it with `:`, which is used as seperator. In such case, the parse would failed. Enhance the parse function to support `:` in gecos, other fields does not likely contain it. 
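A minimal, self-contained sketch of the parsing idea described above (the patch's parse_uid_info below does the same inside a record): split on ':', anchor on the fixed fields at both ends of the passwd-style record, and re-join whatever is left in the middle as the gecos. The sample record is taken from the new unit test.

```ocaml
(* The first four fields (user:passwd:uid:gid) and the last two
   (homedir:shell) are fixed, so any extra ':' must belong to the gecos;
   rebuild it by re-joining the middle fields. *)
let gecos_of_uid_info line =
  match String.split_on_char ':' line with
  | _user :: _passwd :: _uid :: _gid :: rest -> (
    match List.rev rest with
    | _shell :: _homedir :: gecos_rev ->
        Some (String.concat ":" (List.rev gecos_rev))
    | _ ->
        None
  )
  | _ ->
      None

let () =
  match
    gecos_of_uid_info
      {|HVS\udaadmin:*:3000000:3000000:ADMIN: Dalsem, Ulric:/home/HVS/udaadmin:/bin/bash|}
  with
  | Some gecos ->
      print_endline gecos (* "ADMIN: Dalsem, Ulric" *)
  | None ->
      print_endline "malformed uid_info line"
```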
Signed-off-by: Lin Liu --- ocaml/tests/test_extauth_plugin_ADwinbind.ml | 21 +++++++++++++++ ocaml/xapi/extauth_plugin_ADwinbind.ml | 27 +++++++++++++++++--- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/ocaml/tests/test_extauth_plugin_ADwinbind.ml b/ocaml/tests/test_extauth_plugin_ADwinbind.ml index 5fe5bfc91cd..3c6d44daaa9 100644 --- a/ocaml/tests/test_extauth_plugin_ADwinbind.ml +++ b/ocaml/tests/test_extauth_plugin_ADwinbind.ml @@ -219,6 +219,27 @@ let test_parse_wbinfo_uid_info = ; gecos= {|ladmin|} } ) + (* XSI-1901: output of customer environment, has `:` in the gecos, + other fields does not likely contain it *) + ; ( {|HVS\udaadmin:*:3000000:3000000:ADMIN: Dalsem, Ulric:/home/HVS/udaadmin:/bin/bash|} + , Ok + { + user_name= {|HVS\udaadmin|} + ; uid= 3000000 + ; gid= 3000000 + ; gecos= {|ADMIN: Dalsem, Ulric|} + } + ) + (* Multiple `:` in gecos *) + ; ( {|HVS\udaadmin:*:3000000:3000000:ADMIN: Dalsem, Ulric, POOL OP: udaadmin:/home/HVS/udaadmin:/bin/bash|} + , Ok + { + user_name= {|HVS\udaadmin|} + ; uid= 3000000 + ; gid= 3000000 + ; gecos= {|ADMIN: Dalsem, Ulric, POOL OP: udaadmin|} + } + ) ; ( {|CONNAPP\locked:*:3000004:3000174::/home/CONNAPP/locked:/bin/bash|} , Ok {user_name= {|CONNAPP\locked|}; uid= 3000004; gid= 3000174; gecos= ""} diff --git a/ocaml/xapi/extauth_plugin_ADwinbind.ml b/ocaml/xapi/extauth_plugin_ADwinbind.ml index f23f1f5447e..9213b2dea09 100644 --- a/ocaml/xapi/extauth_plugin_ADwinbind.ml +++ b/ocaml/xapi/extauth_plugin_ADwinbind.ml @@ -686,11 +686,30 @@ module Wbinfo = struct let parse_uid_info stdout = (* looks like one line from /etc/passwd: https://en.wikipedia.org/wiki/Passwd#Password_file *) match String.split_on_char ':' stdout with - | [user_name; _passwd; uid; gid; gecos; _homedir; _shell] -> ( - try Ok {user_name; uid= int_of_string uid; gid= int_of_string gid; gecos} - with _ -> Error () - ) + | user_name :: _passwd :: uid :: gid :: rest -> ( + (* We expect at least homedir and shell at the end *) + let rest = List.rev rest in + match rest with + | _shell :: _homedir :: tail -> ( + (* Rev it back to original order *) + let tail = List.rev tail in + let gecos = String.concat ":" tail in + try + Ok + { + user_name + ; uid= int_of_string uid + ; gid= int_of_string gid + ; gecos + } + with _ -> Error () + ) + | _ -> + debug "%s uid_info format error: %s" __FUNCTION__ stdout ; + Error () + ) | _ -> + debug "%s uid_info format error: %s" __FUNCTION__ stdout ; Error () let uid_info_of_uid (uid : int) = From 3d755835e90afa7c26df63a8d19a7d61bd40226c Mon Sep 17 00:00:00 2001 From: Gabriel Buica Date: Wed, 4 Jun 2025 14:09:27 +0100 Subject: [PATCH 018/111] CP-47063: Instrument xenops vm non-atomic functions. Instruments: - `VM.add`, - `VM.stat`, - `VM.exists`, - `VM.list`. Signed-off-by: Gabriel Buica --- ocaml/xapi-idl/lib/debug_info.ml | 2 +- ocaml/xapi-idl/lib/debug_info.mli | 2 +- ocaml/xenopsd/lib/xenops_server.ml | 16 ++++++++++++---- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/ocaml/xapi-idl/lib/debug_info.ml b/ocaml/xapi-idl/lib/debug_info.ml index 5483d6bc451..edf3c4979a8 100644 --- a/ocaml/xapi-idl/lib/debug_info.ml +++ b/ocaml/xapi-idl/lib/debug_info.ml @@ -76,7 +76,7 @@ let to_log_string t = t.log (* Sets the logging context based on `dbg`. Also adds a new tracing span, linked to the parent span from `dbg`, if available. 
*) -let with_dbg ?(with_thread = false) ~module_name ~name ~dbg f = +let with_dbg ?(with_thread = false) ?(module_name = "") ~name ~dbg f = let di = of_string dbg in let f_with_trace () = let name = diff --git a/ocaml/xapi-idl/lib/debug_info.mli b/ocaml/xapi-idl/lib/debug_info.mli index fa2f6ff5d6a..9db63471035 100644 --- a/ocaml/xapi-idl/lib/debug_info.mli +++ b/ocaml/xapi-idl/lib/debug_info.mli @@ -24,7 +24,7 @@ val to_log_string : t -> string val with_dbg : ?with_thread:bool - -> module_name:string + -> ?module_name:string -> name:string -> dbg:string -> (t -> 'a) diff --git a/ocaml/xenopsd/lib/xenops_server.ml b/ocaml/xenopsd/lib/xenops_server.ml index 9b109c1c980..8fe027630fe 100644 --- a/ocaml/xenopsd/lib/xenops_server.ml +++ b/ocaml/xenopsd/lib/xenops_server.ml @@ -3682,7 +3682,9 @@ end module VM = struct module DB = VM_DB - let add _ dbg x = Debug.with_thread_associated dbg (fun () -> DB.add' x) () + let add _ dbg x = + Debug_info.with_dbg ~with_thread:true ~name:__FUNCTION__ ~dbg @@ fun _ -> + DB.add' x let rename _ dbg id1 id2 when' = queue_operation dbg id1 (Atomic (VM_rename (id1, id2, when'))) @@ -3719,11 +3721,17 @@ module VM = struct in (vm_t, state) - let stat _ dbg id = Debug.with_thread_associated dbg (fun () -> stat' id) () + let stat _ dbg id = + Debug_info.with_dbg ~with_thread:true ~name:__FUNCTION__ ~dbg @@ fun _ -> + stat' id - let exists _ _dbg id = match DB.read id with Some _ -> true | None -> false + let exists _ dbg id = + Debug_info.with_dbg ~name:__FUNCTION__ ~dbg @@ fun _ -> + match DB.read id with Some _ -> true | None -> false - let list _ dbg () = Debug.with_thread_associated dbg (fun () -> DB.list ()) () + let list _ dbg () = + Debug_info.with_dbg ~with_thread:true ~name:__FUNCTION__ ~dbg @@ fun _ -> + DB.list () let create _ dbg id = let no_sharept = false in From 5138375bff23315e0b5a62cce17d82771e6648a8 Mon Sep 17 00:00:00 2001 From: Gabriel Buica Date: Wed, 4 Jun 2025 15:00:56 +0100 Subject: [PATCH 019/111] CP-47063: Instrument `switch_rpc` and message-switch client Instruments `switch_rpc` according to OpenTelemetry standard on instrumenting rpc calls. - `server.address` is the name of the message queue. Intruments sending the message on a queue according to OpenTelemetry standard on instrumenting messaging. - `destination` is the name of the message queue. `Tracing.with_tracing` now accepts an optional argument to set the Span Kind. Signed-off-by: Gabriel Buica --- ocaml/libs/tracing/tracing.ml | 8 +++-- ocaml/libs/tracing/tracing.mli | 39 +++++++++++----------- ocaml/libs/tracing/tracing_export.ml | 8 ++--- ocaml/message-switch/core/dune | 1 + ocaml/message-switch/core/make.ml | 2 +- ocaml/message-switch/core/s.ml | 3 +- ocaml/message-switch/unix/dune | 1 + ocaml/message-switch/unix/protocol_unix.ml | 22 +++++++++--- ocaml/xapi-idl/lib/xcp_client.ml | 29 ++++++++++++++-- 9 files changed, 80 insertions(+), 33 deletions(-) diff --git a/ocaml/libs/tracing/tracing.ml b/ocaml/libs/tracing/tracing.ml index c1cdc33692e..d320fd6061b 100644 --- a/ocaml/libs/tracing/tracing.ml +++ b/ocaml/libs/tracing/tracing.ml @@ -792,10 +792,14 @@ end let enable_span_garbage_collector ?(timeout = 86400.) 
() = Spans.GC.initialise_thread ~timeout -let with_tracing ?(attributes = []) ?(parent = None) ?trace_context ~name f = +let with_tracing ?(attributes = []) ?(parent = None) ?span_kind ?trace_context + ~name f = let tracer = Tracer.get_tracer ~name in if tracer.enabled then ( - match Tracer.start ~tracer ?trace_context ~attributes ~name ~parent () with + match + Tracer.start ?span_kind ~tracer ?trace_context ~attributes ~name ~parent + () + with | Ok span -> ( try let result = f span in diff --git a/ocaml/libs/tracing/tracing.mli b/ocaml/libs/tracing/tracing.mli index 262acb52f27..8323346a443 100644 --- a/ocaml/libs/tracing/tracing.mli +++ b/ocaml/libs/tracing/tracing.mli @@ -190,12 +190,12 @@ module Tracer : sig -> (Span.t option, exn) result val update_span_with_parent : Span.t -> Span.t option -> Span.t option - (**[update_span_with_parent s p] returns [Some span] where [span] is an + (**[update_span_with_parent s p] returns [Some span] where [span] is an updated verison of the span [s]. - If [p] is [Some parent], [span] is a child of [parent], otherwise it is the + If [p] is [Some parent], [span] is a child of [parent], otherwise it is the original [s]. - - If the span [s] is finished or is no longer considered an on-going span, + + If the span [s] is finished or is no longer considered an on-going span, returns [None]. *) @@ -209,7 +209,7 @@ module Tracer : sig val finished_span_hashtbl_is_empty : unit -> bool end -(** [TracerProvider] module provides ways to intereact with the tracer providers. +(** [TracerProvider] module provides ways to intereact with the tracer providers. *) module TracerProvider : sig (** Type that represents a tracer provider.*) @@ -222,7 +222,7 @@ module TracerProvider : sig -> name_label:string -> uuid:string -> unit - (** [create ~enabled ~attributes ~endpoints ~name_label ~uuid] initializes a + (** [create ~enabled ~attributes ~endpoints ~name_label ~uuid] initializes a tracer provider based on the following parameters: [enabled], [attributes], [endpoints], [name_label], and [uuid]. *) @@ -234,17 +234,17 @@ module TracerProvider : sig -> unit -> unit (** [set ?enabled ?attributes ?endpoints ~uuid ()] updates the tracer provider - identified by the given [uuid] with the new configuration paremeters: - [enabled], [attributes], and [endpoints]. - + identified by the given [uuid] with the new configuration paremeters: + [enabled], [attributes], and [endpoints]. + If any of the configuration parameters are missing, the old ones are kept. - + Raises [Failure] if there are no tracer provider with the given [uuid]. *) val destroy : uuid:string -> unit - (** [destroy ~uuid] destroys the tracer provider with the given [uuid]. + (** [destroy ~uuid] destroys the tracer provider with the given [uuid]. If there are no tracer provider with the given [uuid], it does nothing. *) @@ -269,6 +269,7 @@ val enable_span_garbage_collector : ?timeout:float -> unit -> unit val with_tracing : ?attributes:(string * string) list -> ?parent:Span.t option + -> ?span_kind:SpanKind.t -> ?trace_context:TraceContext.t -> name:string -> (Span.t option -> 'a) @@ -288,24 +289,24 @@ val get_observe : unit -> bool val validate_attribute : string * string -> bool -(** [EnvHelpers] module is a helper module for the tracing library to easily - transition back and forth between a string list of environment variables to - a traceparent. 
+(** [EnvHelpers] module is a helper module for the tracing library to easily + transition back and forth between a string list of environment variables to + a traceparent. *) module EnvHelpers : sig val traceparent_key : string (** [traceparent_key] is a constant the represents the key of the traceparent - environment variable. + environment variable. *) val of_traceparent : string option -> string list (** [of_traceparent traceparent_opt] returns a singleton list consisting of a - envirentment variable with the key [traceparent_key] and value [v] if + envirentment variable with the key [traceparent_key] and value [v] if [traceparent_opt] is [Some v]. Otherwise, returns an empty list. *) val to_traceparent : string list -> string option - (** [to_traceparent env_var_lst] returns [Some v] where v is the value of the - environmental variable coresponding to the key [traceparent_key] from a + (** [to_traceparent env_var_lst] returns [Some v] where v is the value of the + environmental variable coresponding to the key [traceparent_key] from a string list of environmental variables [env_var_lst]. If there is no such evironmental variable in the list, it returns [None]. *) @@ -314,7 +315,7 @@ module EnvHelpers : sig (** [of_span span] returns a singleton list consisting of a envirentment variable with the key [traceparent_key] and value [v], where [v] is traceparent representation of span [s] (if [span] is [Some s]). - + If [span] is [None], it returns an empty list. *) end diff --git a/ocaml/libs/tracing/tracing_export.ml b/ocaml/libs/tracing/tracing_export.ml index 5844d389e1c..c4cabb3c576 100644 --- a/ocaml/libs/tracing/tracing_export.ml +++ b/ocaml/libs/tracing/tracing_export.ml @@ -278,8 +278,8 @@ module Destination = struct ] in let@ _ = - with_tracing ~trace_context:TraceContext.empty ~parent ~attributes - ~name + with_tracing ~span_kind:Server ~trace_context:TraceContext.empty + ~parent ~attributes ~name in all_spans |> Content.Json.ZipkinV2.content_of @@ -293,8 +293,8 @@ module Destination = struct let ((_span_list, span_count) as span_info) = Spans.since () in let attributes = [("export.traces.count", string_of_int span_count)] in let@ parent = - with_tracing ~trace_context:TraceContext.empty ~parent:None ~attributes - ~name:"Tracing.flush_spans" + with_tracing ~span_kind:Server ~trace_context:TraceContext.empty + ~parent:None ~attributes ~name:"Tracing.flush_spans" in TracerProvider.get_tracer_providers () |> List.filter TracerProvider.get_enabled diff --git a/ocaml/message-switch/core/dune b/ocaml/message-switch/core/dune index d61746efe44..c9b5b3e2cff 100644 --- a/ocaml/message-switch/core/dune +++ b/ocaml/message-switch/core/dune @@ -9,6 +9,7 @@ sexplib sexplib0 threads.posix + tracing uri xapi-log xapi-stdext-threads diff --git a/ocaml/message-switch/core/make.ml b/ocaml/message-switch/core/make.ml index 43b7e301a9b..df1d003f5f5 100644 --- a/ocaml/message-switch/core/make.ml +++ b/ocaml/message-switch/core/make.ml @@ -229,7 +229,7 @@ functor in return (Ok t) - let rpc ~t ~queue ?timeout ~body:x () = + let rpc ?_span_parent ~t ~queue ?timeout ~body:x () = let ivar = M.Ivar.create () in let timer = Option.map diff --git a/ocaml/message-switch/core/s.ml b/ocaml/message-switch/core/s.ml index 423304d1b24..fefe4d7a1f6 100644 --- a/ocaml/message-switch/core/s.ml +++ b/ocaml/message-switch/core/s.ml @@ -144,7 +144,8 @@ module type CLIENT = sig (** [disconnect] closes the connection *) val rpc : - t:t + ?_span_parent:Tracing.Span.t + -> t:t -> queue:string -> ?timeout:int -> body:string 
diff --git a/ocaml/message-switch/unix/dune b/ocaml/message-switch/unix/dune index 92bddfd66fb..1858aa271b3 100644 --- a/ocaml/message-switch/unix/dune +++ b/ocaml/message-switch/unix/dune @@ -11,6 +11,7 @@ rpclib.core rpclib.json threads.posix + tracing xapi-stdext-threads xapi-stdext-unix ) diff --git a/ocaml/message-switch/unix/protocol_unix.ml b/ocaml/message-switch/unix/protocol_unix.ml index f7aa0802c0f..29b95f7ef12 100644 --- a/ocaml/message-switch/unix/protocol_unix.ml +++ b/ocaml/message-switch/unix/protocol_unix.ml @@ -347,7 +347,7 @@ module Client = struct Ok c' ) - let rpc ~t:c ~queue:dest_queue_name ?timeout ~body:x () = + let rpc ?_span_parent ~t:c ~queue:dest_queue_name ?timeout ~body:x () = let t = Ivar.create () in let timer = Option.map @@ -364,9 +364,23 @@ module Client = struct do_rpc c.requests_conn (In.CreatePersistent dest_queue_name) >>|= fun (_ : string) -> let msg = - In.Send - ( dest_queue_name - , {Message.payload= x; kind= Message.Request c.reply_queue_name} + Tracing.with_tracing + ~attributes: + [ + ("messaging.operation.name", "send") + ; ("messaging.system", "message-switch") + ; ("messaging.destination.name", dest_queue_name) + ] + ~span_kind:Producer ~parent:_span_parent + ~name:("send" ^ " " ^ dest_queue_name) + (fun _ -> + In.Send + ( dest_queue_name + , { + Message.payload= x + ; kind= Message.Request c.reply_queue_name + } + ) ) in do_rpc c.requests_conn msg >>|= fun (id : string) -> diff --git a/ocaml/xapi-idl/lib/xcp_client.ml b/ocaml/xapi-idl/lib/xcp_client.ml index 3ea0006b59c..435a63e3126 100644 --- a/ocaml/xapi-idl/lib/xcp_client.ml +++ b/ocaml/xapi-idl/lib/xcp_client.ml @@ -38,10 +38,35 @@ let switch_rpc ?timeout queue_name string_of_call response_of_string = get_ok (Message_switch_unix.Protocol_unix.Client.connect ~switch:!switch_path ()) in - fun call -> + fun (call : Rpc.call) -> + let _span_parent = + call.params + |> List.find_map (function Rpc.Dict kv_list -> Some kv_list | _ -> None) + |> Fun.flip Option.bind + (List.find_map (function + | "debug_info", Rpc.String debug_info -> + let di = debug_info |> Debug_info.of_string in + di.tracing + | _ -> + None + ) + ) + in + let rpc_service = "message_switch" in + Tracing.with_tracing + ~attributes: + [ + ("rpc.system", "ocaml-rpc") + ; ("rpc.service", rpc_service) + ; ("server.address", queue_name) + ; ("rpc.method", call.name) + ] + ~parent:_span_parent + ~name:(rpc_service ^ "/" ^ call.name) + @@ fun _span_parent -> response_of_string (get_ok - (Message_switch_unix.Protocol_unix.Client.rpc ~t ?timeout + (Message_switch_unix.Protocol_unix.Client.rpc ?_span_parent ~t ?timeout ~queue:queue_name ~body:(string_of_call call) () ) ) From 7825873c5e3b4f39032ba2011471cbe34c878f53 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Thu, 12 Jun 2025 10:42:12 +0100 Subject: [PATCH 020/111] xe-cli completion: Fix regex for checking previously provided parameters The regex removed parameters with a particular suffix instead of checking for the whole name. 
For example, after providing a uuid parameter to xe vif-move, network-uuid would no longer be suggested: ``` $ xe vif-move network-uuid= uuid= $ xe vif-move uuid=0af7619c-0798-c5be-5a0e-20813a48c7df ``` This is fixed now: ``` $ xe vif-move uuid=0af7619c-0798-c5be-5a0e-20813a48c7df $ xe vif-move uuid=0af7619c-0798-c5be-5a0e-20813a48c7df network-uuid= ``` Signed-off-by: Andrii Sultanov --- ocaml/xe-cli/bash-completion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xe-cli/bash-completion b/ocaml/xe-cli/bash-completion index 8120df874f3..589ada92e0f 100644 --- a/ocaml/xe-cli/bash-completion +++ b/ocaml/xe-cli/bash-completion @@ -638,7 +638,7 @@ _xe() local previous_params="${OLDSTYLE_WORDS[@]:2:$params_len}" previous_params=$( echo "$previous_params" | cut -d= -f1 | \ sed -r '/^\s*$/d' | cut -d: -f1 | \ - sed -re 's/^/-e "\\s*/g' -e 's/$/[=:]"/g' | paste -sd " ") + sed -re 's/^/-e "^\\s*/g' -e 's/$/[=:]"/g' | paste -sd " ") set_completions "$SUBCOMMAND_PARAMS" "$param" "" "$previous_params" From 1f83af948684a2a4c2f2605ac7ae8200d150873c Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Thu, 12 Jun 2025 11:34:13 +0100 Subject: [PATCH 021/111] xapi-cli-server: Add some of the missing parameters to cli_frontend There were several optional boolean parameters that were checked and used in cli_operations but were not included in cli_frontend (therefore would not be shown in suggestions or 'xe help command'). Add these to cli_frontend. is_unique is a ... unique case because it does not follow the style of the CLI parameters (which use dashes, not underscores, to separate words), so add 'is-unique' to cli_frontend but handle both in cli_operations. Signed-off-by: Andrii Sultanov --- ocaml/xapi-cli-server/cli_frontend.ml | 12 +++++++++--- ocaml/xapi-cli-server/cli_operations.ml | 4 +++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ocaml/xapi-cli-server/cli_frontend.ml b/ocaml/xapi-cli-server/cli_frontend.ml index 2f6d2350345..cdb749df943 100644 --- a/ocaml/xapi-cli-server/cli_frontend.ml +++ b/ocaml/xapi-cli-server/cli_frontend.ml @@ -101,6 +101,7 @@ let rec cmdtable_data : (string * cmd_spec) list = ; "sr-uuid" ; "network-uuid" ; "pool-uuid" + ; "public" ] ; help= "Create a binary blob to be associated with an API object" ; implementation= No_fd Cli_operations.blob_create @@ -816,7 +817,7 @@ let rec cmdtable_data : (string * cmd_spec) list = ; ( "host-emergency-ha-disable" , { reqd= [] - ; optn= ["force"] + ; optn= ["force"; "soft"] ; help= "Disable HA on the local host. Only to be used to recover a pool \ with a broken HA setup." @@ -1776,6 +1777,8 @@ let rec cmdtable_data : (string * cmd_spec) list = ; "host-password" ; "type" ; "remote-config" + ; "dry-run" + ; "metadata" ; "url" ; "vdi:" ] @@ -1789,7 +1792,8 @@ let rec cmdtable_data : (string * cmd_spec) list = VDIs will be imported into the Pool's default SR unless an override \ is provided. If the force option is given then any disk data \ checksum failures will be ignored. If the parameter 'url' is \ - specified, xapi will attempt to import from that URL." + specified, xapi will attempt to import from that URL. Only metadata \ + will be imported if 'metadata' is true" ; implementation= With_fd Cli_operations.vm_import ; flags= [Standard] } @@ -1803,6 +1807,7 @@ let rec cmdtable_data : (string * cmd_spec) list = ; "compress" ; "metadata" ; "excluded-device-types" + ; "include-snapshots" ] ; help= "Export a VM to ." 
; implementation= With_fd Cli_operations.vm_export @@ -2393,6 +2398,7 @@ let rec cmdtable_data : (string * cmd_spec) list = "name-description" ; "sharable" ; "read-only" + ; "managed" ; "other-config:" ; "xenstore-data:" ; "sm-config:" @@ -3831,7 +3837,7 @@ let rec cmdtable_data : (string * cmd_spec) list = ; ( "vtpm-create" , { reqd= ["vm-uuid"] - ; optn= [] + ; optn= ["is-unique"] ; help= "Create a VTPM associated with a VM." ; implementation= No_fd Cli_operations.VTPM.create ; flags= [] diff --git a/ocaml/xapi-cli-server/cli_operations.ml b/ocaml/xapi-cli-server/cli_operations.ml index fb75f559099..b098632f1c0 100644 --- a/ocaml/xapi-cli-server/cli_operations.ml +++ b/ocaml/xapi-cli-server/cli_operations.ml @@ -8118,7 +8118,9 @@ module VTPM = struct let create printer rpc session_id params = let vm_uuid = List.assoc "vm-uuid" params in let vM = Client.VM.get_by_uuid ~rpc ~session_id ~uuid:vm_uuid in - let is_unique = get_bool_param params "is_unique" in + let is_unique = + get_bool_param params "is_unique" || get_bool_param params "is-unique" + in let ref = Client.VTPM.create ~rpc ~session_id ~vM ~is_unique in let uuid = Client.VTPM.get_uuid ~rpc ~session_id ~self:ref in printer (Cli_printer.PList [uuid]) From 065b2a6a752a326ea7978fb66cc721a2fe93bc53 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Thu, 12 Jun 2025 11:37:22 +0100 Subject: [PATCH 022/111] xapi-cli-server: Remove old commented-out code This code hasn't been used for 10+ years. Signed-off-by: Andrii Sultanov --- ocaml/xapi-cli-server/cli_frontend.ml | 86 ++----------------------- ocaml/xapi-cli-server/cli_operations.ml | 55 ---------------- 2 files changed, 5 insertions(+), 136 deletions(-) diff --git a/ocaml/xapi-cli-server/cli_frontend.ml b/ocaml/xapi-cli-server/cli_frontend.ml index cdb749df943..b6be8a2023f 100644 --- a/ocaml/xapi-cli-server/cli_frontend.ml +++ b/ocaml/xapi-cli-server/cli_frontend.ml @@ -128,14 +128,7 @@ let rec cmdtable_data : (string * cmd_spec) list = ; flags= [] } ) - ; (* "host-introduce", - { - reqd=["name"; "address"; "remote-port"; "remote-username"; "remote-password"]; - optn=["description"]; - help="Introduce a remote host"; - implementation=No_fd Cli_operations.host_introduce - };*) - ( "pool-enable-binary-storage" + ; ( "pool-enable-binary-storage" , { reqd= [] ; optn= [] @@ -2768,17 +2761,7 @@ let rec cmdtable_data : (string * cmd_spec) list = ; flags= [Standard] } ) - ; (* - "diagnostic-event-deltas", - { - reqd=["class"]; - optn=[]; - help="Print the changes that are happening to all objects of class specified."; - implementation=With_fd Cli_operations.diagnostic_event_deltas; - flags=[]; - }; -*) - ( "diagnostic-license-status" + ; ( "diagnostic-license-status" , { reqd= [] ; optn= [] @@ -2998,35 +2981,7 @@ let rec cmdtable_data : (string * cmd_spec) list = ; flags= [] } ) - ; (* - "alert-create", - { - reqd=["message"]; - optn=["alert-level"]; - help="Create a new alert."; - implementation=No_fd Cli_operations.alert_create; - flags=[]; - }; - "alert-destroy", - { - reqd=["uuid"]; - optn=[]; - help="Destroy an Alert."; - implementation=No_fd Cli_operations.alert_destroy; - flags=[]; - }; -*) - (* - "host-fence", - { - reqd=["host-uuid"]; - optn=[]; - help="Fence a host"; - implementation=No_fd_local_session Cli_operations.host_fence; - flags=[]; - }; -*) - ( "pool-vlan-create" + ; ( "pool-vlan-create" , { reqd= ["pif-uuid"; "vlan"; "network-uuid"] ; optn= [] @@ -3187,28 +3142,7 @@ let rec cmdtable_data : (string * cmd_spec) list = ; flags= [Hidden; Neverforward] } ) - ; (* - 
"host-ha-query", - { - reqd=[]; - optn=[]; - help="Query the HA configuration of the local host."; - implementation=No_fd_local_session Cli_operations.host_ha_query; - flags=[Neverforward]; - }; - -*) - (* - "subject-list", - { - reqd=[]; - optn=[]; - help="Returns a list of subject names that can access the pool"; - implementation=No_fd Cli_operations.subject_list; - flags=[] - }; -*) - ( "subject-add" + ; ( "subject-add" , { reqd= ["subject-name"] ; optn= [] @@ -3254,17 +3188,7 @@ let rec cmdtable_data : (string * cmd_spec) list = ; flags= [] } ) - ; (* RBAC 2.0 only - "role-create", - { - reqd=["id";"name"]; - optn=[]; - help="Add a role to the pool"; - implementation=No_fd Cli_operations.role_create; - flags=[] - }; - *) - ( "session-subject-identifier-list" + ; ( "session-subject-identifier-list" , { reqd= [] ; optn= [] diff --git a/ocaml/xapi-cli-server/cli_operations.ml b/ocaml/xapi-cli-server/cli_operations.ml index b098632f1c0..d1f2a87a76e 100644 --- a/ocaml/xapi-cli-server/cli_operations.ml +++ b/ocaml/xapi-cli-server/cli_operations.ml @@ -7237,59 +7237,11 @@ let host_send_debug_keys _printer rpc session_id params = let keys = List.assoc "keys" params in Client.Host.send_debug_keys ~rpc ~session_id ~host ~keys -(* - let host_introduce printer rpc session_id params = - let name = List.assoc "name" params in - let descr = if List.mem_assoc "description" params then List.assoc "description" params else "" in - let address = List.assoc "address" params in - let port = List.assoc "remote-port" params in - let remote_username = List.assoc "remote-username" params in - let remote_password = List.assoc "remote-password" params in - ignore(Client.Credential.create_with_password ~rpc ~session_id name descr address (Int64.of_string port) remote_username remote_password) - *) - let task_cancel _printer rpc session_id params = let uuid = List.assoc "uuid" params in let task = Client.Task.get_by_uuid ~rpc ~session_id ~uuid in Client.Task.cancel ~rpc ~session_id ~task -(* - let alert_create printer rpc session_id params = - let string_to_alert_level s = - match s with - | "info" -> `Info - | "warning" | "warn" -> `Warn - | "error" -> `Error - | _ -> `Info - in - let message = List.assoc "message" params in - let level = if List.mem_assoc "level" params then List.assoc "level" params else "info" in - let level = string_to_alert_level level in - let alert = Client.Alert.create ~rpc ~session_id message [] level in - let uuid = Client.Alert.get_uuid ~rpc ~session_id alert in - printer (Cli_printer.PList [uuid]) - - let alert_destroy printer rpc session_id params = - let uuid = List.assoc "uuid" params in - let alert = Client.Alert.get_by_uuid ~rpc ~session_id uuid in - Client.Alert.destroy ~rpc ~session_id alert - *) - -(* - let subject_list printer rpc session_id params = -(* we get all subjects from the pool *) - let subjects = Client.Subject.get_all_records ~rpc ~session_id in - let table_of_subject (subject,record) = - [ "subject-uuid", record.API.subject_uuid; - "subject-identifier", record.API.subject_subject_identifier; -(* "subject-name", Client.Subject.get_subject_name ~rpc ~session_id subject;*) - ] @ - record.API.subject_other_config - in - let all = List.map table_of_subject subjects in - printer (Cli_printer.PTable all) - *) - let subject_add printer rpc session_id params = let subject_name = List.assoc "subject-name" params in (* let's try to resolve the subject_name to a subject_id using the external directory *) @@ -7380,13 +7332,6 @@ let audit_log_get fd _printer rpc session_id 
params = download_file_with_task fd rpc session_id filename Constants.audit_log_uri query label label -(* RBAC 2.0 only - let role_create printer rpc session_id params = - (*let id = List.assoc "id" params in*) - let name = List.assoc "name" params in - ignore (Client.Role.create ~rpc ~session_id ~name ~description:"" ~permissions:[] ~is_basic:false ~is_complete:false) -*) - let session_subject_identifier_list printer rpc session_id _params = let subject_identifiers = Client.Session.get_all_subject_identifiers ~rpc ~session_id From 80c6afec41295fb2d16f3baab97d87c50bd1b023 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Thu, 12 Jun 2025 11:38:14 +0100 Subject: [PATCH 023/111] xe-cli completion: Add more boolean parameters Signed-off-by: Andrii Sultanov --- ocaml/xe-cli/bash-completion | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ocaml/xe-cli/bash-completion b/ocaml/xe-cli/bash-completion index 589ada92e0f..7ed05f0f1c8 100644 --- a/ocaml/xe-cli/bash-completion +++ b/ocaml/xe-cli/bash-completion @@ -542,12 +542,16 @@ _xe() hvm | nomigrate | nested-virt | PV-drivers-up-to-date | \ PV-drivers-detected | live | cooperative | enforce-homogeneity | \ host-metrics-live | sharable | read-only | storage-lock | missing | \ - metadata-latest | empty | clustered | pool-auto-join | joined) + metadata-latest | empty | clustered | pool-auto-join | joined | \ + dry-run | metadata | paused | approximate | copy | progress | public | \ + include-snapshots | preserve-power-state | soft | update | is-unique) # Until autocompletion can be generated from the # datamodel, this is just naive hardcoding. These cases were # obtained by looking for boolean fields: # 'xapi-cli-server/records.ml | grep bool_of_string' and # 'grep string_of_bool' + # and + # 'xapi-cli-server/cli_frontend.ml | grep get_bool_param' __xe_debug "triggering autocompletion for boolean params" IFS=$'\n,' set_completions 'true,false' "$value" From 69087b16ba65919aa63d062bfd6bf9cacf11b959 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Wed, 11 Jun 2025 15:36:01 +0100 Subject: [PATCH 024/111] CA-412313: Tracing_export: flush on XAPI exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maintenance mode is entered by running Host.evacuate, followed by promoting a new pool coordinator and shutting down XAPI. We only export spans every 30s, so we may miss exporting the span for Host.evacuate. Ensure that we at least trigger the export when XAPI is about to shutdown. Do not wait for the export to finish, because this could take a long time (e.g. when exporting to a remote Jaeger instance). After this change I now see Host.evacuate properly in the exported trace. 
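For reference, the caller only needs to kick the flush off and move on; it must not block on it. A rough sketch of the call site (the helper name here is illustrative; the actual change is to xapi_fuse.ml below):

```
(* Illustrative only: ask the exporter to flush now, in a separate thread so
   the shutdown sequence itself is never delayed; flush_and_exit waits at most
   max_wait seconds for the export to complete before giving up. *)
let flush_traces_on_shutdown ~remaining_fuse_time =
  ignore
    (Thread.create
       (fun () -> Tracing_export.flush_and_exit ~max_wait:remaining_fuse_time ())
       ()
    )
```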
Signed-off-by: Edwin Török --- ocaml/libs/tracing/tracing_export.ml | 13 +++++++++++-- ocaml/libs/tracing/tracing_export.mli | 6 +++--- ocaml/tests/bench/bench_tracing.ml | 4 ++-- ocaml/xapi/xapi_fuse.ml | 2 ++ 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/ocaml/libs/tracing/tracing_export.ml b/ocaml/libs/tracing/tracing_export.ml index 5844d389e1c..6b4371350cc 100644 --- a/ocaml/libs/tracing/tracing_export.ml +++ b/ocaml/libs/tracing/tracing_export.ml @@ -306,6 +306,8 @@ module Destination = struct (* Note this signal will flush the spans and terminate the exporter thread *) let signal () = Delay.signal delay + let wait_exit = Delay.make () + let create_exporter () = enable_span_garbage_collector () ; Thread.create @@ -319,7 +321,8 @@ module Destination = struct signaled := true ) ; flush_spans () - done + done ; + Delay.signal wait_exit ) () @@ -339,6 +342,12 @@ module Destination = struct ) end -let flush_and_exit = Destination.signal +let flush_and_exit ~max_wait () = + D.debug "flush_and_exit: signaling thread to export now" ; + Destination.signal () ; + if Delay.wait Destination.wait_exit max_wait then + D.info "flush_and_exit: timeout on span export" + else + D.debug "flush_and_exit: span export finished" let main = Destination.main diff --git a/ocaml/libs/tracing/tracing_export.mli b/ocaml/libs/tracing/tracing_export.mli index 3f8ca750026..f322bd2404c 100644 --- a/ocaml/libs/tracing/tracing_export.mli +++ b/ocaml/libs/tracing/tracing_export.mli @@ -85,9 +85,9 @@ module Destination : sig end end -val flush_and_exit : unit -> unit -(** [flush_and_exit ()] sends a signal to flush the finish spans and terminate - the exporter thread. +val flush_and_exit : max_wait:float -> unit -> unit +(** [flush_and_exit ~max_wait ()] sends a signal to flush the finish spans and terminate + the exporter thread. It waits at most [max_wait] seconds. *) val main : unit -> Thread.t diff --git a/ocaml/tests/bench/bench_tracing.ml b/ocaml/tests/bench/bench_tracing.ml index ff8d872ee64..8db30cfc220 100644 --- a/ocaml/tests/bench/bench_tracing.ml +++ b/ocaml/tests/bench/bench_tracing.ml @@ -25,7 +25,7 @@ let export_thread = (* need to ensure this isn't running outside the benchmarked section, or bechamel might fail with 'Failed to stabilize GC' *) - let after _ = Tracing_export.flush_and_exit () in + let after _ = Tracing_export.flush_and_exit ~max_wait:0. () in Bechamel_simple_cli.thread_workload ~before:Tracing_export.main ~after ~run:ignore @@ -52,7 +52,7 @@ let allocate () = let free t = Tracing.TracerProvider.destroy ~uuid ; - Tracing_export.flush_and_exit () ; + Tracing_export.flush_and_exit ~max_wait:0. () ; Thread.join t let test_tracing_on ?(overflow = false) ~name f = diff --git a/ocaml/xapi/xapi_fuse.ml b/ocaml/xapi/xapi_fuse.ml index 48d0737a613..8c2b5b56d3d 100644 --- a/ocaml/xapi/xapi_fuse.ml +++ b/ocaml/xapi/xapi_fuse.ml @@ -52,6 +52,8 @@ let light_fuse_and_run ?(fuse_length = !Constants.fuse_time) () = in let new_fuse_length = max 5. (fuse_length -. 
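The new subcommand accepts the same kinds of sources as cp/mv: a single .ndjson file, a .zst-compressed file, or a directory, which is walked recursively. An illustrative invocation (the file name is a placeholder):

```
$ xs-trace pp /var/log/dt/zipkinv2/json/<trace-file>.ndjson
```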
delay_so_far) in debug "light_fuse_and_run: current RRDs have been saved" ; + ignore + (Thread.create Tracing_export.(flush_and_exit ~max_wait:new_fuse_length) ()) ; ignore (Thread.create (fun () -> From a54505e4247778162970fc3c1b2d31fd73907c09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Wed, 11 Jun 2025 16:02:35 +0100 Subject: [PATCH 025/111] CA-412313: xs-trace: introduce a pp command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit json_reformat cannot handle newline delimited json, it is easier if we have a command to reformat it ourselves. This can be useful when debugging why a trace is missing elements. Traces are stored as newline-delimited JSON in /var/log/dt/zipkinv2/json, however json_reformat cannot process them directly, and the lines can be very long and difficult to read otherwise. Signed-off-by: Edwin Török --- ocaml/xs-trace/dune | 29 ++++++++++++----------------- ocaml/xs-trace/xs_trace.ml | 32 +++++++++++++++++++++++--------- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/ocaml/xs-trace/dune b/ocaml/xs-trace/dune index e34fc7e5575..4a19b8c888a 100644 --- a/ocaml/xs-trace/dune +++ b/ocaml/xs-trace/dune @@ -1,23 +1,18 @@ (executable - (modes exe) - (name xs_trace) - (public_name xs-trace) - (package xapi-tools) - (libraries - uri - tracing - cmdliner - tracing_export - xapi-stdext-unix - zstd - ) -) + (modes exe) + (name xs_trace) + (public_name xs-trace) + (package xapi-tools) + (libraries uri tracing cmdliner tracing_export yojson xapi-stdext-unix zstd)) (rule - (targets xs-trace.1) - (deps (:exe xs_trace.exe)) - (action (with-stdout-to %{targets} (run %{exe} --help=groff))) -) + (targets xs-trace.1) + (deps + (:exe xs_trace.exe)) + (action + (with-stdout-to + %{targets} + (run %{exe} --help=groff)))) ; not expected by the specfile ;(install diff --git a/ocaml/xs-trace/xs_trace.ml b/ocaml/xs-trace/xs_trace.ml index 6360649fb20..a5f0c8becef 100644 --- a/ocaml/xs-trace/xs_trace.ml +++ b/ocaml/xs-trace/xs_trace.ml @@ -25,10 +25,7 @@ module Exporter = struct | _ -> () - (** Export traces from file system to a remote endpoint. *) - let export erase src dst = - let dst = Uri.of_string dst in - let submit_json = submit_json dst in + let iter_src src f = let rec export_file = function | path when Sys.is_directory path -> (* Recursively export trace files. *) @@ -38,7 +35,7 @@ module Exporter = struct (* Decompress compressed trace file and submit each line iteratively *) let args = [|"zstdcat"; path|] in let ic = Unix.open_process_args_in args.(0) args in - Unixext.lines_iter submit_json ic ; + Unixext.lines_iter f ic ; match Unix.close_process_in ic with | Unix.WEXITED 0 -> () @@ -47,15 +44,27 @@ module Exporter = struct ) | path when Filename.check_suffix path ".ndjson" -> (* Submit traces line by line. *) - Unixext.readfile_line submit_json path + Unixext.readfile_line f path | path -> (* Assume any other extension is a valid JSON file. *) let json = Unixext.string_of_file path in - submit_json json + f json in - export_file src ; + export_file src + + (** Export traces from file system to a remote endpoint. 
*) + let export erase src dst = + let dst = Uri.of_string dst in + let submit_json = submit_json dst in + iter_src src submit_json ; if erase then Unixext.rm_rec ~rm_top:true src + + let pretty_print src = + iter_src src @@ fun line -> + line + |> Yojson.Safe.from_string + |> Yojson.Safe.pretty_to_channel ~std:true stdout end module Cli = struct @@ -83,6 +92,11 @@ module Cli = struct let doc = "copy a trace to an endpoint and erase it afterwards" in Cmd.(v (info "mv" ~doc) term) + let pp_cmd = + let term = Term.(const Exporter.pretty_print $ src) in + let doc = "Pretty print NDJSON traces" in + Cmd.(v (info "pp" ~doc) term) + let xs_trace_cmd = let man = [ @@ -94,7 +108,7 @@ module Cli = struct let doc = "utility for working with local trace files" in Cmd.info "xs-trace" ~doc ~version:"0.1" ~man in - Cmd.group desc [cp_cmd; mv_cmd] + Cmd.group desc [cp_cmd; mv_cmd; pp_cmd] let main () = Cmd.eval xs_trace_cmd end From c5a914565b094cd5032a5e9bfcd8fd5948c98c7b Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 12 Jun 2025 16:12:41 +0100 Subject: [PATCH 026/111] xenopsd: Allow to override the default NUMA placement Use the numa-compat argument to be able to override the default numa placement using xenopsd.conf option. This allows to change the default placement when building a package This patch reverts some of the changes in e6f94be82198532d165018677a303a554c62c7ba Signed-off-by: Pau Ruiz Safont --- ocaml/xenopsd/lib/xenopsd.ml | 9 ++++----- ocaml/xenopsd/xc/xenops_server_xen.ml | 2 ++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ocaml/xenopsd/lib/xenopsd.ml b/ocaml/xenopsd/lib/xenopsd.ml index 9c5e83e04ce..ccacea0ed8b 100644 --- a/ocaml/xenopsd/lib/xenopsd.ml +++ b/ocaml/xenopsd/lib/xenopsd.ml @@ -59,6 +59,8 @@ let feature_flags_path = ref "/etc/xenserver/features.d" let pvinpvh_xen_cmdline = ref "pv-shim console=xen" +let numa_placement_compat = ref true + (* O(N^2) operations, until we get a xenstore cache, so use a small number here *) let vm_guest_agent_xenstore_quota = ref 128 @@ -240,11 +242,8 @@ let options = , "Command line for the inner-xen for PV-in-PVH guests" ) ; ( "numa-placement" - , Arg.Bool (fun _ -> ()) - , (fun () -> - string_of_bool - (!Xenops_server.default_numa_affinity_policy = Best_effort) - ) + , Arg.Bool (fun x -> numa_placement_compat := x) + , (fun () -> string_of_bool !numa_placement_compat) , "NUMA-aware placement of VMs (deprecated, use XAPI setting)" ) ; ( "pci-quarantine" diff --git a/ocaml/xenopsd/xc/xenops_server_xen.ml b/ocaml/xenopsd/xc/xenops_server_xen.ml index a1a37085659..cdc54d32873 100644 --- a/ocaml/xenopsd/xc/xenops_server_xen.ml +++ b/ocaml/xenopsd/xc/xenops_server_xen.ml @@ -5259,6 +5259,8 @@ let init () = {Xs_protocol.ACL.owner= 0; other= Xs_protocol.ACL.READ; acl= []} ) ; Device.Backend.init () ; + Xenops_server.default_numa_affinity_policy := + if !Xenopsd.numa_placement_compat then Best_effort else Any ; info "Default NUMA affinity policy is '%s'" Xenops_server.(string_of_numa_affinity_policy !default_numa_affinity_policy) ; Xenops_server.numa_placement := !Xenops_server.default_numa_affinity_policy ; From b8f3ab4aa77a73bd6dfb76a58681e4813279382d Mon Sep 17 00:00:00 2001 From: Gabriel Buica Date: Fri, 13 Jun 2025 10:44:27 +0100 Subject: [PATCH 027/111] Fix `message-switch` opam metadata Signed-off-by: Gabriel Buica --- dune-project | 4 +++- opam/message-switch-core.opam | 1 + opam/message-switch-unix.opam | 1 + opam/message-switch.opam | 1 + opam/message-switch.opam.template | 1 + 5 files changed, 7 insertions(+), 1 
deletion(-) diff --git a/dune-project b/dune-project index 56de01f0fd3..6ed0602a185 100644 --- a/dune-project +++ b/dune-project @@ -651,6 +651,7 @@ (= :version)) (xapi-stdext-threads (= :version)) + (xapi-tracing (= :version)) (odoc :with-doc))) (package @@ -669,7 +670,8 @@ ppx_deriving_rpc rpclib (xapi-stdext-threads - (= :version)))) + (= :version)) + (xapi-tracing (= :version)))) (package (name message-switch)) diff --git a/opam/message-switch-core.opam b/opam/message-switch-core.opam index a6b183bdd7f..dc4ca95da07 100644 --- a/opam/message-switch-core.opam +++ b/opam/message-switch-core.opam @@ -20,6 +20,7 @@ depends: [ "uri" "xapi-log" {= version} "xapi-stdext-threads" {= version} + "xapi-tracing" {= version} "odoc" {with-doc} ] build: [ diff --git a/opam/message-switch-unix.opam b/opam/message-switch-unix.opam index c9379979e2d..975d81ac831 100644 --- a/opam/message-switch-unix.opam +++ b/opam/message-switch-unix.opam @@ -16,6 +16,7 @@ depends: [ "ppx_deriving_rpc" "rpclib" "xapi-stdext-threads" {= version} + "xapi-tracing" {= version} "odoc" {with-doc} ] build: [ diff --git a/opam/message-switch.opam b/opam/message-switch.opam index f0dcf7ff224..41613cb034f 100644 --- a/opam/message-switch.opam +++ b/opam/message-switch.opam @@ -30,6 +30,7 @@ depends: [ "sexplib" "shared-block-ring" {>= "2.3.0"} "xapi-stdext-unix" + "xapi-tracing" ] synopsis: "A simple store-and-forward message switch" description: """ diff --git a/opam/message-switch.opam.template b/opam/message-switch.opam.template index a33fe27cb3e..0e8ec76c2e6 100644 --- a/opam/message-switch.opam.template +++ b/opam/message-switch.opam.template @@ -28,6 +28,7 @@ depends: [ "sexplib" "shared-block-ring" {>= "2.3.0"} "xapi-stdext-unix" + "xapi-tracing" ] synopsis: "A simple store-and-forward message switch" description: """ From a3a7ca4d025230ae57abcc4feb1391d67a97862a Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Fri, 13 Jun 2025 11:13:02 +0100 Subject: [PATCH 028/111] opam: generate xapi-log with dune Adds the missing dependency to stdext-threads for testing Signed-off-by: Pau Ruiz Safont --- dune-project | 13 ++++++++++- opam/xapi-log.opam | 45 ++++++++++++++++++++----------------- opam/xapi-log.opam.template | 29 ------------------------ 3 files changed, 37 insertions(+), 50 deletions(-) delete mode 100644 opam/xapi-log.opam.template diff --git a/dune-project b/dune-project index 56de01f0fd3..6030fb9ee92 100644 --- a/dune-project +++ b/dune-project @@ -221,7 +221,18 @@ (name xapi-nbd)) (package - (name xapi-log)) + (name xapi-log) + (synopsis "A Logs library required by xapi") + (description + "This package is provided for backwards compatibility only. No new package should use it.") + (depends + astring + fmt + logs + mtime + xapi-backtrace + (xapi-stdext-pervasives (= :version)) + (xapi-stdext-threads (and :with-test (= :version))))) (package (name xapi-idl)) diff --git a/opam/xapi-log.opam b/opam/xapi-log.opam index d83f9bec7c6..b811d1f7f9e 100644 --- a/opam/xapi-log.opam +++ b/opam/xapi-log.opam @@ -1,31 +1,36 @@ # This file is generated by dune, edit dune-project instead -license: "LGPL-2.1-only WITH OCaml-LGPL-linking-exception" opam-version: "2.0" -maintainer: "xen-api@lists.xen.org" -authors: "xen-api@lists.xen.org" +synopsis: "A Logs library required by xapi" +description: + "This package is provided for backwards compatibility only. No new package should use it." 
+maintainer: ["Xapi project maintainers"] +authors: ["xen-api@lists.xen.org"] +license: "LGPL-2.1-only WITH OCaml-LGPL-linking-exception" homepage: "https://xapi-project.github.io/" -bug-reports: "https://github.com/xapi-project/xen-api.git" -dev-repo: "git+https://github.com/xapi-project/xen-api.git" -build: [ - ["dune" "build" "-p" name "-j" jobs] - ["dune" "runtest" "-p" name "-j" jobs] {with-test} -] -available: [ os = "linux" ] +bug-reports: "https://github.com/xapi-project/xen-api/issues" depends: [ - "ocaml" "dune" {>= "3.15"} "astring" "fmt" "logs" "mtime" "xapi-backtrace" - "xapi-stdext-pervasives" + "xapi-stdext-pervasives" {= version} + "xapi-stdext-threads" {with-test & = version} + "odoc" {with-doc} +] +build: [ + ["dune" "subst"] {dev} + [ + "dune" + "build" + "-p" + name + "-j" + jobs + "@install" + "@runtest" {with-test} + "@doc" {with-doc} + ] ] -synopsis: "Library required by xapi" -description: """ -These libraries are provided for backwards compatibility only. -No new code should use these libraries.""" -url { - src: - "https://github.com/xapi-project/xen-api/archive/master.tar.gz" -} +dev-repo: "git+https://github.com/xapi-project/xen-api.git" diff --git a/opam/xapi-log.opam.template b/opam/xapi-log.opam.template deleted file mode 100644 index 00b5cce6fd5..00000000000 --- a/opam/xapi-log.opam.template +++ /dev/null @@ -1,29 +0,0 @@ -opam-version: "2.0" -maintainer: "xen-api@lists.xen.org" -authors: "xen-api@lists.xen.org" -homepage: "https://xapi-project.github.io/" -bug-reports: "https://github.com/xapi-project/xen-api.git" -dev-repo: "git+https://github.com/xapi-project/xen-api.git" -build: [ - ["dune" "build" "-p" name "-j" jobs] - ["dune" "runtest" "-p" name "-j" jobs] {with-test} -] -available: [ os = "linux" ] -depends: [ - "ocaml" - "dune" {>= "3.15"} - "astring" - "fmt" - "logs" - "mtime" - "xapi-backtrace" - "xapi-stdext-pervasives" -] -synopsis: "Library required by xapi" -description: """ -These libraries are provided for backwards compatibility only. -No new code should use these libraries.""" -url { - src: - "https://github.com/xapi-project/xen-api/archive/master.tar.gz" -} From c35e8e2e7cc74c666b182353086e0c76786c11c2 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Fri, 13 Jun 2025 11:22:38 +0100 Subject: [PATCH 029/111] xapi-log: remove circular dependency on tests unfortunately threadext uses this log package, define a with_lock in xapi-log tests to avoid using the former. 
Signed-off-by: Pau Ruiz Safont --- dune-project | 3 +-- ocaml/libs/log/test/dune | 2 +- ocaml/libs/log/test/log_test.ml | 6 +++++- ocaml/libs/log/test/log_test.t | 2 +- opam/xapi-log.opam | 1 - 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/dune-project b/dune-project index 6030fb9ee92..d54087f6c6b 100644 --- a/dune-project +++ b/dune-project @@ -231,8 +231,7 @@ logs mtime xapi-backtrace - (xapi-stdext-pervasives (= :version)) - (xapi-stdext-threads (and :with-test (= :version))))) + (xapi-stdext-pervasives (= :version)))) (package (name xapi-idl)) diff --git a/ocaml/libs/log/test/dune b/ocaml/libs/log/test/dune index 299a6155eac..75fbbad7557 100644 --- a/ocaml/libs/log/test/dune +++ b/ocaml/libs/log/test/dune @@ -1,6 +1,6 @@ (executable (name log_test) - (libraries log xapi-stdext-threads threads.posix xapi-backtrace)) + (libraries log threads.posix xapi-backtrace)) (cram (package xapi-log) diff --git a/ocaml/libs/log/test/log_test.ml b/ocaml/libs/log/test/log_test.ml index 53d5cf0ddeb..b493b18d426 100644 --- a/ocaml/libs/log/test/log_test.ml +++ b/ocaml/libs/log/test/log_test.ml @@ -6,12 +6,16 @@ let a = [||] let buggy () = a.(1) <- 0 +let with_lock mutex f = + let finally () = Mutex.unlock mutex in + Mutex.lock mutex ; Fun.protect ~finally f + let () = Printexc.record_backtrace true ; Debug.log_to_stdout () ; () |> Debug.with_thread_associated "main" @@ fun () -> - try Xapi_stdext_threads.Threadext.Mutex.execute m buggy + try with_lock m buggy with e -> D.log_backtrace e ; D.warn "Got exception: %s" (Printexc.to_string e) diff --git a/ocaml/libs/log/test/log_test.t b/ocaml/libs/log/test/log_test.t index b51ea26fca0..20d41233f8a 100644 --- a/ocaml/libs/log/test/log_test.t +++ b/ocaml/libs/log/test/log_test.t @@ -3,7 +3,7 @@ [|error||0 |main|backtrace] 1/4 log_test.exe Raised at file ocaml/libs/log/test/log_test.ml, line 7 [|error||0 |main|backtrace] 2/4 log_test.exe Called from file fun.ml, line 33 [|error||0 |main|backtrace] 3/4 log_test.exe Called from file fun.ml, line 38 - [|error||0 |main|backtrace] 4/4 log_test.exe Called from file ocaml/libs/log/test/log_test.ml, line 14 + [|error||0 |main|backtrace] 4/4 log_test.exe Called from file ocaml/libs/log/test/log_test.ml, line 18 [|error||0 |main|backtrace] [| warn||0 |main|log_test.ml] Got exception: Invalid_argument("index out of bounds") diff --git a/opam/xapi-log.opam b/opam/xapi-log.opam index b811d1f7f9e..12840be135b 100644 --- a/opam/xapi-log.opam +++ b/opam/xapi-log.opam @@ -16,7 +16,6 @@ depends: [ "mtime" "xapi-backtrace" "xapi-stdext-pervasives" {= version} - "xapi-stdext-threads" {with-test & = version} "odoc" {with-doc} ] build: [ From 87f82e7d0a645ccb4390861a08632bfdc58bf30f Mon Sep 17 00:00:00 2001 From: Gabriel Buica Date: Fri, 13 Jun 2025 11:30:59 +0100 Subject: [PATCH 030/111] datamodel_lifecycle: automatic update Signed-off-by: Gabriel Buica --- ocaml/idl/datamodel_lifecycle.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/idl/datamodel_lifecycle.ml b/ocaml/idl/datamodel_lifecycle.ml index fc9acec7bd1..bbab96b8f0f 100644 --- a/ocaml/idl/datamodel_lifecycle.ml +++ b/ocaml/idl/datamodel_lifecycle.ml @@ -246,7 +246,7 @@ let prototyped_of_message = function | "VM", "restart_device_models" -> Some "23.30.0" | "VM", "call_host_plugin" -> - Some "25.21.0-next" + Some "25.22.0" | "VM", "set_groups" -> Some "24.19.1" | "pool", "set_console_idle_timeout" -> From d32c7cea3903a9223fd90bbd0fdd140b1143e4c8 Mon Sep 17 00:00:00 2001 From: Changlei Li Date: Thu, 12 Jun 2025 11:02:53 +0800 
Subject: [PATCH 031/111] CA-412146 Filter out VF when scan SR-IOV (Single Root I/O Virtualization) is a technology that allows a single physical PCI Express (PCIe) device, such as a network adapter, to be shared efficiently among multiple virtual machines (VMs) or containers. It achieves this by creating Virtual Functions (VFs) that act as lightweight PCIe functions, each assigned to a VM, while the Physical Function (PF) remains responsible for managing the device. Add check in Sysfs.is_physical - check if there is "physfn" in the device dir to filter out VF, then XAPI will not create PIF object for VF during scan. Signed-off-by: Changlei Li --- ocaml/networkd/lib/network_utils.ml | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/ocaml/networkd/lib/network_utils.ml b/ocaml/networkd/lib/network_utils.ml index 846c517c82e..2c3cdab9fb8 100644 --- a/ocaml/networkd/lib/network_utils.ml +++ b/ocaml/networkd/lib/network_utils.ml @@ -181,18 +181,29 @@ module Sysfs = struct close_out outchan ; raise (Network_error (Write_error file)) - let is_physical name = + exception Unable_to_read_driver_link + + let is_vif name = + let devpath = getpath name "device" in try - let devpath = getpath name "device" in let driver_link = Unix.readlink (devpath ^ "/driver") in (* filter out symlinks under device/driver which look like /../../../devices/xen-backend/vif- *) - not - (List.mem "xen-backend" - (Astring.String.cuts ~empty:false ~sep:"/" driver_link) - ) + List.mem "xen-backend" + (Astring.String.cuts ~empty:false ~sep:"/" driver_link) + with _ -> raise Unable_to_read_driver_link + + let is_vf name = + let devpath = getpath name "device" in + try + ignore @@ Unix.readlink (devpath ^ "/physfn") ; + true with _ -> false + let is_physical name = + try not (is_vif name || is_vf name) + with Unable_to_read_driver_link -> false + (* device types are defined in linux/if_arp.h *) let is_ether_device name = match int_of_string (read_one_line (getpath name "type")) with @@ -1547,7 +1558,7 @@ module Ovs = struct let vif_arg = let existing_vifs = List.filter - (fun iface -> not (Sysfs.is_physical iface)) + (fun iface -> try Sysfs.is_vif iface with _ -> false) (bridge_to_interfaces name) in let ifaces_with_type = From 61374b31d0acb6d349dc73d9c1167d6ae82818fe Mon Sep 17 00:00:00 2001 From: Bengang Yuan Date: Mon, 16 Jun 2025 03:29:06 +0100 Subject: [PATCH 032/111] Update datamodel_host Signed-off-by: Bengang Yuan --- ocaml/idl/datamodel_host.ml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ocaml/idl/datamodel_host.ml b/ocaml/idl/datamodel_host.ml index f0bce099389..2be03f90993 100644 --- a/ocaml/idl/datamodel_host.ml +++ b/ocaml/idl/datamodel_host.ml @@ -1304,7 +1304,7 @@ let create_params = param_type= Bool ; param_name= "ssh_enabled" ; param_doc= "True if SSH access is enabled for the host" - ; param_release= numbered_release "25.20.0-next" + ; param_release= numbered_release "25.21.0" ; param_default= Some (VBool Constants.default_ssh_enabled) } ; { @@ -1314,7 +1314,7 @@ let create_params = "The timeout in seconds after which SSH access will be automatically \ disabled (0 means never), this setting will be applied every time the \ SSH is enabled by XAPI" - ; param_release= numbered_release "25.20.0-next" + ; param_release= numbered_release "25.21.0" ; param_default= Some (VInt Constants.default_ssh_enabled_timeout) } ; { @@ -1323,7 +1323,7 @@ let create_params = ; param_doc= "The time in UTC after which the SSH access will be automatically \ 
disabled" - ; param_release= numbered_release "25.20.0-next" + ; param_release= numbered_release "25.21.0" ; param_default= Some (VDateTime Date.epoch) } ; { @@ -1332,7 +1332,7 @@ let create_params = ; param_doc= "The timeout in seconds after which idle console will be automatically \ terminated (0 means never)" - ; param_release= numbered_release "25.20.0-next" + ; param_release= numbered_release "25.21.0" ; param_default= Some (VInt Constants.default_console_idle_timeout) } ] @@ -1348,7 +1348,7 @@ let create = kept for host joined a pool" ) ; ( Changed - , "25.20.0-next" + , "25.21.0" , "Added --ssh_enabled --ssh_enabled_timeout --ssh_expiry \ --console_idle_timeout options to allow them to be configured for \ new host" From 62b473381f57f12f8c197d64d344d481c4a662b8 Mon Sep 17 00:00:00 2001 From: Bengang Yuan Date: Mon, 16 Jun 2025 03:34:56 +0100 Subject: [PATCH 033/111] Update XE_SR_ERRORCODES from SM Signed-off-by: Bengang Yuan --- ocaml/sdk-gen/csharp/XE_SR_ERRORCODES.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ocaml/sdk-gen/csharp/XE_SR_ERRORCODES.xml b/ocaml/sdk-gen/csharp/XE_SR_ERRORCODES.xml index 47fefd83086..792fe17fcd7 100644 --- a/ocaml/sdk-gen/csharp/XE_SR_ERRORCODES.xml +++ b/ocaml/sdk-gen/csharp/XE_SR_ERRORCODES.xml @@ -513,6 +513,12 @@ 117 + + PVMultiIDs + PVs found with multiple SCSI IDs + 119 + + APISession From a4602b5a58d063c130aef3fbf8b2e76e4a53f3e5 Mon Sep 17 00:00:00 2001 From: Gabriel Buica Date: Wed, 18 Jun 2025 15:42:30 +0100 Subject: [PATCH 034/111] CP-308253: `Task.destroy` spans should no longer be orphaned Simplifies the logic of `exec_with_context` by letting the caller decide when the task is destroyed from the database. Adds helper function in `context.ml` to destroy and trace the destroy op correctly. 
Signed-off-by: Gabriel Buica --- ocaml/xapi/context.ml | 8 ++++ ocaml/xapi/context.mli | 2 + ocaml/xapi/server_helpers.ml | 73 ++++++++++++++++++------------------ 3 files changed, 47 insertions(+), 36 deletions(-) diff --git a/ocaml/xapi/context.ml b/ocaml/xapi/context.ml index b71ed4ca234..e57c3c71eca 100644 --- a/ocaml/xapi/context.ml +++ b/ocaml/xapi/context.ml @@ -504,6 +504,14 @@ let get_client_ip context = let get_user_agent context = match context.origin with Internal -> None | Http (rq, _) -> rq.user_agent +let finally_destroy_context ~__context f = + let tracing = __context.tracing in + Xapi_stdext_pervasives.Pervasiveext.finally f (fun () -> + __context.tracing <- tracing ; + destroy __context ; + __context.tracing <- None + ) + let with_tracing ?originator ~__context name f = let open Tracing in let parent = __context.tracing in diff --git a/ocaml/xapi/context.mli b/ocaml/xapi/context.mli index 34e51afd2ee..61d307e6476 100644 --- a/ocaml/xapi/context.mli +++ b/ocaml/xapi/context.mli @@ -146,6 +146,8 @@ val complete_tracing : ?error:exn * Printexc.raw_backtrace -> t -> unit val tracing_of : t -> Tracing.Span.t option +val finally_destroy_context : __context:t -> (unit -> 'a) -> 'a + val with_tracing : ?originator:string -> __context:t -> string -> (t -> 'a) -> 'a diff --git a/ocaml/xapi/server_helpers.ml b/ocaml/xapi/server_helpers.ml index 04aae674472..425fef29036 100644 --- a/ocaml/xapi/server_helpers.ml +++ b/ocaml/xapi/server_helpers.ml @@ -53,9 +53,10 @@ let parameter_count_mismatch_failure func expected received = API.response_of_failure Api_errors.message_parameter_count_mismatch [func; expected; received] -(** WARNING: the context is destroyed when execution is finished if the task is not forwarded, in database and not called asynchronous. *) -let exec_with_context ~__context ~need_complete ?marshaller ?f_forward - ?(called_async = false) ?quiet f = +(** WARNING: DOES NOT DESTROY the context when execution is finished. 
The + caller must destroy it *) +let exec_with_context ~__context ~need_complete ?marshaller ?f_forward ?quiet f + = (* Execute fn f in specified __context, marshalling result with "marshaller" *) let exec () = (* NB: @@ -95,23 +96,15 @@ let exec_with_context ~__context ~need_complete ?marshaller ?f_forward if need_complete then TaskHelper.failed ~__context e ; raise e in - Locking_helpers.Thread_state.with_named_thread - (TaskHelper.get_name ~__context) (Context.get_task_id __context) (fun () -> - let client = Context.get_client __context in - Debug.with_thread_associated ?client ?quiet - (Context.string_of_task __context) - (fun () -> - (* CP-982: promote tracking debug line to info status *) - if called_async then - info "spawning a new thread to handle the current task%s" - (Context.trackid ~with_brackets:true ~prefix:" " __context) ; - Xapi_stdext_pervasives.Pervasiveext.finally exec (fun () -> - if not called_async then Context.destroy __context - (* else debug "nothing more to process for this thread" *) - ) - ) - () - ) + let@ () = + Locking_helpers.Thread_state.with_named_thread + (TaskHelper.get_name ~__context) + (Context.get_task_id __context) + in + let client = Context.get_client __context in + Debug.with_thread_associated ?client ?quiet + (Context.string_of_task __context) + exec () let dispatch_exn_wrapper f = try f () @@ -168,18 +161,22 @@ let do_dispatch ?session_id ?forward_op ?self:_ supports_async called_fn_name let sync () = let need_complete = not (Context.forwarded_task __context) in - exec_with_context ~__context ~need_complete ~called_async - ?f_forward:forward_op ~marshaller op_fn + let@ () = Context.finally_destroy_context ~__context in + exec_with_context ~__context ~need_complete ?f_forward:forward_op + ~marshaller op_fn |> marshaller |> Rpc.success in + let async ~need_complete = (* Fork thread in which to execute async call *) + info "spawning a new thread to handle the current task%s" + (Context.trackid ~with_brackets:true ~prefix:" " __context) ; ignore (Thread.create (fun () -> - exec_with_context ~__context ~need_complete ~called_async - ?f_forward:forward_op ~marshaller op_fn + exec_with_context ~__context ~need_complete ?f_forward:forward_op + ~marshaller op_fn ) () ) ; @@ -200,26 +197,30 @@ let do_dispatch ?session_id ?forward_op ?self:_ supports_async called_fn_name (* in the following functions, it is our responsibility to complete any tasks we create *) let exec_with_new_task ?http_other_config ?quiet ?subtask_of ?session_id ?task_in_database ?task_description ?origin task_name f = - exec_with_context ?quiet - ~__context: - (Context.make ?http_other_config ?quiet ?subtask_of ?session_id - ?task_in_database ?task_description ?origin task_name - ) ~need_complete:true (fun ~__context -> f __context + let __context = + Context.make ?http_other_config ?quiet ?subtask_of ?session_id + ?task_in_database ?task_description ?origin task_name + in + let@ () = Context.finally_destroy_context ~__context in + exec_with_context ?quiet ~__context ~need_complete:true (fun ~__context -> + f __context ) let exec_with_forwarded_task ?http_other_config ?session_id ?origin task_id f = - exec_with_context - ~__context: - (Context.from_forwarded_task ?http_other_config ?session_id ?origin - task_id - ) ~need_complete:true (fun ~__context -> f __context + let __context = + Context.from_forwarded_task ?http_other_config ?session_id ?origin task_id + in + let@ () = Context.finally_destroy_context ~__context in + exec_with_context ~__context ~need_complete:true (fun 
~__context -> + f __context ) let exec_with_subtask ~__context ?task_in_database task_name f = - let subcontext = + let __context = Context.make_subcontext ~__context ?task_in_database task_name in - exec_with_context ~__context:subcontext ~need_complete:true f + let@ () = Context.finally_destroy_context ~__context in + exec_with_context ~__context ~need_complete:true f let forward_extension ~__context rbac call = rbac __context (fun () -> Xapi_extensions.call_extension call) From 63eef6fdf06d010b12fc57b75d7aa34d6f8f1709 Mon Sep 17 00:00:00 2001 From: Gabriel Buica Date: Wed, 18 Jun 2025 17:08:22 +0100 Subject: [PATCH 035/111] CP-308392: Create specialized functions Signed-off-by: Gabriel Buica --- ocaml/xapi/context.ml | 24 +++++++++++++++++++-- ocaml/xapi/context.mli | 38 +++++++++++++++++++++++++++++++++++- ocaml/xapi/server_helpers.ml | 19 ++++++++---------- 3 files changed, 67 insertions(+), 14 deletions(-) diff --git a/ocaml/xapi/context.ml b/ocaml/xapi/context.ml index e57c3c71eca..f03ce60e2a0 100644 --- a/ocaml/xapi/context.ml +++ b/ocaml/xapi/context.ml @@ -506,11 +506,31 @@ let get_user_agent context = let finally_destroy_context ~__context f = let tracing = __context.tracing in - Xapi_stdext_pervasives.Pervasiveext.finally f (fun () -> + Xapi_stdext_pervasives.Pervasiveext.finally + (fun () -> f __context) + (fun () -> __context.tracing <- tracing ; destroy __context ; __context.tracing <- None - ) + ) + +let with_context ?http_other_config ?quiet ?subtask_of ?session_id ?database + ?task_in_database ?task_description ?origin task_name f = + let __context = + make ?http_other_config ?quiet ?subtask_of ?session_id ?database + ?task_in_database ?task_description ?origin task_name + in + finally_destroy_context ~__context f + +let with_subcontext ~__context ?task_in_database task_name f = + let __context = make_subcontext ~__context ?task_in_database task_name in + finally_destroy_context ~__context f + +let with_forwarded_task ?http_other_config ?session_id ?origin task_id f = + let __context = + from_forwarded_task ?http_other_config ?session_id ?origin task_id + in + finally_destroy_context ~__context f let with_tracing ?originator ~__context name f = let open Tracing in diff --git a/ocaml/xapi/context.mli b/ocaml/xapi/context.mli index 61d307e6476..281f67ca4b2 100644 --- a/ocaml/xapi/context.mli +++ b/ocaml/xapi/context.mli @@ -146,7 +146,43 @@ val complete_tracing : ?error:exn * Printexc.raw_backtrace -> t -> unit val tracing_of : t -> Tracing.Span.t option -val finally_destroy_context : __context:t -> (unit -> 'a) -> 'a +val finally_destroy_context : __context:t -> (t -> 'a) -> 'a +(** [finally_destroy_context ~__context f] executes [f ~__context] and then + ensures [__context] is destroyed.*) + +val with_context : + ?http_other_config:(string * string) list + -> ?quiet:bool + -> ?subtask_of:API.ref_task + -> ?session_id:API.ref_session + -> ?database:Xapi_database.Db_ref.t + -> ?task_in_database:bool + -> ?task_description:string + -> ?origin:origin + -> string + -> (t -> 'a) + -> 'a +(** [with_context ?http_other_config ?quiet ?subtask_of ?session_id ?database + ?task_in_database ?task_description ?origin name f] creates a + context [__context], executes [f ~__context] and then ensures [__context] is + destroyed.*) + +val with_subcontext : + __context:t -> ?task_in_database:bool -> string -> (t -> 'a) -> 'a +(** [with_subcontext ~__context ?task_in_database name f] creates a subcontext + [__context], executes [f ~__context] and then ensures [__context] is + destroyed.*) + +val 
with_forwarded_task : + ?http_other_config:(string * string) list + -> ?session_id:API.ref_session + -> ?origin:origin + -> API.ref_task + -> (t -> 'a) + -> 'a +(** [with_forwarded_task ?http_other_config ?session_id ?origin task f] + creates a context from the forwarded task [task], executes [f ~__context] and + then ensures [__context] is destroyed.*) val with_tracing : ?originator:string -> __context:t -> string -> (t -> 'a) -> 'a diff --git a/ocaml/xapi/server_helpers.ml b/ocaml/xapi/server_helpers.ml index 425fef29036..48789c455aa 100644 --- a/ocaml/xapi/server_helpers.ml +++ b/ocaml/xapi/server_helpers.ml @@ -161,7 +161,7 @@ let do_dispatch ?session_id ?forward_op ?self:_ supports_async called_fn_name let sync () = let need_complete = not (Context.forwarded_task __context) in - let@ () = Context.finally_destroy_context ~__context in + let@ __context = Context.finally_destroy_context ~__context in exec_with_context ~__context ~need_complete ?f_forward:forward_op ~marshaller op_fn |> marshaller @@ -197,29 +197,26 @@ let do_dispatch ?session_id ?forward_op ?self:_ supports_async called_fn_name (* in the following functions, it is our responsibility to complete any tasks we create *) let exec_with_new_task ?http_other_config ?quiet ?subtask_of ?session_id ?task_in_database ?task_description ?origin task_name f = - let __context = - Context.make ?http_other_config ?quiet ?subtask_of ?session_id + let@ __context = + Context.with_context ?http_other_config ?quiet ?subtask_of ?session_id ?task_in_database ?task_description ?origin task_name in - let@ () = Context.finally_destroy_context ~__context in - exec_with_context ?quiet ~__context ~need_complete:true (fun ~__context -> + exec_with_context ~__context ~need_complete:true (fun ~__context -> f __context ) let exec_with_forwarded_task ?http_other_config ?session_id ?origin task_id f = - let __context = - Context.from_forwarded_task ?http_other_config ?session_id ?origin task_id + let@ __context = + Context.with_forwarded_task ?http_other_config ?session_id ?origin task_id in - let@ () = Context.finally_destroy_context ~__context in exec_with_context ~__context ~need_complete:true (fun ~__context -> f __context ) let exec_with_subtask ~__context ?task_in_database task_name f = - let __context = - Context.make_subcontext ~__context ?task_in_database task_name + let@ __context = + Context.with_subcontext ~__context ?task_in_database task_name in - let@ () = Context.finally_destroy_context ~__context in exec_with_context ~__context ~need_complete:true f let forward_extension ~__context rbac call = From 3b76902a4cf02b662f262a14f3753f561fe6c82d Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Thu, 19 Jun 2025 08:19:18 +0100 Subject: [PATCH 036/111] xapi-idl: Clean up xenops-related interfaces Dynamic.t is not used currently, just drop it. 
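Usage sketch for the Context.with_* helpers introduced above (illustrative only, not part of any patch in this series): the caller below is hypothetical, and the ( let@ ) definition shown is the conventional one, assumed here because the series uses the operator without showing where it is defined.

let ( let@ ) f x = f x

(* A context is created, the body runs, and finally_destroy_context guarantees
   the context is destroyed on every exit path, including exceptions. *)
let upgrade_in_progress () =
  let@ __context = Context.with_context "upgrade_in_progress" in
  Helpers.rolling_upgrade_in_progress ~__context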
Signed-off-by: Andrii Sultanov --- ocaml/xapi-idl/xen/xenops_interface.ml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ocaml/xapi-idl/xen/xenops_interface.ml b/ocaml/xapi-idl/xen/xenops_interface.ml index 39299a41f93..9b3f2941910 100644 --- a/ocaml/xapi-idl/xen/xenops_interface.ml +++ b/ocaml/xapi-idl/xen/xenops_interface.ml @@ -444,16 +444,6 @@ module Dynamic = struct type barrier = int * id list [@@deriving rpcty] - type t = - | Vm_t of Vm.id * (Vm.t * Vm.state) option - | Vbd_t of Vbd.id * (Vbd.t * Vbd.state) option - | Vif_t of Vif.id * (Vif.t * Vif.state) option - | Pci_t of Pci.id * (Pci.t * Pci.state) option - | Vgpu_t of Vgpu.id * (Vgpu.t * Vgpu.state) option - | Vusb_t of Vusb.id * (Vusb.t * Vusb.state) option - | Task_t of Task.id * Task.t option - [@@deriving rpcty] - let rpc_of_id = Rpcmarshal.marshal id.Rpc.Types.ty end From c568697aaa9e3ea2774a2a915b87f197eaaa0b60 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Wed, 18 Jun 2025 15:50:07 +0100 Subject: [PATCH 037/111] xapi_xenops: Remove unnecessary Helpers.get_localhost call There's already a 'localhost' in scope. Signed-off-by: Andrii Sultanov --- ocaml/xapi/xapi_xenops.ml | 1 - 1 file changed, 1 deletion(-) diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index e9f5174f90b..54e824188ab 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -2124,7 +2124,6 @@ let update_vm ~__context id = debug "xenopsd event: Updating VM %s consoles" id ; Option.iter (fun (_, state) -> - let localhost = Helpers.get_localhost ~__context in let address = Db.Host.get_address ~__context ~self:localhost in From fa2810548faac8638cd3da5b1f59d377826ad753 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Wed, 18 Jun 2025 15:24:44 +0100 Subject: [PATCH 038/111] xapi_xenops: Split update_vm internals into a separate function No functional change, this just removes several indentation levels from the 500+ lines of the function, making it easier to refactor in the future. This also clears up the logic of the function, now that two arms of if-else and try-with clauses are not 500+ lines apart, and avoids splitting some expressions and strings over several lines given that they reach the line character limit more often inside of several levels of indentation. 
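In outline, the split described above has the following shape (hypothetical names, not the real update_vm; shown only to illustrate the refactor):

(* The long reconciliation body moves to a top-level function,
   at one indentation level. *)
let update_widget_internal ~id ~current =
  Printf.printf "widget %s: reconciling to state %s\n" id current

(* Only the guards stay in the small outer function. *)
let update_widget ~suppressed ~id ~previous ~current =
  if (not suppressed) && previous <> current then
    update_widget_internal ~id ~current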
Signed-off-by: Andrii Sultanov --- ocaml/xapi/xapi_xenops.ml | 1110 ++++++++++++++++++------------------- 1 file changed, 542 insertions(+), 568 deletions(-) diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index 54e824188ab..dbefffb1571 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -1852,591 +1852,565 @@ module Events_from_xenopsd = struct ) end -let update_vm ~__context id = - try - if Events_from_xenopsd.are_suppressed id then - debug "xenopsd event: ignoring event for VM (VM %s migrating away)" id - else - let self = Db.VM.get_by_uuid ~__context ~uuid:id in - let localhost = Helpers.get_localhost ~__context in - if Db.VM.get_resident_on ~__context ~self = localhost then - let previous = Xenops_cache.find_vm id in - let dbg = Context.string_of_task_and_tracing __context in - let module Client = - (val make_client (queue_of_vm ~__context ~self) : XENOPS) - in - let info = try Some (Client.VM.stat dbg id) with _ -> None in - if Option.map snd info <> previous then ( - debug "xenopsd event: processing event for VM %s" id ; - if info = None then - debug "xenopsd event: VM state missing: assuming VM has shut down" ; - let should_update_allowed_operations = ref false in - let different f = - let a = Option.map (fun x -> f (snd x)) info in - let b = Option.map f previous in - a <> b - in - (* Helpers to create and update guest metrics when needed *) - let lookup state key = List.assoc_opt key state.Vm.guest_agent in - let list state dir = - let dir = - if dir.[0] = '/' then - String.sub dir 1 (String.length dir - 1) - else - dir - in - let results = - List.filter_map - (fun (path, _) -> - if String.starts_with ~prefix:dir path then - let rest = - String.sub path (String.length dir) - (String.length path - String.length dir) - in - match - List.filter (fun x -> x <> "") (String.split '/' rest) - with - | x :: _ -> - Some x - | _ -> - None - else - None - ) - state.Vm.guest_agent - |> Listext.setify +let update_vm_internal ~__context ~id ~self ~previous ~info ~localhost = + debug "xenopsd event: processing event for VM %s" id ; + if info = None then + debug "xenopsd event: VM state missing: assuming VM has shut down" ; + let should_update_allowed_operations = ref false in + let different f = + let a = Option.map (fun x -> f (snd x)) info in + let b = Option.map f previous in + a <> b + in + (* Helpers to create and update guest metrics when needed *) + let lookup state key = List.assoc_opt key state.Vm.guest_agent in + let list state dir = + let dir = + if dir.[0] = '/' then + String.sub dir 1 (String.length dir - 1) + else + dir + in + let results = + List.filter_map + (fun (path, _) -> + if String.starts_with ~prefix:dir path then + let rest = + String.sub path (String.length dir) + (String.length path - String.length dir) in - results - in - let create_guest_metrics_if_needed () = - let gm = Db.VM.get_guest_metrics ~__context ~self in - if gm = Ref.null then - Option.iter - (fun (_, state) -> - List.iter - (fun domid -> - try - let new_gm_ref = - Xapi_guest_agent.create_and_set_guest_metrics - (lookup state) (list state) ~__context ~domid - ~uuid:id - ~pV_drivers_detected:state.pv_drivers_detected - in - debug - "xenopsd event: created guest metrics %s for VM %s" - (Ref.string_of new_gm_ref) id - with e -> - error "Caught %s: while creating VM %s guest metrics" - (Printexc.to_string e) id - ) - state.domids - ) - info - in - let check_guest_agent () = - Option.iter - (fun (_, state) -> - Option.iter - (fun oldstate -> - let old_ga = 
oldstate.Vm.guest_agent in - let new_ga = state.Vm.guest_agent in - (* Remove memory keys *) - let ignored_keys = - ["data/meminfo_free"; "data/updated"; "data/update_cnt"] - in - let remove_ignored ga = - List.fold_left - (fun acc k -> List.filter (fun x -> fst x <> k) acc) - ga ignored_keys - in - let old_ga = remove_ignored old_ga in - let new_ga = remove_ignored new_ga in - if new_ga <> old_ga then ( - debug - "Will update VM.allowed_operations because guest_agent \ - has changed." ; - should_update_allowed_operations := true - ) else - debug - "Supressing VM.allowed_operations update because \ - guest_agent data is largely the same" - ) - previous ; - List.iter - (fun domid -> - try - debug "xenopsd event: Updating VM %s domid %d guest_agent" - id domid ; - Xapi_guest_agent.all (lookup state) (list state) - ~__context ~domid ~uuid:id - ~pV_drivers_detected:state.pv_drivers_detected - with e -> - error "Caught %s: while updating VM %s guest_agent" - (Printexc.to_string e) id - ) - state.domids - ) - info - in - (* Notes on error handling: if something fails we log and continue, to - maximise the amount of state which is correctly synced. If something - does fail then we may end up permanently out-of-sync until either a - process restart or an event is generated. We may wish to periodically - inject artificial events IF there has been an event sync failure? *) - let power_state = - xenapi_of_xenops_power_state - (Option.map (fun x -> (snd x).Vm.power_state) info) - in - let power_state_before_update = - Db.VM.get_power_state ~__context ~self - in - (* We preserve the current_domain_type of suspended VMs like we preserve - the currently_attached fields for VBDs/VIFs etc - it's important to know - whether suspended VMs are going to resume into PV or PVinPVH for example. - We do this before updating the power_state to maintain the invariant that - any VM that's not `Halted cannot have an unspecified current_domain_type *) - if different (fun x -> x.domain_type) && power_state <> `Suspended - then - Option.iter - (fun (_, state) -> - let metrics = Db.VM.get_metrics ~__context ~self in - let update domain_type = - debug - "xenopsd event: Updating VM %s current_domain_type <- %s" id - (Record_util.domain_type_to_string domain_type) ; - Db.VM_metrics.set_current_domain_type ~__context ~self:metrics - ~value:domain_type - in - match state.Vm.domain_type with - | Domain_HVM -> - update `hvm - | Domain_PV -> - update `pv - | Domain_PVinPVH -> - update `pv_in_pvh - | Domain_PVH -> - update `pvh - | Domain_undefined -> - if power_state <> `Halted then - debug - "xenopsd returned an undefined domain type for \ - non-halted VM %s;assuming this is transient, so not \ - updating current_domain_type" - id - else - update `unspecified - ) - info ; - ( if different (fun x -> x.power_state) then - try - debug - "Will update VM.allowed_operations because power_state has \ - changed." ; - should_update_allowed_operations := true ; - (* Update ha_always_run before the power_state (if needed), to avoid racing - with the HA monitor thread. 
*) - let pool = Helpers.get_pool ~__context in - if - power_state = `Halted - && not - (Db.Pool.get_ha_reboot_vm_on_internal_shutdown ~__context - ~self:pool - ) - then ( - Db.VM.set_ha_always_run ~__context ~self ~value:false ; - debug "Setting ha_always_run on vm=%s as false after shutdown" - (Ref.string_of self) - ) ; - debug "xenopsd event: Updating VM %s power_state <- %s" id - (Record_util.vm_power_state_to_string power_state) ; - - (* NOTE: Pull xenopsd metadata as soon as possible so that - nothing comes inbetween the power state change and the - Xenopsd_metadata.pull and overwrites it. *) - ( if power_state = `Suspended then - let md = Xenopsd_metadata.pull ~__context id in - match md.Metadata.domains with - | None -> - error "Suspended VM has no domain-specific metadata" - | Some x -> - Db.VM.set_last_booted_record ~__context ~self ~value:x ; - debug "VM %s last_booted_record set to %s" - (Ref.string_of self) x - ) ; - - (* This will mark VBDs, VIFs as detached and clear resident_on - if the VM has permanently shutdown. current-operations - should not be reset as there maybe a checkpoint is ongoing*) - Xapi_vm_lifecycle.force_state_reset_keep_current_operations - ~__context ~self ~value:power_state ; - if power_state = `Running then - create_guest_metrics_if_needed () ; - if power_state = `Suspended || power_state = `Halted then ( - Xapi_network.detach_for_vm ~__context ~host:localhost ~vm:self ; - Storage_access.reset ~__context ~vm:self - ) ; - if power_state = `Halted then ( - Xenopsd_metadata.delete ~__context id ; - !trigger_xenapi_reregister () - ) - with e -> - error "Caught %s: while updating VM %s power_state" - (Printexc.to_string e) id - ) ; - ( if different (fun x -> x.domids) then - try - debug - "Will update VM.allowed_operations because domid has changed." ; - should_update_allowed_operations := true ; - debug "xenopsd event: Updating VM %s domid" id ; - Option.iter - (fun (_, state) -> - match state.Vm.domids with - | value :: _ -> - Db.VM.set_domid ~__context ~self - ~value:(Int64.of_int value) - | [] -> - () - (* happens when the VM is shutdown *) - ) - info ; - (* If this is a storage domain, attempt to plug the PBD *) - Option.iter - (fun pbd -> - let (_ : Thread.t) = - Thread.create - (fun () -> - (* Don't block the database update thread *) - Xapi_pbd.plug ~__context ~self:pbd - ) - () - in - () - ) - (System_domains.pbd_of_vm ~__context ~vm:self) - with e -> - error "Caught %s: while updating VM %s domids" - (Printexc.to_string e) id - ) ; - (* consoles *) - ( if different (fun x -> x.consoles) then - try - debug "xenopsd event: Updating VM %s consoles" id ; - Option.iter - (fun (_, state) -> - let address = - Db.Host.get_address ~__context ~self:localhost - in - let uri = - Uri.( - make ~scheme:"https" ~host:address - ~path:Constants.console_uri () - |> to_string - ) - in - let get_uri_from_location loc = - try - let n = String.index loc '?' 
in - String.sub loc 0 n - with Not_found -> loc - in - let current_protocols = - List.map - (fun self -> - ( ( Db.Console.get_protocol ~__context ~self - |> to_xenops_console_protocol - , Db.Console.get_location ~__context ~self - |> get_uri_from_location - ) - , self - ) - ) - (Db.VM.get_consoles ~__context ~self) - in - let new_protocols = - List.map - (fun c -> ((c.Vm.protocol, uri), c)) - state.Vm.consoles - in - (* Destroy consoles that have gone away *) - List.iter - (fun protocol -> - let self = List.assoc protocol current_protocols in - Db.Console.destroy ~__context ~self - ) - (Listext.set_difference - (List.map fst current_protocols) - (List.map fst new_protocols) - ) ; - (* Create consoles that have appeared *) - List.iter - (fun (protocol, _) -> - let ref = Ref.make () in - let uuid = Uuidx.to_string (Uuidx.make ()) in - let location = Printf.sprintf "%s?uuid=%s" uri uuid in - let port = - try - Int64.of_int - (List.find - (fun c -> c.Vm.protocol = protocol) - state.Vm.consoles - ) - .port - with Not_found -> -1L - in - Db.Console.create ~__context ~ref ~uuid - ~protocol:(to_xenapi_console_protocol protocol) - ~location ~vM:self ~other_config:[] ~port - ) - (Listext.set_difference - (List.map fst new_protocols) - (List.map fst current_protocols) - ) - ) - info - with e -> - error "Caught %s: while updating VM %s consoles" - (Printexc.to_string e) id - ) ; - ( if different (fun x -> x.memory_target) then + match List.filter (fun x -> x <> "") (String.split '/' rest) with + | x :: _ -> + Some x + | _ -> + None + else + None + ) + state.Vm.guest_agent + |> Listext.setify + in + results + in + let create_guest_metrics_if_needed () = + let gm = Db.VM.get_guest_metrics ~__context ~self in + if gm = Ref.null then + Option.iter + (fun (_, state) -> + List.iter + (fun domid -> try - Option.iter - (fun (_, state) -> - debug "xenopsd event: Updating VM %s memory_target <- %Ld" - id state.Vm.memory_target ; - Db.VM.set_memory_target ~__context ~self - ~value:state.memory_target - ) - info + let new_gm_ref = + Xapi_guest_agent.create_and_set_guest_metrics (lookup state) + (list state) ~__context ~domid ~uuid:id + ~pV_drivers_detected:state.pv_drivers_detected + in + debug "xenopsd event: created guest metrics %s for VM %s" + (Ref.string_of new_gm_ref) id with e -> - error "Caught %s: while updating VM %s consoles" + error "Caught %s: while creating VM %s guest metrics" (Printexc.to_string e) id - ) ; - ( if different (fun x -> x.rtc_timeoffset) then + ) + state.domids + ) + info + in + let check_guest_agent () = + Option.iter + (fun (_, state) -> + Option.iter + (fun oldstate -> + let old_ga = oldstate.Vm.guest_agent in + let new_ga = state.Vm.guest_agent in + (* Remove memory keys *) + let ignored_keys = + ["data/meminfo_free"; "data/updated"; "data/update_cnt"] + in + let remove_ignored ga = + List.fold_left + (fun acc k -> List.filter (fun x -> fst x <> k) acc) + ga ignored_keys + in + let old_ga = remove_ignored old_ga in + let new_ga = remove_ignored new_ga in + if new_ga <> old_ga then ( + debug + "Will update VM.allowed_operations because guest_agent has \ + changed." 
; + should_update_allowed_operations := true + ) else + debug + "Supressing VM.allowed_operations update because guest_agent \ + data is largely the same" + ) + previous ; + List.iter + (fun domid -> + try + debug "xenopsd event: Updating VM %s domid %d guest_agent" id + domid ; + Xapi_guest_agent.all (lookup state) (list state) ~__context ~domid + ~uuid:id ~pV_drivers_detected:state.pv_drivers_detected + with e -> + error "Caught %s: while updating VM %s guest_agent" + (Printexc.to_string e) id + ) + state.domids + ) + info + in + (* Notes on error handling: if something fails we log and continue, to + maximise the amount of state which is correctly synced. If something + does fail then we may end up permanently out-of-sync until either a + process restart or an event is generated. We may wish to periodically + inject artificial events IF there has been an event sync failure? *) + let power_state = + xenapi_of_xenops_power_state + (Option.map (fun x -> (snd x).Vm.power_state) info) + in + let power_state_before_update = Db.VM.get_power_state ~__context ~self in + (* We preserve the current_domain_type of suspended VMs like we preserve + the currently_attached fields for VBDs/VIFs etc - it's important to know + whether suspended VMs are going to resume into PV or PVinPVH for example. + We do this before updating the power_state to maintain the invariant that + any VM that's not `Halted cannot have an unspecified current_domain_type *) + if different (fun x -> x.domain_type) && power_state <> `Suspended then + Option.iter + (fun (_, state) -> + let metrics = Db.VM.get_metrics ~__context ~self in + let update domain_type = + debug "xenopsd event: Updating VM %s current_domain_type <- %s" id + (Record_util.domain_type_to_string domain_type) ; + Db.VM_metrics.set_current_domain_type ~__context ~self:metrics + ~value:domain_type + in + match state.Vm.domain_type with + | Domain_HVM -> + update `hvm + | Domain_PV -> + update `pv + | Domain_PVinPVH -> + update `pv_in_pvh + | Domain_PVH -> + update `pvh + | Domain_undefined -> + if power_state <> `Halted then + debug + "xenopsd returned an undefined domain type for non-halted VM \ + %s;assuming this is transient, so not updating \ + current_domain_type" + id + else + update `unspecified + ) + info ; + ( if different (fun x -> x.power_state) then + try + debug + "Will update VM.allowed_operations because power_state has changed." ; + should_update_allowed_operations := true ; + (* Update ha_always_run before the power_state (if needed), to avoid racing + with the HA monitor thread. *) + let pool = Helpers.get_pool ~__context in + if + power_state = `Halted + && not + (Db.Pool.get_ha_reboot_vm_on_internal_shutdown ~__context + ~self:pool + ) + then ( + Db.VM.set_ha_always_run ~__context ~self ~value:false ; + debug "Setting ha_always_run on vm=%s as false after shutdown" + (Ref.string_of self) + ) ; + debug "xenopsd event: Updating VM %s power_state <- %s" id + (Record_util.vm_power_state_to_string power_state) ; + + (* NOTE: Pull xenopsd metadata as soon as possible so that + nothing comes inbetween the power state change and the + Xenopsd_metadata.pull and overwrites it. 
*) + ( if power_state = `Suspended then + let md = Xenopsd_metadata.pull ~__context id in + match md.Metadata.domains with + | None -> + error "Suspended VM has no domain-specific metadata" + | Some x -> + Db.VM.set_last_booted_record ~__context ~self ~value:x ; + debug "VM %s last_booted_record set to %s" (Ref.string_of self) + x + ) ; + + (* This will mark VBDs, VIFs as detached and clear resident_on + if the VM has permanently shutdown. current-operations + should not be reset as there maybe a checkpoint is ongoing*) + Xapi_vm_lifecycle.force_state_reset_keep_current_operations ~__context + ~self ~value:power_state ; + if power_state = `Running then + create_guest_metrics_if_needed () ; + if power_state = `Suspended || power_state = `Halted then ( + Xapi_network.detach_for_vm ~__context ~host:localhost ~vm:self ; + Storage_access.reset ~__context ~vm:self + ) ; + if power_state = `Halted then ( + Xenopsd_metadata.delete ~__context id ; + !trigger_xenapi_reregister () + ) + with e -> + error "Caught %s: while updating VM %s power_state" + (Printexc.to_string e) id + ) ; + ( if different (fun x -> x.domids) then + try + debug "Will update VM.allowed_operations because domid has changed." ; + should_update_allowed_operations := true ; + debug "xenopsd event: Updating VM %s domid" id ; + Option.iter + (fun (_, state) -> + match state.Vm.domids with + | value :: _ -> + Db.VM.set_domid ~__context ~self ~value:(Int64.of_int value) + | [] -> + () + (* happens when the VM is shutdown *) + ) + info ; + (* If this is a storage domain, attempt to plug the PBD *) + Option.iter + (fun pbd -> + let (_ : Thread.t) = + Thread.create + (fun () -> + (* Don't block the database update thread *) + Xapi_pbd.plug ~__context ~self:pbd + ) + () + in + () + ) + (System_domains.pbd_of_vm ~__context ~vm:self) + with e -> + error "Caught %s: while updating VM %s domids" (Printexc.to_string e) id + ) ; + (* consoles *) + ( if different (fun x -> x.consoles) then + try + debug "xenopsd event: Updating VM %s consoles" id ; + Option.iter + (fun (_, state) -> + let address = Db.Host.get_address ~__context ~self:localhost in + let uri = + Uri.( + make ~scheme:"https" ~host:address ~path:Constants.console_uri + () + |> to_string + ) + in + let get_uri_from_location loc = try - Option.iter - (fun (_, state) -> - if state.Vm.rtc_timeoffset <> "" then ( - debug - "xenopsd event: Updating VM %s platform:timeoffset <- \ - %s" - id state.rtc_timeoffset ; - ( try - Db.VM.remove_from_platform ~__context ~self - ~key:Vm_platform.timeoffset - with _ -> () - ) ; - Db.VM.add_to_platform ~__context ~self - ~key:Vm_platform.timeoffset ~value:state.rtc_timeoffset + let n = String.index loc '?' 
in + String.sub loc 0 n + with Not_found -> loc + in + let current_protocols = + List.map + (fun self -> + ( ( Db.Console.get_protocol ~__context ~self + |> to_xenops_console_protocol + , Db.Console.get_location ~__context ~self + |> get_uri_from_location ) + , self ) - info - with e -> - error "Caught %s: while updating VM %s rtc/timeoffset" - (Printexc.to_string e) id - ) ; - if different (fun x -> x.hvm) then - Option.iter - (fun (_, state) -> - let metrics = Db.VM.get_metrics ~__context ~self in - debug "xenopsd event: Updating VM %s hvm <- %s" id - (string_of_bool state.Vm.hvm) ; - Db.VM_metrics.set_hvm ~__context ~self:metrics - ~value:state.Vm.hvm - ) - info ; - if different (fun x -> x.nomigrate) then - Option.iter - (fun (_, state) -> - let metrics = Db.VM.get_metrics ~__context ~self in - debug "xenopsd event: Updating VM %s nomigrate <- %s" id - (string_of_bool state.Vm.nomigrate) ; - Db.VM_metrics.set_nomigrate ~__context ~self:metrics - ~value:state.Vm.nomigrate + ) + (Db.VM.get_consoles ~__context ~self) + in + let new_protocols = + List.map (fun c -> ((c.Vm.protocol, uri), c)) state.Vm.consoles + in + (* Destroy consoles that have gone away *) + List.iter + (fun protocol -> + let self = List.assoc protocol current_protocols in + Db.Console.destroy ~__context ~self ) - info ; - if different (fun x -> x.nested_virt) then - Option.iter - (fun (_, state) -> - let metrics = Db.VM.get_metrics ~__context ~self in - debug "xenopsd event: Updating VM %s nested_virt <- %s" id - (string_of_bool state.Vm.nested_virt) ; - Db.VM_metrics.set_nested_virt ~__context ~self:metrics - ~value:state.Vm.nested_virt + (Listext.set_difference + (List.map fst current_protocols) + (List.map fst new_protocols) + ) ; + (* Create consoles that have appeared *) + List.iter + (fun (protocol, _) -> + let ref = Ref.make () in + let uuid = Uuidx.to_string (Uuidx.make ()) in + let location = Printf.sprintf "%s?uuid=%s" uri uuid in + let port = + try + Int64.of_int + (List.find + (fun c -> c.Vm.protocol = protocol) + state.Vm.consoles + ) + .port + with Not_found -> -1L + in + Db.Console.create ~__context ~ref ~uuid + ~protocol:(to_xenapi_console_protocol protocol) + ~location ~vM:self ~other_config:[] ~port ) - info ; - let update_pv_drivers_detected () = - Option.iter - (fun (_, state) -> - try - let gm = Db.VM.get_guest_metrics ~__context ~self in - debug "xenopsd event: Updating VM %s PV drivers detected %b" - id state.Vm.pv_drivers_detected ; - Db.VM_guest_metrics.set_PV_drivers_detected ~__context - ~self:gm ~value:state.Vm.pv_drivers_detected ; - Db.VM_guest_metrics.set_PV_drivers_up_to_date ~__context - ~self:gm ~value:state.Vm.pv_drivers_detected - with e -> - debug "Caught %s: while updating VM %s PV drivers" - (Printexc.to_string e) id + (Listext.set_difference + (List.map fst new_protocols) + (List.map fst current_protocols) ) - info - in - (* Chack last_start_time before updating anything in the guest metrics *) - ( if different (fun x -> x.last_start_time) then - try - Option.iter - (fun (_, state) -> - let metrics = Db.VM.get_metrics ~__context ~self in - (* Clamp time to full seconds, stored timestamps do not - have decimals *) - let start_time = - Float.floor state.Vm.last_start_time |> Date.of_unix_time - in - let expected_time = - Db.VM_metrics.get_start_time ~__context ~self:metrics - in - if Date.is_later ~than:expected_time start_time then ( - debug - "xenopsd event: Updating VM %s last_start_time <- %s" id - Date.(to_rfc3339 (of_unix_time state.Vm.last_start_time)) ; - 
Db.VM_metrics.set_start_time ~__context ~self:metrics - ~value:start_time ; - if - (* VM start and VM reboot *) - power_state = `Running - && power_state_before_update <> `Suspended - then ( - Xapi_vm_lifecycle.remove_pending_guidance ~__context - ~self ~value:`restart_device_model ; - Xapi_vm_lifecycle.remove_pending_guidance ~__context - ~self ~value:`restart_vm - ) - ) ; - create_guest_metrics_if_needed () ; - let gm = Db.VM.get_guest_metrics ~__context ~self in - let update_time = - Db.VM_guest_metrics.get_last_updated ~__context ~self:gm - in - if update_time < start_time then ( - debug - "VM %s guest metrics update time (%s) < VM start time \ - (%s): deleting" - id - (Date.to_rfc3339 update_time) - (Date.to_rfc3339 start_time) ; - Xapi_vm_helpers.delete_guest_metrics ~__context ~self ; - check_guest_agent () - ) - ) - info - with e -> - error "Caught %s: while updating VM %s last_start_time" - (Printexc.to_string e) id - ) ; - Option.iter - (fun (_, state) -> - List.iter - (fun domid -> - (* Guest metrics could have been destroyed during the last_start_time check - by recreating them, we avoid CA-223387 *) - create_guest_metrics_if_needed () ; - if different (fun x -> x.Vm.uncooperative_balloon_driver) then - debug - "xenopsd event: VM %s domid %d \ - uncooperative_balloon_driver = %b" - id domid state.Vm.uncooperative_balloon_driver ; - if different (fun x -> x.Vm.guest_agent) then - check_guest_agent () ; - if different (fun x -> x.Vm.pv_drivers_detected) then - update_pv_drivers_detected () ; - ( if different (fun x -> x.Vm.xsdata_state) then - try - debug "xenopsd event: Updating VM %s domid %d xsdata" id - domid ; - Db.VM.set_xenstore_data ~__context ~self - ~value:state.Vm.xsdata_state - with e -> - error "Caught %s: while updating VM %s xsdata" - (Printexc.to_string e) id - ) ; - if different (fun x -> x.Vm.memory_target) then - try - debug - "xenopsd event: Updating VM %s domid %d memory target" - id domid ; - Rrdd.update_vm_memory_target domid state.Vm.memory_target - with e -> - error "Caught %s: while updating VM %s memory_target" - (Printexc.to_string e) id - ) - state.Vm.domids + ) + info + with e -> + error "Caught %s: while updating VM %s consoles" (Printexc.to_string e) + id + ) ; + ( if different (fun x -> x.memory_target) then + try + Option.iter + (fun (_, state) -> + debug "xenopsd event: Updating VM %s memory_target <- %Ld" id + state.Vm.memory_target ; + Db.VM.set_memory_target ~__context ~self ~value:state.memory_target + ) + info + with e -> + error "Caught %s: while updating VM %s consoles" (Printexc.to_string e) + id + ) ; + ( if different (fun x -> x.rtc_timeoffset) then + try + Option.iter + (fun (_, state) -> + if state.Vm.rtc_timeoffset <> "" then ( + debug "xenopsd event: Updating VM %s platform:timeoffset <- %s" id + state.rtc_timeoffset ; + ( try + Db.VM.remove_from_platform ~__context ~self + ~key:Vm_platform.timeoffset + with _ -> () + ) ; + Db.VM.add_to_platform ~__context ~self ~key:Vm_platform.timeoffset + ~value:state.rtc_timeoffset ) - info ; - if different (fun x -> x.Vm.vcpu_target) then - Option.iter - (fun (_, state) -> - try - debug "xenopsd event: Updating VM %s vcpu_target <- %d" id - state.Vm.vcpu_target ; - let metrics = Db.VM.get_metrics ~__context ~self in - Db.VM_metrics.set_VCPUs_number ~__context ~self:metrics - ~value:(Int64.of_int state.Vm.vcpu_target) - with e -> - error "Caught %s: while updating VM %s VCPUs_number" - (Printexc.to_string e) id + ) + info + with e -> + error "Caught %s: while updating VM %s rtc/timeoffset" + 
(Printexc.to_string e) id + ) ; + if different (fun x -> x.hvm) then + Option.iter + (fun (_, state) -> + let metrics = Db.VM.get_metrics ~__context ~self in + debug "xenopsd event: Updating VM %s hvm <- %s" id + (string_of_bool state.Vm.hvm) ; + Db.VM_metrics.set_hvm ~__context ~self:metrics ~value:state.Vm.hvm + ) + info ; + if different (fun x -> x.nomigrate) then + Option.iter + (fun (_, state) -> + let metrics = Db.VM.get_metrics ~__context ~self in + debug "xenopsd event: Updating VM %s nomigrate <- %s" id + (string_of_bool state.Vm.nomigrate) ; + Db.VM_metrics.set_nomigrate ~__context ~self:metrics + ~value:state.Vm.nomigrate + ) + info ; + if different (fun x -> x.nested_virt) then + Option.iter + (fun (_, state) -> + let metrics = Db.VM.get_metrics ~__context ~self in + debug "xenopsd event: Updating VM %s nested_virt <- %s" id + (string_of_bool state.Vm.nested_virt) ; + Db.VM_metrics.set_nested_virt ~__context ~self:metrics + ~value:state.Vm.nested_virt + ) + info ; + let update_pv_drivers_detected () = + Option.iter + (fun (_, state) -> + try + let gm = Db.VM.get_guest_metrics ~__context ~self in + debug "xenopsd event: Updating VM %s PV drivers detected %b" id + state.Vm.pv_drivers_detected ; + Db.VM_guest_metrics.set_PV_drivers_detected ~__context ~self:gm + ~value:state.Vm.pv_drivers_detected ; + Db.VM_guest_metrics.set_PV_drivers_up_to_date ~__context ~self:gm + ~value:state.Vm.pv_drivers_detected + with e -> + debug "Caught %s: while updating VM %s PV drivers" + (Printexc.to_string e) id + ) + info + in + (* Chack last_start_time before updating anything in the guest metrics *) + ( if different (fun x -> x.last_start_time) then + try + Option.iter + (fun (_, state) -> + let metrics = Db.VM.get_metrics ~__context ~self in + (* Clamp time to full seconds, stored timestamps do not + have decimals *) + let start_time = + Float.floor state.Vm.last_start_time |> Date.of_unix_time + in + let expected_time = + Db.VM_metrics.get_start_time ~__context ~self:metrics + in + if Date.is_later ~than:expected_time start_time then ( + debug "xenopsd event: Updating VM %s last_start_time <- %s" id + Date.(to_rfc3339 (of_unix_time state.Vm.last_start_time)) ; + Db.VM_metrics.set_start_time ~__context ~self:metrics + ~value:start_time ; + if + (* VM start and VM reboot *) + power_state = `Running + && power_state_before_update <> `Suspended + then ( + Xapi_vm_lifecycle.remove_pending_guidance ~__context ~self + ~value:`restart_device_model ; + Xapi_vm_lifecycle.remove_pending_guidance ~__context ~self + ~value:`restart_vm ) - info ; - ( if different (fun x -> x.shadow_multiplier_target) then + ) ; + create_guest_metrics_if_needed () ; + let gm = Db.VM.get_guest_metrics ~__context ~self in + let update_time = + Db.VM_guest_metrics.get_last_updated ~__context ~self:gm + in + if update_time < start_time then ( + debug + "VM %s guest metrics update time (%s) < VM start time (%s): \ + deleting" + id + (Date.to_rfc3339 update_time) + (Date.to_rfc3339 start_time) ; + Xapi_vm_helpers.delete_guest_metrics ~__context ~self ; + check_guest_agent () + ) + ) + info + with e -> + error "Caught %s: while updating VM %s last_start_time" + (Printexc.to_string e) id + ) ; + Option.iter + (fun (_, state) -> + List.iter + (fun domid -> + (* Guest metrics could have been destroyed during the last_start_time check + by recreating them, we avoid CA-223387 *) + create_guest_metrics_if_needed () ; + if different (fun x -> x.Vm.uncooperative_balloon_driver) then + debug + "xenopsd event: VM %s domid %d 
uncooperative_balloon_driver = %b" + id domid state.Vm.uncooperative_balloon_driver ; + if different (fun x -> x.Vm.guest_agent) then + check_guest_agent () ; + if different (fun x -> x.Vm.pv_drivers_detected) then + update_pv_drivers_detected () ; + ( if different (fun x -> x.Vm.xsdata_state) then try - Option.iter - (fun (_, state) -> - debug - "xenopsd event: Updating VM %s shadow_multiplier <- %.2f" - id state.Vm.shadow_multiplier_target ; - if - state.Vm.power_state <> Halted - && state.Vm.shadow_multiplier_target >= 0.0 - then - Db.VM.set_HVM_shadow_multiplier ~__context ~self - ~value:state.Vm.shadow_multiplier_target - ) - info + debug "xenopsd event: Updating VM %s domid %d xsdata" id domid ; + Db.VM.set_xenstore_data ~__context ~self + ~value:state.Vm.xsdata_state with e -> - error "Caught %s: while updating VM %s HVM_shadow_multiplier" + error "Caught %s: while updating VM %s xsdata" (Printexc.to_string e) id ) ; - (* Preserve last_boot_CPU_flags when suspending (see current_domain_type) *) - if different (fun x -> x.Vm.featureset) && power_state <> `Suspended - then - Option.iter - (fun (_, state) -> - try - debug - "xenopsd event: Updating VM %s last_boot_CPU_flags <- %s" id - state.Vm.featureset ; - let vendor = - Db.Host.get_cpu_info ~__context ~self:localhost - |> List.assoc Constants.cpu_info_vendor_key - in - let value = - [ - (Constants.cpu_info_vendor_key, vendor) - ; (Constants.cpu_info_features_key, state.Vm.featureset) - ] - in - Db.VM.set_last_boot_CPU_flags ~__context ~self ~value - with e -> - error "Caught %s: while updating VM %s last_boot_CPU_flags" - (Printexc.to_string e) id - ) - info ; - Xenops_cache.update_vm id (Option.map snd info) ; - if !should_update_allowed_operations then - Helpers.call_api_functions ~__context (fun rpc session_id -> - XenAPI.VM.update_allowed_operations ~rpc ~session_id ~self - ) + if different (fun x -> x.Vm.memory_target) then + try + debug "xenopsd event: Updating VM %s domid %d memory target" id + domid ; + Rrdd.update_vm_memory_target domid state.Vm.memory_target + with e -> + error "Caught %s: while updating VM %s memory_target" + (Printexc.to_string e) id ) + state.Vm.domids + ) + info ; + if different (fun x -> x.Vm.vcpu_target) then + Option.iter + (fun (_, state) -> + try + debug "xenopsd event: Updating VM %s vcpu_target <- %d" id + state.Vm.vcpu_target ; + let metrics = Db.VM.get_metrics ~__context ~self in + Db.VM_metrics.set_VCPUs_number ~__context ~self:metrics + ~value:(Int64.of_int state.Vm.vcpu_target) + with e -> + error "Caught %s: while updating VM %s VCPUs_number" + (Printexc.to_string e) id + ) + info ; + ( if different (fun x -> x.shadow_multiplier_target) then + try + Option.iter + (fun (_, state) -> + debug "xenopsd event: Updating VM %s shadow_multiplier <- %.2f" id + state.Vm.shadow_multiplier_target ; + if + state.Vm.power_state <> Halted + && state.Vm.shadow_multiplier_target >= 0.0 + then + Db.VM.set_HVM_shadow_multiplier ~__context ~self + ~value:state.Vm.shadow_multiplier_target + ) + info + with e -> + error "Caught %s: while updating VM %s HVM_shadow_multiplier" + (Printexc.to_string e) id + ) ; + (* Preserve last_boot_CPU_flags when suspending (see current_domain_type) *) + if different (fun x -> x.Vm.featureset) && power_state <> `Suspended then + Option.iter + (fun (_, state) -> + try + debug "xenopsd event: Updating VM %s last_boot_CPU_flags <- %s" id + state.Vm.featureset ; + let vendor = + Db.Host.get_cpu_info ~__context ~self:localhost + |> List.assoc Constants.cpu_info_vendor_key + in + 
let value = + [ + (Constants.cpu_info_vendor_key, vendor) + ; (Constants.cpu_info_features_key, state.Vm.featureset) + ] + in + Db.VM.set_last_boot_CPU_flags ~__context ~self ~value + with e -> + error "Caught %s: while updating VM %s last_boot_CPU_flags" + (Printexc.to_string e) id + ) + info ; + Xenops_cache.update_vm id (Option.map snd info) ; + if !should_update_allowed_operations then + Helpers.call_api_functions ~__context (fun rpc session_id -> + XenAPI.VM.update_allowed_operations ~rpc ~session_id ~self + ) + +let update_vm ~__context id = + try + if Events_from_xenopsd.are_suppressed id then + debug "xenopsd event: ignoring event for VM (VM %s migrating away)" id + else + let self = Db.VM.get_by_uuid ~__context ~uuid:id in + let localhost = Helpers.get_localhost ~__context in + if Db.VM.get_resident_on ~__context ~self = localhost then + let previous = Xenops_cache.find_vm id in + let dbg = Context.string_of_task_and_tracing __context in + let module Client = + (val make_client (queue_of_vm ~__context ~self) : XENOPS) + in + let info = try Some (Client.VM.stat dbg id) with _ -> None in + if Option.map snd info <> previous then + update_vm_internal ~__context ~id ~self ~previous ~info ~localhost with e -> error "xenopsd event: Caught %s while updating VM: has this VM been removed \ From 58e9def69a7ba8d7db045ccfff52001c79313c8d Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Fri, 20 Jun 2025 03:29:43 +0000 Subject: [PATCH 039/111] CA-408552: 1/3 Improve bootstorm performance by saving db ops The events_from_xenopsd thread is critical as it syncs up VM status; during a bootstorm this thread is flooded with events coming from xenopsd and waiting to be processed. While processing the events, VM/VDI/VBD update_allowed_operations will be called to refresh the allowed operations. However, for each op (start/suspend, etc.) on the same object (VM), the object info is always the same no matter what the op is. Thus, it is not necessary to query the object information over and over again. A closure is used to resolve the issue: query once, and the closure remembers the query result. A performance test starting 500 VMs on 4 hosts shows around a 10% improvement for both XS8 and XS9. This commit just introduces the closure; callers call the closure instead of the original function Signed-off-by: Lin Liu --- ocaml/xapi/xapi_vdi.ml | 685 ++++++++++++++++---------------- ocaml/xapi/xapi_vm_lifecycle.ml | 7 +- 2 files changed, 347 insertions(+), 345 deletions(-) diff --git a/ocaml/xapi/xapi_vdi.ml b/ocaml/xapi/xapi_vdi.ml index 15dff1df4d8..9691f3831cd 100644 --- a/ocaml/xapi/xapi_vdi.ml +++ b/ocaml/xapi/xapi_vdi.ml @@ -63,7 +63,7 @@ let check_sm_feature_error (op : API.vdi_operations) sm_features sr = specified, it should contain at least all the VBD records from the database that are linked to this VDI. *) let check_operation_error ~__context ?sr_records:_ ?(pbd_records = []) - ?vbd_records ha_enabled record _ref' op = + ?vbd_records ha_enabled record _ref' = let ( let* ) = Result.bind in let _ref = Ref.string_of _ref' in let current_ops = record.Db_actions.vDI_current_operations in @@ -83,360 +83,370 @@ let check_operation_error ~__context ?sr_records:_ ?(pbd_records = []) 5. HA prevents you from deleting statefiles or metadata volumes 6. 
During rolling pool upgrade, only operations known by older releases are allowed *) - let* () = - if - Helpers.rolling_upgrade_in_progress ~__context - && not - (Xapi_globs.Vdi_operations_set.mem op - Xapi_globs.rpu_allowed_vdi_operations - ) - then - Error (Api_errors.not_supported_during_upgrade, []) - else - Ok () - in - let* () = - (* Don't fail with other_operation_in_progress if VDI mirroring is in - progress and destroy is called as part of VDI mirroring *) - let is_vdi_mirroring_in_progress = - op = `destroy && List.exists (fun (_, op) -> op = `mirror) current_ops + fun op -> + let* () = + if + Helpers.rolling_upgrade_in_progress ~__context + && not + (Xapi_globs.Vdi_operations_set.mem op + Xapi_globs.rpu_allowed_vdi_operations + ) + then + Error (Api_errors.not_supported_during_upgrade, []) + else + Ok () in - if - List.exists (fun (_, op) -> op <> `copy) current_ops - && not is_vdi_mirroring_in_progress - then - Error (Api_errors.other_operation_in_progress, ["VDI"; _ref]) - else - Ok () - in - (* check to see whether it's a local cd drive *) - let sr = record.Db_actions.vDI_SR in - let sr_type = Db.SR.get_type ~__context ~self:sr in - let is_tools_sr = Db.SR.get_is_tools_sr ~__context ~self:sr in - (* Check to see if any PBDs are attached *) - let open Xapi_database.Db_filter_types in - let pbds_attached = - match pbd_records with - | [] -> - Db.PBD.get_records_where ~__context - ~expr: - (And - ( Eq (Field "SR", Literal (Ref.string_of sr)) - , Eq (Field "currently_attached", Literal "true") - ) + let* () = + (* Don't fail with other_operation_in_progress if VDI mirroring is in + progress and destroy is called as part of VDI mirroring *) + let is_vdi_mirroring_in_progress = + op = `destroy && List.exists (fun (_, op) -> op = `mirror) current_ops + in + if + List.exists (fun (_, op) -> op <> `copy) current_ops + && not is_vdi_mirroring_in_progress + then + Error (Api_errors.other_operation_in_progress, ["VDI"; _ref]) + else + Ok () + in + (* check to see whether it's a local cd drive *) + let sr = record.Db_actions.vDI_SR in + let sr_type = Db.SR.get_type ~__context ~self:sr in + let is_tools_sr = Db.SR.get_is_tools_sr ~__context ~self:sr in + (* Check to see if any PBDs are attached *) + let open Xapi_database.Db_filter_types in + let pbds_attached = + match pbd_records with + | [] -> + Db.PBD.get_records_where ~__context + ~expr: + (And + ( Eq (Field "SR", Literal (Ref.string_of sr)) + , Eq (Field "currently_attached", Literal "true") + ) + ) + | _ -> + List.filter + (fun (_, pbd_record) -> + pbd_record.API.pBD_SR = sr + && pbd_record.API.pBD_currently_attached ) - | _ -> - List.filter - (fun (_, pbd_record) -> - pbd_record.API.pBD_SR = sr && pbd_record.API.pBD_currently_attached - ) - pbd_records - in - let* () = - if pbds_attached = [] && op = `resize then - Error (Api_errors.sr_no_pbds, [Ref.string_of sr]) - else - Ok () - in - - (* check to see whether VBDs exist which are using this VDI *) - - (* Only a 'live' operation can be performed if there are active (even RO) devices *) - let my_active_vbd_records = - match vbd_records with - | None -> - List.map snd - (Db.VBD.get_internal_records_where ~__context - ~expr: - (And - ( Eq (Field "VDI", Literal _ref) - , Or - ( Eq (Field "currently_attached", Literal "true") - , Eq (Field "reserved", Literal "true") - ) - ) - ) - ) - | Some records -> - List.filter - (fun vbd_record -> - vbd_record.Db_actions.vBD_VDI = _ref' - && (vbd_record.Db_actions.vBD_currently_attached - || vbd_record.Db_actions.vBD_reserved - ) - ) - records - 
in - let my_active_rw_vbd_records = - List.filter (fun vbd -> vbd.Db_actions.vBD_mode = `RW) my_active_vbd_records - in - (* VBD operations (plug/unplug) (which should be transient) cause us to serialise *) - let my_has_current_operation_vbd_records = - match vbd_records with - | None -> - List.map snd - (Db.VBD.get_internal_records_where ~__context - ~expr: - (And - ( Eq (Field "VDI", Literal _ref) - , Not (Eq (Field "current_operations", Literal "()")) - ) - ) - ) - | Some records -> - List.filter - (fun vbd_record -> - vbd_record.Db_actions.vBD_VDI = _ref' - && vbd_record.Db_actions.vBD_current_operations <> [] - ) - records - in - (* If the VBD is currently_attached then some operations can still be - performed ie: VDI.clone (if the VM is suspended we have to have the - 'allow_clone_suspended_vm' flag); VDI.snapshot; VDI.resize_online; - 'blocked' (CP-831); VDI.data_destroy: it is not allowed on VDIs linked - to a VM, but the implementation first waits for the VDI's VBDs to be - unplugged and destroyed, and the checks are performed there. - *) - let operation_can_be_performed_live = - match op with - | `snapshot - | `resize_online - | `blocked - | `clone - | `mirror - | `enable_cbt - | `disable_cbt - | `data_destroy -> - true - | _ -> - false - in - let operation_can_be_performed_with_ro_attach = - operation_can_be_performed_live - || match op with `copy -> true | _ -> false - in - (* NB RO vs RW sharing checks are done in xapi_vbd.ml *) - let blocked_by_attach = - let blocked_by_attach = - if operation_can_be_performed_live then - false - else if operation_can_be_performed_with_ro_attach then - my_active_rw_vbd_records <> [] + pbd_records + in + let* () = + if pbds_attached = [] && op = `resize then + Error (Api_errors.sr_no_pbds, [Ref.string_of sr]) else - my_active_vbd_records <> [] + Ok () in - let allow_attached_vbds = - (* We use Valid_ref_list.list to ignore exceptions due to invalid - references that could propagate to the message forwarding layer, which - calls this function to check for errors - these exceptions would - prevent the actual XenAPI function from being run. Checks called from - the message forwarding layer should not fail with an exception. 
*) - let true_for_all_active_vbds f = - Valid_ref_list.for_all f my_active_vbd_records - in + + (* check to see whether VBDs exist which are using this VDI *) + + (* Only a 'live' operation can be performed if there are active (even RO) devices *) + let my_active_vbd_records = + match vbd_records with + | None -> + List.map snd + (Db.VBD.get_internal_records_where ~__context + ~expr: + (And + ( Eq (Field "VDI", Literal _ref) + , Or + ( Eq (Field "currently_attached", Literal "true") + , Eq (Field "reserved", Literal "true") + ) + ) + ) + ) + | Some records -> + List.filter + (fun vbd_record -> + vbd_record.Db_actions.vBD_VDI = _ref' + && (vbd_record.Db_actions.vBD_currently_attached + || vbd_record.Db_actions.vBD_reserved + ) + ) + records + in + let my_active_rw_vbd_records = + List.filter + (fun vbd -> vbd.Db_actions.vBD_mode = `RW) + my_active_vbd_records + in + (* VBD operations (plug/unplug) (which should be transient) cause us to serialise *) + let my_has_current_operation_vbd_records = + match vbd_records with + | None -> + List.map snd + (Db.VBD.get_internal_records_where ~__context + ~expr: + (And + ( Eq (Field "VDI", Literal _ref) + , Not (Eq (Field "current_operations", Literal "()")) + ) + ) + ) + | Some records -> + List.filter + (fun vbd_record -> + vbd_record.Db_actions.vBD_VDI = _ref' + && vbd_record.Db_actions.vBD_current_operations <> [] + ) + records + in + (* If the VBD is currently_attached then some operations can still be + performed ie: VDI.clone (if the VM is suspended we have to have the + 'allow_clone_suspended_vm' flag); VDI.snapshot; VDI.resize_online; + 'blocked' (CP-831); VDI.data_destroy: it is not allowed on VDIs linked + to a VM, but the implementation first waits for the VDI's VBDs to be + unplugged and destroyed, and the checks are performed there. + *) + let operation_can_be_performed_live = match op with - | `list_changed_blocks -> - let vbd_connected_to_vm_snapshot vbd = - let vm = vbd.Db_actions.vBD_VM in - Db.is_valid_ref __context vm - && Db.VM.get_is_a_snapshot ~__context ~self:vm - in - (* We allow list_changed_blocks on VDIs attached to snapshot VMs, - because VM.checkpoint may set the currently_attached fields of the - snapshot's VBDs to true, and this would block list_changed_blocks. *) - true_for_all_active_vbds vbd_connected_to_vm_snapshot + | `snapshot + | `resize_online + | `blocked + | `clone + | `mirror + | `enable_cbt + | `disable_cbt + | `data_destroy -> + true | _ -> false in - blocked_by_attach && not allow_attached_vbds - in - let* () = - if blocked_by_attach then - Error - (Api_errors.vdi_in_use, [_ref; Record_util.vdi_operations_to_string op]) - else if - (* data_destroy first waits for all the VBDs to disappear in its - implementation, so it is harmless to allow it when any of the VDI's - VBDs have operations in progress. This ensures that we avoid the retry - mechanism of message forwarding and only use the event loop. 
*) - my_has_current_operation_vbd_records <> [] && op <> `data_destroy - then - Error (Api_errors.other_operation_in_progress, ["VDI"; _ref]) - else - Ok () - in - let sm_features = - Xapi_sr_operations.features_of_sr_internal ~__context ~_type:sr_type - in - let* () = check_sm_feature_error op sm_features sr in - let allowed_for_cbt_metadata_vdi = - match op with - | `clone - | `copy - | `disable_cbt - | `enable_cbt - | `mirror - | `resize - | `resize_online - | `snapshot - | `set_on_boot -> - false - | `blocked - | `data_destroy - | `destroy - | `list_changed_blocks - | `force_unlock - | `forget - | `generate_config - | `update -> - true - in - let* () = - if - (not allowed_for_cbt_metadata_vdi) - && record.Db_actions.vDI_type = `cbt_metadata - then - Error - ( Api_errors.vdi_incompatible_type - , [_ref; Record_util.vdi_type_to_string `cbt_metadata] - ) - else - Ok () - in - let allowed_when_cbt_enabled = - match op with - | `mirror | `set_on_boot -> - false - | `blocked - | `clone - | `copy - | `data_destroy - | `destroy - | `disable_cbt - | `enable_cbt - | `list_changed_blocks - | `force_unlock - | `forget - | `generate_config - | `resize - | `resize_online - | `snapshot - | `update -> - true - in - let* () = - if (not allowed_when_cbt_enabled) && record.Db_actions.vDI_cbt_enabled then - Error (Api_errors.vdi_cbt_enabled, [_ref]) - else - Ok () - in - let check_destroy () = - if sr_type = "udev" then - Error (Api_errors.vdi_is_a_physical_device, [_ref]) - else if is_tools_sr then - Error (Api_errors.sr_operation_not_supported, [Ref.string_of sr]) - else if List.mem record.Db_actions.vDI_type [`rrd] then - Error (Api_errors.vdi_has_rrds, [_ref]) - else if - ha_enabled - && List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] - then - Error (Api_errors.ha_is_enabled, []) - else if - List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] - && Xapi_pool_helpers.ha_enable_in_progress ~__context - then - Error (Api_errors.ha_enable_in_progress, []) - else if - List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] - && Xapi_pool_helpers.ha_disable_in_progress ~__context - then - Error (Api_errors.ha_disable_in_progress, []) - else - Ok () - in - match op with - | `forget -> - if - ha_enabled - && List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] + let operation_can_be_performed_with_ro_attach = + operation_can_be_performed_live + || match op with `copy -> true | _ -> false + in + (* NB RO vs RW sharing checks are done in xapi_vbd.ml *) + let blocked_by_attach = + let blocked_by_attach = + if operation_can_be_performed_live then + false + else if operation_can_be_performed_with_ro_attach then + my_active_rw_vbd_records <> [] + else + my_active_vbd_records <> [] + in + let allow_attached_vbds = + (* We use Valid_ref_list.list to ignore exceptions due to invalid + references that could propagate to the message forwarding layer, which + calls this function to check for errors - these exceptions would + prevent the actual XenAPI function from being run. Checks called from + the message forwarding layer should not fail with an exception. 
*) + let true_for_all_active_vbds f = + Valid_ref_list.for_all f my_active_vbd_records + in + match op with + | `list_changed_blocks -> + let vbd_connected_to_vm_snapshot vbd = + let vm = vbd.Db_actions.vBD_VM in + Db.is_valid_ref __context vm + && Db.VM.get_is_a_snapshot ~__context ~self:vm + in + (* We allow list_changed_blocks on VDIs attached to snapshot VMs, + because VM.checkpoint may set the currently_attached fields of the + snapshot's VBDs to true, and this would block list_changed_blocks. *) + true_for_all_active_vbds vbd_connected_to_vm_snapshot + | _ -> + false + in + blocked_by_attach && not allow_attached_vbds + in + let* () = + if blocked_by_attach then + Error + ( Api_errors.vdi_in_use + , [_ref; Record_util.vdi_operations_to_string op] + ) + else if + (* data_destroy first waits for all the VBDs to disappear in its + implementation, so it is harmless to allow it when any of the VDI's + VBDs have operations in progress. This ensures that we avoid the retry + mechanism of message forwarding and only use the event loop. *) + my_has_current_operation_vbd_records <> [] && op <> `data_destroy then - Error (Api_errors.ha_is_enabled, []) - else if List.mem record.Db_actions.vDI_type [`rrd] then - Error (Api_errors.vdi_has_rrds, [_ref]) + Error (Api_errors.other_operation_in_progress, ["VDI"; _ref]) else Ok () - | `destroy -> - check_destroy () - | `data_destroy -> - if not record.Db_actions.vDI_is_a_snapshot then + in + let sm_features = + Xapi_sr_operations.features_of_sr_internal ~__context ~_type:sr_type + in + let* () = check_sm_feature_error op sm_features sr in + let allowed_for_cbt_metadata_vdi = + match op with + | `clone + | `copy + | `disable_cbt + | `enable_cbt + | `mirror + | `resize + | `resize_online + | `snapshot + | `set_on_boot -> + false + | `blocked + | `data_destroy + | `destroy + | `list_changed_blocks + | `force_unlock + | `forget + | `generate_config + | `update -> + true + in + let* () = + if + (not allowed_for_cbt_metadata_vdi) + && record.Db_actions.vDI_type = `cbt_metadata + then Error - (Api_errors.operation_not_allowed, ["VDI is not a snapshot: " ^ _ref]) - else if not record.Db_actions.vDI_cbt_enabled then - Error (Api_errors.vdi_no_cbt_metadata, [_ref]) + ( Api_errors.vdi_incompatible_type + , [_ref; Record_util.vdi_type_to_string `cbt_metadata] + ) else - check_destroy () - | `resize -> - if - ha_enabled - && List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] + Ok () + in + let allowed_when_cbt_enabled = + match op with + | `mirror | `set_on_boot -> + false + | `blocked + | `clone + | `copy + | `data_destroy + | `destroy + | `disable_cbt + | `enable_cbt + | `list_changed_blocks + | `force_unlock + | `forget + | `generate_config + | `resize + | `resize_online + | `snapshot + | `update -> + true + in + let* () = + if (not allowed_when_cbt_enabled) && record.Db_actions.vDI_cbt_enabled then - Error (Api_errors.ha_is_enabled, []) + Error (Api_errors.vdi_cbt_enabled, [_ref]) else Ok () - | `resize_online -> - if + in + let check_destroy () = + if sr_type = "udev" then + Error (Api_errors.vdi_is_a_physical_device, [_ref]) + else if is_tools_sr then + Error (Api_errors.sr_operation_not_supported, [Ref.string_of sr]) + else if List.mem record.Db_actions.vDI_type [`rrd] then + Error (Api_errors.vdi_has_rrds, [_ref]) + else if ha_enabled && List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] then Error (Api_errors.ha_is_enabled, []) + else if + List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] + && 
Xapi_pool_helpers.ha_enable_in_progress ~__context + then + Error (Api_errors.ha_enable_in_progress, []) + else if + List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] + && Xapi_pool_helpers.ha_disable_in_progress ~__context + then + Error (Api_errors.ha_disable_in_progress, []) else Ok () - | `snapshot when record.Db_actions.vDI_sharable -> - Error (Api_errors.vdi_is_sharable, [_ref]) - | (`snapshot | `copy) when reset_on_boot -> - Error (Api_errors.vdi_on_boot_mode_incompatible_with_operation, []) - | `snapshot -> - if List.exists (fun (_, op) -> op = `copy) current_ops then - Error - ( Api_errors.operation_not_allowed - , ["Snapshot operation not allowed during copy."] - ) - else - Ok () - | `copy -> - if List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] then - Error - ( Api_errors.operation_not_allowed - , [ - "VDI containing HA statefile or redo log cannot be copied (check \ - the VDI's allowed operations)." - ] - ) - else - Ok () - | `enable_cbt | `disable_cbt -> - if record.Db_actions.vDI_is_a_snapshot then - Error (Api_errors.operation_not_allowed, ["VDI is a snapshot: " ^ _ref]) - else if not (List.mem record.Db_actions.vDI_type [`user; `system]) then - Error - ( Api_errors.vdi_incompatible_type - , [_ref; Record_util.vdi_type_to_string record.Db_actions.vDI_type] - ) - else if reset_on_boot then + in + match op with + | `forget -> + if + ha_enabled + && List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] + then + Error (Api_errors.ha_is_enabled, []) + else if List.mem record.Db_actions.vDI_type [`rrd] then + Error (Api_errors.vdi_has_rrds, [_ref]) + else + Ok () + | `destroy -> + check_destroy () + | `data_destroy -> + if not record.Db_actions.vDI_is_a_snapshot then + Error + ( Api_errors.operation_not_allowed + , ["VDI is not a snapshot: " ^ _ref] + ) + else if not record.Db_actions.vDI_cbt_enabled then + Error (Api_errors.vdi_no_cbt_metadata, [_ref]) + else + check_destroy () + | `resize -> + if + ha_enabled + && List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] + then + Error (Api_errors.ha_is_enabled, []) + else + Ok () + | `resize_online -> + if + ha_enabled + && List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] + then + Error (Api_errors.ha_is_enabled, []) + else + Ok () + | `snapshot when record.Db_actions.vDI_sharable -> + Error (Api_errors.vdi_is_sharable, [_ref]) + | (`snapshot | `copy) when reset_on_boot -> Error (Api_errors.vdi_on_boot_mode_incompatible_with_operation, []) - else + | `snapshot -> + if List.exists (fun (_, op) -> op = `copy) current_ops then + Error + ( Api_errors.operation_not_allowed + , ["Snapshot operation not allowed during copy."] + ) + else + Ok () + | `copy -> + if List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] then + Error + ( Api_errors.operation_not_allowed + , [ + "VDI containing HA statefile or redo log cannot be copied \ + (check the VDI's allowed operations)." 
+              ]
+            )
+        else
+          Ok ()
+    | `enable_cbt | `disable_cbt ->
+        if record.Db_actions.vDI_is_a_snapshot then
+          Error
+            (Api_errors.operation_not_allowed, ["VDI is a snapshot: " ^ _ref])
+        else if not (List.mem record.Db_actions.vDI_type [`user; `system]) then
+          Error
+            ( Api_errors.vdi_incompatible_type
+            , [_ref; Record_util.vdi_type_to_string record.Db_actions.vDI_type]
+            )
+        else if reset_on_boot then
+          Error (Api_errors.vdi_on_boot_mode_incompatible_with_operation, [])
+        else
+          Ok ()
+    | `mirror
+    | `clone
+    | `generate_config
+    | `force_unlock
+    | `set_on_boot
+    | `list_changed_blocks
+    | `blocked
+    | `update ->
         Ok ()
-  | `mirror
-  | `clone
-  | `generate_config
-  | `force_unlock
-  | `set_on_boot
-  | `list_changed_blocks
-  | `blocked
-  | `update ->
-      Ok ()

 let assert_operation_valid ~__context ~self ~(op : API.vdi_operations) =
   let pool = Helpers.get_pool ~__context in
@@ -486,16 +496,11 @@ let update_allowed_operations_internal ~__context ~self ~sr_records ~pbd_records
       v
   in
   let allowed =
-    let check x =
-      match
-        check_operation_error ~__context ~sr_records ~pbd_records ?vbd_records
-          ha_enabled all self x
-      with
-      | Ok () ->
-          true
-      | _ ->
-          false
+    let check' =
+      check_operation_error ~__context ~sr_records ~pbd_records ?vbd_records
+        ha_enabled all self
     in
+    let check x = match check' x with Ok () -> true | _ -> false in
     all_ops |> Xapi_globs.Vdi_operations_set.filter check
   in
   let allowed =
diff --git a/ocaml/xapi/xapi_vm_lifecycle.ml b/ocaml/xapi/xapi_vm_lifecycle.ml
index fc281c70de0..6e3a3955fca 100644
--- a/ocaml/xapi/xapi_vm_lifecycle.ml
+++ b/ocaml/xapi/xapi_vm_lifecycle.ml
@@ -777,12 +777,9 @@ let allowable_ops =
   List.filter (fun op -> not (List.mem op ignored_ops)) API.vm_operations__all

 let update_allowed_operations ~__context ~self =
+  let check' = check_operation_error ~__context ~ref:self in
   let check accu op =
-    match check_operation_error ~__context ~ref:self ~op ~strict:true with
-    | None ->
-        op :: accu
-    | Some _err ->
-        accu
+    match check' ~op ~strict:true with None -> op :: accu | Some _err -> accu
   in
   let allowed = List.fold_left check [] allowable_ops in
   (* FIXME: need to be able to deal with rolling-upgrade for orlando as well *)

From e8440b3a01720f09baf04eec5b8482c8e3e93ff8 Mon Sep 17 00:00:00 2001
From: Lin Liu
Date: Fri, 20 Jun 2025 05:47:09 +0000
Subject: [PATCH 040/111] CA-408552: 2/3 Improve bootstorm performance by
 saving db ops

Define the db operations as variables, but keep them inside the returned
function so that the change is easy to review.

Signed-off-by: Lin Liu
---
 ocaml/xapi/xapi_vdi.ml          | 40 ++++++++++++++++-----------------
 ocaml/xapi/xapi_vm_lifecycle.ml | 32 +++++++++++++++++---------
 2 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/ocaml/xapi/xapi_vdi.ml b/ocaml/xapi/xapi_vdi.ml
index 9691f3831cd..1090ae01f1d 100644
--- a/ocaml/xapi/xapi_vdi.ml
+++ b/ocaml/xapi/xapi_vdi.ml
@@ -85,8 +85,11 @@ let check_operation_error ~__context ?sr_records:_ ?(pbd_records = [])
   *)
   fun op ->
     let* () =
-      if
+      let rolling_upgrade_in_progress =
         Helpers.rolling_upgrade_in_progress ~__context
+      in
+      if
+        rolling_upgrade_in_progress
         && not
              (Xapi_globs.Vdi_operations_set.mem op
                 Xapi_globs.rpu_allowed_vdi_operations
@@ -338,25 +341,31 @@ let check_operation_error ~__context ?sr_records:_ ?(pbd_records = [])
       else
         Ok ()
     in
+    let vdi_is_ha_state_or_redolog =
+      List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log]
+    in
     let check_destroy () =
+      let ha_enable_in_progress =
+        Xapi_pool_helpers.ha_enable_in_progress ~__context
+      in
+      let ha_disable_in_progress =
+        
Xapi_pool_helpers.ha_disable_in_progress ~__context + in if sr_type = "udev" then Error (Api_errors.vdi_is_a_physical_device, [_ref]) else if is_tools_sr then Error (Api_errors.sr_operation_not_supported, [Ref.string_of sr]) else if List.mem record.Db_actions.vDI_type [`rrd] then Error (Api_errors.vdi_has_rrds, [_ref]) - else if - ha_enabled - && List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] - then + else if ha_enabled && vdi_is_ha_state_or_redolog then Error (Api_errors.ha_is_enabled, []) else if - List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] + vdi_is_ha_state_or_redolog && Xapi_pool_helpers.ha_enable_in_progress ~__context then Error (Api_errors.ha_enable_in_progress, []) else if - List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] + vdi_is_ha_state_or_redolog && Xapi_pool_helpers.ha_disable_in_progress ~__context then Error (Api_errors.ha_disable_in_progress, []) @@ -365,10 +374,7 @@ let check_operation_error ~__context ?sr_records:_ ?(pbd_records = []) in match op with | `forget -> - if - ha_enabled - && List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] - then + if ha_enabled && vdi_is_ha_state_or_redolog then Error (Api_errors.ha_is_enabled, []) else if List.mem record.Db_actions.vDI_type [`rrd] then Error (Api_errors.vdi_has_rrds, [_ref]) @@ -387,18 +393,12 @@ let check_operation_error ~__context ?sr_records:_ ?(pbd_records = []) else check_destroy () | `resize -> - if - ha_enabled - && List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] - then + if ha_enabled && vdi_is_ha_state_or_redolog then Error (Api_errors.ha_is_enabled, []) else Ok () | `resize_online -> - if - ha_enabled - && List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] - then + if ha_enabled && vdi_is_ha_state_or_redolog then Error (Api_errors.ha_is_enabled, []) else Ok () @@ -415,7 +415,7 @@ let check_operation_error ~__context ?sr_records:_ ?(pbd_records = []) else Ok () | `copy -> - if List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] then + if vdi_is_ha_state_or_redolog then Error ( Api_errors.operation_not_allowed , [ diff --git a/ocaml/xapi/xapi_vm_lifecycle.ml b/ocaml/xapi/xapi_vm_lifecycle.ml index 6e3a3955fca..7e95933d8d2 100644 --- a/ocaml/xapi/xapi_vm_lifecycle.ml +++ b/ocaml/xapi/xapi_vm_lifecycle.ml @@ -528,10 +528,10 @@ let check_operation_error ~__context ~ref = in let current_error = let metrics = Db.VM.get_metrics ~__context ~self:ref in + let is_nested_virt = nested_virt ~__context ref metrics in check current_error (fun () -> match op with - | `changing_dynamic_range - when nested_virt ~__context ref metrics && strict -> + | `changing_dynamic_range when is_nested_virt && strict -> Some (Api_errors.vm_is_using_nested_virt, [ref_str]) | _ -> None @@ -542,13 +542,11 @@ let check_operation_error ~__context ~ref = (* make use of the Helpers.ballooning_enabled_for_vm function. *) let current_error = check current_error (fun () -> - let vm_ref () = + let is_domain_zero = Db.VM.get_by_uuid ~__context ~uuid:vmr.Db_actions.vM_uuid + |> Helpers.is_domain_zero ~__context in - if - (op = `changing_VCPUs || op = `destroy) - && Helpers.is_domain_zero ~__context (vm_ref ()) - then + if (op = `changing_VCPUs || op = `destroy) && is_domain_zero then Some ( Api_errors.operation_not_allowed , ["This operation is not allowed on dom0"] @@ -668,8 +666,11 @@ let check_operation_error ~__context ~ref = in (* Check for errors caused by VM being in an appliance. 
*)
   let current_error =
+    let is_appliance_valid =
+      Db.is_valid_ref __context vmr.Db_actions.vM_appliance
+    in
     check current_error (fun () ->
-        if Db.is_valid_ref __context vmr.Db_actions.vM_appliance then
+        if is_appliance_valid then
           check_appliance ~vmr ~op ~ref_str
         else
           None
@@ -677,8 +678,11 @@ let check_operation_error ~__context ~ref =
   in
   (* Check for errors caused by VM being assigned to a protection policy. *)
   let current_error =
+    let is_protection_policy_valid =
+      Db.is_valid_ref __context vmr.Db_actions.vM_protection_policy
+    in
     check current_error (fun () ->
-        if Db.is_valid_ref __context vmr.Db_actions.vM_protection_policy then
+        if is_protection_policy_valid then
           check_protection_policy ~vmr ~op ~ref_str
         else
           None
@@ -686,8 +690,11 @@ let check_operation_error ~__context ~ref =
   in
   (* Check for errors caused by VM being assigned to a snapshot schedule. *)
   let current_error =
+    let is_snapshort_schedule_valid =
+      Db.is_valid_ref __context vmr.Db_actions.vM_snapshot_schedule
+    in
     check current_error (fun () ->
-        if Db.is_valid_ref __context vmr.Db_actions.vM_snapshot_schedule then
+        if is_snapshort_schedule_valid then
           check_snapshot_schedule ~vmr ~ref_str op
         else
           None
@@ -709,9 +716,12 @@ let check_operation_error ~__context ~ref =
       )
   in
   let current_error =
+    let rolling_upgrade_in_progress =
+      Helpers.rolling_upgrade_in_progress ~__context
+    in
     check current_error (fun () ->
         if
-          Helpers.rolling_upgrade_in_progress ~__context
+          rolling_upgrade_in_progress
          && not (List.mem op Xapi_globs.rpu_allowed_vm_operations)
        then
          Some (Api_errors.not_supported_during_upgrade, [])

From e1a57fa5ef4b97adab32bcf1a24d9af61bccc3c4 Mon Sep 17 00:00:00 2001
From: Lin Liu
Date: Fri, 20 Jun 2025 06:35:20 +0000
Subject: [PATCH 041/111] CA-408552: 3/3 Improve bootstorm performance by
 saving db ops

Move the db operations that do not depend on the requested operation
outside of the returned function.

Signed-off-by: Lin Liu
---
 ocaml/xapi/xapi_vdi.ml          | 176 +++++++++++++++----------------
 ocaml/xapi/xapi_vm_lifecycle.ml |  76 +++++++-------
 2 files changed, 123 insertions(+), 129 deletions(-)

diff --git a/ocaml/xapi/xapi_vdi.ml b/ocaml/xapi/xapi_vdi.ml
index 1090ae01f1d..0f9904d72fb 100644
--- a/ocaml/xapi/xapi_vdi.ml
+++ b/ocaml/xapi/xapi_vdi.ml
@@ -68,6 +68,84 @@ let check_operation_error ~__context ?sr_records:_ ?(pbd_records = [])
   let _ref = Ref.string_of _ref' in
   let current_ops = record.Db_actions.vDI_current_operations in
   let reset_on_boot = record.Db_actions.vDI_on_boot = `reset in
+  let rolling_upgrade_in_progress =
+    Helpers.rolling_upgrade_in_progress ~__context
+  in
+  (* check to see whether it's a local cd drive *)
+  let sr = record.Db_actions.vDI_SR in
+  let sr_type = Db.SR.get_type ~__context ~self:sr in
+  let is_tools_sr = Db.SR.get_is_tools_sr ~__context ~self:sr in
+  (* Check to see if any PBDs are attached *)
+  let open Xapi_database.Db_filter_types in
+  let pbds_attached =
+    match pbd_records with
+    | [] ->
+        Db.PBD.get_records_where ~__context
+          ~expr:
+            (And
+               ( Eq (Field "SR", Literal (Ref.string_of sr))
+               , Eq (Field "currently_attached", Literal "true")
+               )
+            )
+    | _ ->
+        List.filter
+          (fun (_, pbd_record) ->
+            pbd_record.API.pBD_SR = sr && pbd_record.API.pBD_currently_attached
+          )
+          pbd_records
+  in
+
+  (* Only a 'live' operation can be performed if there are active (even RO) devices *)
+  let my_active_vbd_records =
+    match vbd_records with
+    | None ->
+        List.map snd
+          (Db.VBD.get_internal_records_where ~__context
+             ~expr:
+               (And
+                  ( Eq (Field "VDI", Literal _ref)
+                  , Or
+                      ( Eq (Field "currently_attached", Literal "true")
+                      
, Eq (Field "reserved", Literal "true") + ) + ) + ) + ) + | Some records -> + List.filter + (fun vbd_record -> + vbd_record.Db_actions.vBD_VDI = _ref' + && (vbd_record.Db_actions.vBD_currently_attached + || vbd_record.Db_actions.vBD_reserved + ) + ) + records + in + let my_active_rw_vbd_records = + List.filter (fun vbd -> vbd.Db_actions.vBD_mode = `RW) my_active_vbd_records + in + (* VBD operations (plug/unplug) (which should be transient) cause us to serialise *) + let my_has_current_operation_vbd_records = + match vbd_records with + | None -> + List.map snd + (Db.VBD.get_internal_records_where ~__context + ~expr: + (And + ( Eq (Field "VDI", Literal _ref) + , Not (Eq (Field "current_operations", Literal "()")) + ) + ) + ) + | Some records -> + List.filter + (fun vbd_record -> + vbd_record.Db_actions.vBD_VDI = _ref' + && vbd_record.Db_actions.vBD_current_operations <> [] + ) + records + in + (* Policy: 1. any current_operation besides copy implies exclusivity; fail everything else; except vdi mirroring is in current operations and destroy is performed @@ -83,11 +161,15 @@ let check_operation_error ~__context ?sr_records:_ ?(pbd_records = []) 5. HA prevents you from deleting statefiles or metadata volumes 6. During rolling pool upgrade, only operations known by older releases are allowed *) + let sm_features = + Xapi_sr_operations.features_of_sr_internal ~__context ~_type:sr_type + in + let vdi_is_ha_state_or_redolog = + List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] + in + fun op -> let* () = - let rolling_upgrade_in_progress = - Helpers.rolling_upgrade_in_progress ~__context - in if rolling_upgrade_in_progress && not @@ -113,30 +195,6 @@ let check_operation_error ~__context ?sr_records:_ ?(pbd_records = []) else Ok () in - (* check to see whether it's a local cd drive *) - let sr = record.Db_actions.vDI_SR in - let sr_type = Db.SR.get_type ~__context ~self:sr in - let is_tools_sr = Db.SR.get_is_tools_sr ~__context ~self:sr in - (* Check to see if any PBDs are attached *) - let open Xapi_database.Db_filter_types in - let pbds_attached = - match pbd_records with - | [] -> - Db.PBD.get_records_where ~__context - ~expr: - (And - ( Eq (Field "SR", Literal (Ref.string_of sr)) - , Eq (Field "currently_attached", Literal "true") - ) - ) - | _ -> - List.filter - (fun (_, pbd_record) -> - pbd_record.API.pBD_SR = sr - && pbd_record.API.pBD_currently_attached - ) - pbd_records - in let* () = if pbds_attached = [] && op = `resize then Error (Api_errors.sr_no_pbds, [Ref.string_of sr]) @@ -146,58 +204,6 @@ let check_operation_error ~__context ?sr_records:_ ?(pbd_records = []) (* check to see whether VBDs exist which are using this VDI *) - (* Only a 'live' operation can be performed if there are active (even RO) devices *) - let my_active_vbd_records = - match vbd_records with - | None -> - List.map snd - (Db.VBD.get_internal_records_where ~__context - ~expr: - (And - ( Eq (Field "VDI", Literal _ref) - , Or - ( Eq (Field "currently_attached", Literal "true") - , Eq (Field "reserved", Literal "true") - ) - ) - ) - ) - | Some records -> - List.filter - (fun vbd_record -> - vbd_record.Db_actions.vBD_VDI = _ref' - && (vbd_record.Db_actions.vBD_currently_attached - || vbd_record.Db_actions.vBD_reserved - ) - ) - records - in - let my_active_rw_vbd_records = - List.filter - (fun vbd -> vbd.Db_actions.vBD_mode = `RW) - my_active_vbd_records - in - (* VBD operations (plug/unplug) (which should be transient) cause us to serialise *) - let my_has_current_operation_vbd_records = - match vbd_records 
with - | None -> - List.map snd - (Db.VBD.get_internal_records_where ~__context - ~expr: - (And - ( Eq (Field "VDI", Literal _ref) - , Not (Eq (Field "current_operations", Literal "()")) - ) - ) - ) - | Some records -> - List.filter - (fun vbd_record -> - vbd_record.Db_actions.vBD_VDI = _ref' - && vbd_record.Db_actions.vBD_current_operations <> [] - ) - records - in (* If the VBD is currently_attached then some operations can still be performed ie: VDI.clone (if the VM is suspended we have to have the 'allow_clone_suspended_vm' flag); VDI.snapshot; VDI.resize_online; @@ -275,9 +281,6 @@ let check_operation_error ~__context ?sr_records:_ ?(pbd_records = []) else Ok () in - let sm_features = - Xapi_sr_operations.features_of_sr_internal ~__context ~_type:sr_type - in let* () = check_sm_feature_error op sm_features sr in let allowed_for_cbt_metadata_vdi = match op with @@ -341,16 +344,7 @@ let check_operation_error ~__context ?sr_records:_ ?(pbd_records = []) else Ok () in - let vdi_is_ha_state_or_redolog = - List.mem record.Db_actions.vDI_type [`ha_statefile; `redo_log] - in let check_destroy () = - let ha_enable_in_progress = - Xapi_pool_helpers.ha_enable_in_progress ~__context - in - let ha_disable_in_progress = - Xapi_pool_helpers.ha_disable_in_progress ~__context - in if sr_type = "udev" then Error (Api_errors.vdi_is_a_physical_device, [_ref]) else if is_tools_sr then diff --git a/ocaml/xapi/xapi_vm_lifecycle.ml b/ocaml/xapi/xapi_vm_lifecycle.ml index 7e95933d8d2..5ec4ca6d792 100644 --- a/ocaml/xapi/xapi_vm_lifecycle.ml +++ b/ocaml/xapi/xapi_vm_lifecycle.ml @@ -393,8 +393,7 @@ let nested_virt ~__context vm metrics = let key = "nested-virt" in Vm_platform.is_true ~key ~platformdata ~default:false -let is_mobile ~__context vm strict = - let metrics = Db.VM.get_metrics ~__context ~self:vm in +let is_mobile ~__context vm strict metrics = (not @@ nomigrate ~__context vm metrics) && (not @@ nested_virt ~__context vm metrics) || not strict @@ -447,6 +446,42 @@ let check_operation_error ~__context ~ref = vmr.Db_actions.vM_VBDs |> List.filter (Db.is_valid_ref __context) in + let current_ops = vmr.Db_actions.vM_current_operations in + let metrics = Db.VM.get_metrics ~__context ~self:ref in + let is_nested_virt = nested_virt ~__context ref metrics in + let is_domain_zero = + Db.VM.get_by_uuid ~__context ~uuid:vmr.Db_actions.vM_uuid + |> Helpers.is_domain_zero ~__context + in + let vdis_reset_and_caching = + List.filter_map + (fun vdi -> + try + let sm_config = Db.VDI.get_sm_config ~__context ~self:vdi in + Some + ( List.assoc_opt "on_boot" sm_config = Some "reset" + , bool_of_assoc "caching" sm_config + ) + with _ -> None + ) + vdis + in + let sriov_pcis = nvidia_sriov_pcis ~__context vmr.Db_actions.vM_VGPUs in + let is_not_sriov pci = not @@ List.mem pci sriov_pcis in + let pcis = vmr.Db_actions.vM_attached_PCIs in + let is_appliance_valid = + Db.is_valid_ref __context vmr.Db_actions.vM_appliance + in + let is_protection_policy_valid = + Db.is_valid_ref __context vmr.Db_actions.vM_protection_policy + in + let rolling_upgrade_in_progress = + Helpers.rolling_upgrade_in_progress ~__context + in + let is_snapshort_schedule_valid = + Db.is_valid_ref __context vmr.Db_actions.vM_snapshot_schedule + in + fun ~op ~strict -> let current_error = None in let check c f = match c with Some e -> Some e | None -> f () in @@ -470,7 +505,6 @@ let check_operation_error ~__context ~ref = (* if other operations are in progress, check that the new operation is allowed concurrently with them. 
*) let current_error = check current_error (fun () -> - let current_ops = vmr.Db_actions.vM_current_operations in if List.length current_ops <> 0 && not (is_allowed_concurrently ~op ~current_ops) @@ -520,15 +554,13 @@ let check_operation_error ~__context ~ref = check current_error (fun () -> match op with | (`suspend | `checkpoint | `pool_migrate | `migrate_send) - when not (is_mobile ~__context ref strict) -> + when not (is_mobile ~__context ref strict metrics) -> Some (Api_errors.vm_is_immobile, [ref_str]) | _ -> None ) in let current_error = - let metrics = Db.VM.get_metrics ~__context ~self:ref in - let is_nested_virt = nested_virt ~__context ref metrics in check current_error (fun () -> match op with | `changing_dynamic_range when is_nested_virt && strict -> @@ -542,10 +574,6 @@ let check_operation_error ~__context ~ref = (* make use of the Helpers.ballooning_enabled_for_vm function. *) let current_error = check current_error (fun () -> - let is_domain_zero = - Db.VM.get_by_uuid ~__context ~uuid:vmr.Db_actions.vM_uuid - |> Helpers.is_domain_zero ~__context - in if (op = `changing_VCPUs || op = `destroy) && is_domain_zero then Some ( Api_errors.operation_not_allowed @@ -592,19 +620,6 @@ let check_operation_error ~__context ~ref = (* Check for an error due to VDI caching/reset behaviour *) let current_error = check current_error (fun () -> - let vdis_reset_and_caching = - List.filter_map - (fun vdi -> - try - let sm_config = Db.VDI.get_sm_config ~__context ~self:vdi in - Some - ( List.assoc_opt "on_boot" sm_config = Some "reset" - , bool_of_assoc "caching" sm_config - ) - with _ -> None - ) - vdis - in if op = `checkpoint || op = `snapshot @@ -633,9 +648,6 @@ let check_operation_error ~__context ~ref = (* If a PCI device is passed-through, check if the operation is allowed *) let current_error = check current_error @@ fun () -> - let sriov_pcis = nvidia_sriov_pcis ~__context vmr.Db_actions.vM_VGPUs in - let is_not_sriov pci = not @@ List.mem pci sriov_pcis in - let pcis = vmr.Db_actions.vM_attached_PCIs in match op with | (`suspend | `checkpoint | `pool_migrate | `migrate_send) when List.exists is_not_sriov pcis -> @@ -666,9 +678,6 @@ let check_operation_error ~__context ~ref = in (* Check for errors caused by VM being in an appliance. *) let current_error = - let is_appliance_valid = - Db.is_valid_ref __context vmr.Db_actions.vM_appliance - in check current_error (fun () -> if is_appliance_valid then check_appliance ~vmr ~op ~ref_str @@ -678,9 +687,6 @@ let check_operation_error ~__context ~ref = in (* Check for errors caused by VM being assigned to a protection policy. *) let current_error = - let is_protection_policy_valid = - Db.is_valid_ref __context vmr.Db_actions.vM_protection_policy - in check current_error (fun () -> if is_protection_policy_valid then check_protection_policy ~vmr ~op ~ref_str @@ -690,9 +696,6 @@ let check_operation_error ~__context ~ref = in (* Check for errors caused by VM being assigned to a snapshot schedule. 
*) let current_error = - let is_snapshort_schedule_valid = - Db.is_valid_ref __context vmr.Db_actions.vM_snapshot_schedule - in check current_error (fun () -> if is_snapshort_schedule_valid then check_snapshot_schedule ~vmr ~ref_str op @@ -716,9 +719,6 @@ let check_operation_error ~__context ~ref = ) in let current_error = - let rolling_upgrade_in_progress = - Helpers.rolling_upgrade_in_progress ~__context - in check current_error (fun () -> if rolling_upgrade_in_progress From d236751e762d53ee09dcf9110845d178341a3e87 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Thu, 19 Jun 2025 13:19:13 +0100 Subject: [PATCH 042/111] xenops_server_plugin: Refer to the type alias instead of its definition This allows changing the type definition in Updates without modifying the types here as well in the future. Signed-off-by: Andrii Sultanov --- ocaml/xenopsd/lib/xenops_server_plugin.ml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ocaml/xenopsd/lib/xenops_server_plugin.ml b/ocaml/xenopsd/lib/xenops_server_plugin.ml index 1a52749a9f3..4c8c73773f8 100644 --- a/ocaml/xenopsd/lib/xenops_server_plugin.ml +++ b/ocaml/xenopsd/lib/xenops_server_plugin.ml @@ -288,10 +288,7 @@ module type S = sig end module UPDATES : sig - val get : - Updates.id option - -> int option - -> Dynamic.barrier list * Dynamic.id list * Updates.id + val get : Updates.id option -> int option -> Updates.get_result end module DEBUG : sig From 7a3788a9e9d2153cba1a14d9a857e811af4e3263 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Thu, 19 Jun 2025 13:28:00 +0100 Subject: [PATCH 043/111] xapi-idl/updates: Make filterfn in inject_barrier only look at keys The only usage of it ignores values, so drop them altogether. This allows changing the type of the values in the future without modifying inject_barrier and its users in any way. Signed-off-by: Andrii Sultanov --- ocaml/xapi-idl/lib/updates.ml | 1 + ocaml/xapi-idl/lib/updates.mli | 2 +- ocaml/xapi-idl/lib_test/updates_test.ml | 8 ++++---- ocaml/xenopsd/lib/xenops_server.ml | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/ocaml/xapi-idl/lib/updates.ml b/ocaml/xapi-idl/lib/updates.ml index 93904f2b65b..f6420da6834 100644 --- a/ocaml/xapi-idl/lib/updates.ml +++ b/ocaml/xapi-idl/lib/updates.ml @@ -66,6 +66,7 @@ functor ) let inject_barrier id filterfn t = + let filterfn key _ = filterfn key in ( { map= t.map ; barriers= diff --git a/ocaml/xapi-idl/lib/updates.mli b/ocaml/xapi-idl/lib/updates.mli index a054c5581d8..9b678a28839 100644 --- a/ocaml/xapi-idl/lib/updates.mli +++ b/ocaml/xapi-idl/lib/updates.mli @@ -64,7 +64,7 @@ module Updates : functor (Interface : INTERFACE) -> sig (* [inject_barrier n p t] Inject a barrier identified by [n] into [t]. The barrier will contain a snapshot of all current updates that match the predicate [p]. 
*) - val inject_barrier : int -> (Interface.Dynamic.id -> int -> bool) -> t -> unit + val inject_barrier : int -> (Interface.Dynamic.id -> bool) -> t -> unit (* Removes a barrier *) val remove_barrier : int -> t -> unit diff --git a/ocaml/xapi-idl/lib_test/updates_test.ml b/ocaml/xapi-idl/lib_test/updates_test.ml index 66c5f09450e..790e72854c1 100644 --- a/ocaml/xapi-idl/lib_test/updates_test.ml +++ b/ocaml/xapi-idl/lib_test/updates_test.ml @@ -84,7 +84,7 @@ let test_inject_barrier () = let u = M.empty scheduler in M.add update_a u ; M.add update_b u ; - M.inject_barrier 1 (fun _ _ -> true) u ; + M.inject_barrier 1 (fun _ -> true) u ; M.add update_a u ; M.add update_c u ; let barriers, updates, _id = M.get "dbg" None (Some 1) u in @@ -107,7 +107,7 @@ let test_remove_barrier () = let u = M.empty scheduler in M.add update_a u ; M.add update_b u ; - M.inject_barrier 1 (fun _ _ -> true) u ; + M.inject_barrier 1 (fun _ -> true) u ; M.add update_a u ; M.add update_c u ; M.remove_barrier 1 u ; @@ -125,7 +125,7 @@ let test_inject_barrier_rpc () = let u = M.empty scheduler in M.add update_a u ; M.add update_b u ; - M.inject_barrier 1 (fun _ _ -> true) u ; + M.inject_barrier 1 (fun _ -> true) u ; M.add update_a u ; M.add update_c u ; let barriers, updates, _id = M.get "dbg" None (Some 1) u in @@ -175,7 +175,7 @@ let test_filter () = let test_dump () = let u = M.empty scheduler in M.add update_a u ; - M.inject_barrier 1 (fun _ _ -> true) u ; + M.inject_barrier 1 (fun _ -> true) u ; let dump = M.Dump.make u in let dumped_rpc = M.Dump.rpc_of_dump dump in let expected_rpc = diff --git a/ocaml/xenopsd/lib/xenops_server.ml b/ocaml/xenopsd/lib/xenops_server.ml index 8fe027630fe..569dabc11a1 100644 --- a/ocaml/xenopsd/lib/xenops_server.ml +++ b/ocaml/xenopsd/lib/xenops_server.ml @@ -4039,7 +4039,7 @@ module UPDATES = struct Debug.with_thread_associated dbg (fun () -> debug "UPDATES.inject_barrier %s %d" vm_id id ; - let filter k _ = + let filter k = match k with | Dynamic.Task _ -> false From 4316e6bd97bfe012c88942b5b4b32c260b1e6e01 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Fri, 20 Jun 2025 10:54:59 +0100 Subject: [PATCH 044/111] xapi_xenops: Refactor update_vm_internal Drop the first member of the (Vm.t * Vm.state) tuple as it's never used. This removes the need for several 'snd info', 'Option.iter (fun (_, state) ->) info' constructs. Replace all the following constructs: ``` if different (fun x -> x.field) && predicate then Option.iter (fun state -> ...) 
info ``` With this: ``` different (fun x -> x.field) ((&&) predicate) (fun field -> ...); ``` It 1) removes the additional level of indentation inside Option.iter (fun state), hides the duplication of this construct 2) makes it obvious where the whole 'state' is accessed and not only the field that was checked to be different Signed-off-by: Andrii Sultanov --- ocaml/xapi/xapi_xenops.ml | 602 +++++++++++++++++++------------------- 1 file changed, 302 insertions(+), 300 deletions(-) diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index dbefffb1571..4b7b738b000 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -1857,10 +1857,17 @@ let update_vm_internal ~__context ~id ~self ~previous ~info ~localhost = if info = None then debug "xenopsd event: VM state missing: assuming VM has shut down" ; let should_update_allowed_operations = ref false in - let different f = - let a = Option.map (fun x -> f (snd x)) info in - let b = Option.map f previous in - a <> b + + (* If a field (accessed by [accessor] for [Vm.state]) changed in an + update and [predicate has_changed], call [f (accessor info)] *) + let different accessor predicate f = + let a = Option.map (fun x -> accessor x) info in + let b = Option.map accessor previous in + let diff = a <> b in + if predicate diff then + Option.iter f a + else + () in (* Helpers to create and update guest metrics when needed *) let lookup state key = List.assoc_opt key state.Vm.guest_agent in @@ -1896,7 +1903,7 @@ let update_vm_internal ~__context ~id ~self ~previous ~info ~localhost = let gm = Db.VM.get_guest_metrics ~__context ~self in if gm = Ref.null then Option.iter - (fun (_, state) -> + (fun state -> List.iter (fun domid -> try @@ -1917,7 +1924,7 @@ let update_vm_internal ~__context ~id ~self ~previous ~info ~localhost = in let check_guest_agent () = Option.iter - (fun (_, state) -> + (fun state -> Option.iter (fun oldstate -> let old_ga = oldstate.Vm.guest_agent in @@ -1965,8 +1972,7 @@ let update_vm_internal ~__context ~id ~self ~previous ~info ~localhost = process restart or an event is generated. We may wish to periodically inject artificial events IF there has been an event sync failure? *) let power_state = - xenapi_of_xenops_power_state - (Option.map (fun x -> (snd x).Vm.power_state) info) + xenapi_of_xenops_power_state (Option.map (fun x -> x.Vm.power_state) info) in let power_state_before_update = Db.VM.get_power_state ~__context ~self in (* We preserve the current_domain_type of suspended VMs like we preserve @@ -1974,37 +1980,40 @@ let update_vm_internal ~__context ~id ~self ~previous ~info ~localhost = whether suspended VMs are going to resume into PV or PVinPVH for example. 
We do this before updating the power_state to maintain the invariant that any VM that's not `Halted cannot have an unspecified current_domain_type *) - if different (fun x -> x.domain_type) && power_state <> `Suspended then - Option.iter - (fun (_, state) -> - let metrics = Db.VM.get_metrics ~__context ~self in - let update domain_type = - debug "xenopsd event: Updating VM %s current_domain_type <- %s" id - (Record_util.domain_type_to_string domain_type) ; - Db.VM_metrics.set_current_domain_type ~__context ~self:metrics - ~value:domain_type - in - match state.Vm.domain_type with - | Domain_HVM -> - update `hvm - | Domain_PV -> - update `pv - | Domain_PVinPVH -> - update `pv_in_pvh - | Domain_PVH -> - update `pvh - | Domain_undefined -> - if power_state <> `Halted then - debug - "xenopsd returned an undefined domain type for non-halted VM \ - %s;assuming this is transient, so not updating \ - current_domain_type" - id - else - update `unspecified - ) - info ; - ( if different (fun x -> x.power_state) then + different + (fun x -> x.Vm.domain_type) + (( && ) (power_state <> `Suspended)) + (fun domain_type -> + let metrics = Db.VM.get_metrics ~__context ~self in + let update domain_type = + debug "xenopsd event: Updating VM %s current_domain_type <- %s" id + (Record_util.domain_type_to_string domain_type) ; + Db.VM_metrics.set_current_domain_type ~__context ~self:metrics + ~value:domain_type + in + match domain_type with + | Vm.Domain_HVM -> + update `hvm + | Domain_PV -> + update `pv + | Domain_PVinPVH -> + update `pv_in_pvh + | Domain_PVH -> + update `pvh + | Domain_undefined -> + if power_state <> `Halted then + debug + "xenopsd returned an undefined domain type for non-halted VM \ + %s;assuming this is transient, so not updating \ + current_domain_type" + id + else + update `unspecified + ) ; + different + (fun x -> x.Vm.power_state) + Fun.id + (fun _ -> try debug "Will update VM.allowed_operations because power_state has changed." ; @@ -2058,14 +2067,17 @@ let update_vm_internal ~__context ~id ~self ~previous ~info ~localhost = with e -> error "Caught %s: while updating VM %s power_state" (Printexc.to_string e) id - ) ; - ( if different (fun x -> x.domids) then + ) ; + different + (fun x -> x.Vm.domids) + Fun.id + (fun _ -> try debug "Will update VM.allowed_operations because domid has changed." ; should_update_allowed_operations := true ; debug "xenopsd event: Updating VM %s domid" id ; Option.iter - (fun (_, state) -> + (fun state -> match state.Vm.domids with | value :: _ -> Db.VM.set_domid ~__context ~self ~value:(Int64.of_int value) @@ -2090,306 +2102,296 @@ let update_vm_internal ~__context ~id ~self ~previous ~info ~localhost = (System_domains.pbd_of_vm ~__context ~vm:self) with e -> error "Caught %s: while updating VM %s domids" (Printexc.to_string e) id - ) ; + ) ; (* consoles *) - ( if different (fun x -> x.consoles) then + different + (fun x -> x.Vm.consoles) + Fun.id + (fun consoles -> try debug "xenopsd event: Updating VM %s consoles" id ; - Option.iter - (fun (_, state) -> - let address = Db.Host.get_address ~__context ~self:localhost in - let uri = - Uri.( - make ~scheme:"https" ~host:address ~path:Constants.console_uri - () - |> to_string + let address = Db.Host.get_address ~__context ~self:localhost in + let uri = + Uri.( + make ~scheme:"https" ~host:address ~path:Constants.console_uri () + |> to_string + ) + in + let get_uri_from_location loc = + try + let n = String.index loc '?' 
in + String.sub loc 0 n + with Not_found -> loc + in + let current_protocols = + List.map + (fun self -> + ( ( Db.Console.get_protocol ~__context ~self + |> to_xenops_console_protocol + , Db.Console.get_location ~__context ~self + |> get_uri_from_location + ) + , self ) - in - let get_uri_from_location loc = + ) + (Db.VM.get_consoles ~__context ~self) + in + let new_protocols = + List.map (fun c -> ((c.Vm.protocol, uri), c)) consoles + in + (* Destroy consoles that have gone away *) + List.iter + (fun protocol -> + let self = List.assoc protocol current_protocols in + Db.Console.destroy ~__context ~self + ) + (Listext.set_difference + (List.map fst current_protocols) + (List.map fst new_protocols) + ) ; + (* Create consoles that have appeared *) + List.iter + (fun (protocol, _) -> + let ref = Ref.make () in + let uuid = Uuidx.to_string (Uuidx.make ()) in + let location = Printf.sprintf "%s?uuid=%s" uri uuid in + let port = try - let n = String.index loc '?' in - String.sub loc 0 n - with Not_found -> loc - in - let current_protocols = - List.map - (fun self -> - ( ( Db.Console.get_protocol ~__context ~self - |> to_xenops_console_protocol - , Db.Console.get_location ~__context ~self - |> get_uri_from_location - ) - , self - ) - ) - (Db.VM.get_consoles ~__context ~self) - in - let new_protocols = - List.map (fun c -> ((c.Vm.protocol, uri), c)) state.Vm.consoles + Int64.of_int + (List.find (fun c -> c.Vm.protocol = protocol) consoles).port + with Not_found -> -1L in - (* Destroy consoles that have gone away *) - List.iter - (fun protocol -> - let self = List.assoc protocol current_protocols in - Db.Console.destroy ~__context ~self - ) - (Listext.set_difference - (List.map fst current_protocols) - (List.map fst new_protocols) - ) ; - (* Create consoles that have appeared *) - List.iter - (fun (protocol, _) -> - let ref = Ref.make () in - let uuid = Uuidx.to_string (Uuidx.make ()) in - let location = Printf.sprintf "%s?uuid=%s" uri uuid in - let port = - try - Int64.of_int - (List.find - (fun c -> c.Vm.protocol = protocol) - state.Vm.consoles - ) - .port - with Not_found -> -1L - in - Db.Console.create ~__context ~ref ~uuid - ~protocol:(to_xenapi_console_protocol protocol) - ~location ~vM:self ~other_config:[] ~port - ) - (Listext.set_difference - (List.map fst new_protocols) - (List.map fst current_protocols) - ) + Db.Console.create ~__context ~ref ~uuid + ~protocol:(to_xenapi_console_protocol protocol) + ~location ~vM:self ~other_config:[] ~port + ) + (Listext.set_difference + (List.map fst new_protocols) + (List.map fst current_protocols) ) - info with e -> error "Caught %s: while updating VM %s consoles" (Printexc.to_string e) id - ) ; - ( if different (fun x -> x.memory_target) then + ) ; + different + (fun x -> x.Vm.memory_target) + Fun.id + (fun memory_target -> try - Option.iter - (fun (_, state) -> - debug "xenopsd event: Updating VM %s memory_target <- %Ld" id - state.Vm.memory_target ; - Db.VM.set_memory_target ~__context ~self ~value:state.memory_target - ) - info + debug "xenopsd event: Updating VM %s memory_target <- %Ld" id + memory_target ; + Db.VM.set_memory_target ~__context ~self ~value:memory_target with e -> error "Caught %s: while updating VM %s consoles" (Printexc.to_string e) id - ) ; - ( if different (fun x -> x.rtc_timeoffset) then + ) ; + different + (fun x -> x.rtc_timeoffset) + Fun.id + (fun rtc_timeoffset -> try - Option.iter - (fun (_, state) -> - if state.Vm.rtc_timeoffset <> "" then ( - debug "xenopsd event: Updating VM %s platform:timeoffset <- %s" id - 
state.rtc_timeoffset ; - ( try - Db.VM.remove_from_platform ~__context ~self - ~key:Vm_platform.timeoffset - with _ -> () - ) ; - Db.VM.add_to_platform ~__context ~self ~key:Vm_platform.timeoffset - ~value:state.rtc_timeoffset - ) - ) - info + if rtc_timeoffset <> "" then ( + debug "xenopsd event: Updating VM %s platform:timeoffset <- %s" id + rtc_timeoffset ; + ( try + Db.VM.remove_from_platform ~__context ~self + ~key:Vm_platform.timeoffset + with _ -> () + ) ; + Db.VM.add_to_platform ~__context ~self ~key:Vm_platform.timeoffset + ~value:rtc_timeoffset + ) with e -> error "Caught %s: while updating VM %s rtc/timeoffset" (Printexc.to_string e) id - ) ; - if different (fun x -> x.hvm) then - Option.iter - (fun (_, state) -> - let metrics = Db.VM.get_metrics ~__context ~self in - debug "xenopsd event: Updating VM %s hvm <- %s" id - (string_of_bool state.Vm.hvm) ; - Db.VM_metrics.set_hvm ~__context ~self:metrics ~value:state.Vm.hvm - ) - info ; - if different (fun x -> x.nomigrate) then - Option.iter - (fun (_, state) -> - let metrics = Db.VM.get_metrics ~__context ~self in - debug "xenopsd event: Updating VM %s nomigrate <- %s" id - (string_of_bool state.Vm.nomigrate) ; - Db.VM_metrics.set_nomigrate ~__context ~self:metrics - ~value:state.Vm.nomigrate - ) - info ; - if different (fun x -> x.nested_virt) then - Option.iter - (fun (_, state) -> - let metrics = Db.VM.get_metrics ~__context ~self in - debug "xenopsd event: Updating VM %s nested_virt <- %s" id - (string_of_bool state.Vm.nested_virt) ; - Db.VM_metrics.set_nested_virt ~__context ~self:metrics - ~value:state.Vm.nested_virt - ) - info ; - let update_pv_drivers_detected () = - Option.iter - (fun (_, state) -> - try - let gm = Db.VM.get_guest_metrics ~__context ~self in - debug "xenopsd event: Updating VM %s PV drivers detected %b" id - state.Vm.pv_drivers_detected ; - Db.VM_guest_metrics.set_PV_drivers_detected ~__context ~self:gm - ~value:state.Vm.pv_drivers_detected ; - Db.VM_guest_metrics.set_PV_drivers_up_to_date ~__context ~self:gm - ~value:state.Vm.pv_drivers_detected - with e -> - debug "Caught %s: while updating VM %s PV drivers" - (Printexc.to_string e) id - ) - info - in + ) ; + different + (fun x -> x.hvm) + Fun.id + (fun hvm -> + let metrics = Db.VM.get_metrics ~__context ~self in + debug "xenopsd event: Updating VM %s hvm <- %s" id (string_of_bool hvm) ; + Db.VM_metrics.set_hvm ~__context ~self:metrics ~value:hvm + ) ; + different + (fun x -> x.nomigrate) + Fun.id + (fun nomigrate -> + let metrics = Db.VM.get_metrics ~__context ~self in + debug "xenopsd event: Updating VM %s nomigrate <- %s" id + (string_of_bool nomigrate) ; + Db.VM_metrics.set_nomigrate ~__context ~self:metrics ~value:nomigrate + ) ; + different + (fun x -> x.nested_virt) + Fun.id + (fun nested_virt -> + let metrics = Db.VM.get_metrics ~__context ~self in + debug "xenopsd event: Updating VM %s nested_virt <- %s" id + (string_of_bool nested_virt) ; + Db.VM_metrics.set_nested_virt ~__context ~self:metrics ~value:nested_virt + ) ; (* Chack last_start_time before updating anything in the guest metrics *) - ( if different (fun x -> x.last_start_time) then + different + (fun x -> x.last_start_time) + Fun.id + (fun last_start_time -> try - Option.iter - (fun (_, state) -> - let metrics = Db.VM.get_metrics ~__context ~self in - (* Clamp time to full seconds, stored timestamps do not - have decimals *) - let start_time = - Float.floor state.Vm.last_start_time |> Date.of_unix_time - in - let expected_time = - Db.VM_metrics.get_start_time ~__context ~self:metrics 
- in - if Date.is_later ~than:expected_time start_time then ( - debug "xenopsd event: Updating VM %s last_start_time <- %s" id - Date.(to_rfc3339 (of_unix_time state.Vm.last_start_time)) ; - Db.VM_metrics.set_start_time ~__context ~self:metrics - ~value:start_time ; - if - (* VM start and VM reboot *) - power_state = `Running - && power_state_before_update <> `Suspended - then ( - Xapi_vm_lifecycle.remove_pending_guidance ~__context ~self - ~value:`restart_device_model ; - Xapi_vm_lifecycle.remove_pending_guidance ~__context ~self - ~value:`restart_vm - ) - ) ; - create_guest_metrics_if_needed () ; - let gm = Db.VM.get_guest_metrics ~__context ~self in - let update_time = - Db.VM_guest_metrics.get_last_updated ~__context ~self:gm - in - if update_time < start_time then ( - debug - "VM %s guest metrics update time (%s) < VM start time (%s): \ - deleting" - id - (Date.to_rfc3339 update_time) - (Date.to_rfc3339 start_time) ; - Xapi_vm_helpers.delete_guest_metrics ~__context ~self ; - check_guest_agent () - ) + let metrics = Db.VM.get_metrics ~__context ~self in + (* Clamp time to full seconds, stored timestamps do not + have decimals *) + let start_time = Float.floor last_start_time |> Date.of_unix_time in + let expected_time = + Db.VM_metrics.get_start_time ~__context ~self:metrics + in + if Date.is_later ~than:expected_time start_time then ( + debug "xenopsd event: Updating VM %s last_start_time <- %s" id + Date.(to_rfc3339 (of_unix_time last_start_time)) ; + Db.VM_metrics.set_start_time ~__context ~self:metrics + ~value:start_time ; + if + (* VM start and VM reboot *) + power_state = `Running && power_state_before_update <> `Suspended + then ( + Xapi_vm_lifecycle.remove_pending_guidance ~__context ~self + ~value:`restart_device_model ; + Xapi_vm_lifecycle.remove_pending_guidance ~__context ~self + ~value:`restart_vm ) - info + ) ; + create_guest_metrics_if_needed () ; + let gm = Db.VM.get_guest_metrics ~__context ~self in + let update_time = + Db.VM_guest_metrics.get_last_updated ~__context ~self:gm + in + if update_time < start_time then ( + debug + "VM %s guest metrics update time (%s) < VM start time (%s): \ + deleting" + id + (Date.to_rfc3339 update_time) + (Date.to_rfc3339 start_time) ; + Xapi_vm_helpers.delete_guest_metrics ~__context ~self ; + check_guest_agent () + ) with e -> error "Caught %s: while updating VM %s last_start_time" (Printexc.to_string e) id - ) ; + ) ; Option.iter - (fun (_, state) -> + (fun state -> List.iter (fun domid -> (* Guest metrics could have been destroyed during the last_start_time check by recreating them, we avoid CA-223387 *) create_guest_metrics_if_needed () ; - if different (fun x -> x.Vm.uncooperative_balloon_driver) then - debug - "xenopsd event: VM %s domid %d uncooperative_balloon_driver = %b" - id domid state.Vm.uncooperative_balloon_driver ; - if different (fun x -> x.Vm.guest_agent) then - check_guest_agent () ; - if different (fun x -> x.Vm.pv_drivers_detected) then - update_pv_drivers_detected () ; - ( if different (fun x -> x.Vm.xsdata_state) then + different + (fun x -> x.Vm.uncooperative_balloon_driver) + Fun.id + (fun uncooperative_balloon_driver -> + debug + "xenopsd event: VM %s domid %d uncooperative_balloon_driver = \ + %b" + id domid uncooperative_balloon_driver + ) ; + different + (fun x -> x.Vm.guest_agent) + Fun.id + (fun _ -> check_guest_agent ()) ; + different + (fun x -> x.Vm.pv_drivers_detected) + Fun.id + (fun pv_drivers_detected -> + try + let gm = Db.VM.get_guest_metrics ~__context ~self in + debug "xenopsd event: 
Updating VM %s PV drivers detected %b" id + pv_drivers_detected ; + Db.VM_guest_metrics.set_PV_drivers_detected ~__context ~self:gm + ~value:pv_drivers_detected ; + Db.VM_guest_metrics.set_PV_drivers_up_to_date ~__context + ~self:gm ~value:pv_drivers_detected + with e -> + debug "Caught %s: while updating VM %s PV drivers" + (Printexc.to_string e) id + ) ; + different + (fun x -> x.Vm.xsdata_state) + Fun.id + (fun xsdata_state -> try debug "xenopsd event: Updating VM %s domid %d xsdata" id domid ; - Db.VM.set_xenstore_data ~__context ~self - ~value:state.Vm.xsdata_state + Db.VM.set_xenstore_data ~__context ~self ~value:xsdata_state with e -> error "Caught %s: while updating VM %s xsdata" (Printexc.to_string e) id - ) ; - if different (fun x -> x.Vm.memory_target) then - try - debug "xenopsd event: Updating VM %s domid %d memory target" id - domid ; - Rrdd.update_vm_memory_target domid state.Vm.memory_target - with e -> - error "Caught %s: while updating VM %s memory_target" - (Printexc.to_string e) id + ) ; + different + (fun x -> x.Vm.memory_target) + Fun.id + (fun memory_target -> + try + debug "xenopsd event: Updating VM %s domid %d memory target" id + domid ; + Rrdd.update_vm_memory_target domid memory_target + with e -> + error "Caught %s: while updating VM %s memory_target" + (Printexc.to_string e) id + ) ) state.Vm.domids ) info ; - if different (fun x -> x.Vm.vcpu_target) then - Option.iter - (fun (_, state) -> - try - debug "xenopsd event: Updating VM %s vcpu_target <- %d" id - state.Vm.vcpu_target ; - let metrics = Db.VM.get_metrics ~__context ~self in - Db.VM_metrics.set_VCPUs_number ~__context ~self:metrics - ~value:(Int64.of_int state.Vm.vcpu_target) - with e -> - error "Caught %s: while updating VM %s VCPUs_number" - (Printexc.to_string e) id - ) - info ; - ( if different (fun x -> x.shadow_multiplier_target) then + different + (fun x -> x.Vm.vcpu_target) + Fun.id + (fun vcpu_target -> try - Option.iter - (fun (_, state) -> - debug "xenopsd event: Updating VM %s shadow_multiplier <- %.2f" id - state.Vm.shadow_multiplier_target ; - if - state.Vm.power_state <> Halted - && state.Vm.shadow_multiplier_target >= 0.0 - then - Db.VM.set_HVM_shadow_multiplier ~__context ~self - ~value:state.Vm.shadow_multiplier_target - ) - info + debug "xenopsd event: Updating VM %s vcpu_target <- %d" id vcpu_target ; + let metrics = Db.VM.get_metrics ~__context ~self in + Db.VM_metrics.set_VCPUs_number ~__context ~self:metrics + ~value:(Int64.of_int vcpu_target) + with e -> + error "Caught %s: while updating VM %s VCPUs_number" + (Printexc.to_string e) id + ) ; + different + (fun x -> x.shadow_multiplier_target) + Fun.id + (fun shadow_multiplier_target -> + try + debug "xenopsd event: Updating VM %s shadow_multiplier <- %.2f" id + shadow_multiplier_target ; + if power_state <> `Halted && shadow_multiplier_target >= 0.0 then + Db.VM.set_HVM_shadow_multiplier ~__context ~self + ~value:shadow_multiplier_target with e -> error "Caught %s: while updating VM %s HVM_shadow_multiplier" (Printexc.to_string e) id - ) ; + ) ; (* Preserve last_boot_CPU_flags when suspending (see current_domain_type) *) - if different (fun x -> x.Vm.featureset) && power_state <> `Suspended then - Option.iter - (fun (_, state) -> - try - debug "xenopsd event: Updating VM %s last_boot_CPU_flags <- %s" id - state.Vm.featureset ; - let vendor = - Db.Host.get_cpu_info ~__context ~self:localhost - |> List.assoc Constants.cpu_info_vendor_key - in - let value = - [ - (Constants.cpu_info_vendor_key, vendor) - ; 
(Constants.cpu_info_features_key, state.Vm.featureset) - ] - in - Db.VM.set_last_boot_CPU_flags ~__context ~self ~value - with e -> - error "Caught %s: while updating VM %s last_boot_CPU_flags" - (Printexc.to_string e) id - ) - info ; - Xenops_cache.update_vm id (Option.map snd info) ; + different + (fun x -> x.Vm.featureset) + (( && ) (power_state <> `Suspended)) + (fun featureset -> + try + debug "xenopsd event: Updating VM %s last_boot_CPU_flags <- %s" id + featureset ; + let vendor = + Db.Host.get_cpu_info ~__context ~self:localhost + |> List.assoc Constants.cpu_info_vendor_key + in + let value = + [ + (Constants.cpu_info_vendor_key, vendor) + ; (Constants.cpu_info_features_key, featureset) + ] + in + Db.VM.set_last_boot_CPU_flags ~__context ~self ~value + with e -> + error "Caught %s: while updating VM %s last_boot_CPU_flags" + (Printexc.to_string e) id + ) ; + Xenops_cache.update_vm id info ; if !should_update_allowed_operations then Helpers.call_api_functions ~__context (fun rpc session_id -> XenAPI.VM.update_allowed_operations ~rpc ~session_id ~self @@ -2408,8 +2410,8 @@ let update_vm ~__context id = let module Client = (val make_client (queue_of_vm ~__context ~self) : XENOPS) in - let info = try Some (Client.VM.stat dbg id) with _ -> None in - if Option.map snd info <> previous then + let info = try Some (snd (Client.VM.stat dbg id)) with _ -> None in + if info <> previous then update_vm_internal ~__context ~id ~self ~previous ~info ~localhost with e -> error From ba144c2f46942b91b13e4d64aabc53887030310e Mon Sep 17 00:00:00 2001 From: Gabriel Buica Date: Thu, 5 Jun 2025 11:04:14 +0100 Subject: [PATCH 045/111] CP-308253: Instrument `Consumers` Spans in `Message-switch`. Instruments `process`/`Consumers` spans of message-switch service in xenopsd. Signed-off-by: Gabriel Buica --- ocaml/xenopsd/lib/xenopsd.ml | 83 +++++++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 19 deletions(-) diff --git a/ocaml/xenopsd/lib/xenopsd.ml b/ocaml/xenopsd/lib/xenopsd.ml index ccacea0ed8b..5ad6401730b 100644 --- a/ocaml/xenopsd/lib/xenopsd.ml +++ b/ocaml/xenopsd/lib/xenopsd.ml @@ -300,29 +300,74 @@ let json_path () = path () ^ ".json" let rpc_fn call = (* Upgrade import_metadata API call *) - let call' = + let call', call_name, span_parent = match (call.Rpc.name, call.Rpc.params) with - | "VM.import_metadata", [debug_info; metadata] -> + | ("VM.import_metadata" as call_name), [Rpc.String debug_info; metadata] -> debug "Upgrading VM.import_metadata" ; - Rpc. - { - name= "VM.import_metadata" - ; params= - [Rpc.Dict [("debug_info", debug_info); ("metadata", metadata)]] - ; is_notification= false - } - | "query", [debug_info; unit_p] -> + let span_parent = + let di = debug_info |> Debug_info.of_string in + di.tracing + in + ( Rpc. + { + name= "VM.import_metadata" + ; params= + [ + Rpc.Dict + [ + ("debug_info", Rpc.String debug_info) + ; ("metadata", metadata) + ] + ] + ; is_notification= false + } + , call_name + , span_parent + ) + | ("query" as call_name), [Rpc.String debug_info; unit_p] -> debug "Upgrading query" ; - Rpc. - { - name= "query" - ; params= [Rpc.Dict [("debug_info", debug_info); ("unit", unit_p)]] - ; is_notification= false - } - | _ -> - call + let span_parent = + let di = debug_info |> Debug_info.of_string in + di.tracing + in + ( Rpc. 
+          {
+            name= "query"
+          ; params=
+              [
+                Rpc.Dict
+                  [("debug_info", Rpc.String debug_info); ("unit", unit_p)]
+              ]
+          ; is_notification= false
+          }
+      , call_name
+      , span_parent
+      )
+  | call_name, [Rpc.Dict kv_list] ->
+      let span_parent =
+        kv_list
+        |> List.find_map (function
+             | "debug_info", Rpc.String debug_info ->
+                 let di = debug_info |> Debug_info.of_string in
+                 di.tracing
+             | _ ->
+                 None
+             )
+      in
+      (call, call_name, span_parent)
+  | call_name, _ ->
+      (call, call_name, None)
   in
-  Idl.Exn.server Xenops_server.Server.implementation call'
+  Tracing.with_tracing
+    ~attributes:
+      [
+        ("messaging.operation.name", "process")
+      ; ("messaging.system", "message-switch")
+      ; ("messaging.destination.name", !Xenops_interface.queue_name)
+      ]
+    ~span_kind:Tracing.SpanKind.Consumer ~parent:span_parent
+    ~name:("process" ^ " " ^ call_name)
+  @@ fun _ -> Idl.Exn.server Xenops_server.Server.implementation call'

 let handle_received_fd this_connection =
   let msg_size = 16384 in

From 7a49235a7cae690a93837100e81a25263b7ce97b Mon Sep 17 00:00:00 2001
From: Gabriel Buica
Date: Thu, 5 Jun 2025 17:51:59 +0100
Subject: [PATCH 046/111] CP-50001: Instrument `xapi_xenops.ml` -- `dbg` carrier

Instruments functions in `xapi_xenops.ml` that carry the
traceparent/tracecontext through dbg.

`Debug_info.with_dbg` now accepts setting up attributes for spans.

Also, instruments `Events_from_xenopsd` to capture the event spans:
`subscribe`/`settle`.
It's not straightforward to link them on the same trace.
For now, the only way they are connected is by having the same
`message.id` attribute.

Signed-off-by: Gabriel Buica
---
 ocaml/xapi-idl/lib/debug_info.ml  |  5 +++--
 ocaml/xapi-idl/lib/debug_info.mli |  3 ++-
 ocaml/xapi/xapi_xenops.ml         | 31 +++++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/ocaml/xapi-idl/lib/debug_info.ml b/ocaml/xapi-idl/lib/debug_info.ml
index edf3c4979a8..e3845fa080d 100644
--- a/ocaml/xapi-idl/lib/debug_info.ml
+++ b/ocaml/xapi-idl/lib/debug_info.ml
@@ -76,13 +76,14 @@ let to_log_string t = t.log

 (* Sets the logging context based on `dbg`. Also adds a new tracing span,
    linked to the parent span from `dbg`, if available. *)
-let with_dbg ?(with_thread = false) ?(module_name = "") ~name ~dbg f =
+let with_dbg ?attributes ?(with_thread = false) ?(module_name = "") ~name ~dbg f
+  =
   let di = of_string dbg in
   let f_with_trace () =
    let name =
      match module_name with "" -> name | _ -> module_name ^ "." 
^ name in - Tracing.with_tracing ~parent:di.tracing ~name (fun span -> + Tracing.with_tracing ?attributes ~parent:di.tracing ~name (fun span -> match span with Some _ -> f {di with tracing= span} | None -> f di ) in diff --git a/ocaml/xapi-idl/lib/debug_info.mli b/ocaml/xapi-idl/lib/debug_info.mli index 9db63471035..2b0244ac94a 100644 --- a/ocaml/xapi-idl/lib/debug_info.mli +++ b/ocaml/xapi-idl/lib/debug_info.mli @@ -23,7 +23,8 @@ val to_string : t -> string val to_log_string : t -> string val with_dbg : - ?with_thread:bool + ?attributes:(string * string) list + -> ?with_thread:bool -> ?module_name:string -> name:string -> dbg:string diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index dbefffb1571..39629e75aee 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -48,6 +48,8 @@ let check_power_state_is ~__context ~self ~expected = (Record_util.vm_power_state_to_lowercase_string expected) let event_wait queue_name dbg ?from p = + Debug_info.with_dbg ~name:__FUNCTION__ ~dbg @@ fun di -> + let dbg = Debug_info.to_string di in let finished = ref false in let event_id = ref from in let module Client = (val make_client queue_name : XENOPS) in @@ -58,6 +60,8 @@ let event_wait queue_name dbg ?from p = done let task_ended queue_name dbg id = + Debug_info.with_dbg ~name:__FUNCTION__ ~dbg @@ fun di -> + let dbg = Debug_info.to_string di in let module Client = (val make_client queue_name : XENOPS) in match (Client.TASK.stat dbg id).Task.state with | Task.Completed _ | Task.Failed _ -> @@ -66,6 +70,8 @@ let task_ended queue_name dbg id = false let wait_for_task queue_name dbg id = + Debug_info.with_dbg ~name:__FUNCTION__ ~dbg @@ fun di -> + let dbg = Debug_info.to_string di in let module Client = (val make_client queue_name : XENOPS) in let finished = function | Dynamic.Task id' -> @@ -1419,6 +1425,8 @@ let id_of_vm ~__context ~self = Db.VM.get_uuid ~__context ~self let vm_of_id ~__context uuid = Db.VM.get_by_uuid ~__context ~uuid let vm_exists_in_xenopsd queue_name dbg id = + Debug_info.with_dbg ~name:__FUNCTION__ ~dbg @@ fun di -> + let dbg = Debug_info.to_string di in let module Client = (val make_client queue_name : XENOPS) in Client.VM.exists dbg id @@ -1793,6 +1801,18 @@ module Events_from_xenopsd = struct let module Client = (val make_client queue_name : XENOPS) in let t = make () in let id = register t in + Debug_info.with_dbg + ~attributes: + [ + ("messaging.operation.name", "subscribe") + ; ("messaging.system", "event") + ; ("messaging.destination.subscription.name", vm_id) + ; ("messaging.message.id", string_of_int id) + ] + ~name:("subscribe" ^ " " ^ queue_name) + ~dbg + @@ fun di -> + let dbg = Debug_info.to_string di in debug "Client.UPDATES.inject_barrier %d" id ; Client.UPDATES.inject_barrier dbg vm_id id ; with_lock t.m (fun () -> @@ -1802,6 +1822,17 @@ module Events_from_xenopsd = struct ) let wakeup queue_name dbg id = + Debug_info.with_dbg + ~attributes: + [ + ("messaging.operation.name", "settle") + ; ("messaging.system", "event") + ; ("messaging.message.id", string_of_int id) + ] + ~name:("settle" ^ " " ^ queue_name) + ~dbg + @@ fun di -> + let dbg = Debug_info.to_string di in let module Client = (val make_client queue_name : XENOPS) in Client.UPDATES.remove_barrier dbg id ; let t = From d9a3268152a624e0733397180da39c02aae6bed7 Mon Sep 17 00:00:00 2001 From: Gabriel Buica Date: Fri, 6 Jun 2025 10:08:36 +0100 Subject: [PATCH 047/111] CP-50001: Instrument `xapi_xenops.ml` -- `context` carrier Intruments the functions in `xapi_xenops.ml` that 
carry the traceparent/tracecontext through `context`. Signed-off-by: Gabriel Buica --- ocaml/xapi/xapi_xenops.ml | 94 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index 39629e75aee..70102faae44 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -33,7 +33,10 @@ let finally = Xapi_stdext_pervasives.Pervasiveext.finally let rpc_of t x = Rpcmarshal.marshal t.Rpc.Types.ty x +let ( let@ ) f x = f x + let check_power_state_is ~__context ~self ~expected = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in if expected <> `Running then Xapi_vm_lifecycle.assert_final_power_state_is ~__context ~self ~expected else @@ -112,6 +115,7 @@ let xenops_vdi_locator_of sr vdi = (Storage_interface.Vdi.string_of vdi) let xenops_vdi_locator ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let sr = Db.VDI.get_SR ~__context ~self in let sr_uuid = Db.SR.get_uuid ~__context ~self:sr in let vdi_location = Db.VDI.get_location ~__context ~self in @@ -120,9 +124,11 @@ let xenops_vdi_locator ~__context ~self = (Storage_interface.Vdi.of_string vdi_location) let disk_of_vdi ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in try Some (VDI (xenops_vdi_locator ~__context ~self)) with _ -> None let vdi_of_disk ~__context x = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in match String.split ~limit:2 '/' x with | [sr_uuid; location] -> ( let open Xapi_database.Db_filter_types in @@ -157,6 +163,7 @@ let backend_of_network net = (* PR-1255 *) let backend_of_vif ~__context ~vif = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vif_record = Db.VIF.get_record_internal ~__context ~self:vif in let net = Db.Network.get_record ~__context ~self:vif_record.Db_actions.vIF_network @@ -261,6 +268,7 @@ let firmware_of_vm vm = default_firmware let varstore_rm_with_sandbox ~__context ~vm_uuid f = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let dbg = Context.string_of_task_and_tracing __context in let domid = 0 in let chroot, socket_path = @@ -271,6 +279,7 @@ let varstore_rm_with_sandbox ~__context ~vm_uuid f = (fun () -> Xenops_sandbox.Varstore_guard.stop dbg ~domid ~vm_uuid) let nvram_post_clone ~__context ~self ~uuid = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in match Db.VM.get_NVRAM ~__context ~self with | [] -> () @@ -298,6 +307,7 @@ let nvram_post_clone ~__context ~self ~uuid = debug "VM %s: NVRAM changed due to clone" uuid let rtc_timeoffset_of_vm ~__context (vm, vm_t) vbds = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let timeoffset = string vm_t.API.vM_platform "0" Vm_platform.timeoffset in (* If any VDI has on_boot = reset AND has a VDI.other_config:timeoffset then we override the platform/timeoffset. 
This is needed because windows @@ -371,6 +381,7 @@ let kernel_path filename = Ok real_path let builder_of_vm ~__context (vmref, vm) timeoffset pci_passthrough vgpu = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let open Vm in let video_mode = if vgpu then @@ -531,6 +542,7 @@ let builder_of_vm ~__context (vmref, vm) timeoffset pci_passthrough vgpu = Helpers.internal_error "invalid boot configuration" let list_net_sriov_vf_pcis ~__context ~vm = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in vm.API.vM_VIFs |> List.filter (fun self -> Db.VIF.get_currently_attached ~__context ~self) |> List.filter_map (fun vif -> @@ -545,6 +557,7 @@ module MD = struct (** Convert between xapi DB records and xenopsd records *) let of_vbd ~__context ~vm ~vbd = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let hvm = match vm.API.vM_domain_type with | `hvm -> @@ -697,6 +710,7 @@ module MD = struct } let of_pvs_proxy ~__context vif proxy = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let site = Db.PVS_proxy.get_site ~__context ~self:proxy in let site_uuid = Db.PVS_site.get_uuid ~__context ~self:site in let servers = Db.PVS_site.get_servers ~__context ~self:site in @@ -716,6 +730,7 @@ module MD = struct (site_uuid, servers, interface) let of_vif ~__context ~vm ~vif:(vif_ref, vif) = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let net = Db.Network.get_record ~__context ~self:vif.API.vIF_network in let net_mtu = Int64.to_int net.API.network_MTU in let mtu = @@ -859,6 +874,7 @@ module MD = struct } let pcis_of_vm ~__context (vmref, vm) = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vgpu_pcidevs = Vgpuops.list_pcis_for_passthrough ~__context ~vm:vmref in let devs = List.concat_map (fun (_, dev) -> dev) (Pciops.sort_pcidevs vgpu_pcidevs) @@ -889,6 +905,7 @@ module MD = struct devs let get_target_pci_address ~__context vgpu = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let pgpu = if Db.is_valid_ref __context @@ -917,6 +934,7 @@ module MD = struct * is passed trough completely. *) let sriov_vf ~__context vgpu = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let is_sriov () = let ty = vgpu.Db_actions.vGPU_type in match Db.VGPU_type.get_implementation ~__context ~self:ty with @@ -937,6 +955,7 @@ module MD = struct Xenops_interface.Pci.address_of_string str |> fun addr -> Some addr let of_nvidia_vgpu ~__context vm vgpu = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let open Vgpu in (* Get the PCI address. *) let physical_pci_address = get_target_pci_address ~__context vgpu in @@ -973,6 +992,7 @@ module MD = struct } let of_gvt_g_vgpu ~__context vm vgpu = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let open Vgpu in (* Get the PCI address. *) let physical_pci_address = get_target_pci_address ~__context vgpu in @@ -1013,6 +1033,7 @@ module MD = struct failwith "Intel GVT-g settings invalid" let of_mxgpu_vgpu ~__context vm vgpu = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let open Vgpu in (* Get the PCI address. 
*) let physical_pci_address = get_target_pci_address ~__context vgpu in @@ -1049,6 +1070,7 @@ module MD = struct failwith "AMD MxGPU settings invalid" let vgpus_of_vm ~__context (_, vm) = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in List.fold_left (fun acc vgpu -> let vgpu_record = Db.VGPU.get_record_internal ~__context ~self:vgpu in @@ -1070,6 +1092,7 @@ module MD = struct [] vm.API.vM_VGPUs let of_vusb ~__context ~vm ~pusb = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let open Vusb in try let path = pusb.API.pUSB_path in @@ -1093,6 +1116,7 @@ module MD = struct raise e let vusbs_of_vm ~__context (_, vm) = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in vm.API.vM_VUSBs |> List.map (fun self -> Db.VUSB.get_record ~__context ~self) |> List.filter (fun self -> self.API.vUSB_currently_attached) @@ -1102,6 +1126,7 @@ module MD = struct |> List.map (fun pusb -> of_vusb ~__context ~vm ~pusb) let of_vm ~__context (vmref, vm) vbds pci_passthrough vgpu = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let on_action_behaviour = function | `preserve -> [Vm.Pause] @@ -1357,6 +1382,7 @@ module Guest_agent_features = struct auto_update_enabled @ auto_update_url let of_config ~__context config = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let open Features in let vss = let name = Features.name_of_feature VSS in @@ -1376,6 +1402,7 @@ module Guest_agent_features = struct end let apply_guest_agent_config ~__context config = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let dbg = Context.string_of_task_and_tracing __context in let features = Guest_agent_features.of_config ~__context config in let module Client = (val make_client (default_xenopsd ()) : XENOPS) in @@ -1383,6 +1410,7 @@ let apply_guest_agent_config ~__context config = (* Create an instance of Metadata.t, suitable for uploading to the xenops service *) let create_metadata ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VM.get_record ~__context ~self in let vbds = List.filter @@ -1635,6 +1663,7 @@ module Xenopsd_metadata = struct (* If the VM has Xapi_globs.persist_xenopsd_md -> filename in its other_config, we persist the xenopsd metadata to a well-known location in the filesystem *) let maybe_persist_md ~__context ~self md = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let oc = Db.VM.get_other_config ~__context ~self in if List.mem_assoc Xapi_globs.persist_xenopsd_md oc then let file_path = @@ -1655,6 +1684,7 @@ module Xenopsd_metadata = struct ) let push ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in with_lock metadata_m (fun () -> let md = create_metadata ~__context ~self in let txt = md |> rpc_of Metadata.t |> Jsonrpc.to_string in @@ -1671,6 +1701,7 @@ module Xenopsd_metadata = struct ) let delete_nolock ~__context id = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let dbg = Context.string_of_task_and_tracing __context in info "xenops: VM.remove %s" id ; try @@ -1695,6 +1726,7 @@ module Xenopsd_metadata = struct (* Unregisters a VM with xenopsd, and cleans up metadata and caches *) let pull ~__context id = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in with_lock metadata_m (fun () -> info "xenops: VM.export_metadata %s" id ; let dbg = Context.string_of_task_and_tracing __context in @@ -1725,9 +1757,11 @@ module Xenopsd_metadata = struct ) let delete ~__context id = + 
let@ __context = Context.with_tracing ~__context __FUNCTION__ in with_lock metadata_m (fun () -> delete_nolock ~__context id) let update ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let id = id_of_vm ~__context ~self in let queue_name = queue_of_vm ~__context ~self in with_lock metadata_m (fun () -> @@ -2427,6 +2461,7 @@ let update_vm_internal ~__context ~id ~self ~previous ~info ~localhost = ) let update_vm ~__context id = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in try if Events_from_xenopsd.are_suppressed id then debug "xenopsd event: ignoring event for VM (VM %s migrating away)" id @@ -2449,6 +2484,7 @@ let update_vm ~__context id = (string_of_exn e) let update_vbd ~__context (id : string * string) = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in try if Events_from_xenopsd.are_suppressed (fst id) then debug "xenopsd event: ignoring event for VBD (VM %s migrating away)" @@ -2551,6 +2587,7 @@ let update_vbd ~__context (id : string * string) = error "xenopsd event: Caught %s while updating VBD" (string_of_exn e) let update_vif ~__context id = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in try if Events_from_xenopsd.are_suppressed (fst id) then debug "xenopsd event: ignoring event for VIF (VM %s migrating away)" @@ -2659,6 +2696,7 @@ let update_vif ~__context id = error "xenopsd event: Caught %s while updating VIF" (string_of_exn e) let update_pci ~__context id = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in try if Events_from_xenopsd.are_suppressed (fst id) then debug "xenopsd event: ignoring event for PCI (VM %s migrating away)" @@ -2727,6 +2765,7 @@ let update_pci ~__context id = error "xenopsd event: Caught %s while updating PCI" (string_of_exn e) let update_vgpu ~__context id = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in try if Events_from_xenopsd.are_suppressed (fst id) then debug "xenopsd event: ignoring event for VGPU (VM %s migrating away)" @@ -2791,6 +2830,7 @@ let update_vgpu ~__context id = error "xenopsd event: Caught %s while updating VGPU" (string_of_exn e) let update_vusb ~__context (id : string * string) = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in try if Events_from_xenopsd.are_suppressed (fst id) then debug "xenopsd event: ignoring event for VUSB (VM %s migrating away)" @@ -2846,14 +2886,17 @@ let unwrap x = raise Not_a_xenops_task let register_task __context ?cancellable queue_name id = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in TaskHelper.register_task __context ?cancellable (wrap queue_name id) ; id let unregister_task __context queue_name id = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in TaskHelper.unregister_task __context (wrap queue_name id) ; id let update_task ~__context queue_name id = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in try let self = TaskHelper.id_to_task_exn (TaskHelper.Xenops (queue_name, id)) in (* throws Not_found *) @@ -2887,6 +2930,7 @@ let update_task ~__context queue_name id = error "xenopsd event: Caught %s while updating task" (string_of_exn e) let rec events_watch ~__context cancel queue_name from = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let dbg = Context.string_of_task_and_tracing __context in if Xapi_fist.delay_xenopsd_event_threads () then Thread.delay 30.0 ; let module Client = (val make_client queue_name : XENOPS) in @@ -2954,6 +2998,7 @@ let events_from_xenopsd 
queue_name = ) let refresh_vm ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let id = id_of_vm ~__context ~self in info "xenops: UPDATES.refresh_vm %s" id ; let dbg = Context.string_of_task_and_tracing __context in @@ -2963,6 +3008,7 @@ let refresh_vm ~__context ~self = Events_from_xenopsd.wait queue_name dbg id () let resync_resident_on ~__context = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let dbg = Context.string_of_task_and_tracing __context in let localhost = Helpers.get_localhost ~__context in let domain0 = Helpers.get_domain_zero ~__context in @@ -3105,6 +3151,7 @@ let resync_resident_on ~__context = xapi_vms_not_in_xenopsd let resync_all_vms ~__context = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in (* This should now be correct *) let localhost = Helpers.get_localhost ~__context in let domain0 = Helpers.get_domain_zero ~__context in @@ -3116,11 +3163,13 @@ let resync_all_vms ~__context = (* experimental feature for hard-pinning vcpus *) let hard_numa_enabled ~__context = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let pool = Helpers.get_pool ~__context in let restrictions = Db.Pool.get_restrictions ~__context ~self:pool in List.assoc_opt "restrict_hard_numa" restrictions = Some "false" let set_numa_affinity_policy ~__context ~value = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let dbg = Context.string_of_task __context in let open Xapi_xenops_queue in let module Client = (val make_client (default_xenopsd ()) : XENOPS) in @@ -3139,6 +3188,7 @@ let set_numa_affinity_policy ~__context ~value = Client.HOST.set_numa_affinity_policy dbg value let on_xapi_restart ~__context = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let host = Helpers.get_localhost ~__context in let value = Db.Host.get_numa_affinity_policy ~__context ~self:host in info "Setting NUMA affinity policy in xenopsd on startup to %s" @@ -3162,6 +3212,7 @@ let on_xapi_restart ~__context = apply_guest_agent_config ~__context config let assert_resident_on ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let localhost = Helpers.get_localhost ~__context in if not (Db.VM.get_resident_on ~__context ~self = localhost) then Helpers.internal_error "the VM %s is not resident on this host" @@ -3494,6 +3545,7 @@ let transform_xenops_exn ~__context ~vm queue_name f = should not be any other suppression going on. 
*) let set_resident_on ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let id = id_of_vm ~__context ~self in debug "VM %s set_resident_on" id ; let localhost = Helpers.get_localhost ~__context in @@ -3508,6 +3560,7 @@ let set_resident_on ~__context ~self = Xenopsd_metadata.update ~__context ~self let update_debug_info __context t = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let task = Context.get_task_id __context in let debug_info = List.map (fun (k, v) -> ("debug_info:" ^ k, v)) t.Task.debug_info @@ -3522,6 +3575,7 @@ let update_debug_info __context t = debug_info let sync_with_task_result __context ?cancellable queue_name x = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let dbg = Context.string_of_task_and_tracing __context in x |> register_task __context ?cancellable queue_name @@ -3533,6 +3587,7 @@ let sync_with_task __context ?cancellable queue_name x = sync_with_task_result __context ?cancellable queue_name x |> ignore let sync __context queue_name x = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let dbg = Context.string_of_task_and_tracing __context in x |> wait_for_task queue_name dbg @@ -3540,6 +3595,7 @@ let sync __context queue_name x = |> ignore let pause ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> let id = id_of_vm ~__context ~self in @@ -3553,6 +3609,7 @@ let pause ~__context ~self = ) let unpause ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> let id = id_of_vm ~__context ~self in @@ -3565,6 +3622,7 @@ let unpause ~__context ~self = ) let request_rdp ~__context ~self enabled = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> let id = id_of_vm ~__context ~self in @@ -3577,6 +3635,7 @@ let request_rdp ~__context ~self enabled = ) let run_script ~__context ~self script = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> let id = id_of_vm ~__context ~self in @@ -3593,6 +3652,7 @@ let run_script ~__context ~self script = ) let set_xenstore_data ~__context ~self xsdata = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> let id = id_of_vm ~__context ~self in @@ -3604,6 +3664,7 @@ let set_xenstore_data ~__context ~self xsdata = ) let set_vcpus ~__context ~self n = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> let id = id_of_vm ~__context ~self in @@ -3631,6 +3692,7 @@ let set_vcpus ~__context ~self n = ) let set_shadow_multiplier ~__context ~self target = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> let id = id_of_vm ~__context ~self in @@ -3660,6 +3722,7 @@ let set_shadow_multiplier ~__context ~self target = ) let 
set_memory_dynamic_range ~__context ~self min max = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> let id = id_of_vm ~__context ~self in @@ -3672,6 +3735,7 @@ let set_memory_dynamic_range ~__context ~self min max = ) let maybe_refresh_vm ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let dbg = Context.string_of_task_and_tracing __context in let queue_name = queue_of_vm ~__context ~self in let id = id_of_vm ~__context ~self in @@ -3684,6 +3748,7 @@ let maybe_refresh_vm ~__context ~self = ) let start ~__context ~self paused force = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let dbg = Context.string_of_task_and_tracing __context in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> @@ -3745,6 +3810,7 @@ let start ~__context ~self paused force = ) let start ~__context ~self paused force = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> try start ~__context ~self paused force @@ -3770,6 +3836,7 @@ let start ~__context ~self paused force = ) let reboot ~__context ~self timeout = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> assert_resident_on ~__context ~self ; @@ -3792,6 +3859,7 @@ let reboot ~__context ~self timeout = ) let shutdown ~__context ~self timeout = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> assert_resident_on ~__context ~self ; @@ -3825,6 +3893,7 @@ let shutdown ~__context ~self timeout = ) let suspend ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> assert_resident_on ~__context ~self ; @@ -3901,6 +3970,7 @@ let suspend ~__context ~self = ) let resume ~__context ~self ~start_paused ~force:_ = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let dbg = Context.string_of_task_and_tracing __context in let queue_name = queue_of_vm ~__context ~self in let vm_id = id_of_vm ~__context ~self in @@ -3954,6 +4024,7 @@ let resume ~__context ~self ~start_paused ~force:_ = ~expected:(if start_paused then `Paused else `Running) let s3suspend ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> let id = id_of_vm ~__context ~self in @@ -3965,6 +4036,7 @@ let s3suspend ~__context ~self = ) let s3resume ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let queue_name = queue_of_vm ~__context ~self in transform_xenops_exn ~__context ~vm:self queue_name (fun () -> let id = id_of_vm ~__context ~self in @@ -3976,12 +4048,14 @@ let s3resume ~__context ~self = ) let md_of_vbd ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VBD.get_VM ~__context ~self in MD.of_vbd ~__context ~vm:(Db.VM.get_record ~__context ~self:vm) ~vbd:(Db.VBD.get_record ~__context ~self) let vbd_plug 
~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VBD.get_VM ~__context ~self in let vm_id = id_of_vm ~__context ~self:vm in let queue_name = queue_of_vm ~__context ~self:vm in @@ -4008,6 +4082,7 @@ let vbd_plug ~__context ~self = ) let vbd_unplug ~__context ~self force = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VBD.get_VM ~__context ~self in let queue_name = queue_of_vm ~__context ~self:vm in transform_xenops_exn ~__context ~vm queue_name (fun () -> @@ -4037,6 +4112,7 @@ let vbd_unplug ~__context ~self force = ) let vbd_eject_hvm ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VBD.get_VM ~__context ~self in let queue_name = queue_of_vm ~__context ~self:vm in transform_xenops_exn ~__context ~vm queue_name (fun () -> @@ -4059,6 +4135,7 @@ let vbd_eject_hvm ~__context ~self = ) let vbd_insert_hvm ~__context ~self ~vdi = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VBD.get_VM ~__context ~self in let queue_name = queue_of_vm ~__context ~self:vm in transform_xenops_exn ~__context ~vm queue_name (fun () -> @@ -4084,6 +4161,7 @@ let vbd_insert_hvm ~__context ~self ~vdi = ) let has_qemu ~__context ~vm = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let dbg = Context.string_of_task_and_tracing __context in let id = Db.VM.get_uuid ~__context ~self:vm in let queue_name = queue_of_vm ~__context ~self:vm in @@ -4092,10 +4170,12 @@ let has_qemu ~__context ~vm = state.Vm.domain_type = Domain_HVM let ejectable ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VBD.get_VM ~__context ~self in has_qemu ~__context ~vm let vbd_eject ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in if ejectable ~__context ~self then vbd_eject_hvm ~__context ~self else ( @@ -4105,6 +4185,7 @@ let vbd_eject ~__context ~self = ) let vbd_insert ~__context ~self ~vdi = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in if ejectable ~__context ~self then vbd_insert_hvm ~__context ~self ~vdi else ( @@ -4114,12 +4195,14 @@ let vbd_insert ~__context ~self ~vdi = ) let md_of_vif ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VIF.get_VM ~__context ~self in MD.of_vif ~__context ~vm:(Db.VM.get_record ~__context ~self:vm) ~vif:(self, Db.VIF.get_record ~__context ~self) let vif_plug ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VIF.get_VM ~__context ~self in let vm_id = id_of_vm ~__context ~self:vm in let queue_name = queue_of_vm ~__context ~self:vm in @@ -4148,6 +4231,7 @@ let vif_plug ~__context ~self = ) let vif_set_locking_mode ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VIF.get_VM ~__context ~self in let queue_name = queue_of_vm ~__context ~self:vm in transform_xenops_exn ~__context ~vm queue_name (fun () -> @@ -4162,6 +4246,7 @@ let vif_set_locking_mode ~__context ~self = ) let vif_set_pvs_proxy ~__context ~self creating = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VIF.get_VM ~__context ~self in let queue_name = queue_of_vm ~__context ~self:vm in transform_xenops_exn ~__context ~vm queue_name (fun () -> @@ -4177,6 +4262,7 @@ let vif_set_pvs_proxy ~__context ~self creating = ) let vif_unplug ~__context ~self force = + let@ __context = Context.with_tracing 
~__context __FUNCTION__ in let vm = Db.VIF.get_VM ~__context ~self in let queue_name = queue_of_vm ~__context ~self:vm in transform_xenops_exn ~__context ~vm queue_name (fun () -> @@ -4199,6 +4285,7 @@ let vif_unplug ~__context ~self force = ) let vif_move ~__context ~self _network = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VIF.get_VM ~__context ~self in let queue_name = queue_of_vm ~__context ~self:vm in transform_xenops_exn ~__context ~vm queue_name (fun () -> @@ -4225,6 +4312,7 @@ let vif_move ~__context ~self _network = ) let vif_set_ipv4_configuration ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VIF.get_VM ~__context ~self in let queue_name = queue_of_vm ~__context ~self:vm in transform_xenops_exn ~__context ~vm queue_name (fun () -> @@ -4241,6 +4329,7 @@ let vif_set_ipv4_configuration ~__context ~self = ) let vif_set_ipv6_configuration ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VIF.get_VM ~__context ~self in let queue_name = queue_of_vm ~__context ~self:vm in transform_xenops_exn ~__context ~vm queue_name (fun () -> @@ -4257,6 +4346,7 @@ let vif_set_ipv6_configuration ~__context ~self = ) let task_cancel ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in try let queue_name, id = TaskHelper.task_to_id_exn self |> unwrap in let module Client = (val make_client queue_name : XENOPS) in @@ -4272,6 +4362,7 @@ let task_cancel ~__context ~self = false let md_of_vusb ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VUSB.get_VM ~__context ~self in let usb_group = Db.VUSB.get_USB_group ~__context ~self in let pusb = Helpers.get_first_pusb ~__context usb_group in @@ -4279,6 +4370,7 @@ let md_of_vusb ~__context ~self = MD.of_vusb ~__context ~vm:(Db.VM.get_record ~__context ~self:vm) ~pusb:pusbr let vusb_unplug_hvm ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VUSB.get_VM ~__context ~self in let queue_name = queue_of_vm ~__context ~self:vm in transform_xenops_exn ~__context ~vm queue_name (fun () -> @@ -4295,10 +4387,12 @@ let vusb_unplug_hvm ~__context ~self = ) let vusb_plugable ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in let vm = Db.VUSB.get_VM ~__context ~self in has_qemu ~__context ~vm let vusb_unplug ~__context ~self = + let@ __context = Context.with_tracing ~__context __FUNCTION__ in if vusb_plugable ~__context ~self then vusb_unplug_hvm ~__context ~self else From cb64cd6571ea50ea997c1e954def3527979cb7bc Mon Sep 17 00:00:00 2001 From: Gabriel Buica Date: Fri, 6 Jun 2025 15:20:24 +0100 Subject: [PATCH 048/111] CP-50001: Instrument `Xapi_xenops.events_watch` This function is called recursively and the context span is only closed on failure. This makes it hard to read the trace. Therefore, I am resetting the context tracing each recursion step and now each recursion step will have its own trace. 
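To make the restructured control flow easier to follow, here is a minimal
stand-alone sketch of the pattern (hypothetical `complete_span`/`with_span`
helpers standing in for the real `Context` functions, not the actual xapi
code): the span inherited from the previous step is closed first, one batch of
work runs inside its own fresh span, and the recursion continues with the
cursor that span produced.

```
(* Sketch only: every recursion step closes the span it inherited, does its
   work inside a brand-new span, and recurses with the value produced there. *)
let complete_span = function
  | Some name -> Printf.printf "close span %s\n" name
  | None -> ()

let with_span name f =
  Printf.printf "open span %s\n" name ;
  Fun.protect ~finally:(fun () -> Printf.printf "close span %s\n" name) f

let rec events_watch ~span cursor =
  complete_span span ;
  let next =
    with_span
      (Printf.sprintf "events_watch %d" cursor)
      (fun () -> (* fetch and process barriers/events here *) cursor + 1)
  in
  if next < 3 then events_watch ~span:None next

let () = events_watch ~span:(Some "bootstrap") 0
```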
Signed-off-by: Gabriel Buica --- ocaml/xapi/xapi_xenops.ml | 111 ++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 53 deletions(-) diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index 70102faae44..0dc0f2780c9 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -2930,60 +2930,65 @@ let update_task ~__context queue_name id = error "xenopsd event: Caught %s while updating task" (string_of_exn e) let rec events_watch ~__context cancel queue_name from = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in - let dbg = Context.string_of_task_and_tracing __context in - if Xapi_fist.delay_xenopsd_event_threads () then Thread.delay 30.0 ; - let module Client = (val make_client queue_name : XENOPS) in - let barriers, events, next = Client.UPDATES.get dbg from None in - if !cancel then - raise (Api_errors.Server_error (Api_errors.task_cancelled, [])) ; - let done_events = ref [] in - let already_done x = List.mem x !done_events in - let add_event x = done_events := x :: !done_events in - let do_updates l = - let open Dynamic in - List.iter - (fun ev -> - debug "Processing event: %s" - (ev |> Dynamic.rpc_of_id |> Jsonrpc.to_string) ; - if already_done ev then - debug "Skipping (already processed this round)" - else ( - add_event ev ; - match ev with - | Vm id -> - debug "xenops event on VM %s" id ; - update_vm ~__context id - | Vbd id -> - debug "xenops event on VBD %s.%s" (fst id) (snd id) ; - update_vbd ~__context id - | Vif id -> - debug "xenops event on VIF %s.%s" (fst id) (snd id) ; - update_vif ~__context id - | Pci id -> - debug "xenops event on PCI %s.%s" (fst id) (snd id) ; - update_pci ~__context id - | Vgpu id -> - debug "xenops event on VGPU %s.%s" (fst id) (snd id) ; - update_vgpu ~__context id - | Vusb id -> - debug "xenops event on VUSB %s.%s" (fst id) (snd id) ; - update_vusb ~__context id - | Task id -> - debug "xenops event on Task %s" id ; - update_task ~__context queue_name id - ) - ) - l - in - List.iter - (fun (id, b_events) -> - debug "Processing barrier %d" id ; - do_updates b_events ; - Events_from_xenopsd.wakeup queue_name dbg id + Context.complete_tracing __context ; + let next = + Context.with_tracing ~__context __FUNCTION__ (fun __context -> + let dbg = Context.string_of_task_and_tracing __context in + if Xapi_fist.delay_xenopsd_event_threads () then Thread.delay 30.0 ; + let module Client = (val make_client queue_name : XENOPS) in + let barriers, events, next = Client.UPDATES.get dbg from None in + if !cancel then + raise (Api_errors.Server_error (Api_errors.task_cancelled, [])) ; + let done_events = ref [] in + let already_done x = List.mem x !done_events in + let add_event x = done_events := x :: !done_events in + let do_updates l = + let open Dynamic in + List.iter + (fun ev -> + debug "Processing event: %s" + (ev |> Dynamic.rpc_of_id |> Jsonrpc.to_string) ; + if already_done ev then + debug "Skipping (already processed this round)" + else ( + add_event ev ; + match ev with + | Vm id -> + debug "xenops event on VM %s" id ; + update_vm ~__context id + | Vbd id -> + debug "xenops event on VBD %s.%s" (fst id) (snd id) ; + update_vbd ~__context id + | Vif id -> + debug "xenops event on VIF %s.%s" (fst id) (snd id) ; + update_vif ~__context id + | Pci id -> + debug "xenops event on PCI %s.%s" (fst id) (snd id) ; + update_pci ~__context id + | Vgpu id -> + debug "xenops event on VGPU %s.%s" (fst id) (snd id) ; + update_vgpu ~__context id + | Vusb id -> + debug "xenops event on VUSB %s.%s" (fst id) 
(snd id) ; + update_vusb ~__context id + | Task id -> + debug "xenops event on Task %s" id ; + update_task ~__context queue_name id + ) + ) + l + in + List.iter + (fun (id, b_events) -> + debug "Processing barrier %d" id ; + do_updates b_events ; + Events_from_xenopsd.wakeup queue_name dbg id + ) + barriers ; + do_updates events ; + next ) - barriers ; - do_updates events ; + in events_watch ~__context cancel queue_name (Some next) let events_from_xenopsd queue_name = From 7bede8387a58d97de61e0f11b4d96c7cda912824 Mon Sep 17 00:00:00 2001 From: Gabriel Buica Date: Wed, 18 Jun 2025 13:19:38 +0100 Subject: [PATCH 049/111] CP-50001: Add attributes to updates in `events_watch` Signed-off-by: Gabriel Buica --- ocaml/xapi/context.ml | 3 ++- ocaml/xapi/context.mli | 7 ++++++- ocaml/xapi/xapi_xenops.ml | 42 ++++++++++++++++++++++++++++++++------- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/ocaml/xapi/context.ml b/ocaml/xapi/context.ml index f03ce60e2a0..a49c8ecd1bb 100644 --- a/ocaml/xapi/context.ml +++ b/ocaml/xapi/context.ml @@ -532,11 +532,12 @@ let with_forwarded_task ?http_other_config ?session_id ?origin task_id f = in finally_destroy_context ~__context f -let with_tracing ?originator ~__context name f = +let with_tracing ?(attributes = []) ?originator ~__context name f = let open Tracing in let parent = __context.tracing in let span_attributes = Attributes.attr_of_originator originator + @ attributes @ make_attributes ~task_id:__context.task_id ?session_id:__context.session_id () in diff --git a/ocaml/xapi/context.mli b/ocaml/xapi/context.mli index 281f67ca4b2..ac3250f8569 100644 --- a/ocaml/xapi/context.mli +++ b/ocaml/xapi/context.mli @@ -185,6 +185,11 @@ val with_forwarded_task : then ensure [__context] is destroyed.*) val with_tracing : - ?originator:string -> __context:t -> string -> (t -> 'a) -> 'a + ?attributes:(string * string) list + -> ?originator:string + -> __context:t + -> string + -> (t -> 'a) + -> 'a val set_client_span : t -> Tracing.Span.t option diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index 0dc0f2780c9..5d9a1cc0a41 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -2461,7 +2461,11 @@ let update_vm_internal ~__context ~id ~self ~previous ~info ~localhost = ) let update_vm ~__context id = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in + let@ __context = + Context.with_tracing + ~attributes:[("xapi.event.on.vm", id)] + ~__context __FUNCTION__ + in try if Events_from_xenopsd.are_suppressed id then debug "xenopsd event: ignoring event for VM (VM %s migrating away)" id @@ -2484,7 +2488,11 @@ let update_vm ~__context id = (string_of_exn e) let update_vbd ~__context (id : string * string) = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in + let@ __context = + Context.with_tracing + ~attributes:[("xapi.event.on.vm", fst id); ("xapi.event.on.vbd", snd id)] + ~__context __FUNCTION__ + in try if Events_from_xenopsd.are_suppressed (fst id) then debug "xenopsd event: ignoring event for VBD (VM %s migrating away)" @@ -2587,7 +2595,11 @@ let update_vbd ~__context (id : string * string) = error "xenopsd event: Caught %s while updating VBD" (string_of_exn e) let update_vif ~__context id = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in + let@ __context = + Context.with_tracing + ~attributes:[("xapi.event.on.vm", fst id); ("xapi.event.on.vif", snd id)] + ~__context __FUNCTION__ + in try if Events_from_xenopsd.are_suppressed (fst id) then debug "xenopsd event: ignoring 
event for VIF (VM %s migrating away)" @@ -2696,7 +2708,11 @@ let update_vif ~__context id = error "xenopsd event: Caught %s while updating VIF" (string_of_exn e) let update_pci ~__context id = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in + let@ __context = + Context.with_tracing + ~attributes:[("xapi.event.on.vm", fst id); ("xapi.event.on.pci", snd id)] + ~__context __FUNCTION__ + in try if Events_from_xenopsd.are_suppressed (fst id) then debug "xenopsd event: ignoring event for PCI (VM %s migrating away)" @@ -2765,7 +2781,11 @@ let update_pci ~__context id = error "xenopsd event: Caught %s while updating PCI" (string_of_exn e) let update_vgpu ~__context id = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in + let@ __context = + Context.with_tracing + ~attributes:[("xapi.event.on.vm", fst id); ("xapi.event.on.vgpu", snd id)] + ~__context __FUNCTION__ + in try if Events_from_xenopsd.are_suppressed (fst id) then debug "xenopsd event: ignoring event for VGPU (VM %s migrating away)" @@ -2830,7 +2850,11 @@ let update_vgpu ~__context id = error "xenopsd event: Caught %s while updating VGPU" (string_of_exn e) let update_vusb ~__context (id : string * string) = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in + let@ __context = + Context.with_tracing + ~attributes:[("xapi.event.on.vm", fst id); ("xapi.event.on.vusb", snd id)] + ~__context __FUNCTION__ + in try if Events_from_xenopsd.are_suppressed (fst id) then debug "xenopsd event: ignoring event for VUSB (VM %s migrating away)" @@ -2896,7 +2920,11 @@ let unregister_task __context queue_name id = id let update_task ~__context queue_name id = - let@ __context = Context.with_tracing ~__context __FUNCTION__ in + let@ __context = + Context.with_tracing + ~attributes:[("xapi.event.on.task", id)] + ~__context __FUNCTION__ + in try let self = TaskHelper.id_to_task_exn (TaskHelper.Xenops (queue_name, id)) in (* throws Not_found *) From 9d2468978f1ea067863dffee9e065c8ac40b0632 Mon Sep 17 00:00:00 2001 From: Bengang Yuan Date: Mon, 16 Jun 2025 07:31:13 +0100 Subject: [PATCH 050/111] CA-406770: Improve error message 1. WLB request will raise `wlb_authentication_failed` when catching `Http_client.Http_error`. But actually only error code 401 and 403 should raise this type exception. For other error code, raise `wlb_connection_reset`. Also print the detail error code and message. 2. `message_forwarding` raises same error for `Http_request_rejected` and `Connection_reset` so we don't know which exception actually be raised. Print detailed logs for these 2 exceptions. 
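As a rough, self-contained illustration of the intended classification (a
simplified `Http_error` stand-in, not the real `Http_client` interface): only
401 and 403 are treated as authentication failures, and every other HTTP error
is reported as a connection reset together with the offending code.

```
(* Sketch: classify an HTTP error the way the WLB change intends. *)
exception Http_error of string * string

let classify = function
  | Http_error (("401" | "403"), _) -> `Authentication_failed
  | Http_error (code, msg) -> `Connection_reset (code, msg)
  | e -> raise e

let () =
  match classify (Http_error ("500", "internal server error")) with
  | `Authentication_failed ->
      print_endline "raise WLB_AUTHENTICATION_FAILED"
  | `Connection_reset (code, msg) ->
      Printf.printf "raise WLB_CONNECTION_RESET (HTTP %s: %s)\n" code msg
```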
Signed-off-by: Bengang Yuan --- ocaml/xapi/message_forwarding.ml | 27 +++++++++------------------ ocaml/xapi/workload_balancing.ml | 15 +++++++++++++-- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/ocaml/xapi/message_forwarding.ml b/ocaml/xapi/message_forwarding.ml index d1773e4f0c6..b52aaaa20ec 100644 --- a/ocaml/xapi/message_forwarding.ml +++ b/ocaml/xapi/message_forwarding.ml @@ -143,24 +143,15 @@ let do_op_on_common ~local_fn ~__context ~host ~remote_fn f = let task_opt = set_forwarding_on_task ~__context ~host in f __context host task_opt remote_fn with - | Xmlrpc_client.Connection_reset | Http_client.Http_request_rejected _ -> - warn - "Caught Connection_reset when contacting host %s; converting into \ - CANNOT_CONTACT_HOST" - (Ref.string_of host) ; - raise - (Api_errors.Server_error - (Api_errors.cannot_contact_host, [Ref.string_of host]) - ) - | Xmlrpc_client.Stunnel_connection_failed -> - warn - "Caught Stunnel_connection_failed while contacting host %s; converting \ - into CANNOT_CONTACT_HOST" - (Ref.string_of host) ; - raise - (Api_errors.Server_error - (Api_errors.cannot_contact_host, [Ref.string_of host]) - ) + | ( Xmlrpc_client.Connection_reset + | Http_client.Http_request_rejected _ + | Xmlrpc_client.Stunnel_connection_failed ) as e + -> + error + "%s: Caught %s when contacting host %s; converting into \ + CANNOT_CONTACT_HOST" + __FUNCTION__ (Printexc.to_string e) (Ref.string_of host) ; + raise Api_errors.(Server_error (cannot_contact_host, [Ref.string_of host])) (* regular forwarding fn, with session and live-check. Used by most calls, will use the connection cache. *) diff --git a/ocaml/xapi/workload_balancing.ml b/ocaml/xapi/workload_balancing.ml index 27fa184da84..7108032dbf7 100644 --- a/ocaml/xapi/workload_balancing.ml +++ b/ocaml/xapi/workload_balancing.ml @@ -329,8 +329,19 @@ let wlb_request ~__context ~host ~port ~auth ~meth ~params ~handler ~enable_log with | Remote_requests.Timed_out -> raise_timeout timeout - | Http_client.Http_request_rejected _ | Http_client.Http_error _ -> - raise_authentication_failed () + | Http_client.Http_error (code, _) as e -> ( + error "%s: Caught %s when contacting WLB" __FUNCTION__ + (Printexc.to_string e) ; + match code with + | "401" | "403" -> + raise_authentication_failed () + | _ -> + raise_connection_reset () + ) + | Http_client.Http_request_rejected _ as e -> + error "%s: Caught %s when contacting WLB" __FUNCTION__ + (Printexc.to_string e) ; + raise_connection_reset () | Xmlrpc_client.Connection_reset -> raise_connection_reset () | Stunnel.Stunnel_verify_error reason -> From ad5cbe5e3518863ce380f9509d1d7a4eddd8db31 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Mon, 23 Jun 2025 10:06:45 +0100 Subject: [PATCH 051/111] xenopsd: Remove data/updated from the list of watched paths "data/updated" is not read or used anywhere in xenopsd or xapi: * xapi_guest_agent's last_updated field is just Unix.gettimeofday (). * xapi_xenops removes "data/updated" from the guest agent state altogether before checking if it's changed: ``` let ignored_keys = ["data/meminfo_free"; "data/updated"; "data/update_cnt"] ``` So there is no need to watch this path at all. This greatly reduces unnecessary traffic between xapi and xenopsd, since any VM with a guest agent would write to data/updated once every 60 seconds, which would generate a Dynamic.Vm event, making xapi call xenopsd's VM.stat to rescan the domain's xenstore tree and perform several hypercalls. 
Almost always, this would be completely unnecessary as nothing else about the VM would change, but a lot of work would be done anyhow. Signed-off-by: Andrii Sultanov --- ocaml/xenopsd/xc/xenops_server_xen.ml | 1 - 1 file changed, 1 deletion(-) diff --git a/ocaml/xenopsd/xc/xenops_server_xen.ml b/ocaml/xenopsd/xc/xenops_server_xen.ml index cdc54d32873..b8577746b88 100644 --- a/ocaml/xenopsd/xc/xenops_server_xen.ml +++ b/ocaml/xenopsd/xc/xenops_server_xen.ml @@ -4935,7 +4935,6 @@ module Actions = struct let open Printf in [ sprintf "/local/domain/%d/attr" domid - ; sprintf "/local/domain/%d/data/updated" domid ; sprintf "/local/domain/%d/data/ts" domid ; sprintf "/local/domain/%d/data/service" domid ; sprintf "/local/domain/%d/memory/target" domid From 5d0fb87b4edff8c076eda285c576c752ad4aba11 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Mon, 23 Jun 2025 11:05:21 +0100 Subject: [PATCH 052/111] xapi_xenops: Simplify update_* functions Drop the first member of the (X.t * X.state) tuple coming from X.stat immediately as it's never used. This removes the need for several 'snd info', 'Option.iter (fun (_, state) ->) info' constructs. Signed-off-by: Andrii Sultanov --- ocaml/xapi/xapi_xenops.ml | 40 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index 4b7b738b000..8be594f9657 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -2433,8 +2433,8 @@ let update_vbd ~__context (id : string * string) = let module Client = (val make_client (queue_of_vm ~__context ~self:vm) : XENOPS) in - let info = try Some (Client.VBD.stat dbg id) with _ -> None in - if Option.map snd info <> previous then ( + let info = try Some (snd (Client.VBD.stat dbg id)) with _ -> None in + if info <> previous then ( let vbds = Db.VM.get_VBDs ~__context ~self:vm in let vbdrs = List.map @@ -2469,7 +2469,7 @@ let update_vbd ~__context (id : string * string) = debug "VBD %s.%s matched device %s" (fst id) (snd id) vbd_r.API.vBD_userdevice ; Option.iter - (fun (_, state) -> + (fun state -> let currently_attached = state.Vbd.plugged || state.Vbd.active in debug "xenopsd event: Updating VBD %s.%s device <- %s; \ @@ -2512,7 +2512,7 @@ let update_vbd ~__context (id : string * string) = ) ) info ; - Xenops_cache.update_vbd id (Option.map snd info) ; + Xenops_cache.update_vbd id info ; Xapi_vbd_helpers.update_allowed_operations ~__context ~self:vbd ; if not (Db.VBD.get_empty ~__context ~self:vbd) then let vdi = Db.VBD.get_VDI ~__context ~self:vbd in @@ -2535,8 +2535,8 @@ let update_vif ~__context id = let module Client = (val make_client (queue_of_vm ~__context ~self:vm) : XENOPS) in - let info = try Some (Client.VIF.stat dbg id) with _ -> None in - if Option.map snd info <> previous then ( + let info = try Some (snd (Client.VIF.stat dbg id)) with _ -> None in + if info <> previous then ( let vifs = Db.VM.get_VIFs ~__context ~self:vm in let vifrs = List.map @@ -2547,7 +2547,7 @@ let update_vif ~__context id = List.find (fun (_, vifr) -> vifr.API.vIF_device = snd id) vifrs in Option.iter - (fun (_, state) -> + (fun state -> if not (state.Vif.plugged || state.Vif.active) then ( ( try Xapi_network.deregister_vif ~__context vif with e -> @@ -2623,7 +2623,7 @@ let update_vif ~__context id = ~value:(state.plugged || state.active) ) info ; - Xenops_cache.update_vif id (Option.map snd info) ; + Xenops_cache.update_vif id info ; Xapi_vif_helpers.update_allowed_operations ~__context ~self:vif ) with e -> @@ -2643,8 
+2643,8 @@ let update_pci ~__context id = let module Client = (val make_client (queue_of_vm ~__context ~self:vm) : XENOPS) in - let info = try Some (Client.PCI.stat dbg id) with _ -> None in - if Option.map snd info <> previous then ( + let info = try Some (snd (Client.PCI.stat dbg id)) with _ -> None in + if info <> previous then ( let pcis = Db.Host.get_PCIs ~__context ~self:localhost in let pcirs = List.map @@ -2661,7 +2661,7 @@ let update_pci ~__context id = List.mem vm (Db.PCI.get_attached_VMs ~__context ~self:pci) in Option.iter - (fun (_, state) -> + (fun state -> debug "xenopsd event: Updating PCI %s.%s currently_attached <- %b" (fst id) (snd id) state.Pci.plugged ; if attached_in_db && not state.Pci.plugged then @@ -2692,7 +2692,7 @@ let update_pci ~__context id = vgpu_opt ) info ; - Xenops_cache.update_pci id (Option.map snd info) + Xenops_cache.update_pci id info ) with e -> error "xenopsd event: Caught %s while updating PCI" (string_of_exn e) @@ -2711,8 +2711,8 @@ let update_vgpu ~__context id = let module Client = (val make_client (queue_of_vm ~__context ~self:vm) : XENOPS) in - let info = try Some (Client.VGPU.stat dbg id) with _ -> None in - if Option.map snd info <> previous then ( + let info = try Some (snd (Client.VGPU.stat dbg id)) with _ -> None in + if info <> previous then ( let vgpus = Db.VM.get_VGPUs ~__context ~self:vm in let vgpu_records = List.map @@ -2733,7 +2733,7 @@ let update_vgpu ~__context id = = None then Option.iter - (fun (_, state) -> + (fun state -> ( if state.Vgpu.plugged then let scheduled = Db.VGPU.get_scheduled_to_be_resident_on ~__context @@ -2756,7 +2756,7 @@ let update_vgpu ~__context id = ) ) info ; - Xenops_cache.update_vgpu id (Option.map snd info) + Xenops_cache.update_vgpu id info ) with e -> error "xenopsd event: Caught %s while updating VGPU" (string_of_exn e) @@ -2775,8 +2775,8 @@ let update_vusb ~__context (id : string * string) = let module Client = (val make_client (queue_of_vm ~__context ~self:vm) : XENOPS) in - let info = try Some (Client.VUSB.stat dbg id) with _ -> None in - if Option.map snd info <> previous then ( + let info = try Some (snd (Client.VUSB.stat dbg id)) with _ -> None in + if info <> previous then ( let pusb, _ = Db.VM.get_VUSBs ~__context ~self:vm |> List.map (fun self -> Db.VUSB.get_USB_group ~__context ~self) @@ -2791,7 +2791,7 @@ let update_vusb ~__context (id : string * string) = let usb_group = Db.PUSB.get_USB_group ~__context ~self:pusb in let vusb = Helpers.get_first_vusb ~__context usb_group in Option.iter - (fun (_, state) -> + (fun state -> debug "xenopsd event: Updating USB %s.%s; plugged <- %b" (fst id) (snd id) state.Vusb.plugged ; let currently_attached = state.Vusb.plugged in @@ -2799,7 +2799,7 @@ let update_vusb ~__context (id : string * string) = ~value:currently_attached ) info ; - Xenops_cache.update_vusb id (Option.map snd info) ; + Xenops_cache.update_vusb id info ; Xapi_vusb_helpers.update_allowed_operations ~__context ~self:vusb ) with e -> From acb973204e52f3a196801e56d479c56381588723 Mon Sep 17 00:00:00 2001 From: Steven Woods Date: Fri, 20 Jun 2025 16:58:56 +0100 Subject: [PATCH 053/111] CP-308201: make unimplemented function more obvious Rename the "u" function used across storage and observer_helpers to make its meaning more obvious and use __FUNCTION__ instead of hardcoding the function name. 
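A small before/after sketch of the renaming (simplified exception type and
hypothetical stub names): with `__FUNCTION__`, available since OCaml 4.12, the
raised message always carries the fully qualified module and function name, so
a stub can no longer drift out of sync with a hand-written string.

```
exception Unimplemented of string

(* Before: a cryptic one-letter helper and a hardcoded, easily-stale name. *)
let u x = raise (Unimplemented x)

let _query_old ~dbg:_ = u "Query.query"

(* After: a descriptive helper; the compiler supplies the real location. *)
let unimplemented x = raise (Unimplemented x)

let query ~dbg:_ = unimplemented __FUNCTION__

let () =
  try query ~dbg:"example" with
  | Unimplemented where ->
      Printf.printf "unimplemented call reached: %s\n" where
```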
Signed-off-by: Steven Woods --- ocaml/xapi-idl/lib/observer_skeleton.ml | 30 ++-- ocaml/xapi-idl/storage/storage_interface.ml | 2 + ocaml/xapi-idl/storage/storage_skeleton.ml | 190 ++++++++++++-------- ocaml/xapi/storage_mux.ml | 15 +- ocaml/xapi/storage_smapiv1_migrate.ml | 6 +- ocaml/xapi/storage_smapiv1_wrapper.ml | 25 +-- ocaml/xapi/storage_smapiv3_migrate.ml | 19 +- ocaml/xenopsd/lib/xenops_server_skeleton.ml | 85 +++++---- 8 files changed, 203 insertions(+), 169 deletions(-) diff --git a/ocaml/xapi-idl/lib/observer_skeleton.ml b/ocaml/xapi-idl/lib/observer_skeleton.ml index 8cf5e2f5221..e53a45f958c 100644 --- a/ocaml/xapi-idl/lib/observer_skeleton.ml +++ b/ocaml/xapi-idl/lib/observer_skeleton.ml @@ -13,36 +13,36 @@ *) [@@@ocaml.warning "-27"] -let u x = raise Observer_helpers.(Observer_error (Errors.Unimplemented x)) +let unimplemented x = + raise Observer_helpers.(Observer_error (Errors.Unimplemented x)) module Observer = struct type context = unit let create ctx ~dbg ~uuid ~name_label ~attributes ~endpoints ~enabled = - u "Observer.create" + unimplemented __FUNCTION__ - let destroy ctx ~dbg ~uuid = u "Observer.destroy" + let destroy ctx ~dbg ~uuid = unimplemented __FUNCTION__ - let set_enabled ctx ~dbg ~uuid ~enabled = u "Observer.set_enabled" + let set_enabled ctx ~dbg ~uuid ~enabled = unimplemented __FUNCTION__ - let set_attributes ctx ~dbg ~uuid ~attributes = u "Observer.set_attributes" + let set_attributes ctx ~dbg ~uuid ~attributes = unimplemented __FUNCTION__ - let set_endpoints ctx ~dbg ~uuid ~endpoints = u "Observer.set_endpoints" + let set_endpoints ctx ~dbg ~uuid ~endpoints = unimplemented __FUNCTION__ - let init ctx ~dbg = u "Observer.init" + let init ctx ~dbg = unimplemented __FUNCTION__ - let set_trace_log_dir ctx ~dbg ~dir = u "Observer.set_trace_log_dir" + let set_trace_log_dir ctx ~dbg ~dir = unimplemented __FUNCTION__ - let set_export_interval ctx ~dbg ~interval = u "Observer.set_export_interval" + let set_export_interval ctx ~dbg ~interval = unimplemented __FUNCTION__ - let set_max_spans ctx ~dbg ~spans = u "Observer.set_max_spans" + let set_max_spans ctx ~dbg ~spans = unimplemented __FUNCTION__ - let set_max_traces ctx ~dbg ~traces = u "Observer.set_max_traces" + let set_max_traces ctx ~dbg ~traces = unimplemented __FUNCTION__ - let set_max_file_size ctx ~dbg ~file_size = u "Observer.set_max_file_size" + let set_max_file_size ctx ~dbg ~file_size = unimplemented __FUNCTION__ - let set_host_id ctx ~dbg ~host_id = u "Observer.set_host_id" + let set_host_id ctx ~dbg ~host_id = unimplemented __FUNCTION__ - let set_compress_tracing_files ctx ~dbg ~enabled = - u "Observer.set_compress_tracing_files" + let set_compress_tracing_files ctx ~dbg ~enabled = unimplemented __FUNCTION__ end diff --git a/ocaml/xapi-idl/storage/storage_interface.ml b/ocaml/xapi-idl/storage/storage_interface.ml index 14ca03e6cb8..eaabacc9e8f 100644 --- a/ocaml/xapi-idl/storage/storage_interface.ml +++ b/ocaml/xapi-idl/storage/storage_interface.ml @@ -425,6 +425,8 @@ end exception Storage_error of Errors.error +let unimplemented x = raise (Storage_error (Errors.Unimplemented x)) + let () = (* register printer *) let sprintf = Printf.sprintf in diff --git a/ocaml/xapi-idl/storage/storage_skeleton.ml b/ocaml/xapi-idl/storage/storage_skeleton.ml index 290c09d6230..a2d2d04ab08 100644 --- a/ocaml/xapi-idl/storage/storage_skeleton.ml +++ b/ocaml/xapi-idl/storage/storage_skeleton.ml @@ -13,8 +13,6 @@ *) [@@@ocaml.warning "-27"] -let u x = raise Storage_interface.(Storage_error (Errors.Unimplemented x)) - 
type context = unit module UPDATES = struct @@ -27,193 +25,231 @@ module UPDATES = struct end module Query = struct - let query ctx ~dbg = u "Query.query" + let query ctx ~dbg = Storage_interface.unimplemented __FUNCTION__ - let diagnostics ctx ~dbg = u "Query.diagnostics" + let diagnostics ctx ~dbg = Storage_interface.unimplemented __FUNCTION__ end module DP = struct - let create ctx ~dbg ~id = u "DP.create" + let create ctx ~dbg ~id = Storage_interface.unimplemented __FUNCTION__ - let destroy ctx ~dbg ~dp ~allow_leak = u "DP.destroy" + let destroy ctx ~dbg ~dp ~allow_leak = + Storage_interface.unimplemented __FUNCTION__ - let destroy2 ctx ~dbg ~dp ~sr ~vdi ~vm ~allow_leak = u "DP.destroy2" + let destroy2 ctx ~dbg ~dp ~sr ~vdi ~vm ~allow_leak = + Storage_interface.unimplemented __FUNCTION__ - let attach_info ctx ~dbg ~sr ~vdi ~dp ~vm = u "DP.attach_info" + let attach_info ctx ~dbg ~sr ~vdi ~dp ~vm = + Storage_interface.unimplemented __FUNCTION__ - let diagnostics ctx () = u "DP.diagnostics" + let diagnostics ctx () = Storage_interface.unimplemented __FUNCTION__ - let stat_vdi ctx ~dbg ~sr ~vdi () = u "DP.stat_vdi" + let stat_vdi ctx ~dbg ~sr ~vdi () = + Storage_interface.unimplemented __FUNCTION__ end module SR = struct let create ctx ~dbg ~sr ~name_label ~name_description ~device_config ~physical_size = - u "SR.create" + Storage_interface.unimplemented __FUNCTION__ - let attach ctx ~dbg ~sr ~device_config = u "SR.attach" + let attach ctx ~dbg ~sr ~device_config = + Storage_interface.unimplemented __FUNCTION__ - let set_name_label ctx ~dbg ~sr ~new_name_label = u "SR.set_name_label" + let set_name_label ctx ~dbg ~sr ~new_name_label = + Storage_interface.unimplemented __FUNCTION__ let set_name_description ctx ~dbg ~sr ~new_name_description = - u "SR.set_name_description" + Storage_interface.unimplemented __FUNCTION__ - let detach ctx ~dbg ~sr = u "SR.detach" + let detach ctx ~dbg ~sr = Storage_interface.unimplemented __FUNCTION__ - let reset ctx ~dbg ~sr = u "SR.reset" + let reset ctx ~dbg ~sr = Storage_interface.unimplemented __FUNCTION__ - let destroy ctx ~dbg ~sr = u "SR.destroy" + let destroy ctx ~dbg ~sr = Storage_interface.unimplemented __FUNCTION__ - let probe ctx ~dbg ~queue ~device_config ~sm_config = u "SR.probe" + let probe ctx ~dbg ~queue ~device_config ~sm_config = + Storage_interface.unimplemented __FUNCTION__ - let scan ctx ~dbg ~sr = u "SR.scan" + let scan ctx ~dbg ~sr = Storage_interface.unimplemented __FUNCTION__ - let scan2 ctx ~dbg ~sr = u "SR.scan2" + let scan2 ctx ~dbg ~sr = Storage_interface.unimplemented __FUNCTION__ let update_snapshot_info_src ctx ~dbg ~sr ~vdi ~url ~dest ~dest_vdi ~snapshot_pairs = - u "SR.update_snapshot_info_src" + Storage_interface.unimplemented __FUNCTION__ let update_snapshot_info_dest ctx ~dbg ~sr ~vdi ~src_vdi ~snapshot_pairs = - u "SR.update_snapshot_info_dest" + Storage_interface.unimplemented __FUNCTION__ - let stat ctx ~dbg ~sr = u "SR.stat" + let stat ctx ~dbg ~sr = Storage_interface.unimplemented __FUNCTION__ - let list ctx ~dbg = u "SR.list" + let list ctx ~dbg = Storage_interface.unimplemented __FUNCTION__ end module VDI = struct - let create ctx ~dbg ~sr ~vdi_info = u "VDI.create" + let create ctx ~dbg ~sr ~vdi_info = + Storage_interface.unimplemented __FUNCTION__ - let set_name_label ctx ~dbg ~sr ~vdi ~new_name_label = u "VDI.set_name_label" + let set_name_label ctx ~dbg ~sr ~vdi ~new_name_label = + Storage_interface.unimplemented __FUNCTION__ let set_name_description ctx ~dbg ~sr ~vdi ~new_name_description = - u 
"VDI.set_name_description" + Storage_interface.unimplemented __FUNCTION__ - let snapshot ctx ~dbg ~sr ~vdi_info = u "VDI.snapshot" + let snapshot ctx ~dbg ~sr ~vdi_info = + Storage_interface.unimplemented __FUNCTION__ - let clone ctx ~dbg ~sr ~vdi_info = u "VDI.clone" + let clone ctx ~dbg ~sr ~vdi_info = + Storage_interface.unimplemented __FUNCTION__ - let resize ctx ~dbg ~sr ~vdi ~new_size = u "VDI.resize" + let resize ctx ~dbg ~sr ~vdi ~new_size = + Storage_interface.unimplemented __FUNCTION__ - let destroy ctx ~dbg ~sr ~vdi = u "VDI.destroy" + let destroy ctx ~dbg ~sr ~vdi = Storage_interface.unimplemented __FUNCTION__ - let stat ctx ~dbg ~sr ~vdi = u "VDI.stat" + let stat ctx ~dbg ~sr ~vdi = Storage_interface.unimplemented __FUNCTION__ - let introduce ctx ~dbg ~sr ~uuid ~sm_config ~location = u "VDI.introduce" + let introduce ctx ~dbg ~sr ~uuid ~sm_config ~location = + Storage_interface.unimplemented __FUNCTION__ - let set_persistent ctx ~dbg ~sr ~vdi ~persistent = u "VDI.set_persistent" + let set_persistent ctx ~dbg ~sr ~vdi ~persistent = + Storage_interface.unimplemented __FUNCTION__ let epoch_begin ctx ~dbg ~sr ~vdi ~vm ~persistent = () - let attach ctx ~dbg ~dp ~sr ~vdi ~read_write = u "VDI.attach" + let attach ctx ~dbg ~dp ~sr ~vdi ~read_write = + Storage_interface.unimplemented __FUNCTION__ - let attach2 ctx ~dbg ~dp ~sr ~vdi ~read_write = u "VDI.attach2" + let attach2 ctx ~dbg ~dp ~sr ~vdi ~read_write = + Storage_interface.unimplemented __FUNCTION__ - let attach3 ctx ~dbg ~dp ~sr ~vdi ~vm ~read_write = u "VDI.attach3" + let attach3 ctx ~dbg ~dp ~sr ~vdi ~vm ~read_write = + Storage_interface.unimplemented __FUNCTION__ - let activate ctx ~dbg ~dp ~sr ~vdi = u "VDI.activate" + let activate ctx ~dbg ~dp ~sr ~vdi = + Storage_interface.unimplemented __FUNCTION__ - let activate3 ctx ~dbg ~dp ~sr ~vdi ~vm = u "VDI.activate3" + let activate3 ctx ~dbg ~dp ~sr ~vdi ~vm = + Storage_interface.unimplemented __FUNCTION__ - let activate_readonly ctx ~dbg ~dp ~sr ~vdi ~vm = u "VDI.activate_readonly" + let activate_readonly ctx ~dbg ~dp ~sr ~vdi ~vm = + Storage_interface.unimplemented __FUNCTION__ - let deactivate ctx ~dbg ~dp ~sr ~vdi ~vm = u "VDI.deactivate" + let deactivate ctx ~dbg ~dp ~sr ~vdi ~vm = + Storage_interface.unimplemented __FUNCTION__ - let detach ctx ~dbg ~dp ~sr ~vdi ~vm = u "VDI.detach" + let detach ctx ~dbg ~dp ~sr ~vdi ~vm = + Storage_interface.unimplemented __FUNCTION__ let epoch_end ctx ~dbg ~sr ~vdi ~vm = () - let get_url ctx ~dbg ~sr ~vdi = u "VDI.get_url" + let get_url ctx ~dbg ~sr ~vdi = Storage_interface.unimplemented __FUNCTION__ - let similar_content ctx ~dbg ~sr ~vdi = u "VDI.similar_content" + let similar_content ctx ~dbg ~sr ~vdi = + Storage_interface.unimplemented __FUNCTION__ - let get_by_name ctx ~dbg ~sr ~name = u "VDI.get_by_name" + let get_by_name ctx ~dbg ~sr ~name = + Storage_interface.unimplemented __FUNCTION__ - let set_content_id ctx ~dbg ~sr ~vdi ~content_id = u "VDI.set_content_id" + let set_content_id ctx ~dbg ~sr ~vdi ~content_id = + Storage_interface.unimplemented __FUNCTION__ - let compose ctx ~dbg ~sr ~vdi1 ~vdi2 = u "VDI.compose" + let compose ctx ~dbg ~sr ~vdi1 ~vdi2 = + Storage_interface.unimplemented __FUNCTION__ - let add_to_sm_config ctx ~dbg ~sr ~vdi ~key ~value = u "VDI.add_to_sm_config" + let add_to_sm_config ctx ~dbg ~sr ~vdi ~key ~value = + Storage_interface.unimplemented __FUNCTION__ let remove_from_sm_config ctx ~dbg ~sr ~vdi ~key = - u "VDI.remove_from_sm_config" + Storage_interface.unimplemented __FUNCTION__ - let enable_cbt ctx 
~dbg ~sr ~vdi = u "VDI.enable_cbt" + let enable_cbt ctx ~dbg ~sr ~vdi = + Storage_interface.unimplemented __FUNCTION__ - let disable_cbt ctx ~dbg ~sr ~vdi = u "VDI.disable_cbt" + let disable_cbt ctx ~dbg ~sr ~vdi = + Storage_interface.unimplemented __FUNCTION__ - let data_destroy ctx ~dbg ~sr ~vdi = u "VDI.data_destroy" + let data_destroy ctx ~dbg ~sr ~vdi = + Storage_interface.unimplemented __FUNCTION__ let list_changed_blocks ctx ~dbg ~sr ~vdi_from ~vdi_to = - u "VDI.list_changed_blocks" + Storage_interface.unimplemented __FUNCTION__ end -let get_by_name ctx ~dbg ~name = u "get_by_name" +let get_by_name ctx ~dbg ~name = Storage_interface.unimplemented __FUNCTION__ module DATA = struct - let copy ctx ~dbg ~sr ~vdi ~vm ~url ~dest = u "DATA.copy" + let copy ctx ~dbg ~sr ~vdi ~vm ~url ~dest = + Storage_interface.unimplemented __FUNCTION__ - let mirror ctx ~dbg ~sr ~vdi ~vm ~dest = u "DATA.mirror" + let mirror ctx ~dbg ~sr ~vdi ~vm ~dest = + Storage_interface.unimplemented __FUNCTION__ - let stat ctx ~dbg ~sr ~vdi ~vm ~key = u "DATA.stat" + let stat ctx ~dbg ~sr ~vdi ~vm ~key = + Storage_interface.unimplemented __FUNCTION__ let import_activate ctx ~dbg ~dp ~sr ~vdi ~vm = - u "DATA.MIRROR.import_activate" + Storage_interface.unimplemented __FUNCTION__ - let get_nbd_server ctx ~dbg ~dp ~sr ~vdi ~vm = u "DATA.MIRROR.get_nbd_server" + let get_nbd_server ctx ~dbg ~dp ~sr ~vdi ~vm = + Storage_interface.unimplemented __FUNCTION__ module MIRROR = struct type context = unit let send_start ctx ~dbg ~task_id ~dp ~sr ~vdi ~mirror_vm ~mirror_id ~local_vdi ~copy_vm ~live_vm ~url ~remote_mirror ~dest_sr ~verify_dest = - u "DATA.MIRROR.send_start" + Storage_interface.unimplemented __FUNCTION__ let receive_start ctx ~dbg ~sr ~vdi_info ~id ~similar = - u "DATA.MIRROR.receive_start" + Storage_interface.unimplemented __FUNCTION__ let receive_start2 ctx ~dbg ~sr ~vdi_info ~id ~similar ~vm = - u "DATA.MIRROR.receive_start2" + Storage_interface.unimplemented __FUNCTION__ let receive_start3 ctx ~dbg ~sr ~vdi_info ~mirror_id ~similar ~vm ~url ~verify_dest = - u "DATA.MIRROR.receive_start3" + Storage_interface.unimplemented __FUNCTION__ - let receive_finalize ctx ~dbg ~id = u "DATA.MIRROR.receive_finalize" + let receive_finalize ctx ~dbg ~id = + Storage_interface.unimplemented __FUNCTION__ - let receive_finalize2 ctx ~dbg ~id = u "DATA.MIRROR.receive_finalize2" + let receive_finalize2 ctx ~dbg ~id = + Storage_interface.unimplemented __FUNCTION__ let receive_finalize3 ctx ~dbg ~mirror_id ~sr ~url ~verify_dest = - u "DATA.MIRROR.receive_finalize3" + Storage_interface.unimplemented __FUNCTION__ - let receive_cancel ctx ~dbg ~id = u "DATA.MIRROR.receive_cancel" + let receive_cancel ctx ~dbg ~id = + Storage_interface.unimplemented __FUNCTION__ let receive_cancel2 ctx ~dbg ~mirror_id ~url ~verify_dest = - u "DATA.MIRROR.receive_cancel2" + Storage_interface.unimplemented __FUNCTION__ let pre_deactivate_hook ctx ~dbg ~dp ~sr ~vdi = - u "DATA.MIRROR.pre_deactivate_hook" + Storage_interface.unimplemented __FUNCTION__ let has_mirror_failed ctx ~dbg ~mirror_id ~sr = - u "DATA.MIRROR.has_mirror_failed" + Storage_interface.unimplemented __FUNCTION__ - let list ctx ~dbg = u "DATA.MIRROR.list" + let list ctx ~dbg = Storage_interface.unimplemented __FUNCTION__ - let stat ctx ~dbg ~id = u "DATA.MIRROR.stat" + let stat ctx ~dbg ~id = Storage_interface.unimplemented __FUNCTION__ end end module Policy = struct - let get_backend_vm ctx ~dbg ~vm ~sr ~vdi = u "Policy.get_backend_vm" + let get_backend_vm ctx ~dbg ~vm ~sr ~vdi = + 
Storage_interface.unimplemented __FUNCTION__ end module TASK = struct - let stat ctx ~dbg ~task = u "TASK.stat" + let stat ctx ~dbg ~task = Storage_interface.unimplemented __FUNCTION__ - let cancel ctx ~dbg ~task = u "TASK.cancel" + let cancel ctx ~dbg ~task = Storage_interface.unimplemented __FUNCTION__ - let destroy ctx ~dbg ~task = u "TASK.destroy" + let destroy ctx ~dbg ~task = Storage_interface.unimplemented __FUNCTION__ - let list ctx ~dbg = u "TASK.list" + let list ctx ~dbg = Storage_interface.unimplemented __FUNCTION__ end diff --git a/ocaml/xapi/storage_mux.ml b/ocaml/xapi/storage_mux.ml index 1ea91e94078..7e66e1a4d87 100644 --- a/ocaml/xapi/storage_mux.ml +++ b/ocaml/xapi/storage_mux.ml @@ -844,12 +844,11 @@ module Mux = struct module MIRROR = struct type context = unit - let u x = raise Storage_interface.(Storage_error (Errors.Unimplemented x)) - let send_start _ctx ~dbg:_ ~task_id:_ ~dp:_ ~sr:_ ~vdi:_ ~mirror_vm:_ ~mirror_id:_ ~local_vdi:_ ~copy_vm:_ ~live_vm:_ ~url:_ ~remote_mirror:_ ~dest_sr:_ ~verify_dest:_ = - u "DATA.MIRROR.send_start" (* see storage_smapi{v1,v3}_migrate.ml *) + Storage_interface.unimplemented + __FUNCTION__ (* see storage_smapi{v1,v3}_migrate.ml *) let receive_start () ~dbg ~sr ~vdi_info ~id ~similar = with_dbg ~name:"DATA.MIRROR.receive_start" ~dbg @@ fun _di -> @@ -880,7 +879,7 @@ module Mux = struct (** see storage_smapiv{1,3}_migrate.receive_start3 *) let receive_start3 () ~dbg:_ ~sr:_ ~vdi_info:_ ~mirror_id:_ ~similar:_ ~vm:_ = - u __FUNCTION__ + Storage_interface.unimplemented __FUNCTION__ let receive_finalize () ~dbg ~id = with_dbg ~name:"DATA.MIRROR.receive_finalize" ~dbg @@ fun di -> @@ -893,7 +892,7 @@ module Mux = struct Storage_smapiv1_migrate.MIRROR.receive_finalize2 () ~dbg:di.log ~id let receive_finalize3 () ~dbg:_ ~mirror_id:_ ~sr:_ ~url:_ ~verify_dest:_ = - u __FUNCTION__ + Storage_interface.unimplemented __FUNCTION__ let receive_cancel () ~dbg ~id = with_dbg ~name:"DATA.MIRROR.receive_cancel" ~dbg @@ fun di -> @@ -901,13 +900,13 @@ module Mux = struct Storage_smapiv1_migrate.MIRROR.receive_cancel () ~dbg:di.log ~id let receive_cancel2 () ~dbg:_ ~mirror_id:_ ~url:_ ~verify_dest:_ = - u __FUNCTION__ + Storage_interface.unimplemented __FUNCTION__ let pre_deactivate_hook _ctx ~dbg:_ ~dp:_ ~sr:_ ~vdi:_ = - u "DATA.MIRROR.pre_deactivate_hook" + Storage_interface.unimplemented __FUNCTION__ let has_mirror_failed _ctx ~dbg:_ ~mirror_id:_ ~sr:_ = - u "DATA.MIRROR.has_mirror_failed" + Storage_interface.unimplemented __FUNCTION__ let list () ~dbg = with_dbg ~name:"DATA.MIRROR.list" ~dbg @@ fun di -> diff --git a/ocaml/xapi/storage_smapiv1_migrate.ml b/ocaml/xapi/storage_smapiv1_migrate.ml index fe291d44d66..c850d61f842 100644 --- a/ocaml/xapi/storage_smapiv1_migrate.ml +++ b/ocaml/xapi/storage_smapiv1_migrate.ml @@ -567,8 +567,6 @@ let mirror_cleanup ~dbg ~sr ~snapshot = module MIRROR : SMAPIv2_MIRROR = struct type context = unit - let u x = raise Storage_interface.(Storage_error (Errors.Unimplemented x)) - let send_start _ctx ~dbg ~task_id ~dp ~sr ~vdi ~mirror_vm ~mirror_id ~local_vdi ~copy_vm ~live_vm ~url ~remote_mirror ~dest_sr ~verify_dest = D.debug @@ -878,9 +876,9 @@ module MIRROR : SMAPIv2_MIRROR = struct | _ -> false - let list _ctx = u __FUNCTION__ + let list _ctx = Storage_interface.unimplemented __FUNCTION__ - let stat _ctx = u __FUNCTION__ + let stat _ctx = Storage_interface.unimplemented __FUNCTION__ let receive_cancel2 _ctx ~dbg ~mirror_id ~url ~verify_dest = let (module Remote) = diff --git a/ocaml/xapi/storage_smapiv1_wrapper.ml 
b/ocaml/xapi/storage_smapiv1_wrapper.ml index 7066a649ce2..86879780fba 100644 --- a/ocaml/xapi/storage_smapiv1_wrapper.ml +++ b/ocaml/xapi/storage_smapiv1_wrapper.ml @@ -1137,16 +1137,16 @@ functor end module DATA = struct - let u x = raise Storage_interface.(Storage_error (Errors.Unimplemented x)) - let copy context ~dbg ~sr ~vdi ~vm ~url ~dest = info "DATA.copy dbg:%s sr:%s vdi:%s url:%s dest:%s" dbg (s_of_sr sr) (s_of_vdi vdi) url (s_of_sr dest) ; Impl.DATA.copy context ~dbg ~sr ~vdi ~vm ~url ~dest - let mirror _context ~dbg:_ ~sr:_ ~vdi:_ ~vm:_ ~dest:_ = u "DATA.mirror" + let mirror _context ~dbg:_ ~sr:_ ~vdi:_ ~vm:_ ~dest:_ = + Storage_interface.unimplemented __FUNCTION__ - let stat _context ~dbg:_ ~sr:_ ~vdi:_ ~vm:_ ~key:_ = u "DATA.stat" + let stat _context ~dbg:_ ~sr:_ ~vdi:_ ~vm:_ ~key:_ = + Storage_interface.unimplemented __FUNCTION__ (* tapdisk supports three kind of nbd servers, the old style nbdserver, the new style nbd server and a real nbd server. The old and new style nbd servers @@ -1195,7 +1195,7 @@ functor let send_start _ctx ~dbg:_ ~task_id:_ ~dp:_ ~sr:_ ~vdi:_ ~mirror_vm:_ ~mirror_id:_ ~local_vdi:_ ~copy_vm:_ ~live_vm:_ ~url:_ ~remote_mirror:_ ~dest_sr:_ ~verify_dest:_ = - u "DATA.MIRROR.send_start" + Storage_interface.unimplemented __FUNCTION__ let receive_start context ~dbg ~sr ~vdi_info ~id ~similar = info "DATA.MIRROR.receive_start dbg:%s sr:%s id:%s similar:[%s]" dbg @@ -1215,7 +1215,7 @@ functor let receive_start3 _context ~dbg:_ ~sr:_ ~vdi_info:_ ~mirror_id:_ ~similar:_ ~vm:_ = (* See Storage_smapiv1_migrate.receive_start3 *) - u __FUNCTION__ + Storage_interface.unimplemented __FUNCTION__ let receive_finalize context ~dbg ~id = info "DATA.MIRROR.receive_finalize dbg:%s id:%s" dbg id ; @@ -1228,24 +1228,25 @@ functor let receive_finalize3 _context ~dbg:_ ~mirror_id:_ ~sr:_ ~url:_ ~verify_dest:_ = (* see storage_smapiv{1,3}_migrate *) - u __FUNCTION__ + Storage_interface.unimplemented __FUNCTION__ let receive_cancel context ~dbg ~id = info "DATA.MIRROR.receive_cancel dbg:%s id:%s" dbg id ; Impl.DATA.MIRROR.receive_cancel context ~dbg ~id let receive_cancel2 _context ~dbg:_ ~mirror_id:_ ~url:_ ~verify_dest:_ = - u __FUNCTION__ + Storage_interface.unimplemented __FUNCTION__ let pre_deactivate_hook _context ~dbg:_ ~dp:_ ~sr:_ ~vdi:_ = - u __FUNCTION__ + Storage_interface.unimplemented __FUNCTION__ let has_mirror_failed _context ~dbg:_ ~mirror_id:_ ~sr:_ = - u __FUNCTION__ + Storage_interface.unimplemented __FUNCTION__ - let list _context ~dbg:_ = u __FUNCTION__ + let list _context ~dbg:_ = Storage_interface.unimplemented __FUNCTION__ - let stat _context ~dbg:_ ~id:_ = u __FUNCTION__ + let stat _context ~dbg:_ ~id:_ = + Storage_interface.unimplemented __FUNCTION__ end end diff --git a/ocaml/xapi/storage_smapiv3_migrate.ml b/ocaml/xapi/storage_smapiv3_migrate.ml index d9d34ffbe08..774239c0804 100644 --- a/ocaml/xapi/storage_smapiv3_migrate.ml +++ b/ocaml/xapi/storage_smapiv3_migrate.ml @@ -108,8 +108,6 @@ let mirror_wait ~dbg ~sr ~vdi ~vm ~mirror_id mirror_key = module MIRROR : SMAPIv2_MIRROR = struct type context = unit - let u x = raise Storage_interface.(Storage_error (Errors.Unimplemented x)) - let send_start _ctx ~dbg ~task_id:_ ~dp ~sr ~vdi ~mirror_vm ~mirror_id ~local_vdi:_ ~copy_vm:_ ~live_vm ~url ~remote_mirror ~dest_sr ~verify_dest = @@ -187,10 +185,10 @@ module MIRROR : SMAPIv2_MIRROR = struct ) let receive_start _ctx ~dbg:_ ~sr:_ ~vdi_info:_ ~id:_ ~similar:_ = - u "DATA.MIRROR.receive_start" + Storage_interface.unimplemented __FUNCTION__ let receive_start2 
_ctx ~dbg:_ ~sr:_ ~vdi_info:_ ~id:_ ~similar:_ ~vm:_ = - u "DATA.MIRROR.receive_start2" + Storage_interface.unimplemented __FUNCTION__ let receive_start3 _ctx ~dbg ~sr ~vdi_info ~mirror_id ~similar:_ ~vm ~url ~verify_dest = @@ -269,9 +267,11 @@ module MIRROR : SMAPIv2_MIRROR = struct !on_fail ; raise e - let receive_finalize _ctx ~dbg:_ ~id:_ = u "DATA.MIRROR.receive_finalize" + let receive_finalize _ctx ~dbg:_ ~id:_ = + Storage_interface.unimplemented __FUNCTION__ - let receive_finalize2 _ctx ~dbg:_ ~id:_ = u "DATA.MIRROR.receive_finalize2" + let receive_finalize2 _ctx ~dbg:_ ~id:_ = + Storage_interface.unimplemented __FUNCTION__ let receive_finalize3 _ctx ~dbg ~mirror_id ~sr ~url ~verify_dest = D.debug "%s dbg:%s id: %s sr: %s url: %s verify_dest: %B" __FUNCTION__ dbg @@ -289,11 +289,12 @@ module MIRROR : SMAPIv2_MIRROR = struct recv_state ; State.remove_receive_mirror mirror_id - let receive_cancel _ctx ~dbg:_ ~id:_ = u __FUNCTION__ + let receive_cancel _ctx ~dbg:_ ~id:_ = + Storage_interface.unimplemented __FUNCTION__ - let list _ctx = u __FUNCTION__ + let list _ctx = Storage_interface.unimplemented __FUNCTION__ - let stat _ctx = u __FUNCTION__ + let stat _ctx = Storage_interface.unimplemented __FUNCTION__ let receive_cancel2 _ctx ~dbg ~mirror_id ~url ~verify_dest = D.debug "%s dbg:%s mirror_id:%s url:%s verify_dest:%B" __FUNCTION__ dbg diff --git a/ocaml/xenopsd/lib/xenops_server_skeleton.ml b/ocaml/xenopsd/lib/xenops_server_skeleton.ml index 2055837c47c..b938927a2e4 100644 --- a/ocaml/xenopsd/lib/xenops_server_skeleton.ml +++ b/ocaml/xenopsd/lib/xenops_server_skeleton.ml @@ -64,50 +64,49 @@ module VM = struct let remove _ = () - let create _ _ _ _ = unimplemented "VM.create" + let create _ _ _ _ = unimplemented __FUNCTION__ - let build ?restore_fd:_ _ _ _ _ _ = unimplemented "VM.build" + let build ?restore_fd:_ _ _ _ _ _ = unimplemented __FUNCTION__ - let create_device_model _ _ _ _ _ = unimplemented "VM.create_device_model" + let create_device_model _ _ _ _ _ = unimplemented __FUNCTION__ - let destroy_device_model _ _ = unimplemented "VM.destroy_device_model" + let destroy_device_model _ _ = unimplemented __FUNCTION__ - let destroy _ _ = unimplemented "VM.destroy" + let destroy _ _ = unimplemented __FUNCTION__ - let pause _ _ = unimplemented "VM.pause" + let pause _ _ = unimplemented __FUNCTION__ - let unpause _ _ = unimplemented "VM.unpause" + let unpause _ _ = unimplemented __FUNCTION__ - let set_xsdata _ _ _ = unimplemented "VM.set_xsdata" + let set_xsdata _ _ _ = unimplemented __FUNCTION__ - let set_vcpus _ _ _ = unimplemented "VM.set_vcpus" + let set_vcpus _ _ _ = unimplemented __FUNCTION__ - let set_shadow_multiplier _ _ _ = unimplemented "VM.set_shadow_multipler" + let set_shadow_multiplier _ _ _ = unimplemented __FUNCTION__ - let set_memory_dynamic_range _ _ _ _ = - unimplemented "VM.set_memory_dynamic_range" + let set_memory_dynamic_range _ _ _ _ = unimplemented __FUNCTION__ - let request_shutdown _ _ _ _ = unimplemented "VM.request_shutdown" + let request_shutdown _ _ _ _ = unimplemented __FUNCTION__ - let wait_shutdown _ _ _ _ = unimplemented "VM.wait_shutdown" + let wait_shutdown _ _ _ _ = unimplemented __FUNCTION__ - let assert_can_save _ = unimplemented "VM.assert_can_save" + let assert_can_save _ = unimplemented __FUNCTION__ - let save _ _ _ _ _ _ _ = unimplemented "VM.save" + let save _ _ _ _ _ _ _ = unimplemented __FUNCTION__ - let restore _ _ _ _ _ _ _ = unimplemented "VM.restore" + let restore _ _ _ _ _ _ _ = unimplemented __FUNCTION__ - let s3suspend _ _ = 
unimplemented "VM.s3suspend" + let s3suspend _ _ = unimplemented __FUNCTION__ - let s3resume _ _ = unimplemented "VM.s3resume" + let s3resume _ _ = unimplemented __FUNCTION__ - let soft_reset _ _ = unimplemented "VM.soft_reset" + let soft_reset _ _ = unimplemented __FUNCTION__ let get_state _ = Xenops_utils.halted_vm - let request_rdp _ _ = unimplemented "VM.request_rdp" + let request_rdp _ _ = unimplemented __FUNCTION__ - let run_script _ _ _ = unimplemented "VM.run_script" + let run_script _ _ _ = unimplemented __FUNCTION__ let set_domain_action_request _ _ = () @@ -131,9 +130,9 @@ module PCI = struct let dequarantine _ = () - let plug _ _ _ = unimplemented "PCI.plug" + let plug _ _ _ = unimplemented __FUNCTION__ - let unplug _ _ _ = unimplemented "PCI.unplug" + let unplug _ _ _ = unimplemented __FUNCTION__ let get_device_action_request _ _ = None end @@ -145,17 +144,17 @@ module VBD = struct let epoch_end _ _ _ = () - let attach _ _ _ = unimplemented "VBD.attach" + let attach _ _ _ = unimplemented __FUNCTION__ - let activate _ _ _ = unimplemented "VBD.activate" + let activate _ _ _ = unimplemented __FUNCTION__ - let deactivate _ _ _ _ = unimplemented "VBD.deactivate" + let deactivate _ _ _ _ = unimplemented __FUNCTION__ - let detach _ _ _ = unimplemented "VBD.detach" + let detach _ _ _ = unimplemented __FUNCTION__ - let insert _ _ _ _ = unimplemented "VBD.insert" + let insert _ _ _ _ = unimplemented __FUNCTION__ - let eject _ _ _ = unimplemented "VBD.eject" + let eject _ _ _ = unimplemented __FUNCTION__ let set_qos _ _ _ = () @@ -167,23 +166,21 @@ end module VIF = struct let set_active _ _ _ _ = () - let plug _ _ _ = unimplemented "VIF.plug" + let plug _ _ _ = unimplemented __FUNCTION__ - let unplug _ _ _ _ = unimplemented "VIF.unplug" + let unplug _ _ _ _ = unimplemented __FUNCTION__ - let move _ _ _ _ = unimplemented "VIF.move" + let move _ _ _ _ = unimplemented __FUNCTION__ - let set_carrier _ _ _ _ = unimplemented "VIF.set_carrier" + let set_carrier _ _ _ _ = unimplemented __FUNCTION__ - let set_locking_mode _ _ _ _ = unimplemented "VIF.set_locking_mode" + let set_locking_mode _ _ _ _ = unimplemented __FUNCTION__ - let set_ipv4_configuration _ _ _ _ = - unimplemented "VIF.set_ipv4_configuration" + let set_ipv4_configuration _ _ _ _ = unimplemented __FUNCTION__ - let set_ipv6_configuration _ _ _ _ = - unimplemented "VIF.set_ipv6_configuration" + let set_ipv6_configuration _ _ _ _ = unimplemented __FUNCTION__ - let set_pvs_proxy _ _ _ _ = unimplemented "VIF.set_pvs_proxy" + let set_pvs_proxy _ _ _ _ = unimplemented __FUNCTION__ let get_state _ _ = unplugged_vif @@ -191,7 +188,7 @@ module VIF = struct end module VGPU = struct - let start _ _ _ _ = unimplemented "VGPU.start" + let start _ _ _ _ = unimplemented __FUNCTION__ let set_active _ _ _ _ = () @@ -199,9 +196,9 @@ module VGPU = struct end module VUSB = struct - let plug _ _ _ = unimplemented "VUSB.plug" + let plug _ _ _ = unimplemented __FUNCTION__ - let unplug _ _ _ = unimplemented "VUSB.unplug" + let unplug _ _ _ = unimplemented __FUNCTION__ let get_state _ _ = unplugged_vusb @@ -216,4 +213,4 @@ module UPDATES = struct assert false end -module DEBUG = struct let trigger _ _ = unimplemented "DEBUG.trigger" end +module DEBUG = struct let trigger _ _ = unimplemented __FUNCTION__ end From 3ae8ff956f862a40fde3ee9d517afb860baba8b7 Mon Sep 17 00:00:00 2001 From: Steven Woods Date: Wed, 2 Apr 2025 21:19:54 +0100 Subject: [PATCH 054/111] Use just id_of vbd for attached_vdis key instead of the (VM, VBD) tuple The VM id part of Vbd.id is 
unnecessary in the attached_vdis key as the DB is already indexed by the VM id. This also prevents problems when the VM is renamed. Signed-off-by: Steven Woods --- ocaml/xenopsd/xc/xenops_server_xen.ml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ocaml/xenopsd/xc/xenops_server_xen.ml b/ocaml/xenopsd/xc/xenops_server_xen.ml index b8577746b88..ccf3eac9764 100644 --- a/ocaml/xenopsd/xc/xenops_server_xen.ml +++ b/ocaml/xenopsd/xc/xenops_server_xen.ml @@ -187,7 +187,7 @@ module VmExtra = struct ; pv_drivers_detected: bool [@default false] ; xen_platform: (int * int) option (* (device_id, revision) for QEMU *) ; platformdata: (string * string) list [@default []] - ; attached_vdis: (Vbd.id * attached_vdi) list [@default []] + ; attached_vdis: (string * attached_vdi) list [@default []] } [@@deriving rpcty] @@ -3682,9 +3682,13 @@ module VBD = struct persistent= { vm_t.VmExtra.persistent with + (* Index by id_of vbd rather than vbd.id as VmExtra is + already indexed by VM id, so the VM id part of + vbd.id is unnecessary and causes issues finding the + attached_vdi when the VM is renamed. *) attached_vdis= - (vbd.Vbd.id, vdi) - :: List.remove_assoc vbd.Vbd.id + (id_of vbd, vdi) + :: List.remove_assoc (id_of vbd) vm_t.persistent.attached_vdis } } @@ -3706,7 +3710,7 @@ module VBD = struct let activate task vm vbd = let vmextra = DB.read_exn vm in - match List.assoc_opt vbd.id vmextra.persistent.attached_vdis with + match List.assoc_opt (id_of vbd) vmextra.persistent.attached_vdis with | None -> debug "No attached_vdi info, so not activating" | Some vdi -> @@ -3857,7 +3861,7 @@ module VBD = struct ) vm ) - (fun () -> cleanup_attached_vdis vm vbd.id) + (fun () -> cleanup_attached_vdis vm (id_of vbd)) let deactivate task vm vbd force = with_xc_and_xs (fun xc xs -> @@ -4021,7 +4025,7 @@ module VBD = struct | _ -> () ) ; - cleanup_attached_vdis vm vbd.id + cleanup_attached_vdis vm (id_of vbd) let insert task vm vbd d = on_frontend From 0728527b0598c86f497bda5a6c387c360f3132ee Mon Sep 17 00:00:00 2001 From: Bengang Yuan Date: Tue, 24 Jun 2025 14:47:58 +0100 Subject: [PATCH 055/111] CA-410965: Modify default ref of console The `ref` parameter within the `location` attribute of the console can refer to either the VM's ref or the console's own ref. Currently, the console's location uses the VM's ref by default. This causes an issue: when executing xs console, the requested location contains the VM's ref. If the ref points to the VM, xapi will attempt to use the RFB console (which is graphical) by default, rather than the VT100 console (which is text-based). As a result, the xs console command fails to open the console and hangs. **Solution:** Update the default ref in the console's `location` to the console's own ref. With this change, whether accessing the RFB or the VT100 console, the ref in the `location` will always point to the respective console itself. 
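For illustration (the path component comes from `Constants.console_uri` and the refs below are placeholders), a dom0 console's `location` goes from

    https://<host-address><console-uri>?ref=<dom0 VM's opaque ref>

to

    https://<host-address><console-uri>?ref=<this console's opaque ref>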
Signed-off-by: Bengang Yuan --- ocaml/xapi/create_misc.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xapi/create_misc.ml b/ocaml/xapi/create_misc.ml index cd3412156cd..26d2c886d52 100644 --- a/ocaml/xapi/create_misc.ml +++ b/ocaml/xapi/create_misc.ml @@ -307,7 +307,7 @@ and create_domain_zero_console_record_with_protocol ~__context ~domain_zero_ref let location = Uri.( make ~scheme:"https" ~host:address ~path:Constants.console_uri - ~query:[("ref", [Ref.string_of domain_zero_ref])] + ~query:[("ref", [Ref.string_of console_ref])] () |> to_string ) From 38256eb03af53bce840e224692f2b2e9b61bf78a Mon Sep 17 00:00:00 2001 From: Guillaume Date: Thu, 13 Mar 2025 17:59:39 +0100 Subject: [PATCH 056/111] Design proposal for supported image formats (v3) Add details on specifying image format for VDI and VM migration. In particular This revision explains how to choose the destination image format during VDI creation and migration, including VM migration scenarios. Also fixes minor typos in the document. Signed-off-by: Guillaume --- .../design/sm-supported-image-formats.md | 138 +++++++++++++++--- 1 file changed, 114 insertions(+), 24 deletions(-) diff --git a/doc/content/design/sm-supported-image-formats.md b/doc/content/design/sm-supported-image-formats.md index fd1118e885d..3d860c2833f 100644 --- a/doc/content/design/sm-supported-image-formats.md +++ b/doc/content/design/sm-supported-image-formats.md @@ -2,7 +2,7 @@ title: Add supported image formats in sm-list layout: default design_doc: true -revision: 2 +revision: 3 status: proposed --- @@ -22,32 +22,16 @@ available formats. # Design Proposal To expose the available image formats to clients (e.g., XenCenter, XenOrchestra, etc.), -we propose adding a new field called `supported-image-formats` to the Storage Manager (SM) -module. This field will be included in the output of the `SM.get_all_records` call. +we propose adding a new field called `supported_image_formats` to the Storage Manager +(SM) module. This field will be included in the output of the `SM.get_all_records` call. -The `supported-image-formats` field will be populated by retrieving information -from the SMAPI drivers. Specifically, each driver will update its `DRIVER_INFO` -dictionary with a new key, `supported_image_formats`, which will contain a list -of strings representing the supported image formats -(for example: `["vhd", "raw", "qcow2"]`). - -The list designates the driver's preferred VDI format as its first entry. That -means that when migrating a VDI, the destination storage repository will -attempt to create a VDI in this preferred format. If the default format cannot -be used (e.g., due to size limitations), an error will be generated. - -If a driver does not provide this information (as is currently the case with existing -drivers), the default value will be an empty array. This signifies that it is the -driver that decides which format it will use. This ensures that the modification -remains compatible with both current and future drivers. 
- -With this new information, listing all parameters of the SM object will return: +- With this new information, listing all parameters of the SM object will return: ```bash # xe sm-list params=all ``` -will output something like: +Output of the command will look like (notice that CLI uses hyphens): ``` uuid ( RO) : c6ae9a43-fff6-e482-42a9-8c3f8c533e36 @@ -65,12 +49,118 @@ required-cluster-stack ( RO) : supported-image-formats ( RO) : vhd, raw, qcow2 ``` -This change impacts the SM data model, and as such, the XAPI database version will -be incremented. +## Implementation details + +The `supported_image_formats` field will be populated by retrieving information +from the SMAPI drivers. Specifically, each driver will update its `DRIVER_INFO` +dictionary with a new key, `supported_image_formats`, which will contain a list +of strings representing the supported image formats +(for example: `["vhd", "raw", "qcow2"]`). Although the formats are listed as a +list of strings, they are treated as a set-specifying the same format multiple +times has no effect. + +### Driver behavior without `supported_image_formats` + +If a driver does not provide this information (as is currently the case with +existing drivers), the default value will be an empty list. This signifies +that the driver determines which format to use when creating VDI. During a migration, +the destination driver will choose the format of the VDI if none is explicitly +specified. This ensures backward compatibility with both current and future drivers. + +### Specifying image formats for VDIs creation + +If the supported image format is exposed to the client, then, when creating new VDI, +user can specify the desired format via the `sm_config` parameter `image-format=qcow2` (or +any format that is supported). If no format is specified, the driver will use its +preferred default format. If the specified format is not supported, an error will be +generated indicating that the SR does not support it. Here is how it can be achieved +using the XE CLI: + +```bash +# xe vdi-create \ + sr-uuid=cbe2851e-9f9b-f310-9bca-254c1cf3edd8 \ + name-label="A new VDI" \ + virtual-size=10240 \ + sm-config:image-format=vhd +``` + +### Specifying image formats for VDIs migration + +When migrating a VDI, an API client may need to specify the desired image format if +the destination SR supports multiple storage formats. + +#### VDI pool migrate + +To support this, a new parameter, `dest_img_format`, is introduced to +`VDI.pool_migrate`. This field accepts a string specifying the desired format (e.g., *qcow2*), +ensuring that the VDI is migrated in the correct format. The new signature of +`VDI.pool_migrate` will be +`VDI ref pool_migrate (session ref, VDI ref, SR ref, string, (string -> string) map)`. + +If the specified format is not supported or cannot be used (e.g., due to size limitations), +an error will be generated. Validation will be performed as early as possible to prevent +disruptions during migration. These checks can be performed by examining the XAPI database +to determine whether the SR provided as the destination has a corresponding SM object with +the expected format. If this is not the case, a `format not found` error will be returned. +If no format is specified by the client, the destination driver will determine the appropriate +format. + +```bash +# xe vdi-pool-migrate \ + uuid= \ + sr-uuid= \ + dest-img-format=qcow2 +``` + +#### VM migration to remote host + +A VDI migration can also occur during a VM migration. 
In this case, we need to +be able to specify the expected destination format as well. Unlike `VDI.pool_migrate`, +which applies to a single VDI, VM migration may involve multiple VDIs. +The current signature of `VM.migrate_send` is `(session ref, VM ref, (string -> string) map, +bool, (VDI ref -> SR ref) map, (VIF ref -> network ref) map, (string -> string) map, +(VGPU ref -> GPU_group ref) map)`. Thus there is already a parameter that maps each source +VDI to its destination SR. We propose to add a new parameter that allows specifying the +desired destination format for a given source VDI: `(VDI ref -> string)`. It is +similar to the VDI-to-SR mapping. We will update the XE cli to support this new format. +It would be `image_format:=`: + +```bash +# xe vm-migrate \ + host-uuid= \ + remote-master= \ + remote-password= \ + remote-username= \ + vdi:= \ + vdi:= \ + image-format:=vhd \ + image-format:=qcow2 \ + uuid= +``` +The destination image format would be a string such as *vhd*, *qcow2*, or another +supported format. It is optional to specify a format. If omitted, the driver +managing the destination SR will determine the appropriate format. +As with VDI pool migration, if this parameter is not supported by the SM driver, +a `format not found` error will be returned. The validation must happen before +sending a creation message to the SM driver, ideally at the same time as checking +whether all VDIs can be migrated. + +To be able to check the format, we will need to modify `VM.assert_can_migrate` and +add the mapping from VDI references to their image formats, as is done in `VM.migrate_send`. # Impact -- **Data Model:** A new field (`supported-image-formats`) is added to the SM records. +It should have no impact on existing storage repositories that do not provide any information +about the supported image format. + +This change impacts the SM data model, and as such, the XAPI database version will +be incremented. It also impacts the API. + +- **Data Model:** + - A new field (`supported_image_formats`) is added to the SM records. + - A new parameter is added to `VM.migrate_send`: `(VDI ref -> string) map` + - A new parameter is added to `VM.assert_can_migrate`: `(VDI ref -> string) map` + - A new parameter is added to `VDI.pool_migrate`: `string` - **Client Awareness:** Clients like the `xe` CLI will now be able to query and display the supported image formats for a given SR. - **Database Versioning:** The XAPI database version will be updated to reflect this change. From ec612aebb261a6f3b1ad0e27531bb602f622c204 Mon Sep 17 00:00:00 2001 From: Bengang Yuan Date: Tue, 24 Jun 2025 11:02:45 +0100 Subject: [PATCH 057/111] CA-411477: Fix SM API version check failure When shutdown a VM, xapi will check SM API version to decide if to call `post_deactivate_hook`. But if the SR has already been unplugged, the checking will fail. Solution: Check if the dp of the SR still exists. If not, skip the SM API checking. 
Signed-off-by: Bengang Yuan --- ocaml/xapi/storage_mux.ml | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/ocaml/xapi/storage_mux.ml b/ocaml/xapi/storage_mux.ml index 7e66e1a4d87..0427f76ca54 100644 --- a/ocaml/xapi/storage_mux.ml +++ b/ocaml/xapi/storage_mux.ml @@ -644,16 +644,23 @@ module Mux = struct with_dbg ~name:"VDI.deativate" ~dbg @@ fun di -> info "VDI.deactivate dbg:%s dp:%s sr:%s vdi:%s vm:%s" dbg dp (s_of_sr sr) (s_of_vdi vdi) (s_of_vm vm) ; - let module C = StorageAPI (Idl.Exn.GenClient (struct - let rpc = of_sr sr - end)) in - C.VDI.deactivate (Debug_info.to_string di) dp sr vdi vm ; - (*XX The hook should not be called here, nor should storage_mux care about - the SMAPI version of the SR, but as xapi-storage-script cannot call code - xapi, and smapiv1_wrapper has state tracking logic, the hook has to be placed - here for now. *) - if smapi_version_of_sr sr = SMAPIv3 then - Storage_migrate.post_deactivate_hook ~sr ~vdi ~dp + let open DP_info in + match read dp with + | Some {sr; vdi; vm; _} -> + let module C = StorageAPI (Idl.Exn.GenClient (struct + let rpc = of_sr sr + end)) in + C.VDI.deactivate (Debug_info.to_string di) dp sr vdi vm ; + (*XX The hook should not be called here, nor should storage_mux care about + the SMAPI version of the SR, but as xapi-storage-script cannot call code + xapi, and smapiv1_wrapper has state tracking logic, the hook has to be placed + here for now. *) + if smapi_version_of_sr sr = SMAPIv3 then + Storage_migrate.post_deactivate_hook ~sr ~vdi ~dp + | None -> + info + "dp %s is not associated with a locally attached VDI; nothing to do" + dp let detach () ~dbg ~dp ~sr ~vdi ~vm = with_dbg ~name:"VDI.detach" ~dbg @@ fun di -> From 5351b0b62dfa3365cbb884f91a14b69c4ab6f812 Mon Sep 17 00:00:00 2001 From: Steven Woods Date: Wed, 2 Apr 2025 21:31:01 +0100 Subject: [PATCH 058/111] CP-54207: Move VBD_attach outside of VM migrate downtime VBDs can be attached to multiple VMs, so now that VBD_plug has been split into VBD_attach and VBD_activate, the attach can happen outside of the VM migrate downtime. This doesn't change the overall duration of the migration but can reduce the downtime by several seconds. This new functionality is dependent on two flags: firstly, xenopsd_vbd_plug_unplug_legacy must be false so that the VBD_attach and VBD_activate are separate atoms. This is off by default. 
Then there is another flag can_attach_early which is currently true iff the VBD's SM has required_api_version >= 3.0 Signed-off-by: Steven Woods --- ocaml/xapi-idl/xen/xenops_interface.ml | 1 + ocaml/xapi/xapi_sr.ml | 12 ++++++ ocaml/xapi/xapi_xenops.ml | 30 ++++++++++++++ ocaml/xenopsd/cli/xn.ml | 1 + ocaml/xenopsd/lib/xenops_server.ml | 55 +++++++++++++++++++++----- 5 files changed, 89 insertions(+), 10 deletions(-) diff --git a/ocaml/xapi-idl/xen/xenops_interface.ml b/ocaml/xapi-idl/xen/xenops_interface.ml index 9b3f2941910..41eb44e0875 100644 --- a/ocaml/xapi-idl/xen/xenops_interface.ml +++ b/ocaml/xapi-idl/xen/xenops_interface.ml @@ -303,6 +303,7 @@ module Vbd = struct ; extra_private_keys: (string * string) list [@default []] ; qos: qos option [@default None] ; persistent: bool [@default true] + ; can_attach_early: bool [@default false] } [@@deriving rpcty] diff --git a/ocaml/xapi/xapi_sr.ml b/ocaml/xapi/xapi_sr.ml index 4a0684147af..8261757bb5e 100644 --- a/ocaml/xapi/xapi_sr.ml +++ b/ocaml/xapi/xapi_sr.ml @@ -1080,3 +1080,15 @@ let get_live_hosts ~__context ~sr = Xapi_vm_helpers.assert_can_see_specified_SRs ~__context ~reqd_srs:[sr] ~host in Xapi_vm_helpers.possible_hosts ~__context ~choose_fn () + +let required_api_version_of_sr ~__context ~sr = + let sr_type = Db.SR.get_type ~__context ~self:sr in + let expr = + Xapi_database.Db_filter_types.(Eq (Field "type", Literal sr_type)) + in + match Db.SM.get_records_where ~__context ~expr with + | (_, sm) :: _ -> + Some sm.API.sM_required_api_version + | [] -> + warn "Couldn't find SM with type %s" sr_type ; + None diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index 8ad3c8b9962..83469f58eaf 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -553,6 +553,10 @@ let list_net_sriov_vf_pcis ~__context ~vm = None ) +module StringMap = Map.Make (String) + +let sr_version_cache = ref StringMap.empty + module MD = struct (** Convert between xapi DB records and xenopsd records *) @@ -684,6 +688,31 @@ module MD = struct ) else disk_of_vdi ~__context ~self:vbd.API.vBD_VDI in + let can_attach_early = + let sr_opt = + try Some (Db.VDI.get_SR ~__context ~self:vbd.API.vBD_VDI) + with _ -> None + in + match sr_opt with + | Some sr -> ( + let sr_key = Ref.string_of sr in + match StringMap.find_opt sr_key !sr_version_cache with + | Some cached_api_version -> + Version.String.ge cached_api_version "3.0" + | None -> ( + match Xapi_sr.required_api_version_of_sr ~__context ~sr with + | Some api_version -> + sr_version_cache := + StringMap.add sr_key api_version !sr_version_cache ; + Version.String.ge api_version "3.0" + | None -> + false + ) + ) + | None -> + (* If we can't get the SR, we have to default to false *) + false + in { id= (vm.API.vM_uuid, Device_number.to_linux_device device_number) ; position= Some device_number @@ -707,6 +736,7 @@ module MD = struct ( try Db.VDI.get_on_boot ~__context ~self:vbd.API.vBD_VDI = `persist with _ -> true ) + ; can_attach_early } let of_pvs_proxy ~__context vif proxy = diff --git a/ocaml/xenopsd/cli/xn.ml b/ocaml/xenopsd/cli/xn.ml index a6ed6a884bd..24fecb9cf09 100644 --- a/ocaml/xenopsd/cli/xn.ml +++ b/ocaml/xenopsd/cli/xn.ml @@ -278,6 +278,7 @@ let vbd_of_disk_info vm_id info = ; extra_private_keys= [] ; qos= None ; persistent= true + ; can_attach_early= false } let print_disk vbd = diff --git a/ocaml/xenopsd/lib/xenops_server.ml b/ocaml/xenopsd/lib/xenops_server.ml index 569dabc11a1..15715ac7ac7 100644 --- a/ocaml/xenopsd/lib/xenops_server.ml +++ 
b/ocaml/xenopsd/lib/xenops_server.ml @@ -1763,7 +1763,8 @@ let rec atomics_of_operation = function serial "VIF.activate_and_plug" ~id [VIF_set_active (vif.Vif.id, true); VIF_plug vif.Vif.id] ) - | VM_restore_devices (id, restore_vifs) -> + | VM_restore_devices (id, migration) -> + let restore_vifs = not migration in let vbds_rw, vbds_ro = VBD_DB.vbds id |> vbd_plug_sets in let vgpus = VGPU_DB.vgpus id in let pcis = PCI_DB.pcis id |> pci_plug_order in @@ -1773,8 +1774,22 @@ let rec atomics_of_operation = function let name_multi = pf "VBDs.activate_and_plug %s" typ in let name_one = pf "VBD.activate_and_plug %s" typ in parallel_map name_multi ~id vbds (fun vbd -> - serial name_one ~id - [VBD_set_active (vbd.Vbd.id, true); vbd_plug vbd.Vbd.id] + (* When migrating, attach early if the vbd's SM allows it. + Note: there is a bug here for SxM if migrating between API + versions as the Vbd's new SR won't have propagated to xenopsd + yet. This means can_attach_early will be based on the origin SR. + This is a non-issue as v1 <-> v3 migration is still experimental + and v1 is already early-attaching in SxM through mirroring. + *) + if + migration + && (not !xenopsd_vbd_plug_unplug_legacy) + && vbd.Vbd.can_attach_early + then + [VBD_activate vbd.Vbd.id] + else + serial name_one ~id + [VBD_set_active (vbd.Vbd.id, true); vbd_plug vbd.Vbd.id] ) in [ @@ -1897,7 +1912,7 @@ let rec atomics_of_operation = function ] ; vgpu_start_operations ; [VM_restore (id, data, vgpu_data)] - ; atomics_of_operation (VM_restore_devices (id, true)) + ; atomics_of_operation (VM_restore_devices (id, false)) ; [ (* At this point the domain is considered survivable. *) VM_set_domain_action_request (id, None) @@ -2696,9 +2711,9 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit = | VM_restore_vifs id -> debug "VM_restore_vifs %s" id ; perform_atomics (atomics_of_operation op) t - | VM_restore_devices (id, restore_vifs) -> + | VM_restore_devices (id, migration) -> (* XXX: this is delayed due to the 'attach'/'activate' behaviour *) - debug "VM_restore_devices %s %b" id restore_vifs ; + debug "VM_restore_devices %s %b" id migration ; perform_atomics (atomics_of_operation op) t | VM_resume (id, _data) -> debug "VM.resume %s" id ; @@ -3022,11 +3037,31 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit = ( try let no_sharept = VGPU_DB.vgpus id |> List.exists is_no_sharept in debug "VM %s no_sharept=%b (%s)" id no_sharept __LOC__ ; + (* If plug is split into activate and attach, we could attach + early so that it is outside of the VM downtime (if the SM + supports this) *) + let early_attach = + parallel_map "VBDs.set_active_and_attach" ~id (VBD_DB.vbds id) + (fun vbd -> + if + (not !xenopsd_vbd_plug_unplug_legacy) + && vbd.Vbd.can_attach_early + then + serial "VBD.set_active_and_attach" ~id + [ + VBD_set_active (vbd.Vbd.id, true) + ; VBD_attach vbd.Vbd.id + ] + else + [] + ) + in perform_atomics ([VM_create (id, Some memory_limit, Some final_id, no_sharept)] - @ (* Perform as many operations as possible on the destination - domain before pausing the original domain *) - atomics_of_operation (VM_restore_vifs id) + (* Perform as many operations as possible on the destination + domain before pausing the original domain *) + @ atomics_of_operation (VM_restore_vifs id) + @ early_attach ) t ; Handshake.send s Handshake.Success @@ -3142,7 +3177,7 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit = ) ; debug "VM.receive_memory: restoring remaining 
devices and unpausing" ; perform_atomics - (atomics_of_operation (VM_restore_devices (final_id, false)) + (atomics_of_operation (VM_restore_devices (final_id, true)) @ [ VM_unpause final_id ; VM_set_domain_action_request (final_id, None) From 4697489ae095af240c205c662bf36581c270bd81 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 19 Jun 2025 16:05:39 +0100 Subject: [PATCH 059/111] xenopsd/xc: do try to allocate pages to a particular NUMA node This reverts commit 9e6fb15bb069404a64836dc7c6603d41226cc6bb Signed-off-by: Pau Ruiz Safont --- ocaml/xenopsd/xc/domain.ml | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index c1561b862a5..c4730673295 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -886,7 +886,7 @@ let numa_placement domid ~vcpus ~memory affinity = Array.map2 NUMAResource.min_memory (Array.of_list nodes) a in numa_resources := Some nodea ; - let _ = + let memory_plan = match Softaffinity.plan ~vm host nodea with | None -> D.debug "NUMA-aware placement failed for domid %d" domid ; @@ -898,9 +898,28 @@ let numa_placement domid ~vcpus ~memory affinity = done ; mem_plan in - (* Neither xenguest nor emu-manager allow allocating pages to a single - NUMA node, don't return any NUMA in any case. Claiming the memory - would be done here, but it conflicts with DMC. *) + (* Xen only allows a single node when using memory claims, or none at all. *) + let* numa_node, node = + match memory_plan with + | [Node node] -> + Some (Xenctrlext.NumaNode.from node, node) + | [] | _ :: _ :: _ -> + D.debug + "%s: domain %d can't fit a single NUMA node, falling back to \ + default behaviour" + __FUNCTION__ domid ; + None + in + let nr_pages = Int64.div memory 4096L |> Int64.to_int in + try + Xenctrlext.domain_claim_pages xcext domid ~numa_node nr_pages ; + Some (node, memory) + with Xenctrlext.Unix_error (errno, _) -> + D.info + "%s: unable to claim enough memory, domain %d won't be hosted in a \ + single NUMA node. (error %s)" + __FUNCTION__ domid + Unix.(error_message errno) ; None ) From d09bcec2ec83578832edab73fbb1b193a177e29c Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 19 Jun 2025 16:14:23 +0100 Subject: [PATCH 060/111] xenopsd/xc: Expect an error when claiming pages from a single NUMA node On xen versions that don't support this call yet, xenctrlext will simply fail and continue to behave like before. Signed-off-by: Pau Ruiz Safont --- ocaml/xenopsd/xc/domain.ml | 19 ++++++++++++------- ocaml/xenopsd/xc/xenctrlext.ml | 2 ++ ocaml/xenopsd/xc/xenctrlext.mli | 6 +++++- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index c4730673295..287c1c77b27 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -914,13 +914,18 @@ let numa_placement domid ~vcpus ~memory affinity = try Xenctrlext.domain_claim_pages xcext domid ~numa_node nr_pages ; Some (node, memory) - with Xenctrlext.Unix_error (errno, _) -> - D.info - "%s: unable to claim enough memory, domain %d won't be hosted in a \ - single NUMA node. (error %s)" - __FUNCTION__ domid - Unix.(error_message errno) ; - None + with + | Xenctrlext.Not_available -> + (* Xen does not provide the interface to claim pages from a single NUMA + node, ignore the error and continue. *) + None + | Xenctrlext.Unix_error (errno, _) -> + D.info + "%s: unable to claim enough memory, domain %d won't be hosted in a \ + single NUMA node. 
(error %s)" + __FUNCTION__ domid + Unix.(error_message errno) ; + None ) let build_pre ~xc ~xs ~vcpus ~memory ~hard_affinity domid = diff --git a/ocaml/xenopsd/xc/xenctrlext.ml b/ocaml/xenopsd/xc/xenctrlext.ml index 4078ee7b945..3760d1ab35d 100644 --- a/ocaml/xenopsd/xc/xenctrlext.ml +++ b/ocaml/xenopsd/xc/xenctrlext.ml @@ -125,5 +125,7 @@ module NumaNode = struct let from = Fun.id end +exception Not_available + let domain_claim_pages handle domid ?(numa_node = NumaNode.none) nr_pages = stub_domain_claim_pages handle domid numa_node nr_pages diff --git a/ocaml/xenopsd/xc/xenctrlext.mli b/ocaml/xenopsd/xc/xenctrlext.mli index 2199f42c452..1572a1a8589 100644 --- a/ocaml/xenopsd/xc/xenctrlext.mli +++ b/ocaml/xenopsd/xc/xenctrlext.mli @@ -102,5 +102,9 @@ module NumaNode : sig val from : int -> t end +exception Not_available + val domain_claim_pages : handle -> domid -> ?numa_node:NumaNode.t -> int -> unit -(** Raises {Unix_error} if there's not enough memory to claim in the system *) +(** Raises {Unix_error} if there's not enough memory to claim in the system. + Raises {Not_available} if a single numa node is requested and xen does not + provide page claiming for single numa nodes. *) From 43a7ab2f8c6126b4e3515be055746e4eeaa48b1b Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 19 Jun 2025 16:15:58 +0100 Subject: [PATCH 061/111] xenopsd/xc: fail when claiming pages for a single NUMA node This interface is not yet available in xen, so fail before doing the hypercall. This patch is meant to be reverted on system that provide the new interface for easily test it. Signed-off-by: Pau Ruiz Safont --- ocaml/xenopsd/xc/xenctrlext.ml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocaml/xenopsd/xc/xenctrlext.ml b/ocaml/xenopsd/xc/xenctrlext.ml index 3760d1ab35d..1c983daae26 100644 --- a/ocaml/xenopsd/xc/xenctrlext.ml +++ b/ocaml/xenopsd/xc/xenctrlext.ml @@ -128,4 +128,6 @@ end exception Not_available let domain_claim_pages handle domid ?(numa_node = NumaNode.none) nr_pages = + if numa_node <> NumaNode.none then + raise Not_available ; stub_domain_claim_pages handle domid numa_node nr_pages From 24ebb5b1356247da57f667b73852be7d9419a93b Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Thu, 26 Jun 2025 14:46:55 +0100 Subject: [PATCH 062/111] idl: Remove unused vm_lacks_feature_* errors Most of these have been unused for almost 10 years since c4ccc564e ("CA-217842: Replace instances of vm_lacks_feature_x with vm_lacks_feature") Drop them. Signed-off-by: Andrii Sultanov --- ocaml/idl/datamodel_errors.ml | 20 -------------------- ocaml/xapi-consts/api_errors.ml | 9 --------- 2 files changed, 29 deletions(-) diff --git a/ocaml/idl/datamodel_errors.ml b/ocaml/idl/datamodel_errors.ml index b22d91f9715..30e185ac192 100644 --- a/ocaml/idl/datamodel_errors.ml +++ b/ocaml/idl/datamodel_errors.ml @@ -532,26 +532,6 @@ let _ = "You attempted an operation on a VM which requires a more recent version \ of the PV drivers. Please upgrade your PV drivers." () ; - error Api_errors.vm_lacks_feature_shutdown ["vm"] - ~doc: - "You attempted an operation which needs the cooperative shutdown feature \ - on a VM which lacks it." - () ; - error Api_errors.vm_lacks_feature_vcpu_hotplug ["vm"] - ~doc: - "You attempted an operation which needs the VM hotplug-vcpu feature on a \ - VM which lacks it." - () ; - error Api_errors.vm_lacks_feature_suspend ["vm"] - ~doc: - "You attempted an operation which needs the VM cooperative suspend \ - feature on a VM which lacks it." 
- () ; - error Api_errors.vm_lacks_feature_static_ip_setting ["vm"] - ~doc: - "You attempted an operation which needs the VM static-ip-setting feature \ - on a VM which lacks it." - () ; error Api_errors.vm_lacks_feature ["vm"] ~doc:"You attempted an operation on a VM which lacks the feature." () ; error Api_errors.vm_is_template ["vm"] diff --git a/ocaml/xapi-consts/api_errors.ml b/ocaml/xapi-consts/api_errors.ml index d5927c91bfb..077a8dacbf9 100644 --- a/ocaml/xapi-consts/api_errors.ml +++ b/ocaml/xapi-consts/api_errors.ml @@ -440,15 +440,6 @@ let vm_old_pv_drivers = add_error "VM_OLD_PV_DRIVERS" let vm_lacks_feature = add_error "VM_LACKS_FEATURE" -let vm_lacks_feature_shutdown = add_error "VM_LACKS_FEATURE_SHUTDOWN" - -let vm_lacks_feature_suspend = add_error "VM_LACKS_FEATURE_SUSPEND" - -let vm_lacks_feature_vcpu_hotplug = add_error "VM_LACKS_FEATURE_VCPU_HOTPLUG" - -let vm_lacks_feature_static_ip_setting = - add_error "VM_LACKS_FEATURE_STATIC_IP_SETTING" - let vm_cannot_delete_default_template = add_error "VM_CANNOT_DELETE_DEFAULT_TEMPLATE" From d9233f99d3a34c7ad399786f45b4581cdad35279 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Tue, 22 Apr 2025 09:30:40 +0100 Subject: [PATCH 063/111] python: Add qcow2-to-stdout.py script Taken from https://github.com/qemu/qemu/blob/a9cd5bc6399a80fcf233ed0fffe6067b731227d8/scripts/qcow2-to-stdout.py > This tool converts a disk image to qcow2, writing the result directly > to stdout. This can be used for example to send the generated file > over the network. > > This is equivalent to using qemu-img to convert a file to qcow2 and > then writing the result to stdout, with the difference that this tool > does not need to create this temporary qcow2 file and therefore does > not need any additional disk space. 
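A minimal usage sketch (the remote host and paths below are illustrative; the
flags are the ones defined by the script's argument parser, and the install
path matches the new `qcow_to_stdout` default added to xapi_globs):

```bash
# Stream a raw disk image as qcow2 to another machine without creating an
# intermediate qcow2 file locally. Non-raw inputs (-f <format>) additionally
# require qemu-storage-daemon, which the script uses to expose the image as
# raw over a temporary FUSE export.
/opt/xensource/libexec/qcow2-to-stdout.py -f raw /path/to/disk.img | \
    ssh backup-host 'cat > /srv/images/disk.qcow2'
```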
Signed-off-by: Andrii Sultanov --- ocaml/xapi/xapi_globs.ml | 3 + python3/Makefile | 1 + python3/libexec/qcow2-to-stdout.py | 450 +++++++++++++++++++++++++++++ 3 files changed, 454 insertions(+) create mode 100755 python3/libexec/qcow2-to-stdout.py diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index e3957deea71..2390b93ab01 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -805,6 +805,8 @@ let sparse_dd = ref "sparse_dd" let vhd_tool = ref "vhd-tool" +let qcow_to_stdout = ref "/opt/xensource/libexec/qcow2-to-stdout.py" + let fence = ref "fence" let host_bugreport_upload = ref "host-bugreport-upload" @@ -1799,6 +1801,7 @@ module Resources = struct ) ; ("sparse_dd", sparse_dd, "Path to sparse_dd") ; ("vhd-tool", vhd_tool, "Path to vhd-tool") + ; ("qcow_to_stdout", qcow_to_stdout, "Path to qcow-to-stdout script") ; ("fence", fence, "Path to fence binary, used for HA host fencing") ; ( "host-bugreport-upload" , host_bugreport_upload diff --git a/python3/Makefile b/python3/Makefile index fb13068ca0e..3646ad9f54a 100644 --- a/python3/Makefile +++ b/python3/Makefile @@ -30,6 +30,7 @@ install: $(IPROG) libexec/mail-alarm $(DESTDIR)$(LIBEXECDIR) $(IPROG) libexec/backup-sr-metadata.py $(DESTDIR)$(LIBEXECDIR) $(IPROG) libexec/restore-sr-metadata.py $(DESTDIR)$(LIBEXECDIR) + $(IPROG) libexec/qcow2-to-stdout.py $(DESTDIR)$(LIBEXECDIR) $(IPROG) bin/hfx_filename $(DESTDIR)$(OPTDIR)/bin $(IPROG) bin/xe-reset-networking $(DESTDIR)$(OPTDIR)/bin diff --git a/python3/libexec/qcow2-to-stdout.py b/python3/libexec/qcow2-to-stdout.py new file mode 100755 index 00000000000..8109f8fc351 --- /dev/null +++ b/python3/libexec/qcow2-to-stdout.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 + +# This tool reads a disk image in any format and converts it to qcow2, +# writing the result directly to stdout. +# +# Copyright (C) 2024 Igalia, S.L. +# +# Authors: Alberto Garcia +# Madeeha Javed +# +# SPDX-License-Identifier: GPL-2.0-or-later +# +# qcow2 files produced by this script are always arranged like this: +# +# - qcow2 header +# - refcount table +# - refcount blocks +# - L1 table +# - L2 tables +# - Data clusters +# +# A note about variable names: in qcow2 there is one refcount table +# and one (active) L1 table, although each can occupy several +# clusters. For the sake of simplicity the code sometimes talks about +# refcount tables and L1 tables when referring to those clusters. + +import argparse +import errno +import math +import os +import signal +import struct +import subprocess +import sys +import tempfile +import time +from contextlib import contextmanager + +QCOW2_DEFAULT_CLUSTER_SIZE = 65536 +QCOW2_DEFAULT_REFCOUNT_BITS = 16 +QCOW2_FEATURE_NAME_TABLE = 0x6803F857 +QCOW2_DATA_FILE_NAME_STRING = 0x44415441 +QCOW2_V3_HEADER_LENGTH = 112 # Header length in QEMU 9.0. Must be a multiple of 8 +QCOW2_INCOMPAT_DATA_FILE_BIT = 2 +QCOW2_AUTOCLEAR_DATA_FILE_RAW_BIT = 1 +QCOW_OFLAG_COPIED = 1 << 63 +QEMU_STORAGE_DAEMON = "qemu-storage-daemon" + + +def bitmap_set(bitmap, idx): + bitmap[idx // 8] |= 1 << (idx % 8) + + +def bitmap_is_set(bitmap, idx): + return (bitmap[idx // 8] & (1 << (idx % 8))) != 0 + + +def bitmap_iterator(bitmap, length): + for idx in range(length): + if bitmap_is_set(bitmap, idx): + yield idx + + +def align_up(num, d): + return d * math.ceil(num / d) + + +# Holes in the input file contain only zeroes so we can skip them and +# save time. This function returns the indexes of the clusters that +# are known to contain data. Those are the ones that we need to read. 
+def clusters_with_data(fd, cluster_size): + data_to = 0 + while True: + try: + data_from = os.lseek(fd, data_to, os.SEEK_DATA) + data_to = align_up(os.lseek(fd, data_from, os.SEEK_HOLE), cluster_size) + for idx in range(data_from // cluster_size, data_to // cluster_size): + yield idx + except OSError as err: + if err.errno == errno.ENXIO: # End of file reached + break + raise err + + +# write_qcow2_content() expects a raw input file. If we have a different +# format we can use qemu-storage-daemon to make it appear as raw. +@contextmanager +def get_input_as_raw_file(input_file, input_format): + if input_format == "raw": + yield input_file + return + try: + temp_dir = tempfile.mkdtemp() + pid_file = os.path.join(temp_dir, "pid") + raw_file = os.path.join(temp_dir, "raw") + open(raw_file, "wb").close() + ret = subprocess.run( + [ + QEMU_STORAGE_DAEMON, + "--daemonize", + "--pidfile", pid_file, + "--blockdev", f"driver=file,node-name=file0,driver=file,filename={input_file},read-only=on", + "--blockdev", f"driver={input_format},node-name=disk0,file=file0,read-only=on", + "--export", f"type=fuse,id=export0,node-name=disk0,mountpoint={raw_file},writable=off", + ], + capture_output=True, + ) + if ret.returncode != 0: + sys.exit("[Error] Could not start the qemu-storage-daemon:\n" + + ret.stderr.decode().rstrip('\n')) + yield raw_file + finally: + # Kill the storage daemon on exit + # and remove all temporary files + if os.path.exists(pid_file): + with open(pid_file, "r") as f: + pid = int(f.readline()) + os.kill(pid, signal.SIGTERM) + while os.path.exists(pid_file): + time.sleep(0.1) + os.unlink(raw_file) + os.rmdir(temp_dir) + + +def write_features(cluster, offset, data_file_name): + if data_file_name is not None: + encoded_name = data_file_name.encode("utf-8") + padded_name_len = align_up(len(encoded_name), 8) + struct.pack_into(f">II{padded_name_len}s", cluster, offset, + QCOW2_DATA_FILE_NAME_STRING, + len(encoded_name), + encoded_name) + offset += 8 + padded_name_len + + qcow2_features = [ + # Incompatible + (0, 0, "dirty bit"), + (0, 1, "corrupt bit"), + (0, 2, "external data file"), + (0, 3, "compression type"), + (0, 4, "extended L2 entries"), + # Compatible + (1, 0, "lazy refcounts"), + # Autoclear + (2, 0, "bitmaps"), + (2, 1, "raw external data"), + ] + struct.pack_into(">I", cluster, offset, QCOW2_FEATURE_NAME_TABLE) + struct.pack_into(">I", cluster, offset + 4, len(qcow2_features) * 48) + offset += 8 + for feature_type, feature_bit, feature_name in qcow2_features: + struct.pack_into(">BB46s", cluster, offset, + feature_type, feature_bit, feature_name.encode("ascii")) + offset += 48 + + +def write_qcow2_content(input_file, cluster_size, refcount_bits, data_file_name, data_file_raw): + # Some basic values + l1_entries_per_table = cluster_size // 8 + l2_entries_per_table = cluster_size // 8 + refcounts_per_table = cluster_size // 8 + refcounts_per_block = cluster_size * 8 // refcount_bits + + # Virtual disk size, number of data clusters and L1 entries + disk_size = align_up(os.path.getsize(input_file), 512) + total_data_clusters = math.ceil(disk_size / cluster_size) + l1_entries = math.ceil(total_data_clusters / l2_entries_per_table) + allocated_l1_tables = math.ceil(l1_entries / l1_entries_per_table) + + # Max L1 table size is 32 MB (QCOW_MAX_L1_SIZE in block/qcow2.h) + if (l1_entries * 8) > (32 * 1024 * 1024): + sys.exit("[Error] The image size is too large. 
Try using a larger cluster size.") + + # Two bitmaps indicating which L1 and L2 entries are set + l1_bitmap = bytearray(allocated_l1_tables * l1_entries_per_table // 8) + l2_bitmap = bytearray(l1_entries * l2_entries_per_table // 8) + allocated_l2_tables = 0 + allocated_data_clusters = 0 + + if data_file_raw: + # If data_file_raw is set then all clusters are allocated and + # we don't need to read the input file at all. + allocated_l2_tables = l1_entries + for idx in range(l1_entries): + bitmap_set(l1_bitmap, idx) + for idx in range(total_data_clusters): + bitmap_set(l2_bitmap, idx) + else: + # Open the input file for reading + fd = os.open(input_file, os.O_RDONLY) + zero_cluster = bytes(cluster_size) + # Read all the clusters that contain data + for idx in clusters_with_data(fd, cluster_size): + cluster = os.pread(fd, cluster_size, cluster_size * idx) + # If the last cluster is smaller than cluster_size pad it with zeroes + if len(cluster) < cluster_size: + cluster += bytes(cluster_size - len(cluster)) + # If a cluster has non-zero data then it must be allocated + # in the output file and its L2 entry must be set + if cluster != zero_cluster: + bitmap_set(l2_bitmap, idx) + allocated_data_clusters += 1 + # Allocated data clusters also need their corresponding L1 entry and L2 table + l1_idx = math.floor(idx / l2_entries_per_table) + if not bitmap_is_set(l1_bitmap, l1_idx): + bitmap_set(l1_bitmap, l1_idx) + allocated_l2_tables += 1 + + # Total amount of allocated clusters excluding the refcount blocks and table + total_allocated_clusters = 1 + allocated_l1_tables + allocated_l2_tables + if data_file_name is None: + total_allocated_clusters += allocated_data_clusters + + # Clusters allocated for the refcount blocks and table + allocated_refcount_blocks = math.ceil(total_allocated_clusters / refcounts_per_block) + allocated_refcount_tables = math.ceil(allocated_refcount_blocks / refcounts_per_table) + + # Now we have a problem because allocated_refcount_blocks and allocated_refcount_tables... + # (a) increase total_allocated_clusters, and + # (b) need to be recalculated when total_allocated_clusters is increased + # So we need to repeat the calculation as long as the numbers change + while True: + new_total_allocated_clusters = total_allocated_clusters + allocated_refcount_tables + allocated_refcount_blocks + new_allocated_refcount_blocks = math.ceil(new_total_allocated_clusters / refcounts_per_block) + if new_allocated_refcount_blocks > allocated_refcount_blocks: + allocated_refcount_blocks = new_allocated_refcount_blocks + allocated_refcount_tables = math.ceil(allocated_refcount_blocks / refcounts_per_table) + else: + break + + # Now that we have the final numbers we can update total_allocated_clusters + total_allocated_clusters += allocated_refcount_tables + allocated_refcount_blocks + + # At this point we have the exact number of clusters that the output + # image is going to use so we can calculate all the offsets. 
+ current_cluster_idx = 1 + + refcount_table_offset = current_cluster_idx * cluster_size + current_cluster_idx += allocated_refcount_tables + + refcount_block_offset = current_cluster_idx * cluster_size + current_cluster_idx += allocated_refcount_blocks + + l1_table_offset = current_cluster_idx * cluster_size + current_cluster_idx += allocated_l1_tables + + l2_table_offset = current_cluster_idx * cluster_size + current_cluster_idx += allocated_l2_tables + + data_clusters_offset = current_cluster_idx * cluster_size + + # Calculate some values used in the qcow2 header + if allocated_l1_tables == 0: + l1_table_offset = 0 + + hdr_cluster_bits = int(math.log2(cluster_size)) + hdr_refcount_bits = int(math.log2(refcount_bits)) + hdr_length = QCOW2_V3_HEADER_LENGTH + hdr_incompat_features = 0 + if data_file_name is not None: + hdr_incompat_features |= 1 << QCOW2_INCOMPAT_DATA_FILE_BIT + hdr_autoclear_features = 0 + if data_file_raw: + hdr_autoclear_features |= 1 << QCOW2_AUTOCLEAR_DATA_FILE_RAW_BIT + + ### Write qcow2 header + cluster = bytearray(cluster_size) + struct.pack_into(">4sIQIIQIIQQIIQQQQII", cluster, 0, + b"QFI\xfb", # QCOW magic string + 3, # version + 0, # backing file offset + 0, # backing file sizes + hdr_cluster_bits, + disk_size, + 0, # encryption method + l1_entries, + l1_table_offset, + refcount_table_offset, + allocated_refcount_tables, + 0, # number of snapshots + 0, # snapshot table offset + hdr_incompat_features, + 0, # compatible features + hdr_autoclear_features, + hdr_refcount_bits, + hdr_length, + ) + + write_features(cluster, hdr_length, data_file_name) + + sys.stdout.buffer.write(cluster) + + ### Write refcount table + cur_offset = refcount_block_offset + remaining_refcount_table_entries = allocated_refcount_blocks # Each entry is a pointer to a refcount block + while remaining_refcount_table_entries > 0: + cluster = bytearray(cluster_size) + to_write = min(remaining_refcount_table_entries, refcounts_per_table) + remaining_refcount_table_entries -= to_write + for idx in range(to_write): + struct.pack_into(">Q", cluster, idx * 8, cur_offset) + cur_offset += cluster_size + sys.stdout.buffer.write(cluster) + + ### Write refcount blocks + remaining_refcount_block_entries = total_allocated_clusters # One entry for each allocated cluster + for tbl in range(allocated_refcount_blocks): + cluster = bytearray(cluster_size) + to_write = min(remaining_refcount_block_entries, refcounts_per_block) + remaining_refcount_block_entries -= to_write + # All refcount entries contain the number 1. The only difference + # is their bit width, defined when the image is created. 
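+ # (A refcount of 1 is always correct here: the image has no snapshots,
+ # so every allocated cluster is referenced exactly once.)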
+ for idx in range(to_write): + if refcount_bits == 64: + struct.pack_into(">Q", cluster, idx * 8, 1) + elif refcount_bits == 32: + struct.pack_into(">L", cluster, idx * 4, 1) + elif refcount_bits == 16: + struct.pack_into(">H", cluster, idx * 2, 1) + elif refcount_bits == 8: + cluster[idx] = 1 + elif refcount_bits == 4: + cluster[idx // 2] |= 1 << ((idx % 2) * 4) + elif refcount_bits == 2: + cluster[idx // 4] |= 1 << ((idx % 4) * 2) + elif refcount_bits == 1: + cluster[idx // 8] |= 1 << (idx % 8) + sys.stdout.buffer.write(cluster) + + ### Write L1 table + cur_offset = l2_table_offset + for tbl in range(allocated_l1_tables): + cluster = bytearray(cluster_size) + for idx in range(l1_entries_per_table): + l1_idx = tbl * l1_entries_per_table + idx + if bitmap_is_set(l1_bitmap, l1_idx): + struct.pack_into(">Q", cluster, idx * 8, cur_offset | QCOW_OFLAG_COPIED) + cur_offset += cluster_size + sys.stdout.buffer.write(cluster) + + ### Write L2 tables + cur_offset = data_clusters_offset + for tbl in range(l1_entries): + # Skip the empty L2 tables. We can identify them because + # there is no L1 entry pointing at them. + if bitmap_is_set(l1_bitmap, tbl): + cluster = bytearray(cluster_size) + for idx in range(l2_entries_per_table): + l2_idx = tbl * l2_entries_per_table + idx + if bitmap_is_set(l2_bitmap, l2_idx): + if data_file_name is None: + struct.pack_into(">Q", cluster, idx * 8, cur_offset | QCOW_OFLAG_COPIED) + cur_offset += cluster_size + else: + struct.pack_into(">Q", cluster, idx * 8, (l2_idx * cluster_size) | QCOW_OFLAG_COPIED) + sys.stdout.buffer.write(cluster) + + ### Write data clusters + if data_file_name is None: + for idx in bitmap_iterator(l2_bitmap, total_data_clusters): + cluster = os.pread(fd, cluster_size, cluster_size * idx) + # If the last cluster is smaller than cluster_size pad it with zeroes + if len(cluster) < cluster_size: + cluster += bytes(cluster_size - len(cluster)) + sys.stdout.buffer.write(cluster) + + if not data_file_raw: + os.close(fd) + + +def main(): + # Command-line arguments + parser = argparse.ArgumentParser( + description="This program converts a QEMU disk image to qcow2 " + "and writes it to the standard output" + ) + parser.add_argument("input_file", help="name of the input file") + parser.add_argument( + "-f", + dest="input_format", + metavar="input_format", + help="format of the input file (default: raw)", + default="raw", + ) + parser.add_argument( + "-c", + dest="cluster_size", + metavar="cluster_size", + help=f"qcow2 cluster size (default: {QCOW2_DEFAULT_CLUSTER_SIZE})", + default=QCOW2_DEFAULT_CLUSTER_SIZE, + type=int, + choices=[1 << x for x in range(9, 22)], + ) + parser.add_argument( + "-r", + dest="refcount_bits", + metavar="refcount_bits", + help=f"width of the reference count entries (default: {QCOW2_DEFAULT_REFCOUNT_BITS})", + default=QCOW2_DEFAULT_REFCOUNT_BITS, + type=int, + choices=[1 << x for x in range(7)], + ) + parser.add_argument( + "-d", + dest="data_file", + help="create an image with input_file as an external data file", + action="store_true", + ) + parser.add_argument( + "-R", + dest="data_file_raw", + help="enable data_file_raw on the generated image (implies -d)", + action="store_true", + ) + args = parser.parse_args() + + if args.data_file_raw: + args.data_file = True + + if not os.path.isfile(args.input_file): + sys.exit(f"[Error] {args.input_file} does not exist or is not a regular file.") + + if args.data_file and args.input_format != "raw": + sys.exit("[Error] External data files can only be used with raw input images") + + 
# A 512 byte header is too small for the data file name extension + if args.data_file and args.cluster_size == 512: + sys.exit("[Error] External data files require a larger cluster size") + + if sys.stdout.isatty(): + sys.exit("[Error] Refusing to write to a tty. Try redirecting stdout.") + + if args.data_file: + data_file_name = args.input_file + else: + data_file_name = None + + with get_input_as_raw_file(args.input_file, args.input_format) as raw_file: + write_qcow2_content( + raw_file, + args.cluster_size, + args.refcount_bits, + data_file_name, + args.data_file_raw, + ) + + +if __name__ == "__main__": + main() + From 978765bc46b8e9da4c9602ed03e024cbd424bb7f Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Thu, 26 Jun 2025 15:22:13 +0100 Subject: [PATCH 064/111] python3/qcow2-to-stdout: Remove unused code We are only ever going to use this script with "raw" files. Signed-off-by: Andrii Sultanov --- python3/libexec/qcow2-to-stdout.py | 71 +++--------------------------- 1 file changed, 7 insertions(+), 64 deletions(-) diff --git a/python3/libexec/qcow2-to-stdout.py b/python3/libexec/qcow2-to-stdout.py index 8109f8fc351..724b0d309d8 100755 --- a/python3/libexec/qcow2-to-stdout.py +++ b/python3/libexec/qcow2-to-stdout.py @@ -28,13 +28,8 @@ import errno import math import os -import signal import struct -import subprocess import sys -import tempfile -import time -from contextlib import contextmanager QCOW2_DEFAULT_CLUSTER_SIZE = 65536 QCOW2_DEFAULT_REFCOUNT_BITS = 16 @@ -44,7 +39,6 @@ QCOW2_INCOMPAT_DATA_FILE_BIT = 2 QCOW2_AUTOCLEAR_DATA_FILE_RAW_BIT = 1 QCOW_OFLAG_COPIED = 1 << 63 -QEMU_STORAGE_DAEMON = "qemu-storage-daemon" def bitmap_set(bitmap, idx): @@ -82,46 +76,6 @@ def clusters_with_data(fd, cluster_size): raise err -# write_qcow2_content() expects a raw input file. If we have a different -# format we can use qemu-storage-daemon to make it appear as raw. 
-@contextmanager -def get_input_as_raw_file(input_file, input_format): - if input_format == "raw": - yield input_file - return - try: - temp_dir = tempfile.mkdtemp() - pid_file = os.path.join(temp_dir, "pid") - raw_file = os.path.join(temp_dir, "raw") - open(raw_file, "wb").close() - ret = subprocess.run( - [ - QEMU_STORAGE_DAEMON, - "--daemonize", - "--pidfile", pid_file, - "--blockdev", f"driver=file,node-name=file0,driver=file,filename={input_file},read-only=on", - "--blockdev", f"driver={input_format},node-name=disk0,file=file0,read-only=on", - "--export", f"type=fuse,id=export0,node-name=disk0,mountpoint={raw_file},writable=off", - ], - capture_output=True, - ) - if ret.returncode != 0: - sys.exit("[Error] Could not start the qemu-storage-daemon:\n" + - ret.stderr.decode().rstrip('\n')) - yield raw_file - finally: - # Kill the storage daemon on exit - # and remove all temporary files - if os.path.exists(pid_file): - with open(pid_file, "r") as f: - pid = int(f.readline()) - os.kill(pid, signal.SIGTERM) - while os.path.exists(pid_file): - time.sleep(0.1) - os.unlink(raw_file) - os.rmdir(temp_dir) - - def write_features(cluster, offset, data_file_name): if data_file_name is not None: encoded_name = data_file_name.encode("utf-8") @@ -375,13 +329,6 @@ def main(): "and writes it to the standard output" ) parser.add_argument("input_file", help="name of the input file") - parser.add_argument( - "-f", - dest="input_format", - metavar="input_format", - help="format of the input file (default: raw)", - default="raw", - ) parser.add_argument( "-c", dest="cluster_size", @@ -420,9 +367,6 @@ def main(): if not os.path.isfile(args.input_file): sys.exit(f"[Error] {args.input_file} does not exist or is not a regular file.") - if args.data_file and args.input_format != "raw": - sys.exit("[Error] External data files can only be used with raw input images") - # A 512 byte header is too small for the data file name extension if args.data_file and args.cluster_size == 512: sys.exit("[Error] External data files require a larger cluster size") @@ -435,14 +379,13 @@ def main(): else: data_file_name = None - with get_input_as_raw_file(args.input_file, args.input_format) as raw_file: - write_qcow2_content( - raw_file, - args.cluster_size, - args.refcount_bits, - data_file_name, - args.data_file_raw, - ) + write_qcow2_content( + args.input_file, + args.cluster_size, + args.refcount_bits, + data_file_name, + args.data_file_raw, + ) if __name__ == "__main__": From 7708c456295989f716473cac020458d4312dded6 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Tue, 22 Apr 2025 16:16:53 +0100 Subject: [PATCH 065/111] python3/qcow2-to-stdout: Update to handle block special files The original script is designed to only work on regular files. Adapt it slightly to work on special block devices (since that's what we want to export from). Aside from dropping some unnecessary checks and replacing os.path.getsize with a functional equivalent that works on block special files, these changes remove detection of "holes" in the raw file, since SEEK_DATA and SEEK_HOLE are unimplemented in this case. 
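A minimal sketch of the size probe used in place of os.path.getsize, which
does not return the device size for block special files (the device path
below is only an illustration):

```python
import os

# Seeking to the end of an open file descriptor gives the size of a
# block special file, where os.path.getsize() cannot be relied upon.
fd = os.open("/dev/xvda", os.O_RDONLY)  # hypothetical device path
try:
    block_device_size = os.lseek(fd, 0, os.SEEK_END)  # size in bytes
finally:
    os.close(fd)
```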
Signed-off-by: Andrii Sultanov --- python3/libexec/qcow2-to-stdout.py | 33 ++++++++---------------------- 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/python3/libexec/qcow2-to-stdout.py b/python3/libexec/qcow2-to-stdout.py index 724b0d309d8..8824564e6fa 100755 --- a/python3/libexec/qcow2-to-stdout.py +++ b/python3/libexec/qcow2-to-stdout.py @@ -25,7 +25,6 @@ # refcount tables and L1 tables when referring to those clusters. import argparse -import errno import math import os import struct @@ -59,23 +58,6 @@ def align_up(num, d): return d * math.ceil(num / d) -# Holes in the input file contain only zeroes so we can skip them and -# save time. This function returns the indexes of the clusters that -# are known to contain data. Those are the ones that we need to read. -def clusters_with_data(fd, cluster_size): - data_to = 0 - while True: - try: - data_from = os.lseek(fd, data_to, os.SEEK_DATA) - data_to = align_up(os.lseek(fd, data_from, os.SEEK_HOLE), cluster_size) - for idx in range(data_from // cluster_size, data_to // cluster_size): - yield idx - except OSError as err: - if err.errno == errno.ENXIO: # End of file reached - break - raise err - - def write_features(cluster, offset, data_file_name): if data_file_name is not None: encoded_name = data_file_name.encode("utf-8") @@ -115,8 +97,12 @@ def write_qcow2_content(input_file, cluster_size, refcount_bits, data_file_name, refcounts_per_table = cluster_size // 8 refcounts_per_block = cluster_size * 8 // refcount_bits + # Open the input file for reading + fd = os.open(input_file, os.O_RDONLY) + # Virtual disk size, number of data clusters and L1 entries - disk_size = align_up(os.path.getsize(input_file), 512) + block_device_size = os.lseek(fd, 0, os.SEEK_END) + disk_size = align_up(block_device_size, 512) total_data_clusters = math.ceil(disk_size / cluster_size) l1_entries = math.ceil(total_data_clusters / l2_entries_per_table) allocated_l1_tables = math.ceil(l1_entries / l1_entries_per_table) @@ -140,11 +126,10 @@ def write_qcow2_content(input_file, cluster_size, refcount_bits, data_file_name, for idx in range(total_data_clusters): bitmap_set(l2_bitmap, idx) else: - # Open the input file for reading - fd = os.open(input_file, os.O_RDONLY) zero_cluster = bytes(cluster_size) + last_cluster = align_up(block_device_size, cluster_size) // cluster_size # Read all the clusters that contain data - for idx in clusters_with_data(fd, cluster_size): + for idx in range(0, last_cluster): cluster = os.pread(fd, cluster_size, cluster_size * idx) # If the last cluster is smaller than cluster_size pad it with zeroes if len(cluster) < cluster_size: @@ -364,8 +349,8 @@ def main(): if args.data_file_raw: args.data_file = True - if not os.path.isfile(args.input_file): - sys.exit(f"[Error] {args.input_file} does not exist or is not a regular file.") + if not os.path.exists(args.input_file): + sys.exit(f"[Error] {args.input_file} does not exist.") # A 512 byte header is too small for the data file name extension if args.data_file and args.cluster_size == 512: From 8d3bf09d662d5d33166c64d08418e99cff79885e Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Thu, 15 May 2025 11:16:30 +0100 Subject: [PATCH 066/111] python3/qcow2-to-stdout: Add --diff option to export only changed clusters When specified, only the clusters that have changed between diff_file_name and input_file will be allocated and exported in a sparse manner. This is analogous to vhd-tool's --relative-to option. 
Example usage: ``` Create an empty raw file: $ dd if=/dev/zero of=disk.before.img bs=1M count=100 Fill the first three clusters with random data $ dd if=/dev/random conv=notrunc of=disk.before.img bs=65536 count=3 $ python3/libexec/qcow2-to-stdout.py disk.before.img > disk.before.qcow2 Check that only the first three clusters are allocated (each is 0x10000 in length) $ qemu-img map disk.before.qcow2 Offset Length Mapped to File 0 0x30000 0x50000 disk.before.qcow2 Overwrite the 2nd and 3rd clusters with new data $ cp disk.before.img disk.after.img $ dd if=/dev/random conv=notrunc of=disk.after.img bs=65536 count=2 oseek=1 Export the difference, verifying that only the 2nd and 3rd clusters are allocated $ python3/libexec/qcow2-to-stdout.py disk.after.img --diff disk.before.img > disk.diff.qcow2 $ qemu-img map disk.diff.qcow2 Offset Length Mapped to File 0x10000 0x20000 0x50000 disk.diff.qcow2 The image can be recreated if the base is imported first, with the difference overwriting it $ qemu-img convert -f qcow2 -O raw disk.before.qcow2 disk.test.img $ qemu-img convert -f qcow2 -O raw disk.diff.qcow2 disk.test.img --target-is-zero -n $ diff disk.test.img disk.after.img ``` Signed-off-by: Andrii Sultanov --- python3/libexec/qcow2-to-stdout.py | 62 +++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/python3/libexec/qcow2-to-stdout.py b/python3/libexec/qcow2-to-stdout.py index 8824564e6fa..b0638bc5904 100755 --- a/python3/libexec/qcow2-to-stdout.py +++ b/python3/libexec/qcow2-to-stdout.py @@ -90,7 +90,8 @@ def write_features(cluster, offset, data_file_name): offset += 48 -def write_qcow2_content(input_file, cluster_size, refcount_bits, data_file_name, data_file_raw): +def write_qcow2_content(input_file, cluster_size, refcount_bits, + data_file_name, data_file_raw, diff_file_name): # Some basic values l1_entries_per_table = cluster_size // 8 l2_entries_per_table = cluster_size // 8 @@ -126,17 +127,17 @@ def write_qcow2_content(input_file, cluster_size, refcount_bits, data_file_name, for idx in range(total_data_clusters): bitmap_set(l2_bitmap, idx) else: - zero_cluster = bytes(cluster_size) - last_cluster = align_up(block_device_size, cluster_size) // cluster_size - # Read all the clusters that contain data - for idx in range(0, last_cluster): - cluster = os.pread(fd, cluster_size, cluster_size * idx) + # Allocates a cluster in the appropriate bitmaps if it's different + # from cluster_to_compare_with + def check_cluster_allocate(idx, cluster, cluster_to_compare_with): + nonlocal allocated_data_clusters + nonlocal allocated_l2_tables # If the last cluster is smaller than cluster_size pad it with zeroes if len(cluster) < cluster_size: cluster += bytes(cluster_size - len(cluster)) - # If a cluster has non-zero data then it must be allocated - # in the output file and its L2 entry must be set - if cluster != zero_cluster: + # If a cluster has different data from the cluster_to_compare_with then it + # must be allocated in the output file and its L2 entry must be set + if cluster != cluster_to_compare_with: bitmap_set(l2_bitmap, idx) allocated_data_clusters += 1 # Allocated data clusters also need their corresponding L1 entry and L2 table @@ -145,6 +146,36 @@ def write_qcow2_content(input_file, cluster_size, refcount_bits, data_file_name, bitmap_set(l1_bitmap, l1_idx) allocated_l2_tables += 1 + zero_cluster = bytes(cluster_size) + last_cluster = align_up(block_device_size, cluster_size) // cluster_size + if diff_file_name: + # Read all the clusters that differ 
from the diff_file_name + diff_fd = os.open(diff_file_name, os.O_RDONLY) + diff_block_device_size = os.lseek(diff_fd, 0, os.SEEK_END) + last_diff_cluster = align_up(diff_block_device_size, cluster_size) // cluster_size + # In case input_file is bigger than diff_file_name, first check + # if clusters from diff_file_name differ, and then check if the + # rest contain data + for idx in range(0, last_diff_cluster): + cluster = os.pread(fd, cluster_size, cluster_size * idx) + original_cluster = os.pread(diff_fd, cluster_size, cluster_size * idx) + + # If a cluster has different data from the original_cluster + # then it must be allocated + check_cluster_allocate(idx, cluster, original_cluster) + for idx in range(last_diff_cluster, last_cluster): + cluster = os.pread(fd, cluster_size, cluster_size * idx) + + # If a cluster has different data from the original_cluster + # then it must be allocated + check_cluster_allocate(idx, cluster, zero_cluster) + else: + # Read all the clusters that contain data + for idx in range(0, last_cluster): + cluster = os.pread(fd, cluster_size, cluster_size * idx) + # If a cluster has non-zero data then it must be allocated + check_cluster_allocate(idx, cluster, zero_cluster) + # Total amount of allocated clusters excluding the refcount blocks and table total_allocated_clusters = 1 + allocated_l1_tables + allocated_l2_tables if data_file_name is None: @@ -314,6 +345,15 @@ def main(): "and writes it to the standard output" ) parser.add_argument("input_file", help="name of the input file") + parser.add_argument( + "--diff", + dest="diff_file_name", + metavar="diff_file_name", + help=("name of the original file to compare input_file against. " + "If specified, will only export clusters that are different " + "between the files"), + default=None, + ) parser.add_argument( "-c", dest="cluster_size", @@ -352,6 +392,9 @@ def main(): if not os.path.exists(args.input_file): sys.exit(f"[Error] {args.input_file} does not exist.") + if args.diff_file_name and not os.path.exists(args.diff_file_name): + sys.exit(f"[Error] {args.diff_file_name} does not exist.") + # A 512 byte header is too small for the data file name extension if args.data_file and args.cluster_size == 512: sys.exit("[Error] External data files require a larger cluster size") @@ -370,6 +413,7 @@ def main(): args.refcount_bits, data_file_name, args.data_file_raw, + args.diff_file_name ) From c115a6d81dbf39c649c5f67b21ad3c2659825b64 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 26 Jun 2025 15:19:05 +0100 Subject: [PATCH 067/111] rrdp-squeezed: Don't collect metrics from a domain with missing counters Previously if only some of the values was missing, the arithmetic operations where done and they were taken into account into the host total. When a domain's values are missing, they are not taken into the sum. Also prepare the code to add per-VM metrics by using shared infrastructure and using a record. 
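A worked example with hypothetical numbers: if a domain exposes a target of
512 MiB but its dynamic-max counter is missing, the old code substituted 0 for
the missing value and added 0 - 512 MiB to the host's "memory reclaimed by
squeezed" total; with this change such a domain is skipped and contributes
nothing to either sum.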
Signed-off-by: Pau Ruiz Safont --- ocaml/xcp-rrdd/bin/rrdp-squeezed/dune | 2 +- .../bin/rrdp-squeezed/rrdp_squeezed.ml | 104 +++++++++++------- 2 files changed, 65 insertions(+), 41 deletions(-) diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune b/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune index d45dd928de1..ca5b6ae7d88 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune +++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune @@ -2,8 +2,8 @@ (modes exe) (name rrdp_squeezed) (libraries - rrdd-plugin + rrdd_plugin_xenctrl rrdd_plugins_libs xapi-stdext-std ezxenstore diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml index 4c0b13cf3e3..455cb03f736 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml +++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml @@ -17,10 +17,6 @@ open Rrdd_plugin module Process = Process (struct let name = "xcp-rrdd-squeezed" end) -open Process - -let with_xc f = Xenctrl.with_intf f - module Xs = struct module Client = Xs_client_unix.Client (Xs_transport_unix_client) include Client @@ -38,10 +34,6 @@ module Xs = struct c end -(* Return a list of domids of VMs running on this host *) -let get_running_domains xc = - Xenctrl.domain_getinfolist xc 0 |> List.map (fun di -> di.Xenctrl.domid) - module D = Debug.Make (struct let name = "rrdd-plugins" end) module XSW = Ez_xenstore_watch.Make (D) @@ -106,40 +98,61 @@ end module Watcher = WatchXenstore (MemoryActions) -(* Return a tuple (dynamic-max, dynamic-min, target) for a running VM *) -let get_squeezed_data domid = - let get_current_value ~label current_values = - try IntMap.find domid !current_values - with _ -> - if domid <> 0 then - D.warn "Couldn't find cached %s value for domain %d, using 0" label - domid ; - 0L +type values = { + dynamic_max: int64 option + ; dynamic_min: int64 option + ; target: int64 option +} + +let get_values (_, uuid, domid) = + let get_current_value current_values = + IntMap.find_opt domid !current_values in - ( get_current_value ~label:"dynamic-max" current_dynamic_max_values - , get_current_value ~label:"dynamic-min" current_dynamic_min_values - , get_current_value ~label:"target" current_target_values + ( (uuid, domid) + , { + dynamic_max= get_current_value current_dynamic_max_values + ; dynamic_min= get_current_value current_dynamic_min_values + ; target= get_current_value current_target_values + } ) -let get_datas () = - (* Create a tuple (dynamic-max, dynamic-min, target) for each VM running on the host *) - let domids = with_xc get_running_domains in - List.map get_squeezed_data domids +let get_domain_stats xc = + let _, domains, _ = Xenctrl_lib.domain_snapshot xc in + List.map get_values domains -let generate_squeezed_dss () = +let generate_host_sources counters = let memory_reclaimed, memory_possibly_reclaimed = - get_datas () - (* Calculate metrics - - Host memory reclaimed by squeezed = - sum_across_running_vms(dynamic_max - target) - - Host memory that could be reclaimed by squeezed = - sum_across_running_vms(target - dynamic_min) + (* Calculate host metrics + - Host memory reclaimed by squeezed = + sum_across_running_vms(dynamic_max - target) + - Host memory that could be reclaimed by squeezed = + sum_across_running_vms(target - dynamic_min) *) + let ( let* ) = Option.bind in + counters |> List.fold_left - (fun (acc1, acc2) (max, min, target) -> - ( Int64.add acc1 (Int64.sub max target) - , Int64.add acc2 (Int64.sub target min) - ) + (fun (acc1, acc2) (_, {dynamic_max; dynamic_min; target}) -> + let r = + let* target in + 
let acc1 = + let* max = dynamic_max in + Some (Int64.add acc1 (Int64.sub max target)) + in + let acc2 = + let* min = dynamic_min in + Some (Int64.add acc2 (Int64.sub target min)) + in + Some (acc1, acc2) + in + match r with + | None | Some (None, None) -> + (acc1, acc2) + | Some (Some acc1, Some acc2) -> + (acc1, acc2) + | Some (Some acc1, None) -> + (acc1, acc2) + | Some (None, Some acc2) -> + (acc1, acc2) ) (Int64.zero, Int64.zero) in @@ -159,11 +172,22 @@ let generate_squeezed_dss () = ) ] +let generate_sources xc () = + let counters = get_domain_stats xc in + generate_host_sources counters + (* This plugin always reports two datasources only, so one page is fine. *) -let shared_page_count = 1 +let host_page_count = 1 + +let vm_page_count = 0 -let _ = - initialise () ; +let shared_page_count = host_page_count + vm_page_count + +let () = Watcher.create_watcher_thread () ; - main_loop ~neg_shift:0.5 ~target:(Reporter.Local shared_page_count) - ~protocol:Rrd_interface.V2 ~dss_f:generate_squeezed_dss + Process.initialise () ; + Xenctrl.with_intf (fun xc -> + Process.main_loop ~neg_shift:0.5 + ~target:(Reporter.Local shared_page_count) ~protocol:Rrd_interface.V2 + ~dss_f:(generate_sources xc) + ) From 165bdec3ab1650beaf0b9ae8c6b0703a632d7ac0 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 26 Jun 2025 15:53:57 +0100 Subject: [PATCH 068/111] rrdp-squeezed: generate per-vm memory target datasources The daemon was already watching the vm memory targets, so it makes sense to make it generate the datasources directly as well. This uses the shared xenctrl infrastructure that's needed to fetch the domains' uuids. This means that xcp-rrdd does not need to collect this anymore, and xenopsd does not need to send it to xcp-rrdd. The function in the idl is safe to delete because the only user was xenopsd, and it was well-protected against errors. This means that if an old version of xenopsd tries to call the function while xcp-rrdd has already been updated, it won't interrupt xneopsd functionality. Signed-off-by: Pau Ruiz Safont --- ocaml/xapi-idl/rrd/cli-help.t | 5 --- ocaml/xapi-idl/rrd/rrd_interface.ml | 12 ------ ocaml/xapi/xapi_xenops.ml | 12 ------ ocaml/xcp-rrdd/bin/rrdd/rrdd_bindings.ml | 3 -- ocaml/xcp-rrdd/bin/rrdd/rrdd_server.ml | 5 --- ocaml/xcp-rrdd/bin/rrdd/rrdd_server.mli | 2 - ocaml/xcp-rrdd/bin/rrdd/rrdd_shared.ml | 5 --- ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml | 25 +---------- .../bin/rrdp-squeezed/rrdp_squeezed.ml | 43 ++++++++++++++++--- 9 files changed, 38 insertions(+), 74 deletions(-) diff --git a/ocaml/xapi-idl/rrd/cli-help.t b/ocaml/xapi-idl/rrd/cli-help.t index a503e0b75bb..1a15779d7f7 100644 --- a/ocaml/xapi-idl/rrd/cli-help.t +++ b/ocaml/xapi-idl/rrd/cli-help.t @@ -166,11 +166,6 @@ observed values will be created alongside the standard archive of average values - update_vm_memory_target [OPTION]… domid target - Sets the `memory_target` value for a VM. This is called by xapi - when it is told by xenopsd that squeezed has changed the target - for a VM. - COMMON OPTIONS --help[=FMT] (default=auto) Show this help in format FMT. 
The value FMT must be one of auto, diff --git a/ocaml/xapi-idl/rrd/rrd_interface.ml b/ocaml/xapi-idl/rrd/rrd_interface.ml index 1cfa1e39a2f..066912eacf2 100644 --- a/ocaml/xapi-idl/rrd/rrd_interface.ml +++ b/ocaml/xapi-idl/rrd/rrd_interface.ml @@ -412,18 +412,6 @@ module RPC_API (R : RPC) = struct ] (value_p @-> returning unit_p rrd_err) - let update_vm_memory_target = - let target_p = - Param.mk ~name:"target" ~description:["VM memory target"] Types.int64 - in - declare "update_vm_memory_target" - [ - "Sets the `memory_target` value for a VM. This is called by xapi when \ - it is told by" - ; "xenopsd that squeezed has changed the target for a VM." - ] - (domid_p @-> target_p @-> returning unit_p rrd_err) - let set_cache_sr = declare "set_cache_sr" [ diff --git a/ocaml/xapi/xapi_xenops.ml b/ocaml/xapi/xapi_xenops.ml index 8ad3c8b9962..213a30c5aff 100644 --- a/ocaml/xapi/xapi_xenops.ml +++ b/ocaml/xapi/xapi_xenops.ml @@ -2389,18 +2389,6 @@ let update_vm_internal ~__context ~id ~self ~previous ~info ~localhost = with e -> error "Caught %s: while updating VM %s xsdata" (Printexc.to_string e) id - ) ; - different - (fun x -> x.Vm.memory_target) - Fun.id - (fun memory_target -> - try - debug "xenopsd event: Updating VM %s domid %d memory target" id - domid ; - Rrdd.update_vm_memory_target domid memory_target - with e -> - error "Caught %s: while updating VM %s memory_target" - (Printexc.to_string e) id ) ) state.Vm.domids diff --git a/ocaml/xcp-rrdd/bin/rrdd/rrdd_bindings.ml b/ocaml/xcp-rrdd/bin/rrdd/rrdd_bindings.ml index f5d977d632c..3ddc24e462f 100644 --- a/ocaml/xcp-rrdd/bin/rrdd/rrdd_bindings.ml +++ b/ocaml/xcp-rrdd/bin/rrdd/rrdd_bindings.ml @@ -47,8 +47,6 @@ module type RRDD_IMPLEMENTATION = sig val update_use_min_max : bool -> unit - val update_vm_memory_target : int -> int64 -> unit - val set_cache_sr : string -> unit val unset_cache_sr : unit -> unit @@ -119,7 +117,6 @@ module Make (Impl : RRDD_IMPLEMENTATION) = struct Server.query_possible_sr_dss Impl.query_possible_sr_dss ; Server.query_sr_ds Impl.query_sr_ds ; Server.update_use_min_max Impl.update_use_min_max ; - Server.update_vm_memory_target Impl.update_vm_memory_target ; Server.set_cache_sr Impl.set_cache_sr ; Server.unset_cache_sr Impl.unset_cache_sr ; (* module Plugin*) diff --git a/ocaml/xcp-rrdd/bin/rrdd/rrdd_server.ml b/ocaml/xcp-rrdd/bin/rrdd/rrdd_server.ml index 6a1212f178a..15eee76cfe6 100644 --- a/ocaml/xcp-rrdd/bin/rrdd/rrdd_server.ml +++ b/ocaml/xcp-rrdd/bin/rrdd/rrdd_server.ml @@ -571,11 +571,6 @@ let update_use_min_max (value : bool) : unit = debug "Updating use_min_max: New value=%b" value ; use_min_max := value -let update_vm_memory_target (domid : int) (target : int64) : unit = - with_lock memory_targets_m (fun _ -> - Hashtbl.replace memory_targets domid target - ) - let set_cache_sr (sr_uuid : string) : unit = with_lock cache_sr_lock (fun () -> cache_sr_uuid := Some sr_uuid) diff --git a/ocaml/xcp-rrdd/bin/rrdd/rrdd_server.mli b/ocaml/xcp-rrdd/bin/rrdd/rrdd_server.mli index 000c53de121..bd8ae2e6c99 100644 --- a/ocaml/xcp-rrdd/bin/rrdd/rrdd_server.mli +++ b/ocaml/xcp-rrdd/bin/rrdd/rrdd_server.mli @@ -50,8 +50,6 @@ val query_sr_ds : string -> string -> float val update_use_min_max : bool -> unit -val update_vm_memory_target : int -> int64 -> unit - val set_cache_sr : string -> unit val unset_cache_sr : unit -> unit diff --git a/ocaml/xcp-rrdd/bin/rrdd/rrdd_shared.ml b/ocaml/xcp-rrdd/bin/rrdd/rrdd_shared.ml index 816860e5815..b15e91b50cb 100644 --- a/ocaml/xcp-rrdd/bin/rrdd/rrdd_shared.ml +++ 
b/ocaml/xcp-rrdd/bin/rrdd/rrdd_shared.ml @@ -30,11 +30,6 @@ let next_iteration_start : Clock.Timer.t ref = (* The mutex that protects the next_iteration_start against data corruption. *) let next_iteration_start_m : Mutex.t = Mutex.create () -(** Cache memory/target values *) -let memory_targets : (int, int64) Hashtbl.t = Hashtbl.create 20 - -let memory_targets_m = Mutex.create () - let cache_sr_uuid : string option ref = ref None let cache_sr_lock = Mutex.create () diff --git a/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml b/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml index 7f110d7e576..d6850621d0c 100644 --- a/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml +++ b/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml @@ -292,9 +292,6 @@ let domain_snapshot xc = let domains = Xenctrl.domain_getinfolist xc 0 |> List.filter_map metadata_of_domain in - let domids = List.map (fun (_, _, i) -> i) domains |> IntSet.of_list in - let domains_only k v = Option.map (Fun.const v) (IntSet.find_opt k domids) in - Hashtbl.filter_map_inplace domains_only Rrdd_shared.memory_targets ; domains |> List.to_seq let dss_mem_vms xc = @@ -311,23 +308,6 @@ let dss_mem_vms xc = ~value:(Rrd.VT_Int64 memory) ~ty:Rrd.Gauge ~min:0.0 ~default:true () ) in - let memory_target_opt = - with_lock Rrdd_shared.memory_targets_m (fun _ -> - Hashtbl.find_opt Rrdd_shared.memory_targets domid - ) - in - let mem_target_ds = - Option.map - (fun memory_target -> - ( Rrd.VM uuid - , Ds.ds_make ~name:"memory_target" - ~description:"Target of VM balloon driver" ~units:"B" - ~value:(Rrd.VT_Int64 memory_target) ~ty:Rrd.Gauge ~min:0.0 - ~default:true () - ) - ) - memory_target_opt - in let other_ds = if domid = 0 then match mem_available () with @@ -359,10 +339,7 @@ let dss_mem_vms xc = ) with Not_found -> None in - let metrics = - List.concat - [main_mem_ds :: Option.to_list other_ds; Option.to_list mem_target_ds] - in + let metrics = List.concat [main_mem_ds :: Option.to_list other_ds] in Some (List.to_seq metrics) in (* CA-34383: Memory updates from paused domains serve no useful purpose. diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml index 455cb03f736..a0a6fab1f65 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml +++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml @@ -104,11 +104,11 @@ type values = { ; target: int64 option } -let get_values (_, uuid, domid) = +let get_values ((_, _, domid) as dom) = let get_current_value current_values = IntMap.find_opt domid !current_values in - ( (uuid, domid) + ( dom , { dynamic_max= get_current_value current_dynamic_max_values ; dynamic_min= get_current_value current_dynamic_min_values @@ -172,14 +172,45 @@ let generate_host_sources counters = ) ] +let generate_vm_sources domains = + let metrics_of ((dom, uuid, _), {target; _}) = + let target = + Option.map + (fun target -> + ( Rrd.VM uuid + , Ds.ds_make ~name:"memory_target" + ~description:"Target of VM balloon driver" ~units:"B" + ~value:(Rrd.VT_Int64 target) ~ty:Rrd.Gauge ~min:0.0 ~default:true + () + ) + ) + target + in + (* CA-34383: Memory updates from paused domains serve no useful purpose. + During a migrate such updates can also cause undesirable + discontinuities in the observed value of memory_actual. 
Hence, we + ignore changes from paused domains: *) + if dom.Xenctrl.paused then + [] + else + Option.to_list target + in + + List.concat_map metrics_of domains + let generate_sources xc () = - let counters = get_domain_stats xc in - generate_host_sources counters + let domain_stats = get_domain_stats xc in + generate_host_sources domain_stats @ generate_vm_sources domain_stats + +(** The json-like serialization for 3 dss in dss_mem_vms takes 622 bytes. These + bytes plus some overhead make 1024 bytes an upper bound. *) + +let bytes_per_mem_vm = 1024 -(* This plugin always reports two datasources only, so one page is fine. *) let host_page_count = 1 -let vm_page_count = 0 +let vm_page_count = + ((Rrd_interface.max_supported_vms * bytes_per_mem_vm) + 4095) / 4096 let shared_page_count = host_page_count + vm_page_count From a38ee7f6df17f34e15c131a8de0b4d0223bf6312 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 26 Jun 2025 18:07:43 +0100 Subject: [PATCH 069/111] rrdp-squeezed: collect agent-collected per-vm free memory metrics Since this daemon already uses xenstore to watch other memory keys in xenstore, move another one in here, allows to delete quite a bit of code from xcp-rrdd and drop dependencies as well. Signed-off-by: Pau Ruiz Safont --- ocaml/xcp-rrdd/bin/rrdd/dune | 4 - ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml | 144 +----------------- .../bin/rrdp-squeezed/rrdp_squeezed.ml | 92 +++++++++-- 3 files changed, 83 insertions(+), 157 deletions(-) diff --git a/ocaml/xcp-rrdd/bin/rrdd/dune b/ocaml/xcp-rrdd/bin/rrdd/dune index d84e06e46fd..6ce134dd522 100644 --- a/ocaml/xcp-rrdd/bin/rrdd/dune +++ b/ocaml/xcp-rrdd/bin/rrdd/dune @@ -41,7 +41,6 @@ (libraries astring ezxenstore.core - ezxenstore.watch forkexec http_lib httpsvr @@ -66,9 +65,6 @@ xapi-stdext-threads xapi-stdext-unix xenctrl - xenstore - xenstore.unix - xenstore_transport.unix ) ) diff --git a/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml b/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml index d6850621d0c..40e5ab34b79 100644 --- a/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml +++ b/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml @@ -119,84 +119,6 @@ let start (xmlrpc_path, http_fwd_path) process = let with_lock = Xapi_stdext_threads.Threadext.Mutex.execute -(*****************************************************) -(* xenstore related code *) -(*****************************************************) - -module XSW_Debug = Debug.Make (struct let name = "xenstore_watch" end) - -module Watch = Ez_xenstore_watch.Make (XSW_Debug) - -module Xs = struct - module Client = Xs_client_unix.Client (Xs_transport_unix_client) - - let client = ref None - - (* Initialise the clients on demand - must be done after daemonisation! 
*) - let get_client () = - match !client with - | Some client -> - client - | None -> - let c = Client.make () in - client := Some c ; - c -end - -(* Map from domid to the latest seen meminfo_free value *) -let current_meminfofree_values = ref Watch.IntMap.empty - -let meminfo_path domid = - Printf.sprintf "/local/domain/%d/data/meminfo_free" domid - -module Meminfo = struct - let watch_token domid = Printf.sprintf "xcp-rrdd:domain-%d" domid - - let interesting_paths_for_domain domid _uuid = [meminfo_path domid] - - let fire_event_on_vm domid domains = - let d = int_of_string domid in - if not (Watch.IntMap.mem d domains) then - info "Ignoring watch on shutdown domain %d" d - else - let path = meminfo_path d in - try - let client = Xs.get_client () in - let meminfo_free_string = - Xs.Client.immediate client (fun xs -> Xs.Client.read xs path) - in - let meminfo_free = Int64.of_string meminfo_free_string in - info "memfree has changed to %Ld in domain %d" meminfo_free d ; - current_meminfofree_values := - Watch.IntMap.add d meminfo_free !current_meminfofree_values - with Xs_protocol.Enoent _hint -> - info - "Couldn't read path %s; forgetting last known memfree value for \ - domain %d" - path d ; - current_meminfofree_values := - Watch.IntMap.remove d !current_meminfofree_values - - let watch_fired _ _xc path domains _ = - match - List.filter (fun x -> x <> "") Astring.String.(cuts ~sep:"/" path) - with - | ["local"; "domain"; domid; "data"; "meminfo_free"] -> - fire_event_on_vm domid domains - | _ -> - debug "Ignoring unexpected watch: %s" path - - let unmanaged_domain _ _ = false - - let found_running_domain _ _ = () - - let domain_appeared _ _ _ = () - - let domain_disappeared _ _ _ = () -end - -module Watcher = Watch.WatchXenstore (Meminfo) - (*****************************************************) (* memory stats *) (*****************************************************) @@ -231,30 +153,6 @@ let bytes_per_mem_vm = 1024 let mem_vm_writer_pages = ((Rrd_interface.max_supported_vms * bytes_per_mem_vm) + 4095) / 4096 -let res_error fmt = Printf.ksprintf Result.error fmt - -let ok x = Result.ok x - -let ( let* ) = Result.bind - -let finally f always = Fun.protect ~finally:always f - -let scanning path f = - let io = Scanf.Scanning.open_in path in - finally (fun () -> f io) (fun () -> Scanf.Scanning.close_in io) - -let scan path = - try - scanning path @@ fun io -> - Scanf.bscanf io {|MemTotal: %_d %_s MemFree: %_d %_s MemAvailable: %Ld %s|} - (fun size kb -> ok (size, kb) - ) - with _ -> res_error "failed to scan %s" path - -let mem_available () = - let* size, kb = scan "/proc/meminfo" in - match kb with "kB" -> ok size | _ -> res_error "unexpected unit: %s" kb - let uuid_blacklist = ["00000000-0000-0000"; "deadbeef-dead-beef"] module IntSet = Set.Make (Int) @@ -295,7 +193,7 @@ let domain_snapshot xc = domains |> List.to_seq let dss_mem_vms xc = - let mem_metrics_of (dom, uuid, domid) = + let mem_metrics_of (dom, uuid, _) = let vm_metrics () = let kib = Xenctrl.pages_to_kib (Int64.of_nativeint dom.Xenctrl.total_memory_pages) @@ -308,39 +206,7 @@ let dss_mem_vms xc = ~value:(Rrd.VT_Int64 memory) ~ty:Rrd.Gauge ~min:0.0 ~default:true () ) in - let other_ds = - if domid = 0 then - match mem_available () with - | Ok mem -> - Some - ( Rrd.VM uuid - , Ds.ds_make ~name:"memory_internal_free" ~units:"KiB" - ~description:"Dom0 current free memory" - ~value:(Rrd.VT_Int64 mem) ~ty:Rrd.Gauge ~min:0.0 - ~default:true () - ) - | Error msg -> - let _ = - error "%s: retrieving Dom0 free memory failed: %s" 
__FUNCTION__ - msg - in - None - else - try - let mem_free = - Watch.IntMap.find domid !current_meminfofree_values - in - Some - ( Rrd.VM uuid - , Ds.ds_make ~name:"memory_internal_free" ~units:"KiB" - ~description:"Memory used as reported by the guest agent" - ~value:(Rrd.VT_Int64 mem_free) ~ty:Rrd.Gauge ~min:0.0 - ~default:true () - ) - with Not_found -> None - in - let metrics = List.concat [main_mem_ds :: Option.to_list other_ds] in - Some (List.to_seq metrics) + Some main_mem_ds in (* CA-34383: Memory updates from paused domains serve no useful purpose. During a migrate such updates can also cause undesirable @@ -349,7 +215,7 @@ let dss_mem_vms xc = if dom.Xenctrl.paused then None else vm_metrics () in let domains = domain_snapshot xc in - Seq.filter_map mem_metrics_of domains |> Seq.concat |> List.of_seq + Seq.filter_map mem_metrics_of domains |> List.of_seq (**** Local cache SR stuff *) @@ -786,10 +652,6 @@ let _ = (List.map (fun (name, _) -> writer_basename name) stats_to_write) ; ignore @@ GCLog.start () ; debug "Starting xenstore-watching thread .." ; - let () = - try Watcher.create_watcher_thread () - with _ -> error "xenstore-watching thread has failed" - in let module Daemon = Xapi_stdext_unix.Unixext.Daemon in if Daemon.systemd_booted () then if Daemon.systemd_notify Daemon.State.Ready then diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml index a0a6fab1f65..a3091a90daf 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml +++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml @@ -45,12 +45,19 @@ let current_dynamic_min_values = ref IntMap.empty let current_target_values = ref IntMap.empty +let current_free_values = ref IntMap.empty + module MemoryActions = struct let interesting_paths_for_domain domid _ = - let keys = ["dynamic-max"; "dynamic-min"; "target"] in - List.map - (fun key -> Printf.sprintf "/local/domain/%d/memory/%s" domid key) - keys + let keys = + [ + "memory/dynamic-max" + ; "memory/dynamic-min" + ; "memory/target" + ; "data/meminfo_free" + ] + in + List.map (fun key -> Printf.sprintf "/local/domain/%d/%s" domid key) keys let watch_token domid = Printf.sprintf "xcp-rrdd-plugins/squeezed:domain-%d" domid @@ -65,10 +72,7 @@ module MemoryActions = struct try let client = Xs.get_client () in let value = - Xs.immediate client (fun xs -> Xs.read xs path) - |> Int64.of_string - |> Int64.mul 1024L - (* convert from KiB to bytes *) + Xs.immediate client (fun xs -> Xs.read xs path) |> Int64.of_string in current_memory_values := IntMap.add domid value !current_memory_values with Xs_protocol.Enoent _ -> @@ -84,6 +88,8 @@ module MemoryActions = struct read_new_value domid current_dynamic_min_values | ["local"; "domain"; domid; "memory"; "target"] -> read_new_value domid current_target_values + | ["local"; "domain"; domid; "data"; "meminfo_free"] -> + read_new_value domid current_free_values | _ -> D.debug "Ignoring unexpected watch: %s" path @@ -98,10 +104,12 @@ end module Watcher = WatchXenstore (MemoryActions) +(** All these values are reported in KiB *) type values = { dynamic_max: int64 option ; dynamic_min: int64 option ; target: int64 option + ; free: int64 option } let get_values ((_, _, domid) as dom) = @@ -113,6 +121,7 @@ let get_values ((_, _, domid) as dom) = dynamic_max= get_current_value current_dynamic_max_values ; dynamic_min= get_current_value current_dynamic_min_values ; target= get_current_value current_target_values + ; free= get_current_value current_free_values } ) @@ -120,6 
+129,8 @@ let get_domain_stats xc = let _, domains, _ = Xenctrl_lib.domain_snapshot xc in List.map get_values domains +let bytes_of_kib kib = Int64.mul 1024L kib + let generate_host_sources counters = let memory_reclaimed, memory_possibly_reclaimed = (* Calculate host metrics @@ -131,7 +142,7 @@ let generate_host_sources counters = let ( let* ) = Option.bind in counters |> List.fold_left - (fun (acc1, acc2) (_, {dynamic_max; dynamic_min; target}) -> + (fun (acc1, acc2) (_, {dynamic_max; dynamic_min; target; _}) -> let r = let* target in let acc1 = @@ -156,6 +167,8 @@ let generate_host_sources counters = ) (Int64.zero, Int64.zero) in + let memory_reclaimed = bytes_of_kib memory_reclaimed in + let memory_possibly_reclaimed = bytes_of_kib memory_possibly_reclaimed in (* Build corresponding Ds.ds values *) [ ( Rrd.Host @@ -172,11 +185,60 @@ let generate_host_sources counters = ) ] +let res_error fmt = Printf.ksprintf Result.error fmt + +let finally f finally = Fun.protect ~finally f + +let scanning path f = + let io = Scanf.Scanning.open_in path in + finally (fun () -> f io) (fun () -> Scanf.Scanning.close_in io) + +let scan path = + try + scanning path @@ fun io -> + Scanf.bscanf io {|MemTotal: %_d %_s MemFree: %_d %_s MemAvailable: %Ld %s|} + (fun size kb -> Ok (size, kb) + ) + with _ -> res_error "failed to scan %s" path + +let free_dom0 uuid = + let result = + match scan "/proc/meminfo" with + | Ok (size, "kB") -> + Ok size + | Ok (_, unit) -> + res_error "unexpected unit: %s" unit + | Error e -> + Error e + in + match result with + | Ok mem -> + Some + ( Rrd.VM uuid + , Ds.ds_make ~name:"memory_internal_free" ~units:"KiB" + ~description:"Dom0 current free memory" ~value:(Rrd.VT_Int64 mem) + ~ty:Rrd.Gauge ~min:0.0 ~default:true () + ) + | Error msg -> + let _ = + D.error "%s: retrieving Dom0 free memory failed: %s" __FUNCTION__ msg + in + None + +let free_other uuid free = + Some + ( Rrd.VM uuid + , Ds.ds_make ~name:"memory_internal_free" ~units:"KiB" + ~description:"Memory used as reported by the guest agent" + ~value:(Rrd.VT_Int64 free) ~ty:Rrd.Gauge ~min:0.0 ~default:true () + ) + let generate_vm_sources domains = - let metrics_of ((dom, uuid, _), {target; _}) = - let target = + let metrics_of ((dom, uuid, domid), {target; free; _}) = + let target () = Option.map (fun target -> + let target = bytes_of_kib target in ( Rrd.VM uuid , Ds.ds_make ~name:"memory_target" ~description:"Target of VM balloon driver" ~units:"B" @@ -186,6 +248,12 @@ let generate_vm_sources domains = ) target in + let free () = + if domid = 0 then + free_dom0 uuid + else + Option.bind free (free_other uuid) + in (* CA-34383: Memory updates from paused domains serve no useful purpose. During a migrate such updates can also cause undesirable discontinuities in the observed value of memory_actual. Hence, we @@ -193,7 +261,7 @@ let generate_vm_sources domains = if dom.Xenctrl.paused then [] else - Option.to_list target + Option.to_list (target ()) @ Option.to_list (free ()) in List.concat_map metrics_of domains From 0fef829d3e13a7a22ffd637a71a24f7a1c7144d7 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 12 Jun 2025 15:44:03 +0100 Subject: [PATCH 070/111] monitor_mem: Prepare xapi to consolidate memory metrics into a single file Xapi currently reads directly from memory-mapped metrics files. 
Since we want to move all the memory metrics to rrdp_squeezed, and this will output all the metrics into a single memory-mapped file, move the host and VM memory metrics into a single module, and make them share a bit of code. Signed-off-by: Pau Ruiz Safont --- ocaml/xapi/monitor_dbcalls.ml | 3 +- ocaml/xapi/monitor_mem.ml | 177 +++++++++++++++++++++++++++++++++ ocaml/xapi/monitor_mem.mli | 18 ++++ ocaml/xapi/monitor_mem_host.ml | 98 ------------------ ocaml/xapi/monitor_mem_vms.ml | 89 ----------------- quality-gate.sh | 2 +- 6 files changed, 197 insertions(+), 190 deletions(-) create mode 100644 ocaml/xapi/monitor_mem.ml create mode 100644 ocaml/xapi/monitor_mem.mli delete mode 100644 ocaml/xapi/monitor_mem_host.ml delete mode 100644 ocaml/xapi/monitor_mem_vms.ml diff --git a/ocaml/xapi/monitor_dbcalls.ml b/ocaml/xapi/monitor_dbcalls.ml index ab521155d2c..48b96bbd92a 100644 --- a/ocaml/xapi/monitor_dbcalls.ml +++ b/ocaml/xapi/monitor_dbcalls.ml @@ -127,8 +127,7 @@ let monitor_dbcall_thread () = try let rrd_files = Monitor_types.find_rrd_files () in pifs_update_fn () ; - Monitor_mem_host.update rrd_files ; - Monitor_mem_vms.update rrd_files ; + Monitor_mem.update rrd_files ; Monitor_pvs_proxy.update rrd_files ; Thread.delay 5. with e -> diff --git a/ocaml/xapi/monitor_mem.ml b/ocaml/xapi/monitor_mem.ml new file mode 100644 index 00000000000..502f0d6ca7a --- /dev/null +++ b/ocaml/xapi/monitor_mem.ml @@ -0,0 +1,177 @@ +(* + * Copyright (C) Citrix Systems Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + *) + +module Mtxext = Xapi_stdext_threads.Threadext.Mutex +module Mcache = Monitor_dbcalls_cache + +module D = Debug.Make (struct let name = __MODULE__ end) + +open D + +let get_datasources ~prefix rrd_files = + List.filter (String.starts_with ~prefix) rrd_files + |> List.map (fun fn -> (fn, Monitor_types.datasources_from_filename fn)) + +module Host = struct + let get_changes datasources = + let named_dss = + List.concat_map + (fun (filename, datasources) -> + try + Mcache.log_errors_from filename ; + datasources + |> List.filter_map (function + | Rrd.Host, ds + when List.mem ds.Ds.ds_name + ["memory_total_kib"; "memory_free_kib"] -> + Some ds + | _ -> + None (* we are only interested in Host memory stats *) + ) + |> List.map (function ds -> + let value = + match ds.Ds.ds_value with + | Rrd.VT_Int64 v -> + Memory.bytes_of_kib v + | Rrd.VT_Float v -> + Memory.bytes_of_kib (Int64.of_float v) + | Rrd.VT_Unknown -> + -1L + in + (ds.Ds.ds_name, value) + ) + with e -> + if not (Mcache.is_ignored filename) then ( + error "Unable to read host memory metrics from %s: %s" filename + (Printexc.to_string e) ; + Mcache.ignore_errors_from filename + ) ; + [] + ) + datasources + in + let free_bytes = List.assoc_opt "memory_free_kib" named_dss in + let total_bytes = List.assoc_opt "memory_total_kib" named_dss in + (* Check if anything has changed since our last reading. 
*) + match (free_bytes, total_bytes) with + | Some free, Some total + when !Mcache.host_memory_free_cached <> free + || !Mcache.host_memory_total_cached <> total -> + Some (free, total) + | _ -> + None + + let set_changes (free_bytes, total_bytes) = + Mtxext.execute Mcache.host_memory_m (fun _ -> + Mcache.host_memory_free_cached := free_bytes ; + Mcache.host_memory_total_cached := total_bytes + ) + + let update rrd_files = + Server_helpers.exec_with_new_task "Updating host memory metrics" + (fun __context -> + let datasources = + get_datasources ~prefix:Xapi_globs.metrics_prefix_mem_host rrd_files + in + let changes = get_changes datasources in + match changes with + | None -> + () + | Some ((free, total) as c) -> ( + try + let host = Helpers.get_localhost ~__context in + let metrics = Db.Host.get_metrics ~__context ~self:host in + Db.Host_metrics.set_memory_total ~__context ~self:metrics + ~value:total ; + Db.Host_metrics.set_memory_free ~__context ~self:metrics ~value:free ; + set_changes c + with e -> + error "Unable to update host memory metrics: %s" + (Printexc.to_string e) + ) + ) +end + +module VMs = struct + let get_changes datasources = + List.iter + (fun (filename, datasources) -> + try + Mcache.log_errors_from filename ; + datasources + |> List.filter_map (function + | Rrd.VM vm_uuid, ds when ds.Ds.ds_name = "memory" -> + Some (vm_uuid, ds) + | _ -> + None (* we are only interested in VM stats *) + ) + |> List.iter (function vm_uuid, ds -> + let value = + match ds.Ds.ds_value with + | Rrd.VT_Int64 v -> + v + | Rrd.VT_Float v -> + Int64.of_float v + | Rrd.VT_Unknown -> + -1L + in + Hashtbl.add Mcache.vm_memory_tmp vm_uuid value + ) + with e -> + if not (Mcache.is_ignored filename) then ( + error "Unable to read memory usage for VM %s: %s" filename + (Printexc.to_string e) ; + Mcache.ignore_errors_from filename + ) + ) + datasources ; + (* Check if anything has changed since our last reading. *) + Mcache.get_updates_map ~before:Mcache.vm_memory_cached + ~after:Mcache.vm_memory_tmp + + let set_changes ?except () = + Mtxext.execute Mcache.vm_memory_cached_m (fun _ -> + Mcache.transfer_map ?except ~source:Mcache.vm_memory_tmp + ~target:Mcache.vm_memory_cached () + ) + + let update rrd_files = + Server_helpers.exec_with_new_task "Updating VM memory usage" + (fun __context -> + let datasources = + get_datasources ~prefix:Xapi_globs.metrics_prefix_mem_vms rrd_files + in + let host = Helpers.get_localhost ~__context in + let keeps = ref [] in + List.iter + (fun (vm_uuid, memory) -> + try + let vm = Db.VM.get_by_uuid ~__context ~uuid:vm_uuid in + let vmm = Db.VM.get_metrics ~__context ~self:vm in + if Db.VM.get_resident_on ~__context ~self:vm = host then + Db.VM_metrics.set_memory_actual ~__context ~self:vmm + ~value:memory + else + Mcache.clear_cache_for_vm ~vm_uuid + with e -> + keeps := vm_uuid :: !keeps ; + error "Unable to update memory usage for VM %s: %s" vm_uuid + (Printexc.to_string e) + ) + (get_changes datasources) ; + set_changes ~except:!keeps () + ) +end + +let update rrd_files = Host.update rrd_files ; VMs.update rrd_files diff --git a/ocaml/xapi/monitor_mem.mli b/ocaml/xapi/monitor_mem.mli new file mode 100644 index 00000000000..c2b74b2512f --- /dev/null +++ b/ocaml/xapi/monitor_mem.mli @@ -0,0 +1,18 @@ +(* Copyright (C) Cloud Software Group Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; version 2.1 only. 
with the special + exception on linking described in file LICENSE. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. +*) + +module Mcache = Monitor_dbcalls_cache + +val update : Mcache.StringSet.elt list -> unit +(** [update rrd_files] Reads rrd_files and update the host and VM memory + metrics in xapi's cache. *) diff --git a/ocaml/xapi/monitor_mem_host.ml b/ocaml/xapi/monitor_mem_host.ml deleted file mode 100644 index e4c2f012a24..00000000000 --- a/ocaml/xapi/monitor_mem_host.ml +++ /dev/null @@ -1,98 +0,0 @@ -(* - * Copyright (C) Citrix Systems Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; version 2.1 only. with the special - * exception on linking described in file LICENSE. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - *) - -module Mtxext = Xapi_stdext_threads.Threadext.Mutex -module Mcache = Monitor_dbcalls_cache - -module D = Debug.Make (struct let name = "monitor_mem_host" end) - -open D - -let get_changes rrd_files = - let named_dss = - List.concat_map - (fun filename -> - try - let datasources = Monitor_types.datasources_from_filename filename in - Mcache.log_errors_from filename ; - datasources - |> List.filter_map (function - | Rrd.Host, ds - when List.mem ds.Ds.ds_name - ["memory_total_kib"; "memory_free_kib"] -> - Some ds - | _ -> - None (* we are only interested in Host memory stats *) - ) - |> List.map (function ds -> - let value = - match ds.Ds.ds_value with - | Rrd.VT_Int64 v -> - Memory.bytes_of_kib v - | Rrd.VT_Float v -> - Memory.bytes_of_kib (Int64.of_float v) - | Rrd.VT_Unknown -> - -1L - in - (ds.Ds.ds_name, value) - ) - with e -> - if not (Mcache.is_ignored filename) then ( - error "Unable to read host memory metrics from %s: %s" filename - (Printexc.to_string e) ; - Mcache.ignore_errors_from filename - ) ; - [] - ) - rrd_files - in - let free_bytes = List.assoc_opt "memory_free_kib" named_dss in - let total_bytes = List.assoc_opt "memory_total_kib" named_dss in - (* Check if anything has changed since our last reading. 
*) - match (free_bytes, total_bytes) with - | Some free, Some total - when !Mcache.host_memory_free_cached <> free - || !Mcache.host_memory_total_cached <> total -> - Some (free, total) - | _ -> - None - -let set_changes (free_bytes, total_bytes) = - Mtxext.execute Mcache.host_memory_m (fun _ -> - Mcache.host_memory_free_cached := free_bytes ; - Mcache.host_memory_total_cached := total_bytes - ) - -let update rrd_files = - let is_host_rrd = - Astring.String.is_prefix ~affix:Xapi_globs.metrics_prefix_mem_host - in - let rrd_files = List.filter is_host_rrd rrd_files in - Server_helpers.exec_with_new_task "Updating host memory metrics" - (fun __context -> - let changes = get_changes rrd_files in - match changes with - | None -> - () - | Some ((free, total) as c) -> ( - try - let host = Helpers.get_localhost ~__context in - let metrics = Db.Host.get_metrics ~__context ~self:host in - Db.Host_metrics.set_memory_total ~__context ~self:metrics ~value:total ; - Db.Host_metrics.set_memory_free ~__context ~self:metrics ~value:free ; - set_changes c - with e -> - error "Unable to update host memory metrics: %s" (Printexc.to_string e) - ) - ) diff --git a/ocaml/xapi/monitor_mem_vms.ml b/ocaml/xapi/monitor_mem_vms.ml deleted file mode 100644 index 37d737d92df..00000000000 --- a/ocaml/xapi/monitor_mem_vms.ml +++ /dev/null @@ -1,89 +0,0 @@ -(* - * Copyright (C) Citrix Systems Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; version 2.1 only. with the special - * exception on linking described in file LICENSE. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - *) - -module Mtxext = Xapi_stdext_threads.Threadext.Mutex -module Mcache = Monitor_dbcalls_cache - -module D = Debug.Make (struct let name = "monitor_mem_vms" end) - -open D - -let get_changes rrd_files = - List.iter - (fun filename -> - try - let datasources = Monitor_types.datasources_from_filename filename in - Mcache.log_errors_from filename ; - datasources - |> List.filter_map (function - | Rrd.VM vm_uuid, ds when ds.Ds.ds_name = "memory" -> - Some (vm_uuid, ds) - | _ -> - None (* we are only interested in VM stats *) - ) - |> List.iter (function vm_uuid, ds -> - let value = - match ds.Ds.ds_value with - | Rrd.VT_Int64 v -> - v - | Rrd.VT_Float v -> - Int64.of_float v - | Rrd.VT_Unknown -> - -1L - in - Hashtbl.add Mcache.vm_memory_tmp vm_uuid value - ) - with e -> - if not (Mcache.is_ignored filename) then ( - error "Unable to read memory usage for VM %s: %s" filename - (Printexc.to_string e) ; - Mcache.ignore_errors_from filename - ) - ) - rrd_files ; - (* Check if anything has changed since our last reading. 
*) - Mcache.get_updates_map ~before:Mcache.vm_memory_cached - ~after:Mcache.vm_memory_tmp - -let set_changes ?except () = - Mtxext.execute Mcache.vm_memory_cached_m (fun _ -> - Mcache.transfer_map ?except ~source:Mcache.vm_memory_tmp - ~target:Mcache.vm_memory_cached () - ) - -let update rrd_files = - let is_vm_rrd = - Astring.String.is_prefix ~affix:Xapi_globs.metrics_prefix_mem_vms - in - let rrd_files = List.filter is_vm_rrd rrd_files in - Server_helpers.exec_with_new_task "Updating VM memory usage" (fun __context -> - let host = Helpers.get_localhost ~__context in - let keeps = ref [] in - List.iter - (fun (vm_uuid, memory) -> - try - let vm = Db.VM.get_by_uuid ~__context ~uuid:vm_uuid in - let vmm = Db.VM.get_metrics ~__context ~self:vm in - if Db.VM.get_resident_on ~__context ~self:vm = host then - Db.VM_metrics.set_memory_actual ~__context ~self:vmm ~value:memory - else - Mcache.clear_cache_for_vm ~vm_uuid - with e -> - keeps := vm_uuid :: !keeps ; - error "Unable to update memory usage for VM %s: %s" vm_uuid - (Printexc.to_string e) - ) - (get_changes rrd_files) ; - set_changes ~except:!keeps () - ) diff --git a/quality-gate.sh b/quality-gate.sh index 6455846d21b..f6540cb2a1f 100755 --- a/quality-gate.sh +++ b/quality-gate.sh @@ -25,7 +25,7 @@ verify-cert () { } mli-files () { - N=469 + N=467 X="ocaml/tests" X+="|ocaml/quicktest" X+="|ocaml/message-switch/core_test" From f601c12094810e3c0c5db2f169a64f35e66bb1fa Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 12 Jun 2025 11:36:05 +0100 Subject: [PATCH 071/111] rrdp-squeezed: move remaining memory metrics to this plugin This allows to xcp-rrdd to stop writing to memory-mapped files This needed xapi to be adapted since it read the contents of the memory-mapped files. Now it can get those metric from a single memory-mapped file. 
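For illustration, a minimal sketch of the consumer-side filtering this change relies on; the names (Xapi_globs.metrics_prefix_mem and the rrd_files listing) follow the monitor_mem.ml and xapi_globs.ml hunks below, so only one file prefix has to be matched once squeezed writes the single file:

    (* keep only the datasource files written by the squeezed plugin;
       in this series Xapi_globs.metrics_prefix_mem is "xcp-rrdd-squeezed" *)
    let squeezed_files rrd_files =
      List.filter
        (String.starts_with ~prefix:Xapi_globs.metrics_prefix_mem)
        rrd_files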
Signed-off-by: Pau Ruiz Safont --- ocaml/xapi/monitor_mem.ml | 105 +++++++-------- ocaml/xapi/xapi_globs.ml | 4 +- ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml | 126 ++---------------- ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.mli | 0 .../bin/rrdp-squeezed/rrdp_squeezed.ml | 40 +++++- quality-gate.sh | 2 +- 6 files changed, 101 insertions(+), 176 deletions(-) create mode 100644 ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.mli diff --git a/ocaml/xapi/monitor_mem.ml b/ocaml/xapi/monitor_mem.ml index 502f0d6ca7a..79cf3cadf9d 100644 --- a/ocaml/xapi/monitor_mem.ml +++ b/ocaml/xapi/monitor_mem.ml @@ -19,9 +19,15 @@ module D = Debug.Make (struct let name = __MODULE__ end) open D -let get_datasources ~prefix rrd_files = - List.filter (String.starts_with ~prefix) rrd_files - |> List.map (fun fn -> (fn, Monitor_types.datasources_from_filename fn)) +let get_datasources rrd_files = + List.filter_map + (fun filename -> + if String.starts_with ~prefix:Xapi_globs.metrics_prefix_mem filename then + Some (filename, Monitor_types.datasources_from_filename filename) + else + None + ) + rrd_files module Host = struct let get_changes datasources = @@ -78,28 +84,19 @@ module Host = struct Mcache.host_memory_total_cached := total_bytes ) - let update rrd_files = - Server_helpers.exec_with_new_task "Updating host memory metrics" - (fun __context -> - let datasources = - get_datasources ~prefix:Xapi_globs.metrics_prefix_mem_host rrd_files - in - let changes = get_changes datasources in - match changes with - | None -> - () - | Some ((free, total) as c) -> ( - try - let host = Helpers.get_localhost ~__context in - let metrics = Db.Host.get_metrics ~__context ~self:host in - Db.Host_metrics.set_memory_total ~__context ~self:metrics - ~value:total ; - Db.Host_metrics.set_memory_free ~__context ~self:metrics ~value:free ; - set_changes c - with e -> - error "Unable to update host memory metrics: %s" - (Printexc.to_string e) - ) + let update __context datasources = + match get_changes datasources with + | None -> + () + | Some ((free, total) as c) -> ( + try + let host = Helpers.get_localhost ~__context in + let metrics = Db.Host.get_metrics ~__context ~self:host in + Db.Host_metrics.set_memory_total ~__context ~self:metrics ~value:total ; + Db.Host_metrics.set_memory_free ~__context ~self:metrics ~value:free ; + set_changes c + with e -> + error "Unable to update host memory metrics: %s" (Printexc.to_string e) ) end @@ -146,32 +143,36 @@ module VMs = struct ~target:Mcache.vm_memory_cached () ) - let update rrd_files = - Server_helpers.exec_with_new_task "Updating VM memory usage" - (fun __context -> - let datasources = - get_datasources ~prefix:Xapi_globs.metrics_prefix_mem_vms rrd_files - in - let host = Helpers.get_localhost ~__context in - let keeps = ref [] in - List.iter - (fun (vm_uuid, memory) -> - try - let vm = Db.VM.get_by_uuid ~__context ~uuid:vm_uuid in - let vmm = Db.VM.get_metrics ~__context ~self:vm in - if Db.VM.get_resident_on ~__context ~self:vm = host then - Db.VM_metrics.set_memory_actual ~__context ~self:vmm - ~value:memory - else - Mcache.clear_cache_for_vm ~vm_uuid - with e -> - keeps := vm_uuid :: !keeps ; - error "Unable to update memory usage for VM %s: %s" vm_uuid - (Printexc.to_string e) - ) - (get_changes datasources) ; - set_changes ~except:!keeps () - ) + let update __context datasources = + let host = Helpers.get_localhost ~__context in + let keeps = ref [] in + List.iter + (fun (vm_uuid, memory) -> + try + let vm = Db.VM.get_by_uuid ~__context ~uuid:vm_uuid in + let vmm = Db.VM.get_metrics ~__context ~self:vm in + 
if Db.VM.get_resident_on ~__context ~self:vm = host then + Db.VM_metrics.set_memory_actual ~__context ~self:vmm ~value:memory + else + Mcache.clear_cache_for_vm ~vm_uuid + with e -> + keeps := vm_uuid :: !keeps ; + error "Unable to update memory usage for VM %s: %s" vm_uuid + (Printexc.to_string e) + ) + (get_changes datasources) ; + set_changes ~except:!keeps () end -let update rrd_files = Host.update rrd_files ; VMs.update rrd_files +let update rrd_files = + let ( let@ ) f x = f x in + let@ __context = + Server_helpers.exec_with_new_task "Updating memory metrics" + in + let datasources = get_datasources rrd_files in + if datasources = [] then + error "%s: no memory datasources found!" __FUNCTION__ + else ( + Host.update __context datasources ; + VMs.update __context datasources + ) diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index e3957deea71..ad8914e9de7 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -635,9 +635,7 @@ let event_hook_auth_on_xapi_initialize_succeeded = ref false let metrics_root = "/dev/shm/metrics" -let metrics_prefix_mem_host = "xcp-rrdd-mem_host" - -let metrics_prefix_mem_vms = "xcp-rrdd-mem_vms" +let metrics_prefix_mem = "xcp-rrdd-squeezed" let metrics_prefix_pvs_proxy = "pvsproxy-" diff --git a/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml b/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml index 40e5ab34b79..588f2de37dd 100644 --- a/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml +++ b/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml @@ -119,40 +119,6 @@ let start (xmlrpc_path, http_fwd_path) process = let with_lock = Xapi_stdext_threads.Threadext.Mutex.execute -(*****************************************************) -(* memory stats *) -(*****************************************************) -let dss_mem_host xc = - let physinfo = Xenctrl.physinfo xc in - let total_kib = - Xenctrl.pages_to_kib (Int64.of_nativeint physinfo.Xenctrl.total_pages) - and free_kib = - Xenctrl.pages_to_kib (Int64.of_nativeint physinfo.Xenctrl.free_pages) - in - [ - ( Rrd.Host - , Ds.ds_make ~name:"memory_total_kib" - ~description:"Total amount of memory in the host" - ~value:(Rrd.VT_Int64 total_kib) ~ty:Rrd.Gauge ~min:0.0 ~default:true - ~units:"KiB" () - ) - ; ( Rrd.Host - , Ds.ds_make ~name:"memory_free_kib" - ~description:"Total amount of free memory" - ~value:(Rrd.VT_Int64 free_kib) ~ty:Rrd.Gauge ~min:0.0 ~default:true - ~units:"KiB" () - ) - ] - -(** estimate the space needed to serialize all the dss_mem_vms in a host. the - json-like serialization for the 3 dss in dss_mem_vms takes 622 bytes. these - bytes plus some overhead make 1024 bytes an upper bound. *) - -let bytes_per_mem_vm = 1024 - -let mem_vm_writer_pages = - ((Rrd_interface.max_supported_vms * bytes_per_mem_vm) + 4095) / 4096 - let uuid_blacklist = ["00000000-0000-0000"; "deadbeef-dead-beef"] module IntSet = Set.Make (Int) @@ -192,31 +158,6 @@ let domain_snapshot xc = in domains |> List.to_seq -let dss_mem_vms xc = - let mem_metrics_of (dom, uuid, _) = - let vm_metrics () = - let kib = - Xenctrl.pages_to_kib (Int64.of_nativeint dom.Xenctrl.total_memory_pages) - in - let memory = Int64.mul kib 1024L in - let main_mem_ds = - ( Rrd.VM uuid - , Ds.ds_make ~name:"memory" - ~description:"Memory currently allocated to VM" ~units:"B" - ~value:(Rrd.VT_Int64 memory) ~ty:Rrd.Gauge ~min:0.0 ~default:true () - ) - in - Some main_mem_ds - in - (* CA-34383: Memory updates from paused domains serve no useful purpose. - During a migrate such updates can also cause undesirable - discontinuities in the observed value of memory_actual. 
Hence, we - ignore changes from paused domains: *) - if dom.Xenctrl.paused then None else vm_metrics () - in - let domains = domain_snapshot xc in - Seq.filter_map mem_metrics_of domains |> List.of_seq - (**** Local cache SR stuff *) type last_vals = { @@ -323,8 +264,6 @@ let handle_exn log f default = let dom0_stat_generators = [ ("ha", fun _ _ -> Rrdd_ha_stats.all ()) - ; ("mem_host", fun xc _ -> dss_mem_host xc) - ; ("mem_vms", fun xc _ -> dss_mem_vms xc) ; ("cache", fun _ timestamp -> dss_cache timestamp) ] @@ -335,23 +274,9 @@ let generate_all_dom0_stats xc = in List.map handle_generator dom0_stat_generators -let write_dom0_stats writers tagged_dss = - let write_dss (name, writer) = - match List.assoc_opt name tagged_dss with - | None -> - debug - "Could not write stats for \"%s\": no stats were associated with \ - this name" - name - | Some (timestamp, dss) -> - writer.Rrd_writer.write_payload {timestamp; datasources= dss} - in - List.iter write_dss writers - -let do_monitor_write domains_before xc writers = +let do_monitor_write domains_before xc = Rrdd_libs.Stats.time_this "monitor" (fun _ -> let tagged_dom0_stats = generate_all_dom0_stats xc in - write_dom0_stats writers tagged_dom0_stats ; let dom0_stats = tagged_dom0_stats |> List.to_seq @@ -380,14 +305,14 @@ let do_monitor_write domains_before xc writers = domains_after ) -let monitor_write_loop writers = +let monitor_write_loop () = Debug.with_thread_named "monitor_write" (fun () -> Xenctrl.with_intf (fun xc -> let domains = ref Seq.empty in while true do try - domains := do_monitor_write !domains xc writers ; + domains := do_monitor_write !domains xc ; with_lock Rrdd_shared.next_iteration_start_m (fun _ -> Rrdd_shared.next_iteration_start := Clock.Timer.extend_by !Rrdd_shared.timeslice @@ -579,45 +504,15 @@ let doc = the datasources and records historical data in RRD format." ] -(** write memory stats to the filesystem so they can be propagated to xapi, - along with the number of pages they require to be allocated *) -let stats_to_write = [("mem_host", 1); ("mem_vms", mem_vm_writer_pages)] - -let writer_basename = ( ^ ) "xcp-rrdd-" - -let configure_writers () = - List.map - (fun (name, n_pages) -> - let path = Rrdd_server.Plugin.get_path (writer_basename name) in - ignore (Xapi_stdext_unix.Unixext.mkdir_safe (Filename.dirname path) 0o644) ; - let writer = - snd - (Rrd_writer.FileWriter.create - {path; shared_page_count= n_pages} - Rrd_protocol_v2.protocol - ) - in - (name, writer) - ) - stats_to_write - -(** we need to make sure we call exit on fatal signals to make sure profiling - data is dumped *) -let stop err writers signal = - debug "caught signal %a" Debug.Pp.signal signal ; - List.iter (fun (_, writer) -> writer.Rrd_writer.cleanup ()) writers ; - exit err - (* Entry point. *) -let _ = +let () = Rrdd_bindings.Rrd_daemon.bind () ; (* bind PPX-generated server calls to implementation of API *) - let writers = configure_writers () in (* Prevent shutdown due to sigpipe interrupt. This protects against potential stunnel crashes. *) Sys.set_signal Sys.sigpipe Sys.Signal_ignore ; - Sys.set_signal Sys.sigterm (Sys.Signal_handle (stop 1 writers)) ; - Sys.set_signal Sys.sigint (Sys.Signal_handle (stop 0 writers)) ; + Sys.set_signal Sys.sigterm (Sys.Signal_handle (fun _ -> exit 1)) ; + Sys.set_signal Sys.sigint (Sys.Signal_handle (fun _ -> exit 0)) ; (* Enable the new logging library. *) Debug.set_facility Syslog.Local5 ; (* Read configuration file. 
*) @@ -647,11 +542,8 @@ let _ = start (!Rrd_interface.default_path, !Rrd_interface.forwarded_path) (fun () -> Idl.Exn.server Rrdd_bindings.Server.implementation ) ; - ignore - @@ Discover.start - (List.map (fun (name, _) -> writer_basename name) stats_to_write) ; - ignore @@ GCLog.start () ; - debug "Starting xenstore-watching thread .." ; + let _ : Thread.t = Discover.start [] in + let _ : Thread.t = GCLog.start () in let module Daemon = Xapi_stdext_unix.Unixext.Daemon in if Daemon.systemd_booted () then if Daemon.systemd_notify Daemon.State.Ready then @@ -660,7 +552,7 @@ let _ = warn "Sending systemd notification failed at %s" __LOC__ ; debug "Creating monitoring loop thread .." ; let () = - try Debug.with_thread_associated "main" monitor_write_loop writers + try Debug.with_thread_associated "main" monitor_write_loop () with _ -> error "monitoring loop thread has failed" in while true do diff --git a/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.mli b/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.mli new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml index a3091a90daf..df49dca259f 100644 --- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml +++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml @@ -131,7 +131,7 @@ let get_domain_stats xc = let bytes_of_kib kib = Int64.mul 1024L kib -let generate_host_sources counters = +let generate_host_sources xc counters = let memory_reclaimed, memory_possibly_reclaimed = (* Calculate host metrics - Host memory reclaimed by squeezed = @@ -169,6 +169,13 @@ let generate_host_sources counters = in let memory_reclaimed = bytes_of_kib memory_reclaimed in let memory_possibly_reclaimed = bytes_of_kib memory_possibly_reclaimed in + let physinfo = Xenctrl.physinfo xc in + let total_kib = + Xenctrl.pages_to_kib (Int64.of_nativeint physinfo.Xenctrl.total_pages) + in + let free_kib = + Xenctrl.pages_to_kib (Int64.of_nativeint physinfo.Xenctrl.free_pages) + in (* Build corresponding Ds.ds values *) [ ( Rrd.Host @@ -183,6 +190,18 @@ let generate_host_sources counters = ~value:(Rrd.VT_Int64 memory_possibly_reclaimed) ~ty:Rrd.Gauge ~default:true ~units:"B" () ) + ; ( Rrd.Host + , Ds.ds_make ~name:"memory_total_kib" + ~description:"Total amount of memory in the host" + ~value:(Rrd.VT_Int64 total_kib) ~ty:Rrd.Gauge ~min:0.0 ~default:true + ~units:"KiB" () + ) + ; ( Rrd.Host + , Ds.ds_make ~name:"memory_free_kib" + ~description:"Total amount of free memory" + ~value:(Rrd.VT_Int64 free_kib) ~ty:Rrd.Gauge ~min:0.0 ~default:true + ~units:"KiB" () + ) ] let res_error fmt = Printf.ksprintf Result.error fmt @@ -233,6 +252,8 @@ let free_other uuid free = ~value:(Rrd.VT_Int64 free) ~ty:Rrd.Gauge ~min:0.0 ~default:true () ) +let get_list f = Option.to_list (f ()) + let generate_vm_sources domains = let metrics_of ((dom, uuid, domid), {target; free; _}) = let target () = @@ -254,6 +275,19 @@ let generate_vm_sources domains = else Option.bind free (free_other uuid) in + let total () = + let memory = + Int64.of_nativeint dom.Xenctrl.total_memory_pages + |> Xenctrl.pages_to_kib + |> bytes_of_kib + in + Some + ( Rrd.VM uuid + , Ds.ds_make ~name:"memory" + ~description:"Memory currently allocated to VM" ~units:"B" + ~value:(Rrd.VT_Int64 memory) ~ty:Rrd.Gauge ~min:0.0 ~default:true () + ) + in (* CA-34383: Memory updates from paused domains serve no useful purpose. During a migrate such updates can also cause undesirable discontinuities in the observed value of memory_actual. 
Hence, we @@ -261,14 +295,14 @@ let generate_vm_sources domains = if dom.Xenctrl.paused then [] else - Option.to_list (target ()) @ Option.to_list (free ()) + get_list target @ get_list free @ get_list total in List.concat_map metrics_of domains let generate_sources xc () = let domain_stats = get_domain_stats xc in - generate_host_sources domain_stats @ generate_vm_sources domain_stats + generate_host_sources xc domain_stats @ generate_vm_sources domain_stats (** The json-like serialization for 3 dss in dss_mem_vms takes 622 bytes. These bytes plus some overhead make 1024 bytes an upper bound. *) diff --git a/quality-gate.sh b/quality-gate.sh index f6540cb2a1f..ceb82f67f65 100755 --- a/quality-gate.sh +++ b/quality-gate.sh @@ -25,7 +25,7 @@ verify-cert () { } mli-files () { - N=467 + N=466 X="ocaml/tests" X+="|ocaml/quicktest" X+="|ocaml/message-switch/core_test" From fa5bc72d44ec181a90e21a6b2175704bfa16c18b Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 26 Jun 2025 18:53:55 +0100 Subject: [PATCH 072/111] xcp-rrdd: remove duplicated code to fetch domains Signed-off-by: Pau Ruiz Safont --- ocaml/xcp-rrdd/bin/rrdd/dune | 1 + ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml | 42 ++--------------------------- 2 files changed, 3 insertions(+), 40 deletions(-) diff --git a/ocaml/xcp-rrdd/bin/rrdd/dune b/ocaml/xcp-rrdd/bin/rrdd/dune index 6ce134dd522..2f215e8a7cf 100644 --- a/ocaml/xcp-rrdd/bin/rrdd/dune +++ b/ocaml/xcp-rrdd/bin/rrdd/dune @@ -50,6 +50,7 @@ rpclib.json rpclib.xml rrdd_libs_internal + rrdd_plugin_xenctrl rrd-transport threads.posix uuid diff --git a/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml b/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml index 588f2de37dd..17ca619440d 100644 --- a/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml +++ b/ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml @@ -119,45 +119,6 @@ let start (xmlrpc_path, http_fwd_path) process = let with_lock = Xapi_stdext_threads.Threadext.Mutex.execute -let uuid_blacklist = ["00000000-0000-0000"; "deadbeef-dead-beef"] - -module IntSet = Set.Make (Int) - -let domain_snapshot xc = - let metadata_of_domain dom = - let ( let* ) = Option.bind in - let* uuid_raw = Uuidx.of_int_array dom.Xenctrl.handle in - let uuid = Uuidx.to_string uuid_raw in - let domid = dom.Xenctrl.domid in - let start = String.sub uuid 0 18 in - (* Actively hide migrating VM uuids, these are temporary and xenops writes - the original and the final uuid to xenstore *) - let uuid_from_key key = - let path = Printf.sprintf "/vm/%s/%s" uuid key in - try Ezxenstore_core.Xenstore.(with_xs (fun xs -> xs.read path)) - with Xs_protocol.Enoent _hint -> - info "Couldn't read path %s; falling back to actual uuid" path ; - uuid - in - let stable_uuid = Option.fold ~none:uuid ~some:uuid_from_key in - if List.mem start uuid_blacklist then - None - else - let key = - if Astring.String.is_suffix ~affix:"000000000000" uuid then - Some "origin-uuid" - else if Astring.String.is_suffix ~affix:"000000000001" uuid then - Some "final-uuid" - else - None - in - Some (dom, stable_uuid key, domid) - in - let domains = - Xenctrl.domain_getinfolist xc 0 |> List.filter_map metadata_of_domain - in - domains |> List.to_seq - (**** Local cache SR stuff *) type last_vals = { @@ -285,7 +246,8 @@ let do_monitor_write domains_before xc = ) in let plugins_stats = Rrdd_server.Plugin.read_stats () in - let domains_after = domain_snapshot xc in + let _, domains_after, _ = Xenctrl_lib.domain_snapshot xc in + let domains_after = List.to_seq domains_after in let stats = Seq.append plugins_stats dom0_stats in Rrdd_stats.print_snapshot () ; (* 
merge the domain ids from the previous iteration and the current one From 57cbad4fc5f32ecb2aad5caba209713f3c95851a Mon Sep 17 00:00:00 2001 From: Steven Woods Date: Wed, 25 Jun 2025 21:46:39 +0100 Subject: [PATCH 073/111] Move common retry_econnrefused function to xcp_client Signed-off-by: Steven Woods --- ocaml/xapi-idl/lib/observer_helpers.ml | 20 +++----------------- ocaml/xapi-idl/lib/xcp_client.ml | 18 ++++++++++++++++++ ocaml/xapi-idl/rrd/rrd_client.ml | 22 ++++------------------ ocaml/xapi-idl/storage/storage_client.ml | 22 ++++------------------ 4 files changed, 29 insertions(+), 53 deletions(-) diff --git a/ocaml/xapi-idl/lib/observer_helpers.ml b/ocaml/xapi-idl/lib/observer_helpers.ml index 125ba101722..24f7ee3db46 100644 --- a/ocaml/xapi-idl/lib/observer_helpers.ml +++ b/ocaml/xapi-idl/lib/observer_helpers.ml @@ -241,24 +241,10 @@ module Server (Impl : Server_impl) () = struct let process call = Idl.Exn.server S.implementation call end -let rec retry_econnrefused f = - try f () with - | Unix.Unix_error (Unix.ECONNREFUSED, "connect", _) -> - (* debug "Caught ECONNREFUSED; retrying in 5s"; *) - Thread.delay 5. ; retry_econnrefused f - | e -> - (* error "Caught %s: does the observer service need restarting?" - (Printexc.to_string e); *) - raise e - module Client = ObserverAPI (Idl.Exn.GenClient (struct - open Xcp_client - let rpc call = - retry_econnrefused (fun () -> - if !use_switch then - json_switch_rpc queue_name call - else - xml_http_rpc ~srcstr:(get_user_agent ()) ~dststr:queue_name uri call + Xcp_client.( + retry_and_switch_rpc call ~use_switch:!use_switch ~queue_name + ~dststr:queue_name ~uri ) end)) diff --git a/ocaml/xapi-idl/lib/xcp_client.ml b/ocaml/xapi-idl/lib/xcp_client.ml index 435a63e3126..a7ebd1f996a 100644 --- a/ocaml/xapi-idl/lib/xcp_client.ml +++ b/ocaml/xapi-idl/lib/xcp_client.ml @@ -190,3 +190,21 @@ let binary_rpc string_of_call response_of_string ?(srcstr = "unset") let json_binary_rpc = binary_rpc Jsonrpc.string_of_call Jsonrpc.response_of_string + +let rec retry_econnrefused f = + try f () with + | Unix.Unix_error (Unix.ECONNREFUSED, "connect", _) -> + (* debug "Caught ECONNREFUSED; retrying in 5s"; *) + Thread.delay 5. ; retry_econnrefused f + | e -> + (* error "Caught %s: does the service need restarting?" + (Printexc.to_string e); *) + raise e + +let retry_and_switch_rpc call ~use_switch ~queue_name ~dststr ~uri = + retry_econnrefused (fun () -> + if use_switch then + json_switch_rpc queue_name call + else + xml_http_rpc ~srcstr:(get_user_agent ()) ~dststr uri call + ) diff --git a/ocaml/xapi-idl/rrd/rrd_client.ml b/ocaml/xapi-idl/rrd/rrd_client.ml index abb12a118de..08a9b731f71 100644 --- a/ocaml/xapi-idl/rrd/rrd_client.ml +++ b/ocaml/xapi-idl/rrd/rrd_client.ml @@ -13,26 +13,12 @@ *) open Rrd_interface -open Xcp_client - -let rec retry_econnrefused f = - try f () with - | Unix.Unix_error (Unix.ECONNREFUSED, "connect", _) -> - (* debug "Caught ECONNREFUSED; retrying in 5s"; *) - Thread.delay 5. ; retry_econnrefused f - | e -> - (* error "Caught %s: does the rrd service need restarting?" 
- (Printexc.to_string e); *) - raise e +(* TODO: use_switch=false as the message switch doesn't handle raw HTTP very well *) let rpc call = - retry_econnrefused (fun () -> - (* TODO: the message switch doesn't handle raw HTTP very well *) - if (* !use_switch *) false then - json_switch_rpc !queue_name call - else - xml_http_rpc ~srcstr:(get_user_agent ()) ~dststr:"rrd" Rrd_interface.uri - call + Xcp_client.( + retry_and_switch_rpc call ~use_switch:false ~queue_name:!queue_name + ~dststr:"rrd" ~uri ) module Client = RPC_API (Idl.Exn.GenClient (struct let rpc = rpc end)) diff --git a/ocaml/xapi-idl/storage/storage_client.ml b/ocaml/xapi-idl/storage/storage_client.ml index b66636daf6a..eeb0e765170 100644 --- a/ocaml/xapi-idl/storage/storage_client.ml +++ b/ocaml/xapi-idl/storage/storage_client.ml @@ -13,25 +13,11 @@ *) open Storage_interface -open Xcp_client -let rec retry_econnrefused f = - try f () with - | Unix.Unix_error (Unix.ECONNREFUSED, "connect", _) -> - (* debug "Caught ECONNREFUSED; retrying in 5s"; *) - Thread.delay 5. ; retry_econnrefused f - | e -> - (* error "Caught %s: does the storage service need restarting?" - (Printexc.to_string e); *) - raise e - -module Client = Storage_interface.StorageAPI (Idl.Exn.GenClient (struct +module Client = StorageAPI (Idl.Exn.GenClient (struct let rpc call = - retry_econnrefused (fun () -> - if !use_switch then - json_switch_rpc !queue_name call - else - xml_http_rpc ~srcstr:(get_user_agent ()) ~dststr:"storage" - Storage_interface.uri call + Xcp_client.( + retry_and_switch_rpc call ~use_switch:!use_switch ~queue_name:!queue_name + ~dststr:"storage" ~uri ) end)) From 267c414d5753b4697119d20a363a5170c6444209 Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Tue, 17 Jun 2025 02:20:45 +0000 Subject: [PATCH 074/111] CA-412636: hostname changed to localhost with static IP and reboot According to https://www.freedesktop.org/software/systemd/man/latest/hostname.html Systemd set hostname with following sequence - kernel parameter, systemd.hostname - static hostname in /etc/hostname - transient hostname like DHCP - localhost at systemd compile time Once the host is configured with static IP and reboot, it would just lost its hostname as no DHCP or static IP available. However, the hostname is critical to AD function as it construct the machine account. The hostname should be persisted as static name during joining AD, this is also what PBIS does. Note: the static hostname is not cleaned during domain leave. This is by intention to avoid losing hostname after reboot with static IP cba2f1d5e tried to resovle the issue and update /etc/resolv.conf However, /etc/resolv.conf does not help and conflict with xcp-networkd, as networkd override the configure every 5 minutes Here we just revert the resolv.conf update. 
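For illustration, a minimal sketch of the persistence step added during domain join; the names (Forkhelpers.execute_command_get_output, Xapi_globs.set_hostname, get_localhost_name) are taken from the extauth_plugin_ADwinbind.ml hunk below, and reading Xapi_globs.set_hostname as the helper that writes the static hostname is an assumption based on this commit message:

    let persist_hostname () =
      (* write the current hostname as the static hostname so it still
         resolves after a reboot on a static-IP configuration *)
      let _stdout, _stderr =
        Forkhelpers.execute_command_get_output !Xapi_globs.set_hostname
          [get_localhost_name ()]
      in
      ()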
Other parts of that commit can still benifit as it push the hostname to DNS Signed-off-by: Lin Liu --- ocaml/xapi/extauth_plugin_ADwinbind.ml | 27 +++++--------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/ocaml/xapi/extauth_plugin_ADwinbind.ml b/ocaml/xapi/extauth_plugin_ADwinbind.ml index a279de5c5c7..6def6c5bb64 100644 --- a/ocaml/xapi/extauth_plugin_ADwinbind.ml +++ b/ocaml/xapi/extauth_plugin_ADwinbind.ml @@ -1434,23 +1434,6 @@ module ConfigHosts = struct |> write_string_to_file path end -module ResolveConfig = struct - let path = "/etc/resolv.conf" - - type t = Add | Remove - - let handle op domain = - let open Xapi_stdext_unix.Unixext in - let config = Printf.sprintf "search %s" domain in - read_lines ~path |> List.filter (fun x -> x <> config) |> fun x -> - (match op with Add -> config :: x | Remove -> x) |> fun x -> - x @ [""] |> String.concat "\n" |> write_string_to_file path - - let join ~domain = handle Add domain - - let leave ~domain = handle Remove domain -end - module DNSSync = struct let task_name = "Sync hostname with DNS" @@ -1827,7 +1810,11 @@ module AuthADWinbind : Auth_signature.AUTH_MODULE = struct ClosestKdc.trigger_update ~start:0. ; RotateMachinePassword.trigger_rotate ~start:0. ; ConfigHosts.join ~domain:service_name ~name:netbios_name ; - ResolveConfig.join ~domain:service_name ; + let _, _ = + Forkhelpers.execute_command_get_output !Xapi_globs.set_hostname + [get_localhost_name ()] + in + (* Trigger right now *) DNSSync.trigger_sync ~start:0. ; Winbind.set_machine_account_encryption_type netbios_name ; debug "Succeed to join domain %s" service_name @@ -1836,7 +1823,6 @@ module AuthADWinbind : Auth_signature.AUTH_MODULE = struct error "Join domain: %s error: %s" service_name stdout ; clear_winbind_config () ; ConfigHosts.leave ~domain:service_name ~name:netbios_name ; - ResolveConfig.leave ~domain:service_name ; (* The configure is kept for debug purpose with max level *) raise (Auth_service_error (stdout |> tag_from_err_msg, stdout)) | Xapi_systemctl.Systemctl_fail _ -> @@ -1844,7 +1830,6 @@ module AuthADWinbind : Auth_signature.AUTH_MODULE = struct error "Start daemon error: %s" msg ; config_winbind_daemon ~domain:None ~workgroup:None ~netbios_name:None ; ConfigHosts.leave ~domain:service_name ~name:netbios_name ; - ResolveConfig.leave ~domain:service_name ; raise (Auth_service_error (E_GENERIC, msg)) | e -> let msg = @@ -1856,7 +1841,6 @@ module AuthADWinbind : Auth_signature.AUTH_MODULE = struct error "Enable extauth error: %s" msg ; clear_winbind_config () ; ConfigHosts.leave ~domain:service_name ~name:netbios_name ; - ResolveConfig.leave ~domain:service_name ; raise (Auth_service_error (E_GENERIC, msg)) (* unit on_disable() @@ -1871,7 +1855,6 @@ module AuthADWinbind : Auth_signature.AUTH_MODULE = struct let user = List.assoc_opt "user" config_params in let pass = List.assoc_opt "pass" config_params in let {service_name; netbios_name; _} = get_domain_info_from_db () in - ResolveConfig.leave ~domain:service_name ; DNSSync.stop_sync () ; ( match netbios_name with | Some netbios -> From 4927eefd44e1940def75c16da231cfdfdca5b842 Mon Sep 17 00:00:00 2001 From: Steven Woods Date: Mon, 16 Jun 2025 11:15:20 +0100 Subject: [PATCH 075/111] Add mlis for observer_helpers and observer_skeleton Signed-off-by: Steven Woods --- ocaml/xapi-idl/lib/observer_helpers.mli | 227 +++++++++++++++++++++++ ocaml/xapi-idl/lib/observer_skeleton.mli | 46 +++++ quality-gate.sh | 2 +- 3 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 
ocaml/xapi-idl/lib/observer_helpers.mli create mode 100644 ocaml/xapi-idl/lib/observer_skeleton.mli diff --git a/ocaml/xapi-idl/lib/observer_helpers.mli b/ocaml/xapi-idl/lib/observer_helpers.mli new file mode 100644 index 00000000000..cd23d2d1e80 --- /dev/null +++ b/ocaml/xapi-idl/lib/observer_helpers.mli @@ -0,0 +1,227 @@ +val queue_name : string + +val default_path : string + +module Errors : sig + type error = + | Internal_error of string + | Unimplemented of string + | Unknown_error + + val typ_of_error : error Rpc.Types.typ + + val error : error Rpc.Types.def +end + +exception Observer_error of Errors.error + +type debug_info = string + +(** ObserverAPI contains the declarations for the RPCs which are sent to + Observer modules when the corresponding function is called on the Observer + see ocaml/libs/tracing/ and ocaml/xapi/xapi_observer.ml *) +module ObserverAPI : functor (R : Idl.RPC) -> sig + val description : Idl.Interface.description + + val implementation : R.implementation + + val create : + ( debug_info + -> string + -> string + -> (string * string) list + -> string list + -> bool + -> (unit, Errors.error) R.comp + ) + R.res + (** [create dbg uuid name attributes endpoints enabled] notifies the + forwarder that an Observer with [uuid] has been created. The subsequent + parameters are the fields the Observer was created with. *) + + val destroy : (debug_info -> string -> (unit, Errors.error) R.comp) R.res + (** [destroy dbg uuid] notifies the forwarder that an Observer with [uuid] + has been destroyed. *) + + val set_enabled : + (debug_info -> string -> bool -> (unit, Errors.error) R.comp) R.res + (** [set_enabled dbg uuid enabled] notifies the fowarder that the Observer + with [uuid] has had its enabled field set to [enabled]. *) + + val set_attributes : + ( debug_info + -> string + -> (string * string) list + -> (unit, Errors.error) R.comp + ) + R.res + (** [set_attributes dbg uuid attributes] notifies the fowarder that the + Observer with [uuid] has had its attributes field set to [attributes]. *) + + val set_endpoints : + (debug_info -> string -> string list -> (unit, Errors.error) R.comp) R.res + (** [set_endpoints dbg uuid endpoints] notifies the fowarder that the Observer + with [uuid] has had its endpoints field set to [endpoints]. *) + + val init : (debug_info -> (unit, Errors.error) R.comp) R.res + (** [init dbg] notifies the forwarder that it should perform any tracing + initialisation. *) + + val set_trace_log_dir : + (debug_info -> string -> (unit, Errors.error) R.comp) R.res + (** [set_trace_log_dir dbg dir] notifies the fowarder that the trace_log_dir + has been set to [dir]. *) + + val set_export_interval : + (debug_info -> float -> (unit, Errors.error) R.comp) R.res + (** [set_export_interval dbg interval] notifies the fowarder that the interval + between trace exports has been set to [interval]. *) + + val set_max_spans : (debug_info -> int -> (unit, Errors.error) R.comp) R.res + (** [set_max_spans dbg spans] notifies the fowarder that the max number of + spans has been set to [spans]. *) + + val set_max_traces : (debug_info -> int -> (unit, Errors.error) R.comp) R.res + (** [set_max_traces dbg traces] notifies the fowarder that the max number of + traces has been set to [traces]. *) + + val set_max_file_size : + (debug_info -> int -> (unit, Errors.error) R.comp) R.res + (** [set_max_file_size dbg file_size] notifies the fowarder that the max file + size has been set to [file_size]. 
*) + + val set_host_id : (debug_info -> string -> (unit, Errors.error) R.comp) R.res + (** [set_host_id dbg host_id] notifies the fowarder that the host to be traced + has been set to [host_id]. *) + + val set_compress_tracing_files : + (debug_info -> bool -> (unit, Errors.error) R.comp) R.res + (** [set_compress_tracing_files dbg enabled] notifies the fowarder that the + compression of tracing files has been set to [enabled]. *) +end + +(** A Server_impl module will define how the Server responds to ObserverAPI calls *) +module type Server_impl = sig + type context = unit + + val create : + context + -> dbg:debug_info + -> uuid:string + -> name_label:string + -> attributes:(string * string) list + -> endpoints:string list + -> enabled:bool + -> unit + + val destroy : context -> dbg:debug_info -> uuid:string -> unit + + val set_enabled : + context -> dbg:debug_info -> uuid:string -> enabled:bool -> unit + + val set_attributes : + context + -> dbg:debug_info + -> uuid:string + -> attributes:(string * string) list + -> unit + + val set_endpoints : + context -> dbg:debug_info -> uuid:string -> endpoints:string list -> unit + + val init : context -> dbg:debug_info -> unit + + val set_trace_log_dir : context -> dbg:debug_info -> dir:string -> unit + + val set_export_interval : context -> dbg:debug_info -> interval:float -> unit + + val set_max_spans : context -> dbg:debug_info -> spans:int -> unit + + val set_max_traces : context -> dbg:debug_info -> traces:int -> unit + + val set_max_file_size : context -> dbg:debug_info -> file_size:int -> unit + + val set_host_id : context -> dbg:debug_info -> host_id:string -> unit + + val set_compress_tracing_files : + context -> dbg:debug_info -> enabled:bool -> unit +end + +(** A Server for receiving ObserverAPI calls *) +module Server : functor (_ : Server_impl) () -> sig + module S : sig + val create : + ( debug_info + -> string + -> string + -> (string * string) list + -> string list + -> bool + -> unit + ) + -> unit + + val destroy : (debug_info -> string -> unit) -> unit + + val set_enabled : (debug_info -> string -> bool -> unit) -> unit + + val set_attributes : + (debug_info -> string -> (string * string) list -> unit) -> unit + + val set_endpoints : (debug_info -> string -> string list -> unit) -> unit + + val init : (debug_info -> unit) -> unit + + val set_trace_log_dir : (debug_info -> string -> unit) -> unit + + val set_export_interval : (debug_info -> float -> unit) -> unit + + val set_max_spans : (debug_info -> int -> unit) -> unit + + val set_max_traces : (debug_info -> int -> unit) -> unit + + val set_max_file_size : (debug_info -> int -> unit) -> unit + + val set_host_id : (debug_info -> string -> unit) -> unit + + val set_compress_tracing_files : (debug_info -> bool -> unit) -> unit + end + + val process : Rpc.call -> Rpc.response +end + +(** A client for sending ObserverAPI calls to the above queue_name *) +module Client : sig + val create : + debug_info + -> string + -> string + -> (string * string) list + -> string list + -> bool + -> unit + + val destroy : debug_info -> string -> unit + + val set_enabled : debug_info -> string -> bool -> unit + + val set_attributes : debug_info -> string -> (string * string) list -> unit + + val set_endpoints : debug_info -> string -> string list -> unit + + val init : debug_info -> unit + + val set_trace_log_dir : debug_info -> string -> unit + + val set_export_interval : debug_info -> float -> unit + + val set_max_spans : debug_info -> int -> unit + + val set_max_traces : debug_info -> int -> unit 
+ + val set_max_file_size : debug_info -> int -> unit + + val set_host_id : debug_info -> string -> unit + + val set_compress_tracing_files : debug_info -> bool -> unit +end diff --git a/ocaml/xapi-idl/lib/observer_skeleton.mli b/ocaml/xapi-idl/lib/observer_skeleton.mli new file mode 100644 index 00000000000..c99b77f9a34 --- /dev/null +++ b/ocaml/xapi-idl/lib/observer_skeleton.mli @@ -0,0 +1,46 @@ +(** This module provides dummy implementations for each Observer function. + These are intended to be used to fill in the functions that the module will + not ever use, as they will raise an Unimplemented error if called *) +module Observer : sig + type context = unit + + val create : + context + -> dbg:string + -> uuid:string + -> name_label:string + -> attributes:(string * string) list + -> endpoints:string list + -> enabled:bool + -> unit + + val destroy : context -> dbg:string -> uuid:string -> unit + + val set_enabled : context -> dbg:string -> uuid:string -> enabled:bool -> unit + + val set_attributes : + context + -> dbg:string + -> uuid:string + -> attributes:(string * string) list + -> unit + + val set_endpoints : + context -> dbg:string -> uuid:string -> endpoints:string list -> unit + + val init : context -> dbg:string -> unit + + val set_trace_log_dir : context -> dbg:string -> dir:string -> unit + + val set_export_interval : context -> dbg:string -> interval:float -> unit + + val set_max_spans : context -> dbg:string -> spans:int -> unit + + val set_max_traces : context -> dbg:string -> traces:int -> unit + + val set_max_file_size : context -> dbg:string -> file_size:int -> unit + + val set_host_id : context -> dbg:string -> host_id:string -> unit + + val set_compress_tracing_files : context -> dbg:string -> enabled:bool -> unit +end diff --git a/quality-gate.sh b/quality-gate.sh index ceb82f67f65..7591e3c4ff4 100755 --- a/quality-gate.sh +++ b/quality-gate.sh @@ -25,7 +25,7 @@ verify-cert () { } mli-files () { - N=466 + N=464 X="ocaml/tests" X+="|ocaml/quicktest" X+="|ocaml/message-switch/core_test" From fc5f98b80badd2c9bb952b9a6ee7d7367bcb4f70 Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Tue, 1 Jul 2025 15:56:18 +0800 Subject: [PATCH 076/111] CA-393417: Drop device controller of cgroup v1 For deprivileged qemu, following ops are performed - bind mount /dev/ to qemu chroot, so qemu can access it - cgroup controller deny all devices, except the target usb device However, new XS updated to cgroup v2 and the devices controller available anymore. Instead of bind mount all /dev folder, only the permitted usb devices are created into the chroot. Thus, the cgroup controller is no longer necessary. Besides, there are following updates accordingly - qemu pid is no longer necessary as command line args, as cgroup is dropped. - save and restore system /etc/ devices file ownership is no longer necessary. 
New file is cloned into chroot instead of bind mount system device file, so only need to set ownership of chroot file directly Signed-off-by: Lin Liu --- python3/libexec/usb_reset.py | 267 ++++++++--------------------------- 1 file changed, 55 insertions(+), 212 deletions(-) diff --git a/python3/libexec/usb_reset.py b/python3/libexec/usb_reset.py index 573936ae1c3..3e5ff849060 100755 --- a/python3/libexec/usb_reset.py +++ b/python3/libexec/usb_reset.py @@ -15,36 +15,30 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # # attach -# ./usb_reset.py attach device -d dom-id -p pid [-r] +# ./usb_reset.py attach device -d dom-id [-r] # ./usb_reset.py attach 2-2 -d 12 -p 4130 # ./usb_reset.py attach 2-2 -d 12 -p 4130 -r # 1. reset device -# if without -r, do step 2~4 +# if without -r, do step 2~3 # 2. if it's the first USB device to pass-through -# a) bind mount /dev /sys in chroot directory (/var/xen/qemu/root-) -# b) create new cgroup devices:/qemu-, -# c) blacklist all and add default device whitelist, -# d) join current qemu process to this cgroup -# 3. save device uid/gid to /var/run/nonpersistent/usb/ -# 4. set device file uid/gid to (qemu_base + dom-id) -# 5. add current device to whitelist +# a) bind mount /sys in chroot directory (/var/xen/qemu/root-) +# b) clone (create the device with same major/minor number and mode) in chroot directory with same path +# 3. set device file uid/gid to (qemu_base + dom-id) # # detach # ./usb_reset.py detach device -d dom-id # ./usb_reset.py detach 2-2 -d 12 -# 1. restore device file uid/gid from /var/run/nonpersistent/usb/ -# 2. remove current device from whitelist +# 1. Remove the cloned device file in chroot directory # # cleanup # ./usb_reset.py cleanup -d dom-id # ./usb_reset.py cleanup -d 12 -# 1.remove the cgroup if one has been created. -# 2.umount /dev, /sys from chroot directory if they are mounted. +# 1.umount /sys from chroot directory if they are mounted. 
+# 2.remove /dev/bus directory in chroot directory if it exists import argparse import ctypes import ctypes.util -import errno import fcntl import grp import xcp.logger as log # pytype: disable=import-error @@ -52,7 +46,7 @@ import os import pwd import re -from stat import S_ISCHR, S_ISBLK +import shutil def parse_arg(): @@ -64,8 +58,6 @@ def parse_arg(): attach.add_argument("device", help="the target usb device") attach.add_argument("-d", dest="domid", type=int, required=True, help="specify the domid of the VM") - attach.add_argument("-p", dest="pid", type=int, required=True, - help="the process id of QEMU") attach.add_argument("-r", dest="reset_only", action="store_true", help="reset device only, for privileged mode") @@ -85,56 +77,6 @@ def get_root_dir(domid): return "/var/xen/qemu/root-{}".format(domid) -def get_cg_dir(domid): - return "/sys/fs/cgroup/devices/qemu-{}".format(domid) - - -def get_ids_path(device): - usb_dir = "/var/run/nonpersistent/usb" - try: - os.makedirs(usb_dir) - except OSError as e: - if e.errno != errno.EEXIST: - raise - - return os.path.join(usb_dir, device) - - -def save_device_ids(device): - path = dev_path(device) - - try: - stat = os.stat(path) - ids_info = "{} {}".format(stat.st_uid, stat.st_gid) - except OSError as e: - log.error("Failed to stat {}: {}".format(path, str(e))) - exit(1) - - try: - with open(get_ids_path(device), "w") as f: - f.write(ids_info) - except IOError as e: - log.error("Failed to save device ids {}: {}".format(path, str(e))) - exit(1) - - -def load_device_ids(device): - ids_path = get_ids_path(device) - try: - with open(ids_path) as f: - uid, gid = list(map(int, f.readline().split())) - except (IOError, ValueError) as e: - log.error("Failed to load device ids: {}".format(str(e))) - - try: - os.remove(ids_path) - except OSError as e: - # ignore and continue - log.warning("Failed to remove device ids: {}".format(str(e))) - - return uid, gid # pyright: ignore[reportPossiblyUnboundVariable] # pragma: no cover - - # throw IOError, ValueError def read_int(path): with open(path) as f: @@ -157,109 +99,6 @@ def dev_path(device): exit(1) -def get_ctl(path, mode): # type:(str, str) -> str - """get the string to control device access for cgroup - :param path: the device file path - :param mode: either "r" or "rw" - :return: the string to control device access - """ - try: - st = os.stat(path) - except OSError as e: - log.error("Failed to get stat of {}: {}".format(path, str(e))) - raise - - t = "" - if S_ISBLK(st.st_mode): - t = "b" - elif S_ISCHR(st.st_mode): - t = "c" - if t and mode in ("r", "rw"): - return "{} {}:{} {}".format(t, os.major(st.st_rdev), os.minor( - st.st_rdev), mode) - raise RuntimeError("Failed to get control string of {}".format(path)) - - -def _device_ctl(path, domid, allow): - cg_dir = get_cg_dir(domid) - file_name = "/devices.allow" if allow else "/devices.deny" - try: - with open(cg_dir + file_name, "w") as f: - f.write(get_ctl(path, "rw")) - except (IOError, OSError, RuntimeError) as e: - log.error("Failed to {} {}: {}".format( - "allow" if allow else "deny", path, str(e))) - exit(1) - - -def allow_device(path, domid): - _device_ctl(path, domid, True) - - -def deny_device(path, domid): - _device_ctl(path, domid, False) - - -def setup_cgroup(domid, pid): # type:(str, str) -> None - """ - Associate the given process id (pid) with the given Linux kernel control group - and limit it's device access to only /dev/null. 
- - :param domid (str): The control group ID string (passed on from the command line) - :param pid (str): The process ID string (passed on from the command line) - - If the control group directory does not exist yet, the control group is created. - - - The pid goes into the file "tasks" to associate the process with the cgroup. - - Deny device access by default by writing "a" to devices.deny. - - Grant read-write access to /dev/null, writing it's device IDs to devices.allow. - - If any error occur during the setup process, the error is logged and - the program exits with a status code of 1. - """ - cg_dir = get_cg_dir(domid) - - try: - os.mkdir(cg_dir, 0o755) - except OSError as e: - if e.errno != errno.EEXIST: - log.error("Failed to create cgroup: {}".format(cg_dir)) - exit(1) - - try: - # unbuffered write to ensure each one is flushed immediately - # to the kernel's control group filesystem: - # - # The order of writes is likely not important, but the writes - # may have to be a single write() system call for the entire string. - # - # Using the unbuffered Raw IO mode, we know the write was done - # in exactly this way by the write function call itself, not later. - # - # With small writes like this , splitting them because of overflowing the - # buffer is not expected to happen. To stay safe and keep using unbuffered I/O - # We have to migrate to binary mode in python3,as python3 supports unbuffered - # raw I/O in binary mode. - # - with open(cg_dir + "/tasks", "wb", 0) as tasks, \ - open(cg_dir + "/devices.deny", "wb", 0) as deny, \ - open(cg_dir + "/devices.allow", "wb", 0) as allow: - - # deny all - deny.write(b"a") - - # To write bytes, we've to encode the strings to bytes below: - - # grant rw access to /dev/null by default - allow.write(get_ctl("/dev/null", "rw").encode()) - - tasks.write(str(pid).encode()) - - except (IOError, OSError, RuntimeError) as e: - log.error("Failed to setup cgroup: {}".format(str(e))) - exit(1) - - def mount(source, target, fs, flags=0): if ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True ).mount(source.encode(), target.encode(), fs.encode(), flags, None) < 0: @@ -277,7 +116,43 @@ def umount(target): format(target, os.strerror(ctypes.get_errno()))) -def attach(device, domid, pid, reset_only): +def clone_device(path, root_dir, domid): + """ + Clone the device file into the chroot directory. + + :param path: The source device file under system /dev to clone. + :param root_dir: The root directory of the chroot environment. + :param domid: The domain ID of the VM, used to set the device file's uid/gid. 
+ """ + target_path = os.path.join(root_dir, path.lstrip(os.path.sep)) + if os.path.exists(target_path): + log.info("Device file {} already exists in chroot".format(target_path)) + return + + os.makedirs(os.path.dirname(target_path), exist_ok=True, mode=0o755) + + try: + st = os.stat(path) + except OSError as e: + log.error("Failed to get stat of {}: {}".format(path, str(e))) + exit(1) + + mode = st.st_mode + major = os.major(st.st_rdev) + minor = os.minor(st.st_rdev) + clone_device_id = os.makedev(major, minor) + os.mknod(target_path, mode, clone_device_id) + + # set device file uid/gid + try: + os.chown(target_path, pwd.getpwnam("qemu_base").pw_uid + domid, + grp.getgrnam("qemu_base").gr_gid + domid) + except OSError as e: + log.error("Failed to chown device file {}: {}".format(path, str(e))) + exit(1) + + +def attach(device, domid, reset_only): path = dev_path(device) # reset device @@ -293,27 +168,13 @@ def attach(device, domid, pid, reset_only): if reset_only: return - save_device_ids(device) - - # set device file uid/gid - try: - os.chown(path, pwd.getpwnam("qemu_base").pw_uid + domid, - grp.getgrnam("qemu_base").gr_gid + domid) - except OSError as e: - log.error("Failed to chown device file {}: {}".format(path, str(e))) - exit(1) - root_dir = get_root_dir(domid) dev_dir = root_dir + "/dev" if not os.path.isdir(root_dir) or not os.path.isdir(dev_dir): log.error("Error: The chroot or dev directory doesn't exist") exit(1) - if not os.path.isdir(dev_dir + "/bus"): - # first USB device to pass-through - MS_BIND = 4096 # mount flags, from fs.h - mount("/dev", dev_dir, "", MS_BIND) - setup_cgroup(domid, pid) + clone_device(path, root_dir, domid) sys_dir = root_dir + "/sys" # sys_dir could already be mounted because of PCI pass-through @@ -326,41 +187,23 @@ def attach(device, domid, pid, reset_only): if not os.path.isdir(sys_dir + "/devices"): mount("/sys", sys_dir, "sysfs") - # add device to cgroup allow list - allow_device(path, domid) - def detach(device, domid): path = dev_path(device) - uid, gid = load_device_ids(device) - - # restore uid, gid of the device file. - try: - os.chown(path, uid, gid) - except OSError as e: - log.error("Failed to chown device file {}: {}".format(path, str(e))) - exit(1) - - # remove device from cgroup allow list - deny_device(path, domid) + root_dir = get_root_dir(domid) + target_path = os.path.join(root_dir, path.lstrip(os.path.sep)) + os.remove(target_path) def cleanup(domid): - # remove the cgroup if one has been created. - if os.path.isdir(get_cg_dir(domid)): - try: - os.rmdir(get_cg_dir(domid)) - except OSError as e: - # log and continue - log.error("Failed to remove cgroup qemu-{}: {}" - .format(domid, str(e))) - # umount /dev, /sys from chroot directory if they are mounted. 
root_dir = get_root_dir(domid) dev_dir = root_dir + "/dev" sys_dir = root_dir + "/sys" - if os.path.isdir(dev_dir + "/bus"): - umount(dev_dir) + bus_dir = dev_dir + "/bus" + if os.path.isdir(bus_dir): + log.info("Removing bus directory: {} for cleanup".format(bus_dir)) + shutil.rmtree(bus_dir) if os.path.isdir(sys_dir + "/devices"): umount(sys_dir) @@ -371,7 +214,7 @@ def cleanup(domid): arg = parse_arg() if "attach" == arg.command: - attach(arg.device, arg.domid, arg.pid, arg.reset_only) + attach(arg.device, arg.domid, arg.reset_only) elif "detach" == arg.command: detach(arg.device, arg.domid) elif "cleanup" == arg.command: From 56bd7c62c3ede7e063fc492a551e3c073877cb56 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Mon, 9 Jun 2025 10:20:57 +0100 Subject: [PATCH 077/111] CP-308455 VM.sysprep start with skeleton Add a new API call for VM sysprep and the corresponding XE implementation. This is mostly scaffolding. Signed-off-by: Christian Lindig --- ocaml/idl/datamodel_lifecycle.ml | 2 + ocaml/idl/datamodel_vm.ml | 11 +++++ ocaml/idl/schematest.ml | 2 +- ocaml/xapi-cli-server/cli_frontend.ml | 9 ++++ ocaml/xapi-cli-server/cli_operations.ml | 18 ++++++++ ocaml/xapi-cli-server/record_util.ml | 1 + ocaml/xapi/message_forwarding.ml | 11 +++++ ocaml/xapi/vm_sysprep.ml | 61 +++++++++++++++++++++++++ ocaml/xapi/vm_sysprep.mli | 15 ++++++ ocaml/xapi/xapi_vm.ml | 3 ++ ocaml/xapi/xapi_vm.mli | 2 + ocaml/xapi/xapi_vm_lifecycle.ml | 1 + 12 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 ocaml/xapi/vm_sysprep.ml create mode 100644 ocaml/xapi/vm_sysprep.mli diff --git a/ocaml/idl/datamodel_lifecycle.ml b/ocaml/idl/datamodel_lifecycle.ml index bbab96b8f0f..1973f0ed506 100644 --- a/ocaml/idl/datamodel_lifecycle.ml +++ b/ocaml/idl/datamodel_lifecycle.ml @@ -239,6 +239,8 @@ let prototyped_of_message = function Some "25.2.0" | "host", "set_numa_affinity_policy" -> Some "24.0.0" + | "VM", "sysprep" -> + Some "25.21.0-next" | "VM", "get_secureboot_readiness" -> Some "24.17.0" | "VM", "set_uefi_mode" -> diff --git a/ocaml/idl/datamodel_vm.ml b/ocaml/idl/datamodel_vm.ml index e72721b4ce0..5e4134afd0b 100644 --- a/ocaml/idl/datamodel_vm.ml +++ b/ocaml/idl/datamodel_vm.ml @@ -2211,6 +2211,7 @@ let operations = ; ("reverting", "Reverting the VM to a previous snapshotted state") ; ("destroy", "refers to the act of uninstalling the VM") ; ("create_vtpm", "Creating and adding a VTPM to this VM") + ; ("sysprep", "Performing a Windows sysprep on this VM") ] ) @@ -2369,6 +2370,15 @@ let restart_device_models = ~allowed_roles:(_R_VM_POWER_ADMIN ++ _R_CLIENT_CERT) () +let sysprep = + call ~name:"sysprep" ~lifecycle:[] + ~params: + [ + (Ref _vm, "self", "The VM") + ; (String, "unattend", "XML content passed to sysprep") + ] + ~doc:"Pass unattend.xml to Windows sysprep" ~allowed_roles:_R_VM_ADMIN () + let vm_uefi_mode = Enum ( "vm_uefi_mode" @@ -2571,6 +2581,7 @@ let t = ; set_blocked_operations ; add_to_blocked_operations ; remove_from_blocked_operations + ; sysprep ] ~contents: ([ diff --git a/ocaml/idl/schematest.ml b/ocaml/idl/schematest.ml index c8abcb1f999..7bd70cb3aa5 100644 --- a/ocaml/idl/schematest.ml +++ b/ocaml/idl/schematest.ml @@ -3,7 +3,7 @@ let hash x = Digest.string x |> Digest.to_hex (* BEWARE: if this changes, check that schema has been bumped accordingly in ocaml/idl/datamodel_common.ml, usually schema_minor_vsn *) -let last_known_schema_hash = "4cd835e2557dd7b5cbda6c681730c447" +let last_known_schema_hash = "9cd32d98d092440c36617546a3d995bd" let current_schema_hash : string = let 
open Datamodel_types in diff --git a/ocaml/xapi-cli-server/cli_frontend.ml b/ocaml/xapi-cli-server/cli_frontend.ml index b066d7a8dd5..255f2be789e 100644 --- a/ocaml/xapi-cli-server/cli_frontend.ml +++ b/ocaml/xapi-cli-server/cli_frontend.ml @@ -2764,6 +2764,15 @@ let rec cmdtable_data : (string * cmd_spec) list = ; flags= [] } ) + ; ( "vm-sysprep" + , { + reqd= ["filename"] + ; optn= [] + ; help= "Pass and execute sysprep configuration file" + ; implementation= With_fd Cli_operations.vm_sysprep + ; flags= [Vm_selectors] + } + ) ; ( "diagnostic-vm-status" , { reqd= ["uuid"] diff --git a/ocaml/xapi-cli-server/cli_operations.ml b/ocaml/xapi-cli-server/cli_operations.ml index 65c91a031fc..f51c50851d4 100644 --- a/ocaml/xapi-cli-server/cli_operations.ml +++ b/ocaml/xapi-cli-server/cli_operations.ml @@ -3588,6 +3588,24 @@ let vm_data_source_forget printer rpc session_id params = params ["data-source"] ) +let vm_sysprep fd printer rpc session_id params = + let filename = List.assoc "filename" params in + let unattend = + match get_client_file fd filename with + | Some xml -> + xml + | None -> + marshal fd (Command (PrintStderr "Failed to read file.\n")) ; + raise (ExitWithError 1) + in + ignore + (do_vm_op printer rpc session_id + (fun vm -> + Client.VM.sysprep ~rpc ~session_id ~self:(vm.getref ()) ~unattend + ) + params ["filename"] + ) + (* APIs to collect SR level RRDs *) let sr_data_source_list printer rpc session_id params = ignore diff --git a/ocaml/xapi-cli-server/record_util.ml b/ocaml/xapi-cli-server/record_util.ml index d28b6b5f763..a11b30decb3 100644 --- a/ocaml/xapi-cli-server/record_util.ml +++ b/ocaml/xapi-cli-server/record_util.ml @@ -75,6 +75,7 @@ let vm_operation_table = ; (`csvm, "csvm") ; (`call_plugin, "call_plugin") ; (`create_vtpm, "create_vtpm") + ; (`sysprep, "sysprep") ] (* Intentional shadowing - data_souces_op, assertoperationinvalid, diff --git a/ocaml/xapi/message_forwarding.ml b/ocaml/xapi/message_forwarding.ml index b52aaaa20ec..15b984ad993 100644 --- a/ocaml/xapi/message_forwarding.ml +++ b/ocaml/xapi/message_forwarding.ml @@ -3115,6 +3115,17 @@ functor (vm_uuid ~__context self) ; Local.VM.remove_from_blocked_operations ~__context ~self ~key ; Xapi_vm_lifecycle.update_allowed_operations ~__context ~self + + let sysprep ~__context ~self ~unattend = + info "VM.sysprep: self = '%s'" (vm_uuid ~__context self) ; + let local_fn = Local.VM.sysprep ~self ~unattend in + let remote_fn = Client.VM.sysprep ~self ~unattend in + let policy = Helpers.Policy.fail_immediately in + with_vm_operation ~__context ~self ~doc:"VM.sysprep" ~op:`sysprep + ~policy (fun () -> + forward_vm_op ~local_fn ~__context ~vm:self ~remote_fn + ) ; + Xapi_vm_lifecycle.update_allowed_operations ~__context ~self end module VM_metrics = struct end diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml new file mode 100644 index 00000000000..28be828daab --- /dev/null +++ b/ocaml/xapi/vm_sysprep.ml @@ -0,0 +1,61 @@ +(* + * Copyright (c) Cloud Software Group, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ *) + +module D = Debug.Make (struct let name = __MODULE__ end) + +open D +open Xapi_stdext_unix + +let ( // ) = Filename.concat + +let finally = Xapi_stdext_pervasives.Pervasiveext.finally + +let tmp_dir = Filename.get_temp_dir_name () + +let sr_dir = "/opt/opt/iso" + +let genisoimage = "/usr/bin/genisoimage" + +(** name of the ISO we will use for a VMi; this is not a path *) +let iso_name ~vm_uuid = + let now = Ptime_clock.now () |> Ptime.to_rfc3339 in + Printf.sprintf "config-%s-%s.iso" vm_uuid now + +(** taken from OCaml 5 stdlib *) +let temp_dir ?(dir = tmp_dir) ?(perms = 0o700) prefix suffix = + let rec try_name counter = + let name = Filename.temp_file ~temp_dir:dir prefix suffix in + try Sys.mkdir name perms ; name + with Sys_error _ as e -> + if counter >= 20 then raise e else try_name (counter + 1) + in + try_name 0 + +(** Crteate a temporary directory, and pass its path to [f]. Once [f] + returns the directory is removed again *) +let with_temp_dir ?(dir = tmp_dir) ?(perms = 0o700) prefix suffix f = + let dir = temp_dir ~dir ~perms prefix suffix in + finally (fun () -> f dir) (fun () -> Unixext.rm_rec dir) + +let make_iso ~vm_uuid ~unattend = + try + let _iso = sr_dir // iso_name ~vm_uuid in + Xapi_stdext_unix.Unixext.mkdir_rec sr_dir 0o755 + (* Unixext.write_string_to_file path unattend *) + with e -> + let msg = Printexc.to_string e in + Helpers.internal_error "%s failed: %s" __FUNCTION__ msg + +(* This function is executed on the host where [vm] is running *) +let sysprep ~__context ~vm ~unattend = debug "%s" __FUNCTION__ diff --git a/ocaml/xapi/vm_sysprep.mli b/ocaml/xapi/vm_sysprep.mli new file mode 100644 index 00000000000..9f6f9ab9724 --- /dev/null +++ b/ocaml/xapi/vm_sysprep.mli @@ -0,0 +1,15 @@ +(* + * Copyright (c) Cloud Software Group, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ *) + +val sysprep : __context:Context.t -> vm:API.ref_VM -> unattend:string -> unit diff --git a/ocaml/xapi/xapi_vm.ml b/ocaml/xapi/xapi_vm.ml index 8a1ca5e493a..f76c632665f 100644 --- a/ocaml/xapi/xapi_vm.ml +++ b/ocaml/xapi/xapi_vm.ml @@ -1701,3 +1701,6 @@ let get_secureboot_readiness ~__context ~self = ) ) ) + +let sysprep ~__context ~self ~unattend = + Vm_sysprep.sysprep ~__context ~vm:self ~unattend diff --git a/ocaml/xapi/xapi_vm.mli b/ocaml/xapi/xapi_vm.mli index 363e68b03d1..005b4cae4ae 100644 --- a/ocaml/xapi/xapi_vm.mli +++ b/ocaml/xapi/xapi_vm.mli @@ -450,3 +450,5 @@ val add_to_blocked_operations : val remove_from_blocked_operations : __context:Context.t -> self:API.ref_VM -> key:API.vm_operations -> unit + +val sysprep : __context:Context.t -> self:API.ref_VM -> unattend:string -> unit diff --git a/ocaml/xapi/xapi_vm_lifecycle.ml b/ocaml/xapi/xapi_vm_lifecycle.ml index 5ec4ca6d792..50d7fcddb58 100644 --- a/ocaml/xapi/xapi_vm_lifecycle.ml +++ b/ocaml/xapi/xapi_vm_lifecycle.ml @@ -61,6 +61,7 @@ let allowed_power_states ~__context ~vmr ~(op : API.vm_operations) = | `send_sysrq | `send_trigger | `snapshot_with_quiesce + | `sysprep | `suspend -> [`Running] | `changing_dynamic_range -> From 8d35ebc9b61aefe4fe9f7398ac86957eb24f9ccc Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 11 Jun 2025 14:31:42 +0100 Subject: [PATCH 078/111] CP-308455 VM.sysprep implement mkdtemp We want to create a temporary directory that will be used to hold files for creating an ISO. There is no existing function that creates all necessary directories in a predicatble way. Signed-off-by: Christian Lindig --- ocaml/idl/datamodel_lifecycle.ml | 2 +- ocaml/xapi/vm_sysprep.ml | 63 +++++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/ocaml/idl/datamodel_lifecycle.ml b/ocaml/idl/datamodel_lifecycle.ml index 1973f0ed506..ef79f8aec15 100644 --- a/ocaml/idl/datamodel_lifecycle.ml +++ b/ocaml/idl/datamodel_lifecycle.ml @@ -240,7 +240,7 @@ let prototyped_of_message = function | "host", "set_numa_affinity_policy" -> Some "24.0.0" | "VM", "sysprep" -> - Some "25.21.0-next" + Some "25.22.0" | "VM", "get_secureboot_readiness" -> Some "24.17.0" | "VM", "set_uefi_mode" -> diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 28be828daab..03a8e456af4 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -21,41 +21,68 @@ let ( // ) = Filename.concat let finally = Xapi_stdext_pervasives.Pervasiveext.finally -let tmp_dir = Filename.get_temp_dir_name () +let temp_dir = Filename.get_temp_dir_name () let sr_dir = "/opt/opt/iso" let genisoimage = "/usr/bin/genisoimage" -(** name of the ISO we will use for a VMi; this is not a path *) -let iso_name ~vm_uuid = - let now = Ptime_clock.now () |> Ptime.to_rfc3339 in - Printf.sprintf "config-%s-%s.iso" vm_uuid now +let failwith_fmt fmt = Printf.ksprintf failwith fmt + +let prng = Random.State.make_self_init () + +let temp_name prefix suffix = + let rnd = Random.State.bits prng land 0xFFFFFF in + Printf.sprintf "%s%06x%s" prefix rnd suffix -(** taken from OCaml 5 stdlib *) -let temp_dir ?(dir = tmp_dir) ?(perms = 0o700) prefix suffix = - let rec try_name counter = - let name = Filename.temp_file ~temp_dir:dir prefix suffix in - try Sys.mkdir name perms ; name - with Sys_error _ as e -> - if counter >= 20 then raise e else try_name (counter + 1) +(** [mkdtmp] creates a directory in [dir] and returns its path. If [dir] + does not yet exist it is created. It is a an error if [dir] is not a + directory. 
*) +let mkdtemp ?(dir = temp_dir) ?(perms = 0o700) prefix suffix = + ( match Sys.file_exists dir with + | true when not (Sys.is_directory dir) -> + failwith_fmt "s: %s is not a directory" __FUNCTION__ dir + | true -> + () + | false -> + Unixext.mkdir_rec dir perms + ) ; + let rec loop = function + | n when n >= 20 -> + failwith_fmt "s: can't create directory in %s" __FUNCTION__ dir + | n -> ( + let path = Filename.concat dir (temp_name prefix suffix) in + try Sys.mkdir path perms ; path with Sys_error _ -> loop (n + 1) + ) in - try_name 0 + loop 0 (** Crteate a temporary directory, and pass its path to [f]. Once [f] returns the directory is removed again *) -let with_temp_dir ?(dir = tmp_dir) ?(perms = 0o700) prefix suffix f = - let dir = temp_dir ~dir ~perms prefix suffix in +let with_temp_dir ?(dir = temp_dir) ?(perms = 0o700) prefix suffix f = + let dir = mkdtemp ~dir ~perms prefix suffix in finally (fun () -> f dir) (fun () -> Unixext.rm_rec dir) +(** name of the ISO we will use for a VMi; this is not a path *) +let iso_name ~vm_uuid = + let now = Ptime_clock.now () |> Ptime.to_rfc3339 in + Printf.sprintf "config-%s-%s.iso" vm_uuid now + let make_iso ~vm_uuid ~unattend = try let _iso = sr_dir // iso_name ~vm_uuid in - Xapi_stdext_unix.Unixext.mkdir_rec sr_dir 0o755 - (* Unixext.write_string_to_file path unattend *) + Xapi_stdext_unix.Unixext.mkdir_rec sr_dir 0o755 ; + with_temp_dir ~dir:"/var/tmp/xapi" "sysprep-" "-iso" @@ fun temp_dir -> + debug "%s: %s = %b" __FUNCTION__ temp_dir (Sys.file_exists temp_dir) ; + let path = temp_dir // "unattend.xml" in + Unixext.write_string_to_file path unattend ; + debug "%s: written to %s" __FUNCTION__ path with e -> let msg = Printexc.to_string e in Helpers.internal_error "%s failed: %s" __FUNCTION__ msg (* This function is executed on the host where [vm] is running *) -let sysprep ~__context ~vm ~unattend = debug "%s" __FUNCTION__ +let sysprep ~__context ~vm ~unattend = + debug "%s" __FUNCTION__ ; + let vm_uuid = Db.VM.get_uuid ~__context ~self:vm in + make_iso ~vm_uuid ~unattend From c6bee52b7d37f89c3894a44dafed082e5fc0b11b Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Thu, 12 Jun 2025 13:19:26 +0100 Subject: [PATCH 079/111] CP-308455 VM.sysprep make iso Implement creating an ISO from a temporary directory Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 03a8e456af4..16764b824a3 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -36,8 +36,8 @@ let temp_name prefix suffix = Printf.sprintf "%s%06x%s" prefix rnd suffix (** [mkdtmp] creates a directory in [dir] and returns its path. If [dir] - does not yet exist it is created. It is a an error if [dir] is not a - directory. *) + does not yet exist it is created. It is a an error if [dir] exists + and is not a directory. *) let mkdtemp ?(dir = temp_dir) ?(perms = 0o700) prefix suffix = ( match Sys.file_exists dir with | true when not (Sys.is_directory dir) -> @@ -68,15 +68,18 @@ let iso_name ~vm_uuid = let now = Ptime_clock.now () |> Ptime.to_rfc3339 in Printf.sprintf "config-%s-%s.iso" vm_uuid now +(** Create an ISO in [sr_dir] with content [unattend]. [sr_dir] is + created if it not already exists. 
*) let make_iso ~vm_uuid ~unattend = try - let _iso = sr_dir // iso_name ~vm_uuid in + let iso = sr_dir // iso_name ~vm_uuid in Xapi_stdext_unix.Unixext.mkdir_rec sr_dir 0o755 ; with_temp_dir ~dir:"/var/tmp/xapi" "sysprep-" "-iso" @@ fun temp_dir -> - debug "%s: %s = %b" __FUNCTION__ temp_dir (Sys.file_exists temp_dir) ; let path = temp_dir // "unattend.xml" in Unixext.write_string_to_file path unattend ; - debug "%s: written to %s" __FUNCTION__ path + debug "%s: written to %s" __FUNCTION__ path ; + let args = ["-r"; "-J"; "-o"; iso; temp_dir] in + Forkhelpers.execute_command_get_output genisoimage args |> ignore with e -> let msg = Printexc.to_string e in Helpers.internal_error "%s failed: %s" __FUNCTION__ msg From 0e6f6ee78b759a7980bdc6297671debc01e6fbad Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Thu, 12 Jun 2025 13:59:23 +0100 Subject: [PATCH 080/111] CP-308455 VM.sysprep Add logging Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 16764b824a3..dd6c5906dc3 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -23,7 +23,7 @@ let finally = Xapi_stdext_pervasives.Pervasiveext.finally let temp_dir = Filename.get_temp_dir_name () -let sr_dir = "/opt/opt/iso" +let sr_dir = "/var/opt/iso" let genisoimage = "/usr/bin/genisoimage" @@ -69,17 +69,19 @@ let iso_name ~vm_uuid = Printf.sprintf "config-%s-%s.iso" vm_uuid now (** Create an ISO in [sr_dir] with content [unattend]. [sr_dir] is - created if it not already exists. *) + created if it not already exists. Returns the path of the ISO image *) let make_iso ~vm_uuid ~unattend = try let iso = sr_dir // iso_name ~vm_uuid in Xapi_stdext_unix.Unixext.mkdir_rec sr_dir 0o755 ; - with_temp_dir ~dir:"/var/tmp/xapi" "sysprep-" "-iso" @@ fun temp_dir -> - let path = temp_dir // "unattend.xml" in - Unixext.write_string_to_file path unattend ; - debug "%s: written to %s" __FUNCTION__ path ; - let args = ["-r"; "-J"; "-o"; iso; temp_dir] in - Forkhelpers.execute_command_get_output genisoimage args |> ignore + with_temp_dir ~dir:"/var/tmp/xapi" "sysprep-" "-iso" (fun temp_dir -> + let path = temp_dir // "unattend.xml" in + Unixext.write_string_to_file path unattend ; + debug "%s: written to %s" __FUNCTION__ path ; + let args = ["-r"; "-J"; "-o"; iso; temp_dir] in + Forkhelpers.execute_command_get_output genisoimage args |> ignore ; + iso + ) with e -> let msg = Printexc.to_string e in Helpers.internal_error "%s failed: %s" __FUNCTION__ msg @@ -88,4 +90,5 @@ let make_iso ~vm_uuid ~unattend = let sysprep ~__context ~vm ~unattend = debug "%s" __FUNCTION__ ; let vm_uuid = Db.VM.get_uuid ~__context ~self:vm in - make_iso ~vm_uuid ~unattend + let iso = make_iso ~vm_uuid ~unattend in + debug "%s: created %s" __FUNCTION__ iso From b88c199d419dc14ee1f805618fbfc80eec38de37 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Thu, 12 Jun 2025 16:27:58 +0100 Subject: [PATCH 081/111] CP-308455 VM.sysprep Add SR creation Create a local SR unless it exists. 
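As a rough illustration only: once this code has run, the local SR should be
visible on the host along these lines (the SYSPREP-<hostname> name-label comes
from this change; the exact field list shown is just an example):

    xe sr-list name-label=SYSPREP-<hostname> params=uuid,type,content-type,shared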
Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 43 ++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index dd6c5906dc3..87f2689e0dd 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -23,14 +23,18 @@ let finally = Xapi_stdext_pervasives.Pervasiveext.finally let temp_dir = Filename.get_temp_dir_name () -let sr_dir = "/var/opt/iso" - let genisoimage = "/usr/bin/genisoimage" let failwith_fmt fmt = Printf.ksprintf failwith fmt let prng = Random.State.make_self_init () +module SR = struct + let dir = "/var/opt/iso" + + let name hostname = Printf.sprintf "SYSPREP-%s" hostname +end + let temp_name prefix suffix = let rnd = Random.State.bits prng land 0xFFFFFF in Printf.sprintf "%s%06x%s" prefix rnd suffix @@ -64,16 +68,16 @@ let with_temp_dir ?(dir = temp_dir) ?(perms = 0o700) prefix suffix f = finally (fun () -> f dir) (fun () -> Unixext.rm_rec dir) (** name of the ISO we will use for a VMi; this is not a path *) -let iso_name ~vm_uuid = +let iso_basename ~vm_uuid = let now = Ptime_clock.now () |> Ptime.to_rfc3339 in - Printf.sprintf "config-%s-%s.iso" vm_uuid now + Printf.sprintf "sysprep-%s-%s.iso" vm_uuid now -(** Create an ISO in [sr_dir] with content [unattend]. [sr_dir] is +(** Create an ISO in [SR.dir] with content [unattend]. [SR.dir] is created if it not already exists. Returns the path of the ISO image *) let make_iso ~vm_uuid ~unattend = try - let iso = sr_dir // iso_name ~vm_uuid in - Xapi_stdext_unix.Unixext.mkdir_rec sr_dir 0o755 ; + let iso = SR.dir // iso_basename ~vm_uuid in + Xapi_stdext_unix.Unixext.mkdir_rec SR.dir 0o755 ; with_temp_dir ~dir:"/var/tmp/xapi" "sysprep-" "-iso" (fun temp_dir -> let path = temp_dir // "unattend.xml" in Unixext.write_string_to_file path unattend ; @@ -86,9 +90,32 @@ let make_iso ~vm_uuid ~unattend = let msg = Printexc.to_string e in Helpers.internal_error "%s failed: %s" __FUNCTION__ msg +(** create a local ISO SR when necessary and update it such that it + recognises any ISO we added or removed *) +let update_sr ~__context = + let host = Helpers.get_localhost ~__context in + let hostname = Db.Host.get_hostname ~__context ~self:host in + let label = SR.name hostname in + let mib n = Int64.(n * 1024 * 1024 |> of_int) in + let sr = + match Db.SR.get_by_name_label ~__context ~label with + | [sr] -> + sr + | sr :: _ -> + warn "%s: more than one SR with label %s" __FUNCTION__ label ; + sr + | [] -> + let device_config = [("location", SR.dir); ("legcay_mode", "true")] in + Xapi_sr.create ~__context ~host ~name_label:label ~device_config + ~content_type:"iso" ~_type:"iso" ~name_description:"Sysprep ISOs" + ~shared:false ~sm_config:[] ~physical_size:(mib 512) + in + Xapi_sr.scan ~__context ~sr + (* This function is executed on the host where [vm] is running *) let sysprep ~__context ~vm ~unattend = debug "%s" __FUNCTION__ ; let vm_uuid = Db.VM.get_uuid ~__context ~self:vm in let iso = make_iso ~vm_uuid ~unattend in - debug "%s: created %s" __FUNCTION__ iso + debug "%s: created ISO %s" __FUNCTION__ iso ; + update_sr ~__context From 2061e2e6f22c89fdf7012878dbc477facb8acc8b Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Mon, 16 Jun 2025 13:57:01 +0100 Subject: [PATCH 082/111] CP-308455 VM.sysprep Find VBD for VM's CDR We need to locate the CD drive of the VM. 
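The VBD being looked for is the VM's empty CD drive; as an illustrative check
only (field names as exposed by xe, exact output not guaranteed):

    xe vbd-list vm-uuid=<vm-uuid> type=CD empty=true params=uuid,device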
Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 56 +++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 87f2689e0dd..84b1336827d 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -21,8 +21,6 @@ let ( // ) = Filename.concat let finally = Xapi_stdext_pervasives.Pervasiveext.finally -let temp_dir = Filename.get_temp_dir_name () - let genisoimage = "/usr/bin/genisoimage" let failwith_fmt fmt = Printf.ksprintf failwith fmt @@ -35,10 +33,13 @@ module SR = struct let name hostname = Printf.sprintf "SYSPREP-%s" hostname end +(** create a name with a random infix *) let temp_name prefix suffix = let rnd = Random.State.bits prng land 0xFFFFFF in Printf.sprintf "%s%06x%s" prefix rnd suffix +let temp_dir = Filename.get_temp_dir_name () + (** [mkdtmp] creates a directory in [dir] and returns its path. If [dir] does not yet exist it is created. It is a an error if [dir] exists and is not a directory. *) @@ -67,7 +68,7 @@ let with_temp_dir ?(dir = temp_dir) ?(perms = 0o700) prefix suffix f = let dir = mkdtemp ~dir ~perms prefix suffix in finally (fun () -> f dir) (fun () -> Unixext.rm_rec dir) -(** name of the ISO we will use for a VMi; this is not a path *) +(** name of the ISO we will use for a VM; this is not a path *) let iso_basename ~vm_uuid = let now = Ptime_clock.now () |> Ptime.to_rfc3339 in Printf.sprintf "sysprep-%s-%s.iso" vm_uuid now @@ -76,7 +77,8 @@ let iso_basename ~vm_uuid = created if it not already exists. Returns the path of the ISO image *) let make_iso ~vm_uuid ~unattend = try - let iso = SR.dir // iso_basename ~vm_uuid in + let basename = iso_basename ~vm_uuid in + let iso = SR.dir // basename in Xapi_stdext_unix.Unixext.mkdir_rec SR.dir 0o755 ; with_temp_dir ~dir:"/var/tmp/xapi" "sysprep-" "-iso" (fun temp_dir -> let path = temp_dir // "unattend.xml" in @@ -84,7 +86,7 @@ let make_iso ~vm_uuid ~unattend = debug "%s: written to %s" __FUNCTION__ path ; let args = ["-r"; "-J"; "-o"; iso; temp_dir] in Forkhelpers.execute_command_get_output genisoimage args |> ignore ; - iso + (iso, basename) ) with e -> let msg = Printexc.to_string e in @@ -105,17 +107,53 @@ let update_sr ~__context = warn "%s: more than one SR with label %s" __FUNCTION__ label ; sr | [] -> - let device_config = [("location", SR.dir); ("legcay_mode", "true")] in + let device_config = [("location", SR.dir); ("legacy_mode", "true")] in Xapi_sr.create ~__context ~host ~name_label:label ~device_config ~content_type:"iso" ~_type:"iso" ~name_description:"Sysprep ISOs" ~shared:false ~sm_config:[] ~physical_size:(mib 512) in - Xapi_sr.scan ~__context ~sr + Xapi_sr.scan ~__context ~sr ; + sr + +let find_cdr_vbd ~__context ~vm = + let vbds = Db.VM.get_VBDs ~__context ~self:vm in + let vbds' = + List.map (fun self -> (self, Db.VBD.get_record ~__context ~self)) vbds + in + let is_cd (_rf, rc) = + let open API in + rc.vBD_type = `CD && rc.vBD_empty + in + let uuid = Db.VM.get_uuid ~__context ~self:vm in + match List.filter is_cd vbds' with + | [] -> + failwith_fmt "%s: can't find CDR for VM %s" __FUNCTION__ uuid + | [(rf, rc)] -> + debug "%s: for VM %s using VBD %s" __FUNCTION__ uuid rc.API.vBD_uuid ; + rf + | (rf, rc) :: _ -> + debug "%s: for VM %s using VBD %s" __FUNCTION__ uuid rc.API.vBD_uuid ; + warn "%s: for VM %s found additions VBDs" __FUNCTION__ uuid ; + rf + +let find_vdi ~__context ~label = + match Db.VDI.get_by_name_label ~__context ~label with + | [] -> + failwith_fmt "%s: 
can't find VDI for %s" __FUNCTION__ label + | [vdi] -> + vdi + | vdi :: _ -> + warn "%s: more than one VDI with label %s" __FUNCTION__ label ; + vdi (* This function is executed on the host where [vm] is running *) let sysprep ~__context ~vm ~unattend = debug "%s" __FUNCTION__ ; let vm_uuid = Db.VM.get_uuid ~__context ~self:vm in - let iso = make_iso ~vm_uuid ~unattend in + let iso, label = make_iso ~vm_uuid ~unattend in debug "%s: created ISO %s" __FUNCTION__ iso ; - update_sr ~__context + let _sr = update_sr ~__context in + let vbd = find_cdr_vbd ~__context ~vm in + let vdi = find_vdi ~__context ~label in + debug "%s: inserting Syspep VDI for VM %s" __FUNCTION__ vm_uuid ; + Xapi_vbd.insert ~__context ~vdi ~vbd From b22f441be1dfa0706eec4895e2947deff3a859fc Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Tue, 17 Jun 2025 11:03:18 +0100 Subject: [PATCH 083/111] CP-308455 VM.sysprep Implement trigger The VM is notified to perform a sysprep by writing to XenStore. The VM picks this up via its guest agent. Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 84b1336827d..ae3acbb030e 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -115,6 +115,7 @@ let update_sr ~__context = Xapi_sr.scan ~__context ~sr ; sr +(** Find the VBD for the CD drive on [vm] *) let find_cdr_vbd ~__context ~vm = let vbds = Db.VM.get_VBDs ~__context ~self:vm in let vbds' = @@ -136,6 +137,8 @@ let find_cdr_vbd ~__context ~vm = warn "%s: for VM %s found additions VBDs" __FUNCTION__ uuid ; rf +(** Find the VDI that contains the unattend.xml based on its name. This + should be unique *) let find_vdi ~__context ~label = match Db.VDI.get_by_name_label ~__context ~label with | [] -> @@ -146,14 +149,29 @@ let find_vdi ~__context ~label = warn "%s: more than one VDI with label %s" __FUNCTION__ label ; vdi +let trigger ~domid = + let open Ezxenstore_core.Xenstore in + let control = Printf.sprintf "/local/domain/%Ld/control/sysprep" domid in + with_xs (fun xs -> + xs.Xs.write (control // "filename") "D://unattend.xml" ; + Thread.delay 5.0 ; + xs.Xs.write (control // "action") "sysprep" + ) ; + debug "%s: notified domain %Ld" __FUNCTION__ domid + (* This function is executed on the host where [vm] is running *) let sysprep ~__context ~vm ~unattend = debug "%s" __FUNCTION__ ; let vm_uuid = Db.VM.get_uuid ~__context ~self:vm in + let domid = Db.VM.get_domid ~__context ~self:vm in + if domid <= 0L then + failwith_fmt "%s: VM %s does not have a domain" __FUNCTION__ vm_uuid ; let iso, label = make_iso ~vm_uuid ~unattend in debug "%s: created ISO %s" __FUNCTION__ iso ; let _sr = update_sr ~__context in let vbd = find_cdr_vbd ~__context ~vm in let vdi = find_vdi ~__context ~label in - debug "%s: inserting Syspep VDI for VM %s" __FUNCTION__ vm_uuid ; - Xapi_vbd.insert ~__context ~vdi ~vbd + debug "%s: inserting Sysppep VDI for VM %s" __FUNCTION__ vm_uuid ; + Xapi_vbd.insert ~__context ~vdi ~vbd ; + Thread.delay 5.0 ; + trigger ~domid From 84e5a47de81be1c3e564752144536530a428c2f1 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 18 Jun 2025 10:28:32 +0100 Subject: [PATCH 084/111] CP-308455 VM.sysprep log sysprep status Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index ae3acbb030e..69459107ced 100644 
--- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -149,15 +149,19 @@ let find_vdi ~__context ~label = warn "%s: more than one VDI with label %s" __FUNCTION__ label ; vdi +(** notify the VM with [domid] to run sysprep and where to find the + file. *) let trigger ~domid = let open Ezxenstore_core.Xenstore in let control = Printf.sprintf "/local/domain/%Ld/control/sysprep" domid in - with_xs (fun xs -> - xs.Xs.write (control // "filename") "D://unattend.xml" ; - Thread.delay 5.0 ; - xs.Xs.write (control // "action") "sysprep" - ) ; - debug "%s: notified domain %Ld" __FUNCTION__ domid + with_xs @@ fun xs -> + xs.Xs.write (control // "filename") "D://unattend.xml" ; + Thread.delay 5.0 ; + xs.Xs.write (control // "action") "sysprep" ; + debug "%s: notified domain %Ld" __FUNCTION__ domid ; + Thread.delay 5.0 ; + let action = xs.Xs.read (control // "action") in + debug "%s: sysprep for domain %Ld reports %S" __FUNCTION__ domid action (* This function is executed on the host where [vm] is running *) let sysprep ~__context ~vm ~unattend = From 637267205c3f7b9c7ab7c0b73c40ce95049fe92f Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 18 Jun 2025 13:33:11 +0100 Subject: [PATCH 085/111] CP-308455 VM.sysprep watch execution, clean up Introduce on_startup(), called on xapi startup, to clean up the local SR to avoid accumulating ISO files. Signed-off-by: Christian Lindig --- ocaml/idl/datamodel_errors.ml | 3 + ocaml/xapi-consts/api_errors.ml | 6 ++ ocaml/xapi/vm_sysprep.ml | 102 +++++++++++++++++++++++++------- ocaml/xapi/vm_sysprep.mli | 12 +++- ocaml/xapi/xapi.ml | 4 ++ ocaml/xapi/xapi_vm.ml | 13 +++- 6 files changed, 117 insertions(+), 23 deletions(-) diff --git a/ocaml/idl/datamodel_errors.ml b/ocaml/idl/datamodel_errors.ml index 30e185ac192..acd470a6f46 100644 --- a/ocaml/idl/datamodel_errors.ml +++ b/ocaml/idl/datamodel_errors.ml @@ -2048,6 +2048,9 @@ let _ = enable it in XC or run xe pool-enable-tls-verification instead." () ; + error Api_errors.sysprep ["vm"; "message"] + ~doc:"VM.sysprep error with details in the message" () ; + message (fst Api_messages.ha_pool_overcommitted) ~doc: diff --git a/ocaml/xapi-consts/api_errors.ml b/ocaml/xapi-consts/api_errors.ml index 077a8dacbf9..210bebe1b2a 100644 --- a/ocaml/xapi-consts/api_errors.ml +++ b/ocaml/xapi-consts/api_errors.ml @@ -1429,3 +1429,9 @@ let host_driver_no_hardware = add_error "HOST_DRIVER_NO_HARDWARE" let tls_verification_not_enabled_in_pool = add_error "TLS_VERIFICATION_NOT_ENABLED_IN_POOL" + +(* VM.sysprep *) + +(* Using a single error during development, might want to expand this + later *) +let sysprep = add_error "SYSPREP" diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 69459107ced..52f16473633 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -31,8 +31,36 @@ module SR = struct let dir = "/var/opt/iso" let name hostname = Printf.sprintf "SYSPREP-%s" hostname + + let find_opt ~__context ~label = + match Db.SR.get_by_name_label ~__context ~label with + | [sr] -> + Some sr + | sr :: _ -> + warn "%s: more than one SR with label %s" __FUNCTION__ label ; + Some sr + | [] -> + None end +(** This is called on xapi startup. Opportunity to set up or clean up. + We destroy all VDIs that are unused. 
*) +let on_startup ~__context = + let host = Helpers.get_localhost ~__context in + let hostname = Db.Host.get_hostname ~__context ~self:host in + match SR.find_opt ~__context ~label:(SR.name hostname) with + | None -> + () + | Some sr -> ( + Db.SR.get_VDIs ~__context ~self:sr + |> List.iter @@ fun self -> + match Db.VDI.get_record ~__context ~self with + | API.{vDI_VBDs= []; vDI_location= _location; _} -> + Xapi_vdi.destroy ~__context ~self + | _ -> + () + ) + (** create a name with a random infix *) let temp_name prefix suffix = let rnd = Random.State.bits prng land 0xFFFFFF in @@ -52,15 +80,16 @@ let mkdtemp ?(dir = temp_dir) ?(perms = 0o700) prefix suffix = | false -> Unixext.mkdir_rec dir perms ) ; - let rec loop = function - | n when n >= 20 -> + let rec try_upto = function + | n when n < 0 -> failwith_fmt "s: can't create directory in %s" __FUNCTION__ dir | n -> ( let path = Filename.concat dir (temp_name prefix suffix) in - try Sys.mkdir path perms ; path with Sys_error _ -> loop (n + 1) + try Sys.mkdir path perms ; path + with Sys_error _ -> try_upto (n - 1) ) in - loop 0 + try_upto 20 (** Crteate a temporary directory, and pass its path to [f]. Once [f] returns the directory is removed again *) @@ -100,13 +129,10 @@ let update_sr ~__context = let label = SR.name hostname in let mib n = Int64.(n * 1024 * 1024 |> of_int) in let sr = - match Db.SR.get_by_name_label ~__context ~label with - | [sr] -> - sr - | sr :: _ -> - warn "%s: more than one SR with label %s" __FUNCTION__ label ; + match SR.find_opt ~__context ~label with + | Some sr -> sr - | [] -> + | None -> let device_config = [("location", SR.dir); ("legacy_mode", "true")] in Xapi_sr.create ~__context ~host ~name_label:label ~device_config ~content_type:"iso" ~_type:"iso" ~name_description:"Sysprep ISOs" @@ -154,28 +180,62 @@ let find_vdi ~__context ~label = let trigger ~domid = let open Ezxenstore_core.Xenstore in let control = Printf.sprintf "/local/domain/%Ld/control/sysprep" domid in - with_xs @@ fun xs -> - xs.Xs.write (control // "filename") "D://unattend.xml" ; - Thread.delay 5.0 ; - xs.Xs.write (control // "action") "sysprep" ; - debug "%s: notified domain %Ld" __FUNCTION__ domid ; - Thread.delay 5.0 ; - let action = xs.Xs.read (control // "action") in - debug "%s: sysprep for domain %Ld reports %S" __FUNCTION__ domid action + with_xs (fun xs -> + xs.Xs.write (control // "filename") "D://unattend.xml" ; + Thread.delay 5.0 ; + xs.Xs.write (control // "action") "sysprep" ; + debug "%s: notified domain %Ld" __FUNCTION__ domid ; + let rec wait n = + match (n, xs.Xs.read (control // "action")) with + | _, "running" -> + "running" + | n, action when n < 0 -> + action + | _, _ -> + Thread.delay 1.0 ; + wait (n - 1) + in + (* wait up to 5 iterations for runnung to appear or report whatever + is the status at the end *) + wait 5 + ) (* This function is executed on the host where [vm] is running *) let sysprep ~__context ~vm ~unattend = + let open Ezxenstore_core.Xenstore in debug "%s" __FUNCTION__ ; let vm_uuid = Db.VM.get_uuid ~__context ~self:vm in let domid = Db.VM.get_domid ~__context ~self:vm in + let control = Printf.sprintf "/local/domain/%Ld/control" domid in if domid <= 0L then - failwith_fmt "%s: VM %s does not have a domain" __FUNCTION__ vm_uuid ; + failwith_fmt "%s: VM %s is not running" __FUNCTION__ vm_uuid ; + if String.length unattend > 32 * 1024 then + fail "%s: provided file for %s larger than 32KiB" __FUNCTION__ vm_uuid ; + with_xs (fun xs -> + match xs.Xs.read (control // "feature-sysprep") with + | "1" -> + 
debug "%s: VM %s supports sysprep" __FUNCTION__ vm_uuid + | _ -> + fail "VM %s does not support sysprep" vm_uuid + | exception _ -> + fail "VM %s does not support sysprep" vm_uuid + ) ; let iso, label = make_iso ~vm_uuid ~unattend in debug "%s: created ISO %s" __FUNCTION__ iso ; let _sr = update_sr ~__context in let vbd = find_cdr_vbd ~__context ~vm in let vdi = find_vdi ~__context ~label in - debug "%s: inserting Sysppep VDI for VM %s" __FUNCTION__ vm_uuid ; + debug "%s: inserting Sysprep VDI for VM %s" __FUNCTION__ vm_uuid ; Xapi_vbd.insert ~__context ~vdi ~vbd ; Thread.delay 5.0 ; - trigger ~domid + match trigger ~domid with + | "running" -> + debug "%s: sysprep running, ejecting CD" __FUNCTION__ ; + Xapi_vbd.eject ~__context ~vbd ; + Sys.remove iso ; + Result.ok () + | status -> + debug "%s: sysprep %S, ejecting CD" __FUNCTION__ status ; + Xapi_vbd.eject ~__context ~vbd ; + Sys.remove iso ; + Result.error status diff --git a/ocaml/xapi/vm_sysprep.mli b/ocaml/xapi/vm_sysprep.mli index 9f6f9ab9724..5c04935d27a 100644 --- a/ocaml/xapi/vm_sysprep.mli +++ b/ocaml/xapi/vm_sysprep.mli @@ -12,4 +12,14 @@ * GNU Lesser General Public License for more details. *) -val sysprep : __context:Context.t -> vm:API.ref_VM -> unattend:string -> unit +val on_startup : __context:Context.t -> unit +(** clean up on toolstart start up *) + +val sysprep : + __context:Context.t + -> vm:API.ref_VM + -> unattend:string + -> (unit, string) Result.t +(** Execute sysprep on [vm] using script [unattend]. This requires + driver support from the VM and is checked. [unattend:string] must + not exceed 32kb. *) diff --git a/ocaml/xapi/xapi.ml b/ocaml/xapi/xapi.ml index a12e3ec0c83..3d908bdec0f 100644 --- a/ocaml/xapi/xapi.ml +++ b/ocaml/xapi/xapi.ml @@ -1170,6 +1170,10 @@ let server_init () = , [Startup.OnThread] , Remote_requests.handle_requests ) + ; ( "Remove local ISO SR" + , [Startup.OnThread] + , fun () -> Vm_sysprep.on_startup ~__context + ) ] ; ( match Pool_role.get_role () with | Pool_role.Master -> diff --git a/ocaml/xapi/xapi_vm.ml b/ocaml/xapi/xapi_vm.ml index f76c632665f..a2cca867c63 100644 --- a/ocaml/xapi/xapi_vm.ml +++ b/ocaml/xapi/xapi_vm.ml @@ -1703,4 +1703,15 @@ let get_secureboot_readiness ~__context ~self = ) let sysprep ~__context ~self ~unattend = - Vm_sysprep.sysprep ~__context ~vm:self ~unattend + match Vm_sysprep.sysprep ~__context ~vm:self ~unattend with + | Ok _ -> + () + | Error msg -> + let uuid = Db.VM.get_uuid ~__context ~self in + raise + Api_errors.( + Server_error (sysprep, [uuid; "Sysprep not found running: " ^ msg]) + ) + | exception Failure msg -> + let uuid = Db.VM.get_uuid ~__context ~self in + raise Api_errors.(Server_error (sysprep, [uuid; msg])) From bf3ef79d9167d0c90e725490c1f5759bf46a16d3 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Thu, 19 Jun 2025 14:21:30 +0100 Subject: [PATCH 086/111] CP-308455 VM.sysprep add feature flag For simplicity, add vm-sysprep-enabled = true/false to xapi.conf rather than using a full V6D feature flag. 
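Roughly, the flag is expected to be switched on before the new call can be
used; the option name is the one registered below, while the exact
configuration file path and xe selector syntax are assumptions:

    # in xapi.conf
    vm-sysprep-enabled = true

    # then, for a running Windows VM whose PV drivers advertise sysprep support:
    xe vm-sysprep vm=<name-or-uuid> filename=unattend.xml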
Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 5 +++-- ocaml/xapi/xapi_globs.ml | 8 ++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 52f16473633..660f2e1212f 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -85,8 +85,7 @@ let mkdtemp ?(dir = temp_dir) ?(perms = 0o700) prefix suffix = failwith_fmt "s: can't create directory in %s" __FUNCTION__ dir | n -> ( let path = Filename.concat dir (temp_name prefix suffix) in - try Sys.mkdir path perms ; path - with Sys_error _ -> try_upto (n - 1) + try Sys.mkdir path perms ; path with Sys_error _ -> try_upto (n - 1) ) in try_upto 20 @@ -204,6 +203,8 @@ let trigger ~domid = let sysprep ~__context ~vm ~unattend = let open Ezxenstore_core.Xenstore in debug "%s" __FUNCTION__ ; + if not !Xapi_globs.vm_sysprep_enabled then + fail "Experimental VM.sysprep API call is not enabled" ; let vm_uuid = Db.VM.get_uuid ~__context ~self:vm in let domid = Db.VM.get_domid ~__context ~self:vm in let control = Printf.sprintf "/local/domain/%Ld/control" domid in diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index ad8914e9de7..fa0cc8d3451 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -1089,6 +1089,9 @@ let reuse_pool_sessions = ref false let validate_reusable_pool_session = ref false (* Validate a reusable session before each use. This is slower and should not be required *) +let vm_sysprep_enabled = ref false +(* enable VM.sysprep API *) + let test_open = ref 0 let xapi_requests_cgroup = @@ -1751,6 +1754,11 @@ let other_options = , (fun () -> string_of_bool !validate_reusable_pool_session) , "Enable validation of reusable pool sessions before use" ) + ; ( "vm-sysprep-enabled" + , Arg.Set vm_sysprep_enabled + , (fun () -> string_of_bool !vm_sysprep_enabled) + , "Enable VM.sysprep API" + ) ] (* The options can be set with the variable xapiflags in /etc/sysconfig/xapi. From 4c3e27ddcdc6ebc638ded75ffce458944e7b04b2 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Thu, 19 Jun 2025 16:46:55 +0100 Subject: [PATCH 087/111] CP-308455 VM.syspreo add some comments Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 660f2e1212f..da0e08bb68d 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -27,6 +27,8 @@ let failwith_fmt fmt = Printf.ksprintf failwith fmt let prng = Random.State.make_self_init () +(* A local ISO SR; we create an ISO that holds an unattend.xml file that + is than passed as CD to a VM *) module SR = struct let dir = "/var/opt/iso" @@ -61,7 +63,8 @@ let on_startup ~__context = () ) -(** create a name with a random infix *) +(** create a name with a random infix. 
We need random names for + temporay directories to avoid collition *) let temp_name prefix suffix = let rnd = Random.State.bits prng land 0xFFFFFF in Printf.sprintf "%s%06x%s" prefix rnd suffix From 20e33e76ae9ffd8f42c1a08173c51cdc8c8bd975 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Fri, 20 Jun 2025 10:16:33 +0100 Subject: [PATCH 088/111] CP-308455 VM.sysprep unify error handling Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 24 ++++++++++++++---------- ocaml/xapi/vm_sysprep.mli | 10 ++++------ ocaml/xapi/xapi_vm.ml | 19 ++++++++++--------- 3 files changed, 28 insertions(+), 25 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index da0e08bb68d..647d35bffa3 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -23,7 +23,12 @@ let finally = Xapi_stdext_pervasives.Pervasiveext.finally let genisoimage = "/usr/bin/genisoimage" -let failwith_fmt fmt = Printf.ksprintf failwith fmt +(** This will be shown to the user to explain a failure *) +exception Sysprep of string + +let fail fmt = Printf.ksprintf (fun msg -> raise (Sysprep msg)) fmt + +let internal_error = Helpers.internal_error let prng = Random.State.make_self_init () @@ -77,7 +82,7 @@ let temp_dir = Filename.get_temp_dir_name () let mkdtemp ?(dir = temp_dir) ?(perms = 0o700) prefix suffix = ( match Sys.file_exists dir with | true when not (Sys.is_directory dir) -> - failwith_fmt "s: %s is not a directory" __FUNCTION__ dir + internal_error "s: %s is not a directory" __FUNCTION__ dir | true -> () | false -> @@ -85,7 +90,7 @@ let mkdtemp ?(dir = temp_dir) ?(perms = 0o700) prefix suffix = ) ; let rec try_upto = function | n when n < 0 -> - failwith_fmt "s: can't create directory in %s" __FUNCTION__ dir + internal_error "%s: can't create directory %S" __FUNCTION__ dir | n -> ( let path = Filename.concat dir (temp_name prefix suffix) in try Sys.mkdir path perms ; path with Sys_error _ -> try_upto (n - 1) @@ -156,7 +161,7 @@ let find_cdr_vbd ~__context ~vm = let uuid = Db.VM.get_uuid ~__context ~self:vm in match List.filter is_cd vbds' with | [] -> - failwith_fmt "%s: can't find CDR for VM %s" __FUNCTION__ uuid + fail "can't find CDR for VM %s" uuid | [(rf, rc)] -> debug "%s: for VM %s using VBD %s" __FUNCTION__ uuid rc.API.vBD_uuid ; rf @@ -170,7 +175,7 @@ let find_cdr_vbd ~__context ~vm = let find_vdi ~__context ~label = match Db.VDI.get_by_name_label ~__context ~label with | [] -> - failwith_fmt "%s: can't find VDI for %s" __FUNCTION__ label + internal_error "%s: can't find VDI for %s" __FUNCTION__ label | [vdi] -> vdi | vdi :: _ -> @@ -212,7 +217,7 @@ let sysprep ~__context ~vm ~unattend = let domid = Db.VM.get_domid ~__context ~self:vm in let control = Printf.sprintf "/local/domain/%Ld/control" domid in if domid <= 0L then - failwith_fmt "%s: VM %s is not running" __FUNCTION__ vm_uuid ; + fail " VM %s is not running" __FUNCTION__ vm_uuid ; if String.length unattend > 32 * 1024 then fail "%s: provided file for %s larger than 32KiB" __FUNCTION__ vm_uuid ; with_xs (fun xs -> @@ -236,10 +241,9 @@ let sysprep ~__context ~vm ~unattend = | "running" -> debug "%s: sysprep running, ejecting CD" __FUNCTION__ ; Xapi_vbd.eject ~__context ~vbd ; - Sys.remove iso ; - Result.ok () + Sys.remove iso | status -> debug "%s: sysprep %S, ejecting CD" __FUNCTION__ status ; Xapi_vbd.eject ~__context ~vbd ; - Sys.remove iso ; - Result.error status + fail "VM %s sysprep not found running as expected: %S" vm_uuid status + status diff --git a/ocaml/xapi/vm_sysprep.mli b/ocaml/xapi/vm_sysprep.mli 
index 5c04935d27a..e577af93a07 100644 --- a/ocaml/xapi/vm_sysprep.mli +++ b/ocaml/xapi/vm_sysprep.mli @@ -12,14 +12,12 @@ * GNU Lesser General Public License for more details. *) +exception Failure of string + val on_startup : __context:Context.t -> unit (** clean up on toolstart start up *) -val sysprep : - __context:Context.t - -> vm:API.ref_VM - -> unattend:string - -> (unit, string) Result.t +val sysprep : __context:Context.t -> vm:API.ref_VM -> unattend:string -> unit (** Execute sysprep on [vm] using script [unattend]. This requires driver support from the VM and is checked. [unattend:string] must - not exceed 32kb. *) + not exceed 32kb. Raised [Failure] that must be handled, *) diff --git a/ocaml/xapi/xapi_vm.ml b/ocaml/xapi/xapi_vm.ml index a2cca867c63..f2a8b5bfe8a 100644 --- a/ocaml/xapi/xapi_vm.ml +++ b/ocaml/xapi/xapi_vm.ml @@ -1703,15 +1703,16 @@ let get_secureboot_readiness ~__context ~self = ) let sysprep ~__context ~self ~unattend = + let uuid = Db.VM.get_uuid ~__context ~self in + debug "%s %S (1/2)" __FUNCTION__ uuid ; match Vm_sysprep.sysprep ~__context ~vm:self ~unattend with - | Ok _ -> + | () -> + debug "%s %S (2/2)" __FUNCTION__ uuid ; () - | Error msg -> - let uuid = Db.VM.get_uuid ~__context ~self in - raise - Api_errors.( - Server_error (sysprep, [uuid; "Sysprep not found running: " ^ msg]) - ) - | exception Failure msg -> - let uuid = Db.VM.get_uuid ~__context ~self in + | exception Vm_sysprep.Sysprep msg -> + debug "%s %S (2/2)" __FUNCTION__ uuid ; + raise Api_errors.(Server_error (sysprep, [uuid; msg])) + | exception e -> + debug "%s %S (2/2)" __FUNCTION__ uuid ; + let msg = Printexc.to_string e in raise Api_errors.(Server_error (sysprep, [uuid; msg])) From 57dbab7513ffa5c09be53dbc3339b36d9bf9c790 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Mon, 23 Jun 2025 14:48:11 +0100 Subject: [PATCH 089/111] CP-308455 VM.sysprep list genisoimage as resource Add the paths to genisoimage so xapi_globs and list it as non-essential binary. Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 4 ++-- ocaml/xapi/vm_sysprep.mli | 3 ++- ocaml/xapi/xapi_globs.ml | 3 +++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 647d35bffa3..22abb870a35 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -21,7 +21,7 @@ let ( // ) = Filename.concat let finally = Xapi_stdext_pervasives.Pervasiveext.finally -let genisoimage = "/usr/bin/genisoimage" +let genisoimage = !Xapi_globs.genisoimage_path (** This will be shown to the user to explain a failure *) exception Sysprep of string @@ -245,5 +245,5 @@ let sysprep ~__context ~vm ~unattend = | status -> debug "%s: sysprep %S, ejecting CD" __FUNCTION__ status ; Xapi_vbd.eject ~__context ~vbd ; + Sys.remove iso ; fail "VM %s sysprep not found running as expected: %S" vm_uuid status - status diff --git a/ocaml/xapi/vm_sysprep.mli b/ocaml/xapi/vm_sysprep.mli index e577af93a07..db4a18455e7 100644 --- a/ocaml/xapi/vm_sysprep.mli +++ b/ocaml/xapi/vm_sysprep.mli @@ -12,7 +12,8 @@ * GNU Lesser General Public License for more details. 
*) -exception Failure of string +(** error message that may be passed to API clients *) +exception Sysprep of string val on_startup : __context:Context.t -> unit (** clean up on toolstart start up *) diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index fa0cc8d3451..88b957c2f2f 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -1097,6 +1097,8 @@ let test_open = ref 0 let xapi_requests_cgroup = "/sys/fs/cgroup/cpu/control.slice/xapi.service/request" +let genisoimage_path = ref "/usr/bin/genisoimage" + (* Event.{from,next} batching delays *) let make_batching name ~delay_before ~delay_between = let name = Printf.sprintf "%s_delay" name in @@ -1950,6 +1952,7 @@ module Resources = struct , pvsproxy_close_cache_vdi , "Path to close-cache-vdi.sh" ) + ; ("genisoimage", genisoimage_path, "Path to genisoimage") ] let essential_files = From b16b6d8f4b593314fcb395b4378b593182ffdba1 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Mon, 23 Jun 2025 14:48:11 +0100 Subject: [PATCH 090/111] CP-308455 VM.sysprep improve error handling, use API.Client - use API.Client to make sure API calls are properly forwarded. - unify error handlign and use a sum type for the error - invoke on_startup later during xapi startup Signed-off-by: Christian Lindig --- ocaml/idl/datamodel_lifecycle.ml | 2 +- ocaml/xapi-consts/api_errors.ml | 4 --- ocaml/xapi/message_forwarding.ml | 3 +- ocaml/xapi/vm_sysprep.ml | 60 +++++++++++++++++++++----------- ocaml/xapi/vm_sysprep.mli | 11 +++++- ocaml/xapi/xapi.ml | 8 ++--- ocaml/xapi/xapi_vm.ml | 32 +++++++++++++---- 7 files changed, 80 insertions(+), 40 deletions(-) diff --git a/ocaml/idl/datamodel_lifecycle.ml b/ocaml/idl/datamodel_lifecycle.ml index ef79f8aec15..2880742c9cd 100644 --- a/ocaml/idl/datamodel_lifecycle.ml +++ b/ocaml/idl/datamodel_lifecycle.ml @@ -240,7 +240,7 @@ let prototyped_of_message = function | "host", "set_numa_affinity_policy" -> Some "24.0.0" | "VM", "sysprep" -> - Some "25.22.0" + Some "25.23.0-next" | "VM", "get_secureboot_readiness" -> Some "24.17.0" | "VM", "set_uefi_mode" -> diff --git a/ocaml/xapi-consts/api_errors.ml b/ocaml/xapi-consts/api_errors.ml index 210bebe1b2a..42722c118d6 100644 --- a/ocaml/xapi-consts/api_errors.ml +++ b/ocaml/xapi-consts/api_errors.ml @@ -1430,8 +1430,4 @@ let host_driver_no_hardware = add_error "HOST_DRIVER_NO_HARDWARE" let tls_verification_not_enabled_in_pool = add_error "TLS_VERIFICATION_NOT_ENABLED_IN_POOL" -(* VM.sysprep *) - -(* Using a single error during development, might want to expand this - later *) let sysprep = add_error "SYSPREP" diff --git a/ocaml/xapi/message_forwarding.ml b/ocaml/xapi/message_forwarding.ml index 15b984ad993..4c79f91cf5f 100644 --- a/ocaml/xapi/message_forwarding.ml +++ b/ocaml/xapi/message_forwarding.ml @@ -3124,8 +3124,7 @@ functor with_vm_operation ~__context ~self ~doc:"VM.sysprep" ~op:`sysprep ~policy (fun () -> forward_vm_op ~local_fn ~__context ~vm:self ~remote_fn - ) ; - Xapi_vm_lifecycle.update_allowed_operations ~__context ~self + ) end module VM_metrics = struct end diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 22abb870a35..fd769cc8099 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -15,6 +15,7 @@ module D = Debug.Make (struct let name = __MODULE__ end) open D +open Client open Xapi_stdext_unix let ( // ) = Filename.concat @@ -23,15 +24,27 @@ let finally = Xapi_stdext_pervasives.Pervasiveext.finally let genisoimage = !Xapi_globs.genisoimage_path -(** This will be shown to the user to 
explain a failure *) -exception Sysprep of string +type error = + | API_not_enabled + | Other of string + | VM_CDR_not_found + | VM_misses_feature + | VM_not_running + | VM_sysprep_timeout + | XML_too_large -let fail fmt = Printf.ksprintf (fun msg -> raise (Sysprep msg)) fmt +exception Sysprep of error + +let _fail_fmt fmt = Printf.ksprintf (fun msg -> raise (Sysprep (Other msg))) fmt + +let fail error = raise (Sysprep error) let internal_error = Helpers.internal_error let prng = Random.State.make_self_init () +let call = Helpers.call_api_functions + (* A local ISO SR; we create an ISO that holds an unattend.xml file that is than passed as CD to a VM *) module SR = struct @@ -62,14 +75,15 @@ let on_startup ~__context = Db.SR.get_VDIs ~__context ~self:sr |> List.iter @@ fun self -> match Db.VDI.get_record ~__context ~self with - | API.{vDI_VBDs= []; vDI_location= _location; _} -> - Xapi_vdi.destroy ~__context ~self + | API.{vDI_VBDs= []; _} -> + call ~__context @@ fun rpc session_id -> + Client.VDI.destroy ~rpc ~session_id ~self | _ -> () ) (** create a name with a random infix. We need random names for - temporay directories to avoid collition *) + temporary directories to avoid collisions of concurrent API calls *) let temp_name prefix suffix = let rnd = Random.State.bits prng land 0xFFFFFF in Printf.sprintf "%s%06x%s" prefix rnd suffix @@ -141,11 +155,13 @@ let update_sr ~__context = sr | None -> let device_config = [("location", SR.dir); ("legacy_mode", "true")] in - Xapi_sr.create ~__context ~host ~name_label:label ~device_config + call ~__context @@ fun rpc session_id -> + Client.SR.create ~rpc ~session_id ~host ~name_label:label ~device_config ~content_type:"iso" ~_type:"iso" ~name_description:"Sysprep ISOs" ~shared:false ~sm_config:[] ~physical_size:(mib 512) in - Xapi_sr.scan ~__context ~sr ; + call ~__context @@ fun rpc session_id -> + Client.SR.scan ~rpc ~session_id ~sr ; sr (** Find the VBD for the CD drive on [vm] *) @@ -161,7 +177,7 @@ let find_cdr_vbd ~__context ~vm = let uuid = Db.VM.get_uuid ~__context ~self:vm in match List.filter is_cd vbds' with | [] -> - fail "can't find CDR for VM %s" uuid + fail VM_CDR_not_found | [(rf, rc)] -> debug "%s: for VM %s using VBD %s" __FUNCTION__ uuid rc.API.vBD_uuid ; rf @@ -189,7 +205,6 @@ let trigger ~domid = let control = Printf.sprintf "/local/domain/%Ld/control/sysprep" domid in with_xs (fun xs -> xs.Xs.write (control // "filename") "D://unattend.xml" ; - Thread.delay 5.0 ; xs.Xs.write (control // "action") "sysprep" ; debug "%s: notified domain %Ld" __FUNCTION__ domid ; let rec wait n = @@ -209,25 +224,27 @@ let trigger ~domid = (* This function is executed on the host where [vm] is running *) let sysprep ~__context ~vm ~unattend = - let open Ezxenstore_core.Xenstore in debug "%s" __FUNCTION__ ; if not !Xapi_globs.vm_sysprep_enabled then - fail "Experimental VM.sysprep API call is not enabled" ; + fail API_not_enabled ; let vm_uuid = Db.VM.get_uuid ~__context ~self:vm in let domid = Db.VM.get_domid ~__context ~self:vm in let control = Printf.sprintf "/local/domain/%Ld/control" domid in if domid <= 0L then - fail " VM %s is not running" __FUNCTION__ vm_uuid ; + fail VM_not_running ; if String.length unattend > 32 * 1024 then - fail "%s: provided file for %s larger than 32KiB" __FUNCTION__ vm_uuid ; - with_xs (fun xs -> + fail XML_too_large ; + Ezxenstore_core.Xenstore.with_xs (fun xs -> + let open Ezxenstore_core.Xenstore in match xs.Xs.read (control // "feature-sysprep") with | "1" -> debug "%s: VM %s supports sysprep" __FUNCTION__ 
vm_uuid | _ -> - fail "VM %s does not support sysprep" vm_uuid + debug "%s: VM %s does not support sysprep" __FUNCTION__ vm_uuid ; + fail VM_misses_feature | exception _ -> - fail "VM %s does not support sysprep" vm_uuid + debug "%s: VM %s does not support sysprep" __FUNCTION__ vm_uuid ; + fail VM_misses_feature ) ; let iso, label = make_iso ~vm_uuid ~unattend in debug "%s: created ISO %s" __FUNCTION__ iso ; @@ -235,15 +252,16 @@ let sysprep ~__context ~vm ~unattend = let vbd = find_cdr_vbd ~__context ~vm in let vdi = find_vdi ~__context ~label in debug "%s: inserting Sysprep VDI for VM %s" __FUNCTION__ vm_uuid ; - Xapi_vbd.insert ~__context ~vdi ~vbd ; + call ~__context @@ fun rpc session_id -> + Client.VBD.insert ~rpc ~session_id ~vdi ~vbd ; Thread.delay 5.0 ; match trigger ~domid with | "running" -> debug "%s: sysprep running, ejecting CD" __FUNCTION__ ; - Xapi_vbd.eject ~__context ~vbd ; + Client.VBD.eject ~rpc ~session_id ~vbd ; Sys.remove iso | status -> debug "%s: sysprep %S, ejecting CD" __FUNCTION__ status ; - Xapi_vbd.eject ~__context ~vbd ; + Client.VBD.eject ~rpc ~session_id ~vbd ; Sys.remove iso ; - fail "VM %s sysprep not found running as expected: %S" vm_uuid status + fail VM_sysprep_timeout diff --git a/ocaml/xapi/vm_sysprep.mli b/ocaml/xapi/vm_sysprep.mli index db4a18455e7..80f1874d7e9 100644 --- a/ocaml/xapi/vm_sysprep.mli +++ b/ocaml/xapi/vm_sysprep.mli @@ -13,7 +13,16 @@ *) (** error message that may be passed to API clients *) -exception Sysprep of string +type error = + | API_not_enabled + | Other of string + | VM_CDR_not_found + | VM_misses_feature + | VM_not_running + | VM_sysprep_timeout + | XML_too_large + +exception Sysprep of error val on_startup : __context:Context.t -> unit (** clean up on toolstart start up *) diff --git a/ocaml/xapi/xapi.ml b/ocaml/xapi/xapi.ml index 3d908bdec0f..56561d76e06 100644 --- a/ocaml/xapi/xapi.ml +++ b/ocaml/xapi/xapi.ml @@ -1170,10 +1170,6 @@ let server_init () = , [Startup.OnThread] , Remote_requests.handle_requests ) - ; ( "Remove local ISO SR" - , [Startup.OnThread] - , fun () -> Vm_sysprep.on_startup ~__context - ) ] ; ( match Pool_role.get_role () with | Pool_role.Master -> @@ -1384,6 +1380,10 @@ let server_init () = , cache_metadata_vdis ) ; ("Stats reporting thread", [], Xapi_stats.start) + ; ( "Remove local ISO SR" + , [Startup.OnThread] + , fun () -> Vm_sysprep.on_startup ~__context + ) ] ; if !debug_dummy_data then Startup.run ~__context diff --git a/ocaml/xapi/xapi_vm.ml b/ocaml/xapi/xapi_vm.ml index f2a8b5bfe8a..f53f506e522 100644 --- a/ocaml/xapi/xapi_vm.ml +++ b/ocaml/xapi/xapi_vm.ml @@ -1704,15 +1704,33 @@ let get_secureboot_readiness ~__context ~self = let sysprep ~__context ~self ~unattend = let uuid = Db.VM.get_uuid ~__context ~self in - debug "%s %S (1/2)" __FUNCTION__ uuid ; + debug "%s %S" __FUNCTION__ uuid ; match Vm_sysprep.sysprep ~__context ~vm:self ~unattend with | () -> - debug "%s %S (2/2)" __FUNCTION__ uuid ; + debug "%s %S success" __FUNCTION__ uuid ; () - | exception Vm_sysprep.Sysprep msg -> - debug "%s %S (2/2)" __FUNCTION__ uuid ; + | exception Vm_sysprep.Sysprep API_not_enabled -> + raise Api_errors.(Server_error (sysprep, [uuid; "API call is disabled"])) + | exception Vm_sysprep.Sysprep VM_CDR_not_found -> + raise Api_errors.(Server_error (sysprep, [uuid; "CD-ROM drive not found"])) + | exception Vm_sysprep.Sysprep VM_misses_feature -> + raise + Api_errors.( + Server_error (sysprep, [uuid; "VM driver does not support sysprep"]) + ) + | exception Vm_sysprep.Sysprep VM_not_running -> + raise 
Api_errors.(Server_error (sysprep, [uuid; "VM is not running"])) + | exception Vm_sysprep.Sysprep VM_sysprep_timeout -> + raise + Api_errors.( + Server_error (sysprep, [uuid; "sysprep not found running - timeout"]) + ) + | exception Vm_sysprep.Sysprep XML_too_large -> + raise + Api_errors.( + Server_error (sysprep, [uuid; "unattend.xml file too large"]) + ) + | exception Vm_sysprep.Sysprep (Other msg) -> raise Api_errors.(Server_error (sysprep, [uuid; msg])) | exception e -> - debug "%s %S (2/2)" __FUNCTION__ uuid ; - let msg = Printexc.to_string e in - raise Api_errors.(Server_error (sysprep, [uuid; msg])) + raise e From c9e3bfd30677416406f860a30288da51ffc00deb Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Tue, 1 Jul 2025 15:36:29 +0100 Subject: [PATCH 091/111] CP-308455 VM.sysprep declare XML content as SecretString Desclare the string parameter holding unattend.xml as secret to avoid logging it. Signed-off-by: Christian Lindig --- ocaml/idl/datamodel_vm.ml | 2 +- ocaml/xapi-cli-server/cli_operations.ml | 2 +- ocaml/xapi-types/secretString.ml | 2 ++ ocaml/xapi-types/secretString.mli | 2 ++ ocaml/xapi/vm_sysprep.ml | 4 ++-- ocaml/xapi/vm_sysprep.mli | 5 +++-- ocaml/xapi/xapi_vm.mli | 3 ++- 7 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ocaml/idl/datamodel_vm.ml b/ocaml/idl/datamodel_vm.ml index 5e4134afd0b..886b125659f 100644 --- a/ocaml/idl/datamodel_vm.ml +++ b/ocaml/idl/datamodel_vm.ml @@ -2375,7 +2375,7 @@ let sysprep = ~params: [ (Ref _vm, "self", "The VM") - ; (String, "unattend", "XML content passed to sysprep") + ; (SecretString, "unattend", "XML content passed to sysprep") ] ~doc:"Pass unattend.xml to Windows sysprep" ~allowed_roles:_R_VM_ADMIN () diff --git a/ocaml/xapi-cli-server/cli_operations.ml b/ocaml/xapi-cli-server/cli_operations.ml index f51c50851d4..a2cf537550e 100644 --- a/ocaml/xapi-cli-server/cli_operations.ml +++ b/ocaml/xapi-cli-server/cli_operations.ml @@ -3593,7 +3593,7 @@ let vm_sysprep fd printer rpc session_id params = let unattend = match get_client_file fd filename with | Some xml -> - xml + xml |> SecretString.of_string | None -> marshal fd (Command (PrintStderr "Failed to read file.\n")) ; raise (ExitWithError 1) diff --git a/ocaml/xapi-types/secretString.ml b/ocaml/xapi-types/secretString.ml index 781dac86697..b552e46edfd 100644 --- a/ocaml/xapi-types/secretString.ml +++ b/ocaml/xapi-types/secretString.ml @@ -24,6 +24,8 @@ let write_to_channel c s = output_string c s let equal = String.equal +let length = String.length + let pool_secret = "pool_secret" let with_cookie t cookies = (pool_secret, t) :: cookies diff --git a/ocaml/xapi-types/secretString.mli b/ocaml/xapi-types/secretString.mli index 82d97eaaa72..6d85364d04e 100644 --- a/ocaml/xapi-types/secretString.mli +++ b/ocaml/xapi-types/secretString.mli @@ -25,6 +25,8 @@ val of_string : string -> t val equal : t -> t -> bool +val length : t -> int + val json_rpc_of_t : t -> Rpc.t val t_of_rpc : Rpc.t -> t diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index fd769cc8099..6892898e9fb 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -132,7 +132,7 @@ let make_iso ~vm_uuid ~unattend = Xapi_stdext_unix.Unixext.mkdir_rec SR.dir 0o755 ; with_temp_dir ~dir:"/var/tmp/xapi" "sysprep-" "-iso" (fun temp_dir -> let path = temp_dir // "unattend.xml" in - Unixext.write_string_to_file path unattend ; + SecretString.write_to_file path unattend ; debug "%s: written to %s" __FUNCTION__ path ; let args = ["-r"; "-J"; "-o"; iso; temp_dir] in 
Forkhelpers.execute_command_get_output genisoimage args |> ignore ; @@ -232,7 +232,7 @@ let sysprep ~__context ~vm ~unattend = let control = Printf.sprintf "/local/domain/%Ld/control" domid in if domid <= 0L then fail VM_not_running ; - if String.length unattend > 32 * 1024 then + if SecretString.length unattend > 32 * 1024 then fail XML_too_large ; Ezxenstore_core.Xenstore.with_xs (fun xs -> let open Ezxenstore_core.Xenstore in diff --git a/ocaml/xapi/vm_sysprep.mli b/ocaml/xapi/vm_sysprep.mli index 80f1874d7e9..5c11ef7dfb7 100644 --- a/ocaml/xapi/vm_sysprep.mli +++ b/ocaml/xapi/vm_sysprep.mli @@ -27,7 +27,8 @@ exception Sysprep of error val on_startup : __context:Context.t -> unit (** clean up on toolstart start up *) -val sysprep : __context:Context.t -> vm:API.ref_VM -> unattend:string -> unit +val sysprep : + __context:Context.t -> vm:API.ref_VM -> unattend:SecretString.t -> unit (** Execute sysprep on [vm] using script [unattend]. This requires - driver support from the VM and is checked. [unattend:string] must + driver support from the VM and is checked. [unattend] must not exceed 32kb. Raised [Failure] that must be handled, *) diff --git a/ocaml/xapi/xapi_vm.mli b/ocaml/xapi/xapi_vm.mli index 005b4cae4ae..12515874aeb 100644 --- a/ocaml/xapi/xapi_vm.mli +++ b/ocaml/xapi/xapi_vm.mli @@ -451,4 +451,5 @@ val add_to_blocked_operations : val remove_from_blocked_operations : __context:Context.t -> self:API.ref_VM -> key:API.vm_operations -> unit -val sysprep : __context:Context.t -> self:API.ref_VM -> unattend:string -> unit +val sysprep : + __context:Context.t -> self:API.ref_VM -> unattend:SecretString.t -> unit From 903737db24ee30cd8cdb8e19d00873b3ba4742f8 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 2 Jul 2025 10:37:30 +0100 Subject: [PATCH 092/111] Revert "CP-308455 VM.sysprep declare XML content as SecretString" This reverts commit d6ee7d150e2d439ba22816bc28ee43edd9390315. 
Signed-off-by: Christian Lindig --- ocaml/idl/datamodel_vm.ml | 2 +- ocaml/xapi-cli-server/cli_operations.ml | 2 +- ocaml/xapi-types/secretString.ml | 2 -- ocaml/xapi-types/secretString.mli | 2 -- ocaml/xapi/vm_sysprep.ml | 4 ++-- ocaml/xapi/vm_sysprep.mli | 5 ++--- ocaml/xapi/xapi_vm.mli | 3 +-- 7 files changed, 7 insertions(+), 13 deletions(-) diff --git a/ocaml/idl/datamodel_vm.ml b/ocaml/idl/datamodel_vm.ml index 886b125659f..5e4134afd0b 100644 --- a/ocaml/idl/datamodel_vm.ml +++ b/ocaml/idl/datamodel_vm.ml @@ -2375,7 +2375,7 @@ let sysprep = ~params: [ (Ref _vm, "self", "The VM") - ; (SecretString, "unattend", "XML content passed to sysprep") + ; (String, "unattend", "XML content passed to sysprep") ] ~doc:"Pass unattend.xml to Windows sysprep" ~allowed_roles:_R_VM_ADMIN () diff --git a/ocaml/xapi-cli-server/cli_operations.ml b/ocaml/xapi-cli-server/cli_operations.ml index a2cf537550e..f51c50851d4 100644 --- a/ocaml/xapi-cli-server/cli_operations.ml +++ b/ocaml/xapi-cli-server/cli_operations.ml @@ -3593,7 +3593,7 @@ let vm_sysprep fd printer rpc session_id params = let unattend = match get_client_file fd filename with | Some xml -> - xml |> SecretString.of_string + xml | None -> marshal fd (Command (PrintStderr "Failed to read file.\n")) ; raise (ExitWithError 1) diff --git a/ocaml/xapi-types/secretString.ml b/ocaml/xapi-types/secretString.ml index b552e46edfd..781dac86697 100644 --- a/ocaml/xapi-types/secretString.ml +++ b/ocaml/xapi-types/secretString.ml @@ -24,8 +24,6 @@ let write_to_channel c s = output_string c s let equal = String.equal -let length = String.length - let pool_secret = "pool_secret" let with_cookie t cookies = (pool_secret, t) :: cookies diff --git a/ocaml/xapi-types/secretString.mli b/ocaml/xapi-types/secretString.mli index 6d85364d04e..82d97eaaa72 100644 --- a/ocaml/xapi-types/secretString.mli +++ b/ocaml/xapi-types/secretString.mli @@ -25,8 +25,6 @@ val of_string : string -> t val equal : t -> t -> bool -val length : t -> int - val json_rpc_of_t : t -> Rpc.t val t_of_rpc : Rpc.t -> t diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 6892898e9fb..fd769cc8099 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -132,7 +132,7 @@ let make_iso ~vm_uuid ~unattend = Xapi_stdext_unix.Unixext.mkdir_rec SR.dir 0o755 ; with_temp_dir ~dir:"/var/tmp/xapi" "sysprep-" "-iso" (fun temp_dir -> let path = temp_dir // "unattend.xml" in - SecretString.write_to_file path unattend ; + Unixext.write_string_to_file path unattend ; debug "%s: written to %s" __FUNCTION__ path ; let args = ["-r"; "-J"; "-o"; iso; temp_dir] in Forkhelpers.execute_command_get_output genisoimage args |> ignore ; @@ -232,7 +232,7 @@ let sysprep ~__context ~vm ~unattend = let control = Printf.sprintf "/local/domain/%Ld/control" domid in if domid <= 0L then fail VM_not_running ; - if SecretString.length unattend > 32 * 1024 then + if String.length unattend > 32 * 1024 then fail XML_too_large ; Ezxenstore_core.Xenstore.with_xs (fun xs -> let open Ezxenstore_core.Xenstore in diff --git a/ocaml/xapi/vm_sysprep.mli b/ocaml/xapi/vm_sysprep.mli index 5c11ef7dfb7..80f1874d7e9 100644 --- a/ocaml/xapi/vm_sysprep.mli +++ b/ocaml/xapi/vm_sysprep.mli @@ -27,8 +27,7 @@ exception Sysprep of error val on_startup : __context:Context.t -> unit (** clean up on toolstart start up *) -val sysprep : - __context:Context.t -> vm:API.ref_VM -> unattend:SecretString.t -> unit +val sysprep : __context:Context.t -> vm:API.ref_VM -> unattend:string -> unit (** Execute sysprep on [vm] using script 
[unattend]. This requires - driver support from the VM and is checked. [unattend] must + driver support from the VM and is checked. [unattend:string] must not exceed 32kb. Raised [Failure] that must be handled, *) diff --git a/ocaml/xapi/xapi_vm.mli b/ocaml/xapi/xapi_vm.mli index 12515874aeb..005b4cae4ae 100644 --- a/ocaml/xapi/xapi_vm.mli +++ b/ocaml/xapi/xapi_vm.mli @@ -451,5 +451,4 @@ val add_to_blocked_operations : val remove_from_blocked_operations : __context:Context.t -> self:API.ref_VM -> key:API.vm_operations -> unit -val sysprep : - __context:Context.t -> self:API.ref_VM -> unattend:SecretString.t -> unit +val sysprep : __context:Context.t -> self:API.ref_VM -> unattend:string -> unit From 6118d47249aa9bd27a7ed5961b8c0bbdf2891437 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Mon, 30 Jun 2025 18:26:32 +0100 Subject: [PATCH 093/111] xapi_vm_lifecycle: Stop assuming PV driver's presence implies all features A 10+ year old note justified assuming that feature-suspend and others are always available by referring to a buggy Windows guest agent. It has been correctly writing 1 to control/feature-suspend for at least a decade now (tested on old Windows PV drivers), so there is no reason to maintain this workaround. This allows PV drivers to potentially disable such features. Signed-off-by: Andrii Sultanov --- ocaml/xapi/xapi_vm_lifecycle.ml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ocaml/xapi/xapi_vm_lifecycle.ml b/ocaml/xapi/xapi_vm_lifecycle.ml index 5ec4ca6d792..cfbc8162e53 100644 --- a/ocaml/xapi/xapi_vm_lifecycle.ml +++ b/ocaml/xapi/xapi_vm_lifecycle.ml @@ -166,10 +166,6 @@ let has_definitely_booted_pv ~vmmr = ) (** Return an error iff vmr is an HVM guest and lacks a needed feature. - * Note: it turned out that the Windows guest agent does not write "feature-suspend" - * on resume (only on startup), so we cannot rely just on that flag. We therefore - * add a clause that enables all features when PV drivers are present using the - * old-style check. * The "strict" param should be true for determining the allowed_operations list * (which is advisory only) and false (more permissive) when we are potentially about * to perform an operation. 
This makes a difference for ops that require the guest to @@ -180,8 +176,6 @@ let check_op_for_feature ~__context ~vmr:_ ~vmmr ~vmgmr ~power_state ~op ~ref power_state <> `Running (* PV guests offer support implicitly *) || has_definitely_booted_pv ~vmmr - || Xapi_pv_driver_version.(has_pv_drivers (of_guest_metrics vmgmr)) - (* Full PV drivers imply all features *) then None else From a215870ac06f217810fcb21c4509eda13608a219 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 2 Jul 2025 14:49:25 +0100 Subject: [PATCH 094/111] CP-308455 Save backtrace in error case Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 1 + 1 file changed, 1 insertion(+) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index fd769cc8099..8bde41a0d77 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -145,6 +145,7 @@ let make_iso ~vm_uuid ~unattend = (** create a local ISO SR when necessary and update it such that it recognises any ISO we added or removed *) let update_sr ~__context = + Backtrace.is_important e ; let host = Helpers.get_localhost ~__context in let hostname = Db.Host.get_hostname ~__context ~self:host in let label = SR.name hostname in From 01981bd874bfc8f6c03075c35276ec7f47595457 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 2 Jul 2025 14:49:25 +0100 Subject: [PATCH 095/111] CP-308455 VM.sysprep create better SR name label We identify the local SR by name. Make sure it is very unlikely to conflict we a user's SR and check its type, too. Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 8bde41a0d77..74632f781c9 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -50,7 +50,13 @@ let call = Helpers.call_api_functions module SR = struct let dir = "/var/opt/iso" - let name hostname = Printf.sprintf "SYSPREP-%s" hostname + (* We create a deterministic unique name label to protect us against a + user using the same name *) + let name hostname = + let digest str = + Digest.(string str |> to_hex) |> fun hex -> String.sub hex 0 4 + in + Printf.sprintf "SYSPREP-%s-%s" hostname (digest hostname) let find_opt ~__context ~label = match Db.SR.get_by_name_label ~__context ~label with From 9ce41c58925866504ace1a7fe06fdd29da5009b6 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 2 Jul 2025 14:49:25 +0100 Subject: [PATCH 096/111] CP-308455 improve locating local ISO SR Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 74632f781c9..20b2333cfcc 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -59,14 +59,14 @@ module SR = struct Printf.sprintf "SYSPREP-%s-%s" hostname (digest hostname) let find_opt ~__context ~label = - match Db.SR.get_by_name_label ~__context ~label with - | [sr] -> - Some sr - | sr :: _ -> - warn "%s: more than one SR with label %s" __FUNCTION__ label ; - Some sr - | [] -> - None + let check sr = + match Db.SR.get_record ~__context ~self:sr with + | API.{sR_type= "iso"; _} -> + true + | _ -> + false + in + Db.SR.get_by_name_label ~__context ~label |> List.find_opt check end (** This is called on xapi startup. Opportunity to set up or clean up. 
From 191db166bacb01e0add3148196d0951f9ff238c0 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 2 Jul 2025 15:23:31 +0100 Subject: [PATCH 097/111] CP-308455 VM.sysprep save backtrace Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 1 + 1 file changed, 1 insertion(+) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 20b2333cfcc..b360bd56d55 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -145,6 +145,7 @@ let make_iso ~vm_uuid ~unattend = (iso, basename) ) with e -> + Backtrace.is_important e ; let msg = Printexc.to_string e in Helpers.internal_error "%s failed: %s" __FUNCTION__ msg From d736b7359e3122e608841fbcce17c3f08135733e Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 2 Jul 2025 15:23:31 +0100 Subject: [PATCH 098/111] CP-308455 VM.sysprep fix saving backtrace Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 1 - 1 file changed, 1 deletion(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index b360bd56d55..c2067cde22e 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -152,7 +152,6 @@ let make_iso ~vm_uuid ~unattend = (** create a local ISO SR when necessary and update it such that it recognises any ISO we added or removed *) let update_sr ~__context = - Backtrace.is_important e ; let host = Helpers.get_localhost ~__context in let hostname = Db.Host.get_hostname ~__context ~self:host in let label = SR.name hostname in From 630ec5776d2b1ab6bb6032b828f054680a4c53a9 Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 2 Jul 2025 15:23:31 +0100 Subject: [PATCH 099/111] CP-308455 VM.sysprep write VDI UUID to xenstore In addition to the file name, write the VDI UUID to xenstore for the guest agent to pick up. Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index c2067cde22e..1af1db06059 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -207,11 +207,12 @@ let find_vdi ~__context ~label = (** notify the VM with [domid] to run sysprep and where to find the file. *) -let trigger ~domid = +let trigger ~domid ~uuid = let open Ezxenstore_core.Xenstore in let control = Printf.sprintf "/local/domain/%Ld/control/sysprep" domid in with_xs (fun xs -> xs.Xs.write (control // "filename") "D://unattend.xml" ; + xs.Xs.write (control // "vdi-uuid") uuid ; xs.Xs.write (control // "action") "sysprep" ; debug "%s: notified domain %Ld" __FUNCTION__ domid ; let rec wait n = @@ -258,11 +259,12 @@ let sysprep ~__context ~vm ~unattend = let _sr = update_sr ~__context in let vbd = find_cdr_vbd ~__context ~vm in let vdi = find_vdi ~__context ~label in + let uuid = Db.VDI.get_uuid ~__context ~self:vdi in debug "%s: inserting Sysprep VDI for VM %s" __FUNCTION__ vm_uuid ; call ~__context @@ fun rpc session_id -> Client.VBD.insert ~rpc ~session_id ~vdi ~vbd ; Thread.delay 5.0 ; - match trigger ~domid with + match trigger ~domid ~uuid with | "running" -> debug "%s: sysprep running, ejecting CD" __FUNCTION__ ; Client.VBD.eject ~rpc ~session_id ~vbd ; From eaf2050465f057016b0b141b03a4bc9e534207ff Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 2 Jul 2025 15:23:31 +0100 Subject: [PATCH 100/111] CP-308455 VM.sysprep make delay configurable Mostly for development: when inserting a CD we wait before we expect the VM to have recognised it such that we can start sysprep. Make this configuratble for easier experimentation. 
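The tunable follows the usual xapi_globs pattern: a mutable ref exposed as a command-line option and dereferenced at the point of use. A minimal sketch of that pattern (the call site here anticipates a later patch in this series; it is illustrative, not additional functionality):

```
(* Sketch only: a configurable delay as a ref plus a registered option. *)
let vm_sysprep_wait = ref 5.0 (* seconds *)

let options =
  [ ( "vm-sysprep-wait"
    , Arg.Set_float vm_sysprep_wait
    , (fun () -> string_of_float !vm_sysprep_wait)
    , "Time in seconds to wait for VM to recognise inserted CD"
    )
  ]

(* Callers read the ref only when the delay is actually needed, so a
   value set at start-up takes effect without further plumbing. *)
let wait_for_cd_recognition () = Thread.delay !vm_sysprep_wait
```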
Signed-off-by: Christian Lindig --- ocaml/xapi/xapi_globs.ml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index 88b957c2f2f..d2c591e4f2c 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -1092,6 +1092,8 @@ let validate_reusable_pool_session = ref false let vm_sysprep_enabled = ref false (* enable VM.sysprep API *) +let vm_sysprep_wait = ref 5.0 (* seconds *) + let test_open = ref 0 let xapi_requests_cgroup = @@ -1761,6 +1763,11 @@ let other_options = , (fun () -> string_of_bool !vm_sysprep_enabled) , "Enable VM.sysprep API" ) + ; ( "vm-sysprep-wait" + , Arg.Set_float vm_sysprep_wait + , (fun () -> string_of_float !vm_sysprep_wait) + , "Time in seconds to wait for VM to recognise inserted CD" + ) ] (* The options can be set with the variable xapiflags in /etc/sysconfig/xapi. From 9153fb544119128b31899abf64f7e22ef39a7f1b Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Wed, 2 Jul 2025 15:23:31 +0100 Subject: [PATCH 101/111] CP-308455 VM.sysprep guard on_startup with feature flag Remove VDIs only when the feature is enabled. Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 1af1db06059..16d485025b1 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -75,9 +75,7 @@ let on_startup ~__context = let host = Helpers.get_localhost ~__context in let hostname = Db.Host.get_hostname ~__context ~self:host in match SR.find_opt ~__context ~label:(SR.name hostname) with - | None -> - () - | Some sr -> ( + | Some sr when !Xapi_globs.vm_sysprep_enabled -> ( Db.SR.get_VDIs ~__context ~self:sr |> List.iter @@ fun self -> match Db.VDI.get_record ~__context ~self with @@ -87,6 +85,8 @@ let on_startup ~__context = | _ -> () ) + | _ -> + () (* none found or not enabled *) (** create a name with a random infix. We need random names for temporary directories to avoid collisions of concurrent API calls *) From c895469f5499b0cfb68001e37dca8b1ce294b249 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Wed, 2 Jul 2025 15:20:44 +0100 Subject: [PATCH 102/111] xapi_vm_lifecycle: Disallow suspend when cant_suspend_reason is present When data/cant_suspend_reason is present in xenstore (renamed data-cant-suspend-reason in "other" guest metrics), it would exclude operations involving suspend from the allowed operations list, but still allow actually suspending (strict=true during allowed operations checks but could be false otherwise. This was additionally overriden by assuming that PV driver presence guarantees feature-suspend). Suspending in such a situation would always result in a crash, usually with the following error: ``` $ xe vm-suspend uuid=d546c8c2-bcb1-0ed6-7931-d0fc9393ccb2 The server failed to handle your request, due to an internal error. The given message may give details useful for debugging the problem. message: xenopsd internal error: Device_common.QMP_Error(8, "{\"error\": {\"class\":\"GenericError\", \"desc\":\"State blocked by non-migratable device '0000:00:07.0/nvme'\", \"data\":{} }, \"id\":\"qmp-000006-8\"}") ``` So disallow suspending altogether when cant_suspend_reason is present. Log the QEMU error directly in the XAPI error as well: ``` $ xe vm-suspend uuid=d546c8c2-bcb1-0ed6-7931-d0fc9393ccb2 You attempted an operation on a VM which lacks the feature. 
vm: d546c8c2-bcb1-0ed6-7931-d0fc9393ccb2 (Windows 10 (64-bit)) reason: {"error":{"class":"GenericError","desc":"State blocked by non-migratable device '0000:00:07.0/nvme'","data":{}},"id":"qmp-000012-9"} ``` Signed-off-by: Andrii Sultanov --- ocaml/idl/datamodel_errors.ml | 2 ++ ocaml/xapi-consts/api_errors.ml | 2 ++ ocaml/xapi/xapi_vm_lifecycle.ml | 18 +++++++++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/ocaml/idl/datamodel_errors.ml b/ocaml/idl/datamodel_errors.ml index 30e185ac192..b205b670159 100644 --- a/ocaml/idl/datamodel_errors.ml +++ b/ocaml/idl/datamodel_errors.ml @@ -534,6 +534,8 @@ let _ = () ; error Api_errors.vm_lacks_feature ["vm"] ~doc:"You attempted an operation on a VM which lacks the feature." () ; + error Api_errors.vm_non_suspendable ["vm"; "reason"] + ~doc:"You attempted an operation on a VM which is not suspendable." () ; error Api_errors.vm_is_template ["vm"] ~doc:"The operation attempted is not valid for a template VM" () ; error Api_errors.other_operation_in_progress ["class"; "object"] diff --git a/ocaml/xapi-consts/api_errors.ml b/ocaml/xapi-consts/api_errors.ml index 077a8dacbf9..89ab735194b 100644 --- a/ocaml/xapi-consts/api_errors.ml +++ b/ocaml/xapi-consts/api_errors.ml @@ -440,6 +440,8 @@ let vm_old_pv_drivers = add_error "VM_OLD_PV_DRIVERS" let vm_lacks_feature = add_error "VM_LACKS_FEATURE" +let vm_non_suspendable = add_error "VM_NON_SUSPENDABLE" + let vm_cannot_delete_default_template = add_error "VM_CANNOT_DELETE_DEFAULT_TEMPLATE" diff --git a/ocaml/xapi/xapi_vm_lifecycle.ml b/ocaml/xapi/xapi_vm_lifecycle.ml index cfbc8162e53..f5a9d7dfd57 100644 --- a/ocaml/xapi/xapi_vm_lifecycle.ml +++ b/ocaml/xapi/xapi_vm_lifecycle.ml @@ -151,6 +151,12 @@ let has_feature ~vmgmr ~feature = try List.assoc feature other = "1" with Not_found -> false ) +let get_feature ~vmgmr ~feature = + Option.bind vmgmr (fun gmr -> + let other = gmr.Db_actions.vM_guest_metrics_other in + List.assoc_opt feature other + ) + (* Returns `true` only if we are certain that the VM has booted PV (if there * is no metrics record, then we can't tell) *) let has_definitely_booted_pv ~vmmr = @@ -194,9 +200,15 @@ let check_op_for_feature ~__context ~vmr:_ ~vmmr ~vmgmr ~power_state ~op ~ref some_err Api_errors.vm_lacks_feature | `changing_VCPUs_live when lack_feature "feature-vcpu-hotplug" -> some_err Api_errors.vm_lacks_feature - | (`suspend | `checkpoint | `pool_migrate | `migrate_send) - when strict && lack_feature "feature-suspend" -> - some_err Api_errors.vm_lacks_feature + | `suspend | `checkpoint | `pool_migrate | `migrate_send -> ( + match get_feature ~vmgmr ~feature:"data-cant-suspend-reason" with + | Some reason -> + Some (Api_errors.vm_non_suspendable, [Ref.string_of ref; reason]) + | None when strict && lack_feature "feature-suspend" -> + some_err Api_errors.vm_lacks_feature + | None -> + None + ) | _ -> None From 30c0ba15002ce05ce302946acbd03fc5a16eee7e Mon Sep 17 00:00:00 2001 From: Steven Woods Date: Thu, 3 Jul 2025 15:33:40 +0100 Subject: [PATCH 103/111] CA-413304: Restore VBD.unplug function to keep old functionality This is a partial revert of 1a46f33be768. It retains the deactivate and detach functions introduced but restores the original unplug function so that the VBD_unplug atom is completely unchanged when xenops_vbd_plug_unplug_legacy=true instead of running deactivate followed by detach. This will fix the S(Does_not_exist) Xenopsd errors we are seeing in some VBD_unplug calls, until a fix for the split functions is found. 
Signed-off-by: Steven Woods --- ocaml/xenopsd/lib/xenops_server.ml | 5 +- ocaml/xenopsd/lib/xenops_server_plugin.ml | 2 + ocaml/xenopsd/lib/xenops_server_simulator.ml | 2 + ocaml/xenopsd/lib/xenops_server_skeleton.ml | 2 + ocaml/xenopsd/xc/xenops_server_xen.ml | 121 +++++++++++++++++++ 5 files changed, 128 insertions(+), 4 deletions(-) diff --git a/ocaml/xenopsd/lib/xenops_server.ml b/ocaml/xenopsd/lib/xenops_server.ml index 15715ac7ac7..36a2ea92fed 100644 --- a/ocaml/xenopsd/lib/xenops_server.ml +++ b/ocaml/xenopsd/lib/xenops_server.ml @@ -2097,10 +2097,7 @@ let rec perform_atomic ~progress_callback ?result (op : atomic) | VBD_unplug (id, force) -> debug "VBD.unplug %s" (VBD_DB.string_of_id id) ; finally - (fun () -> - B.VBD.deactivate t (VBD_DB.vm_of id) (VBD_DB.read_exn id) force ; - B.VBD.detach t (VBD_DB.vm_of id) (VBD_DB.read_exn id) - ) + (fun () -> B.VBD.unplug t (VBD_DB.vm_of id) (VBD_DB.read_exn id) force) (fun () -> VBD_DB.signal id) | VBD_deactivate (id, force) -> debug "VBD.deactivate %s" (VBD_DB.string_of_id id) ; diff --git a/ocaml/xenopsd/lib/xenops_server_plugin.ml b/ocaml/xenopsd/lib/xenops_server_plugin.ml index 4c8c73773f8..19ab155aa92 100644 --- a/ocaml/xenopsd/lib/xenops_server_plugin.ml +++ b/ocaml/xenopsd/lib/xenops_server_plugin.ml @@ -211,6 +211,8 @@ module type S = sig val activate : Xenops_task.task_handle -> Vm.id -> Vbd.t -> unit + val unplug : Xenops_task.task_handle -> Vm.id -> Vbd.t -> bool -> unit + val deactivate : Xenops_task.task_handle -> Vm.id -> Vbd.t -> bool -> unit val detach : Xenops_task.task_handle -> Vm.id -> Vbd.t -> unit diff --git a/ocaml/xenopsd/lib/xenops_server_simulator.ml b/ocaml/xenopsd/lib/xenops_server_simulator.ml index f8c0afab8ab..13ae583c7da 100644 --- a/ocaml/xenopsd/lib/xenops_server_simulator.ml +++ b/ocaml/xenopsd/lib/xenops_server_simulator.ml @@ -677,6 +677,8 @@ module VBD = struct let activate _ (_vm : Vm.id) (_vbd : Vbd.t) = () + let unplug _ vm vbd _ = with_lock m (remove_vbd vm vbd) + let deactivate _ vm vbd _ = with_lock m (remove_vbd vm vbd) let detach _ _vm _vbd = () diff --git a/ocaml/xenopsd/lib/xenops_server_skeleton.ml b/ocaml/xenopsd/lib/xenops_server_skeleton.ml index b938927a2e4..f5ef9ed027c 100644 --- a/ocaml/xenopsd/lib/xenops_server_skeleton.ml +++ b/ocaml/xenopsd/lib/xenops_server_skeleton.ml @@ -148,6 +148,8 @@ module VBD = struct let activate _ _ _ = unimplemented __FUNCTION__ + let unplug _ _ _ _ = unimplemented __FUNCTION__ + let deactivate _ _ _ _ = unimplemented __FUNCTION__ let detach _ _ _ = unimplemented __FUNCTION__ diff --git a/ocaml/xenopsd/xc/xenops_server_xen.ml b/ocaml/xenopsd/xc/xenops_server_xen.ml index ccf3eac9764..61e5d45fb84 100644 --- a/ocaml/xenopsd/xc/xenops_server_xen.ml +++ b/ocaml/xenopsd/xc/xenops_server_xen.ml @@ -3863,6 +3863,127 @@ module VBD = struct ) (fun () -> cleanup_attached_vdis vm (id_of vbd)) + let unplug task vm vbd force = + with_xc_and_xs (fun xc xs -> + try + (* On destroying the datapath + + 1. if the device has already been shutdown and deactivated (as in + suspend) we must call DP.destroy here to avoid leaks + + 2. if the device is successfully shutdown here then we must call + DP.destroy because no-one else will + + 3. if the device shutdown is rejected then we should leave the DP + alone and rely on the event thread calling us again later. *) + let domid = domid_of_uuid ~xs (uuid_of_string vm) in + (* If the device is gone then we don't need to shut it down but we do + need to free any storage resources. 
*) + let dev = + try + Some (device_by_id xc xs vm (device_kind_of ~xs vbd) (id_of vbd)) + with + | Xenopsd_error (Does_not_exist (_, _)) -> + debug "VM = %s; VBD = %s; Ignoring missing domain" vm (id_of vbd) ; + None + | Xenopsd_error Device_not_connected -> + debug "VM = %s; VBD = %s; Ignoring missing device" vm (id_of vbd) ; + None + in + let backend = + match dev with + | None -> + None + | Some dv -> ( + match + Rpcmarshal.unmarshal typ_of_backend + (Device.Generic.get_private_key ~xs dv _vdi_id + |> Jsonrpc.of_string + ) + with + | Ok x -> + x + | Error (`Msg m) -> + internal_error "Failed to unmarshal VBD backend: %s" m + ) + in + Option.iter + (fun dev -> + if force && not (Device.can_surprise_remove ~xs dev) then + debug + "VM = %s; VBD = %s; Device is not surprise-removable \ + (ignoring and removing anyway)" + vm (id_of vbd) ; + (* this happens on normal shutdown too *) + (* Case (1): success; Case (2): success; Case (3): an exception is + thrown *) + with_tracing ~task ~name:"VBD_device_shutdown" @@ fun () -> + Xenops_task.with_subtask task + (Printf.sprintf "Vbd.clean_shutdown %s" (id_of vbd)) + (fun () -> + (if force then Device.hard_shutdown else Device.clean_shutdown) + task ~xs dev + ) + ) + dev ; + (* We now have a shutdown device but an active DP: we should destroy + the DP if the backend is of type VDI *) + finally + (fun () -> + with_tracing ~task ~name:"VBD_device_release" (fun () -> + Option.iter + (fun dev -> + Xenops_task.with_subtask task + (Printf.sprintf "Vbd.release %s" (id_of vbd)) + (fun () -> Device.Vbd.release task ~xc ~xs dev) + ) + dev + ) ; + (* If we have a qemu frontend, detach this too. *) + with_tracing ~task ~name:"VBD_detach_qemu" @@ fun () -> + let _ = + DB.update vm + (Option.map (fun vm_t -> + let persistent = vm_t.VmExtra.persistent in + if List.mem_assoc vbd.Vbd.id persistent.VmExtra.qemu_vbds + then ( + let _, qemu_vbd = + List.assoc vbd.Vbd.id persistent.VmExtra.qemu_vbds + in + (* destroy_vbd_frontend ignores 'refusing to close' + transients' *) + destroy_vbd_frontend ~xc ~xs task qemu_vbd ; + VmExtra. + { + persistent= + { + persistent with + qemu_vbds= + List.remove_assoc vbd.Vbd.id + persistent.qemu_vbds + } + } + ) else + vm_t + ) + ) + in + () + ) + (fun () -> + with_tracing ~task ~name:"VBD_dp_destroy" @@ fun () -> + match (domid, backend) with + | Some x, None | Some x, Some (VDI _) -> + Storage.dp_destroy task + (Storage.id_of (string_of_int x) vbd.Vbd.id) + | _ -> + () + ) + with Device_common.Device_error (_, s) -> + debug "Caught Device_error: %s" s ; + raise (Xenopsd_error (Device_detach_rejected ("VBD", id_of vbd, s))) + ) + let deactivate task vm vbd force = with_xc_and_xs (fun xc xs -> try From 8017a4ec3fdc7daef5e21d6e32a70e932ed2ac2d Mon Sep 17 00:00:00 2001 From: Christian Lindig Date: Thu, 3 Jul 2025 11:57:13 +0000 Subject: [PATCH 104/111] CP-308455 VM.sysprep make delay configurable This was missing from previous commit to control the time waited for a VM to recognise a CD. In the long run we would like to replace this with a protocol that tells us when the guest is ready. 
Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 16d485025b1..3db126dae08 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -263,7 +263,7 @@ let sysprep ~__context ~vm ~unattend = debug "%s: inserting Sysprep VDI for VM %s" __FUNCTION__ vm_uuid ; call ~__context @@ fun rpc session_id -> Client.VBD.insert ~rpc ~session_id ~vdi ~vbd ; - Thread.delay 5.0 ; + Thread.delay !Xapi_globs.vm_sysprep_wait ; match trigger ~domid ~uuid with | "running" -> debug "%s: sysprep running, ejecting CD" __FUNCTION__ ; From 0fd510f8d48031d41942637d8cd04480105a2b08 Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Fri, 4 Jul 2025 15:40:00 +0800 Subject: [PATCH 105/111] CA-393417: Bind mount /proc/ into chroot From strace/gdb, XS9 qemu requires /proc/self/fd/ to work well This is due to systemd/libudev update. Just bind mount /proc/self/ to the chroot to permit qemu access ``` 1047 openat(AT_FDCWD, "/proc/self/fd/46", O_RDONLY|O_NOCTTY|O_CLOEXEC|O_PATH) = -1 ENOENT (No such file or directory) 1048 openat(AT_FDCWD, "/proc/", O_RDONLY|O_NOCTTY|O_CLOEXEC|O_PATH) = -1 ENOENT (No such file or directory) ../sysdeps/unix/sysv/linux/fstatfs64.c:30 out>, dir_fd=) at ../src/basic/stat-util.c:566 magic_value=1650812274) at ../src/basic/stat-util.c:369 fd=) at ../src/basic/stat-util.h:66 verify=) at ../src/libsystemd/sd-device/sd-device.c:221 (ret=0x7ffc67ebba20, syspath=0x7ffc67ebb950 "/sys/bus/usb/devices/usb1", strict=true) at ../src/libsystemd/sd-device/sd-device.c:271 (syspath=0x7ffc67ebb950 "/sys/bus/usb/devices/usb1", ret=0x7ffc67ebba20) at ../src/libsystemd/sd-device/sd-device.c:280 ``` Signed-off-by: Lin Liu --- python3/libexec/usb_reset.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/python3/libexec/usb_reset.py b/python3/libexec/usb_reset.py index 3e5ff849060..010fd134862 100755 --- a/python3/libexec/usb_reset.py +++ b/python3/libexec/usb_reset.py @@ -15,7 +15,7 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # # attach -# ./usb_reset.py attach device -d dom-id [-r] +# ./usb_reset.py attach device -d dom-id -p pid [-r] # ./usb_reset.py attach 2-2 -d 12 -p 4130 # ./usb_reset.py attach 2-2 -d 12 -p 4130 -r # 1. reset device @@ -23,18 +23,21 @@ # 2. if it's the first USB device to pass-through # a) bind mount /sys in chroot directory (/var/xen/qemu/root-) # b) clone (create the device with same major/minor number and mode) in chroot directory with same path +# c) bind mount /proc/ to chroot directory (/var/xen/qemu/root-/proc/self) # 3. set device file uid/gid to (qemu_base + dom-id) # # detach # ./usb_reset.py detach device -d dom-id # ./usb_reset.py detach 2-2 -d 12 # 1. Remove the cloned device file in chroot directory +# 2. Umount /proc/self from chroot directory if it is mounted # # cleanup # ./usb_reset.py cleanup -d dom-id # ./usb_reset.py cleanup -d 12 # 1.umount /sys from chroot directory if they are mounted. -# 2.remove /dev/bus directory in chroot directory if it exists +# 2.umount /proc/self from chroot directory if they are mounted. 
+# 3.remove /dev/bus directory in chroot directory if it exists import argparse import ctypes @@ -58,6 +61,8 @@ def parse_arg(): attach.add_argument("device", help="the target usb device") attach.add_argument("-d", dest="domid", type=int, required=True, help="specify the domid of the VM") + attach.add_argument("-p", dest="pid", type=int, required=True, + help="the process id of QEMU") attach.add_argument("-r", dest="reset_only", action="store_true", help="reset device only, for privileged mode") @@ -152,7 +157,7 @@ def clone_device(path, root_dir, domid): exit(1) -def attach(device, domid, reset_only): +def attach(device, domid, pid, reset_only): path = dev_path(device) # reset device @@ -177,16 +182,19 @@ def attach(device, domid, reset_only): clone_device(path, root_dir, domid) sys_dir = root_dir + "/sys" + proc_dir = root_dir + "/proc" # sys_dir could already be mounted because of PCI pass-through - if not os.path.isdir(sys_dir): - try: - os.mkdir(sys_dir, 0o755) - except OSError: - log.error("Failed to create sys dir in chroot") - exit(1) + os.makedirs(sys_dir, exist_ok=True, mode=0o755) if not os.path.isdir(sys_dir + "/devices"): mount("/sys", sys_dir, "sysfs") + self_dir = os.path.join(proc_dir, "self") + os.makedirs(self_dir , exist_ok=True, mode=0o755) + fd_dir = os.path.join(self_dir, "fd") + if not os.path.isdir(fd_dir): + MS_BIND = 4096 # mount flags, from fs.h + mount(f"/proc/{pid}/", self_dir, "", MS_BIND) + def detach(device, domid): path = dev_path(device) @@ -201,11 +209,17 @@ def cleanup(domid): dev_dir = root_dir + "/dev" sys_dir = root_dir + "/sys" bus_dir = dev_dir + "/bus" + proc_dir = root_dir + "/proc" + self_dir = proc_dir + "/self" if os.path.isdir(bus_dir): log.info("Removing bus directory: {} for cleanup".format(bus_dir)) shutil.rmtree(bus_dir) if os.path.isdir(sys_dir + "/devices"): umount(sys_dir) + if os.path.exists(sys_dir) and os.path.ismount(self_dir): + umount(self_dir) + log.info("Removing proc directory: {} for cleanup".format(proc_dir)) + shutil.rmtree(proc_dir) if __name__ == "__main__": @@ -214,7 +228,7 @@ def cleanup(domid): arg = parse_arg() if "attach" == arg.command: - attach(arg.device, arg.domid, arg.reset_only) + attach(arg.device, arg.domid, arg.pid, arg.reset_only) elif "detach" == arg.command: detach(arg.device, arg.domid) elif "cleanup" == arg.command: From c725281e3a2ae0a60c3b181cc469d163315965bb Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 3 Jul 2025 11:50:38 +0100 Subject: [PATCH 106/111] xapi-stdext-threads: calibrate ratio for delay times On very busy systems, the wait may take much longer than expected. Instead of hard-coding the expected value, wait once to estimate the time aded to the delays, and then use it to compare the times. Also change to use Mtime.Spans instead of using integers. 
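The calibration idea, distilled into a minimal sketch (same Mtime/Thread calls as the test code below; the 100ms figure and 10% leeway are the values used there):

```
(* Sketch: measure how much this host stretches a nominal 100ms delay. *)
let calibrated_ratio () =
  let expected = Mtime.Span.(100 * ms |> to_float_ns) in
  let counter = Mtime_clock.counter () in
  Thread.delay 0.11 (* nominal 100ms plus 10% leeway *) ;
  let actual = Mtime_clock.count counter |> Mtime.Span.to_float_ns in
  actual /. expected

(* Later comparisons scale their expected bound by the measured ratio. *)
let is_within ~ratio ~expected elapsed =
  let budget =
    Mtime.Span.to_float_ns expected *. ratio
    |> Int64.of_float
    |> Mtime.Span.of_uint64_ns
  in
  Mtime.Span.is_shorter ~than:budget elapsed
```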
Signed-off-by: Pau Ruiz Safont --- .../lib/xapi-stdext-threads/scheduler_test.ml | 94 +++++++++++-------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-threads/scheduler_test.ml b/ocaml/libs/xapi-stdext/lib/xapi-stdext-threads/scheduler_test.ml index 0a4a847403f..259a24ee260 100644 --- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-threads/scheduler_test.ml +++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-threads/scheduler_test.ml @@ -14,90 +14,106 @@ module Scheduler = Xapi_stdext_threads_scheduler.Scheduler +let calibrated_ratio () = + let expected = Mtime.Span.(100 * ms |> to_float_ns) in + let elapsed = Mtime_clock.counter () in + (* Add a 10% leeway to the ratio calculated *) + Thread.delay 0.11 ; + let actual = Mtime_clock.count elapsed |> Mtime.Span.to_float_ns in + let ratio = actual /. expected in + Alcotest.(check bool) (Printf.sprintf "ratio is %f" ratio) true true ; + ratio + let started = Atomic.make false let start_schedule () = if not (Atomic.exchange started true) then Thread.create Scheduler.loop () |> ignore -let send event data = Event.(send event data |> sync) +let send event data () = Event.(send event data |> sync) let receive event = Event.(receive event |> sync) -let elapsed_ms cnt = - let elapsed_ns = Mtime_clock.count cnt |> Mtime.Span.to_uint64_ns in - Int64.(div elapsed_ns 1000000L |> to_int) +let is_less ratio a b = + let a = + Mtime.Span.to_float_ns a + |> Float.mul ratio + |> Int64.of_float + |> Mtime.Span.of_uint64_ns + in + Mtime.Span.is_shorter ~than:a b -let is_less = Alcotest.(testable (pp int)) Stdlib.( > ) +let mtime_span () = + let cmp = is_less (calibrated_ratio ()) in + Alcotest.(testable Mtime.Span.pp) cmp let test_single () = let finished = Event.new_channel () in - Scheduler.add_to_queue "one" Scheduler.OneShot 0.001 (fun () -> - send finished true - ) ; + Scheduler.add_to_queue "one" Scheduler.OneShot 0.001 (send finished true) ; start_schedule () ; Alcotest.(check bool) "result" true (receive finished) -let test_remove_self () = +let test_remove_self mtime_span () = let which = Event.new_channel () in Scheduler.add_to_queue "self" (Scheduler.Periodic 0.001) 0.001 (fun () -> (* this should remove the periodic scheduling *) Scheduler.remove_from_queue "self" ; (* add an operation to stop the test *) - Scheduler.add_to_queue "stop" Scheduler.OneShot 0.1 (fun () -> - send which "stop" - ) ; - send which "self" + Scheduler.add_to_queue "stop" Scheduler.OneShot 0.1 (send which "stop") ; + send which "self" () ) ; start_schedule () ; - let cnt = Mtime_clock.counter () in + + let from_wait_to_receive = Mtime_clock.counter () in Alcotest.(check string) "same event name" "self" (receive which) ; Alcotest.(check string) "same event name" "stop" (receive which) ; - let elapsed_ms = elapsed_ms cnt in - Alcotest.check is_less "small time" 300 elapsed_ms -let test_empty () = + let elapsed = Mtime_clock.count from_wait_to_receive in + let expected = Mtime.Span.(300 * ms) in + Alcotest.check mtime_span "small time" expected elapsed + +let test_empty mtime_span () = let finished = Event.new_channel () in - Scheduler.add_to_queue "one" Scheduler.OneShot 0.001 (fun () -> - send finished true - ) ; + Scheduler.add_to_queue "one" Scheduler.OneShot 0.001 (send finished true) ; start_schedule () ; Alcotest.(check bool) "finished" true (receive finished) ; (* wait loop to go to wait with no work to do *) Thread.delay 0.1 ; - Scheduler.add_to_queue "two" Scheduler.OneShot 0.001 (fun () -> - send finished true - ) ; - let cnt 
= Mtime_clock.counter () in + Scheduler.add_to_queue "two" Scheduler.OneShot 0.001 (send finished true) ; + + let from_wait_to_receive = Mtime_clock.counter () in Alcotest.(check bool) "finished" true (receive finished) ; - let elapsed_ms = elapsed_ms cnt in - Alcotest.check is_less "small time" 100 elapsed_ms -let test_wakeup () = + let elapsed = Mtime_clock.count from_wait_to_receive in + let expected = Mtime.Span.(100 * ms) in + Alcotest.check mtime_span "small time" expected elapsed + +let test_wakeup mtime_span () = let which = Event.new_channel () in (* schedule a long event *) - Scheduler.add_to_queue "long" Scheduler.OneShot 2.0 (fun () -> - send which "long" - ) ; + Scheduler.add_to_queue "long" Scheduler.OneShot 2.0 (send which "long") ; start_schedule () ; (* wait loop to go to wait with no work to do *) Thread.delay 0.1 ; - let cnt = Mtime_clock.counter () in + (* schedule a quick event, should wake up the loop *) - Scheduler.add_to_queue "quick" Scheduler.OneShot 0.1 (fun () -> - send which "quick" - ) ; + Scheduler.add_to_queue "quick" Scheduler.OneShot 0.1 (send which "quick") ; + + let from_wait_to_receive_quick = Mtime_clock.counter () in Alcotest.(check string) "same event name" "quick" (receive which) ; + Scheduler.remove_from_queue "long" ; - let elapsed_ms = elapsed_ms cnt in - Alcotest.check is_less "small time" 150 elapsed_ms + let elapsed = Mtime_clock.count from_wait_to_receive_quick in + let expected = Mtime.Span.(100 * ms) in + Alcotest.check mtime_span "small time" expected elapsed let tests = + let mtime_span = mtime_span () in [ ("test_single", `Quick, test_single) - ; ("test_remove_self", `Quick, test_remove_self) - ; ("test_empty", `Quick, test_empty) - ; ("test_wakeup", `Quick, test_wakeup) + ; ("test_remove_self", `Quick, test_remove_self mtime_span) + ; ("test_empty", `Quick, test_empty mtime_span) + ; ("test_wakeup", `Quick, test_wakeup mtime_span) ] let () = Alcotest.run "Scheduler" [("generic", tests)] From 51a97e7cfb4219a3890ca05e8f55d598b318d457 Mon Sep 17 00:00:00 2001 From: Guillaume Date: Mon, 7 Jul 2025 13:47:45 +0200 Subject: [PATCH 107/111] Downgrade unknown SM.feature errors to warnings Previously, encountering unknown features such as ATOMIC_PAUSE or SR_CACHING in SM.feature would trigger an error in xapi. However, these features can be used internally by SM and are not necessarily indicative of a misconfiguration. This change downgrades such cases from error to warning, allowing normal operation while still notifying the user that an unrecognized feature is present. Signed-off-by: Guillaume --- ocaml/xapi/smint.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xapi/smint.ml b/ocaml/xapi/smint.ml index e58340b5239..1b4e4d45e47 100644 --- a/ocaml/xapi/smint.ml +++ b/ocaml/xapi/smint.ml @@ -132,7 +132,7 @@ module Feature = struct Some (feature, 1L) ) | feature :: _ -> - error "SM.feature: unknown feature %s" feature ; + warn "SM.feature: unknown feature %s" feature ; None (** [compat_features features1 features2] finds the compatible features in the input From 03980fc5e423ef93d2db1f81e041400e6e05cf75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Thu, 3 Jul 2025 15:57:59 +0000 Subject: [PATCH 108/111] CP-308455 VM.sysprep use watch to detect sysprep running Replace code that loops waiting for an update in xenstore with a watch. This should eliminate the chance of a race condition. 
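Distilled, the watch-based wait looks like the sketch below (it reuses the same Ezxenstore_core calls as the diff that follows; the path argument is illustrative):

```
(* Sketch: block on a xenstore watch with a timeout instead of
   re-reading the key in a sleep loop. *)
let wait_for_running ~path =
  let open Ezxenstore_core.Xenstore in
  let module Watch = Ezxenstore_core.Watch in
  with_xs (fun xs ->
      try
        Watch.(wait_for ~xs ~timeout:5.0 (value_to_become path "running")) ;
        "running"
      with Watch.Timeout _ -> xs.Xs.read path
  )
```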
Signed-off-by: Christian Lindig --- ocaml/xapi/vm_sysprep.ml | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/ocaml/xapi/vm_sysprep.ml b/ocaml/xapi/vm_sysprep.ml index 3db126dae08..effdecabd83 100644 --- a/ocaml/xapi/vm_sysprep.ml +++ b/ocaml/xapi/vm_sysprep.ml @@ -209,25 +209,20 @@ let find_vdi ~__context ~label = file. *) let trigger ~domid ~uuid = let open Ezxenstore_core.Xenstore in + let module Watch = Ezxenstore_core.Watch in let control = Printf.sprintf "/local/domain/%Ld/control/sysprep" domid in with_xs (fun xs -> xs.Xs.write (control // "filename") "D://unattend.xml" ; xs.Xs.write (control // "vdi-uuid") uuid ; xs.Xs.write (control // "action") "sysprep" ; debug "%s: notified domain %Ld" __FUNCTION__ domid ; - let rec wait n = - match (n, xs.Xs.read (control // "action")) with - | _, "running" -> - "running" - | n, action when n < 0 -> - action - | _, _ -> - Thread.delay 1.0 ; - wait (n - 1) - in - (* wait up to 5 iterations for runnung to appear or report whatever - is the status at the end *) - wait 5 + try + Watch.( + wait_for ~xs ~timeout:5.0 + (value_to_become (control // "action") "running") + ) ; + "running" + with Watch.Timeout _ -> xs.Xs.read (control // "action") ) (* This function is executed on the host where [vm] is running *) @@ -267,6 +262,7 @@ let sysprep ~__context ~vm ~unattend = match trigger ~domid ~uuid with | "running" -> debug "%s: sysprep running, ejecting CD" __FUNCTION__ ; + Thread.delay 1.0 ; Client.VBD.eject ~rpc ~session_id ~vbd ; Sys.remove iso | status -> From 5416081c9f0ae28d4eaea455c0d1aeda6f8a91f7 Mon Sep 17 00:00:00 2001 From: Lin Liu Date: Mon, 7 Jul 2025 10:53:18 +0800 Subject: [PATCH 109/111] CA-393417: Fix CI failure - Add unitest for usb_reset for coverage - Move mock to unittest.mock as python3 only now - exit -> sys.exit Signed-off-by: Lin Liu --- python3/libexec/usb_reset.py | 20 +++--- python3/tests/import_helper.py | 4 +- python3/tests/test_usb_reset.py | 109 ++++++++++++++++++++++++++++++++ python3/tests/test_usb_scan.py | 10 +-- 4 files changed, 127 insertions(+), 16 deletions(-) create mode 100644 python3/tests/test_usb_reset.py diff --git a/python3/libexec/usb_reset.py b/python3/libexec/usb_reset.py index 010fd134862..941259d6182 100755 --- a/python3/libexec/usb_reset.py +++ b/python3/libexec/usb_reset.py @@ -22,7 +22,8 @@ # if without -r, do step 2~3 # 2. if it's the first USB device to pass-through # a) bind mount /sys in chroot directory (/var/xen/qemu/root-) -# b) clone (create the device with same major/minor number and mode) in chroot directory with same path +# b) clone (create the device with same major/minor number and mode) +# in chroot directory with same path # c) bind mount /proc/ to chroot directory (/var/xen/qemu/root-/proc/self) # 3. 
set device file uid/gid to (qemu_base + dom-id) # @@ -44,13 +45,14 @@ import ctypes.util import fcntl import grp -import xcp.logger as log # pytype: disable=import-error import logging import os import pwd import re import shutil +import sys +import xcp.logger as log # pytype: disable=import-error def parse_arg(): parser = argparse.ArgumentParser( @@ -94,14 +96,14 @@ def dev_path(device): pat = re.compile(r"\d+-\d+(\.\d+)*$") if pat.match(device) is None: log.error("Unexpected device node: {}".format(device)) - exit(1) + sys.exit(1) try: bus = read_int("/sys/bus/usb/devices/{}/busnum".format(device)) dev = read_int("/sys/bus/usb/devices/{}/devnum".format(device)) return "/dev/bus/usb/{0:03d}/{1:03d}".format(bus, dev) except (IOError, ValueError) as e: log.error("Failed to get device path {}: {}".format(device, str(e))) - exit(1) + sys.exit(1) def mount(source, target, fs, flags=0): @@ -110,7 +112,7 @@ def mount(source, target, fs, flags=0): log.error("Failed to mount {} ({}) to {} with flags {}: {}". format(source, fs, target, flags, os.strerror(ctypes.get_errno()))) - exit(1) + sys.exit(1) def umount(target): @@ -140,7 +142,7 @@ def clone_device(path, root_dir, domid): st = os.stat(path) except OSError as e: log.error("Failed to get stat of {}: {}".format(path, str(e))) - exit(1) + sys.exit(1) mode = st.st_mode major = os.major(st.st_rdev) @@ -154,7 +156,7 @@ def clone_device(path, root_dir, domid): grp.getgrnam("qemu_base").gr_gid + domid) except OSError as e: log.error("Failed to chown device file {}: {}".format(path, str(e))) - exit(1) + sys.exit(1) def attach(device, domid, pid, reset_only): @@ -177,7 +179,7 @@ def attach(device, domid, pid, reset_only): dev_dir = root_dir + "/dev" if not os.path.isdir(root_dir) or not os.path.isdir(dev_dir): log.error("Error: The chroot or dev directory doesn't exist") - exit(1) + sys.exit(1) clone_device(path, root_dir, domid) @@ -235,4 +237,4 @@ def cleanup(domid): cleanup(arg.domid) else: log.error("Unexpected command: {}".format(arg.command)) - exit(1) + sys.exit(1) diff --git a/python3/tests/import_helper.py b/python3/tests/import_helper.py index 2fdbd922b95..6e1c5946558 100644 --- a/python3/tests/import_helper.py +++ b/python3/tests/import_helper.py @@ -5,7 +5,7 @@ from types import ModuleType from typing import Generator -from mock import Mock +from unittest.mock import MagicMock @contextmanager @@ -28,7 +28,7 @@ def mocked_modules(*module_names: str) -> Generator[None, None, None]: ``` """ for module_name in module_names: - sys.modules[module_name] = Mock() + sys.modules[module_name] = MagicMock() yield for module_name in module_names: sys.modules.pop(module_name) diff --git a/python3/tests/test_usb_reset.py b/python3/tests/test_usb_reset.py new file mode 100644 index 00000000000..43dae790cb1 --- /dev/null +++ b/python3/tests/test_usb_reset.py @@ -0,0 +1,109 @@ +import unittest +from unittest import mock +from unittest.mock import MagicMock +import sys + +# some mocked arguemtn is not used in the tests, but as side-effects +# disabled pylint warning for unused arguments +# pylint: disable=unused-argument + +from python3.tests.import_helper import import_file_as_module +# mock modules to avoid dependencies +sys.modules["xcp"] = MagicMock() +sys.modules["xcp.logger"] = MagicMock() +usb_reset = import_file_as_module("python3/libexec/usb_reset.py") + + +class TestUsbReset(unittest.TestCase): + @mock.patch("usb_reset.open", new_callable=mock.mock_open, read_data="5\n") + def test_read_int(self, mock_open): + 
self.assertEqual(usb_reset.read_int("/fake/path"), 5) + mock_open.assert_called_with("/fake/path") + + @mock.patch("usb_reset.read_int", side_effect=[1, 2]) + @mock.patch("usb_reset.log") + def test_dev_path_valid(self, mock_log, mock_read_int): + device = "1-2.3" + path = usb_reset.dev_path(device) + self.assertEqual(path, "/dev/bus/usb/001/002") + mock_log.error.assert_not_called() + + @mock.patch("usb_reset.log") + def test_dev_path_invalid(self, mock_log): + with self.assertRaises(SystemExit): + usb_reset.dev_path("invalid-device") + mock_log.error.assert_called() + + @mock.patch("usb_reset.ctypes.CDLL") + @mock.patch("usb_reset.ctypes.util.find_library", return_value="libc.so.6") + @mock.patch("usb_reset.log") + def test_mount_success(self, mock_log, mock_find_lib, mock_cdll): + mock_cdll.return_value.mount.return_value = 0 + usb_reset.mount("src", "tgt", "fs") + mock_cdll.return_value.mount.assert_called() + + @mock.patch("usb_reset.ctypes.CDLL") + @mock.patch("usb_reset.ctypes.util.find_library", return_value="libc.so.6") + @mock.patch("usb_reset.log") + def test_mount_fail(self, mock_log, mock_find_lib, mock_cdll): + mock_cdll.return_value.mount.return_value = -1 + with self.assertRaises(SystemExit): + usb_reset.mount("src", "tgt", "fs") + mock_log.error.assert_called() + + @mock.patch("usb_reset.ctypes.CDLL") + @mock.patch("usb_reset.ctypes.util.find_library", return_value="libc.so.6") + @mock.patch("usb_reset.log") + def test_umount(self, mock_log, mock_find_lib, mock_cdll): + mock_cdll.return_value.umount.return_value = -1 + usb_reset.umount("tgt") + mock_log.error.assert_called() + + @mock.patch("usb_reset.os") + @mock.patch("usb_reset.pwd.getpwnam") + @mock.patch("usb_reset.grp.getgrnam") + @mock.patch("usb_reset.log") + def test_clone_device(self, mock_log, mock_grp, mock_pwd, mock_os): + mock_os.path.exists.return_value = False + mock_os.path.sep = "/" + mock_os.stat.return_value.st_mode = 0o600 + mock_os.stat.return_value.st_rdev = 0 + mock_os.major.return_value = 1 + mock_os.minor.return_value = 2 + mock_os.makedev.return_value = 1234 + mock_pwd.return_value.pw_uid = 1000 + mock_grp.return_value.gr_gid = 1000 + usb_reset.clone_device("/dev/bus/usb/001/002", "/root", 1) + mock_os.mknod.assert_called() + mock_os.chown.assert_called() + + @mock.patch("usb_reset.dev_path", return_value="/dev/bus/usb/001/002") + @mock.patch("usb_reset.open", new_callable=mock.mock_open) + @mock.patch("usb_reset.fcntl.ioctl") + @mock.patch("usb_reset.log") + def test_attach_reset_only(self, mock_log, mock_ioctl, mock_open, mock_dev_path): + usb_reset.attach("1-2", 1, 123, True) + mock_open.assert_called() + mock_ioctl.assert_called() + + @mock.patch("usb_reset.dev_path", return_value="/dev/bus/usb/001/002") + @mock.patch("usb_reset.os.remove") + @mock.patch("usb_reset.get_root_dir", return_value="/root") + def test_detach(self, mock_get_root_dir, mock_remove, mock_dev_path): + usb_reset.detach("1-2", 1) + mock_remove.assert_called() + + @mock.patch("usb_reset.shutil.rmtree") + @mock.patch("usb_reset.os.path.isdir", return_value=True) + @mock.patch("usb_reset.os.path.exists", return_value=True) + @mock.patch("usb_reset.os.path.ismount", return_value=True) + @mock.patch("usb_reset.umount") + @mock.patch("usb_reset.log") + #pylint: disable=too-many-arguments + def test_cleanup(self, mock_log, mock_umount, mock_ismount, + mock_exists, mock_isdir, mock_rmtree): + usb_reset.cleanup(1) + mock_rmtree.assert_called() + +if __name__ == "__main__": + unittest.main() diff --git 
a/python3/tests/test_usb_scan.py b/python3/tests/test_usb_scan.py index 8b886194c74..45bfc78e569 100644 --- a/python3/tests/test_usb_scan.py +++ b/python3/tests/test_usb_scan.py @@ -9,14 +9,14 @@ import unittest from collections.abc import Mapping from typing import cast +from unittest.mock import MagicMock -import mock from python3.tests.import_helper import import_file_as_module # mock modules to avoid dependencies -sys.modules["xcp"] = mock.Mock() -sys.modules["xcp.logger"] = mock.Mock() -sys.modules["pyudev"] = mock.Mock() +sys.modules["xcp"] = MagicMock() +sys.modules["xcp.logger"] = MagicMock() +sys.modules["pyudev"] = MagicMock() usb_scan = import_file_as_module("python3/libexec/usb_scan.py") @@ -90,7 +90,7 @@ def mock_setup(mod, devices, interfaces, path): mod.log.error = verify_log mod.log.debug = verify_log mod.Policy._PATH = path - mod.pyudev.Context = mock.Mock( + mod.pyudev.Context = MagicMock( return_value=MocContext(devices, interfaces)) From 41cb7df40230ddc6d13f5729e49bf9f7b848112f Mon Sep 17 00:00:00 2001 From: Bengang Yuan Date: Tue, 8 Jul 2025 09:05:53 +0100 Subject: [PATCH 110/111] datamodel_lifecycle: automatic update Signed-off-by: Bengang Yuan --- ocaml/idl/datamodel_lifecycle.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/idl/datamodel_lifecycle.ml b/ocaml/idl/datamodel_lifecycle.ml index 2880742c9cd..cf4d59eae47 100644 --- a/ocaml/idl/datamodel_lifecycle.ml +++ b/ocaml/idl/datamodel_lifecycle.ml @@ -240,7 +240,7 @@ let prototyped_of_message = function | "host", "set_numa_affinity_policy" -> Some "24.0.0" | "VM", "sysprep" -> - Some "25.23.0-next" + Some "25.24.0" | "VM", "get_secureboot_readiness" -> Some "24.17.0" | "VM", "set_uefi_mode" -> From 12826e8d0b4e5fda224c96e62305294eb76757cc Mon Sep 17 00:00:00 2001 From: Bengang Yuan Date: Tue, 8 Jul 2025 11:30:50 +0100 Subject: [PATCH 111/111] CA-413412: Fail to designate new master The user attempted to designate a new master, but the operation failed. The root cause is as follows: After the new proposed master successfully sent the `commit_new_master` API call to the old master, it attempted to send a `logout` request. However, at this point, the old master was already rebooting its xapi service, causing the `logout` to fail. As a result, the process of designating the new master was marked as failed, and the status changed to `broken`. In high-load environments, there can be a delay in sending the logout request, increasing the likelihood that it is sent after the old master has already started rebooting. If `commit_new_master` has already been successful, the success of the subsequent `logout` operation should not be considered critical. Therefore, the solution is to ignore the result of the `logout` request if `commit_new_master` was successful. 
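The shape of the fix, as a sketch (assuming the finally/debug/Client bindings already in scope in helpers.ml): the cleanup handler still attempts the logout, but a failure there can no longer mask a successful commit_new_master.

```
(* Sketch: the best-effort logout must not override the result of [f]. *)
let with_emergency_session ~rpc ~session_id f =
  finally
    (fun () -> f rpc session_id)
    (fun () ->
      try Client.Client.Session.local_logout ~rpc ~session_id
      with _ ->
        (* emergency-mode call: the peer may already be restarting xapi *)
        debug "ignoring logout failure after emergency mode call"
    )
```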
Signed-off-by: Bengang Yuan --- ocaml/xapi/helpers.ml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi/helpers.ml b/ocaml/xapi/helpers.ml index 318ddfecf8d..aff1b815566 100644 --- a/ocaml/xapi/helpers.ml +++ b/ocaml/xapi/helpers.ml @@ -591,6 +591,7 @@ let call_api_functions ~__context f = call_api_functions_internal ~__context f let call_emergency_mode_functions hostname f = + let __FUN = __FUNCTION__ in let open Xmlrpc_client in let transport = SSL @@ -609,7 +610,13 @@ let call_emergency_mode_functions hostname f = in finally (fun () -> f rpc session_id) - (fun () -> Client.Client.Session.local_logout ~rpc ~session_id) + (fun () -> + try Client.Client.Session.local_logout ~rpc ~session_id + with _ -> + (* This is an emergency mode function, so we don't care about the error + in logout *) + debug "%s: The logout failed in emergency mode function" __FUN + ) let is_domain_zero_with_record ~__context vm_ref vm_rec = let host_ref = vm_rec.API.vM_resident_on in