diff --git a/ocaml/idl/datamodel.ml b/ocaml/idl/datamodel.ml index 6eb869ca3a6..f3d48952dbf 100644 --- a/ocaml/idl/datamodel.ml +++ b/ocaml/idl/datamodel.ml @@ -963,7 +963,9 @@ module PIF = struct ~params: [ Ref _pif, "self", "Reference to the object" ; Bool, "value", "New value to set" ] ~allowed_roles:_R_POOL_OP - ~errs:[Api_errors.clustering_enabled_on_network] + ~errs:Api_errors.([ other_operation_in_progress + ; clustering_enabled + ]) () let ip_configuration_mode = Enum ("ip_configuration_mode", @@ -983,7 +985,7 @@ module PIF = struct ] ~in_product_since:rel_miami ~allowed_roles:_R_POOL_OP - ~errs:[Api_errors.clustering_enabled_on_network] + ~errs:Api_errors.([ clustering_enabled ]) () let ipv6_configuration_mode = Enum ("ipv6_configuration_mode", @@ -1003,7 +1005,7 @@ module PIF = struct ] ~lifecycle:[Prototyped, rel_tampa, ""] ~allowed_roles:_R_POOL_OP - ~errs:[Api_errors.clustering_enabled_on_network] + ~errs:Api_errors.([ clustering_enabled ]) () let primary_address_type = Enum ("primary_address_type", @@ -1051,8 +1053,9 @@ module PIF = struct ~params:[Ref _pif, "self", "The PIF object to destroy"] ~in_product_since:rel_miami ~allowed_roles:_R_POOL_OP - ~errs:[ Api_errors.pif_tunnel_still_exists - ; Api_errors.clustering_enabled_on_network] + ~errs:Api_errors.([ pif_tunnel_still_exists + ; clustering_enabled + ]) () let pool_introduce_params first_rel = diff --git a/ocaml/idl/datamodel_cluster.ml b/ocaml/idl/datamodel_cluster.ml index f0ae7276fc8..fe81eb3d735 100644 --- a/ocaml/idl/datamodel_cluster.ml +++ b/ocaml/idl/datamodel_cluster.ml @@ -17,9 +17,24 @@ let cluster_operation = let lifecycle = [Prototyped, rel_kolkata, ""] -let timeout_params = [ - {param_type=Float; param_name="token_timeout"; param_doc="Corosync token timeout in seconds"; param_release=kolkata_release; param_default=Some(VFloat Constants.default_token_timeout_s)}; - {param_type=Float; param_name="token_timeout_coefficient"; param_doc="Corosync token timeout coefficient in seconds"; param_release=kolkata_release; param_default=Some(VFloat Constants.default_token_timeout_coefficient_s)}; +let lifecycle_timeout = [ + Prototyped, rel_kolkata, "the unit is milliseconds"; + Changed, rel_lima, "the unit is now seconds" + ] + +let timeout_params = + [ {param_type=Float; + param_name="token_timeout"; + param_doc="Corosync token timeout in seconds"; + param_release=kolkata_release; + param_default=Some(VFloat Constants.default_token_timeout_s)}; + + {param_type=Float; + param_name="token_timeout_coefficient"; + param_doc="Corosync token timeout coefficient in seconds"; + param_release=kolkata_release; + param_default=Some(VFloat Constants.default_token_timeout_coefficient_s)}; + ] @@ -28,10 +43,25 @@ let create = call ~doc:"Creates a Cluster object and one Cluster_host object as its first member" ~result:(Ref _cluster, "the new Cluster") ~versioned_params: - ([{param_type=Ref _network; param_name="network"; param_doc="the single network on which corosync carries out its inter-host communications"; param_release=kolkata_release; param_default=None}; - {param_type=String; param_name="cluster_stack"; param_doc="simply the string 'corosync'. 
No other cluster stacks are currently supported"; param_release=kolkata_release; param_default=None}; - {param_type=Bool; param_name="pool_auto_join"; param_doc="true if xapi is automatically joining new pool members to the cluster"; param_release=kolkata_release; param_default=None}; - ] @timeout_params) + ([{param_type=(Ref _pif); + param_name="PIF"; + param_doc="The PIF to connect the cluster's first cluster_host to"; + param_release=kolkata_release; + param_default=None}; + + {param_type=String; + param_name="cluster_stack"; + param_doc="simply the string 'corosync'. No other cluster stacks are currently supported"; + param_release=kolkata_release; + param_default=None}; + + {param_type=Bool; + param_name="pool_auto_join"; + param_doc="true if xapi is automatically joining new pool members to the cluster"; + param_release=kolkata_release; + param_default=None}; + + ] @timeout_params) ~lifecycle ~allowed_roles:_R_POOL_ADMIN () @@ -46,14 +76,36 @@ let destroy = call ~allowed_roles:_R_POOL_ADMIN () +let get_network = call + ~name:"get_network" + ~doc:("Returns the network used by the cluster for inter-host communication, " ^ + "i.e. the network shared by all cluster host PIFs") + ~result:(Ref _network, "network of cluster") + ~params: + [ Ref _cluster, "self", "the Cluster with the network" + ] + ~lifecycle + ~allowed_roles:_R_POOL_ADMIN + () + let pool_create = call ~name:"pool_create" ~doc:"Attempt to create a Cluster from the entire pool" ~result:(Ref _cluster, "the new Cluster") ~versioned_params: - ([{param_type=Ref _network; param_name="network"; param_doc="the single network on which corosync carries out its inter-host communications"; param_release=kolkata_release; param_default=None}; - {param_type=String; param_name="cluster_stack"; param_doc="simply the string 'corosync'. No other cluster stacks are currently supported"; param_release=kolkata_release; param_default=None}; - ] @ timeout_params) + ([{param_type=Ref _network; + param_name="network"; + param_doc="the single network on which corosync carries out its inter-host communications"; + param_release=kolkata_release; + param_default=None}; + + {param_type=String; + param_name="cluster_stack"; + param_doc="simply the string 'corosync'. No other cluster stacks are currently supported"; + param_release=kolkata_release; + param_default=None}; + + ] @ timeout_params) ~lifecycle ~allowed_roles:_R_POOL_ADMIN () @@ -79,12 +131,12 @@ let pool_destroy = call () let pool_resync = call - ~name:"pool_resync" - ~doc:"Resynchronise the cluster_host objects across the pool. Creates them where they need creating and then plugs them" - ~params:[ Ref _cluster, "self", "The cluster to resync"] - ~lifecycle - ~allowed_roles:_R_POOL_ADMIN - () + ~name:"pool_resync" + ~doc:"Resynchronise the cluster_host objects across the pool. 
Creates them where they need creating and then plugs them" + ~params:[ Ref _cluster, "self", "The cluster to resync"] + ~lifecycle + ~allowed_roles:_R_POOL_ADMIN + () let t = create_obj @@ -105,9 +157,10 @@ let t = ~ty:(Set (Ref _cluster_host)) "cluster_hosts" "A list of the cluster_host objects associated with the Cluster" - ; field ~qualifier:StaticRO ~lifecycle - ~ty:(Ref _network) "network" ~default_value:(Some (VRef null_ref)) - "Reference to the single network on which corosync carries out its inter-host communications" + ; field ~qualifier:DynamicRO ~lifecycle:[ Prototyped, rel_lima, "" ] + ~ty:(Set String) "pending_forget" ~default_value:(Some (VSet [])) + "Internal field used by Host.destroy to store the IP of cluster members \ + marked as permanently dead but not yet removed" ; field ~qualifier:StaticRO ~lifecycle ~ty:String "cluster_token" ~default_value:(Some (VString "")) @@ -119,17 +172,17 @@ let t = ] @ (allowed_and_current_operations cluster_operation) @ [ - field ~qualifier:StaticRO ~lifecycle + field ~qualifier:StaticRO ~lifecycle ~ty:Bool "pool_auto_join" ~default_value:(Some (VBool true)) - "True if xapi is automatically joining new pool members to the cluster. This will be `true` in the first release" + "True if automatically joining new pool members to the cluster. This will be `true` in the first release" - ; field ~qualifier:StaticRO ~lifecycle - ~ty:Int "token_timeout" ~default_value:(Some (VInt 20000L)) - "The corosync token timeout in ms" + ; field ~qualifier:StaticRO ~lifecycle:lifecycle_timeout + ~ty:Float "token_timeout" ~default_value:(Some (VFloat Constants.default_token_timeout_s)) + "The corosync token timeout in seconds" - ; field ~qualifier:StaticRO ~lifecycle - ~ty:Int "token_timeout_coefficient" ~default_value:(Some (VInt 1000L)) - "The corosync token timeout coefficient in ms" + ; field ~qualifier:StaticRO ~lifecycle:lifecycle_timeout + ~ty:Float "token_timeout_coefficient" ~default_value:(Some (VFloat Constants.default_token_timeout_coefficient_s)) + "The corosync token timeout coefficient in seconds" ; field ~qualifier:StaticRO ~lifecycle @@ -143,6 +196,7 @@ let t = ~messages: [ create ; destroy + ; get_network ; pool_create ; pool_force_destroy ; pool_destroy diff --git a/ocaml/idl/datamodel_cluster_host.ml b/ocaml/idl/datamodel_cluster_host.ml index 16589fbded4..622c994c854 100644 --- a/ocaml/idl/datamodel_cluster_host.ml +++ b/ocaml/idl/datamodel_cluster_host.ml @@ -20,6 +20,7 @@ let create = call ~params: [ Ref _cluster, "cluster", "Cluster to join" ; Ref _host, "host", "new cluster member" + ; Ref _pif, "pif", "Network interface to use for communication" ] ~lifecycle ~allowed_roles:_R_POOL_ADMIN @@ -65,6 +66,17 @@ let disable = call ~allowed_roles:_R_POOL_ADMIN () +let forget = call + ~name:"forget" + ~doc:"Permanently remove a dead host from the cluster. This host must never rejoin the cluster." 
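(* A hedged sketch of the forget / pending_forget flow this hidden call
   supports, assuming the generated Db accessors for the new
   Cluster.pending_forget set field; the shipped logic lives in
   Xapi_cluster_host and Host.destroy:

   let forget_or_defer ~__context ~self =
     let cluster = Db.Cluster_host.get_cluster ~__context ~self in
     let pIF = Db.Cluster_host.get_PIF ~__context ~self in
     let ip = Db.PIF.get_IP ~__context ~self:pIF in
     try
       Helpers.call_api_functions ~__context (fun rpc session_id ->
         Client.Client.Cluster_host.forget ~rpc ~session_id ~self)
     with _ ->
       (* the remaining members were not all alive, so the forget could not
          go through yet: record the IP and retry later, e.g. from the
          Cluster_host GC *)
       Db.Cluster.add_pending_forget ~__context ~self:cluster ~value:ip
*)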
+ ~params: + [ Ref _cluster_host, "self", "the cluster_host to declare permanently dead and forget" + ] + ~lifecycle:[Prototyped, rel_lima, ""] + ~allowed_roles:_R_LOCAL_ROOT_ONLY + ~hide_from_docs:true + () + let t = create_obj ~name: _cluster_host @@ -92,12 +104,20 @@ let t = ~ty:Bool "enabled" ~default_value:(Some (VBool false)) "Whether the cluster host believes that clustering should be enabled on this host" + ; field ~qualifier:StaticRO ~lifecycle + ~ty:(Ref _pif) "PIF" ~default_value:(Some (VRef null_ref)) + "Reference to the PIF object" + + ; field ~qualifier:StaticRO ~lifecycle + ~ty:Bool "joined" ~default_value:(Some (VBool true)) + "Whether the cluster host has joined the cluster" + (* TODO: add `live` member to represent whether corosync believes that this cluster host actually is enabled *) - + ] @ (allowed_and_current_operations cluster_host_operation) @ [ - field ~qualifier:StaticRO ~lifecycle + field ~qualifier:StaticRO ~lifecycle ~ty:(Map(String, String)) "other_config" ~default_value:(Some (VMap [])) "Additional configuration" ]) @@ -106,6 +126,7 @@ let t = ; destroy ; enable ; force_destroy + ; forget ; disable ] () diff --git a/ocaml/idl/datamodel_common.ml b/ocaml/idl/datamodel_common.ml index 89881669444..57ea2d2ace6 100644 --- a/ocaml/idl/datamodel_common.ml +++ b/ocaml/idl/datamodel_common.ml @@ -4,10 +4,11 @@ open Datamodel_types open Datamodel_roles (* IMPORTANT: Please bump schema vsn if you change/add/remove a _field_. - You do not have to bump vsn if you change/add/remove a message *) - + You do not have to bump vsn if you change/add/remove a message + When introducing a new release, bump the schema minor version to the next hundred + to leave a gap for potential hotfixes needing to increment the schema version.*) let schema_major_vsn = 5 -let schema_minor_vsn = 142 +let schema_minor_vsn = 202 (* Historical schema versions just in case this is useful later *) let rio_schema_major_vsn = 5 @@ -189,6 +190,12 @@ let get_product_releases in_product_since = | x::xs -> go_through_release_order xs in go_through_release_order release_order +let lima_release = + { internal = get_product_releases rel_lima + ; opensource = get_oss_releases None + ; internal_deprecated_since = None + } + let kolkata_release = { internal = get_product_releases rel_kolkata ; opensource = get_oss_releases None diff --git a/ocaml/idl/datamodel_errors.ml b/ocaml/idl/datamodel_errors.ml index f8d906c1982..ffc9fec88a3 100644 --- a/ocaml/idl/datamodel_errors.ml +++ b/ocaml/idl/datamodel_errors.ml @@ -1134,12 +1134,18 @@ let _ = ~doc:"The host does not have a Cluster_host with a compatible cluster stack." (); error Api_errors.cluster_force_destroy_failed ["cluster"] ~doc:"Force destroy failed on a Cluster_host while force destroying the cluster." (); - error Api_errors.clustering_enabled_on_network ["network"] - ~doc:"The network has cluster objects attached." (); error Api_errors.cluster_stack_in_use ["cluster_stack"] ~doc:"The cluster stack is already in use." (); error Api_errors.invalid_cluster_stack [ "cluster_stack" ] - ~doc:"The cluster stack provided is not supported." () + ~doc:"The cluster stack provided is not supported." (); + error Api_errors.pif_not_attached_to_host [ "pif"; "host" ] + ~doc:"Cluster_host creation failed as the PIF provided is not attached to the host." (); + error Api_errors.cluster_host_not_joined [ "cluster_host" ] + ~doc:"Cluster_host operation failed as the cluster_host has not joined the cluster." 
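(* A sketch of the guards the two new errors above are raised from; the
   helper names are hypothetical, only the error codes and their parameter
   lists come from the definitions above:

   let assert_pif_attached_to ~__context ~pIF ~host =
     if Db.PIF.get_host ~__context ~self:pIF <> host
        || not (Db.PIF.get_currently_attached ~__context ~self:pIF)
     then raise Api_errors.(Server_error (pif_not_attached_to_host,
       [ Ref.string_of pIF; Ref.string_of host ]))

   let assert_cluster_host_joined ~__context ~self =
     if not (Db.Cluster_host.get_joined ~__context ~self)
     then raise Api_errors.(Server_error (cluster_host_not_joined,
       [ Ref.string_of self ]))
*)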
(); + error Api_errors.cluster_host_is_last ["cluster_host"] + ~doc:"The last cluster host cannot be destroyed. Destroy the cluster instead" () + + let _ = message (fst Api_messages.ha_pool_overcommitted) ~doc:"Pool has become overcommitted: it can no longer guarantee to restart protected VMs if the configured number of hosts fail." (); diff --git a/ocaml/idl/datamodel_types.ml b/ocaml/idl/datamodel_types.ml index 216332843e1..15500aab62c 100644 --- a/ocaml/idl/datamodel_types.ml +++ b/ocaml/idl/datamodel_types.ml @@ -59,6 +59,7 @@ let rel_honolulu = "honolulu" let rel_inverness = "inverness" let rel_jura = "jura" let rel_kolkata = "kolkata" +let rel_lima = "lima" type api_release = { code_name: string option; @@ -204,6 +205,11 @@ let release_order_full = [{ version_major = 2; version_minor = 10; branding = "XenServer 7.5"; + }; { + code_name = Some rel_lima; + version_major = 2; + version_minor = 11; + branding = "Unreleased"; }; ] (* When you add a new release, use the version number of the latest release, diff --git a/ocaml/quicktest/quicktest.ml b/ocaml/quicktest/quicktest.ml index fe93bf2759d..45873a56e86 100644 --- a/ocaml/quicktest/quicktest.ml +++ b/ocaml/quicktest/quicktest.ml @@ -21,7 +21,6 @@ let () = let suite = [ "Quicktest_example", Quicktest_example.tests s ; "cbt", Quicktest_cbt.tests s - ; "reconfigure-ip-cluster", Quicktest_cluster.tests s ; "event", Quicktest_event.tests s ; "import_raw_vdi", Quicktest_import_raw_vdi.tests s ; "copy", Quicktest_vdi_copy.tests s diff --git a/ocaml/quicktest/quicktest_cluster.ml b/ocaml/quicktest/quicktest_cluster.ml index 8a7b2dd886f..d105c74f194 100644 --- a/ocaml/quicktest/quicktest_cluster.ml +++ b/ocaml/quicktest/quicktest_cluster.ml @@ -1,78 +1,85 @@ -open Quicktest_common -(* provide test record type and make_test, start, - * debug, success, and failed test functions *) - -open Client -(* provide rpc ref *) - -module C = Client +module Q = Quicktest_common +module C = Client.Client let is_empty = function | [] -> true | _ -> false +let rpc = !Q.rpc + +(* [Q.failed test.Q.name string_of_failure] removes [test] from a test Hashtbl + * and is therefore only called once, in the try-with statement. 
+ * This exception is raised within the try-with body to trigger + * [Q.failed test string_of_failure] *) +exception Abort_test of string -(** --- Helpers for reconfiguration --- *) +(** --- Helpers for IP reconfiguration tests --- *) let reconfigure_ipv4 ~session_id ~self ~dNS = - let netmask = C.PIF.get_netmask ~session_id ~rpc:!rpc ~self in - let iP = C.PIF.get_IP ~session_id ~rpc:!rpc ~self in - let gateway = C.PIF.get_gateway ~session_id ~rpc:!rpc ~self in - let mode = C.PIF.get_ip_configuration_mode ~session_id ~rpc:!rpc ~self in - C.PIF.reconfigure_ip ~session_id ~rpc:!rpc ~self ~iP ~dNS ~gateway ~netmask ~mode + let netmask = C.PIF.get_netmask ~session_id ~rpc ~self in + let iP = C.PIF.get_IP ~session_id ~rpc ~self in + let gateway = C.PIF.get_gateway ~session_id ~rpc ~self in + let mode = C.PIF.get_ip_configuration_mode ~session_id ~rpc ~self in + C.PIF.reconfigure_ip ~session_id ~rpc ~self ~iP ~dNS ~gateway ~netmask ~mode let reconfigure_ipv6 ~session_id ~self ~dNS = (* confirm valid IPv6 strings exist *) - let iPv6_lst = (C.PIF.get_IPv6 ~session_id ~rpc:!rpc ~self) |> List.filter ((<>) "") in + let iPv6_lst = (C.PIF.get_IPv6 ~session_id ~rpc ~self) |> List.filter ((<>) "") in if is_empty iPv6_lst - then Alcotest.fail "No valid IPv6 strings exist."; + then raise (Abort_test "No valid IPv6 strings exist."); - let gateway = C.PIF.get_ipv6_gateway ~session_id ~rpc:!rpc ~self in - let mode = C.PIF.get_ipv6_configuration_mode ~session_id ~rpc:!rpc ~self in + let gateway = C.PIF.get_ipv6_gateway ~session_id ~rpc ~self in + let mode = C.PIF.get_ipv6_configuration_mode ~session_id ~rpc ~self in let iPv6 = List.hd iPv6_lst in - C.PIF.reconfigure_ipv6 ~session_id ~rpc:!rpc ~self ~iPv6 ~dNS ~gateway ~mode + C.PIF.reconfigure_ipv6 ~session_id ~rpc ~self ~iPv6 ~dNS ~gateway ~mode (** --- Test skeleton, receives environment params before running --- *) let test_reconfigure_ip ~ipv6 ~session_id ~(self : API.ref_PIF) = let ip_string = if ipv6 then "IPv6" else "IPv4" in - Printf.printf "Testing reconfiguring %s with clustering.\n" ip_string; + let test = + Q.make_test (Printf.sprintf "Testing reconfiguring %s with clustering." ip_string) 4 + in try - let dNS = C.PIF.get_DNS ~session_id ~rpc:!rpc ~self in + Q.start test; + + let dNS = C.PIF.get_DNS ~session_id ~rpc ~self in if ipv6 then reconfigure_ipv6 ~session_id ~self ~dNS else reconfigure_ipv4 ~session_id ~self ~dNS; - Alcotest.fail "PIF.reconfigure_ip should raise clustering_enabled_on_network." + Q.failed test "PIF.reconfigure_ip should raise CLUSTERING_ENABLED" with - | Api_errors.(Server_error(code,_)) when code=Api_errors.clustering_enabled_on_network - -> print_endline (Printf.sprintf "%s raised as expected." Api_errors.clustering_enabled_on_network) + | Api_errors.(Server_error(code,_)) when code=Api_errors.clustering_enabled + -> Q.debug test (Printf.sprintf "%s raised as expected." 
Api_errors.clustering_enabled); + Q.success test | Api_errors.(Server_error(_,_)) -> () (* Don't fail on other API errors, only test clustering *) + | Abort_test s -> Q.failed test s + | e -> Q.failed test (ExnHelper.string_of_exn e) (** --- Check environment before calling test --- *) -let test session_id () = - print_endline "Testing IP reconfiguration with and without clustering."; - print_newline (); - print_newline (); - let pifs = Client.PIF.get_all ~session_id ~rpc:!rpc in - - List.iter - (fun self -> - let clustering = - let network = C.PIF.get_network ~session_id ~rpc:!rpc ~self in - C.Cluster.get_all ~session_id ~rpc:!rpc - |> List.filter - (fun cluster -> (C.Cluster.get_network ~session_id ~rpc:!rpc ~self:cluster) = network) - |> (fun lst -> not (is_empty lst)) - in - if clustering - then begin - test_reconfigure_ip ~ipv6:false ~session_id ~self - (* IPv6 clusters not yet supported, can run this test once that changes *) - (* test_reconfigure_ip ~ipv6:true ~session_id ~self *) - end - else - print_endline "No cluster objects on this PIF, skipping tests." - ) pifs +let test session_id = + let test_all_pifs = Q.make_test "Testing IP reconfiguration with and without clustering." 2 in + try + print_newline (); + Q.start test_all_pifs; + print_newline (); -let tests session_id = - [ "IP reconfiguration test", `Slow, test session_id - ] + let enabled_cluster_hosts = + List.filter + (fun self -> C.Cluster_host.get_enabled ~session_id ~rpc ~self) + (C.Cluster_host.get_all ~session_id ~rpc) + in + if is_empty enabled_cluster_hosts + then Q.debug test_all_pifs "No PIFS with clustering enabled, skipping tests." + else begin + enabled_cluster_hosts + |> List.map + (fun self -> C.Cluster_host.get_PIF ~session_id ~rpc ~self) + |> List.iter + (fun self -> + test_reconfigure_ip ~ipv6:false ~session_id ~self + (* IPv6 clusters not yet supported, can run this line once they are: + test_reconfigure_ip ~ipv6:true ~session_id ~self *) + ); + Q.success test_all_pifs + end + with e -> Q.failed test_all_pifs (ExnHelper.string_of_exn e) diff --git a/ocaml/tests/suite_alcotest.ml b/ocaml/tests/suite_alcotest.ml index 6bbbffb7f85..23eaedcba0a 100644 --- a/ocaml/tests/suite_alcotest.ml +++ b/ocaml/tests/suite_alcotest.ml @@ -2,7 +2,7 @@ let () = Suite_init.harness_init (); (* Alcotest hides the standard output of successful tests, - so we will probably not exceed the 4MB limit in Traivs *) + so we will probably not exceed the 4MB limit in Travis *) Debug.log_to_stdout (); Alcotest.run "Base suite" @@ -30,6 +30,8 @@ let () = ; "Test_daemon_manager", Test_daemon_manager.test ; "Test_cluster", Test_cluster.test ; "Test_cluster_host", Test_cluster_host.test + ; "Test_clustering", Test_clustering.test + ; "Test_clustering_allowed_operations", Test_clustering_allowed_operations.test ; "Test_client", Test_client.test ; "Test_ca91480", Test_ca91480.test ; "Test_pgpu", Test_pgpu.test @@ -43,8 +45,6 @@ let () = ; "Test_pvs_site", Test_pvs_site.test ; "Test_pvs_proxy", Test_pvs_proxy.test ; "Test_pvs_server", Test_pvs_server.test - ; "Test_clustering", Test_clustering.test - ; "Test_clustering_allowed_operations", Test_clustering_allowed_operations.test ; "Test_event", Test_event.test ; "Test_vm_placement", Test_vm_placement.test ; "Test_vm_memory_constraints", Test_vm_memory_constraints.test diff --git a/ocaml/tests/test_cluster.ml b/ocaml/tests/test_cluster.ml index 188de3d9bef..059f8da9804 100644 --- a/ocaml/tests/test_cluster.ml +++ b/ocaml/tests/test_cluster.ml @@ -24,7 +24,7 @@ let test_clusterd_rpc ~__context 
call = | ("enable" | "disable" | "destroy" | "leave"), _ -> Rpc.{success = true; contents = Rpc.Null } | name, params -> - failwith (Printf.sprintf "Unexpected RPC: %s(%s)" name (String.concat " " (List.map Rpc.to_string params))) + Alcotest.failf "Unexpected RPC: %s(%s)" name (String.concat " " (List.map Rpc.to_string params)) let test_rpc ~__context call = match call.Rpc.name, call.Rpc.params with @@ -37,18 +37,19 @@ let test_rpc ~__context call = Xapi_cluster.destroy ~__context ~self:(ref_Cluster_of_rpc self); Rpc.{success = true; contents = Rpc.String "" } | name, params -> - failwith (Printf.sprintf "Unexpected RPC: %s(%s)" name (String.concat " " (List.map Rpc.to_string params))) + Alcotest.failf "Unexpected RPC: %s(%s)" name (String.concat " " (List.map Rpc.to_string params)) -let create_cluster ~__context ?(cluster_stack=Constants.default_smapiv3_cluster_stack) ?(test_clusterd_rpc=test_clusterd_rpc) () = +let create_cluster ~__context ?(cluster_stack=Constants.default_smapiv3_cluster_stack) + ?(test_clusterd_rpc=test_clusterd_rpc) ?(token_timeout=1.) ?(token_timeout_coefficient=1.) () = Context.set_test_rpc __context (test_rpc ~__context); Context.set_test_clusterd_rpc __context (test_clusterd_rpc ~__context); let network = Test_common.make_network ~__context () in let localhost = Helpers.get_localhost ~__context in - let pifref = Test_common.make_pif ~__context ~network ~host:localhost () in - Db.PIF.set_IP ~__context ~self:pifref ~value:"192.0.2.1"; - Db.PIF.set_currently_attached ~__context ~self:pifref ~value:true; - Db.PIF.set_disallow_unplug ~__context ~self:pifref ~value:true; - Xapi_cluster.create ~__context ~network ~cluster_stack ~pool_auto_join:true ~token_timeout:1. ~token_timeout_coefficient:1. + let pIF = Test_common.make_pif ~__context ~network ~host:localhost () in + Db.PIF.set_IP ~__context ~self:pIF ~value:"192.0.2.1"; + Db.PIF.set_currently_attached ~__context ~self:pIF ~value:true; + Db.PIF.set_disallow_unplug ~__context ~self:pIF ~value:true; + Xapi_cluster.create ~__context ~pIF ~cluster_stack ~pool_auto_join:true ~token_timeout ~token_timeout_coefficient let test_create_destroy_status () = let __context = Test_common.make_test_database () in @@ -58,34 +59,48 @@ let test_create_destroy_status () = let test_enable () = let __context = Test_common.make_test_database () in let cluster = create_cluster ~__context () in - (* simulate xapi getting restarted *) - Create_storage.maybe_reenable_cluster_host __context; + (* simulate xapi getting restarted *) + begin match Xapi_clustering.find_cluster_host ~__context ~host:Helpers.(get_localhost ~__context) with + | Some self -> Xapi_cluster_host.enable ~__context ~self + | None -> Alcotest.fail "Couldn't find freshly-created cluster_host" + end; pool_destroy ~__context ~self:cluster -let test_invalid_cluster_stack () = +let test_invalid_parameters () = let __context = Test_common.make_test_database () in let cluster_stack = "invalid_cluster_stack" in Alcotest.check_raises "Cluster.create should fail upon receiving an invalid cluster stack" Api_errors.(Server_error (invalid_cluster_stack, [ cluster_stack ])) - (fun () -> create_cluster ~__context ~cluster_stack () |> ignore) + (fun () -> create_cluster ~__context ~cluster_stack () |> ignore); + + Alcotest.check_raises + "token_timeout < minimum threshold" + Api_errors.(Server_error (invalid_value, [ "token_timeout"; "0.5" ])) + (fun () -> create_cluster ~__context ~token_timeout:0.5 () |> ignore); + + Alcotest.check_raises + "token_timeout_coefficient < minimum 
threshold" + Api_errors.(Server_error (invalid_value, [ "token_timeout_coefficient"; "0.6" ])) + (fun () -> create_cluster ~__context ~token_timeout_coefficient:0.6 () |> ignore) + let test_create_cleanup () = let __context = Test_common.make_test_database () in - let test_failed_clusterd_rpc ~__context call = + let test_clusterd_rpc ~__context call = match call.Rpc.name, call.Rpc.params with | "create", _ -> Rpc.{ success = false ; contents = Rpcmarshal.marshal - (Cluster_interface.error.Rpc.Types.ty) - (Cluster_interface.InternalError "Cluster.create failed") + Cluster_interface.error.Rpc.Types.ty + Cluster_interface.(InternalError "Cluster.create failed") } | _, _ -> Rpc.{success = true; contents = Rpc.Null } in try - create_cluster ~__context ~test_clusterd_rpc:test_failed_clusterd_rpc () |> ignore; + create_cluster ~__context ~test_clusterd_rpc () |> ignore; Alcotest.fail "Cluster.create should have failed" with | e -> @@ -95,11 +110,11 @@ let test_create_cleanup () = [] (Db.Cluster.get_all ~__context); Alcotest.(check (slist (Alcotest_comparators.ref ()) compare)) "Cluster_host refs should be destroyed" - [] (Db.Cluster_host.get_all ~__context) + [] (Db.Cluster_host.get_all ~__context) let test = [ "test_create_destroy_service_status", `Quick, test_create_destroy_status ; "test_enable", `Quick, test_enable - ; "test_invalid_cluster_stack", `Quick, test_invalid_cluster_stack + ; "test_invalid_parameters", `Quick, test_invalid_parameters ; "test_create_cleanup", `Quick, test_create_cleanup ] diff --git a/ocaml/tests/test_cluster_host.ml b/ocaml/tests/test_cluster_host.ml index 2d07fc94494..ddc20b0cddb 100644 --- a/ocaml/tests/test_cluster_host.ml +++ b/ocaml/tests/test_cluster_host.ml @@ -17,13 +17,14 @@ open Xapi_cluster_host let create_cluster ~__context pool_auto_join = let cluster_ref = Ref.make () in let cluster_uuid = Uuidm.to_string (Uuidm.create `V4) in - let network = Test_common.make_network ~__context () in - Db.Cluster.create ~__context ~ref:cluster_ref ~uuid:cluster_uuid ~network ~cluster_token:"token" - ~cluster_stack:Constants.default_smapiv3_cluster_stack ~token_timeout:5000L ~token_timeout_coefficient:1000L ~allowed_operations:[] - ~current_operations:[] ~pool_auto_join ~cluster_config:[] ~other_config:[]; + Db.Cluster.create ~__context ~ref:cluster_ref ~uuid:cluster_uuid ~cluster_token:"token" + ~cluster_stack:Constants.default_smapiv3_cluster_stack + ~token_timeout:Constants.default_token_timeout_s + ~token_timeout_coefficient:Constants.default_token_timeout_coefficient_s ~allowed_operations:[] + ~current_operations:[] ~pool_auto_join ~cluster_config:[] ~other_config:[] ~pending_forget:[]; cluster_ref -let assert_ref_option = +let check_cluster_option = Alcotest.(check (option (Alcotest_comparators.ref ()) )) let test_dbsync_join () = @@ -31,14 +32,14 @@ let test_dbsync_join () = let cluster = create_cluster ~__context true in let localhost = Helpers.get_localhost ~__context in let result = sync_required ~__context ~host:localhost in - assert_ref_option "Cluster option" result (Some (cluster)) + check_cluster_option "test_dbsync_join" (Some (cluster)) result let test_dbsync_nojoin () = let __context = Test_common.make_test_database () in let _cluster = create_cluster ~__context false in let localhost = Helpers.get_localhost ~__context in let result = sync_required ~__context ~host:localhost in - assert_ref_option "Cluster option" result None + check_cluster_option "test_dbsync_nojoin" None result let pif_plug_rpc __context call = match call.Rpc.name, call.Rpc.params 
with @@ -48,12 +49,13 @@ let pif_plug_rpc __context call = let self = ref_PIF_of_rpc self_rpc in Db.PIF.set_currently_attached ~__context ~self ~value:true; Rpc.{success=true; contents = Rpc.String ""} - | "Cluster_host.create", [session_id_rpc;cluster_rpc;host_rpc] -> + | "Cluster_host.create", [session_id_rpc;cluster_rpc;host_rpc;pif_rpc] -> let open API in let _session_id = ref_session_of_rpc session_id_rpc in let cluster = ref_Cluster_of_rpc cluster_rpc in let host = ref_host_of_rpc host_rpc in - ignore(Test_common.make_cluster_host ~__context ~cluster ~host ()); + let pIF = ref_PIF_of_rpc pif_rpc in + ignore(Test_common.make_cluster_host ~__context ~cluster ~host ~pIF ()); Rpc.{success=true; contents = Rpc.String ""} | _ -> failwith "Unexpected RPC" @@ -61,40 +63,45 @@ let pif_plug_rpc __context call = let test_fix_prereq () = let __context = Test_common.make_test_database () in Context.set_test_rpc __context (pif_plug_rpc __context); - let exn = "we_havent_decided_on_the_exception_yet" in - let cluster = create_cluster ~__context true in - let network = Db.Cluster.get_network ~__context ~self:cluster in - let localhost = Helpers.get_localhost ~__context in - let pifref = Test_common.make_pif ~__context ~network ~host:localhost () in - let pif = Xapi_clustering.pif_of_host ~__context network localhost in - Alcotest.check_raises "Should fail when checking PIF prequisites" - (Failure exn) - (fun () -> - try - Xapi_cluster_host.fix_pif_prerequisites __context pif - with _ -> - failwith exn); + let network = Test_common.make_network ~__context () in + let host = Helpers.get_localhost ~__context in + let pifref = Test_common.make_pif ~__context ~network ~host () in + Alcotest.check_raises + "Should fail when checking PIF prequisites" + Api_errors.(Server_error (pif_has_no_network_configuration, [ Ref.string_of pifref ])) + (fun () -> Xapi_cluster_host.fix_pif_prerequisites __context pifref); Db.PIF.set_IP ~__context ~self:pifref ~value:"1.1.1.1"; - let pif = Xapi_clustering.pif_of_host ~__context network localhost in - Xapi_cluster_host.fix_pif_prerequisites ~__context pif; - let pif = Xapi_clustering.pif_of_host ~__context network localhost in - Alcotest.(check unit) "Assert PIF prerequisites without error" - (Xapi_clustering.assert_pif_prerequisites pif) () + Xapi_cluster_host.fix_pif_prerequisites ~__context pifref; + let pif = Xapi_clustering.pif_of_host ~__context network host in + Alcotest.(check unit) + "PIF prerequisites have now been fixed" + () (Xapi_clustering.assert_pif_prerequisites pif) let test_create_as_necessary () = let __context = Test_common.make_test_database () in Context.set_test_rpc __context (pif_plug_rpc __context); let cluster = create_cluster ~__context true in - let network = Db.Cluster.get_network ~__context ~self:cluster in let localhost = Helpers.get_localhost ~__context in + let network = Test_common.make_network ~__context () in let pifref = Test_common.make_pif ~__context ~network ~host:localhost () in Db.PIF.set_IP ~__context ~self:pifref ~value:"1.1.1.1"; let _pif = Xapi_clustering.pif_of_host ~__context network localhost in let result = sync_required ~__context ~host:localhost in - assert_ref_option "Cluster option" result (Some cluster); + check_cluster_option "sync_required without an existing cluster_host" (Some cluster) result; + Alcotest.check_raises + "create_as_necessary should fail if autojoin is set and the pool master has no cluster_host" + Api_errors.(Server_error (internal_error, + [ Printf.sprintf "No cluster_host exists on master" ])) + 
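(* Consistent with the dbsync checks above and the create_as_necessary
   assertions below, sync_required reduces to roughly this sketch: a
   Cluster_host is wanted only when a pool_auto_join cluster exists and the
   host does not have one yet:

   let sync_required ~__context ~host =
     match Db.Cluster.get_all ~__context with
     | [] -> None
     | cluster :: _ ->
       if Db.Cluster.get_pool_auto_join ~__context ~self:cluster
          && Xapi_clustering.find_cluster_host ~__context ~host = None
       then Some cluster
       else None
*)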
(fun () -> Xapi_cluster_host.create_as_necessary ~__context ~host:localhost); + let _ = Test_common.make_cluster_host ~__context ~pIF:(fst _pif) ~host:(Helpers.get_master ~__context) ~cluster () in Xapi_cluster_host.create_as_necessary ~__context ~host:localhost; let result = sync_required ~__context ~host:localhost in - assert_ref_option "Cluster option" result None + check_cluster_option "sync_required with an existing cluster_host" None result; + let host = Test_common.make_host ~__context () in + let result = sync_required ~__context ~host in + check_cluster_option + "sync_required with an existing cluster_host on master but not given host" + (Some cluster) result (* CA-275728 *) let test_destroy_forbidden_when_sr_attached () = @@ -115,7 +122,85 @@ let test_destroy_forbidden_when_sr_attached () = Alcotest.check_raises ("Should raise cluster_stack_in_use: [ " ^ cluster_stack ^ " ] ") Api_errors.(Server_error (cluster_stack_in_use, [ cluster_stack ])) - (fun () -> Xapi_cluster_host.destroy ~__context ~self:cluster_host) + (fun () -> Xapi_cluster_host.force_destroy ~__context ~self:cluster_host) + +type declare_dead_args = { + dead_members: Cluster_interface.address list; + dbg: string +} [@@deriving rpcty] + +let test_clusterd_rpc ~__context call = + match call.Rpc.name, call.Rpc.params with + | "declare-dead", [args] -> + let args = Rpcmarshal.unmarshal declare_dead_args.Rpc.Types.ty args |> Rresult.R.get_ok in + let all = Db.Cluster_host.get_all ~__context in + let ndead = List.length args.dead_members in + let nall = List.length all in + Printf.printf "dead_members: %d, all: %d\n" ndead nall; + if ndead = nall - 1 then + Rpc.{success = true; contents = Rpc.rpc_of_unit () } + else + let err = Cluster_interface.InternalError "Remaining hosts are not all alive" in + (* in the test we must declare N-1 as dead before it succeeds *) + Rpc.failure (Rpcmarshal.marshal Cluster_interface.error.Rpc.Types.ty err) + | name, params -> + failwith (Printf.sprintf "Unexpected RPC: %s(%s)" name (String.concat " " (List.map Rpc.to_string params))) + +let test_rpc ~__context call = + match call.Rpc.name, call.Rpc.params with + | "Cluster_host.forget", [_session; self] -> + let open API in + Xapi_cluster_host.forget ~__context ~self:(ref_Cluster_host_of_rpc self); + Rpc.{success = true; contents = Rpc.String "" } + | "host.apply_guest_agent_config", _ -> + Rpc.{success = true; contents = Rpc.rpc_of_unit () } + | name, params -> + failwith (Printf.sprintf "Unexpected RPC: %s(%s)" name (String.concat " " (List.map Rpc.to_string params))) + +let make ~__context extra_hosts = + Context.set_test_rpc __context (test_rpc ~__context); + Context.set_test_clusterd_rpc __context (test_clusterd_rpc ~__context); + Test_common.make_cluster_and_hosts ~__context extra_hosts + + +let test_forget () = + let __context = Test_common.make_test_database () in + let host2 = Test_common.make_host ~__context () in + let cluster, original_cluster_hosts = make ~__context [host2] in + + Xapi_host.destroy ~__context ~self:host2; + let pending = Db.Cluster.get_pending_forget ~__context ~self:cluster in + Alcotest.(check (list string)) "no pending forgets" + [] pending; + + Db_gc_util.gc_Cluster_hosts ~__context; + let cluster_hosts = Db.Cluster.get_cluster_hosts ~__context ~self:cluster in + Alcotest.(check (list Alcotest_comparators.(ref ())) "surviving cluster host" + [List.hd original_cluster_hosts] cluster_hosts) + + +let test_forget2 () = + let __context = Test_common.make_test_database () in + let host2 = Test_common.make_host 
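(* test_forget here and test_forget2 below pin down the intended GC
   behaviour; a sketch of what Db_gc_util.gc_Cluster_hosts is expected to
   do (retry_pending_forgets is a hypothetical name):

   let gc_Cluster_hosts ~__context =
     Db.Cluster.get_all ~__context
     |> List.iter (fun cluster ->
       (* retry deferred forgets; clusterd's declare-dead only succeeds
          once all remaining members are alive *)
       retry_pending_forgets ~__context cluster;
       (* drop cluster_hosts whose host row no longer exists *)
       Db.Cluster.get_cluster_hosts ~__context ~self:cluster
       |> List.iter (fun ch ->
         let host = Db.Cluster_host.get_host ~__context ~self:ch in
         if not (Db.is_valid_ref __context host)
         then Db.Cluster_host.destroy ~__context ~self:ch))
*)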
~__context () in + let host3 = Test_common.make_host __context () in + let cluster, original_cluster_hosts = make ~__context [host2; host3] in + + Xapi_host.destroy ~__context ~self:host3; + + let pending = Db.Cluster.get_pending_forget ~__context ~self:cluster in + Alcotest.(check (list string) "1 pending forgets" + ["192.0.2.3"] pending); + + Xapi_host.destroy ~__context ~self:host2; + + Db_gc_util.gc_Cluster_hosts ~__context; + let cluster_hosts = Db.Cluster.get_cluster_hosts ~__context ~self:cluster in + Alcotest.(check (list Alcotest_comparators.(ref ())) "surviving cluster host" + [List.hd original_cluster_hosts] cluster_hosts); + + let pending = Db.Cluster.get_pending_forget ~__context ~self:cluster in + Alcotest.(check (list string) "no pending forgets" + [] pending) let test = @@ -124,4 +209,6 @@ let test = ; "test_fix_prerequisites", `Quick, test_fix_prereq ; "test_create_as_necessary", `Quick, test_create_as_necessary ; "test_destroy_forbidden_when_sr_attached", `Quick, test_destroy_forbidden_when_sr_attached + ; "test_forget", `Quick, test_forget + ; "test_forget2", `Quick, test_forget2 ] diff --git a/ocaml/tests/test_clustering.ml b/ocaml/tests/test_clustering.ml index b1fcf3002c7..28214e119f5 100644 --- a/ocaml/tests/test_clustering.ml +++ b/ocaml/tests/test_clustering.ml @@ -108,7 +108,7 @@ let test_find_cluster_host_finds_multiple_cluster_hosts () = let _ = T.make_cluster_host ~__context ~host () in Alcotest.check_raises "test_find_cluster_host_finds_multiple_cluster_hosts should throw an internal error" - (Api_errors.Server_error(Api_errors.internal_error,["Multiple cluster_hosts found for host"; (Ref.string_of host)])) + Api_errors.(Server_error (internal_error,["Multiple cluster_hosts found for host"; (Ref.string_of host)])) (fun () -> ignore (Xapi_clustering.find_cluster_host ~__context ~host)) let test_find_cluster_host = @@ -123,32 +123,32 @@ let test_find_cluster_host = let test_assert_cluster_host_is_enabled_when_it_is_enabled () = let __context = T.make_test_database () in let self = T.make_cluster_host ~__context ~enabled:true () in - try - (Xapi_clustering.assert_cluster_host_enabled ~__context ~self ~expected:true) - with _ -> - Alcotest.fail "test_assert_cluster_host_is_enabled_when_it_is_enabled should fail" + Alcotest.(check unit) + "test_assert_cluster_host_is_enabled_when_it_is_enabled" + () (Xapi_clustering.assert_cluster_host_enabled ~__context ~self ~expected:true) let test_assert_cluster_host_is_enabled_when_it_is_disabled () = let __context = T.make_test_database () in let self = T.make_cluster_host ~__context ~enabled:false () in Alcotest.check_raises - "test_assert_cluster_host_is_enabled_when_it_is_disabled should raise clustering_disabled" - (Api_errors.Server_error(Api_errors.clustering_disabled, [Ref.string_of self])) + "test_assert_cluster_host_is_enabled_when_it_is_disabled" + Api_errors.(Server_error (clustering_disabled, [Ref.string_of self])) (fun () -> Xapi_clustering.assert_cluster_host_enabled ~__context ~self ~expected:true) let test_assert_cluster_host_is_disabled_when_it_is_enabled () = let __context = T.make_test_database () in let self = T.make_cluster_host ~__context ~enabled:true () in Alcotest.check_raises - "test_assert_cluster_host_is_disabled_when_it_is_enabled should raise clustering_enabled" - Api_errors.(Server_error(clustering_enabled, [Ref.string_of self])) + "test_assert_cluster_host_is_disabled_when_it_is_enabled" + Api_errors.(Server_error (clustering_enabled, [Ref.string_of self])) (fun () -> 
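(* As exercised by these four assertions, assert_cluster_host_enabled
   reduces to the following sketch:

   let assert_cluster_host_enabled ~__context ~self ~expected =
     let actual = Db.Cluster_host.get_enabled ~__context ~self in
     if actual <> expected then
       if expected
       then raise Api_errors.(Server_error (clustering_disabled, [ Ref.string_of self ]))
       else raise Api_errors.(Server_error (clustering_enabled, [ Ref.string_of self ]))
*)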
Xapi_clustering.assert_cluster_host_enabled ~__context ~self ~expected:false) let test_assert_cluster_host_is_disabled_when_it_is_disabled () = let __context = T.make_test_database () in let self = T.make_cluster_host ~__context ~enabled:false () in - try Xapi_clustering.assert_cluster_host_enabled ~__context ~self ~expected:false - with _ -> Alcotest.fail "asserting cluster_host is disabled fails when cluster_host is disabled" + Alcotest.(check unit) + "test_assert_cluster_host_is_disabled_when_it_is_disabled" + () (Xapi_clustering.assert_cluster_host_enabled ~__context ~self ~expected:false) let test_assert_cluster_host_enabled = [ "test_assert_cluster_host_is_enabled_when_it_is_enabled", `Quick, test_assert_cluster_host_is_enabled_when_it_is_enabled @@ -178,40 +178,40 @@ let make_scenario ?(cluster_host=(Some true)) () = let test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_cluster_host_is_enabled () = let __context, host, cluster, cluster_host = make_scenario () in Alcotest.(check unit) - "test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_cluster_host_is_enabled should pass" - (Xapi_clustering.assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type:"gfs2") () + "test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_cluster_host_is_enabled" + () (Xapi_clustering.assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type:"gfs2") let test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_no_matching_sms_exist () = let __context, host, cluster, cluster_host = make_scenario () in Alcotest.(check unit) - "test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_no_matching_sms_exist should pass" - (Xapi_clustering.assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type:"sr_type_with_no_matching_sm") () + "test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_no_matching_sms_exist" + () (Xapi_clustering.assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type:"sr_type_with_no_matching_sm") let test_assert_cluster_host_is_enabled_for_matching_sms_fails_if_cluster_host_is_disabled () = let __context, host, cluster, cluster_host = make_scenario ~cluster_host:(Some false) () in Alcotest.check_raises - "test_assert_cluster_host_is_enabled_for_matching_sms_fails_if_cluster_host_is_disabled should raise clustering_disabled" + "test_assert_cluster_host_is_enabled_for_matching_sms_fails_if_cluster_host_is_disabled" Api_errors.(Server_error(clustering_disabled, [Ref.string_of cluster_host])) (fun () -> Xapi_clustering.assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type:"gfs2") let test_assert_cluster_host_is_enabled_for_matching_sms_fails_if_no_cluster_host_exists () = let __context, host, cluster, cluster_host = make_scenario ~cluster_host:None () in Alcotest.check_raises - "test_assert_cluster_host_is_enabled_for_matching_sms_fails_if_no_cluster_host_exists should raise no_compatible_cluster_host" + "test_assert_cluster_host_is_enabled_for_matching_sms_fails_if_no_cluster_host_exists" Api_errors.(Server_error(no_compatible_cluster_host, [Ref.string_of host])) (fun () -> Xapi_clustering.assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type:"gfs2") let test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_cluster_host_is_disabled_and_clustering_is_not_needed () = let __context, host, cluster, cluster_host = make_scenario ~cluster_host:(Some false) () in Alcotest.(check unit) - 
"test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_cluster_host_is_disabled_and_clustering_is_not_needed should pass" - (Xapi_clustering.assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type:"lvm") () + "test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_cluster_host_is_disabled_and_clustering_is_not_needed" + () (Xapi_clustering.assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type:"lvm") let test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_no_cluster_host_exists_and_clustering_is_not_needed () = let __context, host, cluster, cluster_host = make_scenario ~cluster_host:None () in Alcotest.(check unit) "test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_no_cluster_host_exists_and_clustering_is_not_needed should pass" - (Xapi_clustering.assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type:"lvm") () + () (Xapi_clustering.assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type:"lvm") let test_assert_cluster_host_is_enabled_for_matching_sms = [ "test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_cluster_host_is_enabled", `Quick, test_assert_cluster_host_is_enabled_for_matching_sms_succeeds_if_cluster_host_is_enabled @@ -230,9 +230,9 @@ let nest_with_clustering_lock_if_needed ~__context ~timeout ~type1 ~type2 ~on_de ~timeout:timeout ~otherwise: on_deadlock (fun () -> - Xapi_clustering.with_clustering_lock_if_needed ~__context ~sr_sm_type:type1 + Xapi_clustering.with_clustering_lock_if_needed ~__context ~sr_sm_type:type1 __LOC__ (fun () -> - Xapi_clustering.with_clustering_lock_if_needed ~__context ~sr_sm_type:type2 + Xapi_clustering.with_clustering_lock_if_needed ~__context ~sr_sm_type:type2 __LOC__ (fun () -> on_no_deadlock () ) ) @@ -248,7 +248,7 @@ let test_clustering_lock_only_taken_if_needed_nested_calls () = ~timeout:1.0 ~type1: "type_corosync" ~type2: "type_nocluster" - ~on_deadlock: (fun () -> failwith "Unexpected deadlock when making nested calls to with_clustering_lock_if_needed") + ~on_deadlock: (fun () -> Alcotest.fail "Unexpected deadlock when making nested calls to with_clustering_lock_if_needed") ~on_no_deadlock: (fun () -> ()) let test_clustering_lock_taken_when_needed_nested_calls () = @@ -262,7 +262,7 @@ let test_clustering_lock_taken_when_needed_nested_calls () = ~type1: "type_corosync1" ~type2: "type_corosync2" ~on_deadlock: (fun () -> ()) - ~on_no_deadlock: (fun () -> failwith "Nesting calls to with_clustering_lock_if_needed should deadlock if both require a cluster stack, lock not taken or not working as expected.") + ~on_no_deadlock: (fun () -> Alcotest.fail "Nesting calls to with_clustering_lock_if_needed should deadlock if both require a cluster stack, lock not taken or not working as expected.") let test_clustering_lock_only_taken_if_needed = [ "test_clustering_lock_only_taken_if_needed_nested_calls", `Quick, test_clustering_lock_only_taken_if_needed_nested_calls @@ -273,53 +273,46 @@ let test_assert_pif_prerequisites () = let __context = T.make_test_database () in let network = T.make_network ~__context () in let localhost = Helpers.get_localhost ~__context in - let (_cluster, _cluster_host) = T.make_cluster_and_cluster_host ~__context ~network ~host:localhost () in - let exn = "we_havent_decided_on_the_exception_yet" in let pifref = T.make_pif ~__context ~network ~host:localhost () in + let (_cluster, _cluster_host) = T.make_cluster_and_cluster_host ~__context ~host:localhost ~pIF:pifref () in let pif = 
Xapi_clustering.pif_of_host ~__context network localhost in + Alcotest.check_raises "test_assert_pif_prerequisites should fail at first" - (Failure exn) - (fun () -> - try - Xapi_clustering.assert_pif_prerequisites pif - with _ -> - failwith exn); - (* Put in IPv4 info *) - Db.PIF.set_IP ~__context ~self:pifref ~value:"1.1.1.1"; + Api_errors.(Server_error (pif_allows_unplug, [Ref.string_of pifref])) + (fun () -> Xapi_clustering.assert_pif_prerequisites pif); + + (* Fix one prerequisite each time *) + Db.PIF.set_disallow_unplug ~__context ~self:pifref ~value:true; let pif = Xapi_clustering.pif_of_host ~__context network localhost in Alcotest.check_raises - "test_assert_pif_prerequisites should fail after setting IPv4 info" - (Failure exn) - (fun () -> - try - Xapi_clustering.assert_pif_prerequisites pif - with _ -> - failwith exn); + "test_assert_pif_prerequisites : disallow_unplug set, IP and currently_attached to go " + Api_errors.(Server_error (required_pif_is_unplugged, [Ref.string_of pifref])) + (fun () -> Xapi_clustering.assert_pif_prerequisites pif); + + (* Plug in PIF *) Db.PIF.set_currently_attached ~__context ~self:pifref ~value:true; let pif = Xapi_clustering.pif_of_host ~__context network localhost in Alcotest.check_raises - "test_assert_pif_prerequisites should fail after setting attached:true" - (Failure exn) - (fun () -> - try - Xapi_clustering.assert_pif_prerequisites pif - with _ -> - failwith exn); - Db.PIF.set_disallow_unplug ~__context ~self:pifref ~value:true; + "test_assert_pif_prerequisites : disallow_unplug and currently_attached set, need IP config now " + Api_errors.(Server_error (pif_has_no_network_configuration, [Ref.string_of pifref])) + (fun () -> Xapi_clustering.assert_pif_prerequisites pif); + + (* Put in IPv4 info *) + Db.PIF.set_IP ~__context ~self:pifref ~value:"1.1.1.1"; let pif = Xapi_clustering.pif_of_host ~__context network localhost in Alcotest.(check unit) - "assert_pif_prerequisites should pass after setting disallow_unplug:true" - (Xapi_clustering.assert_pif_prerequisites pif) () + "assert_pif_prerequisites should pass after fixing all prereqs" + () (Xapi_clustering.assert_pif_prerequisites pif) let test_assert_pif_prerequisites = [ "test_assert_pif_prerequisites", `Quick, test_assert_pif_prerequisites ] (** Test PIF.disallow_unplug is RO when clustering is enabled *) -let check_disallow_unplug expected_value __context pif msg = +let check_disallow_unplug expected_value __context self msg = Alcotest.(check bool) msg - (Db.PIF.get_disallow_unplug ~__context ~self:pif) + (Db.PIF.get_disallow_unplug ~__context ~self) expected_value (* Need host and network to make PIF *) @@ -361,19 +354,91 @@ let test_disallow_unplug_with_clustering () = "check_disallow_unplug called by test_disallow_unplug_with_clustering after setting disallow_unplug:true"; (* PIF.disallow_unplug should become RO upon introduce cluster_host object, should throw exception when changing value *) - let _ = T.make_cluster_and_cluster_host ~__context ~network ~host () in + let _, cluster_host = T.make_cluster_and_cluster_host ~__context ~pIF:pif ~host () in Alcotest.check_raises "check_disallow_unplug called by test_disallow_unplug_with_clustering after attaching cluster and cluster_host to network" - (Api_errors.(Server_error(clustering_enabled_on_network, [Ref.string_of network]))) + (Api_errors.(Server_error(clustering_enabled, [ Ref.string_of cluster_host ]))) (fun () -> Xapi_pif.set_disallow_unplug ~__context ~self:pif ~value:false); Xapi_pif.set_disallow_unplug ~__context ~self:pif 
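(* A sketch of the Xapi_pif.assert_no_clustering_enabled_on guard
   exercised by test_assert_no_clustering_on_pif below, assuming it scans
   cluster_hosts by their PIF:

   let assert_no_clustering_enabled_on ~__context ~self =
     Db.Cluster_host.get_all ~__context
     |> List.iter (fun ch ->
       if Db.Cluster_host.get_PIF ~__context ~self:ch = self
          && Db.Cluster_host.get_enabled ~__context ~self:ch
       then raise Api_errors.(Server_error (clustering_enabled,
         [ Ref.string_of ch ])))
*)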
~value:true; check_disallow_unplug true __context pif "PIF.set_disallow_unplug should be idempotent even with clustering" -let test_disallow_unplug_ro_with_clustering_enabled = +let test_assert_no_clustering_on_pif () = + let __context = T.make_test_database () in + let host, _, self = make_host_network_pif ~__context in + let assert_no_clustering_on self msg = + Alcotest.(check unit) msg + () (Xapi_pif.assert_no_clustering_enabled_on ~__context ~self) + in + assert_no_clustering_on self + "assert_no_clustering_on_pif without clustering"; + + (* Add an enabled cluster_host *) + let cluster, cluster_host = T.make_cluster_and_cluster_host ~__context ~host ~pIF:self () in + Alcotest.check_raises + "Live cluster_host on PIF" + Api_errors.(Server_error (clustering_enabled, [ Ref.string_of cluster_host ])) + (fun () -> Xapi_pif.assert_no_clustering_enabled_on ~__context ~self); + + (* Disable clustering on PIF *) + Db.Cluster_host.set_enabled ~__context ~self:cluster_host ~value:false; + assert_no_clustering_on self + "assert_no_clustering_on_pif with clustering disabled" + +let test_disallow_unplug_during_cluster_host_create () = + let __context = T.make_test_database () in + let host, network, pIF = make_host_network_pif ~__context in + let cluster, cluster_host = + T.make_cluster_and_cluster_host ~__context ~pIF ~host () + in + let add_op value = + let key = Context.get_task_id __context |> Ref.string_of in + Db.Cluster.add_to_current_operations ~__context ~self:cluster ~key ~value + in + let check_disallow_unplug_false_fails self msg = + Alcotest.check_raises msg + Api_errors.(Server_error (other_operation_in_progress, + [ "Cluster" ; Ref.string_of cluster ])) + (fun () -> Xapi_pif.set_disallow_unplug ~__context ~self ~value:false) + in + let check_successful_disallow_unplug value self msg = + Alcotest.(check unit) msg + () (Xapi_pif.set_disallow_unplug ~__context ~self ~value) + in + Db.Cluster_host.set_enabled ~__context ~self:cluster_host ~value:false; + + let test_with_current op = + Xapi_pif.set_disallow_unplug ~__context ~self:pIF ~value:true; + add_op op; + + check_disallow_unplug_false_fails pIF + "disallow_unplug cannot be set to false during cluster_host creation or enable on same PIF"; + + let other_pif = T.make_pif ~__context ~network ~host () in + check_successful_disallow_unplug true other_pif + "Should always be able to set disallow_unplug:true regardless of clustering operations"; + check_disallow_unplug_false_fails other_pif + "disallow_unplug cannot be set to false during cluster_host creation or enable on any PIF"; + + let key = Context.get_task_id __context |> Ref.string_of in + Db.Cluster.remove_from_current_operations ~__context ~self:cluster ~key + in + (* Should block setting disallow_unplug false for any PIF during Cluster_host create or enable *) + List.iter test_with_current [`add; `enable]; + + List.iter + (fun self -> + check_successful_disallow_unplug false self + "No current clustering operations or enabled cluster hosts on PIF" + ) (Db.PIF.get_all ~__context) + + +let test_networking_with_clustering = [ "test_disallow_unplug_no_clustering", `Quick, test_disallow_unplug_no_clustering ; "test_disallow_unplug_with_clustering", `Quick, test_disallow_unplug_with_clustering + ; "test_assert_no_clustering_on_pif", `Quick, test_assert_no_clustering_on_pif + ; "test_disallow_unplug_during_cluster_host_create", `Quick, test_disallow_unplug_during_cluster_host_create ] let default = !Xapi_globs.cluster_stack_default @@ -400,7 +465,7 @@ let 
test_choose_cluster_stack_clusters_no_sms () = let __context = T.make_test_database () in choose_cluster_stack_should_select default ~__context; - (* Add two cluster, test choose_cluster_stack's filtering *) + (* Add two clusters, test choose_cluster_stack's filtering *) for i = 0 to 1 do let _ = T.make_cluster_and_cluster_host ~__context () in choose_cluster_stack_should_select default_smapiv3 ~__context @@ -542,7 +607,7 @@ let test_pool_ha_cluster_stacks_with_ha_with_clustering () = (* Cluster.destroy should set HA cluster stack with HA disabled *) Xapi_cluster_host.enable ~__context ~self:cluster_host; - Xapi_cluster_host.destroy ~__context ~self:cluster_host; + (* can't destroy last cluster_host, must be done through destroying cluster *) Xapi_cluster.destroy ~__context ~self:cluster; (* Cluster.destroy should reset HA cluster stacks *) assert_cluster_stack_is default ~__context; @@ -569,14 +634,13 @@ let test_pool_ha_cluster_stacks = ; "test_pool_ha_cluster_stacks_with_ha_with_clustering", `Quick, test_pool_ha_cluster_stacks_with_ha_with_clustering ] - let test = ( test_get_required_cluster_stacks @ test_find_cluster_host @ test_assert_cluster_host_enabled @ test_assert_cluster_host_is_enabled_for_matching_sms @ test_assert_pif_prerequisites - @ test_disallow_unplug_ro_with_clustering_enabled + @ test_networking_with_clustering @ test_choose_cluster_stack @ test_pool_ha_cluster_stacks (* NOTE: lock test hoards the mutex and should thus always be last, diff --git a/ocaml/tests/test_clustering_allowed_operations.ml b/ocaml/tests/test_clustering_allowed_operations.ml index 6b5e567de43..a3de22d9570 100644 --- a/ocaml/tests/test_clustering_allowed_operations.ml +++ b/ocaml/tests/test_clustering_allowed_operations.ml @@ -19,7 +19,7 @@ let assert_true msg x = Alcotest.(check bool) msg true x (** cluster_create is not allowed if a cluster already exists *) let test_pool_cluster_create_not_allowed_when_cluster_exists () = let __context = make_test_database () in - let self = Db.Pool.get_all ~__context |> List.hd in + let self = Helpers.get_pool ~__context in let _, _ = make_cluster_and_cluster_host ~__context () in Xapi_pool_helpers.update_allowed_operations ~__context ~self; let allowed_ops = Db.Pool.get_allowed_operations ~__context ~self in @@ -29,7 +29,7 @@ let test_pool_cluster_create_not_allowed_when_cluster_exists () = (** cluster_create is not allowed if any pool operations are in progress *) let test_pool_cluster_create_not_allowed_during_pool_ops () = let __context = make_test_database () in - let self = Db.Pool.get_all ~__context |> List.hd in + let self = Helpers.get_pool ~__context in Xapi_pool_helpers.with_pool_operation ~__context ~self ~doc:"" ~op:`ha_enable (fun () -> let allowed_ops = Db.Pool.get_allowed_operations ~__context ~self in @@ -40,7 +40,7 @@ let test_pool_cluster_create_not_allowed_during_pool_ops () = operations in progress *) let test_pool_cluster_create_allowed () = let __context = make_test_database () in - let self = Db.Pool.get_all ~__context |> List.hd in + let self = Helpers.get_pool ~__context in Xapi_pool_helpers.update_allowed_operations ~__context ~self; let allowed_ops = Db.Pool.get_allowed_operations ~__context ~self in assert_true "Pool.allowed_operations should contain 'cluster_create'" @@ -101,6 +101,80 @@ let test_cluster_host_ops_not_allowed_during_cluster_host_op () = let allowed_ops = Db.Cluster_host.get_allowed_operations ~__context ~self in assert_true "Cluster_host.allowed_operations should be empty" (allowed_ops = [])) +let with_cluster_op 
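(* The rolling-upgrade restrictions tested below presumably reduce to a
   guard of this shape inside with_cluster_operation (helper name
   hypothetical):

   let assert_allowed_during_rpu ~__context op =
     if Helpers.rolling_upgrade_in_progress ~__context
        && not (List.mem op [ `enable; `disable ])
     then raise Api_errors.(Server_error (not_supported_during_upgrade, []))
*)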
~__context self op = + Xapi_cluster_helpers.with_cluster_operation ~__context ~self ~doc:"" ~op + (fun () -> ()) + +let with_cluster_host_op ~__context self op = + Xapi_cluster_host_helpers.with_cluster_host_operation ~__context ~self ~doc:"" ~op + (fun () -> ()) + +let test_clustering_ops_disallowed_during_rolling_upgrade () = + let __context = Test_common.make_test_database () in + + (** Helpers for testing clustering operations forbidden during rolling pool upgrade *) + let test_clustering_ops_should_pass with_cluster_fn self ops = + List.iter + (fun op -> + Alcotest.(check unit) + "Clustering operations should be allowed" + () (with_cluster_fn ~__context self op) + ) ops + in + let cluster, cluster_host = + Test_common.make_cluster_and_cluster_host ~__context () + in + let test_cluster_host_operations_valid () = + test_clustering_ops_should_pass + with_cluster_host_op + cluster_host + Xapi_cluster_host_helpers.all_cluster_host_operations + in + + (* All clustering operations are valid without RPU in progress + * and rolling_upgrade is false by default *) + test_clustering_ops_should_pass + with_cluster_op + cluster + Xapi_cluster_helpers.all_cluster_operations; + test_cluster_host_operations_valid (); + + (* set rolling upgrade *) + let key = Xapi_globs.rolling_upgrade_in_progress in + let self = Helpers.get_pool ~__context in + Db.Pool.remove_from_other_config ~__context ~self ~key; + Db.Pool.add_to_other_config ~__context ~self ~key ~value:"true"; + + (* Only cluster_host lifecycle changes valid during RPU, not cluster membership changes *) + List.iter + (fun op -> + Alcotest.check_raises + "Other than cluster_host enable/disable, no clustering operations should be allowed during RPU" + Api_errors.(Server_error (not_supported_during_upgrade, [])) + (fun () -> with_cluster_op ~__context cluster op) + ) [ `add ; `remove ; `destroy]; + + test_clustering_ops_should_pass + with_cluster_op + cluster + [ `enable ; `disable ]; + + test_cluster_host_operations_valid () + +let test_cluster_host_ops_without_join () = + (* Note that joined:true by default so no need to check *) + let __context = make_test_database () in + let cluster, cluster_host = make_cluster_and_cluster_host ~__context ~host:Helpers.(get_localhost ~__context) () in + Db.Cluster_host.set_joined ~__context ~self:cluster_host ~value:false; + + List.iter + (fun op -> + Alcotest.check_raises + "Non-remove cluster operations invalid when not cluster_host.joined" + Api_errors.(Server_error (cluster_host_not_joined, [ Ref.string_of cluster_host ])) + (fun () -> with_cluster_host_op ~__context cluster_host op) + ) Xapi_cluster_host_helpers.all_cluster_host_operations + let test = [ "test_pool_cluster_create_not_allowed_when_cluster_exists", `Quick, test_pool_cluster_create_not_allowed_when_cluster_exists ; "test_pool_cluster_create_not_allowed_during_pool_ops", `Quick, test_pool_cluster_create_not_allowed_during_pool_ops @@ -110,4 +184,6 @@ let test = ; "test_cluster_host_disable_allowed", `Quick, test_cluster_host_disable_allowed ; "test_cluster_host_enable_allowed", `Quick, test_cluster_host_enable_allowed ; "test_cluster_host_ops_not_allowed_during_cluster_host_op", `Quick, test_cluster_host_ops_not_allowed_during_cluster_host_op + ; "test_clustering_ops_disallowed_during_rolling_upgrade", `Quick, test_clustering_ops_disallowed_during_rolling_upgrade + ; "test_cluster_host_ops_without_join", `Quick, test_cluster_host_ops_without_join ] diff --git a/ocaml/tests/test_common.ml b/ocaml/tests/test_common.ml index 
17586e311fe..b41fdf819af 100644 --- a/ocaml/tests/test_common.ml +++ b/ocaml/tests/test_common.ml @@ -211,7 +211,8 @@ let make_pool ~__context ~master ?(name_label="") ?(name_description="") ?(redo_log_vdi=Ref.null) ?(vswitch_controller="") ?(igmp_snooping_enabled=false) ?(restrictions=[]) ?(current_operations=[]) ?(allowed_operations=[]) ?(other_config=[Xapi_globs.memory_ratio_hvm; Xapi_globs.memory_ratio_pv]) - ?(ha_cluster_stack="xhad") ?(guest_agent_config=[]) ?(cpu_info=[]) ?(policy_no_vendor_device=false) ?(live_patching_disabled=false)() = + ?(ha_cluster_stack=(!Xapi_globs.cluster_stack_default)) ?(guest_agent_config=[]) ?(cpu_info=[]) + ?(policy_no_vendor_device=false) ?(live_patching_disabled=false) () = let pool_ref = Ref.make () in Db.Pool.create ~__context ~ref:pool_ref ~uuid:(make_uuid ()) ~name_label ~name_description @@ -488,19 +489,35 @@ let make_vfs_on_pf ~__context ~pf ~num = make_vf num let make_cluster_host ~__context ?(ref=Ref.make ()) ?(uuid=make_uuid ()) - ?(cluster=Ref.null) ?(host=Ref.null) ?(enabled=true) + ?(cluster=Ref.null) ?(host=Ref.null) ?(pIF=Ref.null) ?(enabled=true) ?(joined=true) ?(allowed_operations=[]) ?(current_operations=[]) ?(other_config=[]) () = - Db.Cluster_host.create ~__context ~ref ~uuid ~cluster ~host ~enabled - ~allowed_operations ~current_operations ~other_config; + Db.Cluster_host.create ~__context ~ref ~uuid ~cluster ~host ~pIF ~enabled + ~allowed_operations ~current_operations ~other_config ~joined; ref let make_cluster_and_cluster_host ~__context ?(ref=Ref.make ()) ?(uuid=make_uuid ()) - ?(network=Ref.null) ?(cluster_token="") ?(cluster_stack=Constants.default_smapiv3_cluster_stack) + ?(cluster_token="") ?(pIF=Ref.null) ?(cluster_stack=Constants.default_smapiv3_cluster_stack) ?(allowed_operations=[]) ?(current_operations=[]) ?(pool_auto_join=true) - ?(token_timeout=5000L) ?(token_timeout_coefficient=1000L) ?(cluster_config=[]) + ?(token_timeout=Constants.default_token_timeout_s) + ?(token_timeout_coefficient=Constants.default_token_timeout_coefficient_s) ?(cluster_config=[]) ?(other_config=[]) ?(host=Ref.null) () = - Db.Cluster.create ~__context ~ref ~uuid ~network ~cluster_token + Db.Cluster.create ~__context ~ref ~uuid ~cluster_token ~pending_forget:[] ~cluster_stack ~allowed_operations ~current_operations ~pool_auto_join ~token_timeout ~token_timeout_coefficient ~cluster_config ~other_config; - let cluster_host_ref = make_cluster_host ~__context ~cluster:ref ~host () in + let cluster_host_ref = make_cluster_host ~__context ~cluster:ref ~host ~pIF () in ref, cluster_host_ref + +let make_cluster_and_hosts ~__context extra_hosts = + let cluster_stack = "mock_cluster_stack" in + let network = make_network ~__context () in + + let host = Helpers.get_localhost ~__context in + let pIF = make_pif ~__context ~network ~host ~iP:"192.0.2.1" () in + let cluster, cluster_host = make_cluster_and_cluster_host ~__context ~cluster_stack ~pIF ~host () in + + let build_cluster_host i host = + let pIF = make_pif ~__context ~network ~host ~iP:(Printf.sprintf "192.0.2.%d" (i+2)) () in + make_cluster_host ~__context ~cluster ~host ~pIF () + in + + cluster, cluster_host :: List.mapi build_cluster_host extra_hosts diff --git a/ocaml/xapi-consts/api_errors.ml b/ocaml/xapi-consts/api_errors.ml index 9a3b888ad6b..06fa28d55b4 100644 --- a/ocaml/xapi-consts/api_errors.ml +++ b/ocaml/xapi-consts/api_errors.ml @@ -609,8 +609,10 @@ let cluster_already_exists = "CLUSTER_ALREADY_EXISTS" let clustering_enabled = "CLUSTERING_ENABLED" let clustering_disabled = 
"CLUSTERING_DISABLED" let cluster_does_not_have_one_node = "CLUSTER_DOES_NOT_HAVE_ONE_NODE" +let cluster_host_is_last = "CLUSTER_HOST_IS_LAST" let no_compatible_cluster_host = "NO_COMPATIBLE_CLUSTER_HOST" let cluster_force_destroy_failed = "CLUSTER_FORCE_DESTROY_FAILED" -let clustering_enabled_on_network = "CLUSTERING_ENABLED_ON_NETWORK" let cluster_stack_in_use = "CLUSTER_STACK_IN_USE" let invalid_cluster_stack = "INVALID_CLUSTER_STACK" +let pif_not_attached_to_host = "PIF_NOT_ATTACHED_TO_HOST" +let cluster_host_not_joined = "CLUSTER_HOST_NOT_JOINED" diff --git a/ocaml/xapi-consts/api_messages.ml b/ocaml/xapi-consts/api_messages.ml index 2e7f30c53eb..b4a864dbe53 100644 --- a/ocaml/xapi-consts/api_messages.ml +++ b/ocaml/xapi-consts/api_messages.ml @@ -132,3 +132,7 @@ let host_cpu_features_down = addMessage "HOST_CPU_FEATURES_DOWN" 3L let host_cpu_features_up = addMessage "HOST_CPU_FEATURES_UP" 5L let pool_cpu_features_down = addMessage "POOL_CPU_FEATURES_DOWN" 5L let pool_cpu_features_up = addMessage "POOL_CPU_FEATURES_UP" 5L + +(* Cluster messages *) +let cluster_host_creation_failed = addMessage "CLUSTER_HOST_CREATION_FAILED" 3L +let cluster_host_enable_failed = addMessage "CLUSTER_HOST_ENABLE_FAILED" 3L diff --git a/ocaml/xapi-consts/constants.ml b/ocaml/xapi-consts/constants.ml index 0813ada7366..fd904182474 100644 --- a/ocaml/xapi-consts/constants.ml +++ b/ocaml/xapi-consts/constants.ml @@ -142,3 +142,6 @@ let storage_migrate_vgpu_map_key = "maps_to" (* Corosync timeout default values *) let default_token_timeout_s = 20.0 let default_token_timeout_coefficient_s = 1.0 +(* Minimum threshold for token timeout parameters *) +let minimum_token_timeout_s = 1.0 +let minimum_token_timeout_coefficient_s = 0.65 diff --git a/ocaml/xapi/cli_frontend.ml b/ocaml/xapi/cli_frontend.ml index a0b155b0c32..2898485af1b 100644 --- a/ocaml/xapi/cli_frontend.ml +++ b/ocaml/xapi/cli_frontend.ml @@ -1793,6 +1793,14 @@ let rec cmdtable_data : (string*cmd_spec) list = implementation=No_fd Cli_operations.sr_probe; flags=[]; }; + "sr-probe-ext", + { + reqd=["type"]; + optn=["host-uuid";"device-config:";"sm-config:"]; + help="Perform a storage probe. The device-config parameters can be specified by e.g. device-config:devs=/dev/sdb1. 
Unlike sr-probe, this command returns results in the same human-readable format for every SR type."; + implementation=No_fd Cli_operations.sr_probe_ext; + flags=[]; + }; "sr-scan", { reqd=["uuid"]; @@ -2933,7 +2941,7 @@ add a mapping of 'path' -> '/tmp', the command line should contain the argument }; "cluster-create", { - reqd=["network-uuid"]; + reqd=["pif-uuid"]; optn=["cluster-stack";"pool-auto-join";"token-timeout";"token-timeout-coefficient"]; help="Create new cluster with master as first member"; implementation=No_fd Cli_operations.Cluster.create; @@ -2949,7 +2957,7 @@ add a mapping of 'path' -> '/tmp', the command line should contain the argument }; "cluster-host-create", { - reqd=["cluster-uuid";"host-uuid"]; + reqd=["cluster-uuid";"host-uuid";"pif-uuid"]; optn=[]; help="Add a host to an existing cluster"; implementation=No_fd Cli_operations.Cluster_host.create; diff --git a/ocaml/xapi/cli_operations.ml b/ocaml/xapi/cli_operations.ml index aeb7d74df89..a2e8e64b56a 100644 --- a/ocaml/xapi/cli_operations.ml +++ b/ocaml/xapi/cli_operations.ml @@ -774,8 +774,6 @@ let gen_cmds rpc session_id = ; Client.VGPU.(mk get_all get_all_records_where get_by_uuid vgpu_record "vgpu" [] ["uuid";"vm-uuid";"device";"gpu-group-uuid"] rpc session_id) ; Client.VGPU_type.(mk get_all get_all_records_where get_by_uuid vgpu_type_record "vgpu-type" [] ["uuid";"vendor-name";"model-name";"max-resolution";"max-heads"] rpc session_id) ; Client.DR_task.(mk get_all get_all_records_where get_by_uuid dr_task_record "drtask" [] [] rpc session_id) - (*; Client.Alert.(mk get_all get_all_records_where get_by_uuid alert_record "alert" [] ["uuid";"message";"level";"timestamp";"system";"task"] rpc session_id) - *) ; Client.PVS_site.(mk get_all get_all_records_where get_by_uuid pvs_site_record "pvs-site" [] ["uuid"; "name-label"; "name-description"; "pvs-uuid"; "pvs-server-uuids"] rpc session_id) ; Client.PVS_server.(mk get_all get_all_records_where get_by_uuid pvs_server_record "pvs-server" [] ["uuid"; "addresses"; "pvs-site-uuid"] rpc session_id) ; Client.PVS_proxy.(mk get_all get_all_records_where get_by_uuid pvs_proxy_record "pvs-proxy" [] ["uuid"; "vif-uuid"; "pvs-site-uuid"; "currently-attached"; "cache-sr-uuid"] rpc session_id) @@ -787,21 +785,10 @@ let gen_cmds rpc session_id = ; Client.USB_group.(mk get_all get_all_records_where get_by_uuid usb_group_record "usb-group" [] ["uuid";"name-label";"name-description"] rpc session_id) ; Client.VUSB.(mk get_all get_all_records_where get_by_uuid vusb_record "vusb" [] ["uuid";"vm-uuid"; "usb-group-uuid"] rpc session_id) ; Client.Network_sriov.(mk get_all get_all_records_where get_by_uuid network_sriov_record "network-sriov" [] ["uuid"; "physical-pif"; "logical-pif"; "requires-reboot"; "configuration-mode"] rpc session_id) - ; Client.Cluster.(mk get_all get_all_records_where get_by_uuid cluster_record "cluster" [] ["uuid";"cluster-hosts";"network";"cluster-token";"cluster-stack";"allowed-operations";"current-operations";"pool-auto-join";"cluster-config";"other-config"] rpc session_id) - ; Client.Cluster_host.(mk get_all get_all_records_where get_by_uuid cluster_host_record "cluster-host" [] ["uuid";"cluster";"host";"enabled";"allowed-operations";"current-operations";"other-config"] rpc session_id) + ; Client.Cluster.(mk get_all get_all_records_where get_by_uuid cluster_record "cluster" [] ["uuid";"cluster-hosts";"cluster-token";"cluster-stack";"allowed-operations";"current-operations";"pool-auto-join";"cluster-config";"other-config"] rpc session_id) + ; 
Client.Cluster_host.(mk get_all get_all_records_where get_by_uuid cluster_host_record "cluster-host" [] ["uuid";"cluster";"pif";"host";"enabled";"allowed-operations";"current-operations";"other-config"] rpc session_id) ] -(* NB, might want to put these back in at some point - * let zurich_params_gone = - * ["distribution";"distribution_vsn";"os";"boot_params"] - * - * let zurich_param_map = - * [("name","name-label"); - * ("description","name-description"); - * ("vcpus","vcpus-number"); - * ("memory_set","memory-dynamic-max");] -*) - let message_create printer rpc session_id params = let body = List.assoc "body" params in let priority = try Int64.of_string (List.assoc "priority" params) with _ -> failwith "Priority field should be an integer" in @@ -1495,6 +1482,65 @@ let sr_probe printer rpc session_id params = with _ -> printer (Cli_printer.PList [txt]) +let sr_probe_ext printer rpc session_id params = + let host = parse_host_uuid rpc session_id params in + let _type = List.assoc "type" params in + let device_config = parse_device_config params in + let sm_config = read_map_params "sm-config" params in + let results = Client.SR.probe_ext ~rpc ~session_id ~host ~device_config ~_type ~sm_config in + let srs, complete_configs, incomplete_configs = + List.fold_left (fun (srs, complete_configs, incomplete_configs) x -> + match x.API.probe_result_sr with + | Some sr -> ((sr,x)::srs, complete_configs, incomplete_configs) + | None -> + if x.API.probe_result_complete then + (srs, x::complete_configs, incomplete_configs) + else + (srs, complete_configs, x::incomplete_configs) + ) + ([], [], []) + results + in + let print_sr x = + let health_to_string = function `healthy -> "healthy" | `recovering -> "recovering" in + (match x.API.sr_stat_uuid with + | Some uuid -> [ "uuid", uuid ] + | None -> []) @ + [ "name-label", x.sr_stat_name_label + ; "name-description", x.sr_stat_name_description + ; "total-space", Int64.to_string x.sr_stat_total_space + ; "free-space", Int64.to_string x.sr_stat_free_space + ; "clustered", string_of_bool x.sr_stat_clustered + ; "health", x.API.sr_stat_health |> health_to_string + ] in + if srs <> [] then begin + printer (Cli_printer.PMsg "The following SRs were found:"); + List.iteri + (fun i (sr, probe_result) -> + printer (Cli_printer.PMsg (Printf.sprintf "SR %d:" i)); + printer (Cli_printer.PTable [print_sr sr]); + printer (Cli_printer.PMsg (Printf.sprintf "SR %d configuration:" i)); + printer (Cli_printer.PTable [probe_result.API.probe_result_configuration]); + printer (Cli_printer.PMsg (Printf.sprintf "SR %d extra information:" i)); + printer (Cli_printer.PTable [probe_result.API.probe_result_extra_info]); + ) + srs; + end; + let print_config i probe_result = + printer (Cli_printer.PMsg (Printf.sprintf "Configuration %d:" i)); + printer (Cli_printer.PTable [probe_result.API.probe_result_configuration]); + printer (Cli_printer.PMsg (Printf.sprintf "Configuration %d extra information:" i)); + printer (Cli_printer.PTable [probe_result.API.probe_result_extra_info]); + in + if complete_configs <> [] then begin + printer (Cli_printer.PMsg "Found the following complete configurations that can be used to create SRs:"); + List.iteri print_config complete_configs; + end; + if incomplete_configs <> [] then begin + printer (Cli_printer.PMsg "Found the following incomplete configurations that may contain SRs:"); + List.iteri print_config incomplete_configs; + end + let sr_destroy printer rpc session_id params = let uuid = List.assoc "uuid" params in let sr = Client.SR.get_by_uuid 
rpc session_id uuid in @@ -4954,18 +5000,18 @@ module Cluster = struct Client.Cluster.pool_destroy ~rpc ~session_id ~self:cluster_ref let pool_resync printer rpc session_id params = - let cluster = List.assoc "cluster-uuid" params in - let cluster_ref = Client.Cluster.get_by_uuid rpc session_id cluster in + let cluster_uuid = List.assoc "cluster-uuid" params in + let cluster_ref = Client.Cluster.get_by_uuid rpc session_id cluster_uuid in Client.Cluster.pool_resync rpc session_id cluster_ref let create printer rpc session_id params = - let network_uuid = List.assoc "network-uuid" params in + let pif_uuid = List.assoc "pif-uuid" params in + let pIF = Client.PIF.get_by_uuid rpc session_id pif_uuid in let cluster_stack = get_param params "cluster-stack" ~default:Constants.default_smapiv3_cluster_stack in let pool_auto_join = get_bool_param params "pool-auto-join" ~default:true in let token_timeout = get_float_param params "token-timeout" ~default:Constants.default_token_timeout_s in let token_timeout_coefficient = get_float_param params "token-timeout-coefficient" ~default:Constants.default_token_timeout_coefficient_s in - let network = Client.Network.get_by_uuid rpc session_id network_uuid in - let cluster = Client.Cluster.create ~rpc ~session_id ~network ~cluster_stack ~pool_auto_join ~token_timeout ~token_timeout_coefficient in + let cluster = Client.Cluster.create ~rpc ~session_id ~pIF ~cluster_stack ~pool_auto_join ~token_timeout ~token_timeout_coefficient in let uuid = Client.Cluster.get_uuid ~rpc ~session_id ~self:cluster in printer (Cli_printer.PList [uuid]) @@ -4979,9 +5025,11 @@ module Cluster_host = struct let create printer rpc session_id params = let cluster_uuid = List.assoc "cluster-uuid" params in let host_uuid = List.assoc "host-uuid" params in + let pif_uuid = List.assoc "pif-uuid" params in let cluster_ref = Client.Cluster.get_by_uuid rpc session_id cluster_uuid in let host_ref = Client.Host.get_by_uuid rpc session_id host_uuid in - let cluster_host = Client.Cluster_host.create rpc session_id cluster_ref host_ref in + let pif_ref = Client.PIF.get_by_uuid rpc session_id pif_uuid in + let cluster_host = Client.Cluster_host.create rpc session_id cluster_ref host_ref pif_ref in let uuid = Client.Cluster_host.get_uuid ~rpc ~session_id ~self:cluster_host in printer (Cli_printer.PList [uuid]) diff --git a/ocaml/xapi/create_storage.ml b/ocaml/xapi/create_storage.ml index bfabc8d1bbd..becb42f5976 100644 --- a/ocaml/xapi/create_storage.ml +++ b/ocaml/xapi/create_storage.ml @@ -41,17 +41,10 @@ let plug_all_pbds __context = my_pbds; !result -let maybe_reenable_cluster_host __context = - let host = Helpers.get_localhost __context in - match Xapi_clustering.find_cluster_host ~__context ~host with - | Some self -> - Xapi_cluster_host.enable ~__context ~self - | None -> () let plug_unplugged_pbds __context = (* If the plug is to succeed for SM's requiring a cluster stack * we have to enable the cluster stack too if we have one *) - log_and_ignore_exn(fun () -> maybe_reenable_cluster_host __context); let my_pbds = Helpers.get_my_pbds __context in List.iter (fun (self, pbd_record) -> diff --git a/ocaml/xapi/db_gc_util.ml b/ocaml/xapi/db_gc_util.ml index 465e292d960..a0bd0a8f801 100644 --- a/ocaml/xapi/db_gc_util.ml +++ b/ocaml/xapi/db_gc_util.ml @@ -110,6 +110,12 @@ let gc_VIFs ~__context = let gc_PBDs ~__context = gc_connector ~__context Db.PBD.get_all Db.PBD.get_record (fun x->valid_ref __context x.pBD_host) (fun x->valid_ref __context x.pBD_SR) Db.PBD.destroy +let gc_Cluster_hosts 
~__context = + gc_connector ~__context Db.Cluster_host.get_all Db.Cluster_host.get_record + (fun x -> valid_ref __context x.cluster_host_host) + (fun x -> valid_ref __context x.cluster_host_PIF) + Db.Cluster_host.destroy + let gc_VGPUs ~__context = gc_connector ~__context Db.VGPU.get_all Db.VGPU.get_record (fun x->valid_ref __context x.vGPU_VM) (fun x->valid_ref __context x.vGPU_GPU_group) (fun ~__context ~self -> @@ -400,6 +406,7 @@ let timeout_alerts ~__context = let gc_subtask_list = [ "VDIs", gc_VDIs; "PIFs", gc_PIFs; + "Cluster_host", gc_Cluster_hosts; "VBDs", gc_VBDs; "crashdumps", gc_crashdumps; "VIFs", gc_VIFs; diff --git a/ocaml/xapi/message_forwarding.ml b/ocaml/xapi/message_forwarding.ml index 652b46db0ba..61534275e36 100644 --- a/ocaml/xapi/message_forwarding.ml +++ b/ocaml/xapi/message_forwarding.ml @@ -591,12 +591,12 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct Local.Pool.ha_failover_plan_exists ~__context ~n let ha_compute_max_host_failures_to_tolerate ~__context = - Xapi_clustering.with_clustering_lock_if_cluster_exists ~__context (fun () -> + Xapi_clustering.with_clustering_lock_if_cluster_exists ~__context __LOC__ (fun () -> info "Pool.ha_compute_max_host_failures_to_tolerate: pool = '%s'" (current_pool_uuid ~__context); Local.Pool.ha_compute_max_host_failures_to_tolerate ~__context) let ha_compute_hypothetical_max_host_failures_to_tolerate ~__context ~configuration = - Xapi_clustering.with_clustering_lock_if_cluster_exists ~__context (fun () -> + Xapi_clustering.with_clustering_lock_if_cluster_exists ~__context __LOC__ (fun () -> info "Pool.ha_compute_hypothetical_max_host_failures_to_tolerate: pool = '%s'; configuration = [ %s ]" (current_pool_uuid ~__context) (String.concat "; " (List.map (fun (vm, p) -> Ref.string_of vm ^ " " ^ p) configuration)); @@ -610,7 +610,7 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct Local.Pool.ha_compute_vm_failover_plan ~__context ~failed_hosts ~failed_vms let set_ha_host_failures_to_tolerate ~__context ~self ~value = - Xapi_clustering.with_clustering_lock_if_cluster_exists ~__context (fun () -> + Xapi_clustering.with_clustering_lock_if_cluster_exists ~__context __LOC__ (fun () -> info "Pool.set_ha_host_failures_to_tolerate: pool = '%s'; value = %Ld" (pool_uuid ~__context self) value; Local.Pool.set_ha_host_failures_to_tolerate ~__context ~self ~value) @@ -832,11 +832,11 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct let clear_reserved_netsriov_vfs_on ~__context ~vm = Db.VM.get_VIFs ~__context ~self:vm |> List.iter (fun vif -> - let vf = Db.VIF.get_reserved_pci ~__context ~self:vif in - Db.VIF.set_reserved_pci ~__context ~self:vif ~value:Ref.null; - if Db.is_valid_ref __context vf - then Db.PCI.set_scheduled_to_be_attached_to ~__context ~self:vf ~value:Ref.null - ) + let vf = Db.VIF.get_reserved_pci ~__context ~self:vif in + Db.VIF.set_reserved_pci ~__context ~self:vif ~value:Ref.null; + if Db.is_valid_ref __context vf + then Db.PCI.set_scheduled_to_be_attached_to ~__context ~self:vf ~value:Ref.null + ) (* Notes on memory checking/reservation logic: When computing the hosts free memory we consider all VMs resident_on (ie running @@ -1660,11 +1660,11 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct let source_host = Db.VM.get_resident_on ~__context ~self:vm in let to_equal_or_greater_version = Helpers.host_versions_not_decreasing ~__context - ~host_from:(Helpers.LocalObject source_host) - ~host_to:(Helpers.LocalObject host) in 
+ ~host_from:(Helpers.LocalObject source_host) + ~host_to:(Helpers.LocalObject host) in if (Helpers.rolling_upgrade_in_progress ~__context) && (not to_equal_or_greater_version) then - raise (Api_errors.Server_error (Api_errors.not_supported_during_upgrade, [])); + raise (Api_errors.Server_error (Api_errors.not_supported_during_upgrade, [])); (* Make sure the target has enough memory to receive the VM *) let snapshot = Db.VM.get_record ~__context ~self:vm in @@ -1700,8 +1700,8 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct * forward the call to the source. *) let snapshot = Db.VM.get_record ~__context ~self:vm in (fun ~local_fn ~__context ~vm op -> - allocate_vm_to_host ~__context ~vm ~host ~snapshot ~host_op:`vm_migrate (); - forward_vm_op ~local_fn ~__context ~vm op) + allocate_vm_to_host ~__context ~vm ~host ~snapshot ~host_op:`vm_migrate (); + forward_vm_op ~local_fn ~__context ~vm op) else (* Cross pool: just forward to the source host. Resources on the * destination will be reserved separately. *) @@ -1713,8 +1713,8 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct with_vm_operation ~__context ~self:vm ~doc:"VM.migrate_send" ~op:`migrate_send (fun () -> Server_helpers.exec_with_subtask ~__context "VM.assert_can_migrate" (fun ~__context -> - assert_can_migrate ~__context ~vm ~dest ~live ~vdi_map ~vif_map ~vgpu_map ~options - ); + assert_can_migrate ~__context ~vm ~dest ~live ~vdi_map ~vif_map ~vgpu_map ~options + ); forwarder ~local_fn ~__context ~vm (fun session_id rpc -> Client.VM.migrate_send rpc session_id vm dest live vdi_map vif_map options vgpu_map) ) @@ -1990,7 +1990,7 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct let local_fn = Local.VM.s3_resume ~vm in forward_vm_op ~local_fn ~__context ~vm (fun session_id rpc -> Client.VM.s3_resume rpc session_id vm) - let set_bios_strings ~__context ~self ~value = + let set_bios_strings ~__context ~self ~value = info "VM.set_bios_strings: self = '%s'; value = '%s'" (vm_uuid ~__context self) (String.concat "; " (List.map (fun (k,v) -> k ^ "=" ^ v) value)); Local.VM.set_bios_strings ~__context ~self ~value @@ -2676,14 +2676,14 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct let local_fn = Local.Host.set_iscsi_iqn ~host ~value in do_op_on ~local_fn ~__context ~host (fun session_id rpc -> - Client.Host.set_iscsi_iqn rpc session_id host value) + Client.Host.set_iscsi_iqn rpc session_id host value) let set_multipathing ~__context ~host ~value = info "Host.set_multipathing: host='%s' value='%s'" (host_uuid ~__context host) (string_of_bool value); let local_fn = Local.Host.set_multipathing ~host ~value in do_op_on ~local_fn ~__context ~host (fun session_id rpc -> - Client.Host.set_multipathing rpc session_id host value) + Client.Host.set_multipathing rpc session_id host value) end module Host_crashdump = struct @@ -4084,7 +4084,7 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct let update_vdi = Db.Pool_update.get_vdi ~__context ~self in if Db.is_valid_ref __context update_vdi then VDI.forward_vdi_op ~local_fn ~__context ~self:update_vdi - (fun session_id rpc -> Client.Pool_update.pool_clean rpc session_id self) + (fun session_id rpc -> Client.Pool_update.pool_clean rpc session_id self) else info "Pool_update.pool_clean: pool update '%s' has already been cleaned." 
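
One pattern running through these forwarding hunks: with_clustering_lock_if_cluster_exists (and, later in this patch, with_clustering_lock) now takes a __LOC__ argument so every lock acquisition can be traced back to a source location. A minimal self-contained sketch of that idiom — illustrative only, not xapi's implementation; Mutex needs the threads library on OCaml < 5:

    (* Location-aware lock wrapper in the spirit of with_clustering_lock __LOC__.
       All names here are illustrative stand-ins. *)
    let clustering_lock = Mutex.create ()

    let with_clustering_lock (loc : string) (f : unit -> 'a) : 'a =
      Printf.printf "acquiring clustering lock from %s\n%!" loc;
      Mutex.lock clustering_lock;
      Fun.protect
        ~finally:(fun () ->
          Mutex.unlock clustering_lock;
          Printf.printf "released clustering lock taken at %s\n%!" loc)
        f

    let () =
      (* __LOC__ expands to a "File ..., line ..." string at this call site *)
      with_clustering_lock __LOC__ (fun () -> print_endline "critical section")
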
(pool_update_uuid ~__context self) @@ -4098,7 +4098,7 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct let update_vdi = Db.Pool_update.get_vdi ~__context ~self in if Db.is_valid_ref __context update_vdi then VDI.forward_vdi_op ~local_fn ~__context ~self:update_vdi - (fun session_id rpc -> Client.Pool_update.attach rpc session_id self) + (fun session_id rpc -> Client.Pool_update.attach rpc session_id self) else raise (Api_errors.Server_error(Api_errors.cannot_find_update, [(pool_update_uuid ~__context self)])) @@ -4108,7 +4108,7 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct let update_vdi = Db.Pool_update.get_vdi ~__context ~self in if Db.is_valid_ref __context update_vdi then VDI.forward_vdi_op ~local_fn ~__context ~self:update_vdi - (fun session_id rpc -> Client.Pool_update.detach rpc session_id self) + (fun session_id rpc -> Client.Pool_update.detach rpc session_id self) else raise (Api_errors.Server_error(Api_errors.cannot_find_update, [(pool_update_uuid ~__context self)])) @@ -4218,7 +4218,7 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct end module VUSB = struct - let update_vusb_operations ~__context ~vusb = + let update_vusb_operations ~__context ~vusb = Helpers.with_global_lock (fun () -> Xapi_vusb_helpers.update_allowed_operations ~__context ~self:vusb) @@ -4297,12 +4297,12 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct end module Cluster = struct - let create ~__context ~network ~cluster_stack ~pool_auto_join ~token_timeout ~token_timeout_coefficient = + let create ~__context ~pIF ~cluster_stack ~pool_auto_join ~token_timeout ~token_timeout_coefficient = info "Cluster.create"; - let pool = Db.Pool.get_all ~__context |> List.hd in (* assumes 1 pool in DB *) + let pool = Helpers.get_pool ~__context in (* assumes 1 pool in DB *) Xapi_pool_helpers.with_pool_operation ~__context ~self:pool ~doc:"Cluster.create" ~op:`cluster_create (fun () -> - let cluster = Local.Cluster.create ~__context ~network ~cluster_stack ~pool_auto_join ~token_timeout ~token_timeout_coefficient in + let cluster = Local.Cluster.create ~__context ~pIF ~cluster_stack ~pool_auto_join ~token_timeout ~token_timeout_coefficient in Xapi_cluster_helpers.update_allowed_operations ~__context ~self:cluster; cluster ) @@ -4313,31 +4313,36 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct (fun () -> Local.Cluster.destroy ~__context ~self) + let get_network ~__context ~self = + info "Cluster.get_network"; + Local.Cluster.get_network ~__context ~self + + (* Pool operations don't need a lock, they call other locked functions *) let pool_create ~__context ~network ~cluster_stack ~token_timeout ~token_timeout_coefficient = - info "Cluster.pool_create"; + info "Cluster.pool_create"; (* iterates over Cluster_host.create *) Local.Cluster.pool_create ~__context ~network ~cluster_stack ~token_timeout ~token_timeout_coefficient - let pool_force_destroy ~__context ~self = + let pool_force_destroy ~__context ~self = (* iterates over Cluster_host.destroy *) info "Cluster.pool_force_destroy cluster: %s" (Ref.string_of self); Local.Cluster.pool_force_destroy ~__context ~self - let pool_destroy ~__context ~self = + let pool_destroy ~__context ~self = (* iterates Cluster_host.destroy *) info "Cluster.pool_destroy cluster %s" (Ref.string_of self); Local.Cluster.pool_destroy ~__context ~self - let pool_resync ~__context ~self = + let pool_resync ~__context ~self = (* iterates Cluster_host.enable and 
Cluster_host.create where necessary *) info "Cluster.pool_resync cluster: %s" (Ref.string_of self); Local.Cluster.pool_resync ~__context ~self end module Cluster_host = struct - let create ~__context ~cluster ~host = - info "Cluster_host.create with cluster:%s, host:%s" (Ref.string_of cluster) (Ref.string_of host); - let local_fn = Local.Cluster_host.create ~cluster ~host in + let create ~__context ~cluster ~host ~pif = + info "Cluster_host.create with cluster:%s, host:%s, pif:%s" (Ref.string_of cluster) (Ref.string_of host) (Ref.string_of pif); + let local_fn = Local.Cluster_host.create ~cluster ~host ~pif in Xapi_cluster_helpers.with_cluster_operation ~__context ~self:cluster ~doc:"Cluster.add" ~op:`add (fun () -> let cluster_host = do_op_on ~__context ~local_fn ~host - (fun session_id rpc -> Client.Cluster_host.create rpc session_id cluster host) in + (fun session_id rpc -> Client.Cluster_host.create rpc session_id cluster host pif) in Xapi_cluster_host_helpers.update_allowed_operations ~__context ~self:cluster_host; cluster_host ) @@ -4346,15 +4351,23 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct info "Cluster_host.destroy cluster_host: %s" (Ref.string_of self); let local_fn = Local.Cluster_host.destroy ~self in let host = Db.Cluster_host.get_host ~__context ~self in - do_op_on ~__context ~local_fn ~host - (fun session_id rpc -> Client.Cluster_host.destroy rpc session_id self) + let cluster = Db.Cluster_host.get_cluster ~__context ~self in + Xapi_cluster_helpers.with_cluster_operation ~__context ~self:cluster ~doc:"Cluster_host.destroy" ~op:`remove + (fun () -> + do_op_on ~__context ~local_fn ~host + (fun session_id rpc -> Client.Cluster_host.destroy rpc session_id self) + ) let force_destroy ~__context ~self = info "Cluster_host.force_destroy cluster_host: %s" (Ref.string_of self); let local_fn = Local.Cluster_host.force_destroy ~self in let host = Db.Cluster_host.get_host ~__context ~self in - do_op_on ~__context ~local_fn ~host - (fun session_id rpc -> Client.Cluster_host.force_destroy rpc session_id self) + let cluster = Db.Cluster_host.get_cluster ~__context ~self in + Xapi_cluster_helpers.with_cluster_operation ~__context ~self:cluster ~doc:"Cluster_host.force_destroy" ~op:`remove + (fun () -> + do_op_on ~__context ~local_fn ~host + (fun session_id rpc -> Client.Cluster_host.force_destroy rpc session_id self) + ) let enable ~__context ~self = info "Cluster_host.enable cluster_host %s" (Ref.string_of self); @@ -4379,5 +4392,33 @@ module Forward = functor(Local: Custom_actions.CUSTOM_ACTIONS) -> struct (fun () -> do_op_on ~__context ~local_fn ~host (fun session_id rpc -> Client.Cluster_host.disable rpc session_id self))) + + let forget ~__context ~self = + info "Cluster_host.forget cluster_host:%s" (Ref.string_of self); + let cluster = Db.Cluster_host.get_cluster ~__context ~self in + let local_fn = Local.Cluster_host.forget ~self in + (* We need to ask another host that has a cluster host to mark it as dead. 
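
The forget forwarder delegates the "mark dead" call to the first live peer and falls back host by host when a candidate reports HOST_OFFLINE, re-raising only once every peer has been tried. A self-contained model of that find_first_live loop, with a local Host_offline exception standing in for the real Api_errors value:

    (* Try each candidate in turn; an offline candidate means "try the next
       one", and the error is reported only when the list is exhausted. *)
    exception Host_offline of string

    let rec find_first_live (do_forget : string -> unit) = function
      | [] -> print_endline "No other cluster hosts, nothing to do"
      | host :: rest -> (
          try do_forget host
          with Host_offline _ as e ->
            if rest = [] then raise e (* ran out of hosts: report the error *)
            else find_first_live do_forget rest)

    let () =
      let do_forget h =
        if h = "host1" then raise (Host_offline h)
        else Printf.printf "marked dead via %s\n" h
      in
      find_first_live do_forget [ "host1"; "host2" ]
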
+ * We might've run force destroy and this host would no longer have a cluster host + * *) + let other_hosts = + Db.Cluster.get_cluster_hosts ~__context ~self:cluster + |> List.filter ((<>) self) in + let rec find_first_live = function + | [] -> info "No other cluster hosts, nothing to do" (* go ahead and finish Host.destroy *) + | other_cluster_host :: rest -> + try + let host = Db.Cluster_host.get_host ~__context ~self:other_cluster_host in + Xapi_cluster_helpers.with_cluster_operation ~__context ~self:cluster ~doc:"Cluster.remove" ~op:`remove + (fun () -> + do_op_on ~__context ~local_fn ~host + (fun session_id rpc -> Client.Cluster_host.forget rpc session_id self)); + with Api_errors.Server_error(code, _) as e when code = Api_errors.host_offline -> + match rest with + | [] -> + debug "Ran out of hosts to try (and no cluster host on ourselves), reporting error"; + raise e + | _ -> find_first_live rest + in + find_first_live other_hosts end end diff --git a/ocaml/xapi/records.ml b/ocaml/xapi/records.ml index dbed22bfb7b..d7265d1f065 100644 --- a/ocaml/xapi/records.ml +++ b/ocaml/xapi/records.ml @@ -2080,9 +2080,6 @@ let cluster_record rpc session_id cluster = ~get:(fun () -> String.concat "; " (List.map (fun r -> get_uuid_from_ref r) (x ()).API.cluster_cluster_hosts)) ~get_set:(fun () -> List.map get_uuid_from_ref (x ()).API.cluster_cluster_hosts) () - ; make_field ~name:"network" - ~get:(fun () -> (x ()).API.cluster_network |> get_uuid_from_ref) - () ; make_field ~name:"cluster-token" ~get:(fun () -> (x ()).API.cluster_cluster_token) () @@ -2090,10 +2087,14 @@ let cluster_record rpc session_id cluster = ~get:(fun () -> (x ()).API.cluster_cluster_stack) () ; make_field ~name:"token-timeout" - ~get:(fun () -> Int64.to_string((x ()).API.cluster_token_timeout)) + ~get:(fun () -> string_of_float ((x ()).API.cluster_token_timeout)) () ; make_field ~name:"token-timeout-coefficient" - ~get:(fun () -> Int64.to_string((x ()).API.cluster_token_timeout_coefficient)) + ~get:(fun () -> string_of_float ((x ()).API.cluster_token_timeout_coefficient)) + () + ; make_field ~name:"pending-forget" ~hidden:true + ~get:(fun () -> String.concat "; " (x ()).API.cluster_pending_forget) + ~get_set:(fun () -> (x ()).API.cluster_pending_forget) () ; make_field ~name:"allowed-operations" ~get:(fun () -> String.concat "; " (List.map Record_util.cluster_operation_to_string (x ()).API.cluster_allowed_operations)) @@ -2134,12 +2135,18 @@ let cluster_host_record rpc session_id cluster_host = ; make_field ~name:"cluster" ~get:(fun () -> (x ()).API.cluster_host_cluster |> get_uuid_from_ref) () + ; make_field ~name:"PIF" + ~get:(fun () -> (x ()).API.cluster_host_PIF |> get_uuid_from_ref) + () ; make_field ~name:"host" ~get:(fun () -> (x ()).API.cluster_host_host |> get_uuid_from_ref) () ; make_field ~name:"enabled" ~get:(fun () -> (x ()).API.cluster_host_enabled |> string_of_bool) () + ; make_field ~name:"joined" + ~get:(fun () -> (x ()).API.cluster_host_joined |> string_of_bool) + () ; make_field ~name:"allowed-operations" ~get:(fun () -> String.concat "; " (List.map Record_util.cluster_host_operation_to_string (x ()).API.cluster_host_allowed_operations)) ~get_set:(fun () -> List.map Record_util.cluster_host_operation_to_string (x ()).API.cluster_host_allowed_operations) diff --git a/ocaml/xapi/xapi.ml b/ocaml/xapi/xapi.ml index 8c5ebc6692e..c49a5ae4367 100644 --- a/ocaml/xapi/xapi.ml +++ b/ocaml/xapi/xapi.ml @@ -910,7 +910,6 @@ let server_init() = (* CA-22417: bring up all non-bond slaves so that the SM backends can use 
storage NIC IP addresses (if the routing table happens to be right) *) "Best-effort bring up of physical and sriov NICs", [ Startup.NoExnRaising ], Xapi_pif.start_of_day_best_effort_bring_up; - "Create any necessary cluster_host objects", [ Startup.NoExnRaising ], (fun () -> Xapi_cluster_host.create_as_necessary __context (Helpers.get_localhost ~__context)); "updating the vswitch controller", [], (fun () -> Helpers.update_vswitch_controller ~__context ~host:(Helpers.get_localhost ~__context)); "initialising storage", [ Startup.NoExnRaising ], (fun () -> Helpers.call_api_functions ~__context Create_storage.create_storage_localhost); @@ -942,20 +941,43 @@ let server_init() = let wait_management_interface () = let management_if = Xapi_inventory.lookup Xapi_inventory._management_interface in - if management_if <> "" then ( + if management_if <> "" then begin debug "Waiting forever for the management interface to gain an IP address"; let ip = wait_for_management_ip_address ~__context in - debug "Management interface got IP address: %s; attempting to re-plug any unplugged PBDs" ip; + debug "Management interface got IP address: %s, attempting to re-plug unplugged PBDs" ip; + (* This may fail without the clustering IP, which is why we attempt + another replug in maybe_wait_for_clustering_ip *) Helpers.call_api_functions ~__context (fun rpc session_id -> Create_storage.plug_unplugged_pbds __context) - ) + end + in + + let maybe_wait_for_clustering_ip () = + let host = Helpers.get_localhost ~__context in + match Xapi_clustering.find_cluster_host ~__context ~host with + | Some self -> begin + debug "Waiting forever for cluster_host to gain an IP address"; + let ip = Xapi_mgmt_iface.(wait_for_clustering_ip ~__context ~self) in + debug "Got clustering IP %s, resyncing cluster_host %s" ip (Ref.string_of self); + Xapi_cluster_host.resync_host ~__context ~host; + debug "Attempting to re-plug remaining unplugged PBDs"; + Helpers.call_api_functions ~__context (fun rpc session_id -> + Create_storage.plug_unplugged_pbds __context) + end + | None -> () in Startup.run ~__context [ "fetching database backup", [ Startup.OnlySlave; Startup.NoExnRaising ], (fun () -> Pool_db_backup.fetch_database_backup ~master_address:(Pool_role.get_master_address()) ~pool_secret:!Xapi_globs.pool_secret ~force:None); - "wait management interface to come up", [ Startup.NoExnRaising ], wait_management_interface; + "wait management interface to come up, re-plug unplugged PBDs", [ Startup.NoExnRaising ], wait_management_interface; + + (* CA-290237, CA-290473: Create cluster objects after network objects and management IP initialised *) + "Create any necessary cluster_host objects", [ Startup.NoExnRaising ], + (fun () -> log_and_ignore_exn (fun () -> Xapi_cluster_host.create_as_necessary __context (Helpers.get_localhost ~__context))); + "wait for clustering IP if any, re-plug remaining unplugged PBDs", [ Startup.OnThread ], + (fun () -> log_and_ignore_exn (fun () -> maybe_wait_for_clustering_ip () )); "considering sending a master transition alert", [ Startup.NoExnRaising; Startup.OnlyMaster ], Xapi_pool_transition.consider_sending_alert __context; "Cancelling in-progress storage migrations", [], (fun () -> Storage_migrate.killall ~dbg:"xapi init"); @@ -971,7 +993,7 @@ let server_init() = ]; debug "startup: startup sequence finished"); - wait_to_die() + wait_to_die () with | Sys.Break -> cleanup_handler 0 | (Unix.Unix_error (e,s1,s2)) as exn -> diff --git a/ocaml/xapi/xapi_bond.ml b/ocaml/xapi/xapi_bond.ml index 
e172d43eed3..c594276def1 100644 --- a/ocaml/xapi/xapi_bond.ml +++ b/ocaml/xapi/xapi_bond.ml @@ -257,6 +257,18 @@ let requirements_of_mode = function ] | _ -> [] +let maybe_move_cluster_pif ~__context ~host ~to_pif ~(should_move : API.ref_PIF -> bool) = + match Xapi_clustering.find_cluster_host ~__context ~host with + | Some cluster_host -> + let cluster_pif = Db.Cluster_host.get_PIF ~__context ~self:cluster_host in + if (should_move cluster_pif) + then begin + debug "Moving cluster_host %s from PIF %s to PIF %s" (Ref.string_of cluster_host) + (Ref.string_of cluster_pif) (Ref.string_of to_pif); + Db.Cluster_host.set_PIF ~__context ~self:cluster_host ~value:to_pif + end + | None -> () + let create ~__context ~network ~members ~mAC ~mode ~properties = Xapi_network.assert_network_is_managed ~__context ~self:network; let host = Db.PIF.get_host ~__context ~self:(List.hd members) in @@ -266,7 +278,7 @@ let create ~__context ~network ~members ~mAC ~mode ~properties = (* Validate MAC parameter; note an empty string is OK here, since that means 'inherit MAC from * primary slave PIF' (see below) *) if mAC <> "" && (not (Helpers.is_valid_MAC mAC)) then - raise (Api_errors.Server_error (Api_errors.mac_invalid, [mAC])); + raise Api_errors.(Server_error (mac_invalid, [mAC])); let requirements = requirements_of_mode mode in (* Check that each of the supplied properties is valid. *) @@ -313,7 +325,7 @@ let create ~__context ~network ~members ~mAC ~mode ~properties = | None, pif_with_ip::_, _ -> pif_with_ip | None, [], pif::_ -> pif | None, [], [] -> - raise (Api_errors.Server_error(Api_errors.pif_bond_needs_more_members, [])) + raise Api_errors.(Server_error (pif_bond_needs_more_members, [])) in let mAC = if mAC <> "" then @@ -321,7 +333,7 @@ let create ~__context ~network ~members ~mAC ~mode ~properties = else Db.PIF.get_MAC ~__context ~self:primary_slave in - let disallow_unplug = + let disallow_unplug = (* this is always true if one of the PIFs is a cluster_host.PIF *) List.fold_left (fun a m -> Db.PIF.get_disallow_unplug ~__context ~self:m || a) false members in @@ -333,19 +345,19 @@ let create ~__context ~network ~members ~mAC ~mode ~properties = (* 5. Members must not be the management interface if HA is enabled *) (* 6. Members must be PIFs that are managed by xapi *) (* 7. Members must have the same PIF properties *) - (* 8. Only the primary PIF should have a non-None IP configuration *) + (* 8. 
At most one member may have an IP address *) List.iter (fun self -> Xapi_pif_helpers.assert_pif_is_managed ~__context ~self; Xapi_pif_helpers.bond_is_allowed_on_pif ~__context ~self; let pool = Helpers.get_pool ~__context in if Db.Pool.get_ha_enabled ~__context ~self:pool && Db.PIF.get_management ~__context ~self - then raise (Api_errors.Server_error(Api_errors.ha_cannot_change_bond_status_of_mgmt_iface, [])); + then raise Api_errors.(Server_error(ha_cannot_change_bond_status_of_mgmt_iface, [])); if Db.PIF.get_capabilities ~__context ~self |> List.mem "fcoe" then Xapi_pif.assert_fcoe_not_in_use ~__context ~self ) members; let hosts = List.map (fun self -> Db.PIF.get_host ~__context ~self) members in if List.length (List.setify hosts) <> 1 - then raise (Api_errors.Server_error (Api_errors.pif_cannot_bond_cross_host, [])); + then raise Api_errors.(Server_error (pif_cannot_bond_cross_host, [])); let pif_properties = if members = [] then [] @@ -354,7 +366,7 @@ let create ~__context ~network ~members ~mAC ~mode ~properties = let p = List.hd ps in let equal = List.fold_left (fun result p' -> result && (p = p')) true (List.tl ps) in if not equal then - raise (Api_errors.Server_error (Api_errors.incompatible_pif_properties, [])) + raise Api_errors.(Server_error (incompatible_pif_properties, [])) else p in @@ -431,6 +443,12 @@ let create ~__context ~network ~members ~mAC ~mode ~properties = Db.PIF.set_disallow_unplug ~__context ~self:pif ~value:false) members end; + + TaskHelper.set_progress ~__context 0.9; + + (* If the host has a cluster_host AND the cluster_host's PIF is a member, update the PIF *) + maybe_move_cluster_pif ~__context ~host ~to_pif:master ~should_move:(fun pif -> List.mem pif members); + TaskHelper.set_progress ~__context 1.0; ); (* return a ref to the new Bond object *) @@ -456,7 +474,7 @@ let destroy ~__context ~self = (* CA-86573: forbid the deletion of a bond involving the mgmt interface if HA is on *) let pool = Helpers.get_pool ~__context in if Db.Pool.get_ha_enabled ~__context ~self:pool && Db.PIF.get_management ~__context ~self:master - then raise (Api_errors.Server_error(Api_errors.ha_cannot_change_bond_status_of_mgmt_iface, [])); + then raise Api_errors.(Server_error (ha_cannot_change_bond_status_of_mgmt_iface, [])); (* Copy IP configuration from master to primary member *) move_configuration ~__context master primary_slave; @@ -502,6 +520,9 @@ let destroy ~__context ~self = Db.PIF.set_disallow_unplug ~__context ~self:primary_slave ~value:true end; + (* Move Cluster_host to primary slave IF currently on bond, i.e. 
if cluster PIF = master *) + maybe_move_cluster_pif ~__context ~host ~to_pif:primary_slave ~should_move:((=) master); + (* Destroy the Bond and master PIF *) Db.Bond.destroy ~__context ~self; Db.PIF.destroy ~__context ~self:master; diff --git a/ocaml/xapi/xapi_cluster.ml b/ocaml/xapi/xapi_cluster.ml index 48f6c6e6a1f..ecd6d56e9f1 100644 --- a/ocaml/xapi/xapi_cluster.ml +++ b/ocaml/xapi/xapi_cluster.ml @@ -20,18 +20,19 @@ open D (* TODO: update allowed_operations on boot/toolstack-restart *) let validate_params ~token_timeout ~token_timeout_coefficient = - let invalid_value x y = raise (Api_errors.(Server_error (invalid_value, [ x; y ]))) in - if token_timeout < 1.0 then invalid_value "token_timeout" (string_of_float token_timeout); - if token_timeout_coefficient < 0.65 then invalid_value "token_timeout_coefficient" (string_of_float token_timeout_coefficient) + let invalid_value x y = raise (Api_errors.(Server_error (invalid_value, [ x; string_of_float y ]))) in + if token_timeout < Constants.minimum_token_timeout_s then + invalid_value "token_timeout" token_timeout; + if token_timeout_coefficient < Constants.minimum_token_timeout_coefficient_s then + invalid_value "token_timeout_coefficient" token_timeout_coefficient -let create ~__context ~network ~cluster_stack ~pool_auto_join ~token_timeout ~token_timeout_coefficient = +let create ~__context ~pIF ~cluster_stack ~pool_auto_join ~token_timeout ~token_timeout_coefficient = assert_cluster_stack_valid ~cluster_stack; (* Currently we only support corosync. If we support more cluster stacks, this * should be replaced by a general function that checks the given cluster_stack *) Pool_features.assert_enabled ~__context ~f:Features.Corosync; - (* TODO: take network lock *) - with_clustering_lock (fun () -> + with_clustering_lock __LOC__(fun () -> let dbg = Context.string_of_task __context in validate_params ~token_timeout ~token_timeout_coefficient; let cluster_ref = Ref.make () in @@ -40,12 +41,11 @@ let create ~__context ~network ~cluster_stack ~pool_auto_join ~token_timeout ~to let cluster_host_uuid = Uuidm.to_string (Uuidm.create `V4) in (* For now we assume we have only one pool TODO: get master ref explicitly passed in as parameter*) - let pool = Db.Pool.get_all ~__context |> List.hd in - let host = Db.Pool.get_master ~__context ~self:pool in + let host = Helpers.get_master ~__context in - let pif = pif_of_host ~__context network host in - assert_pif_prerequisites pif; - let ip = ip_of_pif pif in + let pifrec = Db.PIF.get_record ~__context ~self:pIF in + assert_pif_prerequisites (pIF,pifrec); + let ip = ip_of_pif (pIF,pifrec) in let token_timeout_ms = Int64.of_float(token_timeout*.1000.0) in let token_timeout_coefficient_ms = Int64.of_float(token_timeout_coefficient*.1000.0) in @@ -61,11 +61,11 @@ let create ~__context ~network ~cluster_stack ~pool_auto_join ~token_timeout ~to match result with | Result.Ok cluster_token -> D.debug "Got OK from LocalClient.create"; - Db.Cluster.create ~__context ~ref:cluster_ref ~uuid:cluster_uuid ~network ~cluster_token ~cluster_stack - ~pool_auto_join ~token_timeout:token_timeout_ms ~token_timeout_coefficient:token_timeout_coefficient_ms ~current_operations:[] ~allowed_operations:[] ~cluster_config:[] + Db.Cluster.create ~__context ~ref:cluster_ref ~uuid:cluster_uuid ~cluster_token ~cluster_stack ~pending_forget:[] + ~pool_auto_join ~token_timeout ~token_timeout_coefficient ~current_operations:[] ~allowed_operations:[] ~cluster_config:[] ~other_config:[]; - Db.Cluster_host.create ~__context 
~ref:cluster_host_ref ~uuid:cluster_host_uuid ~cluster:cluster_ref ~host ~enabled:true - ~current_operations:[] ~allowed_operations:[] ~other_config:[]; + Db.Cluster_host.create ~__context ~ref:cluster_host_ref ~uuid:cluster_host_uuid ~cluster:cluster_ref ~host ~enabled:true ~pIF + ~current_operations:[] ~allowed_operations:[] ~other_config:[] ~joined:true; Xapi_cluster_host_helpers.update_allowed_operations ~__context ~self:cluster_host_ref; D.debug "Created Cluster: %s and Cluster_host: %s" (Ref.string_of cluster_ref) (Ref.string_of cluster_host_ref); set_ha_cluster_stack ~__context; @@ -76,7 +76,6 @@ let create ~__context ~network ~cluster_stack ~pool_auto_join ~token_timeout ~to ) let destroy ~__context ~self = - let dbg = Context.string_of_task __context in let cluster_hosts = Db.Cluster.get_cluster_hosts ~__context ~self in let cluster_host = match cluster_hosts with | [] -> None @@ -86,39 +85,35 @@ let destroy ~__context ~self = raise Api_errors.(Server_error(cluster_does_not_have_one_node, [string_of_int n])) in Xapi_stdext_monadic.Opt.iter (fun ch -> - assert_cluster_host_has_no_attached_sr_which_requires_cluster_stack ~__context ~self:ch + assert_cluster_host_has_no_attached_sr_which_requires_cluster_stack ~__context ~self:ch; + Xapi_cluster_host.force_destroy ~__context ~self:ch ) cluster_host; - let result = Cluster_client.LocalClient.destroy (rpc ~__context) dbg in - match result with - | Result.Ok () -> - Xapi_stdext_monadic.Opt.iter (fun ch -> - Db.Cluster_host.destroy ~__context ~self:ch - ) cluster_host; - Db.Cluster.destroy ~__context ~self; - D.debug "Cluster destroyed successfully"; - set_ha_cluster_stack ~__context; - Xapi_clustering.Daemon.disable ~__context - | Result.Error error -> - D.warn "Error occurred during Cluster.destroy"; - handle_error error + Db.Cluster.destroy ~__context ~self; + D.debug "Cluster destroyed successfully"; + set_ha_cluster_stack ~__context; + Xapi_clustering.Daemon.disable ~__context + +let get_network ~__context ~self = + get_network_internal ~__context ~self (* helper function; concurrency checks are done in implementation of Cluster.create and Cluster_host.create *) let pool_create ~__context ~network ~cluster_stack ~token_timeout ~token_timeout_coefficient = validate_params ~token_timeout ~token_timeout_coefficient; let master = Helpers.get_master ~__context in - let hosts = Db.Host.get_all ~__context in - + let slave_hosts = Xapi_pool_helpers.get_slaves_list ~__context in + let pIF,_ = pif_of_host ~__context network master in let cluster = Helpers.call_api_functions ~__context (fun rpc session_id -> - Client.Client.Cluster.create ~rpc ~session_id ~network ~cluster_stack:Constants.default_smapiv3_cluster_stack ~pool_auto_join:true ~token_timeout ~token_timeout_coefficient) + Client.Client.Cluster.create ~rpc ~session_id ~pIF ~cluster_stack + ~pool_auto_join:true ~token_timeout ~token_timeout_coefficient) in List.iter (fun host -> - if master <> host then - (* We need to run this code on the slave *) - Helpers.call_api_functions ~__context (fun rpc session_id -> - let cluster_host_ref = Client.Client.Cluster_host.create ~rpc ~session_id ~cluster ~host in - D.debug "Created Cluster_host: %s" (Ref.string_of cluster_host_ref); - )) hosts; + (* Cluster.create already created cluster_host on master, so we only need to iterate through slaves *) + Helpers.call_api_functions ~__context (fun rpc session_id -> + let pifref,_ = pif_of_host ~__context network host in + let cluster_host_ref = Client.Client.Cluster_host.create ~rpc ~session_id 
~cluster ~host ~pif:pifref in + D.debug "Created Cluster_host: %s" (Ref.string_of cluster_host_ref); + )) slave_hosts; cluster @@ -139,6 +134,7 @@ let pool_force_destroy ~__context ~self = let slave_cluster_hosts = Db.Cluster.get_cluster_hosts ~__context ~self |> filter_on_option master_cluster_host in + debug "Destroying cluster_hosts in pool"; (* First try to destroy each cluster_host - if we can do so safely then do *) List.iter (fun cluster_host -> @@ -172,14 +168,14 @@ let pool_force_destroy ~__context ~self = [] all_remaining_cluster_hosts in - begin - match exns with - | [] -> D.debug "Cluster.force_destroy was successful" - | e :: _ -> raise Api_errors.(Server_error (cluster_force_destroy_failed, [Ref.string_of self])) + begin match exns with + | [] -> D.debug "Successfully destroyed all cluster_hosts in pool, now destroying cluster %s" (Ref.string_of self) + | e :: _ -> raise Api_errors.(Server_error (cluster_force_destroy_failed, [Ref.string_of self])) end; Helpers.call_api_functions ~__context (fun rpc session_id -> - Client.Client.Cluster.destroy ~rpc ~session_id ~self) + Client.Client.Cluster.destroy ~rpc ~session_id ~self); + debug "Cluster.pool_force_destroy was successful" (* Helper function; concurrency checks are done in implementation of Cluster.destroy and Cluster_host.destroy *) let pool_destroy ~__context ~self = @@ -209,6 +205,7 @@ let pool_resync ~__context ~(self : API.ref_Cluster) = List.iter (fun host -> log_and_ignore_exn (fun () -> + Xapi_cluster_host.create_as_necessary ~__context ~host; Xapi_cluster_host.resync_host ~__context ~host; if is_clustering_disabled_on_host ~__context host then raise Api_errors.(Server_error (no_compatible_cluster_host, [Ref.string_of host])) diff --git a/ocaml/xapi/xapi_cluster.mli b/ocaml/xapi/xapi_cluster.mli index b8ef04b7e27..de9a2d5efec 100644 --- a/ocaml/xapi/xapi_cluster.mli +++ b/ocaml/xapi/xapi_cluster.mli @@ -17,30 +17,35 @@ (******************************************************************************) (** {2 External API calls} *) -val create : __context:Context.t -> network:API.ref_network -> - cluster_stack:string -> pool_auto_join:bool -> - token_timeout:float -> token_timeout_coefficient:float -> +val create : __context:Context.t -> + pIF:API.ref_PIF -> cluster_stack:string -> pool_auto_join:bool -> + token_timeout:float -> token_timeout_coefficient:float -> API.ref_Cluster -(** [create ~__context ~network ~cluster_stack ~pool_auto_join ~token_timeout - * ~token_timeout_coefficient] is the implementation of the XenAPI method +(** [create ~__context ~pIF ~cluster_stack ~pool_auto_join ~token_timeout + * ~token_timeout_coefficient] is the implementation of the XenAPI method * 'Cluster.create'. It is the constructor of the Cluster object. *) val destroy : __context:Context.t -> self:API.ref_Cluster -> unit (** [destroy ~__context ~self] is the implementation of the XenAPI method 'Cluster.destroy'. 
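
pool_force_destroy above follows an attempt-all-then-report shape: try to destroy every cluster_host, fold the failures into a list, and only raise CLUSTER_FORCE_DESTROY_FAILED (or go on to destroy the Cluster itself) once each member has been attempted. A self-contained sketch of that aggregation, with plain strings and failwith standing in for xapi refs and errors:

    (* Attempt an action on every member, collecting failures instead of
       stopping at the first one. *)
    let attempt_all (f : 'a -> unit) (xs : 'a list) : exn list =
      List.fold_left (fun exns x -> try f x; exns with e -> e :: exns) [] xs

    let () =
      let failures =
        attempt_all
          (fun h -> if h = "bad" then failwith ("cannot destroy " ^ h))
          [ "host0"; "bad"; "host2" ]
      in
      match failures with
      | [] -> print_endline "all cluster_hosts destroyed; destroying cluster"
      | _ :: _ -> prerr_endline "CLUSTER_FORCE_DESTROY_FAILED"
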
It is the destructor of the Cluster object *) -val pool_create : __context:Context.t -> network:API.ref_network -> - cluster_stack:string -> token_timeout:float -> +val get_network : __context:Context.t -> self:API.ref_Cluster -> API.ref_network +(** [get_network ~__context ~self] returns the network of the master cluster host's PIF, + as well as logging whether all the cluster hosts in the pool have + PIFs on the same network *) + +val pool_create : __context:Context.t -> + network:API.ref_network -> cluster_stack:string -> token_timeout:float -> token_timeout_coefficient:float -> API.ref_Cluster -(** [pool_create ~__context ~network ~cluster_stack ~token_timeout - ~token_timeout_coefficient] is the implementation of the XenAPI +(** [pool_create ~__context ~network ~cluster_stack ~token_timeout + ~token_timeout_coefficient] is the implementation of the XenAPI method 'Cluster.pool_create'. This is a convenience function that creates the Cluster object and then creates Cluster_host objects for all hosts in the pool. *) val pool_force_destroy : __context:Context.t -> self:API.ref_Cluster -> unit (** [pool_force_destroy ~__context ~self] is the implementation of the XenAPI - method 'Cluster.pool_force_destroy'. This is a convenience function that + method 'Cluster.pool_force_destroy'. This is a convenience function that attempts to force destroy the Cluster_host objects for all hosts in the pool and then destroys the Cluster object if it was successful. *) diff --git a/ocaml/xapi/xapi_cluster_helpers.ml b/ocaml/xapi/xapi_cluster_helpers.ml index 3d375ff6b83..dea945ba5bf 100644 --- a/ocaml/xapi/xapi_cluster_helpers.ml +++ b/ocaml/xapi/xapi_cluster_helpers.ml @@ -45,15 +45,26 @@ let get_operation_error ~__context ~self ~op = let ref_str = Ref.string_of self in let current_error = None in - let check c f = match c with | Some e -> Some e | None -> f () in + let assert_allowed_during_rpu __context = function + | `add | `remove | `destroy when Helpers.rolling_upgrade_in_progress ~__context -> + Some (Api_errors.not_supported_during_upgrade, []) + | _ -> None + in + (* if other operations are in progress, check that the new operation is allowed concurrently with them *) let current_error = check current_error (fun () -> let current_ops = cr.Db_actions.cluster_current_operations in - if (current_ops <> []) && not (is_allowed_concurrently ~op ~current_ops) - then report_concurrent_operations_error ~current_ops ~ref_str - else None) in + match current_ops with + | _::_ when not (is_allowed_concurrently ~op ~current_ops) -> + report_concurrent_operations_error ~current_ops ~ref_str + | _ -> + check + (assert_allowed_during_rpu __context op) + (fun () -> None) + ) + in current_error let assert_operation_valid ~__context ~self ~op = @@ -68,16 +79,9 @@ let update_allowed_operations ~__context ~self = | _ -> accu in let allowed = List.fold_left check [] all_cluster_operations in - (* TODO: check if we need RPU-related checks here for restricting allowed_operations - based on if an RPU is in progress... - let allowed = - if Helpers.rolling_upgrade_in_progress ~__context - then Listext.List.intersect allowed Xapi_globs.rpu_allowed_cluster_host_operations - else allowed - in *) Db.Cluster.set_allowed_operations ~__context ~self ~value:allowed -(** Add to the cluster host's current_operations, call a function and then remove from the +(** Add to the cluster's current_operations, call a function and then remove from the current operations. Ensure allowed_operations is kept up to date throughout. 
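
A self-contained model of the bracketing this doc comment describes — record the operation in current_operations for the duration of the callback and recompute allowed_operations on both edges. State is a plain ref and operations are keyed by value for brevity; xapi keys current_operations by task and stores them in the database:

    type op = Add | Remove | Enable | Disable | Destroy

    let current_operations : op list ref = ref []

    (* Placeholder for the real recomputation against current_operations *)
    let update_allowed_operations () = ()

    let with_cluster_operation (op : op) (f : unit -> 'a) : 'a =
      current_operations := op :: !current_operations;
      update_allowed_operations ();
      Fun.protect
        ~finally:(fun () ->
          current_operations := List.filter (( <> ) op) !current_operations;
          update_allowed_operations ())
        f

    let () = with_cluster_operation Enable (fun () -> print_endline "enabling")
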
*) let with_cluster_operation ~__context ~(self : [`Cluster] API.Ref.t) ~doc ~op ?policy f = let task_id = Ref.string_of (Context.get_task_id __context) in diff --git a/ocaml/xapi/xapi_cluster_host.ml b/ocaml/xapi/xapi_cluster_host.ml index 0afa0de7ec6..e136ba7ff91 100644 --- a/ocaml/xapi/xapi_cluster_host.ml +++ b/ocaml/xapi/xapi_cluster_host.ml @@ -22,131 +22,176 @@ open D (* We can't fix _all_ of the prerequisites, as we can't automatically create an IP address. So what we do here is to at least plug the thing in and ensure it has disallow unplug set. *) -let fix_pif_prerequisites ~__context (pif_ref,pif_rec) = +let fix_pif_prerequisites ~__context (self : API.ref_PIF) = (* The following is to raise an exception if there's no IP. This avoids making any changes to the PIF if there's something we simply can't fix. *) - ignore(ip_of_pif (pif_ref,pif_rec)); - if not pif_rec.API.pIF_currently_attached then + let pif_rec self = Db.PIF.get_record ~__context ~self in + ip_of_pif (self,pif_rec self) |> ignore; + if not (pif_rec self).API.pIF_currently_attached then Helpers.call_api_functions ~__context (fun rpc session_id -> - Client.Client.PIF.plug ~rpc ~session_id ~self:pif_ref); - if not pif_rec.API.pIF_disallow_unplug then begin - debug "Setting disallow_unplug on cluster PIF"; - Db.PIF.set_disallow_unplug ~__context ~self:pif_ref ~value:true + Client.Client.PIF.plug ~rpc ~session_id ~self); + if not (pif_rec self).API.pIF_disallow_unplug then begin + debug "Setting disallow_unplug on cluster PIF %s" (Ref.string_of self); + Db.PIF.set_disallow_unplug ~__context ~self ~value:true end -let sync_required ~__context ~host = - let clusters = Db.Cluster.get_all_records ~__context in - match clusters with - | [] -> None - | [cluster_ref, cluster_rec] -> begin - let expr = Db_filter_types.(And (Eq (Field "host", Literal (Ref.string_of host)), - Eq (Field "cluster", Literal (Ref.string_of cluster_ref)))) in - let my_cluster_hosts = Db.Cluster_host.get_internal_records_where ~__context ~expr in - match my_cluster_hosts with - | [(_ref,_rec)] -> None - | [] -> - if cluster_rec.API.cluster_pool_auto_join - then Some cluster_ref - else None - | _ -> raise Api_errors.(Server_error (internal_error, [ "Host cannot be associated with more than one cluster_host"; Ref.string_of host ])) - end - | _ -> raise Api_errors.(Server_error (internal_error, ["Cannot have more than one Cluster object per pool currently"])) - -let create_as_necessary ~__context ~host = - match sync_required ~__context ~host with - | Some cluster_ref -> - let network = Db.Cluster.get_network ~__context ~self:cluster_ref in - let pif = Xapi_clustering.pif_of_host ~__context network host in - fix_pif_prerequisites ~__context pif; +let call_api_function_with_alert ~__context ~msg ~cls ~obj_uuid ~body + ~(api_func : (Rpc.call -> Rpc.response) -> API.ref_session -> unit) = Helpers.call_api_functions ~__context (fun rpc session_id -> - Client.Client.Cluster_host.create rpc session_id cluster_ref host) |> ignore - | None -> () + try + api_func rpc session_id + with err -> + Backtrace.is_important err; + let body = Printf.sprintf "Error: %s\nMessage: %s" ExnHelper.(string_of_exn err) body in + Xapi_alert.add ~msg ~cls ~obj_uuid ~body; + raise err + ) -let resync_host ~__context ~host = - create_as_necessary ~__context ~host; - match (find_cluster_host ~__context ~host) with - | None -> () (* no clusters exist *) - | Some cluster_host -> (* cluster_host and cluster exist *) - (* Cluster_host.enable unconditionally invokes the low-level enable 
operations and is idempotent. *) - if Db.Cluster_host.get_enabled ~__context ~self:cluster_host - then Helpers.call_api_functions ~__context - (fun rpc session_id -> Client.Client.Cluster_host.enable ~rpc ~session_id ~self:cluster_host) - -let create ~__context ~cluster ~host = - (* TODO: take network lock *) - with_clustering_lock (fun () -> +(* Create xapi db object for cluster_host, resync_host calls clusterd *) +let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host = + with_clustering_lock __LOC__ (fun () -> assert_operation_host_target_is_localhost ~__context ~host; + assert_pif_attached_to ~host ~pIF ~__context; assert_cluster_host_can_be_created ~__context ~host; let ref = Ref.make () in - let dbg = Context.string_of_task __context in let uuid = Uuidm.to_string (Uuidm.create `V4) in - let network = Db.Cluster.get_network ~__context ~self:cluster in + Db.Cluster_host.create ~__context ~ref ~uuid ~cluster ~host ~pIF ~enabled:false + ~current_operations:[] ~allowed_operations:[] ~other_config:[] ~joined:false; + ref + ) + +(* Helper function atomically enables clusterd and joins the cluster_host *) +let join_internal ~__context ~self = + with_clustering_lock __LOC__ (fun () -> + + let pIF = Db.Cluster_host.get_PIF ~__context ~self in + fix_pif_prerequisites ~__context pIF; + + let dbg = Context.string_of_task __context in + let cluster = Db.Cluster_host.get_cluster ~__context ~self in let cluster_token = Db.Cluster.get_cluster_token ~__context ~self:cluster in - let pif = pif_of_host ~__context network host in - assert_pif_prerequisites pif; - let ip = ip_of_pif pif in - let ip_list = List.map (fun cluster_host -> - Db.Cluster_host.get_host ~__context ~self:cluster_host |> - pif_of_host ~__context network |> - ip_of_pif + let ip = ip_of_pif (pIF, Db.PIF.get_record ~__context ~self:pIF) in + let ip_list = List.map (fun self -> + let p_ref = Db.Cluster_host.get_PIF ~__context ~self in + let p_rec = Db.PIF.get_record ~__context ~self:p_ref in + ip_of_pif (p_ref,p_rec) ) (Db.Cluster.get_cluster_hosts ~__context ~self:cluster) in + + debug "Enabling clusterd and joining cluster_host %s" (Ref.string_of self); Xapi_clustering.Daemon.enable ~__context; - let result = Cluster_client.LocalClient.join (rpc ~__context) dbg cluster_token ip ip_list in + let result = + Cluster_client.LocalClient.join (rpc ~__context) dbg cluster_token ip ip_list + in match result with | Result.Ok () -> - Db.Cluster_host.create ~__context ~ref ~uuid ~cluster ~host ~enabled:true - ~current_operations:[] ~allowed_operations:[] ~other_config:[]; - debug "Cluster_host.create was successful; cluster_host: %s" (Ref.string_of ref); - ref + debug "Cluster join create was successful for cluster_host %s" (Ref.string_of self); + Db.Cluster_host.set_joined ~__context ~self ~value:true; + Db.Cluster_host.set_enabled ~__context ~self ~value:true; + debug "Cluster_host %s joined and enabled" (Ref.string_of self) | Result.Error error -> - warn "Error occurred during Cluster_host.create"; + warn "Error occurred when joining cluster_host %s" (Ref.string_of self); handle_error error - ) + ) + +(* Enable cluster_host in client layer via clusterd *) +let resync_host ~__context ~host = + match find_cluster_host ~__context ~host with + | None -> () (* no clusters exist *) + | Some self -> (* cluster_host and cluster exist *) + let body = Printf.sprintf "Unable to create cluster host on %s." 
+ (Db.Host.get_name_label ~__context ~self:host) in + let obj_uuid = Db.Host.get_uuid ~__context ~self:host in + + call_api_function_with_alert ~__context + ~msg:Api_messages.cluster_host_enable_failed + ~cls:`Host ~obj_uuid ~body + ~api_func:(fun rpc session_id -> + (* If we have just joined, enable will prevent concurrent clustering ops *) + if not (Db.Cluster_host.get_joined ~__context ~self) + then join_internal ~__context ~self + else + if Db.Cluster_host.get_enabled ~__context ~self then begin + (* [enable] unconditionally invokes low-level enable operations and is idempotent. + RPU reformats partition, losing service status, never re-enables clusterd *) + debug "Cluster_host %s is enabled, starting up xapi-clusterd" (Ref.string_of self); + Xapi_clustering.Daemon.enable ~__context; + + (* Note that join_internal and enable both use the clustering lock *) + Client.Client.Cluster_host.enable rpc session_id self end + ) + +(* API call split into separate functions to create in db and enable in client layer *) +let create ~__context ~cluster ~host ~pif = + let cluster_host : API.ref_Cluster_host = create_internal ~__context ~cluster ~host ~pIF:pif in + resync_host ~__context ~host; + cluster_host + +let destroy_op ~__context ~self meth = + with_clustering_lock __LOC__ (fun () -> + let dbg = Context.string_of_task __context in + let host = Db.Cluster_host.get_host ~__context ~self in + assert_operation_host_target_is_localhost ~__context ~host; + assert_cluster_host_has_no_attached_sr_which_requires_cluster_stack ~__context ~self; + let result = Cluster_client.LocalClient.destroy (rpc ~__context) dbg in + match result with + | Result.Ok () -> + Db.Cluster_host.destroy ~__context ~self; + debug "Cluster_host.%s was successful" meth; + Xapi_clustering.Daemon.disable ~__context + | Result.Error error -> + warn "Error occurred during Cluster_host.%s" meth; + handle_error error) let force_destroy ~__context ~self = - let dbg = Context.string_of_task __context in - let host = Db.Cluster_host.get_host ~__context ~self in - assert_operation_host_target_is_localhost ~__context ~host; - assert_cluster_host_has_no_attached_sr_which_requires_cluster_stack ~__context ~self; - let result = Cluster_client.LocalClient.destroy (rpc ~__context) dbg in - match result with - | Result.Ok () -> - Db.Cluster_host.destroy ~__context ~self; - debug "Cluster_host.force_destroy was successful"; - Xapi_clustering.Daemon.disable ~__context - | Result.Error error -> - warn "Error occurred during Cluster_host.force_destroy"; - handle_error error + destroy_op ~__context ~self "force_destroy" let destroy ~__context ~self = - let dbg = Context.string_of_task __context in - let host = Db.Cluster_host.get_host ~__context ~self in - assert_operation_host_target_is_localhost ~__context ~host; - assert_cluster_host_has_no_attached_sr_which_requires_cluster_stack ~__context ~self; assert_cluster_host_enabled ~__context ~self ~expected:true; - let result = Cluster_client.LocalClient.leave (rpc ~__context) dbg in - match result with - (* can't include refs in case those were successfully destroyed *) - | Result.Ok () -> - Db.Cluster_host.destroy ~__context ~self; - debug "Cluster_host.destroy was successful"; - Xapi_clustering.Daemon.disable ~__context - | Result.Error error -> - warn "Error occurred during Cluster_host.destroy"; - handle_error error + let cluster = Db.Cluster_host.get_cluster ~__context ~self in + let () = match Db.Cluster.get_cluster_hosts ~__context ~self:cluster with + | [ _ ] -> + raise 
Api_errors.(Server_error (cluster_host_is_last, [Ref.string_of self])) + | _ -> () + in + destroy_op ~__context ~self "destroy" + +let ip_of_str str = Cluster_interface.IPv4 str + +let forget ~__context ~self = + with_clustering_lock __LOC__ (fun () -> + let dbg = Context.string_of_task __context in + let cluster = Db.Cluster_host.get_cluster ~__context ~self in + let pif = Db.Cluster_host.get_PIF ~__context ~self in + let ip = Db.PIF.get_IP ~__context ~self:pif in + let pending = ip :: Db.Cluster.get_pending_forget ~__context ~self:cluster in + debug "Setting pending forget to %s" (String.concat "," pending); + Db.Cluster.set_pending_forget ~__context ~self:cluster ~value:pending; + + let pending = List.map ip_of_str pending in + let result = Cluster_client.LocalClient.declare_dead (rpc ~__context) dbg pending in + match result with + | Result.Ok () -> + debug "Successfully forgot permanently dead hosts, setting pending forget to empty"; + Db.Cluster.set_pending_forget ~__context ~self:cluster ~value:[]; + (* must not disable the daemon here, because we declared another unreachable node dead, + * not the current one *) + debug "Cluster_host.forget was successful" + | Result.Error error -> + warn "Error encountered when declaring dead cluster_host %s (did you declare all dead hosts yet?)" (Ref.string_of self); + handle_error error + ) let enable ~__context ~self = - with_clustering_lock (fun () -> + with_clustering_lock __LOC__ (fun () -> let dbg = Context.string_of_task __context in let host = Db.Cluster_host.get_host ~__context ~self in assert_operation_host_target_is_localhost ~__context ~host; - let cluster = Db.Cluster_host.get_cluster ~__context ~self in - let network = Db.Cluster.get_network ~__context ~self:cluster in - let pif = pif_of_host ~__context network host in - assert_pif_prerequisites pif; + let pifref = Db.Cluster_host.get_PIF ~__context ~self in + let pifrec = Db.PIF.get_record ~__context ~self:pifref in + assert_pif_prerequisites (pifref,pifrec); - let ip = ip_of_pif pif in + let ip = ip_of_pif (pifref,pifrec) in let init_config = { Cluster_interface.local_ip = ip; token_timeout_ms = None; @@ -164,7 +209,7 @@ let enable ~__context ~self = ) let disable ~__context ~self = - with_clustering_lock (fun () -> + with_clustering_lock __LOC__ (fun () -> let dbg = Context.string_of_task __context in let host = Db.Cluster_host.get_host ~__context ~self in assert_operation_host_target_is_localhost ~__context ~host; @@ -188,3 +233,30 @@ let disable_clustering ~__context = info "Disabling cluster_host %s" (Ref.string_of self); disable ~__context ~self +let sync_required ~__context ~host = + let clusters = Db.Cluster.get_all_records ~__context in + match clusters with + | [] -> None + | [cluster_ref, cluster_rec] -> begin + let expr = Db_filter_types.(And (Eq (Field "host", Literal (Ref.string_of host)), + Eq (Field "cluster", Literal (Ref.string_of cluster_ref)))) in + let my_cluster_hosts = Db.Cluster_host.get_internal_records_where ~__context ~expr in + match my_cluster_hosts with + | [(_ref,_rec)] -> None + | [] -> + if cluster_rec.API.cluster_pool_auto_join + then Some cluster_ref + else None + | _ -> raise Api_errors.(Server_error (internal_error, [ "Host cannot be associated with more than one cluster_host"; Ref.string_of host ])) + end + | _ -> raise Api_errors.(Server_error (internal_error, ["Cannot have more than one Cluster object per pool currently"])) + +(* If cluster found without local cluster_host, create one in db *) +let create_as_necessary ~__context ~host = + 
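+  (* Illustrative note: [sync_required] returns the Cluster ref only when the
+     pool's single Cluster has [pool_auto_join] set and this host has no
+     Cluster_host yet; in that case only the DB record is created here, and a
+     later [resync_host] performs the actual join via xapi-clusterd. *)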
match sync_required ~__context ~host with
+  | Some cluster -> (* assume pool_auto_join is set *)
+    let network = get_network_internal ~__context ~self:cluster in
+    let (pIF, _) = Xapi_clustering.pif_of_host ~__context network host in
+    create_internal ~__context ~cluster ~host ~pIF |> ignore
+  | None -> ()
+
diff --git a/ocaml/xapi/xapi_cluster_host.mli b/ocaml/xapi/xapi_cluster_host.mli
index 994c890050f..ff0b2e2b2e7 100644
--- a/ocaml/xapi/xapi_cluster_host.mli
+++ b/ocaml/xapi/xapi_cluster_host.mli
@@ -18,15 +18,15 @@
 (******************************************************************************)
 (** {2 Internal helper functions} *)

-val fix_pif_prerequisites : __context:Context.t -> (API.ref_PIF * API.pIF_t) ->
-  unit
-(* [fix_pif_prerequisites ~__context (pif_ref,pif_rec)] will fix those
-   prerequisites that are fixable automatically. It won't be able to fix a
-   missing IP address, but it will plug the PIF if it's not attached and it
-   will set disallow_unplug once the PIF is plugged *)
+val fix_pif_prerequisites : __context:Context.t -> API.ref_PIF ->
+  unit
+(* [fix_pif_prerequisites ~__context pif] will fix those prerequisites that
+   are fixable automatically. It won't be able to fix a missing IP address,
+   but it will plug the PIF if it's not attached and it will set
+   disallow_unplug once the PIF is plugged *)

 val sync_required : __context:Context.t -> host:API.ref_host ->
-  API.ref_Cluster option
+  API.ref_Cluster option
 (** [sync_required ~__context ~host] returns an option type indicating whether
     any action is required to sync the cluster. This will only be the case if
     the cluster object has [pool_auto_join] set and no corresponding
@@ -35,15 +35,17 @@ val sync_required : __context:Context.t -> host:API.ref_host ->

 val create_as_necessary : __context:Context.t -> host:API.ref_host -> unit
 (** [create_as_necessary ~__context ~host] calls [sync_required], and if any
-    Cluster_host objects are required it will create them *)
+    Cluster_host objects are required it will create them in the database *)

 (******************************************************************************)
 (** {2 External API calls} *)

-val create : __context:Context.t -> cluster:API.ref_Cluster -> host:API.ref_host
+val create : __context:Context.t -> cluster:API.ref_Cluster -> host:API.ref_host -> pif:API.ref_PIF
   -> API.ref_Cluster_host
-(** [create ~__context ~cluster ~host] is implementation of the XenAPI call
-    'Cluster_host.create'. It is the Cluster_host object constructor *)
+(** [create ~__context ~cluster ~host ~pif] is the implementation of the XenAPI
+    call 'Cluster_host.create'. It is the Cluster_host object constructor: it
+    creates a cluster_host in the DB before calling [resync_host ~__context ~host],
+    which either joins the host to the cluster or enables the cluster host *)
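+
+(* Illustrative only (not generated documentation): a call through the client
+   bindings mirrors the old call sites but now also passes the PIF to cluster
+   on, e.g. inside Helpers.call_api_functions:
+     Client.Client.Cluster_host.create rpc session_id cluster host pif *)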
 
 val force_destroy : __context:Context.t -> self:API.ref_Cluster_host -> unit
 (** [force_destroy ~__context ~self] is the implementation of the XenAPI call
@@ -52,7 +54,9 @@ val force_destroy : __context:Context.t -> self:API.ref_Cluster_host -> unit
 
 val destroy : __context:Context.t -> self:API.ref_Cluster_host -> unit
 (** [destroy ~__context ~self] is the implementation of the XenAPI call
-    'Cluster_host.destroy'. It is the Cluster_host destructor *)
+    'Cluster_host.destroy'. It is the Cluster_host destructor.
+    Note that this is the only Cluster_host call that is still valid when the
+    clustering daemon is disabled; all others require it to be enabled *)
 
 val enable : __context:Context.t -> self:API.ref_Cluster_host -> unit
 (** [enable ~__context ~self] is the implementation of the XenAPI call
@@ -71,7 +75,14 @@ val disable_clustering : __context:Context.t -> unit
    and logs its actions. *)
 
 val resync_host : __context:Context.t -> host:API.ref_host -> unit
-(** [resync_host ~__context ~host] checks for any clusters on the host. If one
-    exists but is not associated with a cluster_host, it creates one. If the
-    database indicates the cluster_host is enabled, host_resync enables it
-    in the Client layer too. Otherwise, nothing happens. *)
+(** [resync_host ~__context ~host] checks for the existence of a cluster_host.
+    If one exists but hasn't joined the cluster yet, xapi asks xapi-clusterd
+    to add the host to the cluster; otherwise it enables the cluster host.
+    If no cluster_host is found, nothing happens.
+    If a failure occurs, Xapi sends an alert to XenCenter *)
+
+val forget : __context:Context.t -> self:API.ref_Cluster_host -> unit
+(** [forget ~__context ~self] marks the cluster host as permanently removed
+    from the cluster. This will only succeed if the rest of the hosts are
+    online; on failure the dead host's IP is instead added to the cluster's
+    pending_forget list. If you declare all your dead hosts as dead one by
+    one, the last such call should succeed *)
diff --git a/ocaml/xapi/xapi_cluster_host_helpers.ml b/ocaml/xapi/xapi_cluster_host_helpers.ml
index 38a6a8290dd..adb5c000994 100644
--- a/ocaml/xapi/xapi_cluster_host_helpers.ml
+++ b/ocaml/xapi/xapi_cluster_host_helpers.ml
@@ -40,18 +40,29 @@ let report_concurrent_operations_error ~current_ops ~ref_str =
 let get_operation_error ~__context ~self ~op =
   let chr = Db.Cluster_host.get_record_internal ~__context ~self in
   let ref_str = Ref.string_of self in
-  (* let cluster = Db.Cluster_host.get_cluster ~__context ~self in *)
   let current_error = None in
   let check c f = match c with | Some e -> Some e | None -> f () in
 
+  let assert_joined_cluster ~__context ~self = function
+    | (`enable | `disable) when not (Db.Cluster_host.get_joined ~__context ~self) ->
+      (* Cannot enable or disable a cluster host that has not joined the cluster *)
+      Some (Api_errors.cluster_host_not_joined, [ ref_str ])
+    | _ -> None
+  in
+
+  (* if other operations are in progress, check that the new operation is allowed concurrently with them *)
   let current_error = check current_error (fun () ->
     let current_ops = chr.Db_actions.cluster_host_current_operations in
-    if current_ops <> [] && not (is_allowed_concurrently ~op ~current_ops)
-    then report_concurrent_operations_error ~current_ops ~ref_str
-    else None) in
+    match current_ops with
+    | _::_ when not (is_allowed_concurrently ~op ~current_ops) ->
+      report_concurrent_operations_error ~current_ops ~ref_str
+    | _ ->
+      check
+        (assert_joined_cluster ~__context ~self op)
+        (fun () -> None) (* replace this function if adding new checks *)
+    ) in
 
   current_error
 
@@ -67,13 +78,6 @@ let update_allowed_operations ~__context ~self =
    | _ -> accu
  in
  let allowed = List.fold_left check [] all_cluster_host_operations in
-  (* TODO: check if we need RPU-related checks here for restricting allowed_operations
-     based on if an RPU is in progress...
- let allowed = - if Helpers.rolling_upgrade_in_progress ~__context - then Listext.List.intersect allowed Xapi_globs.rpu_allowed_cluster_operations - else allowed - in *) Db.Cluster_host.set_allowed_operations ~__context ~self ~value:allowed (** Add to the cluster host's current_operations, call a function and then remove from the diff --git a/ocaml/xapi/xapi_clustering.ml b/ocaml/xapi/xapi_clustering.ml index 579ce133478..a6149092c06 100644 --- a/ocaml/xapi/xapi_clustering.ml +++ b/ocaml/xapi/xapi_clustering.ml @@ -24,17 +24,17 @@ let set_ha_cluster_stack ~__context = Db.Pool.set_ha_cluster_stack ~__context ~self ~value (* host-local clustering lock *) -let clustering_lock_m = Mutex.create () - -let with_clustering_lock f = - debug "Trying to grab host-local clustering lock..."; - Stdext.Threadext.Mutex.execute clustering_lock_m - (fun () -> - Stdext.Pervasiveext.finally - (fun () -> - debug "Grabbed host-local clustering lock; executing function..."; - f ()) - (fun () -> debug "Function execution finished; returned host-local clustering lock.")) +let clustering_lock_m = Locking_helpers.Named_mutex.create "clustering" + +let with_clustering_lock where f = + debug "Trying to grab host-local clustering lock... (%s)" where; + Locking_helpers.Named_mutex.execute clustering_lock_m + (fun () -> Stdext.Pervasiveext.finally + (fun () -> + debug "Grabbed host-local clustering lock; executing function... (%s)" where; + f ()) + (fun () -> + debug "Function execution finished; returned host-local clustering lock. (%s)" where)) (* Note we have to add type annotations to network/host here because they're only used in the context of Db.PIF.get_records_where, and they're just strings there *) @@ -76,6 +76,10 @@ let assert_pif_prerequisites pif = ignore (ip_of_pif pif); debug "Got IP %s for PIF %s" record.API.pIF_IP (Ref.string_of pif_ref) +let assert_pif_attached_to ~__context ~host ~pIF = + if not (List.mem pIF (Db.Host.get_PIFs ~__context ~self:host)) then + raise Api_errors.(Server_error (pif_not_attached_to_host, [Ref.string_of pIF; Ref.string_of host])) + let handle_error = function | InternalError message -> raise Api_errors.(Server_error (internal_error, [ message ])) | Unix_error message -> failwith ("Unix Error: " ^ message) @@ -103,15 +107,15 @@ let assert_cluster_stack_valid ~cluster_stack = if not (List.mem cluster_stack Constants.supported_smapiv3_cluster_stacks) then raise Api_errors.(Server_error (invalid_cluster_stack, [ cluster_stack ])) -let with_clustering_lock_if_needed ~__context ~sr_sm_type f = +let with_clustering_lock_if_needed ~__context ~sr_sm_type where f = match get_required_cluster_stacks ~__context ~sr_sm_type with | [] -> f () - | _required_cluster_stacks -> with_clustering_lock f + | _required_cluster_stacks -> with_clustering_lock where f -let with_clustering_lock_if_cluster_exists ~__context f = +let with_clustering_lock_if_cluster_exists ~__context where f = match Db.Cluster.get_all ~__context with | [] -> f () - | _ -> with_clustering_lock f + | _ -> with_clustering_lock where f let find_cluster_host ~__context ~host = match Db.Cluster_host.get_refs_where ~__context @@ -123,6 +127,25 @@ let find_cluster_host ~__context ~host = raise Api_errors.(Server_error(internal_error, [msg; (Ref.string_of host)])) | _ -> None +let get_master_pif ~__context = + match find_cluster_host ~__context ~host:Helpers.(get_master ~__context) with + | Some self -> Db.Cluster_host.get_PIF ~__context ~self + | None -> raise Api_errors.(Server_error (internal_error, [ "No cluster_host exists on 
master" ])) + +let get_network_internal ~__context ~self = + let cluster_network = Db.PIF.get_network ~__context ~self:(get_master_pif ~__context) in + if List.exists + (fun cluster_host -> + let pif = Db.Cluster_host.get_PIF ~__context ~self:cluster_host in + let cluster_host_network = Db.PIF.get_network ~__context ~self:pif in + cluster_host_network <> cluster_network + ) (Db.Cluster_host.get_all ~__context) + then + debug "Not all cluster hosts of cluster %s on same network %s" + (Ref.string_of self) (Ref.string_of cluster_network); + + cluster_network + let assert_cluster_host_enabled ~__context ~self ~expected = let actual = Db.Cluster_host.get_enabled ~__context ~self in if actual <> expected then @@ -176,6 +199,8 @@ let assert_cluster_host_has_no_attached_sr_which_requires_cluster_stack ~__conte then raise Api_errors.(Server_error (cluster_stack_in_use, [ cluster_stack ])) module Daemon = struct + let enabled = ref false + let maybe_call_script ~__context script params = match Context.get_test_clusterd_rpc __context with | Some _ -> debug "in unit test, not calling %s %s" script (String.concat " " params) @@ -188,11 +213,13 @@ module Daemon = struct maybe_call_script ~__context !Xapi_globs.firewall_port_config_script ["open"; port]; maybe_call_script ~__context "/usr/bin/systemctl" [ "enable"; service ]; maybe_call_script ~__context "/usr/bin/systemctl" [ "start"; service ]; + enabled := true; debug "Cluster daemon: enabled & started" let disable ~__context = let port = (string_of_int !Xapi_globs.xapi_clusterd_port) in debug "Disabling and stopping the clustering daemon"; + enabled := false; maybe_call_script ~__context "/usr/bin/systemctl" [ "disable"; service ]; maybe_call_script ~__context "/usr/bin/systemctl" [ "stop"; service ]; maybe_call_script ~__context !Xapi_globs.firewall_port_config_script ["close"; port]; @@ -205,10 +232,13 @@ end * Instead of returning an empty URL which wouldn't work just raise an * exception. 
*) let rpc ~__context = + if not !Daemon.enabled then + raise Api_errors.(Server_error(Api_errors.operation_not_allowed, + ["clustering daemon has not been started yet"])); match Context.get_test_clusterd_rpc __context with | Some rpc -> rpc | None -> - Cluster_client.rpc (fun () -> failwith "Can only communicate with xapi-clusterd through message-switch") + Cluster_client.rpc (fun () -> failwith "Can only communicate with xapi-clusterd through message-switch") let is_clustering_disabled_on_host ~__context host = match find_cluster_host ~__context ~host with @@ -218,6 +248,6 @@ let is_clustering_disabled_on_host ~__context host = let compute_corosync_max_host_failures ~__context = let all_hosts = Db.Host.get_all ~__context in let nhosts = List.length (all_hosts) in - let disabled_hosts = List.length (List.filter (fun host -> is_clustering_disabled_on_host ~__context host = true ) all_hosts) in + let disabled_hosts = List.length (List.filter (fun host -> is_clustering_disabled_on_host ~__context host) all_hosts) in let corosync_ha_max_hosts = ((nhosts - disabled_hosts - 1) / 2) + disabled_hosts in corosync_ha_max_hosts diff --git a/ocaml/xapi/xapi_db_upgrade.ml b/ocaml/xapi/xapi_db_upgrade.ml index 6f09ad2a3f2..8027d2f3ed2 100644 --- a/ocaml/xapi/xapi_db_upgrade.ml +++ b/ocaml/xapi/xapi_db_upgrade.ml @@ -564,6 +564,21 @@ let upgrade_domain_type = { (Db.VM.get_all_records ~__context) } +let upgrade_cluster_timeouts = { + description = "Upgrade cluster timeout units from milliseconds to seconds"; + version = (fun x -> x < (5, 202)); (* the version where we switched to seconds *) + fn = fun ~__context -> + Db.Cluster.get_all ~__context + |> List.iter (fun self -> + let update_milliseconds getter setter = + let value = getter ~__context ~self /. 1000. in + setter ~__context ~self ~value + in + update_milliseconds Db.Cluster.get_token_timeout Db.Cluster.set_token_timeout; + update_milliseconds Db.Cluster.get_token_timeout_coefficient Db.Cluster.set_token_timeout_coefficient; + ) +} + let rules = [ upgrade_domain_type; upgrade_alert_priority; @@ -589,6 +604,7 @@ let rules = [ upgrade_recommendations_for_gpu_passthru; upgrade_vswitch_controller; default_vm_platform_device_model; + upgrade_cluster_timeouts; ] (* Maybe upgrade most recent db *) diff --git a/ocaml/xapi/xapi_ha_vm_failover.ml b/ocaml/xapi/xapi_ha_vm_failover.ml index e6721b54fd5..03940bad6d7 100644 --- a/ocaml/xapi/xapi_ha_vm_failover.ml +++ b/ocaml/xapi/xapi_ha_vm_failover.ml @@ -332,7 +332,7 @@ let compute_max_host_failures_to_tolerate ~__context ?live_set ?protected_vms () (* Make sure the pool is marked as overcommitted and the appropriate alert is generated. 
Return true if something changed, false otherwise *) let mark_pool_as_overcommitted ~__context ~live_set = - Xapi_clustering.with_clustering_lock_if_cluster_exists ~__context (fun () -> + Xapi_clustering.with_clustering_lock_if_cluster_exists ~__context __LOC__ (fun () -> let pool = Helpers.get_pool ~__context in let overcommitted = Db.Pool.get_ha_overcommitted ~__context ~self:pool in diff --git a/ocaml/xapi/xapi_hooks.ml b/ocaml/xapi/xapi_hooks.ml index 22f1d3fd776..db34d149dbc 100644 --- a/ocaml/xapi/xapi_hooks.ml +++ b/ocaml/xapi/xapi_hooks.ml @@ -78,7 +78,19 @@ let execute_pool_hook ~__context ~reason = execute_hook ~__context ~args:[] ~reason let host_pre_declare_dead ~__context ~host ~reason = - execute_host_hook ~__context ~script_name:scriptname__host_pre_declare_dead ~reason ~host + info "Running host pre declare dead hook for %s" (Ref.string_of host); + (* this could use power fencing *) + execute_host_hook ~__context ~script_name:scriptname__host_pre_declare_dead ~reason ~host; + + if String.equal reason reason__dbdestroy then log_and_ignore_exn (fun () -> + (* declare it as dead to the clustering daemon if any *) + match Xapi_clustering.find_cluster_host ~__context ~host with + | Some self -> + info "Declaring cluster host %s as permanently dead" (Ref.string_of self); + Helpers.call_api_functions ~__context + (fun rpc session_id -> Client.Client.Cluster_host.forget ~rpc ~session_id ~self) + | None -> ()) + (* Called when host died -- !! hook code in here to abort outstanding forwarded ops *) let internal_host_dead_hook __context host = diff --git a/ocaml/xapi/xapi_host.ml b/ocaml/xapi/xapi_host.ml index ac4b57243c4..6cc9be5a6fd 100644 --- a/ocaml/xapi/xapi_host.ml +++ b/ocaml/xapi/xapi_host.ml @@ -701,7 +701,6 @@ let precheck_destroy_declare_dead ~__context ~self call = let me = Helpers.get_localhost ~__context in if self=me then raise (Api_errors.Server_error(Api_errors.host_is_live, [ Ref.string_of self ])) - (* Returns a tuple of lists: The first containing the control domains, and the second containing the regular VMs *) let get_resident_vms ~__context ~self = let my_resident_vms = Db.Host.get_resident_VMs ~__context ~self in @@ -720,6 +719,10 @@ let destroy ~__context ~self = if List.length my_regular_vms > 0 then raise (Api_errors.Server_error(Api_errors.host_has_resident_vms, [ Ref.string_of self ])); + (* Call external host failed hook (allows a third-party to use power-fencing if desired). + * This will declare the host as dead to the clustering daemon *) + Xapi_hooks.host_pre_declare_dead ~__context ~host:self ~reason:Xapi_hooks.reason__dbdestroy; + (* Call the hook before we destroy the stuff as it will likely need the database records *) Xapi_hooks.host_post_declare_dead ~__context ~host:self ~reason:Xapi_hooks.reason__dbdestroy; @@ -732,6 +735,10 @@ let destroy ~__context ~self = let declare_dead ~__context ~host = precheck_destroy_declare_dead ~__context ~self:host "declare_dead"; + (* Call external host failed hook (allows a third-party to use power-fencing if desired). 
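+   * With [reason__user], as here, the hook only runs the external script; the
+   * clustering daemon is told to forget the cluster host only on Host.destroy,
+   * which passes [reason__dbdestroy].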
+ * This needs to happen before we reset the power state of the VMs *) + Xapi_hooks.host_pre_declare_dead ~__context ~host ~reason:Xapi_hooks.reason__user; + let my_control_domains, my_regular_vms = get_resident_vms ~__context ~self:host in Helpers.call_api_functions ~__context (fun rpc session_id -> diff --git a/ocaml/xapi/xapi_mgmt_iface.ml b/ocaml/xapi/xapi_mgmt_iface.ml index 57fc6f38b0c..7f12cdfd7cd 100644 --- a/ocaml/xapi/xapi_mgmt_iface.ml +++ b/ocaml/xapi/xapi_mgmt_iface.ml @@ -157,20 +157,35 @@ let enable_himn ~__context ~addr = let rebind ~__context = run ~__context ~mgmt_enabled:!listening_all -let management_ip_mutex = Mutex.create () -let management_ip_cond = Condition.create () +let ip_mutex = Mutex.create () +let ip_cond = Condition.create () let wait_for_management_ip ~__context = let ip = ref (match Helpers.get_management_ip_addr ~__context with Some x -> x | None -> "") in let is_connected = ref (Helpers.get_management_iface_is_connected ~__context) in - Mutex.execute management_ip_mutex - (fun () -> begin while !ip = "" && !is_connected = false do - Condition.wait management_ip_cond management_ip_mutex; + Mutex.execute ip_mutex + (fun () -> while !ip = "" && !is_connected = false do + Condition.wait ip_cond ip_mutex; ip := (match Helpers.get_management_ip_addr ~__context with Some x -> x | None -> ""); is_connected := (Helpers.get_management_iface_is_connected ~__context) - done; end); + done); !ip +let has_carrier ~__context ~self = + let metrics = Db.PIF.get_metrics ~__context ~self in + Db.PIF_metrics.get_carrier ~__context ~self:metrics + +(* CA-280237: Called in startup sequence after creating cluster_hosts *) +let wait_for_clustering_ip ~__context ~(self : API.ref_Cluster_host) = + let pIF = Db.Cluster_host.get_PIF ~__context ~self in + let iP = ref (Db.PIF.get_IP ~__context ~self:pIF) in + Mutex.execute ip_mutex (* Don't return until PIF is plugged AND has a valid IP *) + (fun () -> while !iP="" || not (has_carrier ~__context ~self:pIF) do + Condition.wait ip_cond ip_mutex; + iP := Db.PIF.get_IP ~__context ~self:pIF + done); + !iP + let on_dom0_networking_change ~__context = debug "Checking to see if hostname or management IP has changed"; (* Need to update: @@ -207,6 +222,6 @@ let on_dom0_networking_change ~__context = end; Helpers.update_domain_zero_name ~__context localhost new_hostname; debug "Signalling anyone waiting for the management IP address to change"; - Mutex.execute management_ip_mutex - (fun () -> Condition.broadcast management_ip_cond) + Mutex.execute ip_mutex + (fun () -> Condition.broadcast ip_cond) diff --git a/ocaml/xapi/xapi_mgmt_iface.mli b/ocaml/xapi/xapi_mgmt_iface.mli index 65ea31c2556..18a34925a85 100644 --- a/ocaml/xapi/xapi_mgmt_iface.mli +++ b/ocaml/xapi/xapi_mgmt_iface.mli @@ -21,6 +21,9 @@ val himn_addr : string option ref (** Block until an IP address appears on the management interface *) val wait_for_management_ip : __context:Context.t -> string +(** Block until an IP address appears on the given cluster host PIF *) +val wait_for_clustering_ip : __context:Context.t -> self:API.ref_Cluster_host -> string + (** Called anywhere we suspect dom0's networking (hostname, IP address) has been changed underneath us (eg by dhclient) *) val on_dom0_networking_change : __context:Context.t -> unit diff --git a/ocaml/xapi/xapi_pbd.ml b/ocaml/xapi/xapi_pbd.ml index e22ea92799d..464f611ff08 100644 --- a/ocaml/xapi/xapi_pbd.ml +++ b/ocaml/xapi/xapi_pbd.ml @@ -112,18 +112,6 @@ let check_sharing_constraint ~__context ~sr = [ Ref.string_of sr; 
Ref.string_of (Db.PBD.get_host ~__context ~self:(List.hd others)) ]))
  end
 
-(** If the SR requires some cluster stacks, we resync every compatible Cluster *)
-let resync_cluster_stack_for_sr_type ~__context ~sr_sm_type ~host =
-  let required_cluster_stacks = Xapi_clustering.get_required_cluster_stacks ~__context ~sr_sm_type in
-  (* This is empty if the SR requires no cluster stack *)
-  match (Xapi_clustering.find_cluster_host ~__context ~host) with
-  | None -> ()
-  | Some cluster_host ->
-    (* check cluster_host associated with both the host and a cluster with a matching cluster_stack *)
-    let self = Db.Cluster_host.get_cluster ~__context ~self:cluster_host in
-    if List.mem (Db.Cluster.get_cluster_stack ~__context ~self) required_cluster_stacks
-    then Xapi_cluster_host.resync_host ~__context ~host
-
 module C = Storage_interface.Client(struct let rpc = Storage_access.rpc end)
 
 let plug ~__context ~self =
@@ -139,8 +127,7 @@ let plug ~__context ~self =
-  (* This must NOT be done while holding the lock, because the functions that
-     eventually get called also grab the clustering lock. We can call this
-     unconditionally because the operations it calls should be idempotent. *)
-  log_and_ignore_exn (fun () -> resync_cluster_stack_for_sr_type ~__context ~sr_sm_type ~host);
-  Xapi_clustering.with_clustering_lock_if_needed ~__context ~sr_sm_type (fun () ->
+  Xapi_clustering.with_clustering_lock_if_needed ~__context ~sr_sm_type __LOC__ (fun () ->
     Xapi_clustering.assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type;
     check_sharing_constraint ~__context ~sr;
     let dbg = Ref.string_of (Context.get_task_id __context) in
@@ -162,7 +149,7 @@ let unplug ~__context ~self =
   if currently_attached
   then
     let sr = Db.PBD.get_SR ~__context ~self in
     let sr_sm_type = Db.SR.get_type ~__context ~self:sr in
-    Xapi_clustering.with_clustering_lock_if_needed ~__context ~sr_sm_type (fun () ->
+    Xapi_clustering.with_clustering_lock_if_needed ~__context ~sr_sm_type __LOC__ (fun () ->
      let host = Db.PBD.get_host ~__context ~self in
      if Db.Host.get_enabled ~__context ~self:host
      then abort_if_storage_attached_to_protected_vms ~__context ~self;
diff --git a/ocaml/xapi/xapi_pif.ml b/ocaml/xapi/xapi_pif.ml
index 2e6a611b9c3..c5253592e8e 100644
--- a/ocaml/xapi/xapi_pif.ml
+++ b/ocaml/xapi/xapi_pif.ml
@@ -450,21 +450,19 @@ let introduce_internal
  (* return ref of newly created pif record *)
  pif
 
-(* Assertion passes if network has clusters attached but host has disabled clustering *)
-let assert_no_clustering_enabled ~__context ~network ~host =
-  if not (Xapi_clustering.is_clustering_disabled_on_host ~__context host)
-  then
-    (Db.Cluster.get_refs_where ~__context
-       ~expr:Db_filter_types.(Eq(Field "network", Literal (Ref.string_of network))))
-    |> function
-    | [] -> ()
-    | _::_ -> raise Api_errors.(Server_error (clustering_enabled_on_network, [Ref.string_of network]))
+(* Assertion passes if the PIF has no cluster_host, or if its cluster_host is disabled *)
+let assert_no_clustering_enabled_on ~__context ~self =
+  let cluster_host_on_pif = Db_filter_types.(Eq (Field "PIF", Literal (Ref.string_of self))) in
+  match Db.Cluster_host.get_refs_where ~__context ~expr:cluster_host_on_pif with
+  | [] -> ()
+  | [ cluster_host ] ->
+    if Db.Cluster_host.get_enabled ~__context ~self:cluster_host
+    then raise Api_errors.(Server_error (clustering_enabled, [ Ref.string_of cluster_host ]))
+  | _ -> failwith "Should never happen: there can only be one cluster host associated with a PIF"
 
(* Internal [forget] is passed a pre-built table [t] *)
let 
forget_internal ~t ~__context ~self = - let network = Db.PIF.get_network ~__context ~self in - let host = Db.PIF.get_host ~__context ~self in - assert_no_clustering_enabled ~__context ~network ~host; + assert_no_clustering_enabled_on ~__context ~self; if Db.PIF.get_managed ~__context ~self = true then Nm.bring_pif_down ~__context self; (* NB we are allowed to forget an interface which still exists *) @@ -644,9 +642,7 @@ let destroy ~__context ~self = let reconfigure_ipv6 ~__context ~self ~mode ~iPv6 ~gateway ~dNS = Xapi_pif_helpers.assert_pif_is_managed ~__context ~self; assert_no_protection_enabled ~__context ~self; - let network = Db.PIF.get_network ~__context ~self in - let host = Db.PIF.get_host ~__context ~self in - assert_no_clustering_enabled ~__context ~network ~host; + assert_no_clustering_enabled_on ~__context ~self; if gateway <> "" then Helpers.assert_is_valid_ip `ipv6 "gateway" gateway; @@ -696,9 +692,7 @@ let reconfigure_ipv6 ~__context ~self ~mode ~iPv6 ~gateway ~dNS = let reconfigure_ip ~__context ~self ~mode ~iP ~netmask ~gateway ~dNS = Xapi_pif_helpers.assert_pif_is_managed ~__context ~self; assert_no_protection_enabled ~__context ~self; - let network = Db.PIF.get_network ~__context ~self in - let host = Db.PIF.get_host ~__context ~self in - assert_no_clustering_enabled ~__context ~network ~host; + assert_no_clustering_enabled_on ~__context ~self; if mode = `Static then begin (* require these parameters if mode is static *) @@ -794,12 +788,26 @@ let set_property ~__context ~self ~name ~value = Nm.bring_pif_up ~__context pif ) (self :: vlan_pifs) +let assert_cluster_host_operation_not_in_progress ~__context = + match (Db.Cluster.get_all ~__context) with + | [] -> () + | cluster :: _ -> + let ops = Db.Cluster.get_current_operations ~__context ~self:cluster |> List.map snd in + if (List.mem `enable ops) || (List.mem `add ops) + then raise Api_errors.(Server_error (other_operation_in_progress, + [ "Cluster"; Ref.string_of cluster ])) + +(* Block allowing unplug if + - a cluster host is enabled on this PIF + - a cluster host is being created in this pool + - a Cluster_host.enable is in progress on any PIF *) let set_disallow_unplug ~__context ~self ~value = if (Db.PIF.get_disallow_unplug ~__context ~self) <> value then begin - let network = Db.PIF.get_network ~__context ~self in - let host = Db.PIF.get_host ~__context ~self in - assert_no_clustering_enabled ~__context ~network ~host; + if not value then begin + assert_no_clustering_enabled_on ~__context ~self; + assert_cluster_host_operation_not_in_progress ~__context + end; Db.PIF.set_disallow_unplug ~__context ~self ~value end @@ -892,7 +900,7 @@ let rec plug ~__context ~self = debug "PIF is SRIOV physical PIF and bond slave, also bringing up bond master PIF"; plug ~__context ~self:bond_master_pif (* It will be checked later to make sure that bond slave will not be brought up *) - end + end else raise Api_errors.(Server_error (cannot_plug_bond_slave, [Ref.string_of self])) end else () | _ -> () diff --git a/ocaml/xapi/xapi_pif.mli b/ocaml/xapi/xapi_pif.mli index 5bcebb36d0a..d1a1a10c21d 100644 --- a/ocaml/xapi/xapi_pif.mli +++ b/ocaml/xapi/xapi_pif.mli @@ -273,6 +273,9 @@ val start_of_day_best_effort_bring_up : unit -> unit (** {2 Assertion Helper Functions} *) +(** Raise an error if PIF attached to ENABLED cluster_host *) +val assert_no_clustering_enabled_on : __context:Context.t -> self:[`PIF] Ref.t -> unit + (** Ensure the PIF is not a bond slave or master. 
*) val assert_not_in_bond : __context:Context.t -> self:[ `PIF ] Ref.t -> unit diff --git a/ocaml/xapi/xapi_pool_helpers.ml b/ocaml/xapi/xapi_pool_helpers.ml index 3492e1ac4e7..892140d0a53 100644 --- a/ocaml/xapi/xapi_pool_helpers.ml +++ b/ocaml/xapi/xapi_pool_helpers.ml @@ -57,6 +57,10 @@ let valid_operations ~__context record _ref' = else set_errors Api_errors.ha_not_enabled [] [ `ha_disable ]; + (* cluster create cannot run during a rolling pool upgrade *) + if Helpers.rolling_upgrade_in_progress ~__context then + set_errors Api_errors.not_supported_during_upgrade [] [ `cluster_create ]; + (* cluster create cannot run if a cluster already exists on the pool *) begin match Db.Cluster.get_all ~__context with | [_] -> set_errors Api_errors.cluster_already_exists [] [ `cluster_create ] diff --git a/ocaml/xapi/xapi_sr.ml b/ocaml/xapi/xapi_sr.ml index 29074c3bdbc..3ebfd309503 100644 --- a/ocaml/xapi/xapi_sr.ml +++ b/ocaml/xapi/xapi_sr.ml @@ -280,7 +280,8 @@ let probe_ext = (* Create actually makes the SR on disk, and introduces it into db, and creates PBD record for current host *) let create ~__context ~host ~device_config ~(physical_size:int64) ~name_label ~name_description ~_type ~content_type ~shared ~sm_config = - let pbds, sr_ref = Xapi_clustering.with_clustering_lock_if_needed ~__context ~sr_sm_type:_type (fun () -> + let pbds, sr_ref = + Xapi_clustering.with_clustering_lock_if_needed ~__context ~sr_sm_type:_type __LOC__ (fun () -> Xapi_clustering.assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type:_type; Helpers.assert_rolling_upgrade_not_in_progress ~__context ; debug "SR.create name_label=%s sm_config=[ %s ]" name_label (String.concat "; " (List.map (fun (k, v) -> k ^ " = " ^ v) sm_config));