Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ocaml/idl/datamodel_cluster_host.ml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ let t =
~ty:(Ref _pif) "PIF" ~default_value:(Some (VRef null_ref))
"Reference to the PIF object"

; field ~qualifier:StaticRO ~lifecycle
~ty:Bool "joined" ~default_value:(Some (VBool true))
"Whether the cluster host has joined the cluster"

(* TODO: add `live` member to represent whether corosync believes that this
cluster host actually is enabled *)

Expand Down
4 changes: 3 additions & 1 deletion ocaml/idl/datamodel_errors.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1139,7 +1139,9 @@ let _ =
error Api_errors.invalid_cluster_stack [ "cluster_stack" ]
~doc:"The cluster stack provided is not supported." ();
error Api_errors.pif_not_attached_to_host [ "pif"; "host" ]
~doc:"Cluster_host creation failed as the PIF provided is not attached to the host." ()
~doc:"Cluster_host creation failed as the PIF provided is not attached to the host." ();
error Api_errors.cluster_host_not_joined [ "cluster_host" ]
~doc:"Cluster_host operation failed as the cluster_host has not joined the cluster." ()


let _ =
Expand Down
6 changes: 3 additions & 3 deletions ocaml/tests/suite_alcotest.ml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
let () =
Suite_init.harness_init ();
(* Alcotest hides the standard output of successful tests,
so we will probably not exceed the 4MB limit in Traivs *)
so we will probably not exceed the 4MB limit in Travis *)
Debug.log_to_stdout ();

Alcotest.run "Base suite"
Expand Down Expand Up @@ -30,6 +30,8 @@ let () =
; "Test_daemon_manager", Test_daemon_manager.test
; "Test_cluster", Test_cluster.test
; "Test_cluster_host", Test_cluster_host.test
; "Test_clustering", Test_clustering.test
; "Test_clustering_allowed_operations", Test_clustering_allowed_operations.test
; "Test_client", Test_client.test
; "Test_ca91480", Test_ca91480.test
; "Test_pgpu", Test_pgpu.test
Expand All @@ -43,8 +45,6 @@ let () =
; "Test_pvs_site", Test_pvs_site.test
; "Test_pvs_proxy", Test_pvs_proxy.test
; "Test_pvs_server", Test_pvs_server.test
; "Test_clustering", Test_clustering.test
; "Test_clustering_allowed_operations", Test_clustering_allowed_operations.test
; "Test_event", Test_event.test
; "Test_vm_placement", Test_vm_placement.test
; "Test_vm_memory_constraints", Test_vm_memory_constraints.test
Expand Down
7 changes: 5 additions & 2 deletions ocaml/tests/test_cluster.ml
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,12 @@ let test_create_destroy_status () =
let test_enable () =
let __context = Test_common.make_test_database () in
let cluster = create_cluster ~__context () in
(* simulate xapi getting restarted *)

Create_storage.maybe_reenable_cluster_host __context;
(* simulate xapi getting restarted *)
begin match Xapi_clustering.find_cluster_host ~__context ~host:Helpers.(get_localhost ~__context) with
| Some self -> Xapi_cluster_host.enable ~__context ~self
| None -> Alcotest.fail "Couldn't find freshly-created cluster_host"
end;
pool_destroy ~__context ~self:cluster

let test_invalid_cluster_stack () =
Expand Down
18 changes: 9 additions & 9 deletions ocaml/tests/test_cluster_host.ml
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,15 @@ let test_fix_prereq () =
let __context = Test_common.make_test_database () in
Context.set_test_rpc __context (pif_plug_rpc __context);
let network = Test_common.make_network ~__context () in
let localhost = Helpers.get_localhost ~__context in
let pifref = Test_common.make_pif ~__context ~network ~host:localhost () in
let pif = Xapi_clustering.pif_of_host ~__context network localhost in
let host = Helpers.get_localhost ~__context in
let pifref = Test_common.make_pif ~__context ~network ~host () in
Alcotest.check_raises
"Should fail when checking PIF prequisites"
Api_errors.(Server_error (pif_has_no_network_configuration, [ Ref.string_of pifref ]))
(fun () -> Xapi_cluster_host.fix_pif_prerequisites __context pif);
(fun () -> Xapi_cluster_host.fix_pif_prerequisites __context pifref);
Db.PIF.set_IP ~__context ~self:pifref ~value:"1.1.1.1";
let pif = Xapi_clustering.pif_of_host ~__context network localhost in
Xapi_cluster_host.fix_pif_prerequisites ~__context pif;
let pif = Xapi_clustering.pif_of_host ~__context network localhost in
Xapi_cluster_host.fix_pif_prerequisites ~__context pifref;
let pif = Xapi_clustering.pif_of_host ~__context network host in
Alcotest.(check unit)
"PIF prerequisites have now been fixed"
() (Xapi_clustering.assert_pif_prerequisites pif)
Expand All @@ -93,15 +91,17 @@ let test_create_as_necessary () =
Alcotest.check_raises
"create_as_necessary should fail if autojoin is set and the pool master has no cluster_host"
Api_errors.(Server_error (internal_error,
[ Printf.sprintf "No cluster_host master found for cluster %s" (Ref.string_of cluster) ]))
[ Printf.sprintf "No cluster_host exists on master" ]))
(fun () -> Xapi_cluster_host.create_as_necessary ~__context ~host:localhost);
let _ = Test_common.make_cluster_host ~__context ~pIF:(fst _pif) ~host:(Helpers.get_master ~__context) ~cluster () in
Xapi_cluster_host.create_as_necessary ~__context ~host:localhost;
let result = sync_required ~__context ~host:localhost in
check_cluster_option "sync_required with an existing cluster_host" None result;
let host = Test_common.make_host ~__context () in
let result = sync_required ~__context ~host in
check_cluster_option "sync_required with an existing cluster_host on master but not given host" (Some cluster) result
check_cluster_option
"sync_required with an existing cluster_host on master but not given host"
(Some cluster) result

(* CA-275728 *)
let test_destroy_forbidden_when_sr_attached () =
Expand Down
41 changes: 28 additions & 13 deletions ocaml/tests/test_clustering_allowed_operations.ml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ let assert_true msg x = Alcotest.(check bool) msg true x
(** cluster_create is not allowed if a cluster already exists *)
let test_pool_cluster_create_not_allowed_when_cluster_exists () =
let __context = make_test_database () in
let self = Db.Pool.get_all ~__context |> List.hd in
let self = Helpers.get_pool ~__context in
let _, _ = make_cluster_and_cluster_host ~__context () in
Xapi_pool_helpers.update_allowed_operations ~__context ~self;
let allowed_ops = Db.Pool.get_allowed_operations ~__context ~self in
Expand All @@ -29,7 +29,7 @@ let test_pool_cluster_create_not_allowed_when_cluster_exists () =
(** cluster_create is not allowed if any pool operations are in progress *)
let test_pool_cluster_create_not_allowed_during_pool_ops () =
let __context = make_test_database () in
let self = Db.Pool.get_all ~__context |> List.hd in
let self = Helpers.get_pool ~__context in
Xapi_pool_helpers.with_pool_operation ~__context ~self ~doc:"" ~op:`ha_enable
(fun () ->
let allowed_ops = Db.Pool.get_allowed_operations ~__context ~self in
Expand All @@ -40,7 +40,7 @@ let test_pool_cluster_create_not_allowed_during_pool_ops () =
operations in progress *)
let test_pool_cluster_create_allowed () =
let __context = make_test_database () in
let self = Db.Pool.get_all ~__context |> List.hd in
let self = Helpers.get_pool ~__context in
Xapi_pool_helpers.update_allowed_operations ~__context ~self;
let allowed_ops = Db.Pool.get_allowed_operations ~__context ~self in
assert_true "Pool.allowed_operations should contain 'cluster_create'"
Expand Down Expand Up @@ -101,6 +101,14 @@ let test_cluster_host_ops_not_allowed_during_cluster_host_op () =
let allowed_ops = Db.Cluster_host.get_allowed_operations ~__context ~self in
assert_true "Cluster_host.allowed_operations should be empty" (allowed_ops = []))

(** Run a do-nothing callback under
    [Xapi_cluster_helpers.with_cluster_operation] for operation [op] on the
    cluster [self]. Because the callback is a no-op, the call exercises only
    what [with_cluster_operation] itself does for [op]. *)
let with_cluster_op ~__context self op =
  let nothing_to_do () = () in
  Xapi_cluster_helpers.with_cluster_operation
    ~__context ~self ~doc:"" ~op
    nothing_to_do

(** Run a do-nothing callback under
    [Xapi_cluster_host_helpers.with_cluster_host_operation] for operation
    [op] on the cluster_host [self]. Because the callback is a no-op, the
    call exercises only what [with_cluster_host_operation] itself does for
    [op]. *)
let with_cluster_host_op ~__context self op =
  let nothing_to_do () = () in
  Xapi_cluster_host_helpers.with_cluster_host_operation
    ~__context ~self ~doc:"" ~op
    nothing_to_do

let test_clustering_ops_disallowed_during_rolling_upgrade () =
let __context = Test_common.make_test_database () in

Expand All @@ -110,17 +118,9 @@ let test_clustering_ops_disallowed_during_rolling_upgrade () =
(fun op ->
Alcotest.(check unit)
"Clustering operations should be allowed"
() (with_cluster_fn self op)
() (with_cluster_fn ~__context self op)
) ops
in
let with_cluster_op self op =
Xapi_cluster_helpers.with_cluster_operation ~__context ~self ~doc:"" ~op
(fun () -> ())
in
let with_cluster_host_op self op =
Xapi_cluster_host_helpers.with_cluster_host_operation ~__context ~self ~doc:"" ~op
(fun () -> ())
in
let cluster, cluster_host =
Test_common.make_cluster_and_cluster_host ~__context ()
in
Expand Down Expand Up @@ -151,7 +151,7 @@ let test_clustering_ops_disallowed_during_rolling_upgrade () =
Alcotest.check_raises
"Other than cluster_host enable/disable, no clustering operations should be allowed during RPU"
Api_errors.(Server_error (not_supported_during_upgrade, []))
(fun () -> with_cluster_op cluster op)
(fun () -> with_cluster_op ~__context cluster op)
) [ `add ; `remove ; `destroy];

test_clustering_ops_should_pass
Expand All @@ -161,6 +161,20 @@ let test_clustering_ops_disallowed_during_rolling_upgrade () =

test_cluster_host_operations_valid ()

(** With [joined] set to [false] on the cluster_host, every operation in
    [Xapi_cluster_host_helpers.all_cluster_host_operations] is expected to
    raise [cluster_host_not_joined] carrying the cluster_host reference. *)
let test_cluster_host_ops_without_join () =
  let __context = make_test_database () in
  (* Only the cluster_host is exercised below; the cluster reference returned
     alongside it is deliberately discarded. *)
  let _cluster, cluster_host =
    make_cluster_and_cluster_host ~__context
      ~host:Helpers.(get_localhost ~__context) ()
  in
  (* [joined] is true by default (that case is not re-checked here), so it has
     to be cleared explicitly to reach the not-joined code path. *)
  Db.Cluster_host.set_joined ~__context ~self:cluster_host ~value:false;

  List.iter
    (fun op ->
       Alcotest.check_raises
         "Non-remove cluster operations invalid when not cluster_host.joined"
         Api_errors.(Server_error (cluster_host_not_joined, [ Ref.string_of cluster_host ]))
         (fun () -> with_cluster_host_op ~__context cluster_host op)
    ) Xapi_cluster_host_helpers.all_cluster_host_operations

let test =
[ "test_pool_cluster_create_not_allowed_when_cluster_exists", `Quick, test_pool_cluster_create_not_allowed_when_cluster_exists
; "test_pool_cluster_create_not_allowed_during_pool_ops", `Quick, test_pool_cluster_create_not_allowed_during_pool_ops
Expand All @@ -171,4 +185,5 @@ let test =
; "test_cluster_host_enable_allowed", `Quick, test_cluster_host_enable_allowed
; "test_cluster_host_ops_not_allowed_during_cluster_host_op", `Quick, test_cluster_host_ops_not_allowed_during_cluster_host_op
; "test_clustering_ops_disallowed_during_rolling_upgrade", `Quick, test_clustering_ops_disallowed_during_rolling_upgrade
; "test_cluster_host_ops_without_join", `Quick, test_cluster_host_ops_without_join
]
4 changes: 2 additions & 2 deletions ocaml/tests/test_common.ml
Original file line number Diff line number Diff line change
Expand Up @@ -489,10 +489,10 @@ let make_vfs_on_pf ~__context ~pf ~num =
make_vf num

let make_cluster_host ~__context ?(ref=Ref.make ()) ?(uuid=make_uuid ())
?(cluster=Ref.null) ?(host=Ref.null) ?(pIF=Ref.null) ?(enabled=true)
?(cluster=Ref.null) ?(host=Ref.null) ?(pIF=Ref.null) ?(enabled=true) ?(joined=true)
?(allowed_operations=[]) ?(current_operations=[]) ?(other_config=[]) () =
Db.Cluster_host.create ~__context ~ref ~uuid ~cluster ~host ~pIF ~enabled
~allowed_operations ~current_operations ~other_config;
~allowed_operations ~current_operations ~other_config ~joined;
ref

let make_cluster_and_cluster_host ~__context ?(ref=Ref.make ()) ?(uuid=make_uuid ())
Expand Down
1 change: 1 addition & 0 deletions ocaml/xapi-consts/api_errors.ml
Original file line number Diff line number Diff line change
Expand Up @@ -614,3 +614,4 @@ let cluster_force_destroy_failed = "CLUSTER_FORCE_DESTROY_FAILED"
let cluster_stack_in_use = "CLUSTER_STACK_IN_USE"
let invalid_cluster_stack = "INVALID_CLUSTER_STACK"
let pif_not_attached_to_host = "PIF_NOT_ATTACHED_TO_HOST"
let cluster_host_not_joined = "CLUSTER_HOST_NOT_JOINED"
7 changes: 0 additions & 7 deletions ocaml/xapi/create_storage.ml
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,10 @@ let plug_all_pbds __context =
my_pbds;
!result

(* If a cluster_host exists for the local host, call
   [Xapi_cluster_host.enable] on it; otherwise do nothing. *)
let maybe_reenable_cluster_host __context =
  let localhost = Helpers.get_localhost __context in
  let enable_if_present = function
    | None -> ()
    | Some self -> Xapi_cluster_host.enable ~__context ~self
  in
  enable_if_present
    (Xapi_clustering.find_cluster_host ~__context ~host:localhost)

let plug_unplugged_pbds __context =
(* If the plug is to succeed for SM's requiring a cluster stack
* we have to enable the cluster stack too if we have one *)
log_and_ignore_exn(fun () -> maybe_reenable_cluster_host __context);
let my_pbds = Helpers.get_my_pbds __context in
List.iter
(fun (self, pbd_record) ->
Expand Down
3 changes: 3 additions & 0 deletions ocaml/xapi/records.ml
Original file line number Diff line number Diff line change
Expand Up @@ -2144,6 +2144,9 @@ let cluster_host_record rpc session_id cluster_host =
; make_field ~name:"enabled"
~get:(fun () -> (x ()).API.cluster_host_enabled |> string_of_bool)
()
; make_field ~name:"joined"
~get:(fun () -> (x ()).API.cluster_host_joined |> string_of_bool)
()
; make_field ~name:"allowed-operations"
~get:(fun () -> String.concat "; " (List.map Record_util.cluster_host_operation_to_string (x ()).API.cluster_host_allowed_operations))
~get_set:(fun () -> List.map Record_util.cluster_host_operation_to_string (x ()).API.cluster_host_allowed_operations)
Expand Down
35 changes: 28 additions & 7 deletions ocaml/xapi/xapi.ml
Original file line number Diff line number Diff line change
Expand Up @@ -910,8 +910,6 @@ let server_init() =
(* CA-22417: bring up all non-bond slaves so that the SM backends can use storage NIC IP addresses (if the routing
table happens to be right) *)
"Best-effort bring up of physical and sriov NICs", [ Startup.NoExnRaising ], Xapi_pif.start_of_day_best_effort_bring_up;
"Create any necessary cluster_host objects", [ Startup.NoExnRaising ], (fun () -> Xapi_cluster_host.create_as_necessary __context (Helpers.get_localhost ~__context));
"resync cluster host state", [], (fun () -> Xapi_cluster_host.resync_host ~__context ~host:Helpers.(get_localhost ~__context));
"updating the vswitch controller", [], (fun () -> Helpers.update_vswitch_controller ~__context ~host:(Helpers.get_localhost ~__context));
"initialising storage", [ Startup.NoExnRaising ],
(fun () -> Helpers.call_api_functions ~__context Create_storage.create_storage_localhost);
Expand Down Expand Up @@ -943,20 +941,43 @@ let server_init() =

let wait_management_interface () =
let management_if = Xapi_inventory.lookup Xapi_inventory._management_interface in
if management_if <> "" then (
if management_if <> "" then begin
debug "Waiting forever for the management interface to gain an IP address";
let ip = wait_for_management_ip_address ~__context in
debug "Management interface got IP address: %s; attempting to re-plug any unplugged PBDs" ip;
debug "Management interface got IP address: %s, attempting to re-plug unplugged PBDs" ip;
(* This may fail without the clustering IP, which is why we attempt
another replug in maybe_wait_for_clustering_ip *)
Helpers.call_api_functions ~__context (fun rpc session_id ->
Create_storage.plug_unplugged_pbds __context)
)
end
in

(* If the local host has a cluster_host: wait (through
   [Xapi_mgmt_iface.wait_for_clustering_ip]) for its clustering IP, resync the
   host's cluster_host state, then make another attempt at plugging PBDs that
   are still unplugged. Does nothing when no cluster_host is found. *)
let maybe_wait_for_clustering_ip () =
  let host = Helpers.get_localhost ~__context in
  let wait_resync_and_replug self =
    debug "Waiting forever for cluster_host to gain an IP address";
    let ip = Xapi_mgmt_iface.wait_for_clustering_ip ~__context ~self in
    debug "Got clustering IP %s, resyncing cluster_host %s" ip (Ref.string_of self);
    Xapi_cluster_host.resync_host ~__context ~host;
    debug "Attempting to re-plug remaining unplugged PBDs";
    (* The rpc/session pair supplied by call_api_functions is not used. *)
    Helpers.call_api_functions ~__context (fun _rpc _session_id ->
        Create_storage.plug_unplugged_pbds __context)
  in
  match Xapi_clustering.find_cluster_host ~__context ~host with
  | None -> ()
  | Some self -> wait_resync_and_replug self
in

Startup.run ~__context [
"fetching database backup", [ Startup.OnlySlave; Startup.NoExnRaising ],
(fun () -> Pool_db_backup.fetch_database_backup ~master_address:(Pool_role.get_master_address())
~pool_secret:!Xapi_globs.pool_secret ~force:None);
"wait management interface to come up", [ Startup.NoExnRaising ], wait_management_interface;
"wait management interface to come up, re-plug unplugged PBDs", [ Startup.NoExnRaising ], wait_management_interface;

(* CA-290237, CA-290473: Create cluster objects after network objects and management IP initialised *)
"Create any necessary cluster_host objects", [ Startup.NoExnRaising ],
(fun () -> log_and_ignore_exn (fun () -> Xapi_cluster_host.create_as_necessary __context (Helpers.get_localhost ~__context)));
"wait for clustering IP if any, re-plug remaining unplugged PBDs", [ Startup.OnThread ],
(fun () -> log_and_ignore_exn (fun () -> maybe_wait_for_clustering_ip () ));
"considering sending a master transition alert", [ Startup.NoExnRaising; Startup.OnlyMaster ],
Xapi_pool_transition.consider_sending_alert __context;
"Cancelling in-progress storage migrations", [], (fun () -> Storage_migrate.killall ~dbg:"xapi init");
Expand All @@ -972,7 +993,7 @@ let server_init() =
];

debug "startup: startup sequence finished");
wait_to_die()
wait_to_die ()
with
| Sys.Break -> cleanup_handler 0
| (Unix.Unix_error (e,s1,s2)) as exn ->
Expand Down
15 changes: 8 additions & 7 deletions ocaml/xapi/xapi_cluster.ml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ let create ~__context ~pIF ~cluster_stack ~pool_auto_join ~token_timeout ~token_
(* Currently we only support corosync. If we support more cluster stacks, this
* should be replaced by a general function that checks the given cluster_stack *)
Pool_features.assert_enabled ~__context ~f:Features.Corosync;
(* TODO: take network lock *)
with_clustering_lock (fun () ->
let dbg = Context.string_of_task __context in
validate_params ~token_timeout ~token_timeout_coefficient;
Expand Down Expand Up @@ -64,7 +63,7 @@ let create ~__context ~pIF ~cluster_stack ~pool_auto_join ~token_timeout ~token_
~pool_auto_join ~token_timeout ~token_timeout_coefficient ~current_operations:[] ~allowed_operations:[] ~cluster_config:[]
~other_config:[];
Db.Cluster_host.create ~__context ~ref:cluster_host_ref ~uuid:cluster_host_uuid ~cluster:cluster_ref ~host ~enabled:true ~pIF
~current_operations:[] ~allowed_operations:[] ~other_config:[];
~current_operations:[] ~allowed_operations:[] ~other_config:[] ~joined:true;
Xapi_cluster_host_helpers.update_allowed_operations ~__context ~self:cluster_host_ref;
D.debug "Created Cluster: %s and Cluster_host: %s" (Ref.string_of cluster_ref) (Ref.string_of cluster_host_ref);
set_ha_cluster_stack ~__context;
Expand Down Expand Up @@ -142,6 +141,7 @@ let pool_force_destroy ~__context ~self =
let slave_cluster_hosts =
Db.Cluster.get_cluster_hosts ~__context ~self |> filter_on_option master_cluster_host
in
debug "Destroying cluster_hosts in pool";
(* First try to destroy each cluster_host - if we can do so safely then do *)
List.iter
(fun cluster_host ->
Expand Down Expand Up @@ -175,14 +175,14 @@ let pool_force_destroy ~__context ~self =
[] all_remaining_cluster_hosts
in

begin
match exns with
| [] -> D.debug "Cluster.force_destroy was successful"
| e :: _ -> raise Api_errors.(Server_error (cluster_force_destroy_failed, [Ref.string_of self]))
begin match exns with
| [] -> D.debug "Successfully destroyed all cluster_hosts in pool, now destroying cluster %s" (Ref.string_of self)
| e :: _ -> raise Api_errors.(Server_error (cluster_force_destroy_failed, [Ref.string_of self]))
end;

Helpers.call_api_functions ~__context (fun rpc session_id ->
Client.Client.Cluster.destroy ~rpc ~session_id ~self)
Client.Client.Cluster.destroy ~rpc ~session_id ~self);
debug "Cluster_host.force_destroy was successful"

(* Helper function; concurrency checks are done in implementation of Cluster.destroy and Cluster_host.destroy *)
let pool_destroy ~__context ~self =
Expand Down Expand Up @@ -212,6 +212,7 @@ let pool_resync ~__context ~(self : API.ref_Cluster) =
List.iter
(fun host -> log_and_ignore_exn
(fun () ->
Xapi_cluster_host.create_as_necessary ~__context ~host;
Xapi_cluster_host.resync_host ~__context ~host;
if is_clustering_disabled_on_host ~__context host
then raise Api_errors.(Server_error (no_compatible_cluster_host, [Ref.string_of host]))
Expand Down
Loading