Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 65 additions & 3 deletions http-svr/xmlrpc_client.ml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ module E = Debug.Make(struct let name = "mscgen" end)
module Internal = struct
let set_stunnelpid_callback : (string option -> int -> unit) option ref = ref None
let unset_stunnelpid_callback : (string option -> int -> unit) option ref = ref None
let destination_is_ok : (string -> bool) option ref = ref None
end

let user_agent = "xen-api-libs/1.0"
Expand All @@ -49,14 +50,15 @@ let write_to_log x = StunnelDebug.debug "%s" (Astring.String.trim x)
for an unknown method and checking we get a matching MESSAGE_METHOD_UNKNOWN.
This is used to prevent us accidentally trying to reuse a connection which has been
closed or left in some other inconsistent state. *)
let check_reusable (x: Unix.file_descr) =
let check_reusable_inner (x: Unix.file_descr) =
let msg_name = "system.isAlive" in
let msg_uuid = Uuidm.to_string (Uuidm.create `V4) in
(* This is for backward compatability *)
let msg_func = Printf.sprintf "%s:%s" msg_name msg_uuid in
let msg_param = [ XMLRPC.To.string msg_uuid ] in
let xml = Xml.to_string (XMLRPC.To.methodCall msg_func msg_param) in
let http = xmlrpc ~version:"1.1" ~keep_alive:true ~body:xml "/" in

try
Http_client.rpc x http
(fun response _ ->
Expand Down Expand Up @@ -96,6 +98,64 @@ let get_new_stunnel_id =
let m = Mutex.create () in
fun () -> Mutex.execute m (fun () -> incr counter; !counter)

let run_watchdog timeout (fire_fn : unit -> unit) f (pout,pin) =
let fired = ref None in
let th = Thread.create (fun () ->
match Unix.select [pout] [] [] timeout with
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

select can wake up early for a variety of reasons with EINTR, especially if the process is receiving signals. It would be good to use a function here that sleeps based on an absolute deadline, pthread_ and sem_ have a few but I haven't seem any of them bound in ocaml standard library.
Thought that Thread.wait_timed_read would DTRT, but it just calls select without retrying.

Might have to use gettimeofday, compute how long is left and retry ourselves unless we can find a helper function that does that already

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

| [x],[],[] ->
fired := Some false
| _,_,_ ->
fire_fn ();
fired := Some true) () in
let cancel_watchdog () = let _x : int = Unix.write pin "x" 0 1 in () in
let get_fired () = Thread.join th; !fired in
f cancel_watchdog get_fired

let with_pipe f =
let pout,pin = Unix.pipe () in
finally
(fun () -> f (pout,pin))
(fun () -> Unix.close pout; Unix.close pin)

let watchdog timeout pid f =
with_pipe
(run_watchdog
timeout
(fun () ->
StunnelDebug.warn "Watchdog fired: killing pid: %d" pid;
Unix.kill pid Sys.sigterm)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do we know this is the right PID and stunnel hasn't died already and the PID has been reused by something else? Does stunnel have a lockfile it keeps open with the pid that we can use to kill it without race conditions?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only really by construction - we do this pattern elsewhere in the code too. I really want to throw away all this code and use direct openssl bindings instead, but since this is for backporting that's not really an option here.

(fun cancel_watchdog get_fired ->
let result = try Some (f ()) with _ -> None in
cancel_watchdog ();
match get_fired (), result with
| Some true, _ ->
(* Watchdog fired, stunnel is killed. Not reusable *)
false
| Some false, None ->
(* Watchdog didn't fire, but f () raised an exception. Not reusable *)
false
| Some false, Some x ->
(* Watchdog didn't fire and we got a result from f() - return that result *)
x
| None, _ ->
(* fire_fn raised an exception (!) - Not reusable *)
false))

let check_reusable x pid =
watchdog 30.0 pid (fun () -> check_reusable_inner x)

let assert_dest_is_ok host =
(* Double check whether we _should_ be able to talk to this host *)
let dest_is_ok =
match !Internal.destination_is_ok with
| Some f -> f host
| None -> true (* No check function set: assume it's OK *)
in
if not dest_is_ok then begin
StunnelDebug.error "Destination host has been marked as offline. Aborting";
raise Stunnel_connection_failed
end

(** Returns an stunnel, either from the persistent cache or a fresh one which
has been checked out and guaranteed to work. *)
let get_reusable_stunnel ?use_fork_exec_helper ?write_to_log ?verify_cert host port =
Expand All @@ -106,7 +166,7 @@ let get_reusable_stunnel ?use_fork_exec_helper ?write_to_log ?verify_cert host p
try
while !found = None do
let (x: Stunnel.t) = Stunnel_cache.remove host port verify_cert in
if check_reusable x.Stunnel.fd
if check_reusable x.Stunnel.fd (Stunnel.getpid x.Stunnel.pid)
then found := Some x
else begin
StunnelDebug.debug "get_reusable_stunnel: Found non-reusable stunnel in the cache. disconnecting from %s:%d" host port;
Expand All @@ -130,14 +190,16 @@ let get_reusable_stunnel ?use_fork_exec_helper ?write_to_log ?verify_cert host p
try
let unique_id = get_new_stunnel_id () in
let (x: Stunnel.t) = Stunnel.connect ~unique_id ?use_fork_exec_helper ?write_to_log ~verify_cert host port in
if check_reusable x.Stunnel.fd
if check_reusable x.Stunnel.fd (Stunnel.getpid x.Stunnel.pid)
then found := Some x
else begin
assert_dest_is_ok host;
StunnelDebug.error "get_reusable_stunnel: fresh stunnel failed reusable check; delaying %.2f seconds before reconnecting to %s:%d (attempt %d / %d)" delay host port !attempt_number max_attempts;
Thread.delay delay;
Stunnel.disconnect x
end
with e ->
assert_dest_is_ok host;
StunnelDebug.error "get_reusable_stunnel: fresh stunnel connection failed with exception: %s: delaying %.2f seconds before reconnecting to %s:%d (attempt %d / %d)" (Printexc.to_string e) delay host port !attempt_number max_attempts;
Thread.delay delay;
done
Expand Down
5 changes: 5 additions & 0 deletions http-svr/xmlrpc_client.mli
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,9 @@ module Internal : sig
is called to allow us to forget the association between a task and an
stunnel pid *)
val unset_stunnelpid_callback : (string option -> int -> unit) option ref

(** Callback to check whether a destination address is still OK. Only called after
a failed attempt to talk to the destination *)
val destination_is_ok : (string -> bool) option ref

end