Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CP-26717: Port Gpumon to PPX-based RPCs #196

Merged
merged 5 commits into from
Feb 2, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions gpumon/gpumon_cli.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@

(* Gpumon CLI *)

(** Command-line bindings for the gpumon API: the [Gpumon_interface.RPC_API]
    functor applied to a fresh [Cmdlinergen.Gen ()] instance. *)
module Cmds = Gpumon_interface.RPC_API(Cmdlinergen.Gen ())

(** [version_str description] formats the [version] triple of an
    [Idl.Interface] description as a dotted ["major.minor.micro"] string. *)
let version_str description =
  let major, minor, micro = description.Idl.Interface.version in
  [ major; minor; micro ] |> List.map string_of_int |> String.concat "."

(** Default command, evaluated when no sub-command is selected: a term that
    requests the help page through the pager, paired with the tool's
    [Cmdliner] info (name, version string and one-paragraph doc). *)
let default_cmd =
  let doc =
    "A CLI for the GPU monitoring API. This allows scripting of the gpumon daemon \
     for testing and debugging. This tool is not intended to be used as an \
     end user tool"
  in
  let show_help () = `Help (`Pager, None) in
  let term = Cmdliner.Term.(ret (const show_help $ const ())) in
  let info =
    Cmdliner.Term.info "gpumon_cli" ~version:(version_str Cmds.description) ~doc
  in
  (term, info)

(** [cli ()] applies every command produced by [Cmds.implementation ()] to the
    [Gpumon_client.rpc] transport and evaluates the resulting choice of
    commands, falling back to [default_cmd]. *)
let cli () =
  let commands =
    Cmds.implementation ()
    |> List.map (fun mk_cmd -> mk_cmd Gpumon_client.rpc)
  in
  Cmdliner.Term.eval_choice default_cmd commands

(* Program entry point: run the CLI. The [let _] binding discards whatever
   [cli ()] returns. *)
let _ = cli ()
25 changes: 10 additions & 15 deletions gpumon/gpumon_client.ml
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,14 @@
* GNU Lesser General Public License for more details.
*)

open Gpumon_interface
open Xcp_client

let xml_url () = "file:" ^ xml_path

module Client = Gpumon_interface.Client(struct
let rpc call =
if !use_switch
then json_switch_rpc queue_name call
else xml_http_rpc
~srcstr:(get_user_agent ())
~dststr:"gpumon"
xml_url
call
end)
(** [xml_url ()] is the ["file:"] scheme prefix followed by
    [Gpumon_interface.xml_path]. *)
let xml_url () =
  let path = Gpumon_interface.xml_path in
  "file:" ^ path

(** [rpc call] sends [call] to gpumon. When [!Xcp_client.use_switch] is set the
    call goes through [Xcp_client.json_switch_rpc] on
    [Gpumon_interface.queue_name]; otherwise it goes through
    [Xcp_client.xml_http_rpc] to the address given by [xml_url]. *)
let rpc call =
  match !Xcp_client.use_switch with
  | true -> Xcp_client.json_switch_rpc Gpumon_interface.queue_name call
  | false ->
    Xcp_client.xml_http_rpc
      ~srcstr:(Xcp_client.get_user_agent ())
      ~dststr:"gpumon"
      xml_url
      call
(** Client bindings for the gpumon API: [Gpumon_interface.RPC_API] applied to
    [Idl.GenClientExnRpc] over the [rpc] transport defined in this file. *)
module Client =
  Gpumon_interface.RPC_API (Idl.GenClientExnRpc (struct
    let rpc = rpc
  end))
181 changes: 141 additions & 40 deletions gpumon/gpumon_interface.ml
Original file line number Diff line number Diff line change
Expand Up @@ -12,60 +12,161 @@
* GNU Lesser General Public License for more details.
*)

open Rpc
open Idl

(** Name of the service. *)
let service_name = "gpumon"
(** Queue name: [Xcp_service.common_prefix] followed by [service_name]. *)
let queue_name = Xcp_service.common_prefix ^ service_name
(** Path under [/var/xapi/] named after the service; the client prefixes it
    with ["file:"] to build its XML-RPC URL. *)
let xml_path = "/var/xapi/" ^ service_name

(** Uninterpreted string associated with the operation *)
type debug_info = string
[@@deriving rpcty]

(** Domain ID of VM *)
type domid = int
[@@deriving rpcty]

(** Reason for incompatibility *)
type incompatibility_reason =
| Host_driver
| Guest_driver
| GPU
| Other
[@@deriving rpcty]

type incompatibility_reason = Host_driver | Guest_driver | GPU | Other
type compatibility = Compatible | Incompatible of incompatibility_reason list
(** Compatibility between virtual and physical GPU *)
type compatibility =
| Compatible
| Incompatible of incompatibility_reason list
[@@deriving rpcty]

(** PCI identifier of physical GPU *)
type pgpu_address = string
[@@deriving rpcty]

(** Metadata of Nvidia physical GPU *)
type nvidia_pgpu_metadata = string
[@@deriving rpcty]

(** Metadata of Nvidia virtual GPU *)
type nvidia_vgpu_metadata = string
[@@deriving rpcty]

(** List of Nvidia virtual GPU metadata records *)
type nvidia_vgpu_metadata_list = nvidia_vgpu_metadata list
[@@deriving rpcty]


(** Error wrapper *)
type gpu_errors =
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One thing I'm uncertain about is whether we need an error for compatibility issues. I assume these are already handled, but is it worth including one just in case?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The compatibility errors are only for the Nvidia module at the moment; we will need new ones for ATI or Intel if they are ever needed. How to deal with that is, I think, open for discussion:

It is probably not worth splitting:

type nvml_error = 
   | NvmlInterfaceNotAvailable
   (** Exception raised when gpumon is unable to load the nvml nvidia library *)
   | NvmlFailure of string
   (** Exception raised by the c bindings to the nvml nvidia library*)
[@@deriving rpcty]

(** Error wrapper *)
 type gpu_errors =
   | Nvml of nvml_error
   (** Error raised by the Nvml library bindings *)
   | Gpumon_failure
   (** Default exception raised upon daemon failure *)
 [@@default Gpumon_failure]
[@@deriving rpcty]

| NvmlInterfaceNotAvailable
(** Exception raised when gpumon is unable to load the nvml nvidia library *)
| NvmlFailure of string
(** Exception raised by the C bindings to the nvml nvidia library *)
| Gpumon_failure
(** Default exception raised upon daemon failure *)
[@@default Gpumon_failure]
[@@deriving rpcty]

(** Exception carrying a [gpu_errors] value. *)
exception Gpumon_error of gpu_errors

(** Error handler: [Error.Make] instantiated with the [gpu_errors] type and
    the value [gpu_errors] (presumably the type descriptor generated by
    [[@@deriving rpcty]] -- confirm). *)
module GpuErrors = Error.Make (struct
  type t = gpu_errors

  let t = gpu_errors
end)

(** Error descriptor that the API declarations pass to [returning]. *)
let gpu_err = GpuErrors.error

(** Functor to autogenerate API calls *)
module RPC_API(R : RPC) = struct
open R

(** Shorthand for [Param.mk], used to build the parameter descriptors. *)
let param = Param.mk

(** Exception raised when gpumon is unable to load the nvml nvidia library *)
exception NvmlInterfaceNotAvailable
(** Exception raised by the c bindings to the nvml nvidia library*)
exception NvmlFailure of string
(** Interface metadata: name, namespace, descriptive text and version. *)
let description =
  let open Interface in
  { name = "Gpumon"
  ; namespace = None
  ; description =
      [ "This interface is used by Xapi and Gpumon to monitor "
      ; "physical and virtual GPUs." ]
  ; version = (1, 0, 0)
  }

(** Obtained by applying [implement] to [description]. *)
let implementation = implement description

module Nvidia = struct
(** Compatibility checking interface for Nvidia vGPUs *)
module Nvidia = struct

(** Common API call parameters. Each one is built with [param] (an alias for
    [Param.mk]) from a human-readable description and a type descriptor. *)

let debug_info_p = param ~description:
["Uninterpreted string used for debugging."]
debug_info

let domid_p = param ~description:
["Domain ID of the VM in which the vGPU(s) is running."]
domid

let pgpu_address_p = param ~description:
["PCI bus ID of the pGPU in which the VM is currently running"
;"in the form `domain:bus:device.function` PCI identifier."]
pgpu_address

let nvidia_pgpu_metadata_p = param ~description:
["Metadata of Nvidia physical GPU."]
nvidia_pgpu_metadata

let nvidia_vgpu_metadata_p = param ~description:
["Metadata of Nvidia virtual GPU."]
nvidia_vgpu_metadata

let nvidia_vgpu_metadata_list_p = param ~description:
["Metadata list of Nvidia virtual GPU."]
nvidia_vgpu_metadata_list

let compatibility_p = param ~description:
[ "Value indicating whether two or more GPUs are compatible with each other." ]
compatibility

(** RPC [get_pgpu_metadata]: takes debug info and a pGPU address and returns
    Nvidia pGPU metadata, with errors described by [gpu_err]. *)
let get_pgpu_metadata =
  let signature =
    debug_info_p
    @-> pgpu_address_p
    @-> returning nvidia_pgpu_metadata_p gpu_err
  in
  declare
    "get_pgpu_metadata"
    [ "Gets the metadata for a pGPU, given its address (PCI bus ID)." ]
    signature

(** RPC [get_pgpu_vm_compatibility]: takes debug info, a pGPU address, a
    domain ID and Nvidia pGPU metadata, and returns a compatibility value,
    with errors described by [gpu_err]. *)
let get_pgpu_vm_compatibility =
  let signature =
    debug_info_p
    @-> pgpu_address_p
    @-> domid_p
    @-> nvidia_pgpu_metadata_p
    @-> returning compatibility_p gpu_err
  in
  declare
    "get_pgpu_vm_compatibility"
    [ "Checks compatibility between a VM's vGPU(s) and another pGPU." ]
    signature

(** RPC [get_vgpu_metadata]: takes debug info, a domain ID and a pGPU address
    and returns a list of Nvidia vGPU metadata, with errors described by
    [gpu_err]. *)
let get_vgpu_metadata =
  let signature =
    debug_info_p
    @-> domid_p
    @-> pgpu_address_p
    @-> returning nvidia_vgpu_metadata_list_p gpu_err
  in
  declare
    "get_vgpu_metadata"
    [ "Obtains metadata for all vGPUs running in a domain." ]
    signature

(** Get the metadata for a pGPU, given its address (PCI bus ID). *)
external get_pgpu_metadata: debug_info -> pgpu_address -> nvidia_pgpu_metadata = ""

(** Check compatibility between a VM's vGPU(s) and another pGPU.
* pgpu_address = PCI bus ID of the pGPU in which the VM is currently running
* in the form `domain:bus:device.function` PCI identifier.
* domid = domain ID of the VM in which the vGPU(s) is running.
* pgpu_metadata = metadata of the pGPU to check compatibility for. *)
external get_pgpu_vm_compatibility: debug_info -> pgpu_address -> domid -> nvidia_pgpu_metadata -> compatibility = ""

(** Obtain meta data for all vGPUs running in a domain. The
* [pgpu_address] is a PCI identifier of the form
* domain:bus:device.function
*)
external get_vgpu_metadata
: debug_info
-> domid
-> pgpu_address
-> nvidia_vgpu_metadata list
= ""

(** Check compatibility between a pGPU (on a host) and a list of vGPUs
* (assigned to a VM). The use case is VM.suspend/VM.resume: before
* VM.resume [nvidia_vgpu_metadata] of the suspended VM is checked
* against the [nvidia_pgpu_metadata] on the host where the VM is
* resumed. A VM may use several vGPUs.
*)
external get_pgpu_vgpu_compatibility
: debug_info
-> nvidia_pgpu_metadata
-> nvidia_vgpu_metadata list
-> compatibility
= ""
(** RPC [get_pgpu_vgpu_compatibility]: takes debug info, Nvidia pGPU metadata
    and a list of Nvidia vGPU metadata, and returns a compatibility value,
    with errors described by [gpu_err]. *)
let get_pgpu_vgpu_compatibility =
  let signature =
    debug_info_p
    @-> nvidia_pgpu_metadata_p
    @-> nvidia_vgpu_metadata_list_p
    @-> returning compatibility_p gpu_err
  in
  let doc =
    [ "Checks compatibility between a pGPU (on a host) and a list of vGPUs "
    ; "(assigned to a VM). Note: A VM may use several vGPUs."
    ; "The use case is VM.suspend/VM.resume:"
    ; "before VM.resume [nvidia_vgpu_metadata] of the suspended VM is "
    ; "checked against the [nvidia_pgpu_metadata] on the host where the VM "
    ; "is resumed." ]
  in
  declare "get_pgpu_vgpu_compatibility" doc signature
end
end
36 changes: 18 additions & 18 deletions gpumon/jbuild
Original file line number Diff line number Diff line change
Expand Up @@ -22,35 +22,35 @@ let coverage_rewriter =
else
""

let rewriters_camlp4 = ["rpclib.idl -syntax camlp4o"]
let rewriters_ppx = ["ppx_deriving_rpc"; "ppx_sexp_conv"]
let rewriters = ["ppx_deriving_rpc"]

let () = Printf.ksprintf Jbuild_plugin.V1.send {|
(jbuild_version 1)

(library
((name xapi_gpumon_interface)
(public_name xcp.gpumon.interface)
(modules (gpumon_interface))
(flags (:standard -w -39 %s))
(libraries
(rpclib
threads
xcp))
(wrapped false)
%s))

(library
((name xapi_gpumon)
(public_name xcp.gpumon)
(modules (:standard \ gpumon_interface))
(flags (:standard -w -39-33 %s))
(modules (:standard \ gpumon_cli ))
(libraries
(rpclib
threads
xcp
xapi_gpumon_interface))
xcp))
(wrapped false)
%s))

|} (flags rewriters_camlp4) coverage_rewriter (flags rewriters_ppx) coverage_rewriter
(executable
((name gpumon_cli)
(modules (gpumon_cli))
(libraries
(cmdliner
rpclib.cmdliner
rpclib.markdown
xcp.gpumon))))

(alias
((name runtest)
(deps (gpumon_cli.exe))
(action (run ${<}))))

|} (flags rewriters) coverage_rewriter