Skip to content

Commit

Permalink
Merge pull request #3418 from minishrink/gpu_ppx
Browse files Browse the repository at this point in the history
CP-26717: Update Xapi to use PPX-based Gpumon IDL
  • Loading branch information
mseri committed Feb 2, 2018
2 parents 10608a2 + 4b3baa8 commit 2a78d9d
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 41 deletions.
61 changes: 30 additions & 31 deletions ocaml/xapi/xapi_gpumon.ml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ module Nvidia = struct
let get_pgpu_compatibility_metadata ~dbg ~pgpu_pci_address =
let metadata =
pgpu_pci_address
|> Gpumon_client.Client.Nvidia.get_pgpu_metadata dbg
|> Gpumon_client.Client.Nvidia.get_pgpu_metadata dbg
|> Stdext.Base64.encode
in [key, metadata]

Expand All @@ -70,30 +70,30 @@ module Nvidia = struct
* one vGPU per VM. The underdyling problem is that there is no
* mapping between a vGPU and vgpu_instance in the NVIDIA library
* currently.
*)
*)
let get_vgpu_compatibility_metadata ~__context ~vgpu =
let this = "get_vgpu_compatibility_metadata" in
try
let dbg = Context.string_of_task __context in
let vm = Db.VGPU.get_VM ~__context ~self:vgpu in
let domid = Db.VM.get_domid ~__context ~self:vm |> Int64.to_int in
Db.VGPU.get_resident_on ~__context ~self:vgpu
|> fun self -> Db.PGPU.get_PCI ~__context ~self
|> fun self -> Db.PCI.get_pci_id ~__context ~self
|> (fun self -> Db.PGPU.get_PCI ~__context ~self)
|> (fun self -> Db.PCI.get_pci_id ~__context ~self)
|> Gpumon_client.Client.Nvidia.get_vgpu_metadata dbg domid
|> function
| [] -> []
| [meta] -> [key, Stdext.Base64.encode meta]
| _::_ -> failwith @@ Printf.sprintf
"%s: VM %s (dom %d) has more than one NVIDIA vGPU (%s)"
this (Ref.string_of vm) domid __LOC__
|> (function
| [] -> []
| [meta] -> [key, Stdext.Base64.encode meta]
| _::_ -> failwith @@ Printf.sprintf
"%s: VM %s (dom %d) has more than one NVIDIA vGPU (%s)"
this (Ref.string_of vm) domid __LOC__)
with
| Gpumon_interface.NvmlInterfaceNotAvailable ->
let host = Helpers.get_localhost ~__context |> Ref.string_of in
raise Api_errors.(Server_error (nvidia_tools_error, [host]))
| err ->
let msg = Printexc.to_string err in
raise Api_errors.(Server_error (internal_error, [msg]))
| Gpumon_interface.(Gpumon_error NvmlInterfaceNotAvailable) ->
let host = Helpers.get_localhost ~__context |> Ref.string_of in
raise Api_errors.(Server_error (nvidia_tools_error, [host]))
| err ->
let msg = Printexc.to_string err in
raise Api_errors.(Server_error (internal_error, [msg]))


(* N.B. the vgpu (and the vm) must be in the local host where this function runs *)
Expand All @@ -102,30 +102,29 @@ module Nvidia = struct
let vm_domid = Int64.to_int (Db.VM.get_domid ~__context ~self:vm) in
let pgpu_metadata = Stdext.Base64.decode encoded_pgpu_metadata in
match vgpu_impl ~__context vgpu with
| `passthrough | `gvt_g | `mxgpu ->
| `passthrough | `gvt_g | `mxgpu ->
debug "Skipping, vGPU %s implementation for VM %s is not Nvidia" (Ref.string_of vgpu) (Ref.string_of vm)
| `nvidia ->
let local_pgpu_address =
let local_pgpu_address =
Db.VGPU.get_resident_on ~__context ~self:vgpu
|> (fun self -> Db.PGPU.get_PCI ~__context ~self)
|> (fun self -> Db.PCI.get_pci_id ~__context ~self)
in
let compatibility =
let compatibility =
try
Gpumon_client.Client.Nvidia.get_pgpu_vm_compatibility dbg
Gpumon_client.Client.Nvidia.get_pgpu_vm_compatibility dbg
local_pgpu_address vm_domid pgpu_metadata
with
| Gpumon_interface.NvmlInterfaceNotAvailable ->
| Gpumon_interface.(Gpumon_error NvmlInterfaceNotAvailable) ->
let host = Db.VM.get_resident_on ~__context ~self:vm in
raise Api_errors.(Server_error (nvidia_tools_error, [Ref.string_of host]))
| err -> raise Api_errors.(Server_error (internal_error, [Printexc.to_string err]))
in
let open Gpumon_interface in
match compatibility with
| Compatible ->
| Gpumon_interface.Compatible ->
info "VM %s Nvidia vGPU is compatible with the destination pGPU on host %s"
(Ref.string_of vm) (Ref.string_of dest_host)
| Incompatible reasons ->
| Gpumon_interface.(Incompatible reasons) ->
raise Api_errors.(Server_error (
vgpu_destination_incompatible,
[ String.concat ", " (List.map reason_to_string reasons)
Expand All @@ -138,7 +137,7 @@ module Nvidia = struct
* compatible according to their abstract compatibility metadata. This
* code can run on any host. If no vGPU or pGPU metadata is
* available, compatibility is assumed.
*)
*)
let vgpu_pgpu_are_compatible ~__context ~vgpu ~pgpu =
let dbg = Context.string_of_task __context in
let localhost = Helpers.get_localhost ~__context in
Expand All @@ -151,7 +150,7 @@ module Nvidia = struct
|> Stdext.Base64.decode in
let vgpu_metadata () =
Db.VGPU.get_compatibility_metadata ~__context ~self:vgpu
|> fun md -> [List.assoc key md]
|> (fun md -> [List.assoc key md])
|> List.map Stdext.Base64.decode in
match
Gpumon_client.Client.Nvidia.get_pgpu_vgpu_compatibility
Expand All @@ -177,7 +176,7 @@ module Nvidia = struct
(reasons |> List.map reason_to_string |> String.concat "; ");
false
(* errors *)
| exception Gpumon_interface.NvmlInterfaceNotAvailable ->
| exception Gpumon_interface.(Gpumon_error NvmlInterfaceNotAvailable) ->
raise Api_errors.(Server_error (nvidia_tools_error, [localhost']))
| exception err ->
raise Api_errors.(Server_error
Expand Down Expand Up @@ -211,9 +210,9 @@ let clear_vgpu_metadata ~__context ~vm =
Db.VM.get_VGPUs ~__context ~self:vm
|> List.filter (fun vgpu -> Db.is_valid_ref __context vgpu)
|> List.iter (fun vgpu ->
Db.VGPU.get_compatibility_metadata ~__context ~self:vgpu
|> List.filter (fun (k,_) -> k <> Nvidia.key)
|> fun value ->
Db.VGPU.set_compatibility_metadata ~__context ~self:vgpu ~value)
Db.VGPU.get_compatibility_metadata ~__context ~self:vgpu
|> List.filter (fun (k,_) -> k <> Nvidia.key)
|> fun value ->
Db.VGPU.set_compatibility_metadata ~__context ~self:vgpu ~value)


20 changes: 10 additions & 10 deletions ocaml/xapi/xapi_pgpu.ml
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,17 @@ let calculate_max_capacities ~__context ~pCI ~size ~supported_VGPU_types =

let fetch_compatibility_metadata ~__context ~pgpu_pci =
if Db.PCI.get_vendor_id ~__context ~self:pgpu_pci =
Xapi_pci.id_of_int Xapi_vgpu_type.Nvidia.vendor_id
Xapi_pci.id_of_int Xapi_vgpu_type.Nvidia.vendor_id
then (
let dbg = Context.string_of_task __context in
let pgpu_pci_address = Db.PCI.get_pci_id ~__context ~self:pgpu_pci in
Xapi_gpumon.Nvidia.get_pgpu_compatibility_metadata ~dbg ~pgpu_pci_address
let dbg = Context.string_of_task __context in
let pgpu_pci_address = Db.PCI.get_pci_id ~__context ~self:pgpu_pci in
Xapi_gpumon.Nvidia.get_pgpu_compatibility_metadata ~dbg ~pgpu_pci_address
) else []

let maybe_fetch_compatibility_metadata ~__context ~pgpu_pci =
try fetch_compatibility_metadata ~__context ~pgpu_pci
with
| Gpumon_interface.NvmlInterfaceNotAvailable -> []
| Gpumon_interface.(Gpumon_error NvmlInterfaceNotAvailable) -> []
| err ->
debug "fetch_compatibility_metadata for pgpu_pci:%s failed with %s"
(Ref.string_of pgpu_pci) (Printexc.to_string err);
Expand All @@ -53,12 +53,12 @@ let populate_compatibility_metadata ~__context ~pgpu ~pgpu_pci =
let value = fetch_compatibility_metadata ~__context ~pgpu_pci in
Db.PGPU.set_compatibility_metadata ~__context ~self:pgpu ~value
with
| Gpumon_interface.NvmlInterfaceNotAvailable ->
info "%s: can't get compat data for pgpu_pci:%s, keeping existing data"
this (Ref.string_of pgpu_pci)
| Gpumon_interface.(Gpumon_error NvmlInterfaceNotAvailable) ->
info "%s: can't get compat data for pgpu_pci:%s, keeping existing data"
this (Ref.string_of pgpu_pci)
| err ->
debug "%s: obtaining compat data for pgpu_pci:%s failed with %s"
this (Ref.string_of pgpu_pci) (Printexc.to_string err)
debug "%s: obtaining compat data for pgpu_pci:%s failed with %s"
this (Ref.string_of pgpu_pci) (Printexc.to_string err)

let create ~__context ~pCI ~gPU_group ~host ~other_config
~supported_VGPU_types ~size ~dom0_access
Expand Down

0 comments on commit 2a78d9d

Please sign in to comment.