Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support gpu to the grid client #666

Merged
merged 25 commits into from
Jun 25, 2023
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
b076f81
Add gpu as option to machines and kubernetes
AhmedHanafy725 Jun 20, 2023
32e7048
Add gpu filter to the grid proxy queries
AhmedHanafy725 Jun 20, 2023
b0135cb
Add the missing fields to the node interface coming from gridproxy
AhmedHanafy725 Jun 20, 2023
fe4bce2
Add zos call to get gpu details
AhmedHanafy725 Jun 20, 2023
2218eb1
Add zos gpu call to the scripts
AhmedHanafy725 Jun 20, 2023
4cb9a47
Add checks to make sure the user can use the gpu on the machine
AhmedHanafy725 Jun 20, 2023
eb3b1ba
Add more checks on gpu to make sure that the gpu id is on the node
AhmedHanafy725 Jun 20, 2023
86944d4
Add gpu challenge in zmachine
AhmedHanafy725 Jun 20, 2023
b1f6fb7
Add gpu as option to machines and kubernetes
AhmedHanafy725 Jun 20, 2023
cbb468e
Add gpu filter to the grid proxy queries
AhmedHanafy725 Jun 20, 2023
b3a9255
Add the missing fields to the node interface coming from gridproxy
AhmedHanafy725 Jun 20, 2023
ab0f39b
Add zos call to get gpu details
AhmedHanafy725 Jun 20, 2023
155f2d5
Add zos gpu call to the scripts
AhmedHanafy725 Jun 20, 2023
e84c783
Add checks to make sure the user can use the gpu on the machine
AhmedHanafy725 Jun 20, 2023
b64a8c2
Add more checks on gpu to make sure that the gpu id is on the node
AhmedHanafy725 Jun 20, 2023
4266608
Add gpu challenge in zmachine
AhmedHanafy725 Jun 20, 2023
c9f4ce0
Merge branch 'development_gpu' of github.com:threefoldtech/tfgrid-sdk…
AhmedHanafy725 Jun 22, 2023
cad474e
Add rented by query and fix has gpu
AhmedHanafy725 Jun 22, 2023
08503f1
Add gpu in the result of the vms
AhmedHanafy725 Jun 22, 2023
54e0744
Merge branch 'development' into development_gpu
AhmedHanafy725 Jun 22, 2023
fa47b3a
Add check for the gpu card if it's used to reject the deployment
AhmedHanafy725 Jun 22, 2023
e86ddae
Add example script for deploying a vm with gpu
AhmedHanafy725 Jun 22, 2023
4500681
Make getting the gpu id easier
AhmedHanafy725 Jun 22, 2023
ca3c34c
Add check on the filter to throw a clear error
AhmedHanafy725 Jun 22, 2023
10830d2
gpu -> gpus
AhmedHanafy725 Jun 25, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
79 changes: 79 additions & 0 deletions packages/grid_client/scripts/vm_with_gpu.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import { DiskModel, FilterOptions, MachineModel, MachinesModel, NetworkModel } from "../src";
import { config, getClient } from "./client_loader";
import { log } from "./utils";

/**
 * Example script: deploy a single VM with one GPU card attached.
 *
 * Steps:
 *  1. Filter nodes that have a GPU and are rented by the current twin.
 *  2. Ask the first matching node for its GPU list and keep the cards whose
 *     `contract` is 0 (treated here as "available").
 *  3. Deploy the VM with the first such card, then fetch and log the deployment.
 *
 * The client connection is always released, even if any step throws.
 *
 * @throws Error when no node satisfies the filter or the node has no available GPU card.
 */
async function main() {
  const grid3 = await getClient();

  try {
    // create network Object
    const n = new NetworkModel();
    n.name = "vmgpuNetwork";
    n.ip_range = "10.249.0.0/16";

    // create disk Object
    const disk = new DiskModel();
    disk.name = "vmgpuDisk";
    disk.size = 100;
    disk.mountpoint = "/testdisk";

    const vmQueryOptions: FilterOptions = {
      cru: 8,
      mru: 16, // GB
      sru: 1000,
      availableFor: grid3.twinId,
      hasGPU: true,
      rentedBy: grid3.twinId,
    };

    const nodes = await grid3.capacity.filterNodes(vmQueryOptions);
    if (nodes.length === 0) {
      throw new Error(`Couldn't find a node satisfying these filter options: ${JSON.stringify(vmQueryOptions)}`);
    }
    const nodeId = +nodes[0].nodeId;

    // create vm node Object
    const vm = new MachineModel();
    vm.name = "vmgpu";
    vm.node_id = nodeId;
    vm.disks = [disk];
    vm.public_ip = false;
    vm.planetary = true;
    vm.cpu = 8;
    vm.memory = 1024 * 16;
    vm.rootfs_size = 0;
    vm.flist = "https://hub.grid.tf/tf-official-vms/ubuntu-22.04.flist";
    vm.entrypoint = "/";
    vm.env = {
      SSH_KEY: config.ssh_key,
    };

    // Only cards with contract === 0 are considered available for this deployment.
    const nodeGpus = await grid3.zos.getNodeGPUInfo({ nodeId: nodeId });
    const availableGpus = nodeGpus.filter(g => g.contract === 0);
    if (availableGpus.length === 0) {
      throw new Error(`Couldn't find GPU card available on node ${nodeId}`);
    }
    vm.gpu = [availableGpus[0].id]; // gpu card's id, you can check the available gpu from the dashboard

    // create VMs Object
    const vms = new MachinesModel();
    vms.name = "vmgpu";
    vms.network = n;
    vms.machines = [vm];
    vms.metadata = "";
    vms.description = "test deploying VM with GPU via ts grid3 client";

    // deploy vms
    const res = await grid3.machines.deploy(vms);
    log(res);

    // get the deployment
    const l = await grid3.machines.getObj(vms.name);
    log(l);

    // // delete
    // const d = await grid3.machines.delete({ name: vms.name });
    // log(d);
  } finally {
    // Release the connection on both success and failure so an error does not leak it.
    await grid3.disconnect();
  }
}

main();
6 changes: 6 additions & 0 deletions packages/grid_client/scripts/zos_rmb_requests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ async function main() {
} catch (e) {
log(`Couldn't reach node ${nodeId} to get storage pools due to ${e}`);
}
try {
log(await grid3.zos.getNodeGPUInfo({ nodeId }));
} catch (e) {
log(`Couldn't reach node ${nodeId} to get gpu info due to ${e}`);
}

await grid3.disconnect();
}

Expand Down
4 changes: 4 additions & 0 deletions packages/grid_client/src/high_level/base.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { Addr } from "netaddr";

import { RMB } from "../clients/rmb/client";
import { GridClientConfig } from "../config";
import { events } from "../helpers/events";
import { Operations, TwinDeployment } from "../high_level/models";
Expand All @@ -11,8 +12,11 @@ import { Workload, WorkloadTypes } from "../zos/workload";

class HighLevelBase {
nodes: Nodes;
rmb: RMB;

/**
 * Builds the helpers shared by the high-level deployers from the client config:
 * a `Nodes` instance (graphql URL, proxy URL, RMB client) and an `RMB` wrapper
 * around the same RMB client.
 */
constructor(public config: GridClientConfig) {
  const { graphqlURL, proxyURL, rmbClient } = config;
  this.nodes = new Nodes(graphqlURL, proxyURL, rmbClient);
  this.rmb = new RMB(rmbClient);
}

_filterWorkloads(
Expand Down
4 changes: 4 additions & 0 deletions packages/grid_client/src/high_level/kubernetes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class KubernetesHL extends HighLevelBase {
corex = false,
solutionProviderId: number,
zlogsOutput?: string,
gpu: string[] = [],
Mahmoud-Emad marked this conversation as resolved.
Show resolved Hide resolved
) {
events.emit("logs", `Creating a master with name: ${name} on node: ${nodeId}, network: ${network.name}`);
const machine = new VMHL(this.config);
Expand Down Expand Up @@ -73,6 +74,7 @@ class KubernetesHL extends HighLevelBase {
corex,
solutionProviderId,
zlogsOutput,
gpu,
);
}

Expand Down Expand Up @@ -100,6 +102,7 @@ class KubernetesHL extends HighLevelBase {
corex = false,
solutionProviderId: number,
zlogsOutput?: string,
gpu: string[] = [],
Mahmoud-Emad marked this conversation as resolved.
Show resolved Hide resolved
) {
events.emit("logs", `Creating a worker with name: ${name} on node: ${nodeId}, network: ${network.name}`);
const machine = new VMHL(this.config);
Expand Down Expand Up @@ -141,6 +144,7 @@ class KubernetesHL extends HighLevelBase {
corex,
solutionProviderId,
zlogsOutput,
gpu,
);
}

Expand Down
25 changes: 25 additions & 0 deletions packages/grid_client/src/high_level/machine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class VMHL extends HighLevelBase {
corex = false,
solutionProviderId: number,
zlogsOutput?: string,
gpu: string[] = [],
Mahmoud-Emad marked this conversation as resolved.
Show resolved Hide resolved
): Promise<[TwinDeployment[], string]> {
const deployments: TwinDeployment[] = [];
const workloads: Workload[] = [];
Expand Down Expand Up @@ -120,6 +121,7 @@ class VMHL extends HighLevelBase {
}

// ipv4
// TODO: make sure that the farm has a free public ip before continuing the deployment
let ipName = "";
let publicIps = 0;
if (publicIp || publicIp6) {
Expand All @@ -131,6 +133,28 @@ class VMHL extends HighLevelBase {
}
}

if (gpu && gpu.length > 0) {
const nodeTwinId = await this.nodes.getNodeTwinId(nodeId);
const gpuList = await this.rmb.request([nodeTwinId], "zos.gpu.list", "");
if (gpuList.length <= 0) {
throw Error(`The selected node ${nodeId} doesn't have GPU card`);
}
for (const g of gpu) {
Mahmoud-Emad marked this conversation as resolved.
Show resolved Hide resolved
const found = gpuList.filter(item => item.id === g);
if (found.length === 0) {
throw Error(`Couldn't find the GPU with id: "${g}" in node: ${nodeId}`);
}
if (found[0].contract !== 0) {
throw Error(`This GPU: "${g}" is currently in use by another VM with contract id: ${found[0].contract}`);
}
}

const node = await this.nodes.getNode(nodeId);
if (node.rentedByTwinId !== this.config.twinId) {
throw Error(`This node ${nodeId} is not rented by the current user`);
}
}

// validate user ip subnet in case of no networks already
let userIPsubnet;
let accessNodeSubnet;
Expand Down Expand Up @@ -247,6 +271,7 @@ class VMHL extends HighLevelBase {
description,
0,
corex,
gpu,
),
);

Expand Down
1 change: 1 addition & 0 deletions packages/grid_client/src/modules/base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ class BaseModule {
description: workload.description,
rootfs_size: data.size,
corex: data.corex,
gpu: data.gpu,
};
}

Expand Down
3 changes: 3 additions & 0 deletions packages/grid_client/src/modules/k8s.ts
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ class K8sModule extends BaseModule {
master.corex,
master.solutionProviderId!,
master.zlogsOutput,
master.gpu,
);

deployments = deployments.concat(twinDeployments);
Expand Down Expand Up @@ -158,6 +159,7 @@ class K8sModule extends BaseModule {
worker.corex,
worker.solutionProviderId!,
worker.zlogsOutput,
worker.gpu,
);

deployments = deployments.concat(twinDeployments);
Expand Down Expand Up @@ -295,6 +297,7 @@ class K8sModule extends BaseModule {
options.corex,
options.solutionProviderId!,
options.zlogsOutput,
options.gpu,
);

return await this._add(options.deployment_name, options.node_id, oldDeployments, twinDeployments, network);
Expand Down
2 changes: 2 additions & 0 deletions packages/grid_client/src/modules/machines.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class MachinesModule extends BaseModule {
machine.corex,
machine.solutionProviderId!,
machine.zlogsOutput,
machine.gpu,
);
twinDeployments = twinDeployments.concat(TDeployments);
if (wgConfig) {
Expand Down Expand Up @@ -188,6 +189,7 @@ class MachinesModule extends BaseModule {
options.corex,
options.solutionProviderId!,
options.zlogsOutput,
options.gpu,
);
return await this._add(options.deployment_name, options.node_id, oldDeployments, twinDeployments, network);
}
Expand Down
6 changes: 5 additions & 1 deletion packages/grid_client/src/modules/models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ class MachineModel {
@Expose() @IsOptional() @IsBoolean() corex?: boolean;
@Expose() @IsInt() @IsOptional() solutionProviderId?: number;
@Expose() @IsString() @IsOptional() zlogsOutput?: string;
@Expose() @IsString({ each: true }) @IsOptional() gpu?: string[];
}

class MachinesModel {
Expand Down Expand Up @@ -138,6 +139,7 @@ class KubernetesNodeModel {
@Expose() @IsOptional() @IsBoolean() corex?: boolean;
@Expose() @IsInt() @IsOptional() solutionProviderId?: number;
@Expose() @IsString() @IsOptional() zlogsOutput?: string;
@Expose() @IsString({ each: true }) @IsOptional() gpu?: string[];
}

class K8SModel {
Expand Down Expand Up @@ -556,9 +558,11 @@ class FilterOptions {
@Expose() @IsOptional() @IsString() country?: string;
@Expose() @IsOptional() @IsString() city?: string;
@Expose() @IsOptional() @IsBoolean() dedicated?: boolean;
@Expose() @IsOptional() @IsInt() availableFor?: number;
@Expose() @IsOptional() @IsInt() @Min(1) availableFor?: number;
@Expose() @IsOptional() @IsInt() page?: number;
@Expose() @IsOptional() @IsInt() size?: number;
@Expose() @IsOptional() @IsBoolean() hasGPU?: boolean;
@Expose() @IsOptional() @IsInt() @Min(1) rentedBy?: number;
}

class CalculatorModel {
Expand Down
7 changes: 7 additions & 0 deletions packages/grid_client/src/modules/zos.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ class Zos {
const nodeTwinId = await this.capacity.getNodeTwinId(options.nodeId);
return await this.rmb.request([nodeTwinId], "zos.storage.pools", "");
}

/**
 * Lists the GPU cards reported by a node.
 *
 * Resolves the node's twin id through the capacity helper, then sends a
 * `zos.gpu.list` RMB request with an empty payload to that twin.
 *
 * @param options - carries the `nodeId` of the node to query.
 * @returns whatever the RMB request resolves to for `zos.gpu.list`.
 */
@expose
@validateInput
async getNodeGPUInfo(options: ZOSNodeModel) {
  const { nodeId } = options;
  const destinationTwinId = await this.capacity.getNodeTwinId(nodeId);
  return await this.rmb.request([destinationTwinId], "zos.gpu.list", "");
}
}

export { Zos as zos };
5 changes: 5 additions & 0 deletions packages/grid_client/src/primitives/nodes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ interface NodeInfo {
publicConfig: PublicConfig;
status: string;
certificationType: string;
hasGPU: boolean;
extraFee: number;
rentedByTwinId: number;
}
interface PublicConfig {
domain: string;
Expand Down Expand Up @@ -332,6 +335,8 @@ class Nodes {
status: "up",
page: options.page,
size: options.size,
has_gpu: options.hasGPU,
rented_by: options.rentedBy,
Mahmoud-Emad marked this conversation as resolved.
Show resolved Hide resolved
};
if (options.gateway) {
params["ipv4"] = true;
Expand Down
2 changes: 2 additions & 0 deletions packages/grid_client/src/primitives/vm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class VMPrimitive {
description = "",
version = 0,
corex = false,
gpu: string[] = [],
): Workload {
const zmachine = new Zmachine();
zmachine.flist = flist;
Expand All @@ -49,6 +50,7 @@ class VMPrimitive {
zmachine.compute_capacity = this._createComputeCapacity(cpu, memory);
zmachine.env = env;
zmachine.corex = corex;
zmachine.gpu = gpu;

const zmachine_workload = new Workload();
zmachine_workload.version = version || 0;
Expand Down
19 changes: 18 additions & 1 deletion packages/grid_client/src/zos/zmachine.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
import { Expose, Transform, Type } from "class-transformer";
import { IsBoolean, IsDefined, IsInt, IsIP, IsNotEmpty, IsString, Max, Min, ValidateNested } from "class-validator";
import {
IsBoolean,
IsDefined,
IsInt,
IsIP,
IsNotEmpty,
IsOptional,
IsString,
Max,
Min,
ValidateNested,
} from "class-validator";

import { ComputeCapacity } from "./computecapacity";
import { WorkloadData, WorkloadDataResult } from "./workload_base";
Expand Down Expand Up @@ -47,6 +58,7 @@ class Zmachine extends WorkloadData {
@Expose() @IsString() @IsNotEmpty() entrypoint: string;
@Expose() env: Record<string, unknown>;
@Expose() @Transform(({ value }) => (value ? true : false)) @IsBoolean() corex: boolean;
@Expose() @IsString({ each: true }) @IsOptional() gpu?: string[];

challenge(): string {
let out = "";
Expand All @@ -62,6 +74,11 @@ class Zmachine extends WorkloadData {
out += key;
out += "=";
out += this.env[key];
if (this.gpu) {
for (const g of this.gpu) {
out += g;
}
}
}
return out;
}
Expand Down