This repository has been archived by the owner on Dec 30, 2020. It is now read-only.
/
workload.proto
257 lines (225 loc) · 7.01 KB
/
workload.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
// Copyright (c) 2019 Sylabs, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package api;
import "google/protobuf/timestamp.proto";
import "google/protobuf/duration.proto";
// WorkloadManager is the API used to interact with an HPC workload
// manager, e.g. Slurm.
service WorkloadManager {
  // SubmitJob hands a new job to the workload manager. The response
  // carries a job ID that can be used to track the job status.
  rpc SubmitJob(SubmitJobRequest) returns (SubmitJobResponse);

  // SubmitJobContainer hands a new job, described by a
  // SubmitJobContainerRequest, to the workload manager. The response
  // carries a job ID that can be used to track the job status.
  rpc SubmitJobContainer(SubmitJobContainerRequest) returns (SubmitJobContainerResponse);

  // CancelJob cancels the job with the given job ID.
  rpc CancelJob(CancelJobRequest) returns (CancelJobResponse);

  // JobInfo returns complete information about a particular job.
  // For a job array the first entry of the returned list is the root
  // job. The response must contain at least one element.
  rpc JobInfo(JobInfoRequest) returns (JobInfoResponse);

  // JobSteps returns information about each individual step of a job.
  rpc JobSteps(JobStepsRequest) returns (JobStepsResponse);

  // OpenFile opens a file and streams its content back as chunks.
  // May be useful for collecting results.
  rpc OpenFile(OpenFileRequest) returns (stream Chunk);

  // TailFile opens a file and streams its content back as chunks.
  // Unlike OpenFile, this call watches the file for content changes
  // and keeps streaming new chunks continuously.
  rpc TailFile(stream TailFileRequest) returns (stream Chunk);

  // Resources returns the resources of a partition: nodes, cpu, mem,
  // wall-time and available features.
  rpc Resources(ResourcesRequest) returns (ResourcesResponse);

  // Partitions returns a list of available partitions.
  rpc Partitions(PartitionsRequest) returns (PartitionsResponse);

  // WorkloadInfo provides info about the workload manager
  // (name, version, red-box uid).
  rpc WorkloadInfo(WorkloadInfoRequest) returns (WorkloadInfoResponse);
}
// SubmitJobRequest is the input of the SubmitJob call.
message SubmitJobRequest {
  // Bash script to be submitted to the workload manager.
  string script = 1;

  // Partition the job should be submitted to.
  string partition = 2;

  // ID of the client submitting this job.
  string client_id = 3;
}
// SubmitJobResponse is the output of the SubmitJob call.
message SubmitJobResponse {
  // ID under which the submitted job can be tracked.
  int64 job_id = 1;
}
// CancelJobRequest is the input of the CancelJob call.
message CancelJobRequest {
  // ID of the job that should be cancelled.
  int64 job_id = 1;
}
// CancelJobResponse is the output of the CancelJob call. It currently
// declares no fields.
message CancelJobResponse {}
// JobInfoRequest is the input of the JobInfo call.
message JobInfoRequest {
  // ID of the job whose information is requested.
  int64 job_id = 1;
}
// JobInfoResponse is the output of the JobInfo call.
message JobInfoResponse {
  // Information about the requested job, one entry per job.
  repeated JobInfo info = 1;
}
// JobStepsRequest is the input of the JobSteps call.
message JobStepsRequest {
  // ID of the job whose steps are requested.
  int64 job_id = 1;
}
// JobStepsResponse is the output of the JobSteps call.
message JobStepsResponse {
  // Information about the job's steps, one entry per step.
  repeated JobStepInfo job_steps = 1;
}
// OpenFileRequest is the input of the OpenFile call.
message OpenFileRequest {
  // Path of the file to open.
  string path = 1;
}
// ResourcesRequest is the input of the Resources call.
message ResourcesRequest {
  // Partition whose resources should be returned.
  string partition = 1;
}
// ResourcesResponse is the output of the Resources call and describes
// the resources of a single partition.
message ResourcesResponse {
  // Number of nodes in the partition.
  int64 nodes = 1;

  // Number of cpus on each node.
  int64 cpuPerNode = 2;

  // Amount of memory on each node.
  // NOTE(review): the unit is not stated in this file - confirm.
  int64 memPerNode = 3;

  // Wall time setting of the partition.
  // NOTE(review): the unit is not stated in this file - confirm.
  int64 wallTime = 4;

  // Features offered by the partition.
  repeated Feature features = 5;
}
// PartitionsRequest is the input of the Partitions call. It currently
// declares no fields.
message PartitionsRequest {}
// PartitionsResponse is the output of the Partitions call.
message PartitionsResponse {
  // Available partitions, one entry per partition.
  repeated string partition = 1;
}
// WorkloadInfoRequest is the input of the WorkloadInfo call. It
// currently declares no fields.
message WorkloadInfoRequest {}
// WorkloadInfoResponse is the output of the WorkloadInfo call.
message WorkloadInfoResponse {
  // Name of the workload manager.
  string name = 1;

  // Version of the workload manager.
  string version = 2;

  // Uid reported for red-box.
  // NOTE(review): presumably the uid the red-box process runs as -
  // confirm against the server implementation.
  int64 uid = 3;
}
// SubmitJobContainerRequest is the input of the SubmitJobContainer
// call.
//
// NOTE(review): the resource fields below mirror those of
// ResourcesResponse; presumably they express what is requested for
// this job rather than properties of the partition - confirm against
// the server implementation.
message SubmitJobContainerRequest {
  // Name of the job image.
  string imageName = 1;

  // Number of nodes.
  int64 nodes = 2;

  // Number of cpus per node.
  int64 cpuPerNode = 3;

  // Amount of memory per node.
  // NOTE(review): the unit is not stated in this file - confirm.
  int64 memPerNode = 4;

  // Wall time setting.
  // NOTE(review): the unit is not stated in this file - confirm.
  int64 wallTime = 5;

  // Partition the job should be submitted to.
  string partition = 6;

  // ID of the client submitting this job.
  string client_id = 7;

  // Singularity-specific settings for the job.
  SingularityOptions options = 8;
}
// SingularityOptions groups the Singularity settings carried by a
// SubmitJobContainerRequest.
//
// NOTE(review): the field names appear to mirror Singularity
// command-line flags (app, bind, cleanenv, fakeroot, hostname, ipc,
// pid, no-privs, writable, ...); this file does not state their exact
// semantics - confirm against the consumer of this message.
message SingularityOptions {
  // Application selector.
  string app = 1;

  // Whether unsigned images are allowed.
  bool allowUnsigned = 2;

  // Bind specifications, one entry per bind.
  repeated string binds = 3;

  // Whether the environment is cleared.
  bool clearEnv = 4;

  // Whether fake root is used.
  bool fakeRoot = 5;

  // Host name setting.
  string hostName = 6;

  // IPC toggle.
  bool ipc = 7;

  // PID toggle.
  bool pid = 8;

  // No-privileges toggle.
  bool noPrivs = 9;

  // Writable toggle.
  bool writable = 10;
}
// SubmitJobContainerResponse is the output of the SubmitJobContainer
// call.
message SubmitJobContainerResponse {
  // ID under which the submitted job can be tracked.
  int64 job_id = 1;
}
// TailAction is the action carried by a TailFileRequest.
enum TailAction {
  // Zero value, i.e. what an unset action field reads as.
  // NOTE(review): presumably begins tailing the file - confirm.
  Start = 0;

  // NOTE(review): presumably asks the server to stream the remaining
  // content and then close the stream - confirm.
  ReadToEndAndClose = 1;
}
// TailFileRequest is the message sent on the client stream of the
// TailFile call.
message TailFileRequest {
  // Action requested from the server.
  TailAction action = 1;

  // Path of the file to tail.
  string path = 2;
}
// JobStatus is the status reported for a job (JobInfo.status) or a job
// step (JobStepInfo.status).
//
// NOTE: COMPLETED is the zero value, so a status field that was never
// set reads as COMPLETED. Receivers should keep that in mind when
// interpreting messages from writers that may omit the field.
enum JobStatus {
  COMPLETED = 0;
  CANCELLED = 1;
  FAILED = 2;
  TIMEOUT = 3;
  PENDING = 4;

  // Numbers 5 through 9 are not assigned in this file.
  UNKNOWN = 10;
}
// JobInfo represents complete information about a single job.
message JobInfo {
  // Job ID.
  string id = 1;

  // ID of the user who submitted the job.
  string user_id = 2;

  // Name of the job.
  string name = 3;

  // Exit code of the job. For the slurm workload manager it has the
  // form "int:int".
  string exit_code = 4;

  // Current status of the job.
  JobStatus status = 5;

  // Time at which the job was submitted.
  google.protobuf.Timestamp submit_time = 6;

  // Time at which the job started.
  google.protobuf.Timestamp start_time = 7;

  // How long the job has been running.
  google.protobuf.Duration run_time = 8;

  // Time limit of the job.
  google.protobuf.Duration time_limit = 9;

  // Working directory of the job.
  string working_dir = 10;

  // Path to the job's standard output file.
  string std_out = 11;

  // Path to the job's standard error file.
  string std_err = 12;

  // Cluster partition on which the job resides.
  string partition = 13;

  // List of nodes on which the job is executed.
  string node_list = 14;

  // Host from which the job was submitted.
  string batch_host = 15;

  // Number of nodes requested by the job.
  string num_nodes = 16;

  // ID of the job array.
  string array_id = 17;
}
// JobStepInfo represents information about a single job step.
message JobStepInfo {
  // ID of the job step.
  string id = 1;

  // Name of the job step.
  string name = 2;

  // Exit code of the job step.
  int32 exit_code = 3;

  // Current status of the job step.
  JobStatus status = 4;

  // Time at which the job step started.
  google.protobuf.Timestamp start_time = 5;

  // Time at which the job step ended.
  google.protobuf.Timestamp end_time = 6;
}
// Chunk is an arbitrary amount of bytes. It is the element type of the
// streams returned by OpenFile and TailFile.
message Chunk {
  // Raw bytes of this chunk.
  bytes content = 1;
}
// Feature describes one feature of a partition, as listed in
// ResourcesResponse.features.
message Feature {
  // Name of the feature.
  string name = 1;

  // Version of the feature.
  string version = 2;

  // Quantity of the feature.
  // NOTE(review): what is being counted is not stated in this file -
  // confirm.
  int64 quantity = 3;
}