-
Notifications
You must be signed in to change notification settings - Fork 0
/
greeneQoS.lua
149 lines (107 loc) · 4.26 KB
/
greeneQoS.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/bin/env lua
local greeneQoS = { }
local greeneUtils = require "greeneUtils"
local greeneSpecialUsers = require "greeneSpecialUsers"
local greeneCommon = require "greeneCommon"
-- Logging helpers borrowed from greeneUtils under short local names.
local slurm_log = greeneUtils.slurm_log
local user_log = greeneUtils.user_log
-- Wall-time thresholds supplied by greeneUtils; they are used below as the
-- time_min / time_max bounds of the QoS definitions.
-- NOTE(review): units are presumably minutes (cf. hours_to_mins below) - confirm in greeneUtils.
local twelve_hours = greeneUtils.twelve_hours
local two_days = greeneUtils.two_days
local seven_days = greeneUtils.seven_days
-- Only referenced by the commented-out cpu365 entry in qos_configurations.
local unlimited_time = greeneUtils.unlimited_time
-- Upper wall-time bound of the "interact" QoS: 4 hours run through hours_to_mins.
local interactive_time_limit = greeneUtils.hours_to_mins(4)
-- Wall-time limit of the job being evaluated. Starts at 0, is overwritten by
-- setup_parameters(), and is compared against time_min / time_max in fit_into_qos().
local time_limit = 0
-- Names of every QoS this module knows about.
-- qos_is_valid() rejects any requested QoS that is not in this list, and
-- valid_qos() walks the list and returns the first name that fits the job,
-- so the position of a name here acts as its priority for automatic selection.
local QoSs = { "interact",
"cpuplus",
"cpu48", "cpu168", "cpulow",
"gpuamd", "gpuplus",
"gpu48", "gpu168",
"cds" }
-- Per-QoS admission rules, keyed by QoS name and evaluated by fit_into_qos().
-- A field that is absent (nil) places no restriction on the job.
--   interactive : must equal greeneCommon.is_interactive_job()
--   gpu         : must equal greeneCommon.is_gpu_job()
--   gpu_type    : must equal greeneCommon.gpu_type
--   max_cpus,
--   max_gpus    : upper bounds on the totals from greeneCommon.total_cpus_and_gpus()
--   time_min    : exclusive lower bound on the job's time limit
--   time_max    : inclusive upper bound on the job's time limit
--   users       : list that must contain greeneCommon.netid()
--   account     : must equal greeneCommon.account()
--   require_qos : when true, the QoS fits only if greeneCommon.qos() is this QoS name
local qos_configurations = {
interact = { interactive = true, time_min = 0, time_max = interactive_time_limit,
max_cpus = 48, max_gpus = 4 },
cpu48 = { gpu = false, time_min = 0, time_max = two_days },
cpu168 = { gpu = false, time_min = two_days, time_max = seven_days },
gpu48 = { gpu = true, time_min = 0, time_max = two_days },
gpu168 = { gpu = true, time_min = two_days, time_max = seven_days },
gpuamd = { gpu = true, gpu_type = "mi50", time_min = 0, time_max = two_days },
-- special QoS with user access control
cpuplus = { gpu = false, time_min = 0, time_max = seven_days,
users = greeneSpecialUsers.cpuplus_users },
gpuplus = { gpu = true,
time_min = 0, time_max = seven_days,
users = greeneSpecialUsers.gpuplus_users },
-- Restricted by account rather than by user list.
cds = { gpu = true,
time_min = 0, time_max = seven_days,
account = "cds" },
-- Short CPU jobs; require_qos means it only fits when greeneCommon.qos() is "cpulow".
cpulow = { gpu = false, time_min = 0, time_max = twelve_hours,
require_qos = true },
--[[
cpu365 = { time_min = seven_days, time_max = unlimited_time,
users = princeStakeholders.users_with_unlimited_wall_time
}
--]]
}
--- Decide whether the current job satisfies every rule of one QoS.
-- Job properties come from greeneCommon accessors and from the module-level
-- time_limit set by setup_parameters(). Rules that are nil in the QoS entry
-- are skipped.
-- @param qos_name key into qos_configurations
-- @return true when the job fits the QoS, false otherwise (including when
--         qos_name has no entry in qos_configurations)
local function fit_into_qos(qos_name)
  local qos = qos_configurations[qos_name]
  if qos == nil then return false end

  -- Interactive flag must match exactly when the QoS specifies one.
  if qos.interactive ~= nil and qos.interactive ~= greeneCommon.is_interactive_job() then
    return false
  end

  -- Opt-in QoS: fits only when the job's own QoS is this very name.
  if qos.require_qos and qos_name ~= greeneCommon.qos() then return false end

  if qos.gpu_type ~= nil then
    -- NOTE(review): gpu_type is read as a plain field here while the other
    -- greeneCommon accessors used in this function are called; if it is in
    -- fact a function this comparison can never succeed - confirm in greeneCommon.
    if greeneCommon.gpu_type == nil then return false end
    if greeneCommon.gpu_type ~= qos.gpu_type then return false end
  end

  -- Resource caps. Each limit is tested on its own so that an entry defining
  -- only one of max_cpus / max_gpus neither raises a nil-comparison error nor
  -- has its single limit silently ignored.
  if qos.max_cpus ~= nil or qos.max_gpus ~= nil then
    local n_cpus, n_gpus = greeneCommon.total_cpus_and_gpus()
    if qos.max_cpus ~= nil and n_cpus > qos.max_cpus then return false end
    if qos.max_gpus ~= nil and n_gpus > qos.max_gpus then return false end
  end

  if qos.gpu ~= nil and qos.gpu ~= greeneCommon.is_gpu_job() then return false end
  if qos.account ~= nil and qos.account ~= greeneCommon.account() then return false end

  -- Access-controlled QoS: the submitting netid must be on the list.
  if qos.users ~= nil and not greeneUtils.in_table(qos.users, greeneCommon.netid()) then
    return false
  end

  -- Time window is open at time_min and closed at time_max.
  return time_limit > qos.time_min and time_limit <= qos.time_max
end
--- Pick the first QoS, in QoSs list order, that the current job fits.
-- ipairs is used rather than pairs because the list order is the selection
-- priority and pairs does not guarantee any traversal order.
-- @return the matching QoS name, or nil when no QoS fits
local function valid_qos()
  for _, qos_name in ipairs(QoSs) do
    if fit_into_qos(qos_name) then
      return qos_name
    end
  end
  return nil
end
--- Print a three-line banner to the user when the job's QoS is "gpu168".
-- Does nothing for any other QoS.
local function qos_gpu168_warnings()
  if greeneCommon.job_desc.qos ~= "gpu168" then return end

  local banner = "******************************************************************************************"
  user_log(banner)
  user_log("*** Each user is limited to 4 GPUs in total for all jobs with wall time more than 48 hours")
  user_log(banner)
end
--- Validate the QoS recorded on the job description.
-- Checks, in order: a QoS is set, it is one of the names in QoSs, and the job
-- fits it according to fit_into_qos(). The first failing check is reported to
-- the user through user_log. On success the gpu168 notice is emitted if it applies.
-- @return true when the QoS is acceptable, false otherwise
local function qos_is_valid()
  local requested = greeneCommon.job_desc.qos
  local accepted = false

  if requested == nil then
    user_log("*** Error no proper QoS fit this job")
  elseif not greeneUtils.in_table(QoSs, requested) then
    user_log("*** Error '%s' is not a valid QoS on Greene", requested)
  elseif not fit_into_qos(requested) then
    user_log("*** Error QoS '%s' does not fit this job", requested)
  else
    qos_gpu168_warnings()
    accepted = true
  end

  return accepted
end
--- Store the job's time limit in the module-level time_limit, which
-- fit_into_qos() later compares against each QoS time window.
-- @param args table with a time_limit field
local function setup_parameters(args)
  local requested_limit = args.time_limit
  time_limit = requested_limit
end
-- Public interface of the module: only these three functions are exported;
-- fit_into_qos and qos_gpu168_warnings stay private to this file.
greeneQoS.setup_parameters = setup_parameters
greeneQoS.valid_qos = valid_qos
greeneQoS.qos_is_valid = qos_is_valid
-- Logged once, when the module body is executed by require.
slurm_log("To load greeneQoS.lua")
return greeneQoS