This repository was archived by the owner on May 16, 2022. It is now read-only.

Commit 0285299

Retrieve job statistics via sacct

1 parent e019fd4 commit 0285299

File tree: 14 files changed, +542 −49 lines

cmd/gobler/sender.go

Lines changed: 1 addition & 1 deletion

@@ -62,7 +62,7 @@ func (s *sender) SenderWorker(psCh <-chan *spool.FileGob, psfCh chan<- *spool.FileGob) {
 				l.Printf("SENDER %s#%d: error removing file %s\n", s.connector, s.num, err)
 			}
 			lock.Unlock()
-			l.Printf("SENDER %s#%d: Gob deleted\n", s.connector)
+			l.Printf("SENDER %s#: Gob deleted\n", s.connector)
 		}
 	}
 	l.Println("======================= Sender end =============================================")

cmd/goslmailer/adaptive_card_template.json

Lines changed: 75 additions & 14 deletions

@@ -37,12 +37,13 @@
       "text":"Job {{ .Job.SlurmEnvironment.SLURM_JOB_ID }} {{ .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE }}",
       "wrap":true,
       "size":"Large",
-      {{ if eq .Job.SlurmEnvironment.SLURM_JOB_STATE "FAILED" }}"color":"Attention"{{ else }}"color":"Good"{{ end }}
+      {{ if or (eq .Job.SlurmEnvironment.SLURM_JOB_STATE "FAILED") (eq .Job.SlurmEnvironment.SLURM_JOB_STATE "TIMEOUT") (eq .Job.SlurmEnvironment.SLURM_JOB_STATE "OUT_OF_MEMORY") }}"color":"Attention"{{ else }}"color":"Good"{{ end }}
     },
     {
       "type":"TextBlock",
       "spacing":"none",
-      "text":"Created $(date)",
+      "text":"Created {{ .Created }}",
+
       "isSubtle":true,
       "wrap":true
     }
@@ -70,8 +71,13 @@
     },
     {
       "type":"Fact",
-      "title":"Job state",
-      "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_STATE }}"
+      "title":"User",
+      "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}"
+    },
+    {
+      "type":"Fact",
+      "title":"Partition",
+      "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_PARTITION }}"
     },
     {
       "type":"Fact",
@@ -80,34 +86,89 @@
     },
     {
       "type":"Fact",
-      "title":"Partition",
-      "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_PARTITION }}"
+      "title":"Cores",
+      "value":"{{ .Job.JobStats.Ncpus }}"
     },
     {
       "type":"Fact",
-      "title":"User",
-      "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_USER }}"
+      "title":"Job state",
+      "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_STATE }}"
+    },
+    {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING" }}
+    {
+      "type":"Fact",
+      "title":"Exit Code",
+      "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_EXIT_CODE_MAX }}"
+    },
+    {{ end }}
+    {
+      "type":"Fact",
+      "title":"Submit",
+      "value":"{{ .Job.JobStats.Submittime }}"
     },
     {
       "type":"Fact",
-      "title":"Runtime",
+      "title":"Start",
+      "value":"{{ .Job.JobStats.Starttime }}"
+    },
+    {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING" }}
+    {
+      "type":"Fact",
+      "title":"End",
+      "value":"{{ .Job.JobStats.Endtime }}"
+    },
+    {{ end }}
+    {
+      "type":"Fact",
+      "title":"Reserved Walltime",
+      "value":"{{ .Job.JobStats.WalltimeStr }}"
+    },
+    {{ if ne .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE "Began" }}
+    {
+      "type":"Fact",
+      "title":"Used Walltime",
       "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_RUN_TIME }}"
     },
+    {{ if ne .Job.SlurmEnvironment.SLURM_JOB_STATE "RUNNING" }}
     {
       "type":"Fact",
-      "title":"Exit Code",
-      "value":"{{ .Job.SlurmEnvironment.SLURM_JOB_EXIT_CODE_MAX }}"
+      "title":"Used CPU time",
+      "value":"{{ .Job.JobStats.TotalCPUStr }}"
     },
+    {
+      "type":"Fact",
+      "title":"% User (Computation)",
+      "value":"{{ printf "%5.2f%%" .Job.JobStats.CalcUserComputePercentage }}"
+    },
+    {
+      "type":"Fact",
+      "title":"% System (I/O)",
+      "value":"{{ printf "%5.2f%%" .Job.JobStats.CalcSystemComputePercentage }}"
+    },
+    {{ end }}
+    {{ end }}
     {
       "type":"Fact",
       "title":"Memory Requested",
-      "value":"{{ .Job.JobStats.MemReq }}"
+      "value":"{{ .Job.JobStats.ReqMem | humanBytes }}"
+    },
+    {{ if ne .Job.SlurmEnvironment.SLURM_JOB_MAIL_TYPE "Began" }}
+    {
+      "type":"Fact",
+      "title":"Max Memory Used",
+      "value":"{{ .Job.JobStats.MaxRSS | humanBytes }}"
+    },
+    {
+      "type":"Fact",
+      "title":"Max Disk Write",
+      "value":"{{ .Job.JobStats.MaxDiskWrite | humanBytes }}"
     },
     {
       "type":"Fact",
-      "title":"Memory Used",
-      "value":"{{ .Job.JobStats.MemUsed }}"
+      "title":"Max Disk Read",
+      "value":"{{ .Job.JobStats.MaxDiskRead | humanBytes }}"
     }
+    {{ end }}
   ]
 },
 {{ range .Job.Hints }}
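The card template above is a Go text/template that renders to Adaptive Card JSON, so the state-driven color line can be exercised on its own. A minimal standalone sketch (the .State field here stands in for .Job.SlurmEnvironment.SLURM_JOB_STATE):

package main

import (
	"os"
	"text/template"
)

// frag is the same conditional used in the card: any of the three failure
// states turns the heading red ("Attention"), everything else green ("Good").
const frag = `{{ if or (eq .State "FAILED") (eq .State "TIMEOUT") (eq .State "OUT_OF_MEMORY") }}"color":"Attention"{{ else }}"color":"Good"{{ end }}`

func main() {
	t := template.Must(template.New("color").Parse(frag))
	for _, s := range []string{"COMPLETED", "TIMEOUT"} {
		// COMPLETED renders "color":"Good"; TIMEOUT renders "color":"Attention".
		if err := t.Execute(os.Stdout, struct{ State string }{s}); err != nil {
			panic(err)
		}
		os.Stdout.WriteString("\n")
	}
}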

cmd/goslmailer/goslmailer.go

Lines changed: 2 additions & 2 deletions

@@ -58,11 +58,11 @@ func main() {
 
 	// get job statistics based on the SLURM_JOB_ID from slurmEnv struct
 	// only if job is END or FAIL(?)
-	job.GetJobStats()
+	job.GetJobStats(log)
 
 	// generate hints based on SlurmEnv and JobStats (e.g. "too much memory requested" or "walltime << requested queue")
 	// only if job is END or fail(?)
-	job.GenerateHints()
+	job.GenerateHints(cfg.QosMap)
 
 	// populate map with configured referenced connectors
 	conns.PopulateConnectors(cfg, log)

connectors/msteams/send.go

Lines changed: 11 additions & 3 deletions

@@ -3,6 +3,7 @@ package msteams
 import (
 	"bytes"
 	"errors"
+	"fmt"
 	"io"
 	"log"
 	"net/http"
@@ -11,6 +12,7 @@ import (
 	"text/template"
 	"time"
 
+	"github.com/dustin/go-humanize"
 	"github.com/pja237/goslmailer/internal/lookup"
 	"github.com/pja237/goslmailer/internal/message"
 	"github.com/pja237/goslmailer/internal/slurmjob"
@@ -40,18 +42,24 @@ func NewConnector(conf map[string]string) (*Connector, error) {
 func (c *Connector) msteamsRenderCardTemplate(j *slurmjob.JobContext, userid string, buf *bytes.Buffer) error {
 
 	var x = struct {
-		Job    slurmjob.JobContext
-		UserID string
+		Job     slurmjob.JobContext
+		UserID  string
+		Created string
 	}{
 		*j,
 		userid,
+		fmt.Sprint(time.Now().Format("Mon, 2 Jan 2006 15:04:05 MST")),
+	}
+
+	var funcMap = template.FuncMap{
+		"humanBytes": humanize.Bytes,
 	}
 
 	f, err := os.ReadFile(c.adaptiveCardTemplate)
 	if err != nil {
 		return err
 	}
-	t := template.Must(template.New("AdaptiveCard").Parse(string(f)))
+	t := template.Must(template.New("AdaptiveCard").Funcs(funcMap).Parse(string(f)))
 	err = t.Execute(buf, x)
 	if err != nil {
 		return err
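For reference, the Funcs hook added above is what exposes humanize.Bytes to the template as the humanBytes pipeline used in the card. A minimal standalone sketch, with a bare template string in place of the card file:

package main

import (
	"os"
	"text/template"

	"github.com/dustin/go-humanize"
)

func main() {
	// Register humanize.Bytes under the name the template uses.
	funcMap := template.FuncMap{"humanBytes": humanize.Bytes}
	t := template.Must(template.New("fact").Funcs(funcMap).Parse(
		`"value":"{{ .MaxRSS | humanBytes }}"` + "\n"))
	// Renders "value":"3.2 GB" (humanize.Bytes uses SI, base-1000 units).
	if err := t.Execute(os.Stdout, struct{ MaxRSS uint64 }{3221225472}); err != nil {
		panic(err)
	}
}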

go.mod

Lines changed: 2 additions & 0 deletions

@@ -1,3 +1,5 @@
 module github.com/pja237/goslmailer
 
 go 1.17
+
+require github.com/dustin/go-humanize v1.0.0 // indirect

go.sum

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
+github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=

internal/config/config.go

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@ type ConfigContainer struct {
 	Logfile          string                       `json:"logfile"`
 	DefaultConnector string                       `json:"defaultconnector"`
 	Connectors       map[string]map[string]string `json:"connectors"`
+	QosMap           map[uint64]string            `json:"qosmap"`
 }
 
 func NewConfigContainer() *ConfigContainer {
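The new QosMap field implies a qosmap object in the config file whose keys are walltime limits and whose values are QOS names; encoding/json unmarshals quoted object keys into integer-keyed maps such as map[uint64]string. A minimal standalone sketch (the limits, the seconds unit, and the QOS names are illustrative assumptions, not taken from the repo):

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Hypothetical qosmap: walltime limit in seconds -> QOS name.
	raw := []byte(`{"qosmap": {"3600": "RAPID", "28800": "SHORT", "172800": "MEDIUM"}}`)
	var cfg struct {
		QosMap map[uint64]string `json:"qosmap"`
	}
	if err := json.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}
	fmt.Println(cfg.QosMap[3600]) // RAPID
}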

internal/slurmjob/getjobcontext.go

Lines changed: 90 additions & 23 deletions

@@ -1,14 +1,68 @@
 package slurmjob
 
-import "os"
+import (
+	"fmt"
+	"log"
+	"os"
+	"sort"
+	"time"
+)
+
+// TODO make it configurable or read it from sacctmgr
+
+func calculateOptimalQOS(qosMap map[uint64]string, runtime uint64) string {
+	keys := make([]uint64, 0, len(qosMap))
+	for k := range qosMap {
+		keys = append(keys, k)
+	}
+	sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] })
+	for _, k := range keys {
+		if runtime <= k {
+			return qosMap[k]
+		}
+	}
+	return "LONG"
+}
 
 // Populate JobContext.Hints with hints about the job (mis)usage and suggestions how to optimize it.
 // todo: add more logic once more stats are in
-func (j *JobContext) GenerateHints() {
-	if j.JobStats.MemReq/2 > j.JobStats.MemUsed {
-		j.Hints = append(j.Hints, "TIP: Please consider lowering the ammount of requested memory in the future, your job has consumed less then half of requested.")
+func (j *JobContext) GenerateHints(qosMap map[uint64]string) {
+	if j.IsJobFinished() {
+		// Check OUT_OF_MEMORY
+		if j.SlurmEnvironment.SLURM_JOB_STATE == "OUT_OF_MEMORY" {
+			j.Hints = append(j.Hints, "TIP: The job ran out of memory. Please re-submit with increased memory requirements.")
+			return
+		}
+
+		if j.SlurmEnvironment.SLURM_JOB_STATE == "TIMEOUT" {
+			j.Hints = append(j.Hints, "TIP: The job ran into a timeout. Please re-submit with an increased walltime, potentially to a different QOS.")
+			return
+		}
+
+		// Check memory consumption
+		if j.JobStats.ReqMem/2 > j.JobStats.MaxRSS {
+			j.Hints = append(j.Hints, "TIP: Please consider lowering the amount of requested memory in the future, your job has consumed less than half of the requested memory.")
+		}
+		// Check CPU time (e.g. 16 cores requested, only 1 used)
+		if j.JobStats.CPUTime/2 > j.JobStats.TotalCPU {
+			j.Hints = append(j.Hints, "TIP: Please consider lowering the amount of requested CPU cores in the future, your job has used less than half of the requested CPU time.")
+		}
+
+		// Check if runtime is less than half of the requested runtime
+		if j.JobStats.Walltime/2 > j.JobStats.Runtime {
+			// Check if it was submitted without specifying a walltime (i.e. just against the default maxwalltime of a QOS)
+			optimalQos := calculateOptimalQOS(qosMap, j.JobStats.Runtime)
+			if qos, ok := qosMap[j.JobStats.Walltime]; ok {
+				j.Hints = append(j.Hints, fmt.Sprintf("TIP: Your job was submitted to the %s QOS and finished within half of the requested walltime. Consider submitting it to the %s QOS instead.", qos, optimalQos))
+				j.Hints = append(j.Hints, fmt.Sprintf("TIP: No --time specified: using the default %s QOS limit. Specify --time <walltime> to increase the chances that the scheduler will use this job for backfilling! See https://docs.vbc.ac.at/link/21#bkmrk-scheduling-policy for more information.", qos))
+			} else {
+				j.Hints = append(j.Hints, fmt.Sprintf("TIP: Your job was submitted with a walltime of %s and finished in less than half of that time. Consider reducing the walltime and submitting to the %s QOS.", j.JobStats.WalltimeStr, optimalQos))
+			}
+		}
 	}
-	j.Hints = append(j.Hints, "TIP: Your job was submitted to LONG queue and finished in less then an hour, consider submitting to RAPID or SHORT queues.")
 }
 
 // Get SLURM_* environment variables from the environment
@@ -46,23 +100,36 @@ func (j *JobContext) GetSlurmEnvVars() {
 
 }
 
+func IsJobFinished(jobState string) bool {
+	switch jobState {
+	case
+		"FAILED",
+		"COMPLETED",
+		"OUT_OF_MEMORY",
+		"TIMEOUT":
+		return true
+	}
+	return false
+}
+
+func (j *JobContext) IsJobFinished() bool {
+	return IsJobFinished(j.SlurmEnvironment.SLURM_JOB_STATE)
+}
+
 // Get additional job statistics from external source (e.g. jobinfo or sacct)
-func (j *JobContext) GetJobStats() {
-	j.JobStats.MemReq = 4096
-	j.JobStats.MemUsed = 1024
-	// FUTURE: employ jobinfo to do the job or call sacct and get the data we need ourself
-	//
-	//out, err := exec.Command("./jobinfo", string(s.SLURM_JOBID)).Output()
-	//if err != nil {
-	//	fmt.Println("ERROR Executing jobinfo. Abort!")
-	//	os.Exit(1)
-	//}
-	////fmt.Println(string(out))
-	////fmt.Printf("%#v\n", strings.Split(string(out), "\n"))
-	//for _, l := range strings.Split(string(out), "\n") {
-	//	//fmt.Printf("PRE SPLIT %#v\n", l)
-	//	v := strings.Split(l, ":")
-	//	fmt.Printf("POST SPLIT %#v\n", v)
-	//	//fmt.Printf("ACCESS %#v : %#v\n", strings.Trim(v[0], " "), strings.Trim(v[1], " "))
-	//}
+func (j *JobContext) GetJobStats(log *log.Logger) {
+	jobId := j.SlurmEnvironment.SLURM_JOBID
+	if j.SlurmEnvironment.SLURM_ARRAY_JOB_ID != "" {
+		jobId = j.SlurmEnvironment.SLURM_ARRAY_JOB_ID
+	}
+	j.JobStats = *GetSacctMetrics(jobId, log)
+	counter := 0
+	for !IsJobFinished(j.JobStats.State) && j.JobStats.State != j.SlurmEnvironment.SLURM_JOB_STATE && counter < 5 {
+		time.Sleep(2 * time.Second)
+		j.JobStats = *GetSacctMetrics(jobId, log)
+		counter += 1
+	}
+	if j.JobStats.State == "RUNNING" {
+		updateJobStatsWithLiveData(&j.JobStats, jobId, log)
+	}
 }
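To see the QOS-selection logic above in isolation: calculateOptimalQOS walks the configured limits in ascending order and returns the first QOS whose limit still covers the runtime, falling back to "LONG". A minimal standalone sketch (a copy of the function; the qosMap entries are illustrative, not taken from the repo):

package main

import (
	"fmt"
	"sort"
)

func calculateOptimalQOS(qosMap map[uint64]string, runtime uint64) string {
	// Collect and sort the walltime limits so the smallest match wins.
	keys := make([]uint64, 0, len(qosMap))
	for k := range qosMap {
		keys = append(keys, k)
	}
	sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] })
	for _, k := range keys {
		if runtime <= k {
			return qosMap[k]
		}
	}
	return "LONG"
}

func main() {
	qosMap := map[uint64]string{3600: "RAPID", 28800: "SHORT"}
	fmt.Println(calculateOptimalQOS(qosMap, 1800))  // RAPID: fits the 1h limit
	fmt.Println(calculateOptimalQOS(qosMap, 7200))  // SHORT: over 1h, under 8h
	fmt.Println(calculateOptimalQOS(qosMap, 90000)) // LONG: exceeds all limits
}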
