-
Notifications
You must be signed in to change notification settings - Fork 42
/
managed_process.go
368 lines (329 loc) · 9.54 KB
/
managed_process.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
package pexec
import (
"bufio"
"context"
"fmt"
"io"
"os"
"os/exec"
"sync"
"syscall"
"time"
"github.com/edaniels/golog"
"github.com/pkg/errors"
"go.viam.com/utils"
)
// errAlreadyStopped is returned by Start when Stop has already closed the
// kill channel. The restart path in manage treats it as an expected outcome
// and does not log it as a restart failure.
var errAlreadyStopped = errors.New("already stopped")
// A ManagedProcess controls the lifecycle of a single system process. Based on
// its configuration, it will ensure the process is revived if it ever unexpectedly
// perishes.
type ManagedProcess interface {
	// ID returns the unique ID of the process.
	ID() string
	// Start starts the process. The given context is only used for one shot processes.
	Start(ctx context.Context) error
	// Stop signals and waits for the process to stop. An error is returned if
	// there's any system level issue stopping the process.
	Stop() error
}
// NewManagedProcess returns a new, unstarted ManagedProcess built from the
// given configuration. A zero StopSignal falls back to SIGTERM and a zero
// StopTimeout falls back to defaultStopTimeout. The process logs through a
// child of logger named "process.<ID>_<Name>".
func NewManagedProcess(config ProcessConfig, logger golog.Logger) ManagedProcess {
	// Resolve defaults into locals; config is a by-value copy either way.
	stopSignal := config.StopSignal
	if stopSignal == 0 {
		stopSignal = syscall.SIGTERM
	}
	stopTimeout := config.StopTimeout
	if stopTimeout == 0 {
		stopTimeout = defaultStopTimeout
	}

	proc := &managedProcess{
		// Identity and command line.
		id:   config.ID,
		name: config.Name,
		args: config.Args,
		cwd:  config.CWD,

		// Behavior switches.
		oneShot:          config.OneShot,
		onUnexpectedExit: config.OnUnexpectedExit,

		// Output handling.
		shouldLog: config.Log,
		logWriter: config.LogWriter,
		logger:    logger.Named(fmt.Sprintf("process.%s_%s", config.ID, config.Name)),

		// Shutdown coordination. The wait interval is a third of the
		// overall stop timeout.
		managingCh:       make(chan struct{}),
		killCh:           make(chan struct{}),
		stopSig:          stopSignal,
		stopWaitInterval: stopTimeout / 3,
	}
	return proc
}
// managedProcess is the ManagedProcess implementation returned by
// NewManagedProcess.
type managedProcess struct {
	// mu is held by Start for its full duration and briefly by Stop while it
	// flips stopped, closes killCh and checks cmd.
	mu sync.Mutex

	// id is the value returned by ID.
	id string
	// name is the executable looked up with exec.LookPath and run by Start.
	name string
	// args are the arguments passed to the executable.
	args []string
	// cwd is used as the command's working directory (exec.Cmd.Dir).
	cwd string
	// oneShot makes Start run the command to completion synchronously
	// instead of starting a managed, restartable process.
	oneShot bool
	// shouldLog enables forwarding of process output to logger.
	shouldLog bool
	// cmd is the most recently started managed command. It is assigned in
	// Start while mu is held and stays nil for one shot processes.
	cmd *exec.Cmd

	// stopped records that Stop already ran so later calls return early.
	stopped bool
	// onUnexpectedExit, if set, is called by manage with the exit code when
	// the process ends without Stop having been called; returning false
	// prevents a restart.
	onUnexpectedExit func(int) bool
	// managingCh is closed by manage when it returns without having
	// restarted the process.
	managingCh chan struct{}
	// killCh is closed by Stop to tell Start and manage that no further
	// (re)starts should happen.
	killCh chan struct{}
	// stopSig is set from the configured StopSignal (SIGTERM when unset).
	// NOTE(review): presumably the signal sent by kill, which is defined
	// elsewhere — confirm there.
	stopSig syscall.Signal
	// stopWaitInterval is one third of the configured stop timeout.
	// NOTE(review): presumably the per-step wait used by kill — confirm there.
	stopWaitInterval time.Duration
	// lastWaitErr is the result of the latest cmd.Wait. It is written by
	// manage and read by Stop after managingCh has been closed.
	lastWaitErr error

	// logger is the process-scoped logger.
	logger golog.Logger
	// logWriter, if non-nil, additionally receives the process output.
	logWriter io.Writer
}
// ID returns the identifier this process was configured with.
func (p *managedProcess) ID() string {
	return p.id
}
// Start launches the process while holding the process lock.
//
// For a one shot process the command is run to completion under ctx and any
// failure is returned wrapped with the process name; its combined output is
// forwarded to the logger and/or log writer when either is configured.
//
// Otherwise the command is started without a context, recorded as p.cmd and
// handed to a background manage goroutine; Start returns as soon as the
// command has been started. errAlreadyStopped is returned if Stop has already
// been called.
func (p *managedProcess) Start(ctx context.Context) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	// A restart issued by manage may have been waiting on the lock while Stop
	// ran. If so, report that there is nothing left to restart.
	select {
	case <-p.killCh:
		return errAlreadyStopped
	default:
	}

	if _, lookErr := exec.LookPath(p.name); lookErr != nil {
		return lookErr
	}

	captureOutput := p.shouldLog || p.logWriter != nil

	if p.oneShot {
		// One shot commands block until they finish, so the caller's context
		// is what bounds their run time.
		//nolint:gosec
		oneShotCmd := exec.CommandContext(ctx, p.name, p.args...)
		oneShotCmd.SysProcAttr = sysProcAttr()
		oneShotCmd.Dir = p.cwd

		if !captureOutput {
			if runErr := oneShotCmd.Run(); runErr != nil {
				return errors.Wrapf(runErr, "error running process %q", p.name)
			}
			return nil
		}

		output, runErr := oneShotCmd.CombinedOutput()
		if len(output) != 0 {
			if p.shouldLog {
				p.logger.Debugw("process output", "name", p.name, "output", string(output))
			}
			if p.logWriter != nil {
				_, writeErr := p.logWriter.Write(output)
				if writeErr != nil && !errors.Is(writeErr, io.ErrClosedPipe) {
					p.logger.Errorw("error writing process output to log writer", "name", p.name, "error", writeErr)
				}
			}
		}
		if runErr != nil {
			return errors.Wrapf(runErr, "error running process %q", p.name)
		}
		return nil
	}

	// Managed processes are killed explicitly by Stop, so the context-aware
	// command constructor is deliberately not used here.
	//nolint:gosec
	managedCmd := exec.Command(p.name, p.args...)
	managedCmd.SysProcAttr = sysProcAttr()
	managedCmd.Dir = p.cwd

	var outPipe, errPipe io.ReadCloser
	if captureOutput {
		var pipeErr error
		if outPipe, pipeErr = managedCmd.StdoutPipe(); pipeErr != nil {
			return pipeErr
		}
		if errPipe, pipeErr = managedCmd.StderrPipe(); pipeErr != nil {
			return pipeErr
		}
	}

	if startErr := managedCmd.Start(); startErr != nil {
		return startErr
	}

	// With the lock held it is safe to drop any previous command (leaving it
	// to the garbage collector) and publish the new one for other methods.
	p.cmd = managedCmd

	// Management runs in the background; there is no need to wait for it.
	utils.ManagedGo(func() {
		p.manage(outPipe, errPipe)
	}, nil)
	return nil
}
// manage is the watchdog of the process. If the process has ended
// unexpectedly, onUnexpectedExit will be called. If onUnexpectedExit is unset
// or returns true, manage will restart the process. Note that onUnexpectedExit
// may be called multiple times if it returns true. It's possible and okay for
// a restart to be in progress while a Stop is happening. As a means of
// simplifying implementation, a restart spawns new goroutines by calling Start
// again and lets the original goroutine die off.
//
// stdOut and stdErr are the process's output pipes. They are only read when
// logging or a log writer is configured and may be nil otherwise.
func (p *managedProcess) manage(stdOut, stdErr io.ReadCloser) {
	// If no restart is going to happen after this function exits,
	// then we want to notify anyone listening that this process
	// is done being managed. We assume that if we aren't managing,
	// the process is no longer running (it could have double forked though).
	var restarted bool
	defer func() {
		if !restarted {
			close(p.managingCh)
		}
	}()

	// This block here logs as much as possible if it's requested until the
	// pipes are closed.
	stopLogging := make(chan struct{})
	var activeLoggers sync.WaitGroup
	if p.shouldLog || p.logWriter != nil {
		logPipe := func(name string, pipe io.ReadCloser, isErr bool) {
			logger := p.logger.Named(name)
			defer activeLoggers.Done()
			pipeR := bufio.NewReader(pipe)
			logWriterError := false
			for {
				select {
				case <-stopLogging:
					return
				default:
				}
				// isPrefix is true when the line did not fit in the reader's
				// buffer; the remainder arrives on subsequent calls.
				line, isPrefix, err := pipeR.ReadLine()
				if err != nil {
					// EOF and a closed pipe are the normal ways for the
					// stream to end once the process is gone.
					if !errors.Is(err, io.EOF) && !errors.Is(err, os.ErrClosed) {
						p.logger.Errorw("error reading output", "name", name, "error", err)
					}
					return
				}
				// Over-long lines are logged in buffer-sized chunks.
				if p.shouldLog {
					if isErr {
						logger.Error("\n\\_ " + string(line))
					} else {
						logger.Info("\n\\_ " + string(line))
					}
				}
				if p.logWriter != nil && !logWriterError {
					_, err := p.logWriter.Write(line)
					// Only terminate the line once it is complete so that a
					// line longer than the buffer is not broken up by
					// newlines the process never emitted.
					if err == nil && !isPrefix {
						_, err = p.logWriter.Write([]byte("\n"))
					}
					if err != nil {
						if !errors.Is(err, io.ErrClosedPipe) {
							p.logger.Debugw("error writing process output to log writer", "name", name, "error", err)
						}
						// With nowhere left to send output, stop reading.
						if !p.shouldLog {
							return
						}
						logWriterError = true
					}
				}
			}
		}
		activeLoggers.Add(2)
		utils.PanicCapturingGo(func() {
			logPipe("StdOut", stdOut, false)
		})
		utils.PanicCapturingGo(func() {
			logPipe("StdErr", stdErr, true)
		})
	}

	err := p.cmd.Wait()
	// This is safe to write to because it is only read in Stop which
	// is waiting for us to stop managing.
	p.lastWaitErr = err
	close(stopLogging)
	activeLoggers.Wait()

	// It's possible that Stop was called and is the reason why Wait returned.
	select {
	case <-p.killCh:
		return
	default:
	}

	// Run onUnexpectedExit if it exists. Do not attempt restart if
	// onUnexpectedExit returns false. ExitCode reports -1 when no process
	// state is available.
	if p.onUnexpectedExit != nil &&
		!p.onUnexpectedExit(p.cmd.ProcessState.ExitCode()) {
		return
	}

	// Otherwise, let's try restarting the process.
	if err != nil {
		// Right now we are assuming that any wait error implies the process is no longer
		// alive. TODO(GOUT-8): Verify that
		// this is actually true. If it's false, we could be multiply spawning processes
		// where all are orphaned but one.
		p.logger.Errorw("error waiting for process during manage", "error", err)
	}
	// ProcessState is nil when Wait itself failed (for example with ECHILD),
	// and calling Exited on a nil state would panic, so guard against it.
	if state := p.cmd.ProcessState; state != nil && state.Exited() {
		p.logger.Infow("process exited before expected", "code", state.ExitCode())
	} else {
		p.logger.Infow("process exited before expected", "state", state)
	}
	p.logger.Info("restarting process")

	// Temper ourselves so we aren't constantly restarting if we immediately fail.
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	select {
	case <-ticker.C:
	case <-p.killCh:
		return
	}

	err = p.Start(context.Background())
	if err != nil {
		if !errors.Is(err, errAlreadyStopped) {
			// MAYBE(erd): add retry
			p.logger.Errorw("error restarting process", "error", err)
		}
		return
	}
	// The new Start spawned its own manage goroutine, which now owns
	// closing managingCh.
	restarted = true
}
// Stop signals the process to stop and waits until it is no longer being
// managed. Calling Stop again, or calling it when no managed command was ever
// started, returns nil immediately.
//
// It returns an error if killing the process fails, if waiting on the process
// produced an error that is not classified as an unknown exit status, or if
// the process ended with a non-successful exit code.
func (p *managedProcess) Stop() error {
	// Hold the lock only long enough to signal the management goroutine.
	// Holding it for the whole call could deadlock with manage while it is
	// trying to restart the process through Start.
	nothingToStop := func() bool {
		p.mu.Lock()
		defer p.mu.Unlock()

		if p.stopped {
			return true
		}
		close(p.killCh)
		p.stopped = true
		return p.cmd == nil
	}()
	if nothingToStop {
		return nil
	}

	// p.cmd is only assigned under the lock, and with killCh closed no
	// further Start can succeed, so it can be read lock-free from here on.
	forceKilled, killErr := p.kill()
	if killErr != nil {
		return killErr
	}

	<-p.managingCh

	waitErr := p.lastWaitErr
	if waitErr == nil {
		if p.cmd.ProcessState.Success() {
			return nil
		}
		return errors.Errorf("non-successful exit code: %d", p.cmd.ProcessState.ExitCode())
	}

	// ECHILD means the race to wait was lost before the signal was caught,
	// so nothing can be reported about how the process stopped.
	var errno syscall.Errno
	lostWaitRace := errors.As(waitErr, &errno) && errno == syscall.ECHILD
	if lostWaitRace || isWaitErrUnknown(waitErr.Error(), forceKilled) {
		p.logger.Debug("unable to check exit status")
		return nil
	}
	return waitErr
}