/
watcher.go
639 lines (553 loc) · 15.6 KB
/
watcher.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
// +build linux darwin netbsd freebsd openbsd dragonfly
// Package gaio is an Async-IO library for Golang.
//
// gaio acts in proactor mode, https://en.wikipedia.org/wiki/Proactor_pattern.
// Users submit async IO operations and wait for IO-completion signals.
package gaio
import (
"container/heap"
"container/list"
"io"
"net"
"reflect"
"runtime"
"sync"
"sync/atomic"
"syscall"
"time"
)
// aiocbPool recycles aiocb control blocks: aioCreate takes one per submitted
// request and WaitIO hands them back once their results have been consumed.
var aiocbPool = sync.Pool{
	New: func() interface{} { return new(aiocb) },
}
// fdDesc holds the per-descriptor state the watcher keeps for one watched
// file descriptor (the key of watcher.descs).
type fdDesc struct {
readers list.List // queued read requests (*aiocb), served front to back
// queued write requests (*aiocb), served front to back
writers list.List
ptr uintptr // pointer to net.Conn, the key of this descriptor in watcher.connIdents
// read requests are queued; passed to the poller's Rearm as the read flag
r_armed bool
// write requests are queued; passed to the poller's Rearm as the write flag
w_armed bool
}
// watcher monitors poller events and processes async-io requests.
// All fields below the "loop related" marker are touched only from the
// loop goroutine in the code visible here; the fields shared with callers are
// guarded by the mutexes / atomics noted next to them.
type watcher struct {
// poller used to watch and re-arm duplicated descriptors
pfd *poller
// readiness events produced by pfd.Wait and consumed by loop
chEventNotify chan pollerEvents
// wake-up signal (capacity 1) telling loop that pendingCreate is non-empty
chPendingNotify chan struct{}
// requests appended by aioCreate, guarded by pendingMutex
pendingCreate []*aiocb
pendingProcessing []*aiocb // swapped with pendingCreate by loop, then handed to handlePending
// guards pendingCreate / pendingProcessing while they are swapped or appended to
pendingMutex sync.Mutex
// aiocbs returned by the previous WaitIO call; put back into aiocbPool by the next call
recycles []*aiocb
// IO-completion events to user: deliver sends, WaitIO receives
chResults chan *aiocb
// internal buffers for reads submitted with a nil buffer
swapSize int // swap buffer capacity; three buffers of this size are allocated
// buffer that tryRead currently hands out slices of
swapBufferFront []byte
// next buffer to become the front one when tryRead rotates
swapBufferMiddle []byte
// buffer that becomes the middle one on the next rotation
swapBufferBack []byte
bufferOffset int // offset of the first unused byte in swapBufferFront
shouldSwap int32 // atomic flag: set to 1 by WaitIO, cleared by tryRead when it rotates the buffers
// cpu ids sent by SetLoopAffinity and applied by loop via setAffinity
chCPUID chan int32
// loop related data structure
descs map[int]*fdDesc // all descriptors, keyed by the duplicated fd
connIdents map[uintptr]int // conn pointer -> duplicated fd; we must not hold net.Conn as key, for GC purpose
// Timeout bookkeeping: an aiocb with a non-zero deadline is
// either present in both this heap and a descriptor queue,
// or in neither of them.
timeouts timedHeap
// fires when the earliest deadline may have expired
timer *time.Timer
// connections reported by their finalizers, waiting for loop to release them
gc []net.Conn
// guards gc
gcMutex sync.Mutex
// wake-up signal (capacity 1) telling loop that gc is non-empty
gcNotify chan struct{}
// closed by Close to stop loop and unblock callers
die chan struct{}
// makes Close idempotent
dieOnce sync.Once
}
// NewWatcher creates a Watcher for monitoring connections, sizing its
// internal swap buffers with defaultInternalBufferSize. It is shorthand for
// NewWatcherSize(defaultInternalBufferSize).
func NewWatcher() (*Watcher, error) {
	bufsize := defaultInternalBufferSize
	return NewWatcherSize(bufsize)
}
// NewWatcherSize creates a Watcher for monitoring connections.
//
// 'bufsize' is the size of each internal swap buffer used by Read() when it
// is called with a nil buffer; three buffers of 'bufsize' bytes are allocated
// up front (front, middle and back).
//
// Two goroutines are started: one waiting on the poller and one running the
// watcher's event loop. A finalizer on the returned *Watcher closes the
// watcher when the wrapper becomes unreachable.
func NewWatcherSize(bufsize int) (*Watcher, error) {
	pfd, err := openPoll()
	if err != nil {
		return nil, err
	}

	w := &watcher{
		pfd: pfd,

		// channels used by the event loop
		chCPUID:         make(chan int32),
		chEventNotify:   make(chan pollerEvents),
		chPendingNotify: make(chan struct{}, 1),
		chResults:       make(chan *aiocb, maxEvents*4),
		die:             make(chan struct{}),

		// shared buffers for reads without a caller-supplied buffer
		swapSize:         bufsize,
		swapBufferFront:  make([]byte, bufsize),
		swapBufferMiddle: make([]byte, bufsize),
		swapBufferBack:   make([]byte, bufsize),

		// bookkeeping owned by the event loop
		descs:      make(map[int]*fdDesc),
		connIdents: make(map[uintptr]int),
		gcNotify:   make(chan struct{}, 1),
		timer:      time.NewTimer(0),
	}

	go pfd.Wait(w.chEventNotify)
	go w.loop()

	// The goroutines above reference only the inner watcher, so the wrapper
	// can still be collected; its finalizer then releases system resources.
	wrapper := &Watcher{watcher: w}
	runtime.SetFinalizer(wrapper, func(unreachable *Watcher) {
		unreachable.Close()
	})
	return wrapper, nil
}
// SetPollerAffinity requests that the poller (epoll/kqueue) goroutine be bound
// to the CPU numbered 'cpuid'. The id is stored atomically on the poller,
// which is then woken up.
//
// It returns ErrCPUID when 'cpuid' is not smaller than runtime.NumCPU().
// NOTE(review): negative ids are not rejected here — confirm how the poller
// treats them.
func (w *watcher) SetPollerAffinity(cpuid int) (err error) {
	if ncpu := runtime.NumCPU(); cpuid >= ncpu {
		return ErrCPUID
	}

	atomic.StoreInt32(&w.pfd.cpuid, int32(cpuid))
	w.pfd.wakeup()
	return nil
}
// SetLoopAffinity requests that the event-loop goroutine (the one performing
// the read/write syscalls) be bound to the CPU numbered 'cpuid'. The call
// blocks until the loop has accepted the request or the watcher is closed.
//
// It returns ErrCPUID when 'cpuid' is not smaller than runtime.NumCPU(), and
// ErrConnClosed when the watcher has been closed.
// NOTE(review): the other methods report a closed watcher as ErrWatcherClosed;
// the different error is kept here so existing callers are not affected.
func (w *watcher) SetLoopAffinity(cpuid int) (err error) {
	if cpuid >= runtime.NumCPU() {
		return ErrCPUID
	}

	select {
	case <-w.die:
		return ErrConnClosed
	case w.chCPUID <- int32(cpuid):
		return nil
	}
}
// Close stops monitoring events for all connections: it closes the 'die'
// channel, which terminates the event loop, and closes the poller.
//
// Only the first call does any work and returns the poller's close error;
// every later call returns nil.
func (w *watcher) Close() (err error) {
	shutdown := func() {
		close(w.die)
		err = w.pfd.Close()
	}
	w.dieOnce.Do(shutdown)
	return err
}
// notifyPending tells the event loop that new requests have been queued.
// The send never blocks: when the notification channel already holds a signal
// the loop is going to pick up the new requests anyway.
func (w *watcher) notifyPending() {
	var signal struct{}
	select {
	case w.chPendingNotify <- signal:
	default:
		// a wake-up is already queued, nothing more to do
	}
}
// WaitIO blocks until at least one read/write request has completed (with or
// without error) and returns every result that is available at that moment.
// It returns ErrWatcherClosed once the watcher has been closed.
//
// The returned slice, and any internal buffer referenced by a result, are
// only safe to use BEFORE the next call to WaitIO(): that call recycles the
// control blocks behind the previous results and allows the internal swap
// buffers to be rotated.
func (w *watcher) WaitIO() (r []OpResult, err error) {
	// hand the control blocks of the previous batch back to the pool
	for _, done := range w.recycles {
		aiocbPool.Put(done)
	}
	w.recycles = w.recycles[:0]

	// collect converts one completed request into a result and remembers
	// its control block for recycling on the next call
	collect := func(cb *aiocb) {
		r = append(r, OpResult{
			Operation:    cb.op,
			Conn:         cb.conn,
			IsSwapBuffer: cb.useSwap,
			Buffer:       cb.buffer,
			Size:         cb.size,
			Error:        cb.err,
			Context:      cb.ctx,
		})
		w.recycles = append(w.recycles, cb)
	}

	select {
	case <-w.die:
		return nil, ErrWatcherClosed
	case first := <-w.chResults:
		collect(first)
		// drain whatever else is already buffered, without blocking
		for len(w.chResults) > 0 {
			collect(<-w.chResults)
		}
		// ask the loop to rotate the swap buffers before its next internal read
		atomic.CompareAndSwapInt32(&w.shouldSwap, 0, 1)
		return r, nil
	}
}
// Read submits an async read request on 'conn' without a deadline.
// 'buf' receives the data; it can be set to nil to let the watcher read into
// one of its internal buffers instead.
// 'ctx' is a user-defined value passed through the watcher unchanged and
// returned with the result.
func (w *watcher) Read(ctx interface{}, conn net.Conn, buf []byte) error {
	err := w.aioCreate(ctx, OpRead, conn, buf, zeroTime, false)
	return err
}
// ReadTimeout submits an async read request on 'conn' that is expected to
// read some bytes into 'buf' before 'deadline'; a request still queued after
// the deadline completes with ErrDeadline.
// 'ctx' is a user-defined value passed through the watcher unchanged and
// returned with the result.
func (w *watcher) ReadTimeout(ctx interface{}, conn net.Conn, buf []byte, deadline time.Time) error {
	err := w.aioCreate(ctx, OpRead, conn, buf, deadline, false)
	return err
}
// ReadFull submits an async read request on 'conn' that completes only when
// 'buf' has been filled entirely, an error occurs, or 'deadline' passes.
// 'buf' must not be empty: ErrEmptyBuffer is returned for a nil or
// zero-length buffer.
// 'ctx' is a user-defined value passed through the watcher unchanged and
// returned with the result.
func (w *watcher) ReadFull(ctx interface{}, conn net.Conn, buf []byte, deadline time.Time) error {
	if len(buf) > 0 {
		return w.aioCreate(ctx, OpRead, conn, buf, deadline, true)
	}
	return ErrEmptyBuffer
}
// Write submits an async write request on 'conn' without a deadline; the
// request completes when all of 'buf' has been written or an error occurs.
// ErrEmptyBuffer is returned for a nil or zero-length 'buf'.
// 'ctx' is a user-defined value passed through the watcher unchanged and
// returned with the result.
func (w *watcher) Write(ctx interface{}, conn net.Conn, buf []byte) error {
	if len(buf) > 0 {
		return w.aioCreate(ctx, OpWrite, conn, buf, zeroTime, false)
	}
	return ErrEmptyBuffer
}
// WriteTimeout submits an async write request on 'conn' that is expected to
// finish writing 'buf' before 'deadline'; a request still queued after the
// deadline completes with ErrDeadline.
// 'buf' must not be empty: ErrEmptyBuffer is returned for a nil or
// zero-length buffer.
// 'ctx' is a user-defined value passed through the watcher unchanged and
// returned with the result.
func (w *watcher) WriteTimeout(ctx interface{}, conn net.Conn, buf []byte, deadline time.Time) error {
	if len(buf) > 0 {
		return w.aioCreate(ctx, OpWrite, conn, buf, deadline, false)
	}
	return ErrEmptyBuffer
}
// Free asks the watcher to release the resources it holds for 'conn'
// immediately, such as the duplicated socket file descriptor. The request is
// queued like any other operation and carried out by the event loop.
func (w *watcher) Free(conn net.Conn) error {
	err := w.aioCreate(nil, opDelete, conn, nil, zeroTime, false)
	return err
}
// aioCreate is the common entry point behind Read/Write/Free: it builds a
// control block for the request, appends it to the pending queue and wakes up
// the event loop.
//
// It returns ErrWatcherClosed when the watcher has been closed, and
// ErrUnsupported when 'conn' is nil or its dynamic type is not a pointer
// (the pointer value is what identifies the connection inside the watcher).
func (w *watcher) aioCreate(ctx interface{}, op OpType, conn net.Conn, buf []byte, deadline time.Time, readfull bool) error {
	// refuse new work once the watcher is shut down
	select {
	case <-w.die:
		return ErrWatcherClosed
	default:
	}

	if conn == nil || reflect.TypeOf(conn).Kind() != reflect.Ptr {
		return ErrUnsupported
	}
	ptr := reflect.ValueOf(conn).Pointer()

	// reuse a pooled control block, overwriting every field
	cb := aiocbPool.Get().(*aiocb)
	*cb = aiocb{
		op:       op,
		ptr:      ptr,
		ctx:      ctx,
		conn:     conn,
		buffer:   buf,
		deadline: deadline,
		readFull: readfull,
		idx:      -1, // not in the timeout heap yet
	}

	w.pendingMutex.Lock()
	w.pendingCreate = append(w.pendingCreate, cb)
	w.pendingMutex.Unlock()

	w.notifyPending()
	return nil
}
// tryRead performs one non-blocking read attempt for 'pcb' on 'fd'.
//
// It returns true when the request is finished (data read, EOF, or an error
// recorded in pcb.err) and false when it has to stay queued: either the read
// returned EAGAIN, or this is a read-full request whose buffer is not yet
// filled. tryRead does not deliver the result itself; the caller does.
//
// When the request carries no buffer, the data is read into the unused tail
// of the current swap buffer, or into the aiocb's own backBuffer when the
// swap buffer is used up.
// NOTE(review): a non-nil but zero-length pcb.buffer is read into as-is; the
// resulting zero-byte read would be reported as io.EOF — confirm intended.
func (w *watcher) tryRead(fd int, pcb *aiocb) bool {
buf := pcb.buffer
useSwap := false
backBuffer := false
if buf == nil { // no caller buffer: use the swap buffer or the aiocb's backBuffer
// WaitIO sets shouldSwap once the previous results were handed out;
// rotate the three buffers and start filling the new front from offset 0
if atomic.CompareAndSwapInt32(&w.shouldSwap, 1, 0) {
w.swapBufferFront, w.swapBufferMiddle, w.swapBufferBack = w.swapBufferMiddle, w.swapBufferBack, w.swapBufferFront
w.bufferOffset = 0
}
// remaining free space of the front buffer
buf = w.swapBufferFront[w.bufferOffset:]
if len(buf) > 0 {
useSwap = true
} else {
// front buffer exhausted: fall back to the buffer embedded in the aiocb
backBuffer = true
buf = pcb.backBuffer[:]
}
}
for {
// continue after the bytes already read by earlier attempts
nr, er := rawRead(fd, buf[pcb.size:])
// nothing to read right now: keep the request queued
if er == syscall.EAGAIN {
return false
}
// On MacOS we can see EINTR here if the user
// pressed ^Z; simply retry the read.
if er == syscall.EINTR {
continue
}
// if er is nil, accumulate bytes read
if er == nil {
pcb.size += nr
}
pcb.err = er
// a successful read of zero bytes is reported as EOF
if nr == 0 && er == nil {
pcb.err = io.EOF
}
break
}
if pcb.readFull { // read-full request: done only on error or when the buffer is full
if pcb.err != nil {
return true
}
if pcb.size == len(pcb.buffer) {
return true
}
return false
}
if useSwap { // IO completed with the internal swap buffer
pcb.useSwap = true
pcb.buffer = buf[:pcb.size] // expose exactly the bytes read
// reserve the consumed region so the next internal read does not overwrite it
w.bufferOffset += pcb.size
} else if backBuffer { // swap buffer exhausted: expose the whole backBuffer, pcb.size tells how much is valid
pcb.buffer = buf
}
return true
}
// tryWrite performs one non-blocking write attempt for 'pcb' on 'fd',
// continuing after the bytes written by earlier attempts.
//
// It returns true when the request is finished — the whole buffer has been
// written or an error other than EAGAIN/EINTR occurred — and false when it
// has to stay queued (EAGAIN, or only part of the buffer was written).
// pcb.err is overwritten with the result of every write attempt, including
// EAGAIN. A request with a nil buffer performs no write at all.
func (w *watcher) tryWrite(fd int, pcb *aiocb) bool {
	if pcb.buffer == nil {
		// nothing to write: finished as long as no bytes are outstanding
		return pcb.size == len(pcb.buffer)
	}

	for {
		n, err := rawWrite(fd, pcb.buffer[pcb.size:])
		pcb.err = err

		if err == syscall.EAGAIN {
			// socket not writable right now
			return false
		}
		if err == syscall.EINTR {
			// interrupted by a signal, retry
			continue
		}
		if err != nil {
			return true
		}

		// successful write: account for the bytes and check for completion
		pcb.size += n
		return pcb.size == len(pcb.buffer)
	}
}
// releaseConn releases everything the watcher holds for the descriptor
// 'ident': every queued read and write request is completed with
// io.ErrClosedPipe, the descriptor is removed from the lookup tables, and the
// duplicated socket fd is closed. Unknown descriptors are ignored.
func (w *watcher) releaseConn(ident int) {
	desc, found := w.descs[ident]
	if !found {
		return
	}

	// abort fails every request of one queue; deliver also takes the
	// request out of the timeout heap when it is in there
	abort := func(queue *list.List) {
		for e := queue.Front(); e != nil; e = e.Next() {
			cb := e.Value.(*aiocb)
			cb.err = io.ErrClosedPipe
			w.deliver(cb)
		}
	}
	abort(&desc.readers)
	abort(&desc.writers)

	delete(w.descs, ident)
	delete(w.connIdents, desc.ptr)

	// close socket file descriptor duplicated from net.Conn
	syscall.Close(ident)
}
// deliver hands a finished request over to WaitIO through chResults.
// A request whose heap index is not -1 is removed from the timeout heap
// first. The send blocks while chResults is full and is abandoned once the
// watcher is closed.
func (w *watcher) deliver(pcb *aiocb) {
	if timed := pcb.idx != -1; timed {
		heap.Remove(&w.timeouts, pcb.idx)
	}

	select {
	case <-w.die:
	case w.chResults <- pcb:
	}
}
// loop is the core event loop of the watcher. It runs in its own goroutine
// until the watcher is closed and serializes all work on the descriptor
// tables: newly submitted requests, poller events, deadline expiry, release
// of garbage-collected connections and CPU-affinity changes.
func (w *watcher) loop() {
	// on exit, fail every queued request and close every duplicated fd
	defer func() {
		for fd := range w.descs {
			w.releaseConn(fd)
		}
	}()

	for {
		select {
		case <-w.die:
			return

		case <-w.chPendingNotify:
			// take over the submitted requests and give the submitters the
			// (emptied) slice processed in the previous round
			w.pendingMutex.Lock()
			w.pendingCreate, w.pendingProcessing = w.pendingProcessing[:0], w.pendingCreate
			w.pendingMutex.Unlock()
			w.handlePending(w.pendingProcessing)

		case events := <-w.chEventNotify: // poller events
			w.handleEvents(events)

		case <-w.timer.C: // deadline expiry
			for w.timeouts.Len() > 0 {
				now := time.Now()
				head := w.timeouts[0]
				if !now.After(head.deadline) {
					// earliest deadline is still ahead: sleep until then
					w.timer.Reset(head.deadline.Sub(now))
					break
				}
				// expired: take it out of its descriptor queue and report it;
				// deliver removes it from the heap
				head.err = ErrDeadline
				head.l.Remove(head.elem)
				w.deliver(head)
			}

		case <-w.gcNotify: // connections reported by their finalizers
			w.gcMutex.Lock()
			for i := range w.gc {
				ptr := reflect.ValueOf(w.gc[i]).Pointer()
				// a collected conn cannot be referenced by any queued
				// request, so there is nothing to report to the user;
				// releasing the descriptor is enough
				if ident, ok := w.connIdents[ptr]; ok {
					w.releaseConn(ident)
				}
				w.gc[i] = nil
			}
			w.gc = w.gc[:0]
			w.gcMutex.Unlock()

		case id := <-w.chCPUID:
			setAffinity(id)
		}
	}
}
// handlePending processes the requests submitted since the last round.
//
// For each request it
//   - releases the connection when the request is an opDelete; a delete for
//     a connection the watcher does not know is ignored,
//   - registers the connection on first use: the socket is duplicated, the
//     original conn is closed, the duplicate is added to the poller and a
//     finalizer is installed on the conn so its descriptor is released once
//     the conn is garbage collected,
//   - tries the IO immediately when no earlier request of the same kind is
//     queued, delivering the result right away on completion,
//   - otherwise queues the request, re-arms the poller and, for requests with
//     a deadline, pushes it onto the timeout heap.
//
// Failures to duplicate or watch the socket are delivered to the caller as
// the result of the request.
func (w *watcher) handlePending(pending []*aiocb) {
	for _, pcb := range pending {
		ident, ok := w.connIdents[pcb.ptr]

		// resource releasing operation; it must never fall through to the
		// registration below, which would duplicate and watch a socket just
		// because the caller asked to free it
		if pcb.op == opDelete {
			if ok {
				w.releaseConn(ident)
			}
			continue
		}

		// handling new connection
		var desc *fdDesc
		if ok {
			desc = w.descs[ident]
		} else {
			dupfd, err := dupconn(pcb.conn)
			if err != nil {
				// unexpected situation, should notify caller if we cannot dup(2)
				pcb.err = err
				w.deliver(pcb)
				continue
			}

			// as we duplicated successfully, we're safe to
			// close the original connection
			pcb.conn.Close()

			if werr := w.pfd.Watch(dupfd); werr != nil {
				// the duplicate is not tracked anywhere yet, so close it
				// here or it would leak
				syscall.Close(dupfd)
				pcb.err = werr
				w.deliver(pcb)
				continue
			}

			// file description bindings
			ident = dupfd
			desc = &fdDesc{ptr: pcb.ptr}
			w.descs[ident] = desc
			w.connIdents[pcb.ptr] = ident

			// the conn is still useful for GC finalizer.
			// note finalizer function cannot hold reference to net.Conn,
			// if not it will never be GC-ed.
			runtime.SetFinalizer(pcb.conn, func(c net.Conn) {
				w.gcMutex.Lock()
				w.gc = append(w.gc, c)
				w.gcMutex.Unlock()

				// notify gc processor
				select {
				case w.gcNotify <- struct{}{}:
				default:
				}
			})
		}

		// operations are split into per-direction queues
		switch pcb.op {
		case OpRead:
			// try immediately if no earlier read is waiting
			if desc.readers.Len() == 0 && w.tryRead(ident, pcb) {
				w.deliver(pcb)
				continue
			}
			// enqueue for poller events
			pcb.l = &desc.readers
			pcb.elem = pcb.l.PushBack(pcb)
			desc.r_armed = true
		case OpWrite:
			// try immediately if no earlier write is waiting
			if desc.writers.Len() == 0 && w.tryWrite(ident, pcb) {
				w.deliver(pcb)
				continue
			}
			pcb.l = &desc.writers
			pcb.elem = pcb.l.PushBack(pcb)
			desc.w_armed = true
		}

		// try rearm descriptor
		w.pfd.Rearm(ident, desc.r_armed, desc.w_armed)

		// push to heap for timeout operation
		if !pcb.deadline.IsZero() {
			heap.Push(&w.timeouts, pcb)
			// re-arm the timer whenever this request became the earliest
			// deadline, not only when the heap was empty; otherwise it would
			// not expire before the previously earliest deadline fires
			if w.timeouts[0] == pcb {
				w.timer.Reset(time.Until(pcb.deadline))
			}
		}
	}
}
// handleEvents serves the readiness events reported by the poller.
//
// For every readable/writable descriptor the queued requests are tried in
// order from the front of the queue; each finished request is delivered and
// removed, and processing of that queue stops at the first request that
// cannot finish yet. The descriptor is re-armed for every direction that
// still has requests queued.
//
// Why the watcher works on duplicated descriptors: if a polled fd were closed
// by conn.Close() from outside and a new conn re-opened with the same fd
// number, reading or writing on it would hit the wrong connection (the poller
// silently drops closed fds, see epoll(7) and kqueue(2)). The watcher
// therefore dup()s its own fd from net.Conn, uniquely identified by the
// event's 'ident', and performs all IO on that fd only. Events for
// descriptors that are no longer registered are ignored.
func (w *watcher) handleEvents(pe pollerEvents) {
	for _, event := range pe {
		desc, known := w.descs[event.ident]
		if !known {
			continue
		}

		if event.ev&EV_READ != 0 {
			queue := &desc.readers
			for elem := queue.Front(); elem != nil; elem = queue.Front() {
				pcb := elem.Value.(*aiocb)
				if !w.tryRead(event.ident, pcb) {
					break
				}
				w.deliver(pcb)
				queue.Remove(elem)
			}
			// keep read interest only while requests remain
			desc.r_armed = queue.Len() > 0
		}

		if event.ev&EV_WRITE != 0 {
			queue := &desc.writers
			for elem := queue.Front(); elem != nil; elem = queue.Front() {
				pcb := elem.Value.(*aiocb)
				if !w.tryWrite(event.ident, pcb) {
					break
				}
				w.deliver(pcb)
				queue.Remove(elem)
			}
			// keep write interest only while requests remain
			desc.w_armed = queue.Len() > 0
		}

		if desc.r_armed || desc.w_armed {
			w.pfd.Rearm(event.ident, desc.r_armed, desc.w_armed)
		}
	}
}