-
-
Notifications
You must be signed in to change notification settings - Fork 2.3k
/
Futex.zig
1064 lines (912 loc) · 42.2 KB
/
Futex.zig
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//! Futex is a mechanism used to block (`wait`) and unblock (`wake`) threads using a 32bit memory address as hints.
//! Blocking a thread is acknowledged only if the 32bit memory address is equal to a given value.
//! This check helps avoid block/unblock deadlocks which occur if a `wake()` happens before a `wait()`.
//! Using Futex, other Thread synchronization primitives can be built which efficiently wait for cross-thread events or signals.
const std = @import("../std.zig");
const builtin = @import("builtin");
const Futex = @This();
const os = std.os;
const assert = std.debug.assert;
const testing = std.testing;
const Atomic = std.atomic.Atomic;
/// Checks if `ptr` still contains the value `expect` and, if so, blocks the caller until either:
/// - The value at `ptr` is no longer equal to `expect`.
/// - The caller is unblocked by a matching `wake()`.
/// - The caller is unblocked spuriously ("at random").
///
/// The checking of `ptr` and `expect`, along with blocking the caller, is done atomically
/// and totally ordered (sequentially consistent) with respect to other wait()/wake() calls on the same `ptr`.
pub fn wait(ptr: *const Atomic(u32), expect: u32) void {
@setCold(true);
Impl.wait(ptr, expect, null) catch |err| switch (err) {
error.Timeout => unreachable, // null timeout meant to wait forever
};
}
/// Checks if `ptr` still contains the value `expect` and, if so, blocks the caller until either:
/// - The value at `ptr` is no longer equal to `expect`.
/// - The caller is unblocked by a matching `wake()`.
/// - The caller is unblocked spuriously ("at random").
/// - The caller blocks for longer than the given timeout. In which case, `error.Timeout` is returned.
///
/// The checking of `ptr` and `expect`, along with blocking the caller, is done atomically
/// and totally ordered (sequentially consistent) with respect to other wait()/wake() calls on the same `ptr`.
pub fn timedWait(ptr: *const Atomic(u32), expect: u32, timeout_ns: u64) error{Timeout}!void {
@setCold(true);
// Avoid calling into the OS for no-op timeouts.
if (timeout_ns == 0) {
if (ptr.load(.SeqCst) != expect) return;
return error.Timeout;
}
return Impl.wait(ptr, expect, timeout_ns);
}
/// Unblocks at most `max_waiters` callers blocked in a `wait()` call on `ptr`.
pub fn wake(ptr: *const Atomic(u32), max_waiters: u32) void {
@setCold(true);
// Avoid calling into the OS if there's nothing to wake up.
if (max_waiters == 0) {
return;
}
Impl.wake(ptr, max_waiters);
}
const Impl = if (builtin.single_threaded)
SingleThreadedImpl
else if (builtin.os.tag == .windows)
WindowsImpl
else if (builtin.os.tag.isDarwin())
DarwinImpl
else if (builtin.os.tag == .linux)
LinuxImpl
else if (builtin.os.tag == .freebsd)
FreebsdImpl
else if (builtin.os.tag == .openbsd)
OpenbsdImpl
else if (builtin.os.tag == .dragonfly)
DragonflyImpl
else if (builtin.target.isWasm())
WasmImpl
else if (std.Thread.use_pthreads)
PosixImpl
else
UnsupportedImpl;
/// We can't do @compileError() in the `Impl` switch statement above as its eagerly evaluated.
/// So instead, we @compileError() on the methods themselves for platforms which don't support futex.
const UnsupportedImpl = struct {
fn wait(ptr: *const Atomic(u32), expect: u32, timeout: ?u64) error{Timeout}!void {
return unsupported(.{ ptr, expect, timeout });
}
fn wake(ptr: *const Atomic(u32), max_waiters: u32) void {
return unsupported(.{ ptr, max_waiters });
}
fn unsupported(unused: anytype) noreturn {
_ = unused;
@compileError("Unsupported operating system " ++ @tagName(builtin.target.os.tag));
}
};
const SingleThreadedImpl = struct {
fn wait(ptr: *const Atomic(u32), expect: u32, timeout: ?u64) error{Timeout}!void {
if (ptr.loadUnchecked() != expect) {
return;
}
// There are no threads to wake us up.
// So if we wait without a timeout we would never wake up.
const delay = timeout orelse {
unreachable; // deadlock detected
};
std.time.sleep(delay);
return error.Timeout;
}
fn wake(ptr: *const Atomic(u32), max_waiters: u32) void {
// There are no other threads to possibly wake up
_ = ptr;
_ = max_waiters;
}
};
// We use WaitOnAddress through NtDll instead of API-MS-Win-Core-Synch-l1-2-0.dll
// as it's generally already a linked target and is autoloaded into all processes anyway.
const WindowsImpl = struct {
fn wait(ptr: *const Atomic(u32), expect: u32, timeout: ?u64) error{Timeout}!void {
var timeout_value: os.windows.LARGE_INTEGER = undefined;
var timeout_ptr: ?*const os.windows.LARGE_INTEGER = null;
// NTDLL functions work with time in units of 100 nanoseconds.
// Positive values are absolute deadlines while negative values are relative durations.
if (timeout) |delay| {
timeout_value = @as(os.windows.LARGE_INTEGER, @intCast(delay / 100));
timeout_value = -timeout_value;
timeout_ptr = &timeout_value;
}
const rc = os.windows.ntdll.RtlWaitOnAddress(
@as(?*const anyopaque, @ptrCast(ptr)),
@as(?*const anyopaque, @ptrCast(&expect)),
@sizeOf(@TypeOf(expect)),
timeout_ptr,
);
switch (rc) {
.SUCCESS => {},
.TIMEOUT => {
assert(timeout != null);
return error.Timeout;
},
else => unreachable,
}
}
fn wake(ptr: *const Atomic(u32), max_waiters: u32) void {
const address = @as(?*const anyopaque, @ptrCast(ptr));
assert(max_waiters != 0);
switch (max_waiters) {
1 => os.windows.ntdll.RtlWakeAddressSingle(address),
else => os.windows.ntdll.RtlWakeAddressAll(address),
}
}
};
const DarwinImpl = struct {
fn wait(ptr: *const Atomic(u32), expect: u32, timeout: ?u64) error{Timeout}!void {
// Darwin XNU 7195.50.7.100.1 introduced __ulock_wait2 and migrated code paths (notably pthread_cond_t) towards it:
// https://github.com/apple/darwin-xnu/commit/d4061fb0260b3ed486147341b72468f836ed6c8f#diff-08f993cc40af475663274687b7c326cc6c3031e0db3ac8de7b24624610616be6
//
// This XNU version appears to correspond to 11.0.1:
// https://kernelshaman.blogspot.com/2021/01/building-xnu-for-macos-big-sur-1101.html
//
// ulock_wait() uses 32-bit micro-second timeouts where 0 = INFINITE or no-timeout
// ulock_wait2() uses 64-bit nano-second timeouts (with the same convention)
const supports_ulock_wait2 = builtin.target.os.version_range.semver.min.major >= 11;
var timeout_ns: u64 = 0;
if (timeout) |delay| {
assert(delay != 0); // handled by timedWait()
timeout_ns = delay;
}
// If we're using `__ulock_wait` and `timeout` is too big to fit inside a `u32` count of
// micro-seconds (around 70min), we'll request a shorter timeout. This is fine (users
// should handle spurious wakeups), but we need to remember that we did so, so that
// we don't return `Timeout` incorrectly. If that happens, we set this variable to
// true so that we we know to ignore the ETIMEDOUT result.
var timeout_overflowed = false;
const addr = @as(*const anyopaque, @ptrCast(ptr));
const flags = os.darwin.UL_COMPARE_AND_WAIT | os.darwin.ULF_NO_ERRNO;
const status = blk: {
if (supports_ulock_wait2) {
break :blk os.darwin.__ulock_wait2(flags, addr, expect, timeout_ns, 0);
}
const timeout_us = std.math.cast(u32, timeout_ns / std.time.ns_per_us) orelse overflow: {
timeout_overflowed = true;
break :overflow std.math.maxInt(u32);
};
break :blk os.darwin.__ulock_wait(flags, addr, expect, timeout_us);
};
if (status >= 0) return;
switch (@as(std.os.E, @enumFromInt(-status))) {
// Wait was interrupted by the OS or other spurious signalling.
.INTR => {},
// Address of the futex was paged out. This is unlikely, but possible in theory, and
// pthread/libdispatch on darwin bother to handle it. In this case we'll return
// without waiting, but the caller should retry anyway.
.FAULT => {},
// Only report Timeout if we didn't have to cap the timeout
.TIMEDOUT => {
assert(timeout != null);
if (!timeout_overflowed) return error.Timeout;
},
else => unreachable,
}
}
fn wake(ptr: *const Atomic(u32), max_waiters: u32) void {
var flags: u32 = os.darwin.UL_COMPARE_AND_WAIT | os.darwin.ULF_NO_ERRNO;
if (max_waiters > 1) {
flags |= os.darwin.ULF_WAKE_ALL;
}
while (true) {
const addr = @as(*const anyopaque, @ptrCast(ptr));
const status = os.darwin.__ulock_wake(flags, addr, 0);
if (status >= 0) return;
switch (@as(std.os.E, @enumFromInt(-status))) {
.INTR => continue, // spurious wake()
.FAULT => unreachable, // __ulock_wake doesn't generate EFAULT according to darwin pthread_cond_t
.NOENT => return, // nothing was woken up
.ALREADY => unreachable, // only for ULF_WAKE_THREAD
else => unreachable,
}
}
}
};
// https://man7.org/linux/man-pages/man2/futex.2.html
const LinuxImpl = struct {
fn wait(ptr: *const Atomic(u32), expect: u32, timeout: ?u64) error{Timeout}!void {
var ts: os.timespec = undefined;
if (timeout) |timeout_ns| {
ts.tv_sec = @as(@TypeOf(ts.tv_sec), @intCast(timeout_ns / std.time.ns_per_s));
ts.tv_nsec = @as(@TypeOf(ts.tv_nsec), @intCast(timeout_ns % std.time.ns_per_s));
}
const rc = os.linux.futex_wait(
@as(*const i32, @ptrCast(&ptr.value)),
os.linux.FUTEX.PRIVATE_FLAG | os.linux.FUTEX.WAIT,
@as(i32, @bitCast(expect)),
if (timeout != null) &ts else null,
);
switch (os.linux.getErrno(rc)) {
.SUCCESS => {}, // notified by `wake()`
.INTR => {}, // spurious wakeup
.AGAIN => {}, // ptr.* != expect
.TIMEDOUT => {
assert(timeout != null);
return error.Timeout;
},
.INVAL => {}, // possibly timeout overflow
.FAULT => unreachable, // ptr was invalid
else => unreachable,
}
}
fn wake(ptr: *const Atomic(u32), max_waiters: u32) void {
const rc = os.linux.futex_wake(
@as(*const i32, @ptrCast(&ptr.value)),
os.linux.FUTEX.PRIVATE_FLAG | os.linux.FUTEX.WAKE,
std.math.cast(i32, max_waiters) orelse std.math.maxInt(i32),
);
switch (os.linux.getErrno(rc)) {
.SUCCESS => {}, // successful wake up
.INVAL => {}, // invalid futex_wait() on ptr done elsewhere
.FAULT => {}, // pointer became invalid while doing the wake
else => unreachable,
}
}
};
// https://www.freebsd.org/cgi/man.cgi?query=_umtx_op&sektion=2&n=1
const FreebsdImpl = struct {
fn wait(ptr: *const Atomic(u32), expect: u32, timeout: ?u64) error{Timeout}!void {
var tm_size: usize = 0;
var tm: os.freebsd._umtx_time = undefined;
var tm_ptr: ?*const os.freebsd._umtx_time = null;
if (timeout) |timeout_ns| {
tm_ptr = &tm;
tm_size = @sizeOf(@TypeOf(tm));
tm._flags = 0; // use relative time not UMTX_ABSTIME
tm._clockid = os.CLOCK.MONOTONIC;
tm._timeout.tv_sec = @as(@TypeOf(tm._timeout.tv_sec), @intCast(timeout_ns / std.time.ns_per_s));
tm._timeout.tv_nsec = @as(@TypeOf(tm._timeout.tv_nsec), @intCast(timeout_ns % std.time.ns_per_s));
}
const rc = os.freebsd._umtx_op(
@intFromPtr(&ptr.value),
@intFromEnum(os.freebsd.UMTX_OP.WAIT_UINT_PRIVATE),
@as(c_ulong, expect),
tm_size,
@intFromPtr(tm_ptr),
);
switch (os.errno(rc)) {
.SUCCESS => {},
.FAULT => unreachable, // one of the args points to invalid memory
.INVAL => unreachable, // arguments should be correct
.TIMEDOUT => {
assert(timeout != null);
return error.Timeout;
},
.INTR => {}, // spurious wake
else => unreachable,
}
}
fn wake(ptr: *const Atomic(u32), max_waiters: u32) void {
const rc = os.freebsd._umtx_op(
@intFromPtr(&ptr.value),
@intFromEnum(os.freebsd.UMTX_OP.WAKE_PRIVATE),
@as(c_ulong, max_waiters),
0, // there is no timeout struct
0, // there is no timeout struct pointer
);
switch (os.errno(rc)) {
.SUCCESS => {},
.FAULT => {}, // it's ok if the ptr doesn't point to valid memory
.INVAL => unreachable, // arguments should be correct
else => unreachable,
}
}
};
// https://man.openbsd.org/futex.2
const OpenbsdImpl = struct {
fn wait(ptr: *const Atomic(u32), expect: u32, timeout: ?u64) error{Timeout}!void {
var ts: os.timespec = undefined;
if (timeout) |timeout_ns| {
ts.tv_sec = @as(@TypeOf(ts.tv_sec), @intCast(timeout_ns / std.time.ns_per_s));
ts.tv_nsec = @as(@TypeOf(ts.tv_nsec), @intCast(timeout_ns % std.time.ns_per_s));
}
const rc = os.openbsd.futex(
@as(*const volatile u32, @ptrCast(&ptr.value)),
os.openbsd.FUTEX_WAIT | os.openbsd.FUTEX_PRIVATE_FLAG,
@as(c_int, @bitCast(expect)),
if (timeout != null) &ts else null,
null, // FUTEX_WAIT takes no requeue address
);
switch (os.errno(rc)) {
.SUCCESS => {}, // woken up by wake
.NOSYS => unreachable, // the futex operation shouldn't be invalid
.FAULT => unreachable, // ptr was invalid
.AGAIN => {}, // ptr != expect
.INVAL => unreachable, // invalid timeout
.TIMEDOUT => {
assert(timeout != null);
return error.Timeout;
},
.INTR => {}, // spurious wake from signal
.CANCELED => {}, // spurious wake from signal with SA_RESTART
else => unreachable,
}
}
fn wake(ptr: *const Atomic(u32), max_waiters: u32) void {
const rc = os.openbsd.futex(
@as(*const volatile u32, @ptrCast(&ptr.value)),
os.openbsd.FUTEX_WAKE | os.openbsd.FUTEX_PRIVATE_FLAG,
std.math.cast(c_int, max_waiters) orelse std.math.maxInt(c_int),
null, // FUTEX_WAKE takes no timeout ptr
null, // FUTEX_WAKE takes no requeue address
);
// returns number of threads woken up.
assert(rc >= 0);
}
};
// https://man.dragonflybsd.org/?command=umtx§ion=2
const DragonflyImpl = struct {
fn wait(ptr: *const Atomic(u32), expect: u32, timeout: ?u64) error{Timeout}!void {
// Dragonfly uses a scheme where 0 timeout means wait until signaled or spurious wake.
// It's reporting of timeout's is also unrealiable so we use an external timing source (Timer) instead.
var timeout_us: c_int = 0;
var timeout_overflowed = false;
var sleep_timer: std.time.Timer = undefined;
if (timeout) |delay| {
assert(delay != 0); // handled by timedWait().
timeout_us = std.math.cast(c_int, delay / std.time.ns_per_us) orelse blk: {
timeout_overflowed = true;
break :blk std.math.maxInt(c_int);
};
// Only need to record the start time if we can provide somewhat accurate error.Timeout's
if (!timeout_overflowed) {
sleep_timer = std.time.Timer.start() catch unreachable;
}
}
const value = @as(c_int, @bitCast(expect));
const addr = @as(*const volatile c_int, @ptrCast(&ptr.value));
const rc = os.dragonfly.umtx_sleep(addr, value, timeout_us);
switch (os.errno(rc)) {
.SUCCESS => {},
.BUSY => {}, // ptr != expect
.AGAIN => { // maybe timed out, or paged out, or hit 2s kernel refresh
if (timeout) |timeout_ns| {
// Report error.Timeout only if we know the timeout duration has passed.
// If not, there's not much choice other than treating it as a spurious wake.
if (!timeout_overflowed and sleep_timer.read() >= timeout_ns) {
return error.Timeout;
}
}
},
.INTR => {}, // spurious wake
.INVAL => unreachable, // invalid timeout
else => unreachable,
}
}
fn wake(ptr: *const Atomic(u32), max_waiters: u32) void {
// A count of zero means wake all waiters.
assert(max_waiters != 0);
const to_wake = std.math.cast(c_int, max_waiters) orelse 0;
// https://man.dragonflybsd.org/?command=umtx§ion=2
// > umtx_wakeup() will generally return 0 unless the address is bad.
// We are fine with the address being bad (e.g. for Semaphore.post() where Semaphore.wait() frees the Semaphore)
const addr = @as(*const volatile c_int, @ptrCast(&ptr.value));
_ = os.dragonfly.umtx_wakeup(addr, to_wake);
}
};
const WasmImpl = struct {
fn wait(ptr: *const Atomic(u32), expect: u32, timeout: ?u64) error{Timeout}!void {
if (!comptime std.Target.wasm.featureSetHas(builtin.target.cpu.features, .atomics)) {
@compileError("WASI target missing cpu feature 'atomics'");
}
const to: i64 = if (timeout) |to| @intCast(to) else -1;
const result = asm (
\\local.get %[ptr]
\\local.get %[expected]
\\local.get %[timeout]
\\memory.atomic.wait32 0
\\local.set %[ret]
: [ret] "=r" (-> u32),
: [ptr] "r" (&ptr.value),
[expected] "r" (@as(i32, @bitCast(expect))),
[timeout] "r" (to),
);
switch (result) {
0 => {}, // ok
1 => {}, // expected =! loaded
2 => return error.Timeout,
else => unreachable,
}
}
fn wake(ptr: *const Atomic(u32), max_waiters: u32) void {
if (!comptime std.Target.wasm.featureSetHas(builtin.target.cpu.features, .atomics)) {
@compileError("WASI target missing cpu feature 'atomics'");
}
assert(max_waiters != 0);
const woken_count = asm (
\\local.get %[ptr]
\\local.get %[waiters]
\\memory.atomic.notify 0
\\local.set %[ret]
: [ret] "=r" (-> u32),
: [ptr] "r" (&ptr.value),
[waiters] "r" (max_waiters),
);
_ = woken_count; // can be 0 when linker flag 'shared-memory' is not enabled
}
};
/// Modified version of linux's futex and Go's sema to implement userspace wait queues with pthread:
/// https://code.woboq.org/linux/linux/kernel/futex.c.html
/// https://go.dev/src/runtime/sema.go
const PosixImpl = struct {
const Event = struct {
cond: std.c.pthread_cond_t,
mutex: std.c.pthread_mutex_t,
state: enum { empty, waiting, notified },
fn init(self: *Event) void {
// Use static init instead of pthread_cond/mutex_init() since this is generally faster.
self.cond = .{};
self.mutex = .{};
self.state = .empty;
}
fn deinit(self: *Event) void {
// Some platforms reportedly give EINVAL for statically initialized pthread types.
const rc = std.c.pthread_cond_destroy(&self.cond);
assert(rc == .SUCCESS or rc == .INVAL);
const rm = std.c.pthread_mutex_destroy(&self.mutex);
assert(rm == .SUCCESS or rm == .INVAL);
self.* = undefined;
}
fn wait(self: *Event, timeout: ?u64) error{Timeout}!void {
assert(std.c.pthread_mutex_lock(&self.mutex) == .SUCCESS);
defer assert(std.c.pthread_mutex_unlock(&self.mutex) == .SUCCESS);
// Early return if the event was already set.
if (self.state == .notified) {
return;
}
// Compute the absolute timeout if one was specified.
// POSIX requires that REALTIME is used by default for the pthread timedwait functions.
// This can be changed with pthread_condattr_setclock, but it's an extension and may not be available everywhere.
var ts: os.timespec = undefined;
if (timeout) |timeout_ns| {
os.clock_gettime(os.CLOCK.REALTIME, &ts) catch unreachable;
ts.tv_sec +|= @as(@TypeOf(ts.tv_sec), @intCast(timeout_ns / std.time.ns_per_s));
ts.tv_nsec += @as(@TypeOf(ts.tv_nsec), @intCast(timeout_ns % std.time.ns_per_s));
if (ts.tv_nsec >= std.time.ns_per_s) {
ts.tv_sec +|= 1;
ts.tv_nsec -= std.time.ns_per_s;
}
}
// Start waiting on the event - there can be only one thread waiting.
assert(self.state == .empty);
self.state = .waiting;
while (true) {
// Block using either pthread_cond_wait or pthread_cond_timewait if there's an absolute timeout.
const rc = blk: {
if (timeout == null) break :blk std.c.pthread_cond_wait(&self.cond, &self.mutex);
break :blk std.c.pthread_cond_timedwait(&self.cond, &self.mutex, &ts);
};
// After waking up, check if the event was set.
if (self.state == .notified) {
return;
}
assert(self.state == .waiting);
switch (rc) {
.SUCCESS => {},
.TIMEDOUT => {
// If timed out, reset the event to avoid the set() thread doing an unnecessary signal().
self.state = .empty;
return error.Timeout;
},
.INVAL => unreachable, // cond, mutex, and potentially ts should all be valid
.PERM => unreachable, // mutex is locked when cond_*wait() functions are called
else => unreachable,
}
}
}
fn set(self: *Event) void {
assert(std.c.pthread_mutex_lock(&self.mutex) == .SUCCESS);
defer assert(std.c.pthread_mutex_unlock(&self.mutex) == .SUCCESS);
// Make sure that multiple calls to set() were not done on the same Event.
const old_state = self.state;
assert(old_state != .notified);
// Mark the event as set and wake up the waiting thread if there was one.
// This must be done while the mutex as the wait() thread could deallocate
// the condition variable once it observes the new state, potentially causing a UAF if done unlocked.
self.state = .notified;
if (old_state == .waiting) {
assert(std.c.pthread_cond_signal(&self.cond) == .SUCCESS);
}
}
};
const Treap = std.Treap(usize, std.math.order);
const Waiter = struct {
node: Treap.Node,
prev: ?*Waiter,
next: ?*Waiter,
tail: ?*Waiter,
is_queued: bool,
event: Event,
};
// An unordered set of Waiters
const WaitList = struct {
top: ?*Waiter = null,
len: usize = 0,
fn push(self: *WaitList, waiter: *Waiter) void {
waiter.next = self.top;
self.top = waiter;
self.len += 1;
}
fn pop(self: *WaitList) ?*Waiter {
const waiter = self.top orelse return null;
self.top = waiter.next;
self.len -= 1;
return waiter;
}
};
const WaitQueue = struct {
fn insert(treap: *Treap, address: usize, waiter: *Waiter) void {
// prepare the waiter to be inserted.
waiter.next = null;
waiter.is_queued = true;
// Find the wait queue entry associated with the address.
// If there isn't a wait queue on the address, this waiter creates the queue.
var entry = treap.getEntryFor(address);
const entry_node = entry.node orelse {
waiter.prev = null;
waiter.tail = waiter;
entry.set(&waiter.node);
return;
};
// There's a wait queue on the address; get the queue head and tail.
const head = @fieldParentPtr(Waiter, "node", entry_node);
const tail = head.tail orelse unreachable;
// Push the waiter to the tail by replacing it and linking to the previous tail.
head.tail = waiter;
tail.next = waiter;
waiter.prev = tail;
}
fn remove(treap: *Treap, address: usize, max_waiters: usize) WaitList {
// Find the wait queue associated with this address and get the head/tail if any.
var entry = treap.getEntryFor(address);
var queue_head = if (entry.node) |node| @fieldParentPtr(Waiter, "node", node) else null;
const queue_tail = if (queue_head) |head| head.tail else null;
// Once we're done updating the head, fix it's tail pointer and update the treap's queue head as well.
defer entry.set(blk: {
const new_head = queue_head orelse break :blk null;
new_head.tail = queue_tail;
break :blk &new_head.node;
});
var removed = WaitList{};
while (removed.len < max_waiters) {
// dequeue and collect waiters from their wait queue.
const waiter = queue_head orelse break;
queue_head = waiter.next;
removed.push(waiter);
// When dequeueing, we must mark is_queued as false.
// This ensures that a waiter which calls tryRemove() returns false.
assert(waiter.is_queued);
waiter.is_queued = false;
}
return removed;
}
fn tryRemove(treap: *Treap, address: usize, waiter: *Waiter) bool {
if (!waiter.is_queued) {
return false;
}
queue_remove: {
// Find the wait queue associated with the address.
var entry = blk: {
// A waiter without a previous link means it's the queue head that's in the treap so we can avoid lookup.
if (waiter.prev == null) {
assert(waiter.node.key == address);
break :blk treap.getEntryForExisting(&waiter.node);
}
break :blk treap.getEntryFor(address);
};
// The queue head and tail must exist if we're removing a queued waiter.
const head = @fieldParentPtr(Waiter, "node", entry.node orelse unreachable);
const tail = head.tail orelse unreachable;
// A waiter with a previous link is never the head of the queue.
if (waiter.prev) |prev| {
assert(waiter != head);
prev.next = waiter.next;
// A waiter with both a previous and next link is in the middle.
// We only need to update the surrounding waiter's links to remove it.
if (waiter.next) |next| {
assert(waiter != tail);
next.prev = waiter.prev;
break :queue_remove;
}
// A waiter with a previous but no next link means it's the tail of the queue.
// In that case, we need to update the head's tail reference.
assert(waiter == tail);
head.tail = waiter.prev;
break :queue_remove;
}
// A waiter with no previous link means it's the queue head of queue.
// We must replace (or remove) the head waiter reference in the treap.
assert(waiter == head);
entry.set(blk: {
const new_head = waiter.next orelse break :blk null;
new_head.tail = head.tail;
break :blk &new_head.node;
});
}
// Mark the waiter as successfully removed.
waiter.is_queued = false;
return true;
}
};
const Bucket = struct {
mutex: std.c.pthread_mutex_t align(std.atomic.cache_line) = .{},
pending: Atomic(usize) = Atomic(usize).init(0),
treap: Treap = .{},
// Global array of buckets that addresses map to.
// Bucket array size is pretty much arbitrary here, but it must be a power of two for fibonacci hashing.
var buckets = [_]Bucket{.{}} ** @bitSizeOf(usize);
// https://github.com/Amanieu/parking_lot/blob/1cf12744d097233316afa6c8b7d37389e4211756/core/src/parking_lot.rs#L343-L353
fn from(address: usize) *Bucket {
// The upper `@bitSizeOf(usize)` bits of the fibonacci golden ratio.
// Hashing this via (h * k) >> (64 - b) where k=golden-ration and b=bitsize-of-array
// evenly lays out h=hash values over the bit range even when the hash has poor entropy (identity-hash for pointers).
const max_multiplier_bits = @bitSizeOf(usize);
const fibonacci_multiplier = 0x9E3779B97F4A7C15 >> (64 - max_multiplier_bits);
const max_bucket_bits = @ctz(buckets.len);
comptime assert(std.math.isPowerOfTwo(buckets.len));
const index = (address *% fibonacci_multiplier) >> (max_multiplier_bits - max_bucket_bits);
return &buckets[index];
}
};
const Address = struct {
fn from(ptr: *const Atomic(u32)) usize {
// Get the alignment of the pointer.
const alignment = @alignOf(Atomic(u32));
comptime assert(std.math.isPowerOfTwo(alignment));
// Make sure the pointer is aligned,
// then cut off the zero bits from the alignment to get the unique address.
const addr = @intFromPtr(ptr);
assert(addr & (alignment - 1) == 0);
return addr >> @ctz(@as(usize, alignment));
}
};
fn wait(ptr: *const Atomic(u32), expect: u32, timeout: ?u64) error{Timeout}!void {
const address = Address.from(ptr);
const bucket = Bucket.from(address);
// Announce that there's a waiter in the bucket before checking the ptr/expect condition.
// If the announcement is reordered after the ptr check, the waiter could deadlock:
//
// - T1: checks ptr == expect which is true
// - T2: updates ptr to != expect
// - T2: does Futex.wake(), sees no pending waiters, exits
// - T1: bumps pending waiters (was reordered after the ptr == expect check)
// - T1: goes to sleep and misses both the ptr change and T2's wake up
//
// SeqCst as Acquire barrier to ensure the announcement happens before the ptr check below.
// SeqCst as shared modification order to form a happens-before edge with the fence(.SeqCst)+load() in wake().
var pending = bucket.pending.fetchAdd(1, .SeqCst);
assert(pending < std.math.maxInt(usize));
// If the wait gets cancelled, remove the pending count we previously added.
// This is done outside the mutex lock to keep the critical section short in case of contention.
var cancelled = false;
defer if (cancelled) {
pending = bucket.pending.fetchSub(1, .Monotonic);
assert(pending > 0);
};
var waiter: Waiter = undefined;
{
assert(std.c.pthread_mutex_lock(&bucket.mutex) == .SUCCESS);
defer assert(std.c.pthread_mutex_unlock(&bucket.mutex) == .SUCCESS);
cancelled = ptr.load(.Monotonic) != expect;
if (cancelled) {
return;
}
waiter.event.init();
WaitQueue.insert(&bucket.treap, address, &waiter);
}
defer {
assert(!waiter.is_queued);
waiter.event.deinit();
}
waiter.event.wait(timeout) catch {
// If we fail to cancel after a timeout, it means a wake() thread dequeued us and will wake us up.
// We must wait until the event is set as that's a signal that the wake() thread won't access the waiter memory anymore.
// If we return early without waiting, the waiter on the stack would be invalidated and the wake() thread risks a UAF.
defer if (!cancelled) waiter.event.wait(null) catch unreachable;
assert(std.c.pthread_mutex_lock(&bucket.mutex) == .SUCCESS);
defer assert(std.c.pthread_mutex_unlock(&bucket.mutex) == .SUCCESS);
cancelled = WaitQueue.tryRemove(&bucket.treap, address, &waiter);
if (cancelled) {
return error.Timeout;
}
};
}
fn wake(ptr: *const Atomic(u32), max_waiters: u32) void {
const address = Address.from(ptr);
const bucket = Bucket.from(address);
// Quick check if there's even anything to wake up.
// The change to the ptr's value must happen before we check for pending waiters.
// If not, the wake() thread could miss a sleeping waiter and have it deadlock:
//
// - T2: p = has pending waiters (reordered before the ptr update)
// - T1: bump pending waiters
// - T1: if ptr == expected: sleep()
// - T2: update ptr != expected
// - T2: p is false from earlier so doesn't wake (T1 missed ptr update and T2 missed T1 sleeping)
//
// What we really want here is a Release load, but that doesn't exist under the C11 memory model.
// We could instead do `bucket.pending.fetchAdd(0, Release) == 0` which achieves effectively the same thing,
// but the RMW operation unconditionally marks the cache-line as modified for others causing unnecessary fetching/contention.
//
// Instead we opt to do a full-fence + load instead which avoids taking ownership of the cache-line.
// fence(SeqCst) effectively converts the ptr update to SeqCst and the pending load to SeqCst: creating a Store-Load barrier.
//
// The pending count increment in wait() must also now use SeqCst for the update + this pending load
// to be in the same modification order as our load isn't using Release/Acquire to guarantee it.
bucket.pending.fence(.SeqCst);
if (bucket.pending.load(.Monotonic) == 0) {
return;
}
// Keep a list of all the waiters notified and wake then up outside the mutex critical section.
var notified = WaitList{};
defer if (notified.len > 0) {
const pending = bucket.pending.fetchSub(notified.len, .Monotonic);
assert(pending >= notified.len);
while (notified.pop()) |waiter| {
assert(!waiter.is_queued);
waiter.event.set();
}
};
assert(std.c.pthread_mutex_lock(&bucket.mutex) == .SUCCESS);
defer assert(std.c.pthread_mutex_unlock(&bucket.mutex) == .SUCCESS);
// Another pending check again to avoid the WaitQueue lookup if not necessary.
if (bucket.pending.load(.Monotonic) > 0) {
notified = WaitQueue.remove(&bucket.treap, address, max_waiters);
}
}
};
test "Futex - smoke test" {
var value = Atomic(u32).init(0);
// Try waits with invalid values.
Futex.wait(&value, 0xdeadbeef);
Futex.timedWait(&value, 0xdeadbeef, 0) catch {};
// Try timeout waits.
try testing.expectError(error.Timeout, Futex.timedWait(&value, 0, 0));
try testing.expectError(error.Timeout, Futex.timedWait(&value, 0, std.time.ns_per_ms));
// Try wakes
Futex.wake(&value, 0);
Futex.wake(&value, 1);
Futex.wake(&value, std.math.maxInt(u32));
}
test "Futex - signaling" {
// This test requires spawning threads
if (builtin.single_threaded) {
return error.SkipZigTest;
}
const num_threads = 4;
const num_iterations = 4;
const Paddle = struct {
value: Atomic(u32) = Atomic(u32).init(0),
current: u32 = 0,
fn hit(self: *@This()) void {
_ = self.value.fetchAdd(1, .Release);
Futex.wake(&self.value, 1);
}
fn run(self: *@This(), hit_to: *@This()) !void {
while (self.current < num_iterations) {
// Wait for the value to change from hit()
var new_value: u32 = undefined;
while (true) {
new_value = self.value.load(.Acquire);
if (new_value != self.current) break;
Futex.wait(&self.value, self.current);
}
// change the internal "current" value
try testing.expectEqual(new_value, self.current + 1);
self.current = new_value;
// hit the next paddle
hit_to.hit();
}
}
};
var paddles = [_]Paddle{.{}} ** num_threads;
var threads = [_]std.Thread{undefined} ** num_threads;
// Create a circle of paddles which hit each other
for (&threads, 0..) |*t, i| {
const paddle = &paddles[i];
const hit_to = &paddles[(i + 1) % paddles.len];
t.* = try std.Thread.spawn(.{}, Paddle.run, .{ paddle, hit_to });
}
// Hit the first paddle and wait for them all to complete by hitting each other for num_iterations.
paddles[0].hit();
for (threads) |t| t.join();
for (paddles) |p| try testing.expectEqual(p.current, num_iterations);
}
test "Futex - broadcasting" {
// This test requires spawning threads
if (builtin.single_threaded) {
return error.SkipZigTest;
}
const num_threads = 4;
const num_iterations = 4;
const Barrier = struct {
count: Atomic(u32) = Atomic(u32).init(num_threads),
futex: Atomic(u32) = Atomic(u32).init(0),
fn wait(self: *@This()) !void {
// Decrement the counter.
// Release ensures stuff before this barrier.wait() happens before the last one.
const count = self.count.fetchSub(1, .Release);
try testing.expect(count <= num_threads);
try testing.expect(count > 0);
// First counter to reach zero wakes all other threads.
// Acquire for the last counter ensures stuff before previous barrier.wait()s happened before it.
// Release on futex update ensures stuff before all barrier.wait()'s happens before they all return.
if (count - 1 == 0) {
_ = self.count.load(.Acquire); // TODO: could be fence(Acquire) if not for TSAN
self.futex.store(1, .Release);
Futex.wake(&self.futex, num_threads - 1);
return;
}
// Other threads wait until last counter wakes them up.
// Acquire on futex synchronizes with last barrier count to ensure stuff before all barrier.wait()'s happen before us.
while (self.futex.load(.Acquire) == 0) {
Futex.wait(&self.futex, 0);
}
}
};
const Broadcast = struct {
barriers: [num_iterations]Barrier = [_]Barrier{.{}} ** num_iterations,
threads: [num_threads]std.Thread = undefined,
fn run(self: *@This()) !void {
for (&self.barriers) |*barrier| {
try barrier.wait();
}