Skip to content

Commit 9ec0373

Browse files
committed
vgc: optimizations
1 parent 3180afa commit 9ec0373

9 files changed

Lines changed: 187 additions & 30 deletions

File tree

bench/README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,18 @@ v run bench/bench_gc.v
1616
```
1717
test                                            boehm       vgc       ratio
1818
———————————————————————————————————————————— ————————— ————————— —————————
19-
small allocs (1000000x string) 43 ms 52 ms 1.21x
20-
tree build+walk (depth=18, 10x) 46 ms 125 ms 2.72x
21-
array grow (100x 100000 pushes) 7 ms 30 ms 4.29x
19+
small allocs (1000000x string) 39 ms 48 ms 1.23x
20+
tree build+walk (depth=18, 10x) 48 ms 118 ms 2.46x
21+
array grow (100x 100000 pushes) 9 ms 26 ms 2.89x
2222
map insert (20x 10k entries) 20 ms 27 ms 1.35x
2323
mixed workload (50 rounds) 10 ms 16 ms 1.60x
2424
2525
heap usage:
26-
boehm: 29856 KB allocated, 29020 KB free
26+
boehm: 29856 KB allocated, 29296 KB free
2727
vgc: 131072 KB allocated, 0 KB free
2828
```
2929

30-
Boehm is still 1.2x-4.3x faster across these workloads and uses ~4x less heap.
30+
Boehm is still 1.2x-2.9x faster across these workloads and uses ~4x less heap.
3131

3232
## Closures
3333

thirdparty/vgc/vgc_platform.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,13 @@ static inline void vgc_set_cache_idx(int idx) { _vgc_cache_idx = idx; }
8888
static inline void vgc_os_decommit(void* ptr, size_t size) {
8989
VirtualFree(ptr, size, MEM_DECOMMIT);
9090
}
91+
static inline int vgc_num_cpus(void) {
92+
DWORD count = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
93+
return count > 0 ? (int)count : 1;
94+
}
9195
#else
9296
#include <sys/mman.h>
97+
#include <unistd.h>
9398
static inline void* vgc_os_alloc(size_t size) {
9499
void* p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
95100
return (p == MAP_FAILED) ? NULL : p;
@@ -100,6 +105,10 @@ static inline void vgc_set_cache_idx(int idx) { _vgc_cache_idx = idx; }
100105
static inline void vgc_os_decommit(void* ptr, size_t size) {
101106
madvise(ptr, size, MADV_DONTNEED);
102107
}
108+
static inline int vgc_num_cpus(void) {
109+
long count = sysconf(_SC_NPROCESSORS_ONLN);
110+
return count > 0 ? (int)count : 1;
111+
}
103112
#endif
104113

105114
// ============================================================

vlib/builtin/allocation.c.v

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,32 @@ pub fn malloc_noscan(n isize) &u8 {
133133
return res
134134
}
135135

136+
@[unsafe]
137+
fn malloc_uninit(n isize) &u8 {
138+
if n < 0 {
139+
_memory_panic(@FN, n)
140+
} else if n == 0 {
141+
return &u8(unsafe { nil })
142+
}
143+
$if vgc ? {
144+
return unsafe { &u8(vgc_malloc_typed_opts(usize(n), 0, 0, false)) }
145+
}
146+
return malloc(n)
147+
}
148+
149+
@[unsafe]
150+
fn malloc_noscan_uninit(n isize) &u8 {
151+
if n < 0 {
152+
_memory_panic(@FN, n)
153+
} else if n == 0 {
154+
return &u8(unsafe { nil })
155+
}
156+
$if vgc ? {
157+
return unsafe { &u8(vgc_malloc_noscan_opts(usize(n), false)) }
158+
}
159+
return malloc_noscan(n)
160+
}
161+
136162
@[inline]
137163
fn __at_least_one(how_many u64) u64 {
138164
// handle the case for allocating memory for empty structs, which have sizeof(EmptyStruct) == 0
@@ -409,6 +435,9 @@ pub fn memdup(src voidptr, sz isize) voidptr {
409435
if sz == 0 {
410436
return vcalloc(1)
411437
}
438+
$if vgc ? {
439+
return vgc_memdup(src, sz)
440+
}
412441
unsafe {
413442
mem := malloc(sz)
414443
return C.memcpy(mem, src, sz)
@@ -423,6 +452,9 @@ pub fn memdup_noscan(src voidptr, sz isize) voidptr {
423452
if sz == 0 {
424453
return vcalloc_noscan(1)
425454
}
455+
$if vgc ? {
456+
return vgc_memdup_noscan(src, sz)
457+
}
426458
unsafe {
427459
mem := malloc_noscan(sz)
428460
return C.memcpy(mem, src, sz)

vlib/builtin/array.v

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,14 @@ fn __new_array(mylen int, cap int, elm_size int) array {
3333
panic_on_negative_len(mylen)
3434
panic_on_negative_cap(cap)
3535
cap_ := if cap < mylen { mylen } else { cap }
36+
total_size := u64(cap_) * u64(elm_size)
3637
arr := array{
3738
element_size: elm_size
38-
data: vcalloc(u64(cap_) * u64(elm_size))
39+
data: if cap_ > 0 && mylen == 0 {
40+
unsafe { malloc_uninit(__at_least_one(total_size)) }
41+
} else {
42+
vcalloc(total_size)
43+
}
3944
len: mylen
4045
cap: cap_
4146
}
@@ -214,7 +219,7 @@ pub fn (mut a array) ensure_cap(required int) {
214219
}
215220
}
216221
new_size := u64(cap) * u64(a.element_size)
217-
new_data := unsafe { malloc(__at_least_one(new_size)) }
222+
new_data := unsafe { malloc_uninit(__at_least_one(new_size)) }
218223
if a.data != unsafe { nil } {
219224
unsafe { vmemcpy(new_data, a.data, u64(a.len) * u64(a.element_size)) }
220225
// TODO: the old data may be leaked when no GC is used (ref-counting?)

vlib/builtin/array_d_gcboehm_opt.v

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,14 @@ fn __new_array_noscan(mylen int, cap int, elm_size int) array {
99
panic_on_negative_len(mylen)
1010
panic_on_negative_cap(cap)
1111
cap_ := if cap < mylen { mylen } else { cap }
12+
total_size := u64(cap_) * u64(elm_size)
1213
arr := array{
1314
element_size: elm_size
14-
data: vcalloc_noscan(u64(cap_) * u64(elm_size))
15+
data: if cap_ > 0 && mylen == 0 {
16+
unsafe { malloc_noscan_uninit(__at_least_one(total_size)) }
17+
} else {
18+
vcalloc_noscan(total_size)
19+
}
1520
len: mylen
1621
cap: cap_
1722
}
@@ -120,7 +125,7 @@ fn (mut a array) ensure_cap_noscan(required int) {
120125
}
121126
}
122127
new_size := u64(cap) * u64(a.element_size)
123-
new_data := unsafe { malloc_noscan(__at_least_one(new_size)) }
128+
new_data := unsafe { malloc_noscan_uninit(__at_least_one(new_size)) }
124129
if a.data != unsafe { nil } {
125130
unsafe { vmemcpy(new_data, a.data, u64(a.len) * u64(a.element_size)) }
126131
// TODO: the old data may be leaked when no GC is used (ref-counting?)

vlib/builtin/vgc_d_vgc.c.v

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ fn C.vgc_init_size_tables()
3939
fn C.vgc_mutex_lock(lk &u32)
4040
fn C.vgc_mutex_unlock(lk &u32)
4141
fn C.vgc_start_thread(f voidptr)
42+
fn C.vgc_num_cpus() int
4243
fn C.vgc_addr_map_register(base usize, size usize, arena_idx int)
4344
fn C.vgc_addr_to_arena(addr usize) int
4445

@@ -210,7 +211,7 @@ pub fn vgc_init() {
210211
C.vgc_init_size_tables()
211212
vgc_heap.gc_enabled = 1
212213
vgc_heap.gc_percent = 100
213-
vgc_heap.next_gc = 4 * 1024 * 1024 // initial trigger at 4MB (like Go)
214+
vgc_heap.next_gc = 256 * 1024 * 1024 // favor throughput over early collections
214215
vgc_heap.gc_phase = vgc_phase_off
215216
// Register the main thread
216217
vgc_register_thread()
@@ -899,7 +900,32 @@ fn vgc_memdup_typed(src voidptr, n isize, ptrmap u64, ptr_words u8) voidptr {
899900
if src == unsafe { nil } || n <= 0 {
900901
return unsafe { nil }
901902
}
902-
mem := vgc_malloc_typed(usize(n), ptrmap, ptr_words)
903+
mem := vgc_malloc_typed_opts(usize(n), ptrmap, ptr_words, false)
904+
if mem != unsafe { nil } {
905+
unsafe { C.memcpy(mem, src, n) }
906+
}
907+
return mem
908+
}
909+
910+
// Memdup variants that skip zero-fill when the destination will be overwritten.
911+
@[markused]
912+
fn vgc_memdup(src voidptr, n isize) voidptr {
913+
if src == unsafe { nil } || n <= 0 {
914+
return unsafe { nil }
915+
}
916+
mem := vgc_malloc_typed_opts(usize(n), 0, 0, false)
917+
if mem != unsafe { nil } {
918+
unsafe { C.memcpy(mem, src, n) }
919+
}
920+
return mem
921+
}
922+
923+
@[markused]
924+
fn vgc_memdup_noscan(src voidptr, n isize) voidptr {
925+
if src == unsafe { nil } || n <= 0 {
926+
return unsafe { nil }
927+
}
928+
mem := vgc_malloc_noscan_opts(usize(n), false)
903929
if mem != unsafe { nil } {
904930
unsafe { C.memcpy(mem, src, n) }
905931
}

vlib/builtin/vgc_gc_d_vgc.c.v

Lines changed: 69 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ fn vgc_scan_range(lo usize, hi usize) {
133133
mut addr := start
134134
for addr + sizeof(usize) <= hi {
135135
val := unsafe { *(&usize(voidptr(addr))) }
136-
if vgc_is_heap_ptr(val) {
136+
if val != 0 {
137137
vgc_shade(val)
138138
}
139139
addr += sizeof(usize)
@@ -143,6 +143,9 @@ fn vgc_scan_range(lo usize, hi usize) {
143143
// Shade marks an object grey (discovered but not yet scanned).
144144
// Translated from Go's shade() in mgcmark.go.
145145
fn vgc_shade(addr usize) {
146+
if addr < vgc_arena_lo || addr >= vgc_arena_hi {
147+
return
148+
}
146149
span := vgc_find_span(voidptr(addr))
147150
if span == unsafe { nil } || !span.in_use {
148151
return
@@ -174,21 +177,26 @@ fn vgc_shade(addr usize) {
174177
// Parallel mark using OS threads.
175178
// Translated from Go's gcDrain() with multiple workers.
176179
fn vgc_parallel_mark() {
177-
// Use up to 4 workers (like Go's dedicated mark workers)
178-
nworkers := if vgc_heap.ncaches < 4 { 1 } else { 4 }
180+
mut nworkers := C.vgc_num_cpus()
181+
if nworkers < 1 {
182+
nworkers = 1
183+
} else if nworkers > 4 {
184+
nworkers = 4
185+
}
179186
vgc_heap.gc_nworkers = nworkers
180187
C.vgc_atomic_store_u32(&vgc_heap.gc_workers_done, 0)
181188

182189
if nworkers <= 1 {
183-
// Single-threaded mark
184190
vgc_drain_mark_work()
185191
return
186192
}
187193

188-
// Start mark workers as OS threads
189-
for _ in 0 .. nworkers {
194+
// Start helper workers and let the current GC thread participate as well.
195+
for _ in 1 .. nworkers {
190196
C.vgc_start_thread(vgc_mark_worker)
191197
}
198+
vgc_drain_mark_work()
199+
C.vgc_atomic_add_u32(&vgc_heap.gc_workers_done, 1)
192200

193201
// Wait for all workers to finish
194202
for C.vgc_atomic_load_u32(&vgc_heap.gc_workers_done) < u32(nworkers) {
@@ -264,7 +272,7 @@ fn vgc_scan_precise(obj_addr usize, ptrmap u64, ptr_words u8) {
264272
// Read the pointer at this offset
265273
ptr_addr := obj_addr + usize(bit) * word_size
266274
val := unsafe { *(&usize(voidptr(ptr_addr))) }
267-
if val != 0 && vgc_is_heap_ptr(val) {
275+
if val != 0 {
268276
vgc_shade(val)
269277
}
270278
// Clear this bit and continue
@@ -276,8 +284,41 @@ fn vgc_scan_precise(obj_addr usize, ptrmap u64, ptr_words u8) {
276284
// Work queue (translated from Go's mgcwork.go)
277285
// ============================================================
278286

287+
@[inline]
288+
fn vgc_can_use_work_fastpath() bool {
289+
return vgc_heap.ncaches <= 1 && vgc_heap.gc_nworkers <= 1
290+
}
291+
279292
// Add a pointer to the mark work queue
280293
fn vgc_work_put(addr usize) {
294+
if vgc_can_use_work_fastpath() {
295+
mut buf := vgc_heap.work_full
296+
if buf == unsafe { nil } || buf.nobj >= 256 {
297+
mut new_buf := vgc_heap.work_empty
298+
if new_buf != unsafe { nil } {
299+
unsafe {
300+
vgc_heap.work_empty = new_buf.next
301+
}
302+
} else {
303+
new_buf = unsafe { &VGC_WorkBuf(C.vgc_os_alloc(usize(sizeof(VGC_WorkBuf)))) }
304+
if new_buf == unsafe { nil } {
305+
return
306+
}
307+
}
308+
unsafe {
309+
new_buf.nobj = 0
310+
new_buf.next = vgc_heap.work_full
311+
vgc_heap.work_full = new_buf
312+
}
313+
buf = new_buf
314+
}
315+
unsafe {
316+
buf.obj[buf.nobj] = addr
317+
buf.nobj++
318+
}
319+
return
320+
}
321+
281322
C.vgc_mutex_lock(&vgc_heap.work_lock)
282323

283324
// Get or create a work buffer
@@ -313,6 +354,23 @@ fn vgc_work_put(addr usize) {
313354

314355
// Get a pointer from the mark work queue
315356
fn vgc_work_get() usize {
357+
if vgc_can_use_work_fastpath() {
358+
mut buf := vgc_heap.work_full
359+
if buf == unsafe { nil } || buf.nobj == 0 {
360+
return 0
361+
}
362+
unsafe {
363+
buf.nobj--
364+
addr := buf.obj[buf.nobj]
365+
if buf.nobj == 0 {
366+
vgc_heap.work_full = buf.next
367+
buf.next = vgc_heap.work_empty
368+
vgc_heap.work_empty = buf
369+
}
370+
return addr
371+
}
372+
}
373+
316374
C.vgc_mutex_lock(&vgc_heap.work_lock)
317375

318376
mut buf := vgc_heap.work_full
@@ -350,12 +408,8 @@ fn vgc_write_barrier(new_val voidptr) {
350408
if new_val == unsafe { nil } {
351409
return
352410
}
353-
addr := usize(new_val)
354-
if !vgc_is_heap_ptr(addr) {
355-
return
356-
}
357411
// Shade the new pointer (mark it grey)
358-
vgc_shade(addr)
412+
vgc_shade(usize(new_val))
359413
}
360414

361415
// ============================================================
@@ -471,9 +525,9 @@ fn vgc_update_trigger() {
471525
gc_percent := u64(vgc_heap.gc_percent)
472526

473527
mut goal := marked + marked * gc_percent / 100
474-
// Minimum 4MB trigger
475-
if goal < 4 * 1024 * 1024 {
476-
goal = 4 * 1024 * 1024
528+
// Avoid very small heap goals that force frequent full cycles on bursty workloads.
529+
if goal < 256 * 1024 * 1024 {
530+
goal = 256 * 1024 * 1024
477531
}
478532
C.vgc_atomic_store_u64(&vgc_heap.next_gc, goal)
479533
}

vlib/builtin/vgc_notd_vgc.c.v

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,22 @@ fn vgc_malloc_noscan(n usize) voidptr {
1212
return unsafe { nil }
1313
}
1414

15+
fn vgc_malloc_typed_opts(n usize, ptrmap u64, ptr_words u8, zero_fill bool) voidptr {
16+
return unsafe { nil }
17+
}
18+
19+
fn vgc_malloc_noscan_opts(n usize, zero_fill bool) voidptr {
20+
return unsafe { nil }
21+
}
22+
23+
fn vgc_memdup(src voidptr, n isize) voidptr {
24+
return unsafe { nil }
25+
}
26+
27+
fn vgc_memdup_noscan(src voidptr, n isize) voidptr {
28+
return unsafe { nil }
29+
}
30+
1531
fn vgc_realloc(old_ptr voidptr, new_size usize) voidptr {
1632
return unsafe { nil }
1733
}

0 commit comments

Comments (0)