Skip to content

Commit de365a1

Browse files
authored
v2: speed up the transform stage ~15x (42s -> ~2.85s on self-compile) (#27333)
1 parent b02a629 commit de365a1

10 files changed

Lines changed: 575 additions & 60 deletions

File tree

vlib/v2/builder/builder.v

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,17 @@ fn print_rss(stage string) {
160160
if os.getenv('V2_MEM') == '' {
161161
return
162162
}
163-
bytes := runtime.used_memory() or { 0 }
164-
eprintln(' [mem] ${stage}: ${bytes / (1024 * 1024)} MB')
163+
rss := runtime.used_memory() or { 0 }
164+
$if macos {
165+
// Under -gc none nothing is freed, so `live` is monotonic and its
166+
// per-phase delta is the exact bytes that phase allocated. `peak` is
167+
// the high-water mark. Both are stable run-to-run, unlike `rss`.
168+
live, peak := darwin_live_malloc_bytes()
169+
mb := u64(1024 * 1024)
170+
eprintln(' [mem] ${stage}: live ${live / mb} MB peak ${peak / mb} MB (rss ${rss / mb} MB)')
171+
return
172+
}
173+
eprintln(' [mem] ${stage}: ${rss / (1024 * 1024)} MB')
165174
}
166175

167176
// print_heap reports retained heap size after a forced GC, in MB. Unlike

vlib/v2/builder/mem_darwin.c.v

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Copyright (c) 2020-2024 Joe Conigliaro. All rights reserved.
2+
// Use of this source code is governed by an MIT license
3+
// that can be found in the LICENSE file.
4+
module builder
5+
6+
#include <malloc/malloc.h>
7+
8+
// V-side declaration of the C struct malloc_statistics_t from <malloc/malloc.h>.
// It is the output argument of C.malloc_zone_statistics. In mem_darwin.c.v only
// size_in_use (reported as current live bytes) and max_size_in_use (reported as
// the peak) are read; blocks_in_use and size_allocated are declared but unused.
struct C.malloc_statistics_t {
9+
blocks_in_use u32
10+
size_in_use usize
11+
max_size_in_use usize
12+
size_allocated usize
13+
}
14+
15+
// Extern declarations for the two C calls used by darwin_live_malloc_bytes:
// malloc_default_zone supplies the zone handle, and malloc_zone_statistics
// writes the statistics for that zone into the struct passed as stats.
fn C.malloc_default_zone() voidptr
16+
fn C.malloc_zone_statistics(zone voidptr, stats &C.malloc_statistics_t)
17+
18+
// darwin_live_malloc_bytes returns (current live malloc bytes, peak live bytes)
19+
// from the default malloc zone. Under `-gc none` (no frees) the current value
20+
// is monotonic across phases, so per-phase deltas are the exact number of bytes
21+
// each phase allocated and never released. This is the reliable counterpart to
22+
// runtime.used_memory(), whose resident_size reading is distorted by OS paging.
23+
fn darwin_live_malloc_bytes() (u64, u64) {
// Stats struct that the C call below writes into.
24+
mut st := C.malloc_statistics_t{}
// Only the default malloc zone is queried.
25+
C.malloc_zone_statistics(C.malloc_default_zone(), &st)
// Return order is (current, peak); both usize fields are converted to u64.
26+
return u64(st.size_in_use), u64(st.max_size_in_use)
27+
}

vlib/v2/builder/transform_parallel.v

Lines changed: 137 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ module builder
66
import v2.ast
77
import v2.transformer
88
import runtime
9+
import os
10+
import time
911

1012
$if !windows {
1113
struct TransformChunkArgs {
@@ -28,6 +30,8 @@ $if !windows {
2830
fn transform_chunk_thread(arg voidptr) voidptr {
2931
a := unsafe { &TransformChunkArgs(arg) }
3032
t := unsafe { &transformer.Transformer(a.t) }
33+
wprof := os.getenv('V2_TTIME') != ''
34+
mut wsw := time.new_stopwatch()
3135
mut w := t.new_worker_clone(a.worker_idx)
3236
if unsafe { a.flat != nil } {
3337
// Streaming rehydration: rehydrate one file at a time, transform it,
@@ -52,6 +56,9 @@ $if !windows {
5256
for i := 0; i < a.files.len; i++ {
5357
result << w.transform_file_pub(a.files[i])
5458
}
59+
if wprof {
60+
eprintln(' [ttime] worker ${a.worker_idx}: ${a.files.len} files in ${wsw.elapsed().milliseconds()}ms')
61+
}
5562
unsafe {
5663
*(&[]ast.File(a.result_ptr)) = result
5764
*(&voidptr(a.worker_ptr)) = voidptr(w)
@@ -61,8 +68,17 @@ $if !windows {
6168
}
6269

6370
// transform_files_parallel runs transform_files_parallel_no_post_pass and then
// trans.post_pass over the combined result, and returns the transformed files.
// When the V2_TTIME environment variable is non-empty, the elapsed milliseconds
// of each of those two steps are printed with eprintln.
fn (mut b Builder) transform_files_parallel(mut trans transformer.Transformer) []ast.File {
71+
timing := os.getenv('V2_TTIME') != ''
72+
mut sw := time.new_stopwatch()
6473
mut result := b.transform_files_parallel_no_post_pass(mut trans)
74+
if timing {
75+
eprintln(' [ttime] (parallel) prepare+fanout: ${sw.elapsed().milliseconds()}ms')
// Start a fresh stopwatch so the post_pass figure excludes the fan-out time.
76+
sw = time.new_stopwatch()
77+
}
6578
trans.post_pass(mut result)
79+
if timing {
80+
eprintln(' [ttime] (parallel) post_pass: ${sw.elapsed().milliseconds()}ms')
81+
}
6682
return result
6783
}
6884

@@ -93,6 +109,8 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
93109
} else {
94110
trans.pre_pass(b.files)
95111
}
112+
timing_impl := os.getenv('V2_TTIME') != ''
113+
mut sw_impl := time.new_stopwatch()
96114
mut stream_files_from_flat := stream_from_flat
97115
mut files_to_transform := []ast.File{}
98116
if trans.needs_full_files_for_transform() {
@@ -102,6 +120,15 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
102120
} else if !stream_from_flat {
103121
files_to_transform = b.files.clone()
104122
}
123+
if timing_impl {
124+
eprintln(' [ttime] prepare_files_for_transform total: ${sw_impl.elapsed().milliseconds()}ms')
125+
sw_impl = time.new_stopwatch()
126+
}
127+
defer {
128+
if timing_impl {
129+
eprintln(' [ttime] per-file fanout: ${sw_impl.elapsed().milliseconds()}ms')
130+
}
131+
}
105132

106133
// In flat mode, workers stream the rehydration per file (one legacy
107134
// ast.File in flight per worker at a time). Otherwise b.files is the
@@ -143,8 +170,32 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
143170
return result
144171
}
145172

146-
// Split files into chunks and spawn workers via pthreads
147-
chunk_size := (n_files + n_jobs - 1) / n_jobs // ceiling division
173+
// Assign files to workers. Contiguous chunks badly unbalance the load:
174+
// the few huge files (transformer.v, monomorphize.v, the cleanc gen
175+
// files, ...) cluster into adjacent chunks, so 2-3 workers run ~10s
176+
// while the rest finish in <0.5s and idle. For the non-flat path we
177+
// instead use longest-processing-time-first (LPT) bucketing keyed on a
178+
// cheap size proxy, then scatter each worker's results back to their
179+
// original file index after the join (no concurrent writes — workers
180+
// each fill their own chunk_results slot, the merge happens serially).
181+
mut bucket_indices := [][]int{len: n_jobs}
182+
if stream_files_from_flat {
183+
// Flat streaming still uses contiguous [start,end) ranges.
184+
chunk_size := (n_files + n_jobs - 1) / n_jobs
185+
mut i := 0
186+
mut w := 0
187+
for i < n_files {
188+
end := if i + chunk_size < n_files { i + chunk_size } else { n_files }
189+
for j in i .. end {
190+
bucket_indices[w] << j
191+
}
192+
i = end
193+
w++
194+
}
195+
} else {
196+
bucket_indices = lpt_buckets(files_to_transform, n_jobs)
197+
}
198+
148199
mut chunk_results := [][]ast.File{len: n_jobs}
149200
mut worker_ptrs := []voidptr{len: n_jobs, init: unsafe { nil }}
150201
mut thread_ids := []C.pthread_t{len: n_jobs}
@@ -159,21 +210,26 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
159210
C.pthread_attr_setstacksize(attr, 64 * 1024 * 1024)
160211

161212
mut chunk_idx := 0
162-
mut i := 0
163-
for i < n_files {
164-
end := if i + chunk_size < n_files { i + chunk_size } else { n_files }
213+
for w in 0 .. n_jobs {
214+
idxs := bucket_indices[w]
215+
if idxs.len == 0 {
216+
continue
217+
}
165218
if stream_files_from_flat {
166219
args << TransformChunkArgs{
167220
t: unsafe { voidptr(trans) }
168221
flat: unsafe { &b.flat }
169-
flat_start: i
170-
flat_end: end
222+
flat_start: idxs[0]
223+
flat_end: idxs[idxs.len - 1] + 1
171224
result_ptr: unsafe { voidptr(&chunk_results[chunk_idx]) }
172225
worker_ptr: unsafe { voidptr(&worker_ptrs[chunk_idx]) }
173226
worker_idx: chunk_idx
174227
}
175228
} else {
176-
chunk := files_to_transform[i..end].clone()
229+
mut chunk := []ast.File{cap: idxs.len}
230+
for fi in idxs {
231+
chunk << files_to_transform[fi]
232+
}
177233
args << TransformChunkArgs{
178234
t: unsafe { voidptr(trans) }
179235
files: chunk
@@ -184,7 +240,6 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
184240
}
185241
C.pthread_create(unsafe { &thread_ids[chunk_idx] }, attr, transform_chunk_thread,
186242
unsafe { voidptr(&args[chunk_idx]) })
187-
i = end
188243
chunk_idx++
189244
}
190245
C.pthread_attr_destroy(attr)
@@ -194,22 +249,88 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
194249
C.pthread_join(thread_ids[ci], unsafe { nil })
195250
}
196251

197-
// Collect results in chunk order and merge worker accumulated state
198-
mut result := []ast.File{cap: n_files}
199-
for ci := 0; ci < chunk_idx; ci++ {
252+
// Scatter each worker's results back to original file order and merge
253+
// accumulated state. bucket_indices[w] lists the original indices the
254+
// w-th spawned worker processed, in the same order it produced results.
255+
mut result := []ast.File{len: n_files}
256+
mut ci := 0
257+
for w in 0 .. n_jobs {
258+
idxs := bucket_indices[w]
259+
if idxs.len == 0 {
260+
continue
261+
}
200262
chunk_files := chunk_results[ci]
201-
for k := 0; k < chunk_files.len; k++ {
202-
result << chunk_files[k]
263+
for k, fi in idxs {
264+
if k < chunk_files.len {
265+
result[fi] = chunk_files[k]
266+
}
203267
}
204-
w := unsafe { &transformer.Transformer(worker_ptrs[ci]) }
205-
trans.merge_worker(w)
268+
worker := unsafe { &transformer.Transformer(worker_ptrs[ci]) }
269+
trans.merge_worker(worker)
270+
ci++
206271
}
207272
// Set synth_pos_counter past all worker ranges to avoid ID collisions in post_pass.
208273
trans.set_synth_pos_counter(-(chunk_idx * 100_000) - 1)
209274
return result
210275
}
211276
}
212277

278+
// lpt_buckets distributes file indices across n_jobs workers using the
279+
// longest-processing-time-first heuristic: process files largest-first and
280+
// always append to the currently least-loaded worker. This keeps the heaviest
281+
// files on separate workers so the fan-out wall time approaches
282+
// total_work / n_jobs instead of being pinned to one overloaded contiguous
283+
// chunk. The cost proxy is top-level statement count (cheap, and the giant
284+
// files have proportionally many declarations). Deterministic: files are
285+
// ordered by (cost desc, index asc) and ties pick the lowest worker index.
286+
// Returns n_jobs buckets; buckets[w] lists the indices into files assigned to
// worker w, in the order they were assigned (heaviest first). Buckets can be
// empty when there are fewer files than workers.
// NOTE(review): assumes n_jobs >= 1. With n_jobs == 0 and a non-empty files
// list the load[mw] access in the assignment loop would be out of range.
fn lpt_buckets(files []ast.File, n_jobs int) [][]int {
287+
n := files.len
288+
mut cost := []int{len: n}
289+
for i in 0 .. n {
290+
// Cost proxy: count function bodies, not just top-level declarations, so
291+
// a file of a few huge functions (transformer.v, the cleanc gen files)
292+
// outranks one with many tiny ones. Deterministic; one level deep is
293+
// enough to separate the heavyweight files that drove the imbalance.
// c starts at 1, so a file with no statements still has a non-zero cost.
294+
mut c := 1
295+
for stmt in files[i].stmts {
296+
c++
297+
if stmt is ast.FnDecl {
298+
c += stmt.stmts.len
299+
}
300+
}
301+
cost[i] = c
302+
}
303+
// order = file indices by cost descending. Implemented as a plain insertion
304+
// sort (n is small, a few hundred) rather than sort_with_compare: this file
305+
// must self-host through every backend, and capturing closures / pointer
306+
// comparators are not reliably codegen'd by the v2 cleanc and arm64 paths.
307+
// Stable on index (only shifts on strictly-greater), so deterministic.
308+
mut order := []int{len: n, init: index}
309+
for i in 1 .. n {
310+
key := order[i]
311+
kc := cost[key]
312+
mut j := i - 1
313+
for j >= 0 && cost[order[j]] < kc {
314+
order[j + 1] = order[j]
315+
j--
316+
}
317+
order[j + 1] = key
318+
}
// Greedy assignment: walk the files heaviest-first and give each one to the
// worker with the smallest accumulated cost so far.
319+
mut buckets := [][]int{len: n_jobs}
320+
mut load := []i64{len: n_jobs}
321+
for fi in order {
// Linear scan for the least-loaded worker. The strict < comparison means
// a tie keeps the lowest worker index.
322+
mut mw := 0
323+
for w in 1 .. n_jobs {
324+
if load[w] < load[mw] {
325+
mw = w
326+
}
327+
}
328+
buckets[mw] << fi
329+
load[mw] += i64(cost[fi])
330+
}
331+
return buckets
332+
}
333+
213334
// transform_files_parallel_to_flat is the parallel counterpart of
214335
// Transformer.transform_files_to_flat. Today it composes the existing
215336
// parallel transform with a boundary flatten_files() — same total work

vlib/v2/transformer/fn.v

Lines changed: 45 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,26 +1172,31 @@ fn (t &Transformer) method_key_matches_type_name(method_key string, type_name st
11721172
|| !transformer_string_has_valid_data(type_name) {
11731173
return false
11741174
}
1175-
normalized_key := method_key.replace('.', '__')
1176-
normalized_type := type_name.replace('.', '__')
1175+
// Avoid .replace/.contains here: replace always allocates and contains builds
1176+
// a KMP failure table per call. This runs inside O(method_keys) fallback loops
1177+
// per call site, so those per-call allocations were a large transform cost.
1178+
// Only normalize when a '.' is actually present (index_u8 does not allocate),
1179+
// and locate `__` with a hand-rolled scan.
1180+
normalized_key := if method_key.index_u8(`.`) >= 0 {
1181+
method_key.replace('.', '__')
1182+
} else {
1183+
method_key
1184+
}
1185+
normalized_type := if type_name.index_u8(`.`) >= 0 {
1186+
type_name.replace('.', '__')
1187+
} else {
1188+
type_name
1189+
}
11771190
if normalized_key == normalized_type {
11781191
return true
11791192
}
1180-
key_is_qualified := normalized_key.contains('__')
1181-
type_is_qualified := normalized_type.contains('__')
1182-
if key_is_qualified && type_is_qualified {
1193+
key_dunder := last_double_underscore(normalized_key)
1194+
type_dunder := last_double_underscore(normalized_type)
1195+
if key_dunder >= 0 && type_dunder >= 0 {
11831196
return false
11841197
}
1185-
short_type := if normalized_type.contains('__') {
1186-
normalized_type.all_after_last('__')
1187-
} else {
1188-
normalized_type
1189-
}
1190-
short_key := if normalized_key.contains('__') {
1191-
normalized_key.all_after_last('__')
1192-
} else {
1193-
normalized_key
1194-
}
1198+
short_type := if type_dunder >= 0 { normalized_type[type_dunder + 2..] } else { normalized_type }
1199+
short_key := if key_dunder >= 0 { normalized_key[key_dunder + 2..] } else { normalized_key }
11951200
if short_key == short_type {
11961201
return true
11971202
}
@@ -1210,6 +1215,28 @@ fn (t &Transformer) method_key_matches_type_name(method_key string, type_name st
12101215
return false
12111216
}
12121217

1218+
// candidate_method_keys returns the cached method keys that could fuzzy-match any
1219+
// of `names` — i.e. those sharing a receiver short name. A method_key_matches_type_name
1220+
// match always implies equal short names, so the fuzzy fallback loops can scan
1221+
// these candidates instead of every method key (O(all_keys) per call site).
1222+
fn (t &Transformer) candidate_method_keys(names []string) []string {
1223+
mut cand := []string{}
// shorts_done records the short names already looked up, so the keys for a
// given short name are appended to cand at most once.
1224+
mut shorts_done := []string{}
1225+
for name in names {
// Empty names are ignored.
1226+
if name == '' {
1227+
continue
1228+
}
// NOTE(review): method_short_name and cached_method_keys_by_short are defined
// outside this view; presumably the map is keyed by the same short-name form
// that method_short_name produces. Confirm against their definitions.
1229+
sh := method_short_name(name)
1230+
if sh in shorts_done {
1231+
continue
1232+
}
1233+
shorts_done << sh
// A short name with no map entry contributes no candidates.
1234+
keys := t.cached_method_keys_by_short[sh] or { continue }
1235+
cand << keys
1236+
}
1237+
return cand
1238+
}
1239+
12131240
fn (t &Transformer) lookup_method_return_type(type_names []string, method_name string) ?types.Type {
12141241
if method_name == '' {
12151242
return none
@@ -1229,7 +1256,7 @@ fn (t &Transformer) lookup_method_return_type(type_names []string, method_name s
12291256
}
12301257
}
12311258
}
1232-
for key in t.cached_method_keys {
1259+
for key in t.candidate_method_keys(seen) {
12331260
mut matches_receiver := false
12341261
for type_name in seen {
12351262
if t.method_key_matches_type_name(key, type_name) {
@@ -1330,7 +1357,7 @@ fn (t &Transformer) lookup_method_exists(type_names []string, method_name string
13301357
return true
13311358
}
13321359
}
1333-
for key in t.cached_method_keys {
1360+
for key in t.candidate_method_keys(seen) {
13341361
mut matches_receiver := false
13351362
for type_name in seen {
13361363
if t.method_key_matches_type_name(key, type_name) {
@@ -4454,7 +4481,7 @@ fn (t &Transformer) resolve_method_call_name(receiver ast.Expr, method_name stri
44544481
}
44554482
}
44564483
// Fuzzy fallback: iterate method keys to find matching receiver types
4457-
for key in t.cached_method_keys {
4484+
for key in t.candidate_method_keys(lookup_names) {
44584485
mut matches_receiver := false
44594486
for name in lookup_names {
44604487
if t.method_key_matches_type_name(key, name) {

0 commit comments

Comments
 (0)