vlang
diff --git a/‎vlib/v2/builder/builder.v‎
Lines changed: 11 additions & 2 deletions b/‎vlib/v2/builder/builder.v‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎vlib/v2/builder/mem_darwin.c.v‎
Lines changed: 27 additions & 0 deletions b/‎vlib/v2/builder/mem_darwin.c.v‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎vlib/v2/builder/transform_parallel.v‎
Lines changed: 137 additions & 16 deletions b/‎vlib/v2/builder/transform_parallel.v‎
Lines changed: 137 additions & 16 deletions
diff --git a/‎vlib/v2/transformer/fn.v‎
Lines changed: 45 additions & 18 deletions b/‎vlib/v2/transformer/fn.v‎
Lines changed: 45 additions & 18 deletions
@@ -160,8 +160,17 @@ fn print_rss(stage string) {
 	if os.getenv('V2_MEM') == '' {
 		return
 	}
-	bytes := runtime.used_memory() or { 0 }
-	eprintln('  [mem] ${stage}: ${bytes / (1024 * 1024)} MB')
+	rss := runtime.used_memory() or { 0 }
+	$if macos {
+		// Under -gc none nothing is freed, so `live` is monotonic and its
+		// per-phase delta is the exact bytes that phase allocated. `peak` is
+		// the high-water mark. Both are stable run-to-run, unlike `rss`.
+		live, peak := darwin_live_malloc_bytes()
+		mb := u64(1024 * 1024)
+		eprintln('  [mem] ${stage}: live ${live / mb} MB  peak ${peak / mb} MB  (rss ${rss / mb} MB)')
+		return
+	}
+	eprintln('  [mem] ${stage}: ${rss / (1024 * 1024)} MB')
 }
 
 // print_heap reports retained heap size after a forced GC, in MB. Unlike
 
@@ -0,0 +1,27 @@
+// Copyright (c) 2020-2024 Joe Conigliaro. All rights reserved.
+// Use of this source code is governed by an MIT license
+// that can be found in the LICENSE file.
+module builder
+
+#include <malloc/malloc.h>
+
+struct C.malloc_statistics_t {
+	blocks_in_use   u32 // not read by darwin_live_malloc_bytes
+	size_in_use     usize // returned by darwin_live_malloc_bytes as the current live byte count
+	max_size_in_use usize // returned by darwin_live_malloc_bytes as the peak (high-water) byte count
+	size_allocated  usize // not read by darwin_live_malloc_bytes
+}
+
+fn C.malloc_default_zone() voidptr
+fn C.malloc_zone_statistics(zone voidptr, stats &C.malloc_statistics_t)
+
+// darwin_live_malloc_bytes returns (bytes in use, high-water bytes) as reported
+// by malloc_zone_statistics for the default malloc zone. When nothing is ever
+// freed (e.g. under `-gc none`), the first value only grows, so the difference
+// between two readings is what was allocated in between, unlike an RSS reading.
+fn darwin_live_malloc_bytes() (u64, u64) {
+	default_zone := C.malloc_default_zone()
+	mut stats := C.malloc_statistics_t{}
+	C.malloc_zone_statistics(default_zone, &stats)
+	return u64(stats.size_in_use), u64(stats.max_size_in_use)
+}
@@ -6,6 +6,8 @@ module builder
 import v2.ast
 import v2.transformer
 import runtime
+import os
+import time
 
 $if !windows {
 	struct TransformChunkArgs {
@@ -28,6 +30,8 @@ $if !windows {
 	fn transform_chunk_thread(arg voidptr) voidptr {
 		a := unsafe { &TransformChunkArgs(arg) }
 		t := unsafe { &transformer.Transformer(a.t) }
+		wprof := os.getenv('V2_TTIME') != ''
+		mut wsw := time.new_stopwatch()
 		mut w := t.new_worker_clone(a.worker_idx)
 		if unsafe { a.flat != nil } {
 			// Streaming rehydration: rehydrate one file at a time, transform it,
@@ -52,6 +56,9 @@ $if !windows {
 		for i := 0; i < a.files.len; i++ {
 			result << w.transform_file_pub(a.files[i])
 		}
+		if wprof {
+			eprintln('  [ttime] worker ${a.worker_idx}: ${a.files.len} files in ${wsw.elapsed().milliseconds()}ms')
+		}
 		unsafe {
 			*(&[]ast.File(a.result_ptr)) = result
 			*(&voidptr(a.worker_ptr)) = voidptr(w)
@@ -61,8 +68,17 @@ $if !windows {
 }
 
 fn (mut b Builder) transform_files_parallel(mut trans transformer.Transformer) []ast.File {
+	timing := os.getenv('V2_TTIME') != '' // phase timing is opt-in: any non-empty V2_TTIME enables it
+	mut sw := time.new_stopwatch() // started before the transform so it covers the whole no-post-pass call
 	mut result := b.transform_files_parallel_no_post_pass(mut trans)
+	if timing {
+		eprintln('  [ttime] (parallel) prepare+fanout: ${sw.elapsed().milliseconds()}ms')
+		sw = time.new_stopwatch() // restart so the next report covers post_pass only
+	}
 	trans.post_pass(mut result)
+	if timing {
+		eprintln('  [ttime] (parallel) post_pass: ${sw.elapsed().milliseconds()}ms')
+	}
 	return result
 }
 
@@ -93,6 +109,8 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 	} else {
 		trans.pre_pass(b.files)
 	}
+	timing_impl := os.getenv('V2_TTIME') != ''
+	mut sw_impl := time.new_stopwatch()
 	mut stream_files_from_flat := stream_from_flat
 	mut files_to_transform := []ast.File{}
 	if trans.needs_full_files_for_transform() {
@@ -102,6 +120,15 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 	} else if !stream_from_flat {
 		files_to_transform = b.files.clone()
 	}
+	if timing_impl {
+		eprintln('  [ttime] prepare_files_for_transform total: ${sw_impl.elapsed().milliseconds()}ms')
+		sw_impl = time.new_stopwatch()
+	}
+	defer {
+		if timing_impl {
+			eprintln('  [ttime] per-file fanout: ${sw_impl.elapsed().milliseconds()}ms')
+		}
+	}
 
 	// In flat mode, workers stream the rehydration per file (one legacy
 	// ast.File in flight per worker at a time). Otherwise b.files is the
@@ -143,8 +170,32 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 			return result
 		}
 
-		// Split files into chunks and spawn workers via pthreads
-		chunk_size := (n_files + n_jobs - 1) / n_jobs // ceiling division
+		// Assign files to workers. Contiguous chunks badly unbalance the load:
+		// the few huge files (transformer.v, monomorphize.v, the cleanc gen
+		// files, ...) cluster into adjacent chunks, so 2-3 workers run ~10s
+		// while the rest finish in <0.5s and idle. For the non-flat path we
+		// instead use longest-processing-time-first (LPT) bucketing keyed on a
+		// cheap size proxy, then scatter each worker's results back to their
+		// original file index after the join (no concurrent writes — workers
+		// each fill their own chunk_results slot, the merge happens serially).
+		mut bucket_indices := [][]int{len: n_jobs}
+		if stream_files_from_flat {
+			// Flat streaming still uses contiguous [start,end) ranges.
+			chunk_size := (n_files + n_jobs - 1) / n_jobs
+			mut i := 0
+			mut w := 0
+			for i < n_files {
+				end := if i + chunk_size < n_files { i + chunk_size } else { n_files }
+				for j in i .. end {
+					bucket_indices[w] << j
+				}
+				i = end
+				w++
+			}
+		} else {
+			bucket_indices = lpt_buckets(files_to_transform, n_jobs)
+		}
+
 		mut chunk_results := [][]ast.File{len: n_jobs}
 		mut worker_ptrs := []voidptr{len: n_jobs, init: unsafe { nil }}
 		mut thread_ids := []C.pthread_t{len: n_jobs}
@@ -159,21 +210,26 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 		C.pthread_attr_setstacksize(attr, 64 * 1024 * 1024)
 
 		mut chunk_idx := 0
-		mut i := 0
-		for i < n_files {
-			end := if i + chunk_size < n_files { i + chunk_size } else { n_files }
+		for w in 0 .. n_jobs {
+			idxs := bucket_indices[w]
+			if idxs.len == 0 {
+				continue
+			}
 			if stream_files_from_flat {
 				args << TransformChunkArgs{
 					t:          unsafe { voidptr(trans) }
 					flat:       unsafe { &b.flat }
-					flat_start: i
-					flat_end:   end
+					flat_start: idxs[0]
+					flat_end:   idxs[idxs.len - 1] + 1
 					result_ptr: unsafe { voidptr(&chunk_results[chunk_idx]) }
 					worker_ptr: unsafe { voidptr(&worker_ptrs[chunk_idx]) }
 					worker_idx: chunk_idx
 				}
 			} else {
-				chunk := files_to_transform[i..end].clone()
+				mut chunk := []ast.File{cap: idxs.len}
+				for fi in idxs {
+					chunk << files_to_transform[fi]
+				}
 				args << TransformChunkArgs{
 					t:          unsafe { voidptr(trans) }
 					files:      chunk
@@ -184,7 +240,6 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 			}
 			C.pthread_create(unsafe { &thread_ids[chunk_idx] }, attr, transform_chunk_thread,
 				unsafe { voidptr(&args[chunk_idx]) })
-			i = end
 			chunk_idx++
 		}
 		C.pthread_attr_destroy(attr)
@@ -194,22 +249,88 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 			C.pthread_join(thread_ids[ci], unsafe { nil })
 		}
 
-		// Collect results in chunk order and merge worker accumulated state
-		mut result := []ast.File{cap: n_files}
-		for ci := 0; ci < chunk_idx; ci++ {
+		// Scatter each worker's results back to original file order and merge
+		// accumulated state. bucket_indices[w] lists the original indices the
+		// w-th spawned worker processed, in the same order it produced results.
+		mut result := []ast.File{len: n_files}
+		mut ci := 0
+		for w in 0 .. n_jobs {
+			idxs := bucket_indices[w]
+			if idxs.len == 0 {
+				continue
+			}
 			chunk_files := chunk_results[ci]
-			for k := 0; k < chunk_files.len; k++ {
-				result << chunk_files[k]
+			for k, fi in idxs {
+				if k < chunk_files.len {
+					result[fi] = chunk_files[k]
+				}
 			}
-			w := unsafe { &transformer.Transformer(worker_ptrs[ci]) }
-			trans.merge_worker(w)
+			worker := unsafe { &transformer.Transformer(worker_ptrs[ci]) }
+			trans.merge_worker(worker)
+			ci++
 		}
 		// Set synth_pos_counter past all worker ranges to avoid ID collisions in post_pass.
 		trans.set_synth_pos_counter(-(chunk_idx * 100_000) - 1)
 		return result
 	}
 }
 
+// lpt_buckets distributes file indices across n_jobs workers using the
+// longest-processing-time-first heuristic: process files largest-first and
+// always append to the currently least-loaded worker. This keeps the heaviest
+// files on separate workers so the fan-out wall time approaches
+// total_work / n_jobs instead of being pinned to one overloaded chunk. The
+// cost proxy is 1 + the top-level statement count + the body statement count
+// of every top-level FnDecl. Deterministic: files are ordered by (cost desc,
+// index asc) and ties pick the lowest worker index. Expects n_jobs >= 1.
+fn lpt_buckets(files []ast.File, n_jobs int) [][]int {
+	n := files.len
+	mut cost := []int{len: n}
+	for i in 0 .. n {
+		// Cost proxy: count function bodies, not just top-level declarations, so
+		// a file of a few huge functions (transformer.v, the cleanc gen files)
+		// outranks one with many tiny ones. Deterministic; one level deep is
+		// enough to separate the heavyweight files that drove the imbalance.
+		mut c := 1
+		for stmt in files[i].stmts {
+			c++
+			if stmt is ast.FnDecl {
+				c += stmt.stmts.len
+			}
+		}
+		cost[i] = c
+	}
+	// order = file indices by cost descending. Implemented as a plain insertion
+	// sort (n is small, a few hundred) rather than sort_with_compare: this file
+	// must self-host through every backend, and capturing closures / pointer
+	// comparators are not reliably codegen'd by the v2 cleanc and arm64 paths.
+	// Stable on index (only shifts on strictly-greater), so deterministic.
+	mut order := []int{len: n, init: index}
+	for i in 1 .. n {
+		key := order[i]
+		kc := cost[key]
+		mut j := i - 1
+		for j >= 0 && cost[order[j]] < kc {
+			order[j + 1] = order[j]
+			j--
+		}
+		order[j + 1] = key
+	}
+	mut buckets := [][]int{len: n_jobs}
+	mut load := []i64{len: n_jobs}
+	for fi in order {
+		mut mw := 0
+		for w in 1 .. n_jobs {
+			if load[w] < load[mw] {
+				mw = w
+			}
+		}
+		buckets[mw] << fi
+		load[mw] += i64(cost[fi])
+	}
+	return buckets
+}
+
 // transform_files_parallel_to_flat is the parallel counterpart of
 // Transformer.transform_files_to_flat. Today it composes the existing
 // parallel transform with a boundary flatten_files() — same total work
 
@@ -1172,26 +1172,31 @@ fn (t &Transformer) method_key_matches_type_name(method_key string, type_name st
 		|| !transformer_string_has_valid_data(type_name) {
 		return false
 	}
-	normalized_key := method_key.replace('.', '__')
-	normalized_type := type_name.replace('.', '__')
+	// Avoid .replace/.contains here: replace always allocates and contains builds
+	// a KMP failure table per call. This runs inside O(method_keys) fallback loops
+	// per call site, so those per-call allocations were a large transform cost.
+	// Only normalize when a '.' is actually present (index_u8 does not allocate),
+	// and locate `__` with a hand-rolled scan.
+	normalized_key := if method_key.index_u8(`.`) >= 0 {
+		method_key.replace('.', '__')
+	} else {
+		method_key
+	}
+	normalized_type := if type_name.index_u8(`.`) >= 0 {
+		type_name.replace('.', '__')
+	} else {
+		type_name
+	}
 	if normalized_key == normalized_type {
 		return true
 	}
-	key_is_qualified := normalized_key.contains('__')
-	type_is_qualified := normalized_type.contains('__')
-	if key_is_qualified && type_is_qualified {
+	key_dunder := last_double_underscore(normalized_key)
+	type_dunder := last_double_underscore(normalized_type)
+	if key_dunder >= 0 && type_dunder >= 0 {
 		return false
 	}
-	short_type := if normalized_type.contains('__') {
-		normalized_type.all_after_last('__')
-	} else {
-		normalized_type
-	}
-	short_key := if normalized_key.contains('__') {
-		normalized_key.all_after_last('__')
-	} else {
-		normalized_key
-	}
+	short_type := if type_dunder >= 0 { normalized_type[type_dunder + 2..] } else { normalized_type }
+	short_key := if key_dunder >= 0 { normalized_key[key_dunder + 2..] } else { normalized_key }
 	if short_key == short_type {
 		return true
 	}
@@ -1210,6 +1215,28 @@ fn (t &Transformer) method_key_matches_type_name(method_key string, type_name st
 	return false
 }
 
+// candidate_method_keys collects, for each distinct short name derived from the
+// non-empty entries of `names`, the keys filed in t.cached_method_keys_by_short.
+// The fuzzy fallback loops scan this list instead of every cached method key;
+// NOTE(review): assumes a fuzzy match implies equal short names — confirm.
+fn (t &Transformer) candidate_method_keys(names []string) []string {
+	mut visited_shorts := []string{}
+	mut candidates := []string{}
+	for receiver_name in names {
+		if receiver_name.len == 0 {
+			continue
+		}
+		short := method_short_name(receiver_name)
+		if short in visited_shorts {
+			continue
+		}
+		visited_shorts << short
+		bucket := t.cached_method_keys_by_short[short] or { continue }
+		candidates << bucket
+	}
+	return candidates
+}
+
 fn (t &Transformer) lookup_method_return_type(type_names []string, method_name string) ?types.Type {
 	if method_name == '' {
 		return none
@@ -1229,7 +1256,7 @@ fn (t &Transformer) lookup_method_return_type(type_names []string, method_name s
 			}
 		}
 	}
-	for key in t.cached_method_keys {
+	for key in t.candidate_method_keys(seen) {
 		mut matches_receiver := false
 		for type_name in seen {
 			if t.method_key_matches_type_name(key, type_name) {
@@ -1330,7 +1357,7 @@ fn (t &Transformer) lookup_method_exists(type_names []string, method_name string
 			return true
 		}
 	}
-	for key in t.cached_method_keys {
+	for key in t.candidate_method_keys(seen) {
 		mut matches_receiver := false
 		for type_name in seen {
 			if t.method_key_matches_type_name(key, type_name) {
@@ -4454,7 +4481,7 @@ fn (t &Transformer) resolve_method_call_name(receiver ast.Expr, method_name stri
 		}
 	}
 	// Fuzzy fallback: iterate method keys to find matching receiver types
-	for key in t.cached_method_keys {
+	for key in t.candidate_method_keys(lookup_names) {
 		mut matches_receiver := false
 		for name in lookup_names {
 			if t.method_key_matches_type_name(key, name) {