
Fix malloc() to work with -sWASM_WORKERS #16703

Open · wants to merge 8 commits into base: main
2 changes: 1 addition & 1 deletion src/library_wasm_worker.js
@@ -367,7 +367,7 @@ mergeInto(LibraryManager.library, {
val - num /* Acquire 'num' of them */);
if (ret == val) return dispatch(ret/*index of resource acquired*/, 0/*'ok'*/);
val = ret;
-let wait = Atomics['waitAsync'](HEAPU32, sem >> 2, ret, maxWaitMilliseconds);
+var wait = Atomics['waitAsync'](HEAPU32, sem >> 2, ret, maxWaitMilliseconds);
} while(wait.value === 'not-equal');
#if ASSERTIONS
assert(wait.async || wait.value === 'timed-out');
6 changes: 0 additions & 6 deletions system/lib/dlmalloc.c
@@ -18,12 +18,6 @@
/* XXX Emscripten Tracing API. This defines away the code if tracing is disabled. */
#include <emscripten/trace.h>

-/* Make malloc() and free() threadsafe by securing the memory allocations with pthread mutexes. */
-#if __EMSCRIPTEN_PTHREADS__
-#define USE_LOCKS 1
-#define USE_SPIN_LOCKS 0 // Ensure we use pthread_mutex_t.
-#endif

Collaborator:
I think I'd rather see this land as a separate PR, since it doesn't just affect WASM_WORKERS.

It's also rather involved to figure out what this change does. As far as I can tell, it means that USE_SPIN_LOCKS gets defined to 1 by default, which ends up also defining USE_LOCKS, and then USE_SPIN_LOCKS ends up using __sync_lock_test_and_set and __sync_lock_release for locking instead of pthread_mutex_t? Is that right?

Anyway, it seems involved enough for its own PR, I think.

Collaborator Author (@juj, Apr 14, 2022):

I don't think it is a productive use of time to artificially split this up: first fixing Wasm Workers to work with emmalloc, and only then with dlmalloc.

This change just moves the definition of USE_LOCKS and USE_SPIN_LOCKS from dlmalloc.c into system_libs.py. I had to do that migration for the following reason:

Earlier in system_libs.py, I had decided to save on the permutational explosion by not building all four combinations of {no threads, pthreads} x {no wasm workers, wasm workers} for each system library. The combination "pthreads but no wasm workers" was omitted as redundant, so we only build three variants of each multithreaded system library: "singlethreaded" (no thread safety), "wasm workers" (use atomics and shared memory, so the user can link in wasm workers), and "wasm workers + pthreads" (use atomics, shared memory and the pthread API; the user can link in wasm workers).

However, with dlmalloc it now looks like we do want a configuration that recognizes specifically when "pthreads but not wasm workers" is in use, because that is the scenario where we want pthread mutex locks in dlmalloc. When building with pthreads + wasm workers we want spinlocks instead, because malloc could be called from wasm workers, which cannot use the pthread mutex API.

Maybe there will be more code in the future that requires such a special case, and then we will need to build all four variants. But so far this seems to be the only place, so I opted to do this configuration specifically for libmalloc only, to keep embuilder build times smaller.

#ifndef MALLOC_ALIGNMENT
#include <stddef.h>
/* `malloc`ed pointers must be aligned at least as strictly as max_align_t. */
2 changes: 1 addition & 1 deletion system/lib/emmalloc.c
@@ -97,7 +97,7 @@ typedef struct Region
uint32_t _at_the_end_of_this_struct_size; // do not dereference, this is present for convenient struct sizeof() computation only
} Region;

-#if defined(__EMSCRIPTEN_PTHREADS__)
+#if defined(__EMSCRIPTEN_PTHREADS__) || defined(__EMSCRIPTEN_WASM_WORKERS__)
// In multithreaded builds, use a simple global spinlock strategy to acquire/release access to the memory allocator.
static volatile uint8_t multithreadingLock = 0;
#define MALLOC_ACQUIRE() while(__sync_lock_test_and_set(&multithreadingLock, 1)) { while(multithreadingLock) { /*nop*/ } }
7 changes: 6 additions & 1 deletion system/lib/wasm_worker/library_wasm_worker.c
@@ -22,7 +22,10 @@ __attribute__((constructor(48)))
static void emscripten_wasm_worker_main_thread_initialize() {
  uintptr_t* sbrk_ptr = emscripten_get_sbrk_ptr();
  __wasm_init_tls((void*)*sbrk_ptr);
-*sbrk_ptr += __builtin_wasm_tls_size();
+assert(*sbrk_ptr % 4 == 0); // sbrk value must always be a multiple of four bytes.
+// TODO: This rounding of the TLS size up to the next multiple of 4 bytes could well be done at
+// compile time, which would save the code size needed to round here (and could turn this into an
+// assert(__builtin_wasm_tls_size() % 4 == 0) instead).
+*sbrk_ptr += (__builtin_wasm_tls_size() + 3) & -4; // round TLS up to next 4 bytes to retain correct sbrk rounding.
}

emscripten_wasm_worker_t emscripten_create_wasm_worker(void *stackLowestAddress, uint32_t stackSize)
@@ -41,6 +44,8 @@ emscripten_wasm_worker_t emscripten_create_wasm_worker(void *stackLowestAddress,
// We expect TLS area to need to be at most 16 bytes aligned
assert(__builtin_wasm_tls_align() == 0 || 16 % __builtin_wasm_tls_align() == 0);

+// TODO: This rounding of the TLS size up to the next multiple of 16 bytes could well be done at
+// compile time, which would save the code size needed to round here.
uint32_t tlsSize = (__builtin_wasm_tls_size() + 15) & -16;
assert(stackSize > tlsSize);

8 changes: 4 additions & 4 deletions tests/code_size/hello_wasm_worker_wasm.json
@@ -3,8 +3,8 @@
"a.html.gz": 433,
"a.js": 733,
"a.js.gz": 463,
"a.wasm": 1800,
"a.wasm.gz": 994,
"total": 3270,
"total_gz": 1890
"a.wasm": 1842,
"a.wasm.gz": 1020,
"total": 3312,
"total_gz": 1916
}
9 changes: 9 additions & 0 deletions tests/test_browser.py
@@ -5288,6 +5288,15 @@ def test_wasm_worker_semaphore_waitinf_acquire(self):
def test_wasm_worker_semaphore_try_acquire(self):
    self.btest(test_file('wasm_worker/semaphore_try_acquire.c'), expected='0', args=['-sWASM_WORKERS'])

+# Tests that operating malloc from Wasm Workers is thread-safe.
+@also_with_minimal_runtime
+def test_wasm_worker_thread_safe_emmalloc(self):
+    self.btest(test_file('wasm_worker/thread_safe_malloc.cpp'), expected='0', args=['-sWASM_WORKERS', '-DEMMALLOC', '-sMALLOC=emmalloc-debug'])
+
+@also_with_minimal_runtime
+def test_wasm_worker_thread_safe_dlmalloc(self):
+    self.btest(test_file('wasm_worker/thread_safe_malloc.cpp'), expected='0', args=['-sWASM_WORKERS', '-sMALLOC=dlmalloc'])

@no_firefox('no 4GB support yet')
@require_v8
def test_zzz_zzz_4gb(self):
51 changes: 51 additions & 0 deletions tests/wasm_worker/thread_safe_malloc.cpp
@@ -0,0 +1,51 @@
#include <emscripten.h>
#include <emscripten/wasm_worker.h>
#include <emscripten/threading.h>
#include <emscripten/emmalloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

// Tests that operating malloc from Wasm Workers is thread-safe.

emscripten_semaphore_t workDone = EMSCRIPTEN_SEMAPHORE_T_STATIC_INITIALIZER(0);

// Make sure we have a tiny byte of TLS data, so that the LLVM TLS size is not a multiple of four from the get-go.
__thread uint8_t dummyTls;

void work()
{
assert(dummyTls == 0);
for(int i = 0; i < 100000; ++i)
{
dummyTls += 1;
void *ptr = malloc(emscripten_random() * 10000);
assert(ptr);
free(ptr);
}
printf("dummyTls: %d\n", (int)dummyTls);
emscripten_semaphore_release(&workDone, 1);
}

void allThreadsDone(volatile void *address, uint32_t idx, ATOMICS_WAIT_RESULT_T result, void *userData)
{
#ifdef EMMALLOC
assert(emmalloc_validate_memory_regions() == 0);
#endif
printf("Test passed!\n");
#ifdef REPORT_RESULT
REPORT_RESULT(0);
#endif
}

int main()
{
#define NUM_THREADS 10
for(int i = 0; i < NUM_THREADS; ++i)
{
emscripten_wasm_worker_t worker = emscripten_malloc_wasm_worker(1024);
emscripten_wasm_worker_post_function_v(worker, work);
}

emscripten_semaphore_async_acquire(&workDone, NUM_THREADS, allThreadsDone, 0, EMSCRIPTEN_WAIT_ASYNC_INFINITY);
}
16 changes: 15 additions & 1 deletion tools/system_libs.py
@@ -1271,6 +1271,8 @@ def __init__(self, **kwargs):
self.memvalidate = kwargs.pop('memvalidate')
self.verbose = kwargs.pop('verbose')
self.is_debug = kwargs.pop('is_debug') or self.memvalidate or self.verbose
self.thread_safe = kwargs.pop('thread_safe')
Collaborator:
This library already inherits from MTLibrary and has the is_mt attribute. Is this not the same thing?

self.use_spinlocks = kwargs.pop('use_spinlocks')

super().__init__(**kwargs)

@@ -1296,15 +1298,25 @@ def get_cflags(self):
cflags += ['-DMALLOC_FAILURE_ACTION=', '-DEMSCRIPTEN_NO_ERRNO']
if self.is_tracing:
cflags += ['--tracing']
# Target dlmalloc multithreading aware implementation if using pthreads or Wasm Workers
cflags += [f'-DUSE_LOCKS={self.thread_safe}']
# When using Wasm Workers, target spinlocks. Otherwise target pthread mutexes when using pthreads.
cflags += [f'-DUSE_SPIN_LOCKS={self.use_spinlocks}']
Collaborator:
Why not just always define USE_SPIN_LOCKS, for consistency? And it looks like USE_LOCKS gets defined to 1 when USE_SPIN_LOCKS is set.

Collaborator Author:
If I always used USE_SPIN_LOCKS, the obvious review comment would be "can you benchmark that it does not regress performance for any user?", which I really don't want to do. I wanted to keep the existing behavior for existing pthreads users, to be sure that there is no danger of regressing.

Collaborator Author (@juj, Apr 14, 2022):
> it looks like USE_LOCKS gets defined to 1 when USE_SPIN_LOCKS is set.

I did not want to disrupt the structure of the flags that were set from the way they originally were. The documentation in dlmalloc reads

> Thread-safety: NOT thread-safe unless USE_LOCKS defined non-zero
> When USE_LOCKS is defined, each public call to malloc, free,
> etc is surrounded with a lock. By default, this uses a plain
> pthread mutex, win32 critical section, or a spin-lock if if
> available for the platform and not disabled by setting
> USE_SPIN_LOCKS=0. However, if USE_RECURSIVE_LOCKS is defined,

suggesting that the intent is that one first sets USE_LOCKS to 1 to enable thread safety, and then further adjusts e.g. with USE_SPIN_LOCKS to 0 to customize what type of locking to use.

Originally, Emscripten configured both of these flags on the dlmalloc.c side, so I opted to preserve the same. Passing -DUSE_SPIN_LOCKS without defining -DUSE_LOCKS would not follow the wording in the docs, and there is no harm in explicitly passing both?

Collaborator:

> If I used USE_SPIN_LOCKS always, the obvious review would be "can you benchmark that it does not regress performance for any user?", which I really don't. I wanted to keep the existing behavior for existing pthreads users to be sure that there is no danger of regressing.

I think this is a worthwhile thing to do, if only to avoid the extra complexity of adding an additional library variant. It seems highly beneficial to use the same wasm code both in wasm workers mode and without, no? And then we can avoid this new -ww variant of the library.

Collaborator:

If spinlocks are good enough for wasm workers, why wouldn't they be good enough for the normal pthread use case?


return cflags

def get_base_name_prefix(self):
return 'lib%s' % self.malloc

def get_base_name(self):
-name = super().get_base_name()
+name = self.get_base_name_prefix()
if self.is_debug and not self.memvalidate and not self.verbose:
name += '-debug'

# Add the prefixes for malloc ('-mt' / '-ww') after the '-debug' part, since the -debug string
# is part of the user-submitted -sMALLOC prefix (e.g. -sMALLOC=dlmalloc-debug)
name += super().get_base_name().replace(self.get_base_name_prefix(), '')

if not self.use_errno:
# emmalloc doesn't actually use errno, but it's easier to build it again
name += '-noerrno'
@@ -1333,6 +1345,8 @@ def get_default_variation(cls, **kwargs):
is_tracing=settings.EMSCRIPTEN_TRACING,
memvalidate='memvalidate' in settings.MALLOC,
verbose='verbose' in settings.MALLOC,
use_spinlocks=settings.WASM_WORKERS,
thread_safe=settings.SHARED_MEMORY,
**kwargs
)