Skip to content

GH-133136: Revise QSBR to reduce excess memory held #135473

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Include/internal/pycore_pymem.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ extern wchar_t *_PyMem_DefaultRawWcsdup(const wchar_t *str);
extern int _PyMem_DebugEnabled(void);

// Enqueue a pointer to be freed possibly after some delay.
extern void _PyMem_FreeDelayed(void *ptr);
extern void _PyMem_FreeDelayed(void *ptr, size_t size);

// Enqueue an object to be freed possibly after some delay
#ifdef Py_GIL_DISABLED
Expand Down
31 changes: 25 additions & 6 deletions Include/internal/pycore_qsbr.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,21 @@ struct _qsbr_thread_state {
// Thread state (or NULL)
PyThreadState *tstate;

// Used to defer advancing write sequence a fixed number of times
int deferrals;
// Number of held items added by this thread since the last write sequence
// advance
int deferred_count;

// Estimate for the amount of memory that is held by this thread since
// the last write sequence advance
size_t deferred_memory;

// Amount of memory in mimalloc pages deferred from collection. When
// deferred, they are prevented from being used for a different size class
// and in a different thread.
size_t deferred_page_memory;

// True if the deferred memory frees should be processed.
bool should_process;

// Is this thread state allocated?
bool allocated;
Expand Down Expand Up @@ -109,11 +122,17 @@ _Py_qbsr_goal_reached(struct _qsbr_thread_state *qsbr, uint64_t goal)
extern uint64_t
_Py_qsbr_advance(struct _qsbr_shared *shared);

// Batches requests to advance the write sequence. This advances the write
// sequence every N calls, which reduces overhead but increases time to
// reclamation. Returns the new goal.
// Return the next value for the write sequence (current plus the increment).
extern uint64_t
_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr);
_Py_qsbr_shared_next(struct _qsbr_shared *shared);

// Report whether this thread's QSBR state has been flagged as having
// deferred memory frees that are worth examining now.  This only reads the
// `should_process` flag stored in the per-thread state; it does not modify
// it and does not itself free anything.
static inline bool
_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr)
{
    const bool pending = qsbr->should_process;
    return pending;
}

// Have the read sequences advanced to the given goal? If this returns true,
// it is safe to reclaim any memory tagged with the goal (or earlier goal).
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Limit excess memory usage in the :term:`free threading` build when a
large dictionary or list is resized and accessed by multiple threads.
2 changes: 1 addition & 1 deletion Objects/codeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -3370,7 +3370,7 @@ create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx)
}
memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *));
_Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc);
_PyMem_FreeDelayed(tlbc);
_PyMem_FreeDelayed(tlbc, tlbc->size * sizeof(void *));
tlbc = new_tlbc;
}
char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co));
Expand Down
4 changes: 2 additions & 2 deletions Objects/dictobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -813,7 +813,7 @@ free_keys_object(PyDictKeysObject *keys, bool use_qsbr)
{
#ifdef Py_GIL_DISABLED
if (use_qsbr) {
_PyMem_FreeDelayed(keys);
_PyMem_FreeDelayed(keys, _PyDict_KeysSize(keys));
return;
}
#endif
Expand Down Expand Up @@ -858,7 +858,7 @@ free_values(PyDictValues *values, bool use_qsbr)
assert(values->embedded == 0);
#ifdef Py_GIL_DISABLED
if (use_qsbr) {
_PyMem_FreeDelayed(values);
_PyMem_FreeDelayed(values, values_size_from_count(values->capacity));
return;
}
#endif
Expand Down
3 changes: 2 additions & 1 deletion Objects/listobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ free_list_items(PyObject** items, bool use_qsbr)
#ifdef Py_GIL_DISABLED
_PyListArray *array = _Py_CONTAINER_OF(items, _PyListArray, ob_item);
if (use_qsbr) {
_PyMem_FreeDelayed(array);
size_t size = sizeof(_PyListArray) + array->allocated * sizeof(PyObject *);
_PyMem_FreeDelayed(array, size);
}
else {
PyMem_Free(array);
Expand Down
86 changes: 80 additions & 6 deletions Objects/obmalloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,29 @@ _PyMem_mi_page_is_safe_to_free(mi_page_t *page)

}

#ifdef Py_GIL_DISABLED

// If we are deferring collection of more than this amount of memory for
// mimalloc pages, advance the write sequence.  Advancing allows these
// pages to be re-used in a different thread or for a different size class.
// The replacement list is parenthesized so the macro always expands to a
// single operand, whatever expression it ends up in.
#define QSBR_PAGE_MEM_LIMIT (4096 * 20)

// Return true if the global write sequence should be advanced for a mimalloc
// page that is deferred from collection.
//
// The size of `page` (its capacity multiplied by the block size reported by
// mi_page_block_size()) is added to the running per-thread total in
// `qsbr->deferred_page_memory`.  Once that total exceeds
// QSBR_PAGE_MEM_LIMIT, the total is reset to zero and true is returned;
// otherwise the total keeps accumulating and false is returned.
static bool
should_advance_qsbr_for_page(struct _qsbr_thread_state *qsbr, mi_page_t *page)
{
    size_t bsize = mi_page_block_size(page);
    size_t page_size = page->capacity * bsize;

    qsbr->deferred_page_memory += page_size;
    if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) {
        // Start a fresh accounting window for the next batch of pages.
        qsbr->deferred_page_memory = 0;
        return true;
    }
    return false;
}
#endif

static bool
_PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)
{
Expand All @@ -139,7 +162,14 @@ _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)

_PyMem_mi_page_clear_qsbr(page);
page->retire_expire = 0;
page->qsbr_goal = _Py_qsbr_deferred_advance(tstate->qsbr);

if (should_advance_qsbr_for_page(tstate->qsbr, page)) {
page->qsbr_goal = _Py_qsbr_advance(tstate->qsbr->shared);
}
else {
page->qsbr_goal = _Py_qsbr_shared_next(tstate->qsbr->shared);
}

llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node);
return false;
}
Expand Down Expand Up @@ -1141,8 +1171,38 @@ free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state)
}
}


#ifdef Py_GIL_DISABLED

// For deferred advance on free: the number of deferred items before advancing
// the write sequence.  This is based on WORK_ITEMS_PER_CHUNK.  We ideally
// want to process a chunk before it overflows.
#define QSBR_DEFERRED_LIMIT 127

// If the deferred memory exceeds 1 MiB, advance the write sequence.  This
// helps limit memory usage due to QSBR delaying frees too long.
// The replacement list is parenthesized so the macro always expands to a
// single operand, whatever expression it ends up in.
#define QSBR_FREE_MEM_LIMIT (1024 * 1024)

// Return true if the global write sequence should be advanced for a deferred
// memory free.
//
// Each call counts one more deferred item and adds `size` (the caller's
// estimate of the bytes being held; may be 0 when unknown) to the per-thread
// totals.  When either the item count exceeds QSBR_DEFERRED_LIMIT or the
// byte total exceeds QSBR_FREE_MEM_LIMIT, both totals are reset, the
// thread's `should_process` flag is set, and true is returned.  Otherwise
// the totals keep accumulating and false is returned.
static bool
should_advance_qsbr_for_free(struct _qsbr_thread_state *qsbr, size_t size)
{
    qsbr->deferred_count++;
    qsbr->deferred_memory += size;

    if (qsbr->deferred_count > QSBR_DEFERRED_LIMIT ||
        qsbr->deferred_memory > QSBR_FREE_MEM_LIMIT)
    {
        // Start a fresh accounting window and flag that the queued frees
        // should be examined.
        qsbr->deferred_count = 0;
        qsbr->deferred_memory = 0;
        qsbr->should_process = true;
        return true;
    }
    return false;
}
#endif

static void
free_delayed(uintptr_t ptr)
free_delayed(uintptr_t ptr, size_t size)
{
#ifndef Py_GIL_DISABLED
free_work_item(ptr, NULL, NULL);
Expand Down Expand Up @@ -1200,23 +1260,32 @@ free_delayed(uintptr_t ptr)
}

assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
uint64_t seq = _Py_qsbr_deferred_advance(tstate->qsbr);
uint64_t seq;
if (should_advance_qsbr_for_free(tstate->qsbr, size)) {
seq = _Py_qsbr_advance(tstate->qsbr->shared);
}
else {
seq = _Py_qsbr_shared_next(tstate->qsbr->shared);
}
buf->array[buf->wr_idx].ptr = ptr;
buf->array[buf->wr_idx].qsbr_goal = seq;
buf->wr_idx++;

if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
// Normally the processing of delayed items is done from the eval
// breaker. Processing here is a safety measure to ensure too much
// work does not accumulate.
_PyMem_ProcessDelayed((PyThreadState *)tstate);
}
#endif
}

void
_PyMem_FreeDelayed(void *ptr)
_PyMem_FreeDelayed(void *ptr, size_t size)
{
assert(!((uintptr_t)ptr & 0x01));
if (ptr != NULL) {
free_delayed((uintptr_t)ptr);
free_delayed((uintptr_t)ptr, size);
}
}

Expand All @@ -1226,7 +1295,10 @@ _PyObject_XDecRefDelayed(PyObject *ptr)
{
assert(!((uintptr_t)ptr & 0x01));
if (ptr != NULL) {
free_delayed(((uintptr_t)ptr)|0x01);
// We use 0 as the size since we don't have an easy way to know the
// actual size. If we are freeing many objects, the write sequence
// will be advanced due to QSBR_DEFERRED_LIMIT.
free_delayed(((uintptr_t)ptr)|0x01, 0);
}
}
#endif
Expand Down Expand Up @@ -1302,6 +1374,8 @@ _PyMem_ProcessDelayed(PyThreadState *tstate)
PyInterpreterState *interp = tstate->interp;
_PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;

tstate_impl->qsbr->should_process = false;

// Process thread-local work
process_queue(&tstate_impl->mem_free_queue, tstate_impl, true, NULL, NULL);

Expand Down
4 changes: 4 additions & 0 deletions Python/ceval_gil.c
Original file line number Diff line number Diff line change
Expand Up @@ -1387,6 +1387,10 @@ _Py_HandlePending(PyThreadState *tstate)
_Py_unset_eval_breaker_bit(tstate, _PY_EVAL_EXPLICIT_MERGE_BIT);
_Py_brc_merge_refcounts(tstate);
}
/* Process deferred memory frees held by QSBR */
if (_Py_qsbr_should_process(((_PyThreadStateImpl *)tstate)->qsbr)) {
_PyMem_ProcessDelayed(tstate);
}
#endif

/* GC scheduled to run */
Expand Down
12 changes: 2 additions & 10 deletions Python/qsbr.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,6 @@
// Starting size of the array of qsbr thread states
#define MIN_ARRAY_SIZE 8

// For _Py_qsbr_deferred_advance(): the number of deferrals before advancing
// the write sequence.
#define QSBR_DEFERRED_LIMIT 10

// Allocate a QSBR thread state from the freelist
static struct _qsbr_thread_state *
qsbr_allocate(struct _qsbr_shared *shared)
Expand Down Expand Up @@ -117,13 +113,9 @@ _Py_qsbr_advance(struct _qsbr_shared *shared)
}

uint64_t
_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr)
_Py_qsbr_shared_next(struct _qsbr_shared *shared)
{
if (++qsbr->deferrals < QSBR_DEFERRED_LIMIT) {
return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
}
qsbr->deferrals = 0;
return _Py_qsbr_advance(qsbr->shared);
return _Py_qsbr_shared_current(shared) + QSBR_INCR;
}

static uint64_t
Expand Down
Loading