Skip to content

Commit

Permalink
oom_kill.c: futex: delay the OOM reaper to allow time for proper fute…
Browse files Browse the repository at this point in the history
…x cleanup

commit e4a3840 upstream.

The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which
can be targeted by the oom reaper.  This mapping is used to store the
futex robust list head; the kernel does not keep a copy of the robust
list and instead references a userspace address to maintain the
robustness during a process death.

A race can occur between exit_mm and the oom reaper that allows the oom
reaper to free the memory of the futex robust list before the exit path
has handled the futex death:

    CPU1                               CPU2
    --------------------------------------------------------------------
    page_fault
    do_exit "signal"
    wake_oom_reaper
                                        oom_reaper
                                        oom_reap_task_mm (invalidates mm)
    exit_mm
    exit_mm_release
    futex_exit_release
    futex_cleanup
    exit_robust_list
    get_user (EFAULT- can't access memory)

If the get_user EFAULT's, the kernel will be unable to recover the
waiters on the robust_list, leaving userspace mutexes hung indefinitely.

Delay the OOM reaper, allowing more time for the exit path to perform
the futex cleanup.

Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer

Based on a patch by Michal Hocko.

Link: https://elixir.bootlin.com/glibc/glibc-2.35/source/nptl/allocatestack.c#L370 [1]
Link: https://lkml.kernel.org/r/20220414144042.677008-1-npache@redhat.com
Fixes: 2129258 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
Signed-off-by: Joel Savitz <jsavitz@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
Co-developed-by: Joel Savitz <jsavitz@redhat.com>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Rafael Aquini <aquini@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Herton R. Krzesinski <herton@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joel Savitz <jsavitz@redhat.com>
Cc: Darren Hart <dvhart@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
  • Loading branch information
Nico Pache authored and gregkh committed Apr 27, 2022
1 parent 6b93292 commit ed5d4ef
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 14 deletions.
1 change: 1 addition & 0 deletions include/linux/sched.h
Expand Up @@ -1325,6 +1325,7 @@ struct task_struct {
int pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
struct timer_list oom_reaper_timer;
#endif
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
Expand Down
54 changes: 40 additions & 14 deletions mm/oom_kill.c
Expand Up @@ -633,7 +633,7 @@ static void oom_reap_task(struct task_struct *tsk)
*/
set_bit(MMF_OOM_SKIP, &mm->flags);

/* Drop a reference taken by wake_oom_reaper */
/* Drop a reference taken by queue_oom_reaper */
put_task_struct(tsk);
}

Expand All @@ -643,12 +643,12 @@ static int oom_reaper(void *unused)
struct task_struct *tsk = NULL;

wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
spin_lock(&oom_reaper_lock);
spin_lock_irq(&oom_reaper_lock);
if (oom_reaper_list != NULL) {
tsk = oom_reaper_list;
oom_reaper_list = tsk->oom_reaper_list;
}
spin_unlock(&oom_reaper_lock);
spin_unlock_irq(&oom_reaper_lock);

if (tsk)
oom_reap_task(tsk);
Expand All @@ -657,30 +657,56 @@ static int oom_reaper(void *unused)
return 0;
}

static void wake_oom_reaper(struct task_struct *tsk)
static void wake_oom_reaper(struct timer_list *timer)
{
/* mm is already queued? */
if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
return;
struct task_struct *tsk = container_of(timer, struct task_struct,
oom_reaper_timer);
struct mm_struct *mm = tsk->signal->oom_mm;
unsigned long flags;

get_task_struct(tsk);
/* The victim managed to terminate on its own - see exit_mmap */
if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
put_task_struct(tsk);
return;
}

spin_lock(&oom_reaper_lock);
spin_lock_irqsave(&oom_reaper_lock, flags);
tsk->oom_reaper_list = oom_reaper_list;
oom_reaper_list = tsk;
spin_unlock(&oom_reaper_lock);
spin_unlock_irqrestore(&oom_reaper_lock, flags);
trace_wake_reaper(tsk->pid);
wake_up(&oom_reaper_wait);
}

/*
* Give the OOM victim time to exit naturally before invoking the oom_reaping.
* The timers timeout is arbitrary... the longer it is, the longer the worst
* case scenario for the OOM can take. If it is too small, the oom_reaper can
* get in the way and release resources needed by the process exit path.
* e.g. The futex robust list can sit in Anon|Private memory that gets reaped
* before the exit path is able to wake the futex waiters.
*/
#define OOM_REAPER_DELAY (2*HZ)
static void queue_oom_reaper(struct task_struct *tsk)
{
/* mm is already queued? */
if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
return;

get_task_struct(tsk);
timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
add_timer(&tsk->oom_reaper_timer);
}

static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
return 0;
}
subsys_initcall(oom_init)
#else
static inline void wake_oom_reaper(struct task_struct *tsk)
static inline void queue_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */
Expand Down Expand Up @@ -931,7 +957,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
rcu_read_unlock();

if (can_oom_reap)
wake_oom_reaper(victim);
queue_oom_reaper(victim);

mmdrop(mm);
put_task_struct(victim);
Expand Down Expand Up @@ -967,7 +993,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
task_lock(victim);
if (task_will_free_mem(victim)) {
mark_oom_victim(victim);
wake_oom_reaper(victim);
queue_oom_reaper(victim);
task_unlock(victim);
put_task_struct(victim);
return;
Expand Down Expand Up @@ -1065,7 +1091,7 @@ bool out_of_memory(struct oom_control *oc)
*/
if (task_will_free_mem(current)) {
mark_oom_victim(current);
wake_oom_reaper(current);
queue_oom_reaper(current);
return true;
}

Expand Down

0 comments on commit ed5d4ef

Please sign in to comment.