From 43f2e6077f441d681f0337ab91f7c4c2d4c62761 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sun, 17 Jan 2021 15:17:02 +0200 Subject: [PATCH 01/33] gpio: mvebu: fix pwm .get_state period calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e73b0101ae5124bf7cd3fb5d250302ad2f16a416 upstream. The period is the sum of on and off values. That is, calculate period as ($on + $off) / clkrate instead of $off / clkrate - $on / clkrate that makes no sense. Reported-by: Russell King Reviewed-by: Uwe Kleine-König Fixes: 757642f9a584e ("gpio: mvebu: Add limited PWM support") Signed-off-by: Baruch Siach Signed-off-by: Bartosz Golaszewski [baruch: backport to kernels <= v5.10] Reviewed-by: Uwe Kleine-König Signed-off-by: Baruch Siach Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpio-mvebu.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c index 2f245594a90a6..ed7c5fc47f524 100644 --- a/drivers/gpio/gpio-mvebu.c +++ b/drivers/gpio/gpio-mvebu.c @@ -660,9 +660,8 @@ static void mvebu_pwm_get_state(struct pwm_chip *chip, spin_lock_irqsave(&mvpwm->lock, flags); - val = (unsigned long long) - readl_relaxed(mvebu_pwmreg_blink_on_duration(mvpwm)); - val *= NSEC_PER_SEC; + u = readl_relaxed(mvebu_pwmreg_blink_on_duration(mvpwm)); + val = (unsigned long long) u * NSEC_PER_SEC; do_div(val, mvpwm->clk_rate); if (val > UINT_MAX) state->duty_cycle = UINT_MAX; @@ -671,21 +670,17 @@ static void mvebu_pwm_get_state(struct pwm_chip *chip, else state->duty_cycle = 1; - val = (unsigned long long) - readl_relaxed(mvebu_pwmreg_blink_off_duration(mvpwm)); + val = (unsigned long long) u; /* on duration */ + /* period = on + off duration */ + val += readl_relaxed(mvebu_pwmreg_blink_off_duration(mvpwm)); val *= NSEC_PER_SEC; do_div(val, mvpwm->clk_rate); - if (val < state->duty_cycle) { + if (val > UINT_MAX) + state->period = UINT_MAX; + else if (val) + state->period = val; + else state->period = 1; - } else { - val -= state->duty_cycle; - if (val > UINT_MAX) - state->period = UINT_MAX; - else if (val) - state->period = val; - else - state->period = 1; - } regmap_read(mvchip->regs, GPIO_BLINK_EN_OFF + mvchip->offset, &u); if (u) From bf5eb7d21ab01c12c35df05dddd15f9f2ad5ba71 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Thu, 28 Jan 2021 19:32:50 +0800 Subject: [PATCH 02/33] Revert "mm/slub: fix a memory leak in sysfs_slab_add()" commit 757fed1d0898b893d7daa84183947c70f27632f3 upstream. This reverts commit dde3c6b72a16c2db826f54b2d49bdea26c3534a2. syzbot report a double-free bug. The following case can cause this bug. - mm/slab_common.c: create_cache(): if the __kmem_cache_create() fails, it does: out_free_cache: kmem_cache_free(kmem_cache, s); - but __kmem_cache_create() - at least for slub() - will have done sysfs_slab_add(s) -> sysfs_create_group() .. fails .. -> kobject_del(&s->kobj); .. which frees s ... We can't remove the kmem_cache_free() in create_cache(), because other error cases of __kmem_cache_create() do not free this. So, revert the commit dde3c6b72a16 ("mm/slub: fix a memory leak in sysfs_slab_add()") to fix this. 
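A minimal userspace sketch of the ownership rule the revert restores (names are illustrative, not the real slab code): on failure the callee must leave freeing to the caller, since create_cache() already performs the one and only error-path kmem_cache_free(). If the callee frees as well, the result is exactly the double free syzbot reported.

    #include <stdlib.h>

    struct cache { int unused; };

    /* stands in for __kmem_cache_create(): must NOT free 'c' on failure */
    static int create_backend(struct cache *c)
    {
        (void)c;
        return -1;                      /* e.g. sysfs setup failing */
    }

    static struct cache *create_cache(void)
    {
        struct cache *c = calloc(1, sizeof(*c));

        if (!c)
            return NULL;
        if (create_backend(c) != 0) {
            free(c);                    /* the single error-path free */
            return NULL;
        }
        return c;
    }

    int main(void)
    {
        free(create_cache());           /* free(NULL) is a no-op */
        return 0;
    }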
Reported-by: syzbot+d0bd96b4696c1ef67991@syzkaller.appspotmail.com Fixes: dde3c6b72a16 ("mm/slub: fix a memory leak in sysfs_slab_add()") Acked-by: Vlastimil Babka Signed-off-by: Wang Hai Cc: Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/slub.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3f4303f4b657d..071e41067ea67 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5620,10 +5620,8 @@ static int sysfs_slab_add(struct kmem_cache *s) s->kobj.kset = kset; err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); - if (err) { - kobject_put(&s->kobj); + if (err) goto out; - } err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) From ab5e9a320e444fda64e5912f0e0f4f02021569ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Jan 2021 16:00:24 +0100 Subject: [PATCH 03/33] futex: Ensure the correct return value from futex_lock_pi() commit 12bb3f7f1b03d5913b3f9d4236a488aa7774dfe9 upstream In case that futex_lock_pi() was aborted by a signal or a timeout and the task returned without acquiring the rtmutex, but is the designated owner of the futex due to a concurrent futex_unlock_pi() fixup_owner() is invoked to establish consistent state. In that case it invokes fixup_pi_state_owner() which in turn tries to acquire the rtmutex again. If that succeeds then it does not propagate this success to fixup_owner() and futex_lock_pi() returns -EINTR or -ETIMEOUT despite having the futex locked. Return success from fixup_pi_state_owner() in all cases where the current task owns the rtmutex and therefore the futex and propagate it correctly through fixup_owner(). Fixup the other callsite which does not expect a positive return value. Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex") Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 00259c7e288ee..0c6d572e675a7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2375,8 +2375,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, } if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { - /* We got the lock after all, nothing to fix. */ - ret = 0; + /* We got the lock. pi_state is correct. Tell caller. */ + ret = 1; goto out_unlock; } @@ -2404,7 +2404,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * We raced against a concurrent self; things are * already fixed up. Nothing to do. */ - ret = 0; + ret = 1; goto out_unlock; } newowner = argowner; @@ -2450,7 +2450,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, raw_spin_unlock(&newowner->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - return 0; + return argowner == current; /* * In order to reschedule or handle a page fault, we need to drop the @@ -2492,7 +2492,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * Check if someone else fixed it for us: */ if (pi_state->owner != oldowner) { - ret = 0; + ret = argowner == current; goto out_unlock; } @@ -2525,8 +2525,6 @@ static long futex_wait_restart(struct restart_block *restart); */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { - int ret = 0; - if (locked) { /* * Got the lock. 
We might not be the anticipated owner if we @@ -2537,8 +2535,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * stable state, anything else needs more attention. */ if (q->pi_state->owner != current) - ret = fixup_pi_state_owner(uaddr, q, current); - return ret ? ret : locked; + return fixup_pi_state_owner(uaddr, q, current); + return 1; } /* @@ -2549,10 +2547,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * Another speculative read; pi_state->owner == current is unstable * but needs our attention. */ - if (q->pi_state->owner == current) { - ret = fixup_pi_state_owner(uaddr, q, NULL); - return ret; - } + if (q->pi_state->owner == current) + return fixup_pi_state_owner(uaddr, q, NULL); /* * Paranoia check. If we did not take the lock, then we should not be @@ -2565,7 +2561,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) q->pi_state->owner); } - return ret; + return 0; } /** @@ -3263,7 +3259,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + if (ret < 0 && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { pi_state = q.pi_state; get_pi_state(pi_state); } @@ -3273,6 +3269,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); + /* + * Adjust the return value. It's either -EFAULT or + * success (1) but the caller expects 0 for success. + */ + ret = ret < 0 ? ret : 0; } } else { struct rt_mutex *pi_mutex; From 5ede8ee2cb16f4dd066a37b38ad46576dbf20d45 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Jan 2021 16:06:10 +0100 Subject: [PATCH 04/33] futex: Replace pointless printk in fixup_owner() commit 04b79c55201f02ffd675e1231d731365e335c307 upstream If that unexpected case of inconsistent arguments ever happens then the futex state is left completely inconsistent and the printk is not really helpful. Replace it with a warning and make the state consistent. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 0c6d572e675a7..d28e9c609c69b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2552,14 +2552,10 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) /* * Paranoia check. If we did not take the lock, then we should not be - * the owner of the rt_mutex. + * the owner of the rt_mutex. Warn and establish consistent state. */ - if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { - printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " - "pi-state %p\n", ret, - q->pi_state->pi_mutex.owner, - q->pi_state->owner); - } + if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current)) + return fixup_pi_state_owner(uaddr, q, current); return 0; } From 5b2c5a9561c24f3bcdcb85bb897bdc3aa3375c49 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Jan 2021 15:21:35 +0100 Subject: [PATCH 05/33] futex: Provide and use pi_state_update_owner() commit c5cade200ab9a2a3be9e7f32a752c8d86b502ec7 upstream Updating pi_state::owner is done at several places with the same code. Provide a function for it and use that at the obvious places. 
This is also a preparation for a bug fix to avoid yet another copy of the same code or alternatively introducing a completely unpenetratable mess of gotos. Originally-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 66 +++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index d28e9c609c69b..169f26b0a7c11 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -765,6 +765,29 @@ static struct futex_pi_state *alloc_pi_state(void) return pi_state; } +static void pi_state_update_owner(struct futex_pi_state *pi_state, + struct task_struct *new_owner) +{ + struct task_struct *old_owner = pi_state->owner; + + lockdep_assert_held(&pi_state->pi_mutex.wait_lock); + + if (old_owner) { + raw_spin_lock(&old_owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + raw_spin_unlock(&old_owner->pi_lock); + } + + if (new_owner) { + raw_spin_lock(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &new_owner->pi_state_list); + pi_state->owner = new_owner; + raw_spin_unlock(&new_owner->pi_lock); + } +} + static void get_pi_state(struct futex_pi_state *pi_state) { WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); @@ -1523,26 +1546,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ ret = -EINVAL; } - if (ret) - goto out_unlock; - - /* - * This is a point of no return; once we modify the uval there is no - * going back and subsequent operations must not fail. - */ - - raw_spin_lock(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock(&pi_state->owner->pi_lock); - - raw_spin_lock(&new_owner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &new_owner->pi_state_list); - pi_state->owner = new_owner; - raw_spin_unlock(&new_owner->pi_lock); - - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + if (!ret) { + /* + * This is a point of no return; once we modified the uval + * there is no going back and subsequent operations must + * not fail. + */ + pi_state_update_owner(pi_state, new_owner); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + } out_unlock: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); @@ -2435,19 +2447,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * We fixed up user space. Now we need to fix the pi_state * itself. */ - if (pi_state->owner != NULL) { - raw_spin_lock(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock(&pi_state->owner->pi_lock); - } - - pi_state->owner = newowner; - - raw_spin_lock(&newowner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &newowner->pi_state_list); - raw_spin_unlock(&newowner->pi_lock); + pi_state_update_owner(pi_state, newowner); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return argowner == current; From 6d28ac502f9a0cf30d7ca2eeeefbf8a98c01fe82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Jan 2021 11:32:07 +0100 Subject: [PATCH 06/33] rtmutex: Remove unused argument from rt_mutex_proxy_unlock() commit 2156ac1934166d6deb6cd0f6ffc4c1076ec63697 upstream Nothing uses the argument. Remove it as preparation to use pi_state_update_owner(). 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 2 +- kernel/locking/rtmutex.c | 3 +-- kernel/locking/rtmutex_common.h | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 169f26b0a7c11..b69c557c61bd5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -820,7 +820,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) list_del_init(&pi_state->list); raw_spin_unlock(&owner->pi_lock); } - rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); + rt_mutex_proxy_unlock(&pi_state->pi_mutex); raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index cfdd5b93264d7..2f8cd616d3b29 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1716,8 +1716,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, * possible because it belongs to the pi_state which is about to be freed * and it is not longer visible to other tasks. */ -void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner) +void rt_mutex_proxy_unlock(struct rt_mutex *lock) { debug_rt_mutex_proxy_unlock(lock); rt_mutex_set_owner(lock, NULL); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index d1d62f942be22..ca6fb489007b6 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -133,8 +133,7 @@ enum rtmutex_chainwalk { extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, From a597f12e971c3859fdcc503a25008b37a891f043 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Jan 2021 11:35:19 +0100 Subject: [PATCH 07/33] futex: Use pi_state_update_owner() in put_pi_state() commit 6ccc84f917d33312eb2846bd7b567639f585ad6d upstream No point in open coding it. This way it gains the extra sanity checks. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index b69c557c61bd5..fe83273535ede 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -810,16 +810,10 @@ static void put_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - struct task_struct *owner; unsigned long flags; raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); - owner = pi_state->owner; - if (owner) { - raw_spin_lock(&owner->pi_lock); - list_del_init(&pi_state->list); - raw_spin_unlock(&owner->pi_lock); - } + pi_state_update_owner(pi_state, NULL); rt_mutex_proxy_unlock(&pi_state->pi_mutex); raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); } From abc4dd792f8db54470a888ca825166fbba59ee78 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Jan 2021 16:26:38 +0100 Subject: [PATCH 08/33] futex: Simplify fixup_pi_state_owner() commit f2dac39d93987f7de1e20b3988c8685523247ae2 upstream Too many gotos already and an upcoming fix would make it even more unreadable. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 53 +++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index fe83273535ede..0f61be768cdfa 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2331,18 +2331,13 @@ static void unqueue_me_pi(struct futex_q *q) spin_unlock(q->lock_ptr); } -static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *argowner) +static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct task_struct *argowner) { struct futex_pi_state *pi_state = q->pi_state; - u32 uval, curval, newval; struct task_struct *oldowner, *newowner; - u32 newtid; - int ret, err = 0; - - lockdep_assert_held(q->lock_ptr); - - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + u32 uval, curval, newval, newtid; + int err = 0; oldowner = pi_state->owner; @@ -2376,14 +2371,12 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * We raced against a concurrent self; things are * already fixed up. Nothing to do. */ - ret = 0; - goto out_unlock; + return 0; } if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { /* We got the lock. pi_state is correct. Tell caller. */ - ret = 1; - goto out_unlock; + return 1; } /* @@ -2410,8 +2403,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * We raced against a concurrent self; things are * already fixed up. Nothing to do. */ - ret = 1; - goto out_unlock; + return 1; } newowner = argowner; } @@ -2442,7 +2434,6 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * itself. 
*/ pi_state_update_owner(pi_state, newowner); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return argowner == current; @@ -2465,17 +2456,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, switch (err) { case -EFAULT: - ret = fault_in_user_writeable(uaddr); + err = fault_in_user_writeable(uaddr); break; case -EAGAIN: cond_resched(); - ret = 0; + err = 0; break; default: WARN_ON_ONCE(1); - ret = err; break; } @@ -2485,17 +2475,26 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * Check if someone else fixed it for us: */ - if (pi_state->owner != oldowner) { - ret = argowner == current; - goto out_unlock; - } + if (pi_state->owner != oldowner) + return argowner == current; - if (ret) - goto out_unlock; + /* Retry if err was -EAGAIN or the fault in succeeded */ + if (!err) + goto retry; - goto retry; + return err; +} -out_unlock: +static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct task_struct *argowner) +{ + struct futex_pi_state *pi_state = q->pi_state; + int ret; + + lockdep_assert_held(q->lock_ptr); + + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + ret = __fixup_pi_state_owner(uaddr, q, argowner); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; } From e843e4f782582a10e5077b917948d1df943070f6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jan 2021 19:01:21 +0100 Subject: [PATCH 09/33] futex: Handle faults correctly for PI futexes commit 34b1a1ce1458f50ef27c54e28eb9b1947012907a upstream fixup_pi_state_owner() tries to ensure that the state of the rtmutex, pi_state and the user space value related to the PI futex are consistent before returning to user space. In case that the user space value update faults and the fault cannot be resolved by faulting the page in via fault_in_user_writeable() the function returns with -EFAULT and leaves the rtmutex and pi_state owner state inconsistent. A subsequent futex_unlock_pi() operates on the inconsistent pi_state and releases the rtmutex despite not owning it which can corrupt the RB tree of the rtmutex and cause a subsequent kernel stack use after free. It was suggested to loop forever in fixup_pi_state_owner() if the fault cannot be resolved, but that results in runaway tasks which is especially undesired when the problem happens due to a programming error and not due to malice. As the user space value cannot be fixed up, the proper solution is to make the rtmutex and the pi_state consistent so both have the same owner. This leaves the user space value out of sync. Any subsequent operation on the futex will fail because the 10th rule of PI futexes (pi_state owner and user space value are consistent) has been violated. As a consequence this removes the inept attempts of 'fixing' the situation in case that the current task owns the rtmutex when returning with an unresolvable fault by unlocking the rtmutex which left pi_state::owner and rtmutex::owner out of sync in a different and only slightly less dangerous way. 
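For readers who do not have the futex rules memorized, rule [10] exists because user space keeps the owner's TID in the futex word itself, so the kernel's pi_state and that word must agree. A minimal sketch of the user space half of the protocol (error handling omitted, not production code):

    #include <stdint.h>
    #include <stdatomic.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    static _Atomic uint32_t futex_word;         /* 0 means unlocked */

    static void pi_lock(void)
    {
        uint32_t zero = 0;
        uint32_t tid = (uint32_t)syscall(SYS_gettid);

        /* fast path: a 0 -> TID transition acquires the lock without
         * entering the kernel */
        if (atomic_compare_exchange_strong(&futex_word, &zero, tid))
            return;

        /* slow path: the kernel takes over and keeps the word consistent
         * with pi_state, the consistency that is lost when the user space
         * write in fixup_pi_state_owner() faults */
        syscall(SYS_futex, &futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
    }

    int main(void)
    {
        pi_lock();      /* uncontended here, so the fast path wins */
        return 0;
    }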
Fixes: 1b7558e457ed ("futexes: fix fault handling in futex_lock_pi") Reported-by: gzobqq@gmail.com Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 57 ++++++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 0f61be768cdfa..0693b3ea0f9a4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -960,7 +960,8 @@ static inline void exit_pi_state_list(struct task_struct *curr) { } * FUTEX_OWNER_DIED bit. See [4] * * [10] There is no transient state which leaves owner and user space - * TID out of sync. + * TID out of sync. Except one error case where the kernel is denied + * write access to the user address, see fixup_pi_state_owner(). * * * Serialization and lifetime rules: @@ -2482,6 +2483,24 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, if (!err) goto retry; + /* + * fault_in_user_writeable() failed so user state is immutable. At + * best we can make the kernel state consistent but user state will + * be most likely hosed and any subsequent unlock operation will be + * rejected due to PI futex rule [10]. + * + * Ensure that the rtmutex owner is also the pi_state owner despite + * the user space value claiming something different. There is no + * point in unlocking the rtmutex if current is the owner as it + * would need to wait until the next waiter has taken the rtmutex + * to guarantee consistent state. Keep it simple. Userspace asked + * for this wreckaged state. + * + * The rtmutex has an owner - either current or some other + * task. See the EAGAIN loop above. + */ + pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex)); + return err; } @@ -2758,7 +2777,6 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to; - struct futex_pi_state *pi_state = NULL; struct task_struct *exiting = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; @@ -2894,23 +2912,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, if (res) ret = (res < 0) ? res : 0; - /* - * If fixup_owner() faulted and was unable to handle the fault, unlock - * it and return the fault to userspace. - */ - if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } - /* Unqueue and drop the lock */ unqueue_me_pi(&q); - - if (pi_state) { - rt_mutex_futex_unlock(&pi_state->pi_mutex); - put_pi_state(pi_state); - } - goto out; out_unlock_put_key: @@ -3170,7 +3173,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to; - struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; @@ -3248,10 +3250,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - if (ret < 0 && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } /* * Drop the reference to the pi state which * the requeue_pi() code acquired for us. @@ -3293,25 +3291,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (res) ret = (res < 0) ? 
res : 0; - /* - * If fixup_pi_state_owner() faulted and was unable to handle - * the fault, unlock the rt_mutex and return the fault to - * userspace. - */ - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } - /* Unqueue and drop the lock. */ unqueue_me_pi(&q); } - if (pi_state) { - rt_mutex_futex_unlock(&pi_state->pi_mutex); - put_pi_state(pi_state); - } - if (ret == -EINTR) { /* * We've already been requeued, but cannot restart by calling From a7f6d4ab44344d71a64028f1dbdfbb1e8781572b Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Thu, 21 Jan 2021 10:46:49 -0800 Subject: [PATCH 10/33] HID: wacom: Correct NULL dereference on AES pen proximity commit 179e8e47c02a1950f1c556f2b854bdb2259078fb upstream. The recent commit to fix a memory leak introduced an inadvertant NULL pointer dereference. The `wacom_wac->pen_fifo` variable was never intialized, resuling in a crash whenever functions tried to use it. Since the FIFO is only used by AES pens (to buffer events from pen proximity until the hardware reports the pen serial number) this would have been easily overlooked without testing an AES device. This patch converts `wacom_wac->pen_fifo` over to a pointer (since the call to `devres_alloc` allocates memory for us) and ensures that we assign it to point to the allocated and initalized `pen_fifo` before the function returns. Link: https://github.com/linuxwacom/input-wacom/issues/230 Fixes: 37309f47e2f5 ("HID: wacom: Fix memory leakage caused by kfifo_alloc") CC: stable@vger.kernel.org # v4.19+ Signed-off-by: Jason Gerecke Tested-by: Ping Cheng Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/wacom_sys.c | 7 ++++--- drivers/hid/wacom_wac.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 9e852b4bbf92b..73dafa60080f1 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -147,9 +147,9 @@ static int wacom_wac_pen_serial_enforce(struct hid_device *hdev, } if (flush) - wacom_wac_queue_flush(hdev, &wacom_wac->pen_fifo); + wacom_wac_queue_flush(hdev, wacom_wac->pen_fifo); else if (insert) - wacom_wac_queue_insert(hdev, &wacom_wac->pen_fifo, + wacom_wac_queue_insert(hdev, wacom_wac->pen_fifo, raw_data, report_size); return insert && !flush; @@ -1280,7 +1280,7 @@ static void wacom_devm_kfifo_release(struct device *dev, void *res) static int wacom_devm_kfifo_alloc(struct wacom *wacom) { struct wacom_wac *wacom_wac = &wacom->wacom_wac; - struct kfifo_rec_ptr_2 *pen_fifo = &wacom_wac->pen_fifo; + struct kfifo_rec_ptr_2 *pen_fifo; int error; pen_fifo = devres_alloc(wacom_devm_kfifo_release, @@ -1297,6 +1297,7 @@ static int wacom_devm_kfifo_alloc(struct wacom *wacom) } devres_add(&wacom->hdev->dev, pen_fifo); + wacom_wac->pen_fifo = pen_fifo; return 0; } diff --git a/drivers/hid/wacom_wac.h b/drivers/hid/wacom_wac.h index da612b6e9c779..195910dd2154e 100644 --- a/drivers/hid/wacom_wac.h +++ b/drivers/hid/wacom_wac.h @@ -342,7 +342,7 @@ struct wacom_wac { struct input_dev *pen_input; struct input_dev *touch_input; struct input_dev *pad_input; - struct kfifo_rec_ptr_2 pen_fifo; + struct kfifo_rec_ptr_2 *pen_fifo; int pid; int num_contacts_left; u8 bt_features; From 0fa0a05b4089d136b9d2c1a32fc21320af05629b Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 18 Jan 2021 21:45:23 +0800 Subject: [PATCH 11/33] HID: multitouch: Apply MT_QUIRK_CONFIDENCE quirk for multi-input devices commit 
794c613383433ffc4fceec8eaa081b9f1962e287 upstream. Palm ejection stops working on some Elan and Synaptics touchpad after commit 40d5bb87377a ("HID: multitouch: enable multi-input as a quirk for some devices"). The commit changes the mt_class from MT_CLS_WIN_8 to MT_CLS_WIN_8_FORCE_MULTI_INPUT, so MT_QUIRK_CONFIDENCE isn't applied anymore. So also apply the quirk since MT_CLS_WIN_8_FORCE_MULTI_INPUT is essentially MT_CLS_WIN_8. Fixes: 40d5bb87377a ("HID: multitouch: enable multi-input as a quirk for some devices") Cc: stable@vger.kernel.org Signed-off-by: Kai-Heng Feng Signed-off-by: Benjamin Tissoires Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-multitouch.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 0743ef51d3b24..8429ebe7097e4 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -758,7 +758,8 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi, MT_STORE_FIELD(inrange_state); return 1; case HID_DG_CONFIDENCE: - if (cls->name == MT_CLS_WIN_8 && + if ((cls->name == MT_CLS_WIN_8 || + cls->name == MT_CLS_WIN_8_FORCE_MULTI_INPUT) && (field->application == HID_DG_TOUCHPAD || field->application == HID_DG_TOUCHSCREEN)) app->quirks |= MT_QUIRK_CONFIDENCE; From 94fb5ff34544897982f7773868fa10530f13bb2d Mon Sep 17 00:00:00 2001 From: Naushir Patuck Date: Wed, 6 Jan 2021 16:16:57 +0100 Subject: [PATCH 12/33] media: Revert "media: videobuf2: Fix length check for single plane dmabuf queueing" commit 95e9295daa849095d8be05fb6e26b2ba9be1594f upstream. The updated length check for dmabuf types broke existing usage in v4l2 userland clients. Fixes: 961d3b27 ("media: videobuf2: Fix length check for single plane dmabuf queueing") Cc: stable@vger.kernel.org Signed-off-by: Naushir Patuck Tested-by: Kieran Bingham Reviewed-by: Kieran Bingham Reviewed-by: Laurent Pinchart Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/common/videobuf2/videobuf2-v4l2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c index 96d3b2b2aa318..3f61f5863bf77 100644 --- a/drivers/media/common/videobuf2/videobuf2-v4l2.c +++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c @@ -118,8 +118,7 @@ static int __verify_length(struct vb2_buffer *vb, const struct v4l2_buffer *b) return -EINVAL; } } else { - length = (b->memory == VB2_MEMORY_USERPTR || - b->memory == VB2_MEMORY_DMABUF) + length = (b->memory == VB2_MEMORY_USERPTR) ? b->length : vb->planes[0].length; if (b->bytesused > length) From 77727dfda786ce1d333ebc9c8777d821fe86466a Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 18 Jan 2021 16:37:00 +0100 Subject: [PATCH 13/33] media: v4l2-subdev.h: BIT() is not available in userspace commit a53e3c189cc6460b60e152af3fc24edf8e0ea9d2 upstream. The BIT macro is not available in userspace, so replace BIT(0) by 0x00000001. 
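The reason this matters is that uapi headers are compiled directly into user programs, where BIT() does not exist and the build breaks. A small sketch of such a consumer (the sub-device node path is only an example):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/v4l2-subdev.h>

    int main(void)
    {
        struct v4l2_subdev_capability cap = {0};
        int fd = open("/dev/v4l-subdev0", O_RDWR);      /* example node */

        if (fd < 0)
            return 1;
        if (ioctl(fd, VIDIOC_SUBDEV_QUERYCAP, &cap) == 0 &&
            (cap.capabilities & V4L2_SUBDEV_CAP_RO_SUBDEV))
            printf("sub-device is read-only\n");
        close(fd);
        return 0;
    }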
Signed-off-by: Hans Verkuil Fixes: 6446ec6cbf46 ("media: v4l2-subdev: add VIDIOC_SUBDEV_QUERYCAP ioctl") Cc: Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/v4l2-subdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/v4l2-subdev.h b/include/uapi/linux/v4l2-subdev.h index 00850b98078a2..a38454d9e0f54 100644 --- a/include/uapi/linux/v4l2-subdev.h +++ b/include/uapi/linux/v4l2-subdev.h @@ -176,7 +176,7 @@ struct v4l2_subdev_capability { }; /* The v4l2 sub-device video device node is registered in read-only mode. */ -#define V4L2_SUBDEV_CAP_RO_SUBDEV BIT(0) +#define V4L2_SUBDEV_CAP_RO_SUBDEV 0x00000001 /* Backwards compatibility define --- to be removed */ #define v4l2_subdev_edid v4l2_edid From 1fa2fa7932f9f2e695453ed7160d557eced07bb4 Mon Sep 17 00:00:00 2001 From: Bryan Tan Date: Mon, 18 Jan 2021 19:16:29 -0800 Subject: [PATCH 14/33] RDMA/vmw_pvrdma: Fix network_hdr_type reported in WC commit 9f206f7398f6f6ec7dd0198c045c2459b4f720b6 upstream. The PVRDMA device HW interface defines network_hdr_type according to an old definition of the internal kernel rdma_network_type enum that has since changed, resulting in the wrong rdma_network_type being reported. Fix this by explicitly defining the enum used by the PVRDMA device and adding a function to convert the pvrdma_network_type to rdma_network_type enum. Cc: stable@vger.kernel.org # 5.10+ Fixes: 1c15b4f2a42f ("RDMA/core: Modify enum ib_gid_type and enum rdma_network_type") Link: https://lore.kernel.org/r/1611026189-17943-1-git-send-email-bryantan@vmware.com Reviewed-by: Adit Ranadive Signed-off-by: Bryan Tan Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/vmw_pvrdma/pvrdma.h | 14 ++++++++++++++ drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 2 +- include/uapi/rdma/vmw_pvrdma-abi.h | 7 +++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h index c142f5e7f25f8..de57f2fed7437 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h @@ -509,6 +509,20 @@ static inline int ib_send_flags_to_pvrdma(int flags) return flags & PVRDMA_MASK(PVRDMA_SEND_FLAGS_MAX); } +static inline int pvrdma_network_type_to_ib(enum pvrdma_network_type type) +{ + switch (type) { + case PVRDMA_NETWORK_ROCE_V1: + return RDMA_NETWORK_ROCE_V1; + case PVRDMA_NETWORK_IPV4: + return RDMA_NETWORK_IPV4; + case PVRDMA_NETWORK_IPV6: + return RDMA_NETWORK_IPV6; + default: + return RDMA_NETWORK_IPV6; + } +} + void pvrdma_qp_cap_to_ib(struct ib_qp_cap *dst, const struct pvrdma_qp_cap *src); void ib_qp_cap_to_pvrdma(struct pvrdma_qp_cap *dst, diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 319546a39a0d5..62164db593a4f 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -364,7 +364,7 @@ static int pvrdma_poll_one(struct pvrdma_cq *cq, struct pvrdma_qp **cur_qp, wc->dlid_path_bits = cqe->dlid_path_bits; wc->port_num = cqe->port_num; wc->vendor_err = cqe->vendor_err; - wc->network_hdr_type = cqe->network_hdr_type; + wc->network_hdr_type = pvrdma_network_type_to_ib(cqe->network_hdr_type); /* Update shared ring state */ pvrdma_idx_ring_inc(&cq->ring_state->rx.cons_head, cq->ibcq.cqe); diff --git a/include/uapi/rdma/vmw_pvrdma-abi.h b/include/uapi/rdma/vmw_pvrdma-abi.h index f8b638c73371d..901a4fd72c09f 
100644 --- a/include/uapi/rdma/vmw_pvrdma-abi.h +++ b/include/uapi/rdma/vmw_pvrdma-abi.h @@ -133,6 +133,13 @@ enum pvrdma_wc_flags { PVRDMA_WC_FLAGS_MAX = PVRDMA_WC_WITH_NETWORK_HDR_TYPE, }; +enum pvrdma_network_type { + PVRDMA_NETWORK_IB, + PVRDMA_NETWORK_ROCE_V1 = PVRDMA_NETWORK_IB, + PVRDMA_NETWORK_IPV4, + PVRDMA_NETWORK_IPV6 +}; + struct pvrdma_alloc_ucontext_resp { __u32 qp_tab_size; __u32 reserved; From 08a922a6fdf8c3eaea7f6a3beb13c6eed75994dc Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 12 Jan 2021 14:24:48 +0100 Subject: [PATCH 15/33] iwlwifi: dbg: Don't touch the tlv data commit a6616bc9a0af7c65c0b0856a7508870a4a40c4ac upstream. The commit ba8f6f4ae254 ("iwlwifi: dbg: add dumping special device memory") added a termination of name string just to be sure, and this seems causing a regression, a GPF triggered at firmware loading. Basically we shouldn't modify the firmware data that may be provided as read-only. This patch drops the code that caused the regression and keep the tlv data as is. Fixes: ba8f6f4ae254 ("iwlwifi: dbg: add dumping special device memory") BugLink: https://bugzilla.suse.com/show_bug.cgi?id=1180344 BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=210733 Cc: stable@vger.kernel.org Signed-off-by: Takashi Iwai Acked-by: Luca Coelho Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210112132449.22243-2-tiwai@suse.de Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c b/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c index 8fa1c22fd96db..fcad5cdcabfa4 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c @@ -237,13 +237,6 @@ static int iwl_dbg_tlv_alloc_region(struct iwl_trans *trans, if (le32_to_cpu(tlv->length) < sizeof(*reg)) return -EINVAL; - /* For safe using a string from FW make sure we have a - * null terminator - */ - reg->name[IWL_FW_INI_MAX_NAME - 1] = 0; - - IWL_DEBUG_FW(trans, "WRT: parsing region: %s\n", reg->name); - if (id >= IWL_FW_INI_MAX_REGION_ID) { IWL_ERR(trans, "WRT: Invalid region id %u\n", id); return -EINVAL; From 7bf3fb6243a3b153ab1854b331ec19d67a4878bb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Jan 2021 11:17:00 +0000 Subject: [PATCH 16/33] kernel/io_uring: cancel io_uring before task works [ Upstream commit b1b6b5a30dce872f500dc43f067cba8e7f86fc7d ] For cancelling io_uring requests it needs either to be able to run currently enqueued task_works or having it shut down by that moment. Otherwise io_uring_cancel_files() may be waiting for requests that won't ever complete. Go with the first way and do cancellations before setting PF_EXITING and so before putting the task_work infrastructure into a transition state where task_work_run() would better not be called. 
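The "requests that won't ever complete" case is easy to reach from user space. A trivial sketch (liburing for brevity, error handling omitted) queues a poll on a pipe that never becomes readable and then simply exits, leaving exit-time cancellation to reap it:

    #include <liburing.h>
    #include <unistd.h>
    #include <poll.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        int fds[2];

        if (pipe(fds) || io_uring_queue_init(4, &ring, 0))
            return 1;

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_poll_add(sqe, fds[0], POLLIN);    /* never readable */
        io_uring_submit(&ring);

        /* exit with the poll still pending; with this patch do_exit()
         * cancels it via io_uring_files_cancel() before PF_EXITING is set */
        return 0;
    }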
Cc: stable@vger.kernel.org # 5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/file.c | 2 -- kernel/exit.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/file.c b/fs/file.c index 4559b5fec3bd5..21c0893f2f1df 100644 --- a/fs/file.c +++ b/fs/file.c @@ -21,7 +21,6 @@ #include #include #include -#include unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -453,7 +452,6 @@ void exit_files(struct task_struct *tsk) struct files_struct * files = tsk->files; if (files) { - io_uring_files_cancel(files); task_lock(tsk); tsk->files = NULL; task_unlock(tsk); diff --git a/kernel/exit.c b/kernel/exit.c index 1f236ed375f83..d13d67fc5f4e2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -762,6 +763,7 @@ void __noreturn do_exit(long code) schedule(); } + io_uring_files_cancel(tsk->files); exit_signals(tsk); /* sets PF_EXITING */ /* sync mm's RSS info before statistics gathering */ From 18f31594ee52ed1f364e376767fb839935fd899c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Jan 2021 11:17:01 +0000 Subject: [PATCH 17/33] io_uring: inline io_uring_attempt_task_drop() [ Upstream commit 4f793dc40bc605b97624fd36baf085b3c35e8bfd ] A simple preparation change inlining io_uring_attempt_task_drop() into io_uring_flush(). Cc: stable@vger.kernel.org # 5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8cb0db187d90f..c4e0c352115fe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8835,23 +8835,6 @@ static void io_uring_del_task_file(struct file *file) fput(file); } -/* - * Drop task note for this file if we're the only ones that hold it after - * pending fput() - */ -static void io_uring_attempt_task_drop(struct file *file) -{ - if (!current->io_uring) - return; - /* - * fput() is pending, will be 2 if the only other ref is our potential - * task file note. If the task is exiting, drop regardless of count. - */ - if (fatal_signal_pending(current) || (current->flags & PF_EXITING) || - atomic_long_read(&file->f_count) == 2) - io_uring_del_task_file(file); -} - static void io_uring_remove_task_files(struct io_uring_task *tctx) { struct file *file; @@ -8943,7 +8926,17 @@ void __io_uring_task_cancel(void) static int io_uring_flush(struct file *file, void *data) { - io_uring_attempt_task_drop(file); + if (!current->io_uring) + return 0; + + /* + * fput() is pending, will be 2 if the only other ref is our potential + * task file note. If the task is exiting, drop regardless of count. + */ + if (fatal_signal_pending(current) || (current->flags & PF_EXITING) || + atomic_long_read(&file->f_count) == 2) + io_uring_del_task_file(file); + return 0; } From da67631a33c342528245817cc61e36dd945665b0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Jan 2021 11:17:02 +0000 Subject: [PATCH 18/33] io_uring: add warn_once for io_uring_flush() [ Upstream commit 6b5733eb638b7068ab7cb34e663b55a1d1892d85] files_cancel() should cancel all relevant requests and drop file notes, so we should never have file notes after that, including on-exit fput and flush. Add a WARN_ONCE to be sure. 
Cc: stable@vger.kernel.org # 5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c4e0c352115fe..70439e97457d4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8926,17 +8926,23 @@ void __io_uring_task_cancel(void) static int io_uring_flush(struct file *file, void *data) { - if (!current->io_uring) + struct io_uring_task *tctx = current->io_uring; + + if (!tctx) return 0; + /* we should have cancelled and erased it before PF_EXITING */ + WARN_ON_ONCE((current->flags & PF_EXITING) && + xa_load(&tctx->xa, (unsigned long)file)); + /* * fput() is pending, will be 2 if the only other ref is our potential * task file note. If the task is exiting, drop regardless of count. */ - if (fatal_signal_pending(current) || (current->flags & PF_EXITING) || - atomic_long_read(&file->f_count) == 2) - io_uring_del_task_file(file); + if (atomic_long_read(&file->f_count) != 2) + return 0; + io_uring_del_task_file(file); return 0; } From a63d9157571b52f7339d6db4c2ab7bc3bfe527c0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Jan 2021 11:17:03 +0000 Subject: [PATCH 19/33] io_uring: stop SQPOLL submit on creator's death [ Upstream commit d9d05217cb6990b9a56e13b56e7a1b71e2551f6c ] When the creator of SQPOLL io_uring dies (i.e. sqo_task), we don't want its internals like ->files and ->mm to be poked by the SQPOLL task, it have never been nice and recently got racy. That can happen when the owner undergoes destruction and SQPOLL tasks tries to submit new requests in parallel, and so calls io_sq_thread_acquire*(). That patch halts SQPOLL submissions when sqo_task dies by introducing sqo_dead flag. Once set, the SQPOLL task must not do any submission, which is synchronised by uring_lock as well as the new flag. The tricky part is to make sure that disabling always happens, that means either the ring is discovered by creator's do_exit() -> cancel, or if the final close() happens before it's done by the creator. The last is guaranteed by the fact that for SQPOLL the creator task and only it holds exactly one file note, so either it pins up to do_exit() or removed by the creator on the final put in flush. (see comments in uring_flush() around file->f_count == 2). One more place that can trigger io_sq_thread_acquire_*() is __io_req_task_submit(). Shoot off requests on sqo_dead there, even though actually we don't need to. That's because cancellation of sqo_task should wait for the request before going any further. note 1: io_disable_sqo_submit() does io_ring_set_wakeup_flag() so the caller would enter the ring to get an error, but it still doesn't guarantee that the flag won't be cleared. note 2: if final __userspace__ close happens not from the creator task, the file note will pin the ring until the task dies. 
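From user space the visible effect is on rings created with IORING_SETUP_SQPOLL: once the creating task is gone, io_uring_enter() on the ring fd fails with EOWNERDEAD instead of letting the SQPOLL thread poke the dead owner's mm and files. A small sketch of creating such a ring (assumes liburing; SQPOLL needs elevated privileges on these kernels):

    #include <liburing.h>
    #include <stdio.h>

    int main(void)
    {
        struct io_uring ring;
        int ret = io_uring_queue_init(8, &ring, IORING_SETUP_SQPOLL);

        if (ret < 0) {
            fprintf(stderr, "io_uring_queue_init: %d\n", ret);
            return 1;
        }
        /* if this fd is inherited and the creator exits, submission is
         * refused with -EOWNERDEAD rather than raced against */
        io_uring_queue_exit(&ring);
        return 0;
    }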
Cc: stable@vger.kernel.org # 5.5+ Fixed: b1b6b5a30dce8 ("kernel/io_uring: cancel io_uring before task works") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 58 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 70439e97457d4..bcc26fc04d005 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -260,6 +260,7 @@ struct io_ring_ctx { unsigned int drain_next: 1; unsigned int eventfd_async: 1; unsigned int restricted: 1; + unsigned int sqo_dead: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -2083,11 +2084,9 @@ static void io_req_task_cancel(struct callback_head *cb) static void __io_req_task_submit(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - bool fail; - fail = __io_sq_thread_acquire_mm(ctx); mutex_lock(&ctx->uring_lock); - if (!fail) + if (!ctx->sqo_dead && !__io_sq_thread_acquire_mm(ctx)) __io_queue_sqe(req, NULL); else __io_req_task_cancel(req, -EFAULT); @@ -6796,7 +6795,7 @@ static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx, to_submit = 8; mutex_lock(&ctx->uring_lock); - if (likely(!percpu_ref_is_dying(&ctx->refs))) + if (likely(!percpu_ref_is_dying(&ctx->refs) && !ctx->sqo_dead)) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); @@ -8487,6 +8486,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); /* if force is set, the ring is going away. always drop after that */ + + if (WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && !ctx->sqo_dead)) + ctx->sqo_dead = 1; + ctx->cq_overflow_flushed = 1; if (ctx->rings) __io_cqring_overflow_flush(ctx, true, NULL, NULL); @@ -8745,6 +8748,18 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, return ret; } +static void io_disable_sqo_submit(struct io_ring_ctx *ctx) +{ + WARN_ON_ONCE(ctx->sqo_task != current); + + mutex_lock(&ctx->uring_lock); + ctx->sqo_dead = 1; + mutex_unlock(&ctx->uring_lock); + + /* make sure callers enter the ring to get error */ + io_ring_set_wakeup_flag(ctx); +} + /* * We need to iteratively cancel requests, in case a request has dependent * hard links. 
These persist even for failure of cancelations, hence keep @@ -8756,6 +8771,8 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, struct task_struct *task = current; if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { + /* for SQPOLL only sqo_task has task notes */ + io_disable_sqo_submit(ctx); task = ctx->sq_data->thread; atomic_inc(&task->io_uring->in_idle); io_sq_thread_park(ctx->sq_data); @@ -8927,6 +8944,7 @@ void __io_uring_task_cancel(void) static int io_uring_flush(struct file *file, void *data) { struct io_uring_task *tctx = current->io_uring; + struct io_ring_ctx *ctx = file->private_data; if (!tctx) return 0; @@ -8942,7 +8960,16 @@ static int io_uring_flush(struct file *file, void *data) if (atomic_long_read(&file->f_count) != 2) return 0; - io_uring_del_task_file(file); + if (ctx->flags & IORING_SETUP_SQPOLL) { + /* there is only one file note, which is owned by sqo_task */ + WARN_ON_ONCE((ctx->sqo_task == current) == + !xa_load(&tctx->xa, (unsigned long)file)); + + io_disable_sqo_submit(ctx); + } + + if (!(ctx->flags & IORING_SETUP_SQPOLL) || ctx->sqo_task == current) + io_uring_del_task_file(file); return 0; } @@ -9016,8 +9043,9 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, #endif /* !CONFIG_MMU */ -static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) +static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) { + int ret = 0; DEFINE_WAIT(wait); do { @@ -9026,6 +9054,11 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); + if (unlikely(ctx->sqo_dead)) { + ret = -EOWNERDEAD; + goto out; + } + if (!io_sqring_full(ctx)) break; @@ -9033,6 +9066,8 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) } while (!signal_pending(current)); finish_wait(&ctx->sqo_sq_wait, &wait); +out: + return ret; } SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, @@ -9076,10 +9111,16 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ctx->flags & IORING_SETUP_SQPOLL) { io_cqring_overflow_flush(ctx, false, NULL, NULL); + ret = -EOWNERDEAD; + if (unlikely(ctx->sqo_dead)) + goto out; if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); - if (flags & IORING_ENTER_SQ_WAIT) - io_sqpoll_wait_sq(ctx); + if (flags & IORING_ENTER_SQ_WAIT) { + ret = io_sqpoll_wait_sq(ctx); + if (ret) + goto out; + } submitted = to_submit; } else if (to_submit) { ret = io_uring_add_task_file(ctx, f.file); @@ -9498,6 +9539,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: + io_disable_sqo_submit(ctx); io_ring_ctx_wait_and_kill(ctx); return ret; } From 0e3562e3b2aeb4a6aa4615185a8f59c51cade61b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Jan 2021 11:17:04 +0000 Subject: [PATCH 20/33] io_uring: fix null-deref in io_disable_sqo_submit [ Upstream commit b4411616c26f26c4017b8fa4d3538b1a02028733 ] general protection fault, probably for non-canonical address 0xdffffc0000000022: 0000 [#1] KASAN: null-ptr-deref in range [0x0000000000000110-0x0000000000000117] RIP: 0010:io_ring_set_wakeup_flag fs/io_uring.c:6929 [inline] RIP: 0010:io_disable_sqo_submit+0xdb/0x130 fs/io_uring.c:8891 Call Trace: io_uring_create fs/io_uring.c:9711 [inline] io_uring_setup+0x12b1/0x38e0 fs/io_uring.c:9739 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 io_disable_sqo_submit() might be called before user rings were 
allocated, don't do io_ring_set_wakeup_flag() in those cases. Cc: stable@vger.kernel.org # 5.5+ Reported-by: syzbot+ab412638aeb652ded540@syzkaller.appspotmail.com Fixes: d9d05217cb69 ("io_uring: stop SQPOLL submit on creator's death") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index bcc26fc04d005..239cbcf439a2d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8757,7 +8757,8 @@ static void io_disable_sqo_submit(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); /* make sure callers enter the ring to get error */ - io_ring_set_wakeup_flag(ctx); + if (ctx->rings) + io_ring_set_wakeup_flag(ctx); } /* From 8cb6f4da831bc51145aee3a923f03114121dea6b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Jan 2021 11:17:05 +0000 Subject: [PATCH 21/33] io_uring: do sqo disable on install_fd error [ Upstream commit 06585c497b55045ec21aa8128e340f6a6587351c ] WARNING: CPU: 0 PID: 8494 at fs/io_uring.c:8717 io_ring_ctx_wait_and_kill+0x4f2/0x600 fs/io_uring.c:8717 Call Trace: io_uring_release+0x3e/0x50 fs/io_uring.c:8759 __fput+0x283/0x920 fs/file_table.c:280 task_work_run+0xdd/0x190 kernel/task_work.c:140 tracehook_notify_resume include/linux/tracehook.h:189 [inline] exit_to_user_mode_loop kernel/entry/common.c:174 [inline] exit_to_user_mode_prepare+0x249/0x250 kernel/entry/common.c:201 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline] syscall_exit_to_user_mode+0x19/0x50 kernel/entry/common.c:302 entry_SYSCALL_64_after_hwframe+0x44/0xa9 failed io_uring_install_fd() is a special case, we don't do io_ring_ctx_wait_and_kill() directly but defer it to fput, though still need to io_disable_sqo_submit() before. note: it doesn't fix any real problem, just a warning. That's because sqring won't be available to the userspace in this case and so SQPOLL won't submit anything. Cc: stable@vger.kernel.org # 5.5+ Reported-by: syzbot+9c9c35374c0ecac06516@syzkaller.appspotmail.com Fixes: d9d05217cb69 ("io_uring: stop SQPOLL submit on creator's death") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 239cbcf439a2d..80dcd4250c4c5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9532,6 +9532,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, */ ret = io_uring_install_fd(ctx, file); if (ret < 0) { + io_disable_sqo_submit(ctx); /* fput will clean it up */ fput(file); return ret; From 0682759126bc761c325325ca809ae99c93fda2a0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Jan 2021 11:17:06 +0000 Subject: [PATCH 22/33] io_uring: fix false positive sqo warning on flush [ Upstream commit 6b393a1ff1746a1c91bd95cbb2d79b104d8f15ac ] WARNING: CPU: 1 PID: 9094 at fs/io_uring.c:8884 io_disable_sqo_submit+0x106/0x130 fs/io_uring.c:8884 Call Trace: io_uring_flush+0x28b/0x3a0 fs/io_uring.c:9099 filp_close+0xb4/0x170 fs/open.c:1280 close_fd+0x5c/0x80 fs/file.c:626 __do_sys_close fs/open.c:1299 [inline] __se_sys_close fs/open.c:1297 [inline] __x64_sys_close+0x2f/0xa0 fs/open.c:1297 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 io_uring's final close() may be triggered by any task not only the creator. 
It's well handled by io_uring_flush() including SQPOLL case, though a warning in io_disable_sqo_submit() will fallaciously fire by moving this warning out to the only call site that matters. Cc: stable@vger.kernel.org # 5.5+ Reported-by: syzbot+2f5d1785dc624932da78@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 80dcd4250c4c5..377123ff425fc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8750,8 +8750,6 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, static void io_disable_sqo_submit(struct io_ring_ctx *ctx) { - WARN_ON_ONCE(ctx->sqo_task != current); - mutex_lock(&ctx->uring_lock); ctx->sqo_dead = 1; mutex_unlock(&ctx->uring_lock); @@ -8773,6 +8771,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { /* for SQPOLL only sqo_task has task notes */ + WARN_ON_ONCE(ctx->sqo_task != current); io_disable_sqo_submit(ctx); task = ctx->sq_data->thread; atomic_inc(&task->io_uring->in_idle); From 54b4c4f9aba9e5d1ef6877f42a57895b189107c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Jan 2021 11:17:07 +0000 Subject: [PATCH 23/33] io_uring: fix uring_flush in exit_files() warning [ Upstream commit 4325cb498cb743dacaa3edbec398c5255f476ef6 ] WARNING: CPU: 1 PID: 11100 at fs/io_uring.c:9096 io_uring_flush+0x326/0x3a0 fs/io_uring.c:9096 RIP: 0010:io_uring_flush+0x326/0x3a0 fs/io_uring.c:9096 Call Trace: filp_close+0xb4/0x170 fs/open.c:1280 close_files fs/file.c:401 [inline] put_files_struct fs/file.c:416 [inline] put_files_struct+0x1cc/0x350 fs/file.c:413 exit_files+0x7e/0xa0 fs/file.c:433 do_exit+0xc22/0x2ae0 kernel/exit.c:820 do_group_exit+0x125/0x310 kernel/exit.c:922 get_signal+0x3e9/0x20a0 kernel/signal.c:2770 arch_do_signal_or_restart+0x2a8/0x1eb0 arch/x86/kernel/signal.c:811 handle_signal_work kernel/entry/common.c:147 [inline] exit_to_user_mode_loop kernel/entry/common.c:171 [inline] exit_to_user_mode_prepare+0x148/0x250 kernel/entry/common.c:201 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline] syscall_exit_to_user_mode+0x19/0x50 kernel/entry/common.c:302 entry_SYSCALL_64_after_hwframe+0x44/0xa9 An SQPOLL ring creator task may have gotten rid of its file note during exit and called io_disable_sqo_submit(), but the io_uring is still left referenced through fdtable, which will be put during close_files() and cause a false positive warning. First split the warning into two for more clarity when is hit, and the add sqo_dead check to handle the described case. 
Cc: stable@vger.kernel.org # 5.5+ Reported-by: syzbot+a32b546d58dde07875a1@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 377123ff425fc..02788fd54357c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8962,7 +8962,10 @@ static int io_uring_flush(struct file *file, void *data) if (ctx->flags & IORING_SETUP_SQPOLL) { /* there is only one file note, which is owned by sqo_task */ - WARN_ON_ONCE((ctx->sqo_task == current) == + WARN_ON_ONCE(ctx->sqo_task != current && + xa_load(&tctx->xa, (unsigned long)file)); + /* sqo_dead check is for when this happens after cancellation */ + WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead && !xa_load(&tctx->xa, (unsigned long)file)); io_disable_sqo_submit(ctx); From 186725a80c4e931b6fe31b94d66c989d5f2354c1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Jan 2021 11:17:08 +0000 Subject: [PATCH 24/33] io_uring: fix skipping disabling sqo on exec [ Upstream commit 0b5cd6c32b14413bf87e10ee62be3162588dcbe6 ] If there are no requests at the time __io_uring_task_cancel() is called, tctx_inflight() returns zero and and it terminates not getting a chance to go through __io_uring_files_cancel() and do io_disable_sqo_submit(). And we absolutely want them disabled by the time cancellation ends. Cc: stable@vger.kernel.org # 5.5+ Reported-by: Jens Axboe Signed-off-by: Pavel Begunkov Fixes: d9d05217cb69 ("io_uring: stop SQPOLL submit on creator's death") Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 02788fd54357c..5018d4928d3e7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8917,6 +8917,10 @@ void __io_uring_task_cancel(void) /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); + /* trigger io_disable_sqo_submit() */ + if (tctx->sqpoll) + __io_uring_files_cancel(NULL); + do { /* read completions before cancelations */ inflight = tctx_inflight(tctx); From 7bccd1c19128140b9fefaa43808924c6932bef5b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Jan 2021 11:17:09 +0000 Subject: [PATCH 25/33] io_uring: dont kill fasync under completion_lock [ Upstream commit 4aa84f2ffa81f71e15e5cffc2cc6090dbee78f8e ] CPU0 CPU1 ---- ---- lock(&new->fa_lock); local_irq_disable(); lock(&ctx->completion_lock); lock(&new->fa_lock); lock(&ctx->completion_lock); *** DEADLOCK *** Move kill_fasync() out of io_commit_cqring() to io_cqring_ev_posted(), so it doesn't hold completion_lock while doing it. That saves from the reported deadlock, and it's just nice to shorten the locking time and untangle nested locks (compl_lock -> wq_head::lock). 
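The "notify only after unlock" shape is easy to see outside the kernel as well. A purely illustrative pthread analogue (hypothetical names) keeps the notifier's own lock from ever nesting inside the state lock, which is the inversion the lockdep report above shows:

    #include <pthread.h>

    static pthread_mutex_t state_lock  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t notify_lock = PTHREAD_MUTEX_INITIALIZER;
    static int pending_events;

    static void notify_listeners(void)
    {
        pthread_mutex_lock(&notify_lock);       /* plays the role of fa_lock */
        /* deliver the SIGIO-style notification here */
        pthread_mutex_unlock(&notify_lock);
    }

    static void post_event(void)
    {
        pthread_mutex_lock(&state_lock);        /* plays completion_lock */
        pending_events++;
        pthread_mutex_unlock(&state_lock);

        notify_listeners();                     /* no state_lock held: no nesting */
    }

    int main(void)
    {
        post_event();
        return 0;
    }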
Cc: stable@vger.kernel.org # 5.5+ Reported-by: syzbot+91ca3f25bd7f795f019c@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5018d4928d3e7..d66afd3fe7ee9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1214,11 +1214,6 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) /* order cqe stores with ring update */ smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); - - if (wq_has_sleeper(&ctx->cq_wait)) { - wake_up_interruptible(&ctx->cq_wait); - kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); - } } static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req) @@ -1604,6 +1599,10 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); + } if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) From d92d00861e98db178bd721876d0a06d1e8d5ff1a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Jan 2021 11:17:10 +0000 Subject: [PATCH 26/33] io_uring: fix sleeping under spin in __io_clean_op [ Upstream commit 9d5c8190683a462dbc787658467a0da17011ea5f ] [ 27.629441] BUG: sleeping function called from invalid context at fs/file.c:402 [ 27.631317] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1012, name: io_wqe_worker-0 [ 27.633220] 1 lock held by io_wqe_worker-0/1012: [ 27.634286] #0: ffff888105e26c98 (&ctx->completion_lock) {....}-{2:2}, at: __io_req_complete.part.102+0x30/0x70 [ 27.649249] Call Trace: [ 27.649874] dump_stack+0xac/0xe3 [ 27.650666] ___might_sleep+0x284/0x2c0 [ 27.651566] put_files_struct+0xb8/0x120 [ 27.652481] __io_clean_op+0x10c/0x2a0 [ 27.653362] __io_cqring_fill_event+0x2c1/0x350 [ 27.654399] __io_req_complete.part.102+0x41/0x70 [ 27.655464] io_openat2+0x151/0x300 [ 27.656297] io_issue_sqe+0x6c/0x14e0 [ 27.660991] io_wq_submit_work+0x7f/0x240 [ 27.662890] io_worker_handle_work+0x501/0x8a0 [ 27.664836] io_wqe_worker+0x158/0x520 [ 27.667726] kthread+0x134/0x180 [ 27.669641] ret_from_fork+0x1f/0x30 Instead of cleaning files on overflow, return back overflow cancellation into io_uring_cancel_files(). Previously it was racy to clean REQ_F_OVERFLOW flag, but we got rid of it, and can do it through repetitive attempts targeting all matching requests. 
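The shape of the fix is the usual "never block inside a locked atomic section" rule: under the lock only unlink or mark the request, and do the work that can sleep (dropping the files reference here) from a context where sleeping is allowed. A rough user-space sketch of that pattern, with invented names:

#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

struct req { int fd; struct req *next; };

static pthread_mutex_t completion_lock = PTHREAD_MUTEX_INITIALIZER;
static struct req *done_list;   /* completed but not yet cleaned requests */

/* Completion path: treated as atomic, so it may only unlink the request. */
static void complete_req(struct req *r)
{
	pthread_mutex_lock(&completion_lock);
	r->next = done_list;
	done_list = r;
	pthread_mutex_unlock(&completion_lock);
}

/* Work-cleanup path: ordinary context, blocking calls are fine here. */
static void clean_work(void)
{
	struct req *list;

	pthread_mutex_lock(&completion_lock);
	list = done_list;               /* grab the batch under the lock */
	done_list = NULL;
	pthread_mutex_unlock(&completion_lock);

	while (list) {
		struct req *r = list;
		list = r->next;
		close(r->fd);           /* stands in for a call that may sleep */
		free(r);
	}
}

int main(void)
{
	struct req *r = calloc(1, sizeof(*r));
	r->fd = dup(1);
	complete_req(r);
	clean_work();
	return 0;
}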
Cc: stable@vger.kernel.org # 5.9+ Reported-by: Abaci Reported-by: Joseph Qi Cc: Xiaoguang Wang Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d66afd3fe7ee9..fd12d9327ee5b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -971,6 +971,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter, bool force); +static void io_req_drop_files(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -991,8 +992,7 @@ EXPORT_SYMBOL(io_uring_get_socket); static inline void io_clean_op(struct io_kiocb *req) { - if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | - REQ_F_INFLIGHT)) + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) __io_clean_op(req); } @@ -1256,6 +1256,8 @@ static void io_req_clean_work(struct io_kiocb *req) free_fs_struct(fs); req->work.flags &= ~IO_WQ_WORK_FS; } + if (req->flags & REQ_F_INFLIGHT) + io_req_drop_files(req); io_put_identity(req->task->io_uring, req); } @@ -5960,9 +5962,6 @@ static void __io_clean_op(struct io_kiocb *req) } req->flags &= ~REQ_F_NEED_CLEANUP; } - - if (req->flags & REQ_F_INFLIGHT) - io_req_drop_files(req); } static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, @@ -8700,6 +8699,8 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx, break; /* cancel this request, or head link requests */ io_attempt_cancel(ctx, cancel_req); + io_cqring_overflow_flush(ctx, true, task, files); + io_put_req(cancel_req); /* cancellations _may_ trigger task work */ io_run_task_work(); From c6fd968f58439398b765300aecd7758d501ee49c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 14 Jan 2021 16:14:01 -0600 Subject: [PATCH 27/33] objtool: Don't fail on missing symbol table commit 1d489151e9f9d1647110277ff77282fe4d96d09b upstream. Thanks to a recent binutils change which doesn't generate unused symbols, it's now possible for thunk_64.o be completely empty without CONFIG_PREEMPTION: no text, no data, no symbols. We could edit the Makefile to only build that file when CONFIG_PREEMPTION is enabled, but that will likely create confusion if/when the thunks end up getting used by some other code again. Just ignore it and move on. Reported-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Reviewed-by: Miroslav Benes Tested-by: Nathan Chancellor Link: https://github.com/ClangBuiltLinux/linux/issues/1254 Signed-off-by: Josh Poimboeuf Signed-off-by: Greg Kroah-Hartman --- tools/objtool/elf.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 4e1d7460574b4..9452cfb01ef19 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -354,8 +354,11 @@ static int read_symbols(struct elf *elf) symtab = find_section_by_name(elf, ".symtab"); if (!symtab) { - WARN("missing symbol table"); - return -1; + /* + * A missing symbol table is actually possible if it's an empty + * .o file. This can happen for thunk_64.o. + */ + return 0; } symtab_shndx = find_section_by_name(elf, ".symtab_shndx"); From c11f7749f1fc9bad6b1f0e073de08fa996f21cc3 Mon Sep 17 00:00:00 2001 From: Hailong liu Date: Tue, 12 Jan 2021 15:49:08 -0800 Subject: [PATCH 28/33] mm/page_alloc: add a missing mm_page_alloc_zone_locked() tracepoint commit ce8f86ee94fabcc98537ddccd7e82cfd360a4dc5 upstream. 
The trace point *trace_mm_page_alloc_zone_locked()* in __rmqueue() does not currently cover all branches. Add the missing tracepoint and check the page before do that. [akpm@linux-foundation.org: use IS_ENABLED() to suppress warning] Link: https://lkml.kernel.org/r/20201228132901.41523-1-carver4lio@163.com Signed-off-by: Hailong liu Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Ivan Babrou Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 14b9e83ff9da2..88639706ae177 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2846,20 +2846,20 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, { struct page *page; -#ifdef CONFIG_CMA - /* - * Balance movable allocations between regular and CMA areas by - * allocating from CMA when over half of the zone's free memory - * is in the CMA area. - */ - if (alloc_flags & ALLOC_CMA && - zone_page_state(zone, NR_FREE_CMA_PAGES) > - zone_page_state(zone, NR_FREE_PAGES) / 2) { - page = __rmqueue_cma_fallback(zone, order); - if (page) - return page; + if (IS_ENABLED(CONFIG_CMA)) { + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area. + */ + if (alloc_flags & ALLOC_CMA && + zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2) { + page = __rmqueue_cma_fallback(zone, order); + if (page) + goto out; + } } -#endif retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { @@ -2870,8 +2870,9 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, alloc_flags)) goto retry; } - - trace_mm_page_alloc_zone_locked(page, order, migratetype); +out: + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } From f472a59aa182d5aac2927633f390514cf7b614b4 Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Tue, 15 Dec 2020 20:42:23 -0800 Subject: [PATCH 29/33] mm: fix a race on nr_swap_pages commit b50da6e9f42ade19141f6cf8870bb2312b055aa3 upstream. The scenario on which "Free swap = -4kB" happens in my system, which is caused by several get_swap_pages racing with each other and show_swap_cache_info happens simutaniously. No need to add a lock on get_swap_page_of_type as we remove "Presub/PosAdd" here. 
ProcessA ProcessB ProcessC ngoals = 1 ngoals = 1 avail = nr_swap_pages(1) avail = nr_swap_pages(1) nr_swap_pages(1) -= ngoals nr_swap_pages(0) -= ngoals nr_swap_pages = -1 Link: https://lkml.kernel.org/r/1607050340-4535-1-git-send-email-zhaoyang.huang@unisoc.com Signed-off-by: Zhaoyang Huang Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- mm/swapfile.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index d58361109066d..16db9d1ebcbf3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1045,16 +1045,18 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) /* Only single cluster request supported */ WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); + spin_lock(&swap_avail_lock); + avail_pgs = atomic_long_read(&nr_swap_pages) / size; - if (avail_pgs <= 0) + if (avail_pgs <= 0) { + spin_unlock(&swap_avail_lock); goto noswap; + } n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); atomic_long_sub(n_goal * size, &nr_swap_pages); - spin_lock(&swap_avail_lock); - start_over: node = numa_node_id(); plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { @@ -1128,14 +1130,13 @@ swp_entry_t get_swap_page_of_type(int type) spin_lock(&si->lock); if (si->flags & SWP_WRITEOK) { - atomic_long_dec(&nr_swap_pages); /* This is called for allocating swap entry, not cache */ offset = scan_swap_map(si, 1); if (offset) { + atomic_long_dec(&nr_swap_pages); spin_unlock(&si->lock); return swp_entry(type, offset); } - atomic_long_inc(&nr_swap_pages); } spin_unlock(&si->lock); fail: From cb14bbbb7bbfdb9da25d24cf14f52ef54eee1109 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 10 Nov 2020 17:43:05 +0100 Subject: [PATCH 30/33] tools: Factor HOSTCC, HOSTLD, HOSTAR definitions commit c8a950d0d3b926a02c7b2e713850d38217cec3d1 upstream. Several Makefiles in tools/ need to define the host toolchain variables. Move their definition to tools/scripts/Makefile.include Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/bpf/20201110164310.2600671-2-jean-philippe@linaro.org Cc: Alistair Delva Signed-off-by: Greg Kroah-Hartman --- tools/bpf/resolve_btfids/Makefile | 9 --------- tools/build/Makefile | 4 ---- tools/objtool/Makefile | 9 --------- tools/perf/Makefile.perf | 4 ---- tools/power/acpi/Makefile.config | 1 - tools/scripts/Makefile.include | 10 ++++++++++ 6 files changed, 10 insertions(+), 27 deletions(-) diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index 66cb92136de4a..bf656432ad736 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -18,15 +18,6 @@ else endif # always use the host compiler -ifneq ($(LLVM),) -HOSTAR ?= llvm-ar -HOSTCC ?= clang -HOSTLD ?= ld.lld -else -HOSTAR ?= ar -HOSTCC ?= gcc -HOSTLD ?= ld -endif AR = $(HOSTAR) CC = $(HOSTCC) LD = $(HOSTLD) diff --git a/tools/build/Makefile b/tools/build/Makefile index 722f1700d96a8..bae48e6fa9952 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -15,10 +15,6 @@ endef $(call allow-override,CC,$(CROSS_COMPILE)gcc) $(call allow-override,LD,$(CROSS_COMPILE)ld) -HOSTCC ?= gcc -HOSTLD ?= ld -HOSTAR ?= ar - export HOSTCC HOSTLD HOSTAR ifeq ($(V),1) diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 4ea9a833dde7a..5cdb19036d7f7 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -3,15 +3,6 @@ include ../scripts/Makefile.include include ../scripts/Makefile.arch # always use the host compiler -ifneq ($(LLVM),) -HOSTAR ?= llvm-ar -HOSTCC ?= clang -HOSTLD ?= ld.lld -else -HOSTAR ?= ar -HOSTCC ?= gcc -HOSTLD ?= ld -endif AR = $(HOSTAR) CC = $(HOSTCC) LD = $(HOSTLD) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 7ce3f2e8b9c74..62f3deb1d3a8b 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -175,10 +175,6 @@ endef LD += $(EXTRA_LDFLAGS) -HOSTCC ?= gcc -HOSTLD ?= ld -HOSTAR ?= ar - PKG_CONFIG = $(CROSS_COMPILE)pkg-config LLVM_CONFIG ?= llvm-config diff --git a/tools/power/acpi/Makefile.config b/tools/power/acpi/Makefile.config index 54a2857c2510a..331f6d30f4726 100644 --- a/tools/power/acpi/Makefile.config +++ b/tools/power/acpi/Makefile.config @@ -54,7 +54,6 @@ INSTALL_SCRIPT = ${INSTALL_PROGRAM} CROSS = #/usr/i386-linux-uclibc/usr/bin/i386-uclibc- CROSS_COMPILE ?= $(CROSS) LD = $(CC) -HOSTCC = gcc # check if compiler option is supported cc-supports = ${shell if $(CC) ${1} -S -o /dev/null -x c /dev/null > /dev/null 2>&1; then echo "$(1)"; fi;} diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index a7974638561ca..1358e89cdf7d6 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -59,6 +59,16 @@ $(call allow-override,LD,$(CROSS_COMPILE)ld) $(call allow-override,CXX,$(CROSS_COMPILE)g++) $(call allow-override,STRIP,$(CROSS_COMPILE)strip) +ifneq ($(LLVM),) +HOSTAR ?= llvm-ar +HOSTCC ?= clang +HOSTLD ?= ld.lld +else +HOSTAR ?= ar +HOSTCC ?= gcc +HOSTLD ?= ld +endif + ifeq ($(CC_NO_CLANG), 1) EXTRA_WARNINGS += -Wstrict-aliasing=3 endif From 861c2e349a36868f9c19a82844b2eb0abf20939b Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 14 Jan 2021 18:10:12 +0106 Subject: [PATCH 31/33] printk: fix buffer overflow potential for print_text() commit f0e386ee0c0b71ea6f7238506a4d0965a2dbef11 upstream. Before the commit 896fbe20b4e2333fb55 ("printk: use the lockless ringbuffer"), msg_print_text() would only write up to size-1 bytes into the provided buffer. 
Some callers expect this behavior and append a terminator to returned string. In particular: arch/powerpc/xmon/xmon.c:dump_log_buf() arch/um/kernel/kmsg_dump.c:kmsg_dumper_stdout() msg_print_text() has been replaced by record_print_text(), which currently fills the full size of the buffer. This causes a buffer overflow for the above callers. Change record_print_text() so that it will only use size-1 bytes for text data. Also, for paranoia sakes, add a terminator after the text data. And finally, document this behavior so that it is clear that only size-1 bytes are used and a terminator is added. Fixes: 896fbe20b4e2333fb55 ("printk: use the lockless ringbuffer") Cc: stable@vger.kernel.org # 5.10+ Signed-off-by: John Ogness Reviewed-by: Petr Mladek Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210114170412.4819-1-john.ogness@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/printk/printk.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 801f8bc52b34f..807810216492a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1338,11 +1338,16 @@ static size_t info_print_prefix(const struct printk_info *info, bool syslog, * done: * * - Add prefix for each line. + * - Drop truncated lines that no longer fit into the buffer. * - Add the trailing newline that has been removed in vprintk_store(). - * - Drop truncated lines that do not longer fit into the buffer. + * - Add a string terminator. + * + * Since the produced string is always terminated, the maximum possible + * return value is @r->text_buf_size - 1; * * Return: The length of the updated/prepared text, including the added - * prefixes and the newline. The dropped line(s) are not counted. + * prefixes and the newline. The terminator is not counted. The dropped + * line(s) are not counted. */ static size_t record_print_text(struct printk_record *r, bool syslog, bool time) @@ -1385,26 +1390,31 @@ static size_t record_print_text(struct printk_record *r, bool syslog, /* * Truncate the text if there is not enough space to add the - * prefix and a trailing newline. + * prefix and a trailing newline and a terminator. */ - if (len + prefix_len + text_len + 1 > buf_size) { + if (len + prefix_len + text_len + 1 + 1 > buf_size) { /* Drop even the current line if no space. */ - if (len + prefix_len + line_len + 1 > buf_size) + if (len + prefix_len + line_len + 1 + 1 > buf_size) break; - text_len = buf_size - len - prefix_len - 1; + text_len = buf_size - len - prefix_len - 1 - 1; truncated = true; } memmove(text + prefix_len, text, text_len); memcpy(text, prefix, prefix_len); + /* + * Increment the prepared length to include the text and + * prefix that were just moved+copied. Also increment for the + * newline at the end of this line. If this is the last line, + * there is no newline, but it will be added immediately below. + */ len += prefix_len + line_len + 1; - if (text_len == line_len) { /* - * Add the trailing newline removed in - * vprintk_store(). + * This is the last line. Add the trailing newline + * removed in vprintk_store(). */ text[prefix_len + line_len] = '\n'; break; @@ -1429,6 +1439,14 @@ static size_t record_print_text(struct printk_record *r, bool syslog, text_len -= line_len + 1; } + /* + * If a buffer was provided, it will be terminated. Space for the + * string terminator is guaranteed to be available. The terminator is + * not counted in the return value. 
+ */ + if (buf_size > 0) + text[len] = 0; + return len; } From d5ac8304e18025a522b5d1d87629e926064ce134 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Sun, 24 Jan 2021 21:33:28 +0106 Subject: [PATCH 32/33] printk: fix string termination for record_print_text() commit 08d60e5999540110576e7c1346d486220751b7f9 upstream. Commit f0e386ee0c0b ("printk: fix buffer overflow potential for print_text()") added string termination in record_print_text(). However it used the wrong base pointer for adding the terminator. This led to a 0-byte being written somewhere beyond the buffer. Use the correct base pointer when adding the terminator. Fixes: f0e386ee0c0b ("printk: fix buffer overflow potential for print_text()") Reported-by: Sven Schnelle Signed-off-by: John Ogness Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210124202728.4718-1-john.ogness@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 807810216492a..aafec8cb8637d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1445,7 +1445,7 @@ static size_t record_print_text(struct printk_record *r, bool syslog, * not counted in the return value. */ if (buf_size > 0) - text[len] = 0; + r->text_buf[len] = 0; return len; } From 05f6d2aa7e2f2cdd137ee600785704139e6dd3b7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 30 Jan 2021 13:55:20 +0100 Subject: [PATCH 33/33] Linux 5.10.12 Tested-by: Pavel Machek (CIP) Link: https://lore.kernel.org/r/20210129105912.628174874@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7a5d906f6ee36..a6b2e64bcf6c7 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 10 -SUBLEVEL = 11 +SUBLEVEL = 12 EXTRAVERSION = NAME = Kleptomaniac Octopus
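The two printk commits above are easier to follow with a stand-alone sketch of the same pattern: reserve one byte of the caller's buffer for a terminator, and write that terminator relative to the buffer base rather than the advancing cursor, which is the bug the follow-up commit repairs. This is an illustration only, not the kernel function; names are invented.

#include <stdio.h>
#include <string.h>

static size_t fill_lines(char *buf, size_t buf_size,
			 const char *const *lines, size_t n)
{
	char *pos = buf;     /* moving cursor, like 'text' in the patch */
	size_t len = 0;      /* total bytes prepared, like 'len' */

	for (size_t i = 0; i < n; i++) {
		size_t line_len = strlen(lines[i]);

		/* need room for the line, its newline and the terminator */
		if (len + line_len + 1 + 1 > buf_size)
			break;
		memcpy(pos, lines[i], line_len);
		pos[line_len] = '\n';
		pos += line_len + 1;
		len += line_len + 1;
	}

	/*
	 * Terminate relative to the base pointer.  Writing pos[len] here
	 * (text[len] in the original fix) would index from the advanced
	 * cursor and land beyond the buffer -- the 0-byte overwrite that
	 * the second commit corrects.
	 */
	if (buf_size > 0)
		buf[len] = '\0';
	return len;          /* terminator not counted, as in the patch */
}

int main(void)
{
	const char *lines[] = { "first", "second", "third" };
	char buf[16];
	size_t len = fill_lines(buf, sizeof(buf), lines, 3);

	printf("%zu: %s", len, buf);   /* third line dropped, buffer terminated */
	return 0;
}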