From c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 15 Jun 2023 15:17:36 -0700 Subject: [PATCH 01/13] mm: introduce new 'lock_mm_and_find_vma()' page fault helper .. and make x86 use it. This basically extracts the existing x86 "find and expand faulting vma" code, but extends it to also take the mmap lock for writing in case we actually do need to expand the vma. We've historically short-circuited that case, and have some rather ugly special logic to serialize the stack segment expansion (since we only hold the mmap lock for reading) that doesn't match the normal VM locking. That slight violation of locking worked well, right up until it didn't: the maple tree code really does want proper locking even for simple extension of an existing vma. So extract the code for "look up the vma of the fault" from x86, fix it up to do the necessary write locking, and make it available as a helper function for other architectures that can use the common helper. Note: I say "common helper", but it really only handles the normal stack-grows-down case. Which is all architectures except for PA-RISC and IA64. So some rare architectures can't use the helper, but if they care they'll just need to open-code this logic. It's also worth pointing out that this code really would like to have an optimistic "mmap_upgrade_trylock()" to make it quicker to go from a read-lock (for the common case) to taking the write lock (for having to extend the vma) in the normal single-threaded situation where there is no other locking activity. But that _is_ all the very uncommon special case, so while it would be nice to have such an operation, it probably doesn't matter in reality. I did put in the skeleton code for such a possible future expansion, even if it only acts as pseudo-documentation for what we're doing. Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/mm/fault.c | 52 +------------------ include/linux/mm.h | 2 + mm/Kconfig | 4 ++ mm/memory.c | 121 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 130 insertions(+), 50 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 53bab123a8ee40..cb1031018afa5f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -276,6 +276,7 @@ config X86 select HAVE_GENERIC_VDSO select HOTPLUG_SMT if SMP select IRQ_FORCED_THREADING + select LOCK_MM_AND_FIND_VMA select NEED_PER_CPU_EMBED_FIRST_CHUNK select NEED_PER_CPU_PAGE_FIRST_CHUNK select NEED_SG_DMA_LENGTH diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e4399983c50c05..e8711b2cafaf70 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -880,12 +880,6 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); } -static noinline void -bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) -{ - __bad_area(regs, error_code, address, 0, SEGV_MAPERR); -} - static inline bool bad_area_access_from_pkeys(unsigned long error_code, struct vm_area_struct *vma) { @@ -1366,51 +1360,10 @@ void do_user_addr_fault(struct pt_regs *regs, lock_mmap: #endif /* CONFIG_PER_VMA_LOCK */ - /* - * Kernel-mode access to the user address space should only occur - * on well-defined single instructions listed in the exception - * tables. But, an erroneous kernel fault occurring outside one of - * those areas which also holds mmap_lock might deadlock attempting - * to validate the fault against the address space. 
- * - * Only do the expensive exception table search when we might be at - * risk of a deadlock. This happens if we - * 1. Failed to acquire mmap_lock, and - * 2. The access did not originate in userspace. - */ - if (unlikely(!mmap_read_trylock(mm))) { - if (!user_mode(regs) && !search_exception_tables(regs->ip)) { - /* - * Fault from code in kernel from - * which we do not expect faults. - */ - bad_area_nosemaphore(regs, error_code, address); - return; - } retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case we'll have missed the might_sleep() from - * down_read(): - */ - might_sleep(); - } - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { - bad_area(regs, error_code, address); - return; - } - if (likely(vma->vm_start <= address)) - goto good_area; - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, error_code, address); - return; - } - if (unlikely(expand_stack(vma, address))) { - bad_area(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } @@ -1418,7 +1371,6 @@ void do_user_addr_fault(struct pt_regs *regs, * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ -good_area: if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, vma); return; diff --git a/include/linux/mm.h b/include/linux/mm.h index 27ce77080c79c7..570cf906fbcc1e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2325,6 +2325,8 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long address, struct pt_regs *regs); #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, diff --git a/mm/Kconfig b/mm/Kconfig index 7672a22647b4a2..e3454087fd31ae 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1206,6 +1206,10 @@ config PER_VMA_LOCK This feature allows locking each virtual memory area separately when handling page faults instead of taking mmap_lock. +config LOCK_MM_AND_FIND_VMA + bool + depends on !STACK_GROWSUP + source "mm/damon/Kconfig" endmenu diff --git a/mm/memory.c b/mm/memory.c index f69fbc2511984e..1a427097b71f00 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5262,6 +5262,127 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(handle_mm_fault); +#ifdef CONFIG_LOCK_MM_AND_FIND_VMA +#include + +static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) +{ + /* Even if this succeeds, make it clear we *might* have slept */ + if (likely(mmap_read_trylock(mm))) { + might_sleep(); + return true; + } + + if (regs && !user_mode(regs)) { + unsigned long ip = instruction_pointer(regs); + if (!search_exception_tables(ip)) + return false; + } + + mmap_read_lock(mm); + return true; +} + +static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +{ + /* + * We don't have this operation yet. + * + * It should be easy enough to do: it's basically a + * atomic_long_try_cmpxchg_acquire() + * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but + * it also needs the proper lockdep magic etc. 
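+	 *
+	 * Purely as an illustration of that idea (not working code:
+	 * RWSEM_READER_BIAS, RWSEM_WRITER_LOCKED and the ->count
+	 * field are rwsem internals, and the lockdep bookkeeping is
+	 * omitted), such an upgrade would look roughly like
+	 *
+	 *	long c = RWSEM_READER_BIAS;
+	 *	if (!atomic_long_try_cmpxchg_acquire(&mm->mmap_lock.count,
+	 *					     &c, RWSEM_WRITER_LOCKED))
+	 *		return false;
+	 *	return true;
+	 *
+	 * i.e. it only succeeds when we are the sole reader.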
+ */ + return false; +} + +static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) +{ + mmap_read_unlock(mm); + if (regs && !user_mode(regs)) { + unsigned long ip = instruction_pointer(regs); + if (!search_exception_tables(ip)) + return false; + } + mmap_write_lock(mm); + return true; +} + +/* + * Helper for page fault handling. + * + * This is kind of equivalend to "mmap_read_lock()" followed + * by "find_extend_vma()", except it's a lot more careful about + * the locking (and will drop the lock on failure). + * + * For example, if we have a kernel bug that causes a page + * fault, we don't want to just use mmap_read_lock() to get + * the mm lock, because that would deadlock if the bug were + * to happen while we're holding the mm lock for writing. + * + * So this checks the exception tables on kernel faults in + * order to only do this all for instructions that are actually + * expected to fault. + * + * We can also actually take the mm lock for writing if we + * need to extend the vma, which helps the VM layer a lot. + */ +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, struct pt_regs *regs) +{ + struct vm_area_struct *vma; + + if (!get_mmap_lock_carefully(mm, regs)) + return NULL; + + vma = find_vma(mm, addr); + if (likely(vma && (vma->vm_start <= addr))) + return vma; + + /* + * Well, dang. We might still be successful, but only + * if we can extend a vma to do so. + */ + if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { + mmap_read_unlock(mm); + return NULL; + } + + /* + * We can try to upgrade the mmap lock atomically, + * in which case we can continue to use the vma + * we already looked up. + * + * Otherwise we'll have to drop the mmap lock and + * re-take it, and also look up the vma again, + * re-checking it. + */ + if (!mmap_upgrade_trylock(mm)) { + if (!upgrade_mmap_lock_carefully(mm, regs)) + return NULL; + + vma = find_vma(mm, addr); + if (!vma) + goto fail; + if (vma->vm_start <= addr) + goto success; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto fail; + } + + if (expand_stack(vma, addr)) + goto fail; + +success: + mmap_write_downgrade(mm); + return vma; + +fail: + mmap_write_unlock(mm); + return NULL; +} +#endif + #ifdef CONFIG_PER_VMA_LOCK /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be From eda0047296a16d65a7f2bc60a408f70d178b2014 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 15 Jun 2023 16:17:48 -0700 Subject: [PATCH 02/13] mm: make the page fault mmap locking killable This is done as a separate patch from introducing the new lock_mm_and_find_vma() helper, because while it's an obvious change, it's not what x86 used to do in this area. We already abort the page fault on fatal signals anyway, so why should we wait for the mmap lock only to then abort later? With the new helper function that returns without the lock held on failure anyway, this is particularly easy and straightforward. 
Signed-off-by: Linus Torvalds --- mm/memory.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 1a427097b71f00..1dff248805bf7a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5279,8 +5279,7 @@ static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs return false; } - mmap_read_lock(mm); - return true; + return !mmap_read_lock_killable(mm); } static inline bool mmap_upgrade_trylock(struct mm_struct *mm) @@ -5304,8 +5303,7 @@ static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_r if (!search_exception_tables(ip)) return false; } - mmap_write_lock(mm); - return true; + return !mmap_write_lock_killable(mm); } /* From ae870a68b5d13d67cf4f18d47bb01ee3fee40acb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 15 Jun 2023 17:11:44 -0700 Subject: [PATCH 03/13] arm64/mm: Convert to using lock_mm_and_find_vma() This converts arm64 to use the new page fault helper. It was very straightforward, but still needed a fix for the "obvious" conversion I initially did. Thanks to Suren for the fix and testing. Fixed-and-tested-by: Suren Baghdasaryan Unnecessary-code-removal-by: Liam R. Howlett Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 1 + arch/arm64/mm/fault.c | 47 ++++++++----------------------------------- 2 files changed, 9 insertions(+), 39 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 343e1e1cae10a3..92f3fff2522b03 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -225,6 +225,7 @@ config ARM64 select IRQ_DOMAIN select IRQ_FORCED_THREADING select KASAN_VMALLOC if KASAN + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 6045a5117ac15b..8a169bdb4d534f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -483,27 +483,14 @@ static void do_bad_area(unsigned long far, unsigned long esr, #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000) -static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, +static vm_fault_t __do_page_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, unsigned int mm_flags, unsigned long vm_flags, struct pt_regs *regs) { - struct vm_area_struct *vma = find_vma(mm, addr); - - if (unlikely(!vma)) - return VM_FAULT_BADMAP; - /* * Ok, we have a good vm_area for this memory access, so we can handle * it. - */ - if (unlikely(vma->vm_start > addr)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - return VM_FAULT_BADMAP; - if (expand_stack(vma, addr)) - return VM_FAULT_BADMAP; - } - - /* * Check that the permissions on the VMA allow for the fault which * occurred. */ @@ -617,31 +604,15 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, } lock_mmap: #endif /* CONFIG_PER_VMA_LOCK */ - /* - * As per x86, we may deadlock here. However, since the kernel only - * validly references user space from well defined areas of the code, - * we can bug out early if this is from code which shouldn't. - */ - if (!mmap_read_trylock(mm)) { - if (!user_mode(regs) && !search_exception_tables(regs->pc)) - goto no_context; + retry: - mmap_read_lock(mm); - } else { - /* - * The above mmap_read_trylock() might have succeeded in which - * case, we'll have missed the might_sleep() from down_read(). 
- */ - might_sleep(); -#ifdef CONFIG_DEBUG_VM - if (!user_mode(regs) && !search_exception_tables(regs->pc)) { - mmap_read_unlock(mm); - goto no_context; - } -#endif + vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { + fault = VM_FAULT_BADMAP; + goto done; } - fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs); + fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { @@ -660,9 +631,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, } mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif /* * Handle the "normal" (no error) case first. */ From e6fe228c4ffafdfc970cf6d46883a1f481baf7ea Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 16 Jun 2023 15:51:29 +1000 Subject: [PATCH 04/13] powerpc/mm: Convert to using lock_mm_and_find_vma() Signed-off-by: Michael Ellerman Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 1 + arch/powerpc/mm/fault.c | 39 +++------------------------------------ 2 files changed, 4 insertions(+), 36 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bff5820b7cda14..a243fcdf346dea 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -278,6 +278,7 @@ config PPC select IRQ_DOMAIN select IRQ_FORCED_THREADING select KASAN_VMALLOC if KASAN && MODULES + select LOCK_MM_AND_FIND_VMA select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_MERGE_VMAS diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 531177a4ee0881..5bfdf6ecfa9650 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code) return __bad_area_nosemaphore(regs, address, si_code); } -static noinline int bad_area(struct pt_regs *regs, unsigned long address) -{ - return __bad_area(regs, address, SEGV_MAPERR); -} - static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, struct vm_area_struct *vma) { @@ -515,40 +510,12 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, * we will deadlock attempting to validate the fault against the * address space. Luckily the kernel only validly references user * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. + * exceptions table. lock_mm_and_find_vma() handles that logic. 
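+	 * (On failure that helper returns NULL with the mmap lock not
+	 * held, which is why the error path below goes through
+	 * bad_area_nosemaphore() rather than bad_area().)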
*/ - if (unlikely(!mmap_read_trylock(mm))) { - if (!is_user && !search_exception_tables(regs->nip)) - return bad_area_nosemaphore(regs, address); - retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case we'll have missed the might_sleep() from - * down_read(): - */ - might_sleep(); - } - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) - return bad_area(regs, address); - - if (unlikely(vma->vm_start > address)) { - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) - return bad_area(regs, address); - - if (unlikely(expand_stack(vma, address))) - return bad_area(regs, address); - } + return bad_area_nosemaphore(regs, address); if (unlikely(access_pkey_error(is_write, is_exec, (error_code & DSISR_KEYFAULT), vma))) From 4bce37a68ff884e821a02a731897a8119e0c37b7 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 22 Jun 2023 18:47:40 +0200 Subject: [PATCH 05/13] mips/mm: Convert to using lock_mm_and_find_vma() Signed-off-by: Ben Hutchings Signed-off-by: Linus Torvalds --- arch/mips/Kconfig | 1 + arch/mips/mm/fault.c | 12 ++---------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 675a8660cb85a0..6796d839bcfdf9 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -91,6 +91,7 @@ config MIPS select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP select IRQ_FORCED_THREADING select ISA if EISA + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_REL if MODULES select MODULES_USE_ELF_RELA if MODULES && 64BIT select PERF_USE_VMALLOC diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index a27045f5a556df..d7878208bd3fa9 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -99,21 +99,13 @@ static void __do_page_fault(struct pt_regs *regs, unsigned long write, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; + goto bad_area_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. 
*/ -good_area: si_code = SEGV_ACCERR; if (write) { From 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 22 Jun 2023 20:18:18 +0200 Subject: [PATCH 06/13] riscv/mm: Convert to using lock_mm_and_find_vma() Signed-off-by: Ben Hutchings Signed-off-by: Linus Torvalds --- arch/riscv/Kconfig | 1 + arch/riscv/mm/fault.c | 31 +++++++++++++------------------ 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 5966ad97c30c3d..a11b1c038c6d1b 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -126,6 +126,7 @@ config RISCV select IRQ_DOMAIN select IRQ_FORCED_THREADING select KASAN_VMALLOC if KASAN + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA if MODULES select MODULE_SECTIONS if MODULES select OF diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 8685f85a7474ef..35a84ec69a9fd4 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -84,13 +84,13 @@ static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_f BUG(); } -static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) +static inline void +bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr) { /* * Something tried to access memory that isn't in our memory map. * Fix it, but check if it's kernel or user first. */ - mmap_read_unlock(mm); /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { do_trap(regs, SIGSEGV, code, addr); @@ -100,6 +100,15 @@ static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code no_context(regs, addr); } +static inline void +bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, + unsigned long addr) +{ + mmap_read_unlock(mm); + + bad_area_nosemaphore(regs, code, addr); +} + static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) { pgd_t *pgd, *pgd_k; @@ -287,23 +296,10 @@ void handle_page_fault(struct pt_regs *regs) else if (cause == EXC_INST_PAGE_FAULT) flags |= FAULT_FLAG_INSTRUCTION; retry: - mmap_read_lock(mm); - vma = find_vma(mm, addr); + vma = lock_mm_and_find_vma(mm, addr, regs); if (unlikely(!vma)) { tsk->thread.bad_cause = cause; - bad_area(regs, mm, code, addr); - return; - } - if (likely(vma->vm_start <= addr)) - goto good_area; - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - tsk->thread.bad_cause = cause; - bad_area(regs, mm, code, addr); - return; - } - if (unlikely(expand_stack(vma, addr))) { - tsk->thread.bad_cause = cause; - bad_area(regs, mm, code, addr); + bad_area_nosemaphore(regs, code, addr); return; } @@ -311,7 +307,6 @@ void handle_page_fault(struct pt_regs *regs) * Ok, we have a good vm_area for this memory access, so * we can handle it. */ -good_area: code = SEGV_ACCERR; if (unlikely(access_error(cause, vma))) { From 8b35ca3e45e35a26a21427f35d4093606e93ad0a Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 22 Jun 2023 21:24:30 +0200 Subject: [PATCH 07/13] arm/mm: Convert to using lock_mm_and_find_vma() arm has an additional check for address < FIRST_USER_ADDRESS before expanding the stack. Since FIRST_USER_ADDRESS is defined everywhere (generally as 0), move that check to the generic expand_downwards(). 
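The resulting generic check (see the mm/mmap.c hunk below) is free for
everyone else: with FIRST_USER_ADDRESS defined as 0, the second comparison
on an unsigned address is always false and the compiler drops it entirely:

	address &= PAGE_MASK;
	if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
		return -EPERM;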
Signed-off-by: Ben Hutchings Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + arch/arm/mm/fault.c | 63 ++++++++++----------------------------------- mm/mmap.c | 2 +- 3 files changed, 16 insertions(+), 50 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0fb4b218f6658b..9ed7f03ba15a3a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -125,6 +125,7 @@ config ARM select HAVE_UID16 select HAVE_VIRT_CPU_ACCOUNTING_GEN select IRQ_FORCED_THREADING + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_REL select NEED_DMA_MAP_STATE select OF_EARLY_FLATTREE if OF diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 2418f1efabd871..0860eeba8bd343 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -232,37 +232,11 @@ static inline bool is_permission_fault(unsigned int fsr) return false; } -static vm_fault_t __kprobes -__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags, - unsigned long vma_flags, struct pt_regs *regs) -{ - struct vm_area_struct *vma = find_vma(mm, addr); - if (unlikely(!vma)) - return VM_FAULT_BADMAP; - - if (unlikely(vma->vm_start > addr)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - return VM_FAULT_BADMAP; - if (addr < FIRST_USER_ADDRESS) - return VM_FAULT_BADMAP; - if (expand_stack(vma, addr)) - return VM_FAULT_BADMAP; - } - - /* - * ok, we have a good vm_area for this memory access, check the - * permissions on the VMA allow for the fault which occurred. - */ - if (!(vma->vm_flags & vma_flags)) - return VM_FAULT_BADACCESS; - - return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); -} - static int __kprobes do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; int sig, code; vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; @@ -301,31 +275,21 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); - /* - * As per x86, we may deadlock here. However, since the kernel only - * validly references user space from well defined areas of the code, - * we can bug out early if this is from code which shouldn't. - */ - if (!mmap_read_trylock(mm)) { - if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc)) - goto no_context; retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case, we'll have missed the might_sleep() from - * down_read() - */ - might_sleep(); -#ifdef CONFIG_DEBUG_VM - if (!user_mode(regs) && - !search_exception_tables(regs->ARM_pc)) - goto no_context; -#endif + vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { + fault = VM_FAULT_BADMAP; + goto bad_area; } - fault = __do_page_fault(mm, addr, flags, vm_flags, regs); + /* + * ok, we have a good vm_area for this memory access, check the + * permissions on the VMA allow for the fault which occurred. + */ + if (!(vma->vm_flags & vm_flags)) + fault = VM_FAULT_BADACCESS; + else + fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); /* If we need to retry but a fatal signal is pending, handle the * signal first. We do not need to release the mmap_lock because @@ -356,6 +320,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) return 0; +bad_area: /* * If we are in kernel mode at this point, we * have no context to handle this fault with. 
diff --git a/mm/mmap.c b/mm/mmap.c index d600404580b282..6d120bf1d0bc7c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2036,7 +2036,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) int error = 0; address &= PAGE_MASK; - if (address < mmap_min_addr) + if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; /* Enforce stack_guard_gap */ From a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 24 Jun 2023 10:55:38 -0700 Subject: [PATCH 08/13] mm/fault: convert remaining simple cases to lock_mm_and_find_vma() This does the simple pattern conversion of alpha, arc, csky, hexagon, loongarch, nios2, sh, sparc32, and xtensa to the lock_mm_and_find_vma() helper. They all have the regular fault handling pattern without odd special cases. The remaining architectures all have something that keeps us from a straightforward conversion: ia64 and parisc have stacks that can grow both up as well as down (and ia64 has special address region checks). And m68k, microblaze, openrisc, sparc64, and um end up having extra rules about only expanding the stack down a limited amount below the user space stack pointer. That is something that x86 used to do too (long long ago), and it probably could just be skipped, but it still makes the conversion less than trivial. Note that this conversion was done manually and with the exception of alpha without any build testing, because I have a fairly limited cross- building environment. The cases are all simple, and I went through the changes several times, but... Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 1 + arch/alpha/mm/fault.c | 13 +++---------- arch/arc/Kconfig | 1 + arch/arc/mm/fault.c | 11 +++-------- arch/csky/Kconfig | 1 + arch/csky/mm/fault.c | 22 +++++----------------- arch/hexagon/Kconfig | 1 + arch/hexagon/mm/vm_fault.c | 18 ++++-------------- arch/loongarch/Kconfig | 1 + arch/loongarch/mm/fault.c | 16 ++++++---------- arch/nios2/Kconfig | 1 + arch/nios2/mm/fault.c | 17 ++--------------- arch/sh/Kconfig | 1 + arch/sh/mm/fault.c | 17 ++--------------- arch/sparc/Kconfig | 1 + arch/sparc/mm/fault_32.c | 32 ++++++++------------------------ arch/xtensa/Kconfig | 1 + arch/xtensa/mm/fault.c | 14 +++----------- 18 files changed, 45 insertions(+), 124 deletions(-) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index a5c2b1aa46b02e..d6968d090d49a7 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -30,6 +30,7 @@ config ALPHA select HAS_IOPORT select HAVE_ARCH_AUDITSYSCALL select HAVE_MOD_ARCH_SPECIFIC + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select ODD_RT_SIGACTION select OLD_SIGSUSPEND diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 7b01ae4f3bc6c7..8c9850437e6744 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -119,20 +119,12 @@ do_page_fault(unsigned long address, unsigned long mmcsr, flags |= FAULT_FLAG_USER; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; + goto bad_area_nosemaphore; /* Ok, we have a good vm_area for this memory access, so we can handle it. 
*/ - good_area: si_code = SEGV_ACCERR; if (cause < 0) { if (!(vma->vm_flags & VM_EXEC)) @@ -192,6 +184,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, bad_area: mmap_read_unlock(mm); + bad_area_nosemaphore: if (user_mode(regs)) goto do_sigsegv; diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index ab6d701365bb00..96cf8720bb9391 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -41,6 +41,7 @@ config ARC select HAVE_PERF_EVENTS select HAVE_SYSCALL_TRACEPOINTS select IRQ_DOMAIN + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select OF select OF_EARLY_FLATTREE diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 5ca59a482632a8..f59e722d147f91 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -113,15 +113,9 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (unlikely(address < vma->vm_start)) { - if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address)) - goto bad_area; - } + goto bad_area_nosemaphore; /* * vm_area is good, now check permissions for this memory access @@ -161,6 +155,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: /* * Major/minor page fault accounting * (in case of retry we only land here once) diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 4df1f8c9d170b3..03e9f66661570a 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -96,6 +96,7 @@ config CSKY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS + select LOCK_MM_AND_FIND_VMA select MAY_HAVE_SPARSE_IRQ select MODULES_USE_ELF_RELA if MODULES select OF diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index e15f736cca4b4a..ae9781b7d92ea5 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -97,13 +97,12 @@ static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_f BUG(); } -static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) +static inline void bad_area_nosemaphore(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) { /* * Something tried to access memory that isn't in our memory map. * Fix it, but check if it's kernel or user first. */ - mmap_read_unlock(mm); /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { do_trap(regs, SIGSEGV, code, addr); @@ -238,20 +237,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs) if (is_write(regs)) flags |= FAULT_FLAG_WRITE; retry: - mmap_read_lock(mm); - vma = find_vma(mm, addr); + vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { - bad_area(regs, mm, code, addr); - return; - } - if (likely(vma->vm_start <= addr)) - goto good_area; - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, mm, code, addr); - return; - } - if (unlikely(expand_stack(vma, addr))) { - bad_area(regs, mm, code, addr); + bad_area_nosemaphore(regs, mm, code, addr); return; } @@ -259,11 +247,11 @@ asmlinkage void do_page_fault(struct pt_regs *regs) * Ok, we have a good vm_area for this memory access, so * we can handle it. 
*/ -good_area: code = SEGV_ACCERR; if (unlikely(access_error(regs, vma))) { - bad_area(regs, mm, code, addr); + mmap_read_unlock(mm); + bad_area_nosemaphore(regs, mm, code, addr); return; } diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 54eadf26517868..6726f4941015f3 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -28,6 +28,7 @@ config HEXAGON select GENERIC_SMP_IDLE_THREAD select STACKTRACE_SUPPORT select GENERIC_CLOCKEVENTS_BROADCAST + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select GENERIC_CPU_DEVICES select ARCH_WANT_LD_ORPHAN_WARN diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 4b578d02fd01a9..7295ea3f8cc8d3 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -57,21 +57,10 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - vma = find_vma(mm, address); - if (!vma) - goto bad_area; + vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) + goto bad_area_nosemaphore; - if (vma->vm_start <= address) - goto good_area; - - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - - if (expand_stack(vma, address)) - goto bad_area; - -good_area: /* Address space is OK. Now check access rights. */ si_code = SEGV_ACCERR; @@ -143,6 +132,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: if (user_mode(regs)) { force_sig_fault(SIGSEGV, si_code, (void __user *)address); return; diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index d38b066fc931bc..73519e13bbb394 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -130,6 +130,7 @@ config LOONGARCH select HAVE_VIRT_CPU_ACCOUNTING_GEN if !SMP select IRQ_FORCED_THREADING select IRQ_LOONGARCH_CPU + select LOCK_MM_AND_FIND_VMA select MMU_GATHER_MERGE_VMAS if MMU select MODULES_USE_ELF_RELA if MODULES select NEED_PER_CPU_EMBED_FIRST_CHUNK diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c index 449087bd589d33..da5b6d518cdb1d 100644 --- a/arch/loongarch/mm/fault.c +++ b/arch/loongarch/mm/fault.c @@ -169,22 +169,18 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (!expand_stack(vma, address)) - goto good_area; + vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) + goto bad_area_nosemaphore; + goto good_area; + /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. 
*/ bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: do_sigsegv(regs, write, address, si_code); return; diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index e5936417d3cd3c..d54464021a618e 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -16,6 +16,7 @@ config NIOS2 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_KGDB select IRQ_DOMAIN + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select OF select OF_EARLY_FLATTREE diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index ca64eccea5511d..e3fa9c15181df2 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -86,27 +86,14 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (!mmap_read_trylock(mm)) { - if (!user_mode(regs) && !search_exception_tables(regs->ea)) - goto bad_area_nosemaphore; retry: - mmap_read_lock(mm); - } - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; + goto bad_area_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ -good_area: code = SEGV_ACCERR; switch (cause) { diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 9652d367fc3777..393023d092450f 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -59,6 +59,7 @@ config SUPERH select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select IRQ_FORCED_THREADING + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select NEED_SG_DMA_LENGTH select NO_DMA if !MMU && !DMA_COHERENT diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index acd2f5e50bfcd0..06e6b49529245a 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -439,21 +439,9 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, } retry: - mmap_read_lock(mm); - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { - bad_area(regs, error_code, address); - return; - } - if (likely(vma->vm_start <= address)) - goto good_area; - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, error_code, address); - return; - } - if (unlikely(expand_stack(vma, address))) { - bad_area(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } @@ -461,7 +449,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, * Ok, we have a good vm_area for this memory access, so * we can handle it.. 
*/ -good_area: if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address); return; diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 8535e19062f65f..8c196990558b24 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -57,6 +57,7 @@ config SPARC32 select DMA_DIRECT_REMAP select GENERIC_ATOMIC64 select HAVE_UID16 + select LOCK_MM_AND_FIND_VMA select OLD_SIGACTION select ZONE_DMA diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 179295b14664a5..a3ccc0267bc205 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -143,28 +143,19 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, if (pagefault_disabled() || !mm) goto no_context; + if (!from_user && address >= PAGE_OFFSET) + goto no_context; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - - if (!from_user && address >= PAGE_OFFSET) - goto bad_area; - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; + goto bad_area_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ -good_area: code = SEGV_ACCERR; if (write) { if (!(vma->vm_flags & VM_WRITE)) @@ -321,17 +312,9 @@ static void force_user_fault(unsigned long address, int write) code = SEGV_MAPERR; - mmap_read_lock(mm); - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; -good_area: + goto bad_area_nosemaphore; code = SEGV_ACCERR; if (write) { if (!(vma->vm_flags & VM_WRITE)) @@ -350,6 +333,7 @@ static void force_user_fault(unsigned long address, int write) return; bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address); return; diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 3c6e5471f025b9..2d0d6440b9796e 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -49,6 +49,7 @@ config XTENSA select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING_GEN select IRQ_DOMAIN + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select PERF_USE_VMALLOC select TRACE_IRQFLAGS_SUPPORT diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index faf7cf35a0ee3d..d1eb8d6c5b8267 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -130,23 +130,14 @@ void do_page_fault(struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - vma = find_vma(mm, address); - + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; + goto bad_area_nosemaphore; /* Ok, we have a good vm_area for this memory access, so * we can handle it.. 
*/ -good_area: code = SEGV_ACCERR; if (is_write) { @@ -205,6 +196,7 @@ void do_page_fault(struct pt_regs *regs) */ bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: if (user_mode(regs)) { force_sig_fault(SIGSEGV, code, (void *) address); return; From 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 24 Jun 2023 11:17:05 -0700 Subject: [PATCH 09/13] powerpc/mm: convert coprocessor fault to lock_mm_and_find_vma() This is one of the simple cases, except there's no pt_regs pointer. Which is fine, as lock_mm_and_find_vma() is set up to work fine with a NULL pt_regs. Powerpc already enabled LOCK_MM_AND_FIND_VMA for the main CPU faulting, so we can just use the helper without any extra work. Signed-off-by: Linus Torvalds --- arch/powerpc/mm/copro_fault.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 7c507fb48182bd..f49fd873df8da2 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, if (mm->pgd == NULL) return -EFAULT; - mmap_read_lock(mm); - ret = -EFAULT; - vma = find_vma(mm, ea); + vma = lock_mm_and_find_vma(mm, ea, NULL); if (!vma) - goto out_unlock; - - if (ea < vma->vm_start) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_unlock; - if (expand_stack(vma, ea)) - goto out_unlock; - } + return -EFAULT; + ret = -EFAULT; is_write = dsisr & DSISR_ISSTORE; if (is_write) { if (!(vma->vm_flags & VM_WRITE)) From f440fa1ac955e2898893f9301568435eb5cdfc4b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 16 Jun 2023 15:58:54 -0700 Subject: [PATCH 10/13] mm: make find_extend_vma() fail if write lock not held Make calls to extend_vma() and find_extend_vma() fail if the write lock is required. To avoid making this a flag-day event, this still allows the old read-locking case for the trivial situations, and passes in a flag to say "is it write-locked". That way write-lockers can say "yes, I'm being careful", and legacy users will continue to work in all the common cases until they have been fully converted to the new world order. Co-Developed-by: Matthew Wilcox (Oracle) Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 6 +++--- fs/exec.c | 5 +++-- include/linux/mm.h | 10 +++++++--- mm/memory.c | 2 +- mm/mmap.c | 50 ++++++++++++++++++++++++++++++---------------- mm/nommu.c | 3 ++- 6 files changed, 49 insertions(+), 27 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 1033fbdfdbec73..869c3aa0e45587 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -320,10 +320,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. 
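 	 * (Growing the stack may now actually expand the vma, which is
 	 * why the lock taken here becomes the mmap write lock and the
 	 * lookup goes through find_extend_vma_locked() below.)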
*/ - if (mmap_read_lock_killable(mm)) + if (mmap_write_lock_killable(mm)) return -EINTR; - vma = find_extend_vma(mm, bprm->p); - mmap_read_unlock(mm); + vma = find_extend_vma_locked(mm, bprm->p, true); + mmap_write_unlock(mm); if (!vma) return -EFAULT; diff --git a/fs/exec.c b/fs/exec.c index a466e797c8e2e6..a61eb256e5e4ce 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -205,7 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, #ifdef CONFIG_STACK_GROWSUP if (write) { - ret = expand_downwards(bprm->vma, pos); + /* We claim to hold the lock - nobody to race with */ + ret = expand_downwards(bprm->vma, pos, true); if (ret < 0) return NULL; } @@ -853,7 +854,7 @@ int setup_arg_pages(struct linux_binprm *bprm, stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; - ret = expand_stack(vma, stack_base); + ret = expand_stack_locked(vma, stack_base, true); if (ret) ret = -EFAULT; diff --git a/include/linux/mm.h b/include/linux/mm.h index 570cf906fbcc1e..01a016521b60b6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3192,11 +3192,13 @@ extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ -extern int expand_stack(struct vm_area_struct *vma, unsigned long address); +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, + bool write_locked); +#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false) /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ -extern int expand_downwards(struct vm_area_struct *vma, - unsigned long address); +int expand_downwards(struct vm_area_struct *vma, unsigned long address, + bool write_locked); #if VM_GROWSUP extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); #else @@ -3297,6 +3299,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, #endif struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, + unsigned long addr, bool write_locked); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 1dff248805bf7a..a81f5d0997ad92 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5368,7 +5368,7 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, goto fail; } - if (expand_stack(vma, addr)) + if (expand_stack_locked(vma, addr, true)) goto fail; success: diff --git a/mm/mmap.c b/mm/mmap.c index 6d120bf1d0bc7c..2c44ac108a3cfd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1935,7 +1935,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, * PA-RISC uses this for its stack; IA64 for its Register Backing Store. * vma is the last one with address > vma->vm_end. Have to extend vma. 
*/ -int expand_upwards(struct vm_area_struct *vma, unsigned long address) +int expand_upwards(struct vm_area_struct *vma, unsigned long address, + bool write_locked) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next; @@ -1959,6 +1960,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; + if (!write_locked) + return -EAGAIN; next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) @@ -2028,7 +2031,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address) +int expand_downwards(struct vm_area_struct *vma, unsigned long address, + bool write_locked) { struct mm_struct *mm = vma->vm_mm; MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); @@ -2042,10 +2046,13 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) /* Enforce stack_guard_gap */ prev = mas_prev(&mas, 0); /* Check that both stack segments have the same anon_vma? */ - if (prev && !(prev->vm_flags & VM_GROWSDOWN) && - vma_is_accessible(prev)) { - if (address - prev->vm_end < stack_guard_gap) + if (prev) { + if (!(prev->vm_flags & VM_GROWSDOWN) && + vma_is_accessible(prev) && + (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; + if (!write_locked && (prev->vm_end == address)) + return -EAGAIN; } if (mas_preallocate(&mas, GFP_KERNEL)) @@ -2124,13 +2131,14 @@ static int __init cmdline_parse_stack_guard_gap(char *p) __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); #ifdef CONFIG_STACK_GROWSUP -int expand_stack(struct vm_area_struct *vma, unsigned long address) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, + bool write_locked) { - return expand_upwards(vma, address); + return expand_upwards(vma, address, write_locked); } -struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, + unsigned long addr, bool write_locked) { struct vm_area_struct *vma, *prev; @@ -2138,20 +2146,25 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + if (!prev) + return NULL; + if (expand_stack_locked(prev, addr, write_locked)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else -int expand_stack(struct vm_area_struct *vma, unsigned long address) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, + bool write_locked) { - return expand_downwards(vma, address); + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) + return -EINVAL; + return expand_downwards(vma, address, write_locked); } -struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, + unsigned long addr, bool write_locked) { struct vm_area_struct *vma; unsigned long start; @@ -2162,10 +2175,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return NULL; if (vma->vm_start <= addr) return vma; - if (!(vma->vm_flags & VM_GROWSDOWN)) - return NULL; start = vma->vm_start; - if (expand_stack(vma, addr)) + if (expand_stack_locked(vma, addr, write_locked)) return NULL; if 
(vma->vm_flags & VM_LOCKED) populate_vma_page_range(vma, addr, start, NULL); @@ -2173,6 +2184,11 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) } #endif +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, + unsigned long addr) +{ + return find_extend_vma_locked(mm, addr, false); +} EXPORT_SYMBOL_GPL(find_extend_vma); /* diff --git a/mm/nommu.c b/mm/nommu.c index f670d9979a2610..f476c9ed36b330 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -643,7 +643,8 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) * expand a stack to a given address * - not supported under NOMMU conditions */ -int expand_stack(struct vm_area_struct *vma, unsigned long address) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, + bool write_locked) { return -ENOMEM; } From f313c51d26aa87e69633c9b46efb37a930faca71 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 19 Jun 2023 11:34:15 -0700 Subject: [PATCH 11/13] execve: expand new process stack manually ahead of time This is a small step towards a model where GUP itself would not expand the stack, and any user that needs GUP to not look up existing mappings, but actually expand on them, would have to do so manually before-hand, and with the mm lock held for writing. It turns out that execve() already did almost exactly that, except it didn't take the mm lock at all (it's single-threaded so no locking technically needed, but it could cause lockdep errors). And it only did it for the CONFIG_STACK_GROWSUP case, since in that case GUP has obviously never expanded the stack downwards. So just make that CONFIG_STACK_GROWSUP case do the right thing with locking, and enable it generally. This will eventually help GUP, and in the meantime avoids a special case and the lockdep issue. Signed-off-by: Linus Torvalds --- fs/exec.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index a61eb256e5e4ce..66e3e22ffb8a61 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -200,34 +200,39 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; + struct vm_area_struct *vma = bprm->vma; + struct mm_struct *mm = bprm->mm; int ret; - unsigned int gup_flags = 0; -#ifdef CONFIG_STACK_GROWSUP - if (write) { - /* We claim to hold the lock - nobody to race with */ - ret = expand_downwards(bprm->vma, pos, true); - if (ret < 0) + /* + * Avoid relying on expanding the stack down in GUP (which + * does not work for STACK_GROWSUP anyway), and just do it + * by hand ahead of time. + */ + if (write && pos < vma->vm_start) { + mmap_write_lock(mm); + ret = expand_downwards(vma, pos, true); + if (unlikely(ret < 0)) { + mmap_write_unlock(mm); return NULL; - } -#endif - - if (write) - gup_flags |= FOLL_WRITE; + } + mmap_write_downgrade(mm); + } else + mmap_read_lock(mm); /* * We are doing an exec(). 'current' is the process - * doing the exec and bprm->mm is the new process's mm. + * doing the exec and 'mm' is the new process's mm. */ - mmap_read_lock(bprm->mm); - ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, + ret = get_user_pages_remote(mm, pos, 1, + write ? 
FOLL_WRITE : 0, &page, NULL, NULL); - mmap_read_unlock(bprm->mm); + mmap_read_unlock(mm); if (ret <= 0) return NULL; if (write) - acct_arg_size(bprm, vma_pages(bprm->vma)); + acct_arg_size(bprm, vma_pages(vma)); return page; } From 8d7071af890768438c14db6172cc8f9f4d04e184 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 24 Jun 2023 13:45:51 -0700 Subject: [PATCH 12/13] mm: always expand the stack with the mmap write lock held This finishes the job of always holding the mmap write lock when extending the user stack vma, and removes the 'write_locked' argument from the vm helper functions again. For some cases, we just avoid expanding the stack at all: drivers and page pinning really shouldn't be extending any stacks. Let's see if any strange users really wanted that. It's worth noting that architectures that weren't converted to the new lock_mm_and_find_vma() helper function are left using the legacy "expand_stack()" function, but it has been changed to drop the mmap_lock and take it for writing while expanding the vma. This makes it fairly straightforward to convert the remaining architectures. As a result of dropping and re-taking the lock, the calling conventions for this function have also changed, since the old vma may no longer be valid. So it will now return the new vma if successful, and NULL - and the lock dropped - if the area could not be extended. Tested-by: Vegard Nossum Tested-by: John Paul Adrian Glaubitz # ia64 Tested-by: Frank Scheiner # ia64 Signed-off-by: Linus Torvalds --- arch/ia64/mm/fault.c | 36 ++--------- arch/m68k/mm/fault.c | 9 ++- arch/microblaze/mm/fault.c | 5 +- arch/openrisc/mm/fault.c | 5 +- arch/parisc/mm/fault.c | 23 ++++--- arch/s390/mm/fault.c | 5 +- arch/sparc/mm/fault_64.c | 8 ++- arch/um/kernel/trap.c | 11 ++-- drivers/iommu/amd/iommu_v2.c | 4 +- drivers/iommu/iommu-sva.c | 2 +- fs/binfmt_elf.c | 2 +- fs/exec.c | 4 +- include/linux/mm.h | 16 ++--- mm/gup.c | 6 +- mm/memory.c | 10 ++- mm/mmap.c | 121 ++++++++++++++++++++++++++++------- mm/nommu.c | 18 ++---- 17 files changed, 169 insertions(+), 116 deletions(-) diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 85c4d9ac8686d8..5458b52b400996 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -110,10 +110,12 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re * register backing store that needs to expand upwards, in * this case vma will be null, but prev_vma will ne non-null */ - if (( !vma && prev_vma ) || (address < vma->vm_start) ) - goto check_expansion; + if (( !vma && prev_vma ) || (address < vma->vm_start) ) { + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; + } - good_area: code = SEGV_ACCERR; /* OK, we've got a good vm_area for this memory area. 
Check the access permissions: */ @@ -177,35 +179,9 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re mmap_read_unlock(mm); return; - check_expansion: - if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) { - if (!vma) - goto bad_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) - || REGION_OFFSET(address) >= RGN_MAP_LIMIT) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; - } else { - vma = prev_vma; - if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) - || REGION_OFFSET(address) >= RGN_MAP_LIMIT) - goto bad_area; - /* - * Since the register backing store is accessed sequentially, - * we disallow growing it by more than a page at a time. - */ - if (address > vma->vm_end + PAGE_SIZE - sizeof(long)) - goto bad_area; - if (expand_upwards(vma, address)) - goto bad_area; - } - goto good_area; - bad_area: mmap_read_unlock(mm); + bad_area_nosemaphore: if ((isr & IA64_ISR_SP) || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) { diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 228128e45c673b..c290c5c0cfb93a 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -105,8 +105,9 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, if (address + 256 < rdusp()) goto map_err; } - if (expand_stack(vma, address)) - goto map_err; + vma = expand_stack(mm, address); + if (!vma) + goto map_err_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so @@ -196,10 +197,12 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, goto send_sig; map_err: + mmap_read_unlock(mm); +map_err_nosemaphore: current->thread.signo = SIGSEGV; current->thread.code = SEGV_MAPERR; current->thread.faddr = address; - goto send_sig; + return send_fault_sig(regs); acc_err: current->thread.signo = SIGSEGV; diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 687714db6f4d0c..d3c3c33b73a6ed 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -192,8 +192,9 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, && (kernel_mode(regs) || !store_updates_sp(regs))) goto bad_area; } - if (expand_stack(vma, address)) - goto bad_area; + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; good_area: code = SEGV_ACCERR; diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 6734fee3134f4f..a9dcd4381d1a11 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -127,8 +127,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, if (address + PAGE_SIZE < regs->sp) goto bad_area; } - if (expand_stack(vma, address)) - goto bad_area; + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 6941fdbf251738..6e894afa424991 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, retry: mmap_read_lock(mm); vma = find_vma_prev(mm, address, &prev_vma); - if (!vma || address < vma->vm_start) - goto check_expansion; + if (!vma || address < vma->vm_start) { + if (!prev || !(prev->vm_flags & VM_GROWSUP)) + goto bad_area; + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; + } + /* * Ok, we have a good vm_area for this memory 
 /*
  * Ok, we have a good vm_area for this memory access. We still need to
  * check the access permissions.
  */
-good_area:
-
 	if ((vma->vm_flags & acc_type) != acc_type)
 		goto bad_area;
 
@@ -347,17 +351,13 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
 	mmap_read_unlock(mm);
 	return;
 
-check_expansion:
-	vma = prev_vma;
-	if (vma && (expand_stack(vma, address) == 0))
-		goto good_area;
-
 /*
  * Something tried to access memory that isn't in our memory map..
  */
 bad_area:
 	mmap_read_unlock(mm);
+bad_area_nosemaphore:
 
 	if (user_mode(regs)) {
 		int signo, si_code;
@@ -449,7 +449,7 @@ handle_nadtlb_fault(struct pt_regs *regs)
 {
 	unsigned long insn = regs->iir;
 	int breg, treg, xreg, val = 0;
-	struct vm_area_struct *vma, *prev_vma;
+	struct vm_area_struct *vma;
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	unsigned long address;
@@ -485,7 +485,7 @@ handle_nadtlb_fault(struct pt_regs *regs)
 			/* Search for VMA */
 			address = regs->ior;
 			mmap_read_lock(mm);
-			vma = find_vma_prev(mm, address, &prev_vma);
+			vma = vma_lookup(mm, address);
 			mmap_read_unlock(mm);
 
 			/*
@@ -494,7 +494,6 @@ handle_nadtlb_fault(struct pt_regs *regs)
 			 */
 			acc_type = (insn & 0x40) ? VM_WRITE : VM_READ;
 			if (vma
-			    && address >= vma->vm_start
 			    && (vma->vm_flags & acc_type) == acc_type)
 				val = 1;
 		}
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index b65144c392b01a..dbe8394234e2b1 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -457,8 +457,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
 	if (unlikely(vma->vm_start > address)) {
 		if (!(vma->vm_flags & VM_GROWSDOWN))
 			goto out_up;
-		if (expand_stack(vma, address))
-			goto out_up;
+		vma = expand_stack(mm, address);
+		if (!vma)
+			goto out;
 	}
 
 	/*
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index d91305de694c54..69ff07bc6c07de 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -383,8 +383,9 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 				goto bad_area;
 		}
 	}
-	if (expand_stack(vma, address))
-		goto bad_area;
+	vma = expand_stack(mm, address);
+	if (!vma)
+		goto bad_area_nosemaphore;
 	/*
 	 * Ok, we have a good vm_area for this memory access, so
 	 * we can handle it..
@@ -487,8 +488,9 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 	 * Fix it, but check if it's kernel or user first..
 	 */
 bad_area:
-	insn = get_fault_insn(regs, insn);
 	mmap_read_unlock(mm);
+bad_area_nosemaphore:
+	insn = get_fault_insn(regs, insn);
 
 handle_kernel_fault:
 	do_kernel_fault(regs, si_code, fault_code, insn, address);
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index d3ce21c4ca32a8..6d8ae86ae978fd 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -47,14 +47,15 @@ int handle_page_fault(unsigned long address, unsigned long ip,
 	vma = find_vma(mm, address);
 	if (!vma)
 		goto out;
-	else if (vma->vm_start <= address)
+	if (vma->vm_start <= address)
 		goto good_area;
-	else if (!(vma->vm_flags & VM_GROWSDOWN))
+	if (!(vma->vm_flags & VM_GROWSDOWN))
 		goto out;
-	else if (is_user && !ARCH_IS_STACKGROW(address))
-		goto out;
-	else if (expand_stack(vma, address))
+	if (is_user && !ARCH_IS_STACKGROW(address))
 		goto out;
+	vma = expand_stack(mm, address);
+	if (!vma)
+		goto out_nosemaphore;
 
 good_area:
 	*code_out = SEGV_ACCERR;
diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c
index 864e4ffb6aa94e..261352a2327162 100644
--- a/drivers/iommu/amd/iommu_v2.c
+++ b/drivers/iommu/amd/iommu_v2.c
@@ -485,8 +485,8 @@ static void do_fault(struct work_struct *work)
 		flags |= FAULT_FLAG_REMOTE;
 
 	mmap_read_lock(mm);
-	vma = find_extend_vma(mm, address);
-	if (!vma || address < vma->vm_start)
+	vma = vma_lookup(mm, address);
+	if (!vma)
 		/* failed to get a vma in the right range */
 		goto out;
 
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 9821bc44f5ac1d..3ebd4b6586b3ee 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -175,7 +175,7 @@ iommu_sva_handle_iopf(struct iommu_fault *fault, void *data)
 
 	mmap_read_lock(mm);
 
-	vma = find_extend_vma(mm, prm->addr);
+	vma = vma_lookup(mm, prm->addr);
 	if (!vma)
 		/* Unmapped area */
 		goto out_put_mm;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 869c3aa0e45587..befa93582ed797 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -322,7 +322,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
 	 */
 	if (mmap_write_lock_killable(mm))
 		return -EINTR;
-	vma = find_extend_vma_locked(mm, bprm->p, true);
+	vma = find_extend_vma_locked(mm, bprm->p);
 	mmap_write_unlock(mm);
 	if (!vma)
 		return -EFAULT;
diff --git a/fs/exec.c b/fs/exec.c
index 66e3e22ffb8a61..b84b4fee0f82f1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -211,7 +211,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 	 */
 	if (write && pos < vma->vm_start) {
 		mmap_write_lock(mm);
-		ret = expand_downwards(vma, pos, true);
+		ret = expand_downwards(vma, pos);
 		if (unlikely(ret < 0)) {
 			mmap_write_unlock(mm);
 			return NULL;
@@ -859,7 +859,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	stack_base = vma->vm_end - stack_expand;
 #endif
 	current->mm->start_stack = bprm->p;
-	ret = expand_stack_locked(vma, stack_base, true);
+	ret = expand_stack_locked(vma, stack_base);
 	if (ret)
 		ret = -EFAULT;
 
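Every handler converted above follows the same shape, so it is worth spelling it out once. The sketch below is illustrative only: the function name is made up, fault-retry and flag handling are omitted, and error reporting is reduced to force_sig_fault(). What it shows is the two error paths the new calling convention forces on an architecture: bad_area still owns the mmap_read_unlock(), while bad_area_nosemaphore must not unlock, because expand_stack() has already dropped the lock when it fails.

/* Hypothetical stack-grows-down fault handler using the new interface. */
static void example_do_page_fault(struct pt_regs *regs, unsigned long address)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;			/* read lock still held */
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;

	/* May retake the lock for writing internally; see mm/mmap.c below */
	vma = expand_stack(mm, address);
	if (!vma)
		goto bad_area_nosemaphore;	/* lock already dropped */

good_area:
	handle_mm_fault(vma, address, FAULT_FLAG_DEFAULT, regs);
	mmap_read_unlock(mm);
	return;

bad_area:
	mmap_read_unlock(mm);
bad_area_nosemaphore:
	force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)address);
}

This asymmetry is why the hunks above add or rename the *_nosemaphore labels instead of reusing the old error paths.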
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 01a016521b60b6..4a9533efbd5dad 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3192,18 +3192,11 @@ extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
 
 extern unsigned long stack_guard_gap;
 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
-		bool write_locked);
-#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
+int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
+struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
 
 /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
-		bool write_locked);
-#if VM_GROWSUP
-extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
-#else
-  #define expand_upwards(vma, address) (0)
-#endif
+int expand_downwards(struct vm_area_struct *vma, unsigned long address);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
@@ -3298,9 +3291,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end);
 #endif
 
-struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
-		unsigned long addr, bool write_locked);
+		unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
 int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/gup.c b/mm/gup.c
index bbe4162365933e..e6cdfee4451f05 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1096,7 +1096,7 @@ static long __get_user_pages(struct mm_struct *mm,
 
 		/* first iteration or cross vma bound */
 		if (!vma || start >= vma->vm_end) {
-			vma = find_extend_vma(mm, start);
+			vma = vma_lookup(mm, start);
 			if (!vma && in_gate_area(mm, start)) {
 				ret = get_gate_page(mm, start & PAGE_MASK,
 						gup_flags, &vma,
@@ -1265,8 +1265,8 @@ int fixup_user_fault(struct mm_struct *mm,
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 retry:
-	vma = find_extend_vma(mm, address);
-	if (!vma || address < vma->vm_start)
+	vma = vma_lookup(mm, address);
+	if (!vma)
 		return -EFAULT;
 
 	if (!vma_permits_fault(vma, fault_flags))
diff --git a/mm/memory.c b/mm/memory.c
index a81f5d0997ad92..5ce82a76201d5d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5368,7 +5368,7 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
 			goto fail;
 	}
 
-	if (expand_stack_locked(vma, addr, true))
+	if (expand_stack_locked(vma, addr))
 		goto fail;
 
 success:
@@ -5713,6 +5713,14 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
 	if (mmap_read_lock_killable(mm))
 		return 0;
 
+	/* We might need to expand the stack to access it */
+	vma = vma_lookup(mm, addr);
+	if (!vma) {
+		vma = expand_stack(mm, addr);
+		if (!vma)
+			return 0;
+	}
+
 	/* ignore errors, just check how much was successfully transferred */
 	while (len) {
 		int bytes, ret, offset;
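The __access_remote_vm() hunk just above is the model for any caller that takes the mmap lock itself and may be handed an address sitting just under a stack: look the address up first, and only fall back to the lock-juggling expand_stack() when no vma covers it. A condensed sketch of that pattern follows; the function name is made up for illustration and is not in the tree.

/* Illustrative only: caller-side expansion before doing the real work. */
static int touch_possibly_unexpanded_stack(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	if (mmap_read_lock_killable(mm))
		return -EINTR;

	vma = vma_lookup(mm, addr);		/* fast path: already mapped */
	if (!vma) {
		/*
		 * expand_stack() drops the read lock, retries under the
		 * write lock and downgrades again; on failure it returns
		 * NULL with the lock no longer held.
		 */
		vma = expand_stack(mm, addr);
		if (!vma)
			return -EFAULT;
	}

	/* ... use vma under the read lock ... */

	mmap_read_unlock(mm);
	return 0;
}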
diff --git a/mm/mmap.c b/mm/mmap.c
index 2c44ac108a3cfd..bc510361acec29 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1935,8 +1935,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
  * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
  * vma is the last one with address > vma->vm_end. Have to extend vma.
  */
-int expand_upwards(struct vm_area_struct *vma, unsigned long address,
-		bool write_locked)
+static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *next;
@@ -1960,8 +1959,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address,
 	if (gap_addr < address || gap_addr > TASK_SIZE)
 		gap_addr = TASK_SIZE;
 
-	if (!write_locked)
-		return -EAGAIN;
 	next = find_vma_intersection(mm, vma->vm_end, gap_addr);
 	if (next && vma_is_accessible(next)) {
 		if (!(next->vm_flags & VM_GROWSUP))
@@ -2030,15 +2027,18 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address,
 
 /*
  * vma is the first one with address < vma->vm_start.  Have to extend vma.
+ * mmap_lock held for writing.
  */
-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
-		bool write_locked)
+int expand_downwards(struct vm_area_struct *vma, unsigned long address)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
 	struct vm_area_struct *prev;
 	int error = 0;
 
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		return -EFAULT;
+
 	address &= PAGE_MASK;
 	if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
 		return -EPERM;
@@ -2051,8 +2051,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address,
 		    vma_is_accessible(prev) &&
 		    (address - prev->vm_end < stack_guard_gap))
 			return -ENOMEM;
-		if (!write_locked && (prev->vm_end == address))
-			return -EAGAIN;
 	}
 
 	if (mas_preallocate(&mas, GFP_KERNEL))
@@ -2131,14 +2129,12 @@ static int __init cmdline_parse_stack_guard_gap(char *p)
 __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
 
 #ifdef CONFIG_STACK_GROWSUP
-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
-		bool write_locked)
+int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
 {
-	return expand_upwards(vma, address, write_locked);
+	return expand_upwards(vma, address);
 }
 
-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
-		unsigned long addr, bool write_locked)
+struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
 {
 	struct vm_area_struct *vma, *prev;
 
@@ -2148,23 +2144,21 @@ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
 		return vma;
 	if (!prev)
 		return NULL;
-	if (expand_stack_locked(prev, addr, write_locked))
+	if (expand_stack_locked(prev, addr))
 		return NULL;
 	if (prev->vm_flags & VM_LOCKED)
 		populate_vma_page_range(prev, addr, prev->vm_end, NULL);
 	return prev;
 }
 #else
-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
-		bool write_locked)
+int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
 {
 	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
 		return -EINVAL;
-	return expand_downwards(vma, address, write_locked);
+	return expand_downwards(vma, address);
 }
 
-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
-		unsigned long addr, bool write_locked)
+struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
 {
 	struct vm_area_struct *vma;
 	unsigned long start;
@@ -2176,7 +2170,7 @@ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
 	if (vma->vm_start <= addr)
 		return vma;
 	start = vma->vm_start;
-	if (expand_stack_locked(vma, addr, write_locked))
+	if (expand_stack_locked(vma, addr))
 		return NULL;
 	if (vma->vm_flags & VM_LOCKED)
 		populate_vma_page_range(vma, addr, start, NULL);
@@ -2184,12 +2178,91 @@ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
 }
 #endif
 
-struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
-		unsigned long addr)
+/*
+ * IA64 has some horrid mapping rules: it can expand both up and down,
+ * but with various special rules.
+ *
+ * We'll get rid of this architecture eventually, so the ugliness is
+ * temporary.
+ */
+#ifdef CONFIG_IA64
+static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr)
+{
+	return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) &&
+		REGION_OFFSET(addr) < RGN_MAP_LIMIT;
+}
+
+/*
+ * IA64 stacks grow down, but there's a special register backing store
+ * that can grow up. Only sequentially, though, so the new address must
+ * match vm_end.
+ */
+static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr)
+{
+	if (!vma_expand_ok(vma, addr))
+		return -EFAULT;
+	if (vma->vm_end != (addr & PAGE_MASK))
+		return -EFAULT;
+	return expand_upwards(vma, addr);
+}
+
+static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr)
+{
+	if (!vma_expand_ok(vma, addr))
+		return -EFAULT;
+	return expand_downwards(vma, addr);
+}
+
+#elif defined(CONFIG_STACK_GROWSUP)
+
+#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
+#define vma_expand_down(vma, addr) (-EFAULT)
+
+#else
+
+#define vma_expand_up(vma,addr) (-EFAULT)
+#define vma_expand_down(vma, addr) expand_downwards(vma, addr)
+
+#endif
+
+/*
+ * expand_stack(): legacy interface for page faulting. Don't use unless
+ * you have to.
+ *
+ * This is called with the mm locked for reading, drops the lock, takes
+ * the lock for writing, tries to look up a vma again, expands it if
+ * necessary, and downgrades the lock to reading again.
+ *
+ * If no vma is found or it can't be expanded, it returns NULL and has
+ * dropped the lock.
+ */
+struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
 {
-	return find_extend_vma_locked(mm, addr, false);
+	struct vm_area_struct *vma, *prev;
+
+	mmap_read_unlock(mm);
+	if (mmap_write_lock_killable(mm))
+		return NULL;
+
+	vma = find_vma_prev(mm, addr, &prev);
+	if (vma && vma->vm_start <= addr)
+		goto success;
+
+	if (prev && !vma_expand_up(prev, addr)) {
+		vma = prev;
+		goto success;
+	}
+
+	if (vma && !vma_expand_down(vma, addr))
+		goto success;
+
+	mmap_write_unlock(mm);
+	return NULL;
+
+success:
+	mmap_write_downgrade(mm);
+	return vma;
 }
-EXPORT_SYMBOL_GPL(find_extend_vma);
 
 /*
  * Ok - we have the memory areas we should free on a maple tree so release them,
diff --git a/mm/nommu.c b/mm/nommu.c
index f476c9ed36b330..37d0b03143f175 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -630,25 +630,21 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 }
 EXPORT_SYMBOL(find_vma);
 
-/*
- * find a VMA
- * - we don't extend stack VMAs under NOMMU conditions
- */
-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
-{
-	return find_vma(mm, addr);
-}
-
 /*
  * expand a stack to a given address
  * - not supported under NOMMU conditions
  */
-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
-		bool write_locked)
+int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr)
 {
 	return -ENOMEM;
 }
 
+struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
+{
+	mmap_read_unlock(mm);
+	return NULL;
+}
+
 /*
  * look up the first VMA exactly that exactly matches addr
  * - should be called with mm->mmap_lock at least held readlocked
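None of the mm/mmap.c changes above alter what userspace observes: a write just below the bottom of a VM_GROWSDOWN stack vma still faults and still gets the vma grown, only now under the mmap write lock. A trivial way to exercise that downward-expansion path from userspace is to keep touching stack pages deeper and deeper; the test program below is purely illustrative and is not part of the series.

#include <stdio.h>

/*
 * Each call adds a 64KiB frame and touches one byte per page, so the
 * faulting addresses keep moving below the current stack vma and the
 * kernel has to keep expanding it via expand_stack()/expand_downwards().
 */
static unsigned long grow(int depth)
{
	volatile char frame[64 * 1024];
	unsigned long i, sum = 0;

	for (i = 0; i < sizeof(frame); i += 4096)
		frame[i] = (char)depth;
	if (depth > 0)
		sum = grow(depth - 1);
	return sum + frame[0];	/* keep the frame live across the call */
}

int main(void)
{
	/* ~4MiB of growth, comfortably inside the default 8MiB RLIMIT_STACK */
	grow(64);
	puts("stack expanded without a segfault");
	return 0;
}

Pushing this past RLIMIT_STACK, or into the stack_guard_gap of a neighbouring mapping, gets a SIGSEGV from these same code paths instead.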
From a425ac5365f6cb3cc47bf83e6bff0213c10445f7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 25 Jun 2023 14:02:25 -0700
Subject: [PATCH 13/13] gup: add warning if some caller would seem to want
 stack expansion

It feels very unlikely that anybody would want to do a GUP in an
unmapped area under the stack pointer, but real users sometimes do some
really strange things. So add a (temporary) warning for the case where
a GUP fails and expanding the stack might have made it work.

It's trivial to do the expansion in the caller as part of getting the mm
lock in the first place - see __access_remote_vm() for ptrace, for
example - it's just that it's unnecessarily painful to do it deep in the
guts of the GUP lookup when we might have to drop and re-take the lock.

I doubt anybody actually does anything quite this strange, but let's be
proactive: adding these warnings is simple, and will make debugging it
much easier if they trigger.

Signed-off-by: Linus Torvalds
---
 mm/gup.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index e6cdfee4451f05..94102390b273aa 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1096,7 +1096,11 @@ static long __get_user_pages(struct mm_struct *mm,
 
 		/* first iteration or cross vma bound */
 		if (!vma || start >= vma->vm_end) {
-			vma = vma_lookup(mm, start);
+			vma = find_vma(mm, start);
+			if (vma && (start < vma->vm_start)) {
+				WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
+				vma = NULL;
+			}
 			if (!vma && in_gate_area(mm, start)) {
 				ret = get_gate_page(mm, start & PAGE_MASK,
 						gup_flags, &vma,
@@ -1265,9 +1269,13 @@ int fixup_user_fault(struct mm_struct *mm,
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 retry:
-	vma = vma_lookup(mm, address);
+	vma = find_vma(mm, address);
 	if (!vma)
 		return -EFAULT;
+	if (address < vma->vm_start) {
+		WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
+		return -EFAULT;
+	}
 
 	if (!vma_permits_fault(vma, fault_flags))
 		return -EFAULT;
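The "really strange things" this last patch worries about are remote accesses just under another task's stack. ptrace is the example the commit message points at: PTRACE_PEEKDATA ends up in __access_remote_vm(), which after the previous patch expands the stack in the caller before GUP ever sees the address. The program below is purely illustrative (it assumes a [stack] line in /proc/<pid>/maps and default rlimits) and pokes one word below the bottom of a stopped child's stack vma:

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <sys/types.h>

/* Return the low address of the target's [stack] vma, or 0 on failure. */
static unsigned long stack_bottom(pid_t pid)
{
	char path[64], line[256];
	unsigned long start = 0, end = 0;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%d/maps", (int)pid);
	f = fopen(path, "r");
	if (!f)
		return 0;
	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, "[stack]")) {
			sscanf(line, "%lx-%lx", &start, &end);
			break;
		}
	}
	fclose(f);
	return start;
}

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);		/* wait for the parent to peek */
		_exit(0);
	}

	waitpid(child, NULL, 0);

	/* One word below the lowest currently-mapped stack address. */
	unsigned long addr = stack_bottom(child) - sizeof(long);

	errno = 0;
	long word = ptrace(PTRACE_PEEKDATA, child, (void *)addr, NULL);
	if (errno)
		perror("PTRACE_PEEKDATA below stack");
	else
		printf("read %#lx from %#lx: the caller expanded the stack\n",
		       (unsigned long)word, addr);

	kill(child, SIGKILL);
	waitpid(child, NULL, 0);
	return 0;
}

A caller that went straight through GUP for the same address would instead hit the new WARN_ON_ONCE() and fail, which is exactly the class of user this patch is trying to flush out.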