Skip to content
Browse files

x86/nmi/64: Switch stacks on userspace NMI entry

Returning to userspace is tricky: IRET can fail, and ESPFIX can
rearrange the stack prior to IRET.

The NMI nesting fixup relies on a precise stack layout and
atomic IRET.  Rather than trying to teach the NMI nesting fixup
to handle ESPFIX and failed IRET, punt: run NMIs that came from
user mode on the normal kernel stack.

This will make some nested NMIs visible to C code, but the C
code is okay with that.

As a side effect, this should speed up perf: it eliminates an
RDMSR when NMIs come from user mode.

Signed-off-by: Andy Lutomirski <>
Reviewed-by: Steven Rostedt <>
Reviewed-by: Borislav Petkov <>
Cc: Linus Torvalds <>
Cc: Peter Zijlstra <>
Cc: Thomas Gleixner <>
Signed-off-by: Ingo Molnar <>
  • Loading branch information...
amluto authored and Ingo Molnar committed Jul 15, 2015
1 parent 0e181bb commit 9b6e6a8334d56354853f9c255d1395c2ba570e0a
Showing with 58 additions and 4 deletions.
  1. +58 −4 arch/x86/entry/entry_64.S
@@ -1250,18 +1250,72 @@ ENTRY(nmi)
* a nested NMI that updated the copy interrupt stack frame, a
* jump will be made to the repeat_nmi code that will handle the second
* NMI.
*
* However, espfix prevents us from directly returning to userspace
* with a single IRET instruction. Similarly, IRET to user mode
* can fault. We therefore handle NMIs from user space like
* other IST entries.
*/

/* Use %rdx as our temp variable throughout */
pushq %rdx

testb $3, CS-RIP+8(%rsp)
jz .Lnmi_from_kernel

/*
 * NMI from user mode. We need to run on the thread stack, but we
 * can't go through the normal entry paths: NMIs are masked, and
 * we don't want to enable interrupts, because then we'll end
 * up in an awkward situation in which IRQs are on but NMIs
 * are off.
 */

movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
pushq 5*8(%rdx) /* pt_regs->ss */
pushq 4*8(%rdx) /* pt_regs->rsp */
pushq 3*8(%rdx) /* pt_regs->flags */
pushq 2*8(%rdx) /* pt_regs->cs */
pushq 1*8(%rdx) /* pt_regs->rip */
pushq $-1 /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq (%rdx) /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq %rax /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
pushq %r9 /* pt_regs->r9 */
pushq %r10 /* pt_regs->r10 */
pushq %r11 /* pt_regs->r11 */
pushq %rbx /* pt_regs->rbx */
pushq %rbp /* pt_regs->rbp */
pushq %r12 /* pt_regs->r12 */
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */

 /*
- * If %cs was not the kernel segment, then the NMI triggered in user
- * space, which means it is definitely not nested.
+ * At this point we no longer need to worry about stack damage
+ * due to nesting -- we're on the normal thread stack and we're
+ * done with the NMI stack.
  */
-cmpl $__KERNEL_CS, 16(%rsp)
-jne first_nmi

movq %rsp, %rdi
movq $-1, %rsi
call do_nmi

/*
 * Return back to user mode. We must *not* do the normal exit
 * work, because we don't want to enable interrupts. Fortunately,
 * do_nmi doesn't modify pt_regs.
 */
jmp restore_c_regs_and_iret

* Check the special variable on the stack to see if NMIs are
* executing.

0 comments on commit 9b6e6a8

Please sign in to comment.
You can’t perform that action at this time.