Kernel Preemption


When user preemption occurs

  • When returning to user space from a system call (being preempted while the system call itself is executing counts as kernel preemption; what is meant here is preemption while user-mode code is running, e.g. user-mode program A being switched out for user-mode program B).
  • When returning to user space from an interrupt handler.
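
As a rough illustration, in this kernel era the x86 return-to-user path (arch/x86/entry/common.c) loops over the pending work flags and calls schedule() when TIF_NEED_RESCHED is set. The sketch below is simplified and not the literal kernel code:

```c
#include <linux/sched.h>
#include <linux/thread_info.h>

/* Simplified sketch of the return-to-user work loop; not the literal code. */
static void exit_to_usermode_loop_sketch(u32 cached_flags)
{
	while (cached_flags & _TIF_NEED_RESCHED) {
		local_irq_enable();
		schedule();		/* user preemption happens here */
		local_irq_disable();
		cached_flags = READ_ONCE(current_thread_info()->flags);
	}
	/* signal delivery and other notify-resume work elided */
}
```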

Without kernel preemption

  • A task switch happens only when the CPU is yielded voluntarily or when the process returns to user space (e.g. when a system call returns).
  • Once a process has entered kernel mode (e.g. via a system call), even if its time slice runs out (say a timer interrupt fires during the system call and the accounting finds the slice exhausted), the switch does not happen immediately; it is deferred until the process returns to user space (e.g. when the system call returns).

When kernel preemption occurs

  • When an interrupt handler finishes executing, just before it returns to kernel space.
  • When kernel code becomes preemptible again (for example, when preempt_enable() is called).
  • When a task running in the kernel blocks (which likewise leads to schedule() being called); see the mutex sketch below.
  • Apart from the cases above, when a task in the kernel explicitly calls schedule().
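
For example, taking a sleeping lock inside the kernel is such a blocking point. The snippet below is a minimal sketch; my_lock and my_driver_op are hypothetical names used only for illustration:

```c
#include <linux/mutex.h>

static DEFINE_MUTEX(my_lock);		/* hypothetical lock, for illustration */

static void my_driver_op(void)
{
	mutex_lock(&my_lock);		/* may block: schedule() runs and another task can be picked */
	/* ... critical section ... */
	mutex_unlock(&my_lock);
}
```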

Can we be preempted?

  • preemptible()

  • include/linux/preempt.h

     ...
     #ifdef CONFIG_PREEMPT_COUNT
    
     #define preempt_disable() \
     do { \
         preempt_count_inc(); \
         barrier(); \
     } while (0)
     ...
     #define preemptible()   (preempt_count() == 0 && !irqs_disabled())
    
     #ifdef CONFIG_PREEMPT
     #define preempt_enable() \
     do { \
         barrier(); \
         if (unlikely(preempt_count_dec_and_test())) \
             __preempt_schedule(); \
     } while (0)
    
     ...
     #else /* !CONFIG_PREEMPT */
     #define preempt_enable() \
     do { \
         barrier(); \
         preempt_count_dec(); \
     } while (0)
    
     ...
     #endif /* CONFIG_PREEMPT */
     ...
     #else /* !CONFIG_PREEMPT_COUNT */
     /*
      * Even if we don't have any preemption, we need preempt disable/enable
      * to be barriers, so that we don't have things like get_user/put_user
      * that can cause faults and scheduling migrate into our preempt-protected
      * region.
      */
     #define preempt_disable()           barrier()
     ...
     #define preempt_enable()            barrier()
     ...
     #define preemptible()               0
    
     #endif /* CONFIG_PREEMPT_COUNT */
     ...
  • __preempt_schedule() is overridden on the x86 platform; other platforms use the generic macro, which simply expands to preempt_schedule().

  • Before the kernel preempts, it first checks with preemptible() whether preemption is allowed (a usage sketch follows the code excerpt below).

  • kernel/sched/core.c

     /*
      * this is the entry point to schedule() from in-kernel preemption
      * off of preempt_enable. Kernel preemptions off return from interrupt
      * occur there and call schedule directly.
      */
     asmlinkage __visible void __sched notrace preempt_schedule(void)
     {
         /*
          * If there is a non-zero preempt_count or interrupts are disabled,
          * we do not want to preempt the current task. Just return..
          */
         if (likely(!preemptible()))
             return;
    
         preempt_schedule_common();
     }
     NOKPROBE_SYMBOL(preempt_schedule);
     EXPORT_SYMBOL(preempt_schedule);
     ...
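
As a usage sketch (the function and per-CPU variable below are hypothetical, for illustration only), a typical pattern is to bracket a per-CPU critical section with preempt_disable()/preempt_enable(); the closing preempt_enable() is exactly the point where a pending preemption can fire via __preempt_schedule():

```c
#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, my_counter);	/* hypothetical per-CPU variable */

static void bump_my_counter(void)
{
	preempt_disable();		/* preempt_count > 0: we cannot be preempted or migrated */
	__this_cpu_inc(my_counter);	/* safe: we stay on this CPU for the whole update */
	preempt_enable();		/* count drops back to 0; if a resched is pending, we reschedule here */
}
```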

The preemption counter: preempt_count

Preempt Count

  • preempt_count_add() and preempt_count_sub() call the overridable __preempt_count_add() and __preempt_count_sub().

  • Currently only x86 overrides the implementations of __preempt_count_add() and __preempt_count_sub(). (A sketch of the preempt_count bit layout follows the code excerpts below.)

  • Non-debug versions of preempt_count_add() and preempt_count_sub()

    • include/linux/preempt.h
      #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
      extern void preempt_count_add(int val);
      extern void preempt_count_sub(int val);
      ...
      #else
      #define preempt_count_add(val)  __preempt_count_add(val)
      #define preempt_count_sub(val)  __preempt_count_sub(val)
      ...
      #endif
      ...
  • Debug versions of preempt_count_add() and preempt_count_sub()

    • kernel/sched/core.c
      #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                      defined(CONFIG_PREEMPT_TRACER))
      
      void preempt_count_add(int val)
      {
      #ifdef CONFIG_DEBUG_PREEMPT
          /*
           * Underflow?
           */
          if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
              return;
      #endif
          __preempt_count_add(val);
      #ifdef CONFIG_DEBUG_PREEMPT
          /*
           * Spinlock count overflowing soon?
           */
          DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                      PREEMPT_MASK - 10);
      #endif
          if (preempt_count() == val) {
              unsigned long ip = get_lock_parent_ip();
      #ifdef CONFIG_DEBUG_PREEMPT
              current->preempt_disable_ip = ip;
      #endif
              trace_preempt_off(CALLER_ADDR0, ip);
          }
      }   
      EXPORT_SYMBOL(preempt_count_add);
      NOKPROBE_SYMBOL(preempt_count_add);
      
      void preempt_count_sub(int val)
      {   
      #ifdef CONFIG_DEBUG_PREEMPT
          /*
           * Underflow?
           */
          if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
              return;
          /*
           * Is the spinlock portion underflowing?
           */
          if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
                  !(preempt_count() & PREEMPT_MASK)))
              return;
      #endif
      
          if (preempt_count() == val)
              trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
          __preempt_count_sub(val);
      }
      EXPORT_SYMBOL(preempt_count_sub);
      NOKPROBE_SYMBOL(preempt_count_sub);
       ...
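
For reference, the PREEMPT_MASK seen above is the low byte of preempt_count; the counter is split into fields roughly as follows (values per the comment in include/linux/preempt.h of this era, reproduced from memory, so treat this as a sketch rather than an exact quote):

```c
/* Approximate preempt_count field layout (sketch of include/linux/preempt.h). */
#define PREEMPT_MASK		0x000000ff	/* preempt_disable() nesting depth */
#define SOFTIRQ_MASK		0x0000ff00	/* softirq nesting depth           */
#define HARDIRQ_MASK		0x000f0000	/* hardirq nesting depth           */
#define NMI_MASK		0x00100000	/* in NMI                          */
#define PREEMPT_NEED_RESCHED	0x80000000	/* resched flag (x86 folds it into __preempt_count) */
```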

preempt_count on x86

  • The x86 thread_info structure has no preempt_count member; instead the count is kept in the per-CPU variable __preempt_count.

  • arch/x86/include/asm/thread_info.h

     struct thread_info {
         struct task_struct  *task;      /* main task structure */
         __u32           flags;      /* low level flags */
         __u32           status;     /* thread synchronous flags */
         __u32           cpu;        /* current CPU */
         mm_segment_t        addr_limit;
         unsigned int        sig_on_uaccess_error:1;
         unsigned int        uaccess_err:1;  /* uaccess failed */
     };
  • The x86-overridden preempt_count-related implementations

  • arch/x86/include/asm/preempt.h

     DECLARE_PER_CPU(int, __preempt_count);
     ...
     /*
      * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
      * that think a non-zero value indicates we cannot preempt.
      */
     static __always_inline int preempt_count(void)
     {   /* return the preemption count */
         return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
     }
    
     static __always_inline void preempt_count_set(int pc)
     {   /* set the preemption count */
         raw_cpu_write_4(__preempt_count, pc);
     }
     ...
     /*
      * The various preempt_count add/sub methods
      */
    
     static __always_inline void __preempt_count_add(int val)
     {
         raw_cpu_add_4(__preempt_count, val);
     }
    
     static __always_inline void __preempt_count_sub(int val)
     {
         raw_cpu_add_4(__preempt_count, -val);
     }
     ...
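
To see why preempt_count() masks out PREEMPT_NEED_RESCHED, here is a small user-space model of the x86 trick (illustrative only, not kernel code; the bit value 0x80000000 is assumed as above): the flag is stored inverted, so that "__preempt_count == 0" simultaneously means "preemption enabled" and "reschedule requested", letting preempt_enable() test both with a single decrement-and-test.

```c
#include <stdio.h>

#define PREEMPT_NEED_RESCHED 0x80000000u	/* stored inverted in the per-CPU count */

static unsigned int pc = PREEMPT_NEED_RESCHED;	/* "preemption enabled, no resched pending" */

static unsigned int preempt_count(void) { return pc & ~PREEMPT_NEED_RESCHED; }
static void set_need_resched(void)      { pc &= ~PREEMPT_NEED_RESCHED; }  /* clear the inverted bit */

int main(void)
{
	printf("count=%u, should_resched=%d\n", preempt_count(), pc == 0);
	set_need_resched();			/* the scheduler marks the task for preemption */
	printf("count=%u, should_resched=%d\n", preempt_count(), pc == 0);
	return 0;
}
```
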

preempt_count on ARM

  • ARM's thread_info structure does have a preempt_count member, unlike x86.

  • arch/arm/include/asm/thread_info.h

     /*
      * low level task data that entry.S needs immediate access to.
      * __switch_to() assumes cpu_context follows immediately after cpu_domain.
      */
     struct thread_info {
         unsigned long       flags;      /* low level flags */
         int         preempt_count;  /* 0 => preemptable, <0 => bug */
         mm_segment_t        addr_limit; /* address limit */
         struct task_struct  *task;      /* main task structure */
         __u32           cpu;        /* cpu */
         __u32           cpu_domain; /* cpu domain */
         struct cpu_context_save cpu_context;    /* cpu context */
         __u32           syscall;    /* syscall number */
         __u8            used_cp[16];    /* thread used copro */
         unsigned long       tp_value[2];    /* TLS registers */
     #ifdef CONFIG_CRUNCH
         struct crunch_state crunchstate;
     #endif
         union fp_state      fpstate __attribute__((aligned(8)));
         union vfp_state     vfpstate;
     #ifdef CONFIG_ARM_THUMBEE
         unsigned long       thumbee_state;  /* ThumbEE Handler Base register */
     #endif
     };
     ...
     /*
      * how to get the current stack pointer in C
      */
     register unsigned long current_stack_pointer asm ("sp");
    
     /*
      * how to get the thread information struct from C
      */
     static inline struct thread_info *current_thread_info(void) __attribute_const__;
    
     static inline struct thread_info *current_thread_info(void)
     {
         return (struct thread_info *)
             (current_stack_pointer & ~(THREAD_SIZE - 1));
     }
      ...
  • ARM uses the generic implementations of __preempt_count_add() and __preempt_count_sub()

    • include/asm-generic/preempt.h
     static __always_inline int preempt_count(void)
     {   /* return the preemption count */
         return current_thread_info()->preempt_count;
     }
    
     static __always_inline int *preempt_count_ptr(void)
     {
         return &current_thread_info()->preempt_count;
     }
    
     static __always_inline void preempt_count_set(int pc)
     {   /* set the preemption count */
         *preempt_count_ptr() = pc;
     }
     ...
     /*
      * The various preempt_count add/sub methods
      */
    
     static __always_inline void __preempt_count_add(int val)
     {
         *preempt_count_ptr() += val;
     }
    
     static __always_inline void __preempt_count_sub(int val)
     {
         *preempt_count_ptr() -= val;
     }
      ...
  • preempt_count_ptr() is called only from the functions below; since x86's thread_info has no preempt_count member, x86 provides its own implementations of these functions as well (a sketch of the generic __preempt_count_dec_and_test() follows this list):

    • preempt_count_set()
    • __preempt_count_add()
    • __preempt_count_sub()
    • __preempt_count_dec_and_test()
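
For completeness, the generic __preempt_count_dec_and_test() that ARM ends up using looks roughly like the following (include/asm-generic/preempt.h; reproduced from memory, so verify against the source tree):

```c
static __always_inline bool __preempt_count_dec_and_test(void)
{
	/*
	 * Decrement the per-task count and report true only when it reaches
	 * zero and TIF_NEED_RESCHED is set, so that preempt_enable() knows
	 * it must call __preempt_schedule().
	 */
	return !--*preempt_count_ptr() && tif_need_resched();
}
```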

Kernel preemption on return from interrupt

  • Kernel preemption can happen after a hardware interrupt request has been serviced.
  • If the processor returns to kernel mode after handling the interrupt (returning to user mode is not affected here), the architecture-specific assembly checks:
    • whether the preemption count is 0, i.e. whether preemption is allowed,
    • and whether the reschedule flag is set.
  • If both conditions hold, the scheduler is invoked via preempt_schedule_irq().
  • The essential difference between preempt_schedule_irq() and preempt_schedule() is that preempt_schedule_irq() must be called with interrupts disabled, which prevents recursive invocation caused by interrupts.
  • Of course, preempt_schedule_irq() must call local_irq_enable() before invoking schedule(), and call local_irq_disable() afterwards to restore the interrupt-disabled state it was entered with. A C-style sketch of the whole check follows.
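
Putting the conditions together before diving into the assembly, a schematic C model of the check looks like this (the struct and helper names are hypothetical; only preempt_schedule_irq() is a real kernel symbol):

```c
/* Hypothetical model of the arch-specific interrupt-return check. */
struct irq_return_ctx {
	int preempt_count;	/* current preemption count                    */
	int irqs_were_on;	/* IF (x86) / MSR_EE (ppc) of interrupted code */
	int need_resched;	/* TIF_NEED_RESCHED of the current task        */
};

void preempt_schedule_irq(void);	/* real entry point, shown at the end */

static void maybe_preempt_on_irq_return(const struct irq_return_ctx *ctx)
{
	if (ctx->preempt_count == 0 && ctx->irqs_were_on && ctx->need_resched)
		preempt_schedule_irq();	/* must be entered with interrupts disabled */
}
```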

The x86-64 implementation

  • arch/x86/entry/entry_64.S
/*
 * Build the entry stubs with some assembler magic.
 * We pack 1 stub into every 8-byte block.
 */
        .align 8
ENTRY(irq_entries_start)  /* interrupt entry stubs */
    vector=FIRST_EXTERNAL_VECTOR
    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
        pushq   $(~vector+0x80)                 /* Note: always in signed byte range */
    vector=vector+1
        jmp     common_interrupt
        .align  8
    .endr
END(irq_entries_start)
        /*
         * The interrupt stubs push (~vector+0x80) onto the stack and
         * then jump to common_interrupt.
         */
        .p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
        ASM_CLAC
        addq    $-0x80, (%rsp)                  /* Adjust vector to [-256, -1] range */
        interrupt do_IRQ  /* entry point into the C interrupt handling (do_IRQ) */
        /* 0(%rsp): old RSP */
ret_from_intr:            /* note: execution falls straight through to here after do_IRQ returns */
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        decl    PER_CPU_VAR(irq_count)  /* decrement the per-CPU irq_count */

        /* Restore saved previous stack */
        popq    %rsp

        testb   $3, CS(%rsp)
        jz      retint_kernel    /* result is 0: the interrupt hit kernel space, jump to retint_kernel */

        /* Interrupt came from user space */
GLOBAL(retint_user)              /* result non-zero: the interrupt hit user space */
        mov     %rsp,%rdi
        call    prepare_exit_to_usermode
        TRACE_IRQS_IRETQ
        SWAPGS
        jmp     restore_regs_and_iret

/* Returning to kernel space */
retint_kernel:
#ifdef CONFIG_PREEMPT
        /* Interrupts are off */
        /* Check if we need preemption */
        bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
        jnc     1f  /* CF holds bit 9 (IF) of the saved EFLAGS; CF=0 means interrupts were disabled in the interrupted context, so jump to 1 and skip preemption, otherwise fall through */
0:      cmpl    $0, PER_CPU_VAR(__preempt_count)
        jnz     1f  /* non-zero: preemption is disabled (or no resched is pending, since x86 folds the flag in), jump to 1 and skip preemption; otherwise preempt */
        call    preempt_schedule_irq /* call preempt_schedule_irq() with interrupts disabled */
        jmp     0b
1:
#endif

The PowerPC-32 implementation

  • arch/powerpc/kernel/entry_32.S
        .globl  ret_from_except
ret_from_except:
        /* Hard-disable interrupts so that current_thread_info()->flags
         * can't change between when we test it and when we return
         * from the interrupt. */
        /* Note: We don't bother telling lockdep about it */
        LOAD_MSR_KERNEL(r10,MSR_KERNEL)
        SYNC                    /* Some chip revs have problems here... */
        MTMSRD(r10)             /* disable interrupts */

        lwz     r3,_MSR(r1)     /* Returning to user mode? */
        andi.   r0,r3,MSR_PR    /* AND r3 with the immediate MSR_PR; result goes into r0 and sets CR0 */
        beq     resume_kernel   /* result is zero: returning to kernel space, branch to resume_kernel */

user_exc_return:                /* r10 contains MSR_KERNEL here */
        /* Check current_thread_info()->flags */
        CURRENT_THREAD_INFO(r9, r1)
        lwz     r9,TI_FLAGS(r9)
        andi.   r0,r9,_TIF_USER_WORK_MASK
        bne     do_work

restore_user:
#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
        /* Check whether this process has its own DBCR0 value.  The internal
           debug mode bit tells us that dbcr0 should be loaded. */
        lwz     r0,THREAD+THREAD_DBCR0(r2)
        andis.  r10,r0,DBCR0_IDM@h
        bnel-   load_dbcr0
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        CURRENT_THREAD_INFO(r9, r1)
        ACCOUNT_CPU_USER_EXIT(r9, r10, r11)
#endif

        b       restore      /* restore registers and return to user space */

/* N.B. the only way to get here is from the beq following ret_from_except. */
resume_kernel:
        /* check current_thread_info, _TIF_EMULATE_STACK_STORE */
        CURRENT_THREAD_INFO(r9, r1) /* get the current task's thread_info */
        lwz     r8,TI_FLAGS(r9)     /* load TI_FLAGS from thread_info into r8 */
        andis.  r0,r8,_TIF_EMULATE_STACK_STORE@h /* note: the andis. result goes into r0 */
        beq+    1f                  /* if zero, branch forward to label 1 */

        addi    r8,r1,INT_FRAME_SIZE    /* Get the kprobed function entry */

        lwz     r3,GPR1(r1)
        subi    r3,r3,INT_FRAME_SIZE    /* dst: Allocate a trampoline exception frame */
        mr      r4,r1                   /* src:  current exception frame */
        mr      r1,r3                   /* Reroute the trampoline frame to r1 */

        /* Copy from the original to the trampoline. */
        li      r5,INT_FRAME_SIZE/4     /* size: INT_FRAME_SIZE */
        li      r6,0                    /* start offset: 0 */
        mtctr   r5
2:      lwzx    r0,r6,r4
        stwx    r0,r6,r3
        addi    r6,r6,4
        bdnz    2b

        /* Do real store operation to complete stwu */
        lwz     r5,GPR1(r1)
        stw     r8,0(r5)

        /* Clear _TIF_EMULATE_STACK_STORE flag */
        lis     r11,_TIF_EMULATE_STACK_STORE@h
        addi    r5,r9,TI_FLAGS
0:      lwarx   r8,0,r5
        andc    r8,r8,r11
#ifdef CONFIG_IBM405_ERR77
        dcbt    0,r5
#endif
        stwcx.  r8,0,r5
        bne-    0b
1:

#ifdef CONFIG_PREEMPT
        /* check current_thread_info->preempt_count */
        lwz     r0,TI_PREEMPT(r9)  /* load the preemption count from thread_info into r0 */
        cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
        bne     restore         /* non-zero: preemption is disabled, restore registers and return to the interrupted code */
        andi.   r8,r8,_TIF_NEED_RESCHED /* check whether TI_FLAGS has the resched bit set; result in r8 */
        beq+    restore         /* zero: no reschedule requested, restore registers and return; otherwise fall through */
        lwz     r3,_MSR(r1)     /* load the saved MSR into r3; the check below skips scheduling if interrupts were disabled in the interrupted context */
        andi.   r0,r3,MSR_EE    /* interrupts off? */
        beq     restore         /* don't schedule if so */
#ifdef CONFIG_TRACE_IRQFLAGS
        /* Lockdep thinks irqs are enabled, we need to call
         * preempt_schedule_irq with IRQs off, so we inform lockdep
         * now that we -did- turn them off already
         */
        bl      trace_hardirqs_off
#endif
1:      bl      preempt_schedule_irq    /* call preempt_schedule_irq() with interrupts disabled */
        CURRENT_THREAD_INFO(r9, r1)     /* get the current task's thread_info */
        lwz     r3,TI_FLAGS(r9)         /* load TI_FLAGS from thread_info into r3 */
        andi.   r0,r3,_TIF_NEED_RESCHED /* check whether TI_FLAGS has the resched bit set; result in r0 */
        bne-    1b                      /* non-zero: the resched bit is set, branch back to label 1 and schedule again */
#ifdef CONFIG_TRACE_IRQFLAGS
        /* And now, to properly rebalance the above, we tell lockdep they
         * are being turned back on, which will happen when we return
         */
        bl      trace_hardirqs_on
#endif
#endif /* CONFIG_PREEMPT */

The preempt_schedule_irq() function

  • Finally, let's look at the preempt_schedule_irq() function itself.
  • kernel/sched/core.c
     /*
      * this is the entry point to schedule() from kernel preemption
      * off of irq context.
      * Note, that this is called and return with irqs disabled. This will
      * protect us against recursive calling from irq.
      */
      /* This is the entry point for kernel preemption from interrupt context.
         Note: the function is called, and must return, with interrupts disabled, which prevents recursive calls caused by interrupts. */
     asmlinkage __visible void __sched preempt_schedule_irq(void)
     {
             enum ctx_state prev_state;
    
             /* Catch callers which need to be fixed */
             /* If the preemption count is non-zero or interrupts are enabled, crash outright: this function is
                only called at the very end of interrupt return, still in interrupt context, so if either
                condition is violated the system is definitely in trouble. */
             BUG_ON(preempt_count() || !irqs_disabled());
    
             prev_state = exception_enter();
    
             do {
                     preempt_disable();  /* disable preemption first */
                     local_irq_enable(); /* interrupts must be enabled right before schedule() */
                     __schedule(true);
                     local_irq_disable(); /* disable interrupts again right after schedule() returns */
                     sched_preempt_enable_no_resched(); /* re-enable preemption without triggering a reschedule */
             } while (need_resched());
    
             exception_exit(prev_state);
     }

References