Skip to content
Browse files

x86: reserve room for per-thread page tables

Currently page tables have to be re-computed in
an expensive operation on context switch. Here we
reserve some room in the page tables such that
we can have per-thread page table data, which will
be much simpler to update on context switch at
the expense of memory.

Signed-off-by: Andrew Boie <>
  • Loading branch information...
andrewboie authored and carlescufi committed Jul 25, 2019
1 parent 76310f6 commit 26dccaabcbf63839693e9e14e63b5d2f0a72f5fb
Showing with 116 additions and 26 deletions.
  1. +2 −2 arch/x86/core/ia32/thread.c
  2. +11 −3 arch/x86/include/ia32/mmustructs.h
  3. +103 −21 include/arch/x86/ia32/arch.h
@@ -79,8 +79,8 @@ void z_new_thread(struct k_thread *thread, k_thread_stack_t *stack,
#endif /* CONFIG_X86_USERSPACE */

z_x86_mmu_set_flags(&z_x86_kernel_pdpt, stack, MMU_PAGE_SIZE,
z_x86_mmu_set_flags(&z_x86_kernel_pdpt, stack + Z_X86_THREAD_PT_AREA,

stack_high = (char *)STACK_ROUND_DOWN(stack_buf + stack_size);
@@ -476,13 +476,21 @@ union x86_mmu_pte {

#define Z_X86_NUM_PDPT_ENTRIES 4
#define Z_X86_NUM_PD_ENTRIES 512
#define Z_X86_NUM_PT_ENTRIES 512

/* Memory range covered by an instance of various table types */
#define Z_X86_PD_AREA (Z_X86_PT_AREA * Z_X86_NUM_PD_ENTRIES)

typedef u64_t x86_page_entry_data_t;

typedef x86_page_entry_data_t k_mem_partition_attr_t;

struct x86_mmu_pdpt {
union x86_mmu_pdpte entry[4];
union x86_mmu_pdpte entry[Z_X86_NUM_PDPT_ENTRIES];

union x86_mmu_pde {
@@ -491,11 +499,11 @@ union x86_mmu_pde {

struct x86_mmu_pd {
union x86_mmu_pde entry[512];
union x86_mmu_pde entry[Z_X86_NUM_PD_ENTRIES];

struct x86_mmu_pt {
union x86_mmu_pte entry[512];
union x86_mmu_pte entry[Z_X86_NUM_PT_ENTRIES];

#endif /* _ASMLANGUAGE */
@@ -22,6 +22,7 @@
#include <ia32/mmustructs.h>
#include <stdbool.h>
#include <arch/common/ffs.h>
#include <misc/util.h>

#include <arch/common/addr_types.h>
@@ -571,41 +572,122 @@ extern u32_t z_timer_cycle_get_32(void);
extern struct task_state_segment _main_tss;

/* We need a set of page tables for each thread in the system which runs in
* user mode. For each thread, we have:
* - a toplevel PDPT
* - a set of page directories for the memory range covered by system RAM
* - a set of page tables for the memory range covered by system RAM
* Directories and tables for memory ranges outside of system RAM will be
* shared and not thread-specific.
* NOTE: We are operating under the assumption that memory domain partitions
* will not be configured which grant permission to address ranges outside
* of system RAM.
* Each of these page tables will be programmed to reflect the memory
* permission policy for that thread, which will be the union of:
* - The boot time memory regions (text, rodata, and so forth)
* - The thread's stack buffer
* - Partitions in the memory domain configuration (if a member of a
* memory domain)
* The PDPT is a fairly small singleton on x86 PAE (32 bytes) and also must
* be aligned to 32 bytes, so we place it at the highest addresses of the
* page reserved for the privilege elevation stack.
* The page directories and tables require page alignment so we put them as
* additional fields in the stack object, using the below macros to compute how
* many pages we need.

/* Define a range [Z_X86_PT_START, Z_X86_PT_END) which is the memory range
* covered by all the page tables needed for system RAM
#define Z_X86_PT_END ((u32_t)ROUND_UP(DT_PHYS_RAM_ADDR + \
(DT_RAM_SIZE * 1024U), \

/* Number of page tables needed to cover system RAM. Depends on the specific
* bounds of system RAM, but roughly 1 page table per 2MB of RAM */
#define Z_X86_NUM_PT ((Z_X86_PT_END - Z_X86_PT_START) / Z_X86_PT_AREA)

/* Same semantics as above, but for the page directories needed to cover
* system RAM.
#define Z_X86_PD_END ((u32_t)ROUND_UP(DT_PHYS_RAM_ADDR + \
(DT_RAM_SIZE * 1024U), \
/* Number of page directories needed to cover system RAM. Depends on the
* specific bounds of system RAM, but roughly 1 page directory per 1GB of RAM */
#define Z_X86_NUM_PD ((Z_X86_PD_END - Z_X86_PD_START) / Z_X86_PD_AREA)

/* Number of pages we need to reserve in the stack for per-thread page tables */
#define Z_X86_NUM_TABLE_PAGES (Z_X86_NUM_PT + Z_X86_NUM_PD)
/* If we're not implementing user mode, then the MMU tables don't get changed
* on context switch and we don't need any per-thread page tables
#define Z_X86_NUM_TABLE_PAGES 0U


/* With both hardware stack protection and userspace enabled, stacks are
* arranged as follows:
* High memory addresses
* +---------------+
* | Thread stack |
* +---------------+
* | Kernel stack |
* +---------------+
* | Guard page |
* +---------------+
* +-----------------------------------------+
* | Thread stack (varies) |
* +-----------------------------------------+
* | PDPT (32 bytes) |
* | Privilege elevation stack (4064 bytes) |
* +-----------------------------------------+
* | Guard page (4096 bytes) |
* +-----------------------------------------+
* | User page tables (Z_X86_THREAD_PT_AREA) |
* +-----------------------------------------+
* Low Memory addresses
* Kernel stacks are fixed at 4K. All the pages containing the thread stack
* are marked as user-accessible.
* All threads start in supervisor mode, and the kernel stack/guard page
* are both marked non-present in the MMU.
* If a thread drops down to user mode, the kernel stack page will be marked
* as present, supervisor-only, and the _main_tss.esp0 field updated to point
* to the top of it.
* All context switches will save/restore the esp0 field in the TSS.
* Privilege elevation stacks are fixed-size. All the pages containing the
* thread stack are marked as user-accessible. The guard page is marked
* read-only to catch stack overflows in supervisor mode.
* If a thread starts in supervisor mode, the page containing the PDPT and
* privilege elevation stack is also marked read-only.
* If a thread starts in, or drops down to user mode, the privilege stack page
* will be marked as present, supervisor-only. The PDPT will be initialized and
* used as the active page tables when that thread is active.
* If KPTI is not enabled, the _main_tss.esp0 field will always be updated
* to point to the top of the privilege elevation stack. Otherwise
* _main_tss.esp0 always points to the trampoline stack, which handles the
* page table switch to the kernel PDPT and transplants context to the
* privileged mode stack.
* TODO: The stack object layout is getting rather complex. We should define
* its layout in a struct definition, rather than doing math in the kernel
* code to find the parts we want or to obtain sizes.
/* If only one of HW stack protection or userspace is enabled, then the
* stack will be preceded by one page which is a guard page or a kernel mode
* stack, respectively.
#else /* Neither feature */


0 comments on commit 26dccaa

Please sign in to comment.
You can’t perform that action at this time.