Skip to content

Commit

Permalink
mm: multi-gen LRU: support page table walks
Browse files Browse the repository at this point in the history
To further exploit spatial locality, the aging prefers to walk page
tables to search for young PTEs and promote hot pages. A kill switch
will be added in the next patch to disable this behavior. When
disabled, the aging relies on the rmap only.

NB: this behavior has nothing similar with the page table scanning in
the 2.4 kernel [1], which searches page tables for old PTEs, adds cold
pages to swapcache and unmaps them.

To avoid confusion, the term "iteration" specifically means the
traversal of an entire mm_struct list; the term "walk" will be applied
to page tables and the rmap, as usual.

An mm_struct list is maintained for each memcg, and an mm_struct
follows its owner task to the new memcg when this task is migrated.
Given an lruvec, the aging iterates lruvec_memcg()->mm_list and calls
walk_page_range() with each mm_struct on this list to promote hot
pages before it increments max_seq.

When multiple page table walkers iterate the same list, each of them
gets a unique mm_struct; therefore they can run concurrently. Page
table walkers ignore any misplaced pages, e.g., if an mm_struct was
migrated, pages it left in the previous memcg will not be promoted
when its current memcg is under reclaim. Similarly, page table walkers
will not promote pages from nodes other than the one under reclaim.

This patch uses the following optimizations when walking page tables:
1. It tracks the usage of mm_struct's between context switches so that
   page table walkers can skip processes that have been sleeping since
   the last iteration.
2. It uses generational Bloom filters to record populated branches so
   that page table walkers can reduce their search space based on the
   query results, e.g., to skip page tables containing mostly holes or
   misplaced pages.
3. It takes advantage of the accessed bit in non-leaf PMD entries when
   CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
4. It does not zigzag between a PGD table and the same PMD table
   spanning multiple VMAs. IOW, it finishes all the VMAs within the
   range of the same PMD table before it returns to a PGD table. This
   improves the cache performance for workloads that have large
   numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5.

Server benchmark results:
  Single workload:
    fio (buffered I/O): no change

  Single workload:
    memcached (anon): +[5.5, 7.5]%
                Ops/sec      KB/sec
      patch1-6: 1015292.83   39490.38
      patch1-7: 1080856.82   42040.53

  Configurations:
    no change

Client benchmark results:
  kswapd profiles:
    patch1-6
      45.49%  lzo1x_1_do_compress (real work)
       7.38%  page_vma_mapped_walk
       7.24%  _raw_spin_unlock_irq
       2.64%  ptep_clear_flush
       2.31%  __zram_bvec_write
       2.13%  do_raw_spin_lock
       2.09%  lru_gen_look_around
       1.89%  free_unref_page_list
       1.85%  memmove
       1.74%  obj_malloc

    patch1-7
      47.73%  lzo1x_1_do_compress (real work)
       6.84%  page_vma_mapped_walk
       6.14%  _raw_spin_unlock_irq
       2.86%  walk_pte_range
       2.79%  ptep_clear_flush
       2.24%  __zram_bvec_write
       2.10%  do_raw_spin_lock
       1.94%  free_unref_page_list
       1.80%  memmove
       1.75%  obj_malloc

  Configurations:
    no change

[1] https://lwn.net/Articles/23732/
[2] https://source.android.com/devices/tech/debug/scudo

Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
  • Loading branch information
yuzhaogoogle authored and xanmod committed Mar 21, 2022
1 parent 1f5e856 commit 869c1f3
Show file tree
Hide file tree
Showing 10 changed files with 1,129 additions and 13 deletions.
2 changes: 2 additions & 0 deletions fs/exec.c
Expand Up @@ -1006,6 +1006,7 @@ static int exec_mmap(struct mm_struct *mm)
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
lru_gen_add_mm(mm);
/*
* This prevents preemption while active_mm is being loaded and
* it and mm are being updated, which could cause problems for
Expand All @@ -1018,6 +1019,7 @@ static int exec_mmap(struct mm_struct *mm)
activate_mm(active_mm, mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
lru_gen_use_mm(mm);
tsk->mm->vmacache_seqnum = 0;
vmacache_flush(tsk);
task_unlock(tsk);
Expand Down
5 changes: 5 additions & 0 deletions include/linux/memcontrol.h
Expand Up @@ -343,6 +343,11 @@ struct mem_cgroup {
struct deferred_split deferred_split_queue;
#endif

#ifdef CONFIG_LRU_GEN
/* per-memcg mm_struct list */
struct lru_gen_mm_list mm_list;
#endif

struct mem_cgroup_per_node *nodeinfo[];
};

Expand Down
78 changes: 78 additions & 0 deletions include/linux/mm_types.h
Expand Up @@ -3,6 +3,7 @@
#define _LINUX_MM_TYPES_H

#include <linux/mm_types_task.h>
#include <linux/sched.h>

#include <linux/auxvec.h>
#include <linux/kref.h>
Expand All @@ -17,6 +18,8 @@
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
#include <linux/nodemask.h>
#include <linux/mmdebug.h>

#include <asm/mmu.h>

Expand Down Expand Up @@ -637,6 +640,22 @@ struct mm_struct {
#ifdef CONFIG_IOMMU_SUPPORT
u32 pasid;
#endif
#ifdef CONFIG_LRU_GEN
struct {
/* this mm_struct is on lru_gen_mm_list */
struct list_head list;
#ifdef CONFIG_MEMCG
/* points to the memcg of "owner" above */
struct mem_cgroup *memcg;
#endif
/*
* Set when switching to this mm_struct, as a hint of
* whether it has been used since the last time per-node
* page table walkers cleared the corresponding bits.
*/
nodemask_t nodes;
} lru_gen;
#endif /* CONFIG_LRU_GEN */
} __randomize_layout;

/*
Expand All @@ -663,6 +682,65 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
return (struct cpumask *)&mm->cpu_bitmap;
}

#ifdef CONFIG_LRU_GEN

struct lru_gen_mm_list {
/* mm_struct list for page table walkers */
struct list_head fifo;
/* protects the list above */
spinlock_t lock;
};

void lru_gen_add_mm(struct mm_struct *mm);
void lru_gen_del_mm(struct mm_struct *mm);
#ifdef CONFIG_MEMCG
void lru_gen_migrate_mm(struct mm_struct *mm);
#endif

static inline void lru_gen_init_mm(struct mm_struct *mm)
{
INIT_LIST_HEAD(&mm->lru_gen.list);
#ifdef CONFIG_MEMCG
mm->lru_gen.memcg = NULL;
#endif
nodes_clear(mm->lru_gen.nodes);
}

static inline void lru_gen_use_mm(struct mm_struct *mm)
{
/* unlikely but not a bug when racing with lru_gen_migrate_mm() */
VM_WARN_ON(list_empty(&mm->lru_gen.list));

if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lru_gen.nodes))
nodes_setall(mm->lru_gen.nodes);
}

#else /* !CONFIG_LRU_GEN */

static inline void lru_gen_add_mm(struct mm_struct *mm)
{
}

static inline void lru_gen_del_mm(struct mm_struct *mm)
{
}

#ifdef CONFIG_MEMCG
static inline void lru_gen_migrate_mm(struct mm_struct *mm)
{
}
#endif

static inline void lru_gen_init_mm(struct mm_struct *mm)
{
}

static inline void lru_gen_use_mm(struct mm_struct *mm)
{
}

#endif /* CONFIG_LRU_GEN */

struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
Expand Down
58 changes: 58 additions & 0 deletions include/linux/mmzone.h
Expand Up @@ -411,6 +411,58 @@ struct lru_gen_struct {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
};

enum {
MM_PTE_TOTAL, /* total leaf entries */
MM_PTE_OLD, /* old leaf entries */
MM_PTE_YOUNG, /* young leaf entries */
MM_PMD_TOTAL, /* total non-leaf entries */
MM_PMD_FOUND, /* non-leaf entries found in Bloom filters */
MM_PMD_ADDED, /* non-leaf entries added to Bloom filters */
NR_MM_STATS
};

/* mnemonic codes for the mm stats above */
#define MM_STAT_CODES "toydfa"

/* double-buffering Bloom filters */
#define NR_BLOOM_FILTERS 2

struct lru_gen_mm_state {
/* set to max_seq after each iteration */
unsigned long seq;
/* where the current iteration starts (inclusive) */
struct list_head *head;
/* where the last iteration ends (exclusive) */
struct list_head *tail;
/* to wait for the last page table walker to finish */
struct wait_queue_head wait;
/* Bloom filters flip after each iteration */
unsigned long *filters[NR_BLOOM_FILTERS];
/* the mm stats for debugging */
unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
/* the number of concurrent page table walkers */
int nr_walkers;
};

struct lru_gen_mm_walk {
/* the lruvec under reclaim */
struct lruvec *lruvec;
/* unstable max_seq from lru_gen_struct */
unsigned long max_seq;
/* the next address within an mm to scan */
unsigned long next_addr;
/* to batch page table entries */
unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
/* to batch promoted pages */
int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* to batch the mm stats */
int mm_stats[NR_MM_STATS];
/* total batched items */
int batched;
bool can_swap;
bool full_scan;
};

void lru_gen_init_lruvec(struct lruvec *lruvec);
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);

Expand Down Expand Up @@ -461,6 +513,8 @@ struct lruvec {
#ifdef CONFIG_LRU_GEN
/* evictable pages divided into generations */
struct lru_gen_struct lrugen;
/* to concurrently iterate lru_gen_mm_list */
struct lru_gen_mm_state mm_state;
#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
Expand Down Expand Up @@ -1053,6 +1107,10 @@ typedef struct pglist_data {

unsigned long flags;

#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
#endif
ZONE_PADDING(_pad2_)

/* Per-node vmstats */
Expand Down
4 changes: 4 additions & 0 deletions include/linux/swap.h
Expand Up @@ -137,6 +137,10 @@ union swap_header {
*/
struct reclaim_state {
unsigned long reclaimed_slab;
#ifdef CONFIG_LRU_GEN
/* per-thread mm walk data */
struct lru_gen_mm_walk *mm_walk;
#endif
};

#ifdef __KERNEL__
Expand Down
1 change: 1 addition & 0 deletions kernel/exit.c
Expand Up @@ -463,6 +463,7 @@ void mm_update_next_owner(struct mm_struct *mm)
goto retry;
}
WRITE_ONCE(mm->owner, c);
lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}
Expand Down
9 changes: 9 additions & 0 deletions kernel/fork.c
Expand Up @@ -1079,6 +1079,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
goto fail_nocontext;

mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;

fail_nocontext:
Expand Down Expand Up @@ -1121,6 +1122,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
lru_gen_del_mm(mm);
mmdrop(mm);
}

Expand Down Expand Up @@ -2586,6 +2588,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}

if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
task_unlock(p);
}

wake_up_new_task(p);

/* forking complete and child started to run, tell ptracer */
Expand Down
1 change: 1 addition & 0 deletions kernel/sched/core.c
Expand Up @@ -4979,6 +4979,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
lru_gen_use_mm(next->mm);

if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
Expand Down
24 changes: 24 additions & 0 deletions mm/memcontrol.c
Expand Up @@ -6155,6 +6155,29 @@ static void mem_cgroup_move_task(void)
}
#endif

#ifdef CONFIG_LRU_GEN
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
struct task_struct *task = NULL;

cgroup_taskset_for_each_leader(task, css, tset)
break;

if (!task)
return;

task_lock(task);
if (task->mm && task->mm->owner == task)
lru_gen_migrate_mm(task->mm);
task_unlock(task);
}
#else
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
}
#endif /* CONFIG_LRU_GEN */

static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
if (value == PAGE_COUNTER_MAX)
Expand Down Expand Up @@ -6500,6 +6523,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_reset = mem_cgroup_css_reset,
.css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
.attach = mem_cgroup_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
.dfl_cftypes = memory_files,
Expand Down

0 comments on commit 869c1f3

Please sign in to comment.