mm: multigenerational lru: aging
To avoid confusion, the term "scan" will be applied to PTEs in a page
table and pages on an lru list. It emphasizes consecutive elements
in a set rather than the data structure holding this set together.

The aging produces young generations. Given an lruvec, it iterates
lruvec_memcg()->mm_list and calls walk_page_range() with each
mm_struct on this list to scan PTEs for accessed pages. On finding a
young PTE, it clears the accessed bit and updates the gen counter of
the page mapped by this PTE to (max_seq%MAX_NR_GENS)+1. After each
full iteration of this list, it increments max_seq. The aging must run
before the eviction can continue once max_seq-min_seq+1 reaches
MIN_NR_GENS.
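
As a minimal illustration of the bookkeeping above, the userspace
sketch below models only the seq arithmetic; it is not kernel code, and
the MAX_NR_GENS/MIN_NR_GENS values and the struct layout are
assumptions made for the example:

#include <stdbool.h>
#include <stdio.h>

#define MAX_NR_GENS	4	/* assumed value, for illustration only */
#define MIN_NR_GENS	2	/* assumed value, for illustration only */

struct lruvec_model {
	unsigned long max_seq;	/* seq of the youngest generation */
	unsigned long min_seq;	/* seq of the oldest generation */
};

/* gen counter written to a page whose PTE was found young */
static unsigned long young_gen(const struct lruvec_model *v)
{
	return (v->max_seq % MAX_NR_GENS) + 1;
}

/* the eviction must wait for the aging once too few generations remain */
static bool need_aging(const struct lruvec_model *v)
{
	return v->max_seq - v->min_seq + 1 <= MIN_NR_GENS;
}

int main(void)
{
	struct lruvec_model v = { .max_seq = 3, .min_seq = 2 };

	printf("young pages get gen %lu\n", young_gen(&v));
	if (need_aging(&v))
		v.max_seq++;	/* one full pass over mm_list completed */
	printf("after aging: max_seq=%lu min_seq=%lu\n", v.max_seq, v.min_seq);
	return 0;
}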

To avoid confusion, the terms "promotion" and "demotion" will be
applied to the multigenerational lru, as a new convention; the terms
"activation" and "deactivation" will be applied to the active/inactive
lru, as usual.

IOW, the aging promotes a page to the youngest generation when it
finds this page accessed through page tables; demotion happens
consequently when it creates a new generation. Note that promotion
doesn't require any lru list operations in the aging path, only the
update of the gen counter and the lru sizes; demotion, unless it is
the result of the creation of a new generation, requires lru list
operations, e.g., lru_deactivate_fn().
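
A minimal userspace sketch of in-place promotion, assuming the gen
counter occupies a bit field of a flags word (mirroring the
LRU_GEN_MASK/LRU_GEN_PGOFF layout added to mmzone.h below); the width,
offset and atomic primitive are illustrative choices, not the kernel's:

#include <stdatomic.h>
#include <stdio.h>

#define GEN_PGOFF	8	/* illustrative offset, not the kernel's */
#define GEN_WIDTH	3
#define GEN_MASK	(((1UL << GEN_WIDTH) - 1) << GEN_PGOFF)

struct page_model {
	_Atomic unsigned long flags;	/* stands in for page->flags */
};

/* promotion: rewrite the gen bits only; no lru list node is touched */
static void promote(struct page_model *page, unsigned long new_gen)
{
	unsigned long old, new;

	do {
		old = atomic_load(&page->flags);
		new = (old & ~GEN_MASK) | (new_gen << GEN_PGOFF);
	} while (!atomic_compare_exchange_weak(&page->flags, &old, new));
}

int main(void)
{
	struct page_model page = { .flags = 0 };

	promote(&page, 3);	/* e.g. (max_seq % MAX_NR_GENS) + 1 */
	printf("gen = %lu\n",
	       (atomic_load(&page.flags) & GEN_MASK) >> GEN_PGOFF);
	return 0;
}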

The aging uses the following optimizations when walking page tables:
1) It uses the accessed bit in non-leaf PMD entries, the hint from the
   CPU scheduler and the Bloom filters to reduce its search space (a
   Bloom-filter sketch follows this list).
2) It doesn't zigzag between a PGD table and the same PMD or PTE table
   spanning multiple VMAs. In other words, it finishes all the VMAs
   within the range of the same PMD or PTE table before it returns to
   a PGD table. This improves the cache performance for workloads that
   have large numbers of tiny VMAs, especially when
   CONFIG_PGTABLE_LEVELS=5.
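
The following userspace sketch illustrates the Bloom-filter idea from
optimization 1): remember which PMD entries were found to contain young
PTEs so that a later walk can revisit only those. The hash functions
and the filter size are arbitrary choices for the example, not what the
kernel uses:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FILTER_BITS	(1u << 15)	/* arbitrary size for the example */

static uint64_t filter[FILTER_BITS / 64];

static uint32_t hash1(uintptr_t x) { return (uint32_t)(x * 2654435761u) % FILTER_BITS; }
static uint32_t hash2(uintptr_t x) { return (uint32_t)(x * 40503u + 1) % FILTER_BITS; }

static void set_bit_at(uint32_t i)  { filter[i / 64] |= 1ULL << (i % 64); }
static bool test_bit_at(uint32_t i) { return filter[i / 64] & (1ULL << (i % 64)); }

/* record a PMD entry (identified by its address) as containing young PTEs */
static void filter_add(uintptr_t pmd)
{
	set_bit_at(hash1(pmd));
	set_bit_at(hash2(pmd));
}

/* false positives are possible, false negatives are not */
static bool filter_maybe_contains(uintptr_t pmd)
{
	return test_bit_at(hash1(pmd)) && test_bit_at(hash2(pmd));
}

int main(void)
{
	filter_add((uintptr_t)0x7f0000001000ull);
	printf("rescan known PMD: %d\n",
	       filter_maybe_contains((uintptr_t)0x7f0000001000ull));
	printf("skip unseen PMD:  %d\n",
	       !filter_maybe_contains((uintptr_t)0x7f0000002000ull));
	return 0;
}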

The aging is only interested in accessed pages and therefore has a
complexity of O(nr_hot_evictable_pages). The worst-case scenario is
that the aging fails to exploit any spatial locality and the eviction
has to promote all accessed pages when walking the rmap, which is
similar to the active/inactive lru. However, generations can still
provide better temporal locality.

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
yuzhaogoogle authored and xanmod committed Jan 12, 2022
1 parent 4dccd52 commit 22bfc00
Showing 8 changed files with 946 additions and 2 deletions.
6 changes: 6 additions & 0 deletions include/linux/memcontrol.h
@@ -1393,18 +1393,24 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)

static inline void lock_page_memcg(struct page *page)
{
	/* to match folio_memcg_rcu() */
	rcu_read_lock();
}

static inline void unlock_page_memcg(struct page *page)
{
	rcu_read_unlock();
}

static inline void folio_memcg_lock(struct folio *folio)
{
	/* to match folio_memcg_rcu() */
	rcu_read_lock();
}

static inline void folio_memcg_unlock(struct folio *folio)
{
	rcu_read_unlock();
}

static inline void mem_cgroup_handle_over_high(void)
5 changes: 5 additions & 0 deletions include/linux/mm.h
@@ -1599,6 +1599,11 @@ static inline unsigned long folio_pfn(struct folio *folio)
	return page_to_pfn(&folio->page);
}

static inline struct folio *pfn_folio(unsigned long pfn)
{
	return page_folio(pfn_to_page(pfn));
}

/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
#ifdef CONFIG_MIGRATION
static inline bool is_pinnable_page(struct page *page)
10 changes: 10 additions & 0 deletions include/linux/mmzone.h
@@ -304,6 +304,7 @@ enum lruvec_flags {
};

struct lruvec;
struct page_vma_mapped_walk;

#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
@@ -410,6 +411,7 @@ struct lru_gen_mm_walk {
};

void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);

#ifdef CONFIG_MEMCG
void lru_gen_init_memcg(struct mem_cgroup *memcg);
@@ -422,6 +424,10 @@ static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
{
}

static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
}

#ifdef CONFIG_MEMCG
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
@@ -1048,6 +1054,10 @@ typedef struct pglist_data {

	unsigned long flags;

#ifdef CONFIG_LRU_GEN
	/* kswap mm walk data */
	struct lru_gen_mm_walk mm_walk;
#endif
	ZONE_PADDING(_pad2_)

	/* Per-node vmstats */
16 changes: 16 additions & 0 deletions include/linux/oom.h
@@ -57,6 +57,22 @@ struct oom_control {
extern struct mutex oom_lock;
extern struct mutex oom_adj_mutex;

#ifdef CONFIG_MMU
extern struct task_struct *oom_reaper_list;
extern struct wait_queue_head oom_reaper_wait;

static inline bool oom_reaping_in_progress(void)
{
	/* a racy check can be used to reduce the chance of overkilling */
	return READ_ONCE(oom_reaper_list) || !waitqueue_active(&oom_reaper_wait);
}
#else
static inline bool oom_reaping_in_progress(void)
{
	return false;
}
#endif

static inline void set_current_oom_origin(void)
{
	current->signal->oom_flag_origin = true;
4 changes: 4 additions & 0 deletions include/linux/swap.h
@@ -137,6 +137,10 @@ union swap_header {
*/
struct reclaim_state {
	unsigned long reclaimed_slab;
#ifdef CONFIG_LRU_GEN
	/* per-thread mm walk data */
	struct lru_gen_mm_walk *mm_walk;
#endif
};

#ifdef __KERNEL__
4 changes: 2 additions & 2 deletions mm/oom_kill.c
@@ -508,8 +508,8 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
* victim (if that is possible) to help the OOM killer to move on.
*/
static struct task_struct *oom_reaper_th;
-static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
-static struct task_struct *oom_reaper_list;
+DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);

bool __oom_reap_task_mm(struct mm_struct *mm)
7 changes: 7 additions & 0 deletions mm/rmap.c
@@ -73,6 +73,7 @@
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/mm_inline.h>

#include <asm/tlbflush.h>

@@ -790,6 +791,12 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
		}

		if (pvmw.pte) {
			if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
			    !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
				lru_gen_look_around(&pvmw);
				referenced++;
			}

			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte)) {
				/*
