mm: multigenerational lru: groundwork

For each lruvec, evictable pages are divided into multiple
generations. The youngest generation number is stored in
lrugen->max_seq for both anon and file types as they are aged on an
equal footing. The oldest generation numbers are stored in
lrugen->min_seq[2] separately for anon and file types as clean file
pages can be evicted regardless of may_swap or may_writepage. These
three variables are monotonically increasing. Generation numbers are
truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
page->flags. The sliding window technique is used to prevent truncated
generation numbers from overlapping. Each truncated generation number
is an index to
lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]. Evictable
pages are added to the per-zone lists indexed by lrugen->max_seq or
lrugen->min_seq[2] (modulo MAX_NR_GENS), depending on their types.
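
As a concrete illustration of this encoding, the following is a minimal
userspace sketch; the constants and the bit offset are assumptions
chosen for this model, not the patch's actual configuration:

  /* userspace model of the generation counter in page->flags */
  #include <assert.h>
  #include <stdio.h>

  #define MAX_NR_GENS   4UL  /* stands in for CONFIG_NR_LRU_GENS */
  #define LRU_GEN_WIDTH 3    /* order_base_2(MAX_NR_GENS + 1) */
  #define LRU_GEN_PGOFF 8    /* arbitrary offset for this model */
  #define LRU_GEN_MASK  (((1UL << LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)

  /* a truncated generation number is the sequence number modulo MAX_NR_GENS */
  static unsigned long lru_gen_from_seq(unsigned long seq)
  {
          return seq % MAX_NR_GENS;
  }

  int main(void)
  {
          unsigned long flags = 0;
          unsigned long gen = lru_gen_from_seq(5); /* e.g., max_seq == 5 */

          /* store gen + 1 while the page is on a list; 0 means not on a list */
          flags = (flags & ~LRU_GEN_MASK) | ((gen + 1) << LRU_GEN_PGOFF);
          assert(((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1 == gen);
          printf("seq 5 maps to gen %lu\n", gen);
          return 0;
  }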

Each generation is then divided into multiple tiers. Tiers represent
levels of usage from file descriptors only. Pages accessed N times via
file descriptors belong to tier order_base_2(N). Each generation
contains at most MAX_NR_TIERS tiers, and they require MAX_NR_TIERS-2
additional bits in page->flags. In contrast to moving across
generations, which requires the lru lock for the list operations,
moving across tiers only involves an atomic operation on page->flags
and therefore has a negligible cost. A feedback loop modeled after the
PID controller monitors the refault rates across all tiers and decides
from which tiers to activate pages in the reclaim path.
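
A rough sketch of the tier mapping follows; order_base_2() here is a
userspace stand-in for the kernel macro, and saturating at
MAX_NR_TIERS-1 is an assumption of this model:

  #include <stdio.h>

  #define MAX_NR_TIERS 4U /* stands in for CONFIG_TIERS_PER_GEN */

  /* ceil(log2(n)) for n >= 1, matching order_base_2() for these inputs */
  static unsigned int order_base_2(unsigned int n)
  {
          unsigned int order = 0;

          while ((1U << order) < n)
                  order++;
          return order;
  }

  int main(void)
  {
          /* pages accessed n times via file descriptors map to tier order_base_2(n) */
          for (unsigned int n = 1; n <= 8; n++) {
                  unsigned int tier = order_base_2(n);

                  if (tier > MAX_NR_TIERS - 1)
                          tier = MAX_NR_TIERS - 1;
                  printf("N=%u -> tier %u\n", n, tier);
          }
          return 0;
  }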

The framework comprises two conceptually independent components: the
aging and the eviction, which can be invoked separately from user
space for the purpose of working set estimation and proactive reclaim.

The aging produces young generations. Given an lruvec, the aging scans
page tables for referenced pages of this lruvec. Upon finding one, the
aging updates its generation number to max_seq. After each round of
scanning, the aging increments max_seq. The aging is due when both
min_seq[2] values reach max_seq-1, assuming both anon and file types
are reclaimable.
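
A minimal sketch of this "aging is due" condition, with invented names
(the patch computes this inside the aging itself):

  #include <stdbool.h>

  struct lrugen_model {
          unsigned long max_seq;
          unsigned long min_seq[2]; /* [0]: anon, [1]: file */
  };

  /*
   * The aging is due once every reclaimable type has caught up to
   * max_seq - 1, i.e., no generation older than that is left to evict.
   */
  static bool aging_is_due(const struct lrugen_model *lrugen,
                           const bool can_reclaim[2])
  {
          for (int type = 0; type < 2; type++) {
                  if (can_reclaim[type] &&
                      lrugen->min_seq[type] < lrugen->max_seq - 1)
                          return false;
          }
          return true;
  }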

The eviction consumes old generations. Given an lruvec, the eviction
scans the pages on the per-zone lists indexed by either of min_seq[2].
It tries to select a type based on the values of min_seq[2] and
swappiness. During a scan, the eviction sorts pages according to their
new generation numbers if the aging has found them referenced. When it
finds that all the per-zone lists of the selected type are empty, the
eviction increments the min_seq[2] entry indexed by that type.
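
A sketch of this eviction-side min_seq bookkeeping; list emptiness is
modeled with per-generation page counts, and all names are illustrative:

  #include <stdbool.h>

  #define MAX_NR_GENS 4UL

  struct evict_model {
          unsigned long min_seq;
          unsigned long nr_pages[MAX_NR_GENS]; /* per-generation page counts */
  };

  /*
   * Once every per-zone list of the oldest generation of the selected
   * type is empty, the eviction retires that generation by moving
   * min_seq forward.
   */
  static bool try_inc_min_seq(struct evict_model *type)
  {
          unsigned long gen = type->min_seq % MAX_NR_GENS;

          if (type->nr_pages[gen])
                  return false;
          type->min_seq++;
          return true;
  }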

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
yuzhaogoogle authored and xanmod committed Jun 28, 2021
1 parent d4b7b40 commit 7c0e13b
Showing 12 changed files with 644 additions and 11 deletions.
3 changes: 2 additions & 1 deletion fs/fuse/dev.c
@@ -784,7 +784,8 @@ static int fuse_check_page(struct page *page)
1 << PG_lru |
1 << PG_active |
1 << PG_reclaim |
1 << PG_waiters))) {
1 << PG_waiters |
LRU_GEN_MASK | LRU_USAGE_MASK))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
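
The hunk above extends fuse's sanity check so that a page still
carrying multigenerational lru state is never stolen. A self-contained
model of that check, with assumed bit widths and offsets:

  /* userspace model; the real masks are derived from Kconfig values */
  #define LRU_GEN_PGOFF   8
  #define LRU_GEN_WIDTH   3
  #define LRU_USAGE_WIDTH 2
  #define LRU_USAGE_PGOFF (LRU_GEN_PGOFF - LRU_USAGE_WIDTH)
  #define LRU_GEN_MASK    (((1UL << LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
  #define LRU_USAGE_MASK  (((1UL << LRU_USAGE_WIDTH) - 1) << LRU_USAGE_PGOFF)

  /* a page with generation or usage bits set is still tracked by the lru */
  static int page_still_on_lrugen(unsigned long flags)
  {
          return !!(flags & (LRU_GEN_MASK | LRU_USAGE_MASK));
  }
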
2 changes: 2 additions & 0 deletions include/linux/mm.h
@@ -1089,6 +1089,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
#define LRU_USAGE_PGOFF (LRU_GEN_PGOFF - LRU_USAGE_WIDTH)

/*
* Define the bit shifts to access each section. For non-existent
194 changes: 194 additions & 0 deletions include/linux/mm_inline.h
@@ -79,11 +79,199 @@ static __always_inline enum lru_list page_lru(struct page *page)
return lru;
}

#ifdef CONFIG_LRU_GEN

#ifdef CONFIG_LRU_GEN_ENABLED
DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);

static inline bool lru_gen_enabled(void)
{
return static_branch_likely(&lru_gen_static_key);
}
#else
DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);

static inline bool lru_gen_enabled(void)
{
return static_branch_unlikely(&lru_gen_static_key);
}
#endif

/* We track at most MAX_NR_GENS generations using the sliding window technique. */
static inline int lru_gen_from_seq(unsigned long seq)
{
return seq % MAX_NR_GENS;
}

/* Return a proper index regardless of whether we keep a full history of stats. */
static inline int hist_from_seq_or_gen(int seq_or_gen)
{
return seq_or_gen % NR_STAT_GENS;
}

/* The youngest and the second youngest generations are counted as active. */
static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq);

VM_BUG_ON(!max_seq);
VM_BUG_ON(gen >= MAX_NR_GENS);

return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
}

/* Update the sizes of the multigenerational lru lists. */
static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
int old_gen, int new_gen)
{
int type = page_is_file_lru(page);
int zone = page_zonenum(page);
int delta = thp_nr_pages(page);
enum lru_list lru = type * LRU_FILE;
struct lrugen *lrugen = &lruvec->evictable;

lockdep_assert_held(&lruvec->lru_lock);
VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
VM_BUG_ON(old_gen == -1 && new_gen == -1);

if (old_gen >= 0)
WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
lrugen->sizes[old_gen][type][zone] - delta);
if (new_gen >= 0)
WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
lrugen->sizes[new_gen][type][zone] + delta);

if (old_gen < 0) {
if (lru_gen_is_active(lruvec, new_gen))
lru += LRU_ACTIVE;
update_lru_size(lruvec, lru, zone, delta);
return;
}

if (new_gen < 0) {
if (lru_gen_is_active(lruvec, old_gen))
lru += LRU_ACTIVE;
update_lru_size(lruvec, lru, zone, -delta);
return;
}

if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
update_lru_size(lruvec, lru, zone, -delta);
update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
}

VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}

/* Add a page to one of the multigenerational lru lists. Return true on success. */
static inline bool lru_gen_addition(struct page *page, struct lruvec *lruvec, bool front)
{
int gen;
unsigned long old_flags, new_flags;
int type = page_is_file_lru(page);
int zone = page_zonenum(page);
struct lrugen *lrugen = &lruvec->evictable;

if (PageUnevictable(page) || !lrugen->enabled[type])
return false;
/*
* If a page is being faulted in, add it to the youngest generation.
* try_walk_mm_list() may look at the size of the youngest generation to
* determine if the aging is due.
*
* If a page can't be evicted immediately, i.e., an anon page not in
* swap cache, a dirty file page under reclaim, or a page rejected by
* evict_pages() due to races, dirty buffer heads, etc., add it to the
* second oldest generation.
*
* If a page could be evicted immediately, i.e., a clean file page, add
* it to the oldest generation.
*/
if (PageActive(page))
gen = lru_gen_from_seq(lrugen->max_seq);
else if ((!type && !PageSwapCache(page)) ||
(PageReclaim(page) && (PageDirty(page) || PageWriteback(page))) ||
(!PageReferenced(page) && PageWorkingset(page)))
gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
else
gen = lru_gen_from_seq(lrugen->min_seq[type]);

do {
old_flags = READ_ONCE(page->flags);
VM_BUG_ON_PAGE(old_flags & LRU_GEN_MASK, page);

new_flags = (old_flags & ~(LRU_GEN_MASK | BIT(PG_active))) |
((gen + 1UL) << LRU_GEN_PGOFF);
/* see the comment in evict_pages() */
if (!(old_flags & BIT(PG_referenced)))
new_flags &= ~(LRU_USAGE_MASK | LRU_TIER_FLAGS);
} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);

lru_gen_update_size(page, lruvec, -1, gen);
if (front)
list_add(&page->lru, &lrugen->lists[gen][type][zone]);
else
list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);

return true;
}

/* Delete a page from one of the multigenerational lru lists. Return true on success. */
static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
{
int gen;
unsigned long old_flags, new_flags;

do {
old_flags = READ_ONCE(page->flags);
if (!(old_flags & LRU_GEN_MASK))
return false;

VM_BUG_ON_PAGE(PageActive(page), page);
VM_BUG_ON_PAGE(PageUnevictable(page), page);

gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;

new_flags = old_flags & ~LRU_GEN_MASK;
/* mark page active accordingly */
if (lru_gen_is_active(lruvec, gen))
new_flags |= BIT(PG_active);
} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);

lru_gen_update_size(page, lruvec, gen, -1);
list_del(&page->lru);

return true;
}

#else /* CONFIG_LRU_GEN */

static inline bool lru_gen_enabled(void)
{
return false;
}

static inline bool lru_gen_addition(struct page *page, struct lruvec *lruvec, bool front)
{
return false;
}

static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
{
return false;
}

#endif /* CONFIG_LRU_GEN */

static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec)
{
enum lru_list lru = page_lru(page);

if (lru_gen_addition(page, lruvec, true))
return;

update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add(&page->lru, &lruvec->lists[lru]);
}
@@ -93,13 +281,19 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
{
enum lru_list lru = page_lru(page);

if (lru_gen_addition(page, lruvec, false))
return;

update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add_tail(&page->lru, &lruvec->lists[lru]);
}

static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec)
{
if (lru_gen_deletion(page, lruvec))
return;

list_del(&page->lru);
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-thp_nr_pages(page));
106 changes: 106 additions & 0 deletions include/linux/mmzone.h
@@ -293,6 +293,108 @@ enum lruvec_flags {
*/
};

struct lruvec;

#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_USAGE_MASK ((BIT(LRU_USAGE_WIDTH) - 1) << LRU_USAGE_PGOFF)

#ifdef CONFIG_LRU_GEN

/*
* For each lruvec, evictable pages are divided into multiple generations. The
* youngest and the oldest generation numbers, AKA max_seq and min_seq, are
* monotonically increasing. The sliding window technique is used to track at
* most MAX_NR_GENS and at least MIN_NR_GENS generations. An offset within the
* window, AKA gen, indexes an array of per-type and per-zone lists for the
* corresponding generation. The counter in page->flags stores gen+1 while a
* page is on one of the multigenerational lru lists. Otherwise, it stores 0.
*/
#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)

/*
* Each generation is then divided into multiple tiers. Tiers represent levels
* of usage from file descriptors, i.e., mark_page_accessed(). In contrast to
* moving across generations, which requires the lru lock, moving across tiers
* only involves an atomic operation on page->flags and therefore has a
* negligible cost.
*
* The purposes of tiers are to:
* 1) estimate whether pages accessed multiple times via file descriptors are
* more active than pages accessed only via page tables by separating the two
* access types into upper tiers and the base tier and comparing refault rates
* across tiers.
* 2) improve buffered io performance by deferring activations of pages
* accessed multiple times until the eviction. That is, activations happen in
* the reclaim path, not the access path.
*
* Pages accessed N times via file descriptors belong to tier order_base_2(N).
* The base tier uses the following page flag:
* !PageReferenced() -- readahead pages
* PageReferenced() -- single-access pages
* All upper tiers use the following page flags:
* PageReferenced() && PageWorkingset() -- multi-access pages
* in addition to the bits storing N-2 accesses. Therefore, we can support one
* upper tier without using additional bits in page->flags.
*
* Note that
* 1) PageWorkingset() is always set for upper tiers because we want to
* maintain the existing psi behavior.
* 2) !PageReferenced() && PageWorkingset() is not a valid tier. See the
* comment in evict_pages().
*
* Pages from the base tier are evicted regardless of that tier's refault rate.
* Pages from upper tiers are moved to the next generation if their refault
* rates are higher than that of the base tier.
*/
#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN)
#define LRU_TIER_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
#define LRU_USAGE_SHIFT (CONFIG_TIERS_PER_GEN - 1)

/* Whether to keep historical stats for each generation. */
#ifdef CONFIG_LRU_GEN_STATS
#define NR_STAT_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
#else
#define NR_STAT_GENS 1U
#endif

struct lrugen {
/* the aging increments the max generation number */
unsigned long max_seq;
/* the eviction increments the min generation numbers */
unsigned long min_seq[ANON_AND_FILE];
/* the birth time of each generation in jiffies */
unsigned long timestamps[MAX_NR_GENS];
/* the multigenerational lru lists */
struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the sizes of the multigenerational lru lists in pages */
unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* to determine which type and its tiers to evict */
atomic_long_t evicted[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS];
atomic_long_t refaulted[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* the base tier won't be activated */
unsigned long activated[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
/* arithmetic mean weighted by geometric series 1/2, 1/4, ... */
unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
/* whether the multigenerational lru is enabled */
bool enabled[ANON_AND_FILE];
};

void lru_gen_init_lruvec(struct lruvec *lruvec);
void lru_gen_set_state(bool enable, bool main, bool swap);

#else /* CONFIG_LRU_GEN */

static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}

static inline void lru_gen_set_state(bool enable, bool main, bool swap)
{
}

#endif /* CONFIG_LRU_GEN */

struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
@@ -310,6 +412,10 @@ struct lruvec {
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
#ifdef CONFIG_LRU_GEN
/* unevictable pages are on LRU_UNEVICTABLE */
struct lrugen evictable;
#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
