Skip to content

Commit

Permalink
mm: hugetlb controller for cgroups v2
Browse files Browse the repository at this point in the history
In the effort of supporting cgroups v2 in Kubernetes, I stumbled upon
the lack of the hugetlb controller.

When the controller is enabled, it exposes four new files for each
hugetlb size on non-root cgroups:

- hugetlb.<hugepagesize>.current
- hugetlb.<hugepagesize>.max
- hugetlb.<hugepagesize>.events
- hugetlb.<hugepagesize>.events.local

The differences with the legacy hierarchy are in the file names and
using the value "max" instead of "-1" to disable a limit.

The file .limit_in_bytes is renamed to .max.

The file .usage_in_bytes is renamed to .current.

.failcnt is not provided as a single file anymore, but its value can
be read through the new flat-keyed files .events and .events.local,
through the "max" key.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
  • Loading branch information
giuseppe authored and htejun committed Dec 16, 2019
1 parent 6afa873 commit faced7e
Show file tree
Hide file tree
Showing 3 changed files with 218 additions and 12 deletions.
29 changes: 29 additions & 0 deletions Documentation/admin-guide/cgroup-v2.rst
Expand Up @@ -61,6 +61,8 @@ v1 is available under Documentation/admin-guide/cgroup-v1/.
5-6. Device
5-7. RDMA
5-7-1. RDMA Interface Files
5-8. HugeTLB
5-8-1. HugeTLB Interface Files
5-9. Misc
5-9-1. perf_event
5-N. Non-normative information
Expand Down Expand Up @@ -2056,6 +2058,33 @@ RDMA Interface Files
mlx4_0 hca_handle=1 hca_object=20
ocrdma1 hca_handle=1 hca_object=23

HugeTLB
-------

The HugeTLB controller allows limiting the HugeTLB usage per control group and
enforces the controller limit during page faults.

HugeTLB Interface Files
~~~~~~~~~~~~~~~~~~~~~~~

hugetlb.<hugepagesize>.current
Show current usage for "hugepagesize" hugetlb. It exists for all
cgroups except root.

hugetlb.<hugepagesize>.max
Set/show the hard limit of "hugepagesize" hugetlb usage.
The default value is "max". It exists for all cgroups except root.

hugetlb.<hugepagesize>.events
A read-only flat-keyed file which exists on non-root cgroups.

max
The number of allocation failures due to the HugeTLB limit

hugetlb.<hugepagesize>.events.local
Similar to hugetlb.<hugepagesize>.events but the fields in the file
are local to the cgroup i.e. not hierarchical. The file modified event
generated on this file reflects only the local events.

Misc
----
Expand Down
3 changes: 2 additions & 1 deletion include/linux/hugetlb.h
Expand Up @@ -432,7 +432,8 @@ struct hstate {
unsigned int surplus_huge_pages_node[MAX_NUMNODES];
#ifdef CONFIG_CGROUP_HUGETLB
/* cgroup control files */
struct cftype cgroup_files[5];
struct cftype cgroup_files_dfl[5];
struct cftype cgroup_files_legacy[5];
#endif
char name[HSTATE_NAME_LEN];
};
Expand Down
198 changes: 187 additions & 11 deletions mm/hugetlb_cgroup.c
Expand Up @@ -3,6 +3,10 @@
* Copyright IBM Corporation, 2012
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
*
* Cgroup v2
* Copyright (C) 2019 Red Hat, Inc.
* Author: Giuseppe Scrivano <gscrivan@redhat.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
* as published by the Free Software Foundation.
Expand All @@ -19,18 +23,36 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

/*
 * Kinds of events counted per hstate and reported through the
 * hugetlb.<hugepagesize>.events and .events.local files.
 */
enum hugetlb_memory_event {
	/* a charge failed against a hugetlb page counter (the "max" key) */
	HUGETLB_MAX = 0,
	/* number of event kinds; used to size the per-hstate event arrays */
	HUGETLB_NR_MEMORY_EVENTS
};

/*
 * Per-cgroup state of the hugetlb controller. Every array below is indexed
 * by hstate index, i.e. there is one slot per supported huge page size.
 */
struct hugetlb_cgroup {
struct cgroup_subsys_state css;

/*
 * Page counters used to account (and limit) hugepage usage,
 * one per hstate.
 */
struct page_counter hugepage[HUGE_MAX_HSTATE];

/*
 * Event counters: "events" is incremented on this cgroup and on its
 * ancestors by hugetlb_event(), "events_local" only on the cgroup the
 * event was reported for.
 */
atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];

/* Handle for "hugetlb.events"; passed to cgroup_file_notify() */
struct cgroup_file events_file[HUGE_MAX_HSTATE];

/* Handle for "hugetlb.events.local"; passed to cgroup_file_notify() */
struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
};

/*
 * Encoding of cftype->private: the hstate index (x) is stored in bits 16-31
 * and the resource attribute (val) in bits 0-15. MEMFILE_IDX() and
 * MEMFILE_ATTR() extract the two halves again.
 */
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)

/*
 * Map a pointer to the page counter stored in hugepage[idx] back to the
 * struct hugetlb_cgroup that embeds it. The caller must pass the same idx
 * the counter was taken from, otherwise the computed address is wrong.
 */
#define hugetlb_cgroup_from_counter(counter, idx) \
container_of(counter, struct hugetlb_cgroup, hugepage[idx])

/*
 * NOTE(review): presumably the hugetlb_cgroup of the root cgroup; the place
 * where it is assigned is outside this view -- confirm.
 */
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline
Expand Down Expand Up @@ -178,6 +200,19 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
} while (hugetlb_cgroup_have_usage(h_cg));
}

/*
 * Account one occurrence of @event for hstate @idx.
 *
 * The local counter is bumped only on @hugetlb itself. The hierarchical
 * counter is bumped on @hugetlb and then on each ancestor in turn, stopping
 * before the root cgroup (or when there is no parent). Each increment is
 * followed by a notification on the matching events file.
 */
static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
				 enum hugetlb_memory_event event)
{
	struct hugetlb_cgroup *h_cg = hugetlb;

	/* non-hierarchical part: only the cgroup the event was reported for */
	atomic_long_inc(&h_cg->events_local[idx][event]);
	cgroup_file_notify(&h_cg->events_local_file[idx]);

	/* hierarchical part: this cgroup first, then walk up the parents */
	for (;;) {
		atomic_long_inc(&h_cg->events[idx][event]);
		cgroup_file_notify(&h_cg->events_file[idx]);

		h_cg = parent_hugetlb_cgroup(h_cg);
		if (!h_cg || hugetlb_cgroup_is_root(h_cg))
			break;
	}
}

int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
struct hugetlb_cgroup **ptr)
{
Expand All @@ -202,8 +237,12 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
}
rcu_read_unlock();

if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter))
if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages,
&counter)) {
ret = -ENOMEM;
hugetlb_event(hugetlb_cgroup_from_counter(counter, idx), idx,
HUGETLB_MAX);
}
css_put(&h_cg->css);
done:
*ptr = h_cg;
Expand Down Expand Up @@ -283,10 +322,45 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}

/*
 * seq_show handler for the hugetlb.<hugepagesize>.current and .max files.
 *
 * cft->private selects the hstate (MEMFILE_IDX) and the attribute
 * (MEMFILE_ATTR). Values are printed in bytes (page count * PAGE_SIZE).
 * For RES_LIMIT, a limit equal to PAGE_COUNTER_MAX rounded down to a whole
 * number of huge pages is printed as the string "max" instead of a number.
 *
 * Always returns 0; any other attribute hits BUG().
 */
static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
	struct cftype *cft = seq_cft(seq);
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
	int idx = MEMFILE_IDX(cft->private);
	struct page_counter *counter = &h_cg->hugepage[idx];
	unsigned long limit;
	u64 val;

	/* the value that stands for "no limit" for this huge page size */
	limit = round_down(PAGE_COUNTER_MAX,
			   1 << huge_page_order(&hstates[idx]));

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		val = (u64)page_counter_read(counter);
		seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		return 0;
	case RES_LIMIT:
		val = (u64)counter->max;
		if (val != limit) {
			seq_printf(seq, "%llu\n", val * PAGE_SIZE);
			return 0;
		}
		seq_puts(seq, "max\n");
		return 0;
	default:
		BUG();
	}

	return 0;
}

static DEFINE_MUTEX(hugetlb_limit_mutex);

static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
char *buf, size_t nbytes, loff_t off,
const char *max)
{
int ret, idx;
unsigned long nr_pages;
Expand All @@ -296,7 +370,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
return -EINVAL;

buf = strstrip(buf);
ret = page_counter_memparse(buf, "-1", &nr_pages);
ret = page_counter_memparse(buf, max, &nr_pages);
if (ret)
return ret;

Expand All @@ -316,6 +390,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
return ret ?: nbytes;
}

/*
 * Write handler for the legacy (cgroup v1) limit file: forwards to
 * hugetlb_cgroup_write() with "-1" as the string that means "no limit".
 */
static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
					   char *buf, size_t nbytes, loff_t off)
{
	const char *unlimited = "-1";

	return hugetlb_cgroup_write(of, buf, nbytes, off, unlimited);
}

/*
 * Write handler for the default-hierarchy (cgroup v2) limit file: forwards
 * to hugetlb_cgroup_write() with "max" as the string that means "no limit".
 */
static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
					char *buf, size_t nbytes, loff_t off)
{
	const char *unlimited = "max";

	return hugetlb_cgroup_write(of, buf, nbytes, off, unlimited);
}

static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
Expand Down Expand Up @@ -350,7 +436,36 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize)
return buf;
}

static void __init __hugetlb_cgroup_file_init(int idx)
/*
 * Common body of the events / events.local seq_show handlers.
 *
 * Prints a single flat-keyed line "max <count>" for the hstate encoded in
 * cft->private. @local selects the events_local counter instead of the
 * events counter. Always returns 0.
 */
static int __hugetlb_events_show(struct seq_file *seq, bool local)
{
	struct cftype *cft = seq_cft(seq);
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
	int idx = MEMFILE_IDX(cft->private);
	atomic_long_t *counter;
	long max;

	counter = local ? &h_cg->events_local[idx][HUGETLB_MAX]
			: &h_cg->events[idx][HUGETLB_MAX];
	max = atomic_long_read(counter);

	seq_printf(seq, "max %lu\n", max);

	return 0;
}

static int hugetlb_events_show(struct seq_file *seq, void *v)
{
return __hugetlb_events_show(seq, false);
}

static int hugetlb_events_local_show(struct seq_file *seq, void *v)
{
return __hugetlb_events_show(seq, true);
}

static void __init __hugetlb_cgroup_file_dfl_init(int idx)
{
char buf[32];
struct cftype *cft;
Expand All @@ -360,38 +475,93 @@ static void __init __hugetlb_cgroup_file_init(int idx)
mem_fmt(buf, 32, huge_page_size(h));

/* Add the limit file */
cft = &h->cgroup_files[0];
cft = &h->cgroup_files_dfl[0];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
cft->seq_show = hugetlb_cgroup_read_u64_max;
cft->write = hugetlb_cgroup_write_dfl;
cft->flags = CFTYPE_NOT_ON_ROOT;

/* Add the current usage file */
cft = &h->cgroup_files_dfl[1];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
cft->seq_show = hugetlb_cgroup_read_u64_max;
cft->flags = CFTYPE_NOT_ON_ROOT;

/* Add the events file */
cft = &h->cgroup_files_dfl[2];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
cft->private = MEMFILE_PRIVATE(idx, 0);
cft->seq_show = hugetlb_events_show;
cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]),
cft->flags = CFTYPE_NOT_ON_ROOT;

/* Add the events.local file */
cft = &h->cgroup_files_dfl[3];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf);
cft->private = MEMFILE_PRIVATE(idx, 0);
cft->seq_show = hugetlb_events_local_show;
cft->file_offset = offsetof(struct hugetlb_cgroup,
events_local_file[idx]),
cft->flags = CFTYPE_NOT_ON_ROOT;

/* NULL terminate the last cft */
cft = &h->cgroup_files_dfl[4];
memset(cft, 0, sizeof(*cft));

WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
h->cgroup_files_dfl));
}

static void __init __hugetlb_cgroup_file_legacy_init(int idx)
{
char buf[32];
struct cftype *cft;
struct hstate *h = &hstates[idx];

/* format the size */
mem_fmt(buf, 32, huge_page_size(h));

/* Add the limit file */
cft = &h->cgroup_files_legacy[0];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
cft->read_u64 = hugetlb_cgroup_read_u64;
cft->write = hugetlb_cgroup_write;
cft->write = hugetlb_cgroup_write_legacy;

/* Add the usage file */
cft = &h->cgroup_files[1];
cft = &h->cgroup_files_legacy[1];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
cft->read_u64 = hugetlb_cgroup_read_u64;

/* Add the MAX usage file */
cft = &h->cgroup_files[2];
cft = &h->cgroup_files_legacy[2];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
cft->write = hugetlb_cgroup_reset;
cft->read_u64 = hugetlb_cgroup_read_u64;

/* Add the failcnt file */
cft = &h->cgroup_files[3];
cft = &h->cgroup_files_legacy[3];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
cft->write = hugetlb_cgroup_reset;
cft->read_u64 = hugetlb_cgroup_read_u64;

/* NULL terminate the last cft */
cft = &h->cgroup_files[4];
cft = &h->cgroup_files_legacy[4];
memset(cft, 0, sizeof(*cft));

WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
h->cgroup_files));
h->cgroup_files_legacy));
}

/*
 * Register the control files of hstate @idx for both hierarchies:
 * first the default-hierarchy (cgroup v2) set, then the legacy set.
 */
static void __init __hugetlb_cgroup_file_init(int idx)
{
	/* hugetlb.<size>.{max,current,events,events.local} */
	__hugetlb_cgroup_file_dfl_init(idx);

	/* hugetlb.<size>.{limit_in_bytes,usage_in_bytes,max_usage_in_bytes,failcnt} */
	__hugetlb_cgroup_file_legacy_init(idx);
}

void __init hugetlb_cgroup_file_init(void)
Expand Down Expand Up @@ -433,8 +603,14 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
return;
}

/*
 * Static cftype table of the controller. It holds only the terminating
 * entry; the per-hstate files are added separately through
 * cgroup_add_dfl_cftypes() and cgroup_add_legacy_cftypes().
 */
static struct cftype hugetlb_files[] = {
{} /* terminate */
};

/*
 * cgroup subsystem descriptor of the hugetlb controller. Both the default
 * and the legacy hierarchy point at the same (empty) hugetlb_files table.
 */
struct cgroup_subsys hugetlb_cgrp_subsys = {
.css_alloc = hugetlb_cgroup_css_alloc,
.css_offline = hugetlb_cgroup_css_offline,
.css_free = hugetlb_cgroup_css_free,
.dfl_cftypes = hugetlb_files,
.legacy_cftypes = hugetlb_files,
};

0 comments on commit faced7e

Please sign in to comment.