Merge branch 'for-3.16-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:
 "Mostly fixes for the fallouts from the recent cgroup core changes.

  The decoupled nature of cgroup dynamic hierarchy management
  (hierarchies are created dynamically on mount but may or may not be
  reused once unmounted depending on remaining usages) led to more
  ugliness being added to kernfs.

  Hopefully, this is the last of it"

* 'for-3.16-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cpuset: break kernfs active protection in cpuset_write_resmask()
  cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()
  kernfs: introduce kernfs_pin_sb()
  cgroup: fix mount failure in a corner case
  cpuset,mempolicy: fix sleeping function called from invalid context
  cgroup: fix broken css_has_online_children()
torvalds committed Jul 10, 2014
2 parents a805cbf + 76bb5ab commit 40f6123
Showing 5 changed files with 100 additions and 11 deletions.
fs/kernfs/mount.c (30 additions, 0 deletions)
@@ -211,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb)
         kernfs_put(root_kn);
 }
 
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @kernfs_root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations. This can be used to block ->kill_sb() which may be useful
+ * for kernfs users which dynamically manage superblocks.
+ *
+ * Returns NULL if there's no superblock associated to this kernfs_root, or
+ * -EINVAL if the superblock is being freed.
+ */
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+        struct kernfs_super_info *info;
+        struct super_block *sb = NULL;
+
+        mutex_lock(&kernfs_mutex);
+        list_for_each_entry(info, &root->supers, node) {
+                if (info->ns == ns) {
+                        sb = info->sb;
+                        if (!atomic_inc_not_zero(&info->sb->s_active))
+                                sb = ERR_PTR(-EINVAL);
+                        break;
+                }
+        }
+        mutex_unlock(&kernfs_mutex);
+        return sb;
+}
+
 void __init kernfs_init(void)
 {
         kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
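
Taken on its own, the new helper has three possible outcomes that every caller must handle: a pinned superblock, NULL, or an ERR_PTR. A minimal sketch of the intended calling pattern, assuming a kernfs_root obtained elsewhere; with_pinned_sb() and do_protected_work() are illustrative placeholders, not part of this commit:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/kernfs.h>

/* Placeholder for the caller's own work; not a kernel API. */
extern void do_protected_work(struct kernfs_root *root);

/* Hypothetical caller: pin the sb so ->kill_sb() is blocked while we
 * work, then drop the pinned s_active reference the same way
 * cgroup_mount() below does, via deactivate_super(). */
static int with_pinned_sb(struct kernfs_root *root, const void *ns)
{
        struct super_block *sb;

        sb = kernfs_pin_sb(root, ns);
        if (IS_ERR(sb))                 /* superblock is being freed */
                return PTR_ERR(sb);
        if (!sb)                        /* no superblock for this ns */
                return -ENOENT;

        do_protected_work(root);        /* sb can't be killed here */

        deactivate_super(sb);           /* release the pin */
        return 0;
}
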
include/linux/kernfs.h (1 addition, 0 deletions)
@@ -305,6 +305,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
                                struct kernfs_root *root, unsigned long magic,
                                bool *new_sb_created, const void *ns);
 void kernfs_kill_sb(struct super_block *sb);
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
 
 void kernfs_init(void);
 
kernel/cgroup.c (50 additions, 8 deletions)
@@ -1648,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                          int flags, const char *unused_dev_name,
                          void *data)
 {
+        struct super_block *pinned_sb = NULL;
+        struct cgroup_subsys *ss;
         struct cgroup_root *root;
         struct cgroup_sb_opts opts;
         struct dentry *dentry;
         int ret;
+        int i;
         bool new_sb;
 
         /*
@@ -1677,6 +1680,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 goto out_unlock;
         }
 
+        /*
+         * Destruction of cgroup root is asynchronous, so subsystems may
+         * still be dying after the previous unmount. Let's drain the
+         * dying subsystems. We just need to ensure that the ones
+         * unmounted previously finish dying and don't care about new ones
+         * starting. Testing ref liveliness is good enough.
+         */
+        for_each_subsys(ss, i) {
+                if (!(opts.subsys_mask & (1 << i)) ||
+                    ss->root == &cgrp_dfl_root)
+                        continue;
+
+                if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+                        mutex_unlock(&cgroup_mutex);
+                        msleep(10);
+                        ret = restart_syscall();
+                        goto out_free;
+                }
+                cgroup_put(&ss->root->cgrp);
+        }
+
         for_each_root(root) {
                 bool name_match = false;
 
@@ -1717,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 }
 
                 /*
-                 * A root's lifetime is governed by its root cgroup.
-                 * tryget_live failure indicate that the root is being
-                 * destroyed. Wait for destruction to complete so that the
-                 * subsystems are free. We can use wait_queue for the wait
-                 * but this path is super cold. Let's just sleep for a bit
-                 * and retry.
+                 * We want to reuse @root whose lifetime is governed by its
+                 * ->cgrp. Let's check whether @root is alive and keep it
+                 * that way. As cgroup_kill_sb() can happen anytime, we
+                 * want to block it by pinning the sb so that @root doesn't
+                 * get killed before mount is complete.
+                 *
+                 * With the sb pinned, tryget_live can reliably indicate
+                 * whether @root can be reused. If it's being killed,
+                 * drain it. We can use wait_queue for the wait but this
+                 * path is super cold. Let's just sleep a bit and retry.
                  */
-                if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+                pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+                if (IS_ERR(pinned_sb) ||
+                    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
                         mutex_unlock(&cgroup_mutex);
+                        if (!IS_ERR_OR_NULL(pinned_sb))
+                                deactivate_super(pinned_sb);
                         msleep(10);
                         ret = restart_syscall();
                         goto out_free;
@@ -1770,6 +1802,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                                 CGROUP_SUPER_MAGIC, &new_sb);
         if (IS_ERR(dentry) || !new_sb)
                 cgroup_put(&root->cgrp);
+
+        /*
+         * If @pinned_sb, we're reusing an existing root and holding an
+         * extra ref on its sb. Mount is complete. Put the extra ref.
+         */
+        if (pinned_sb) {
+                WARN_ON(new_sb);
+                deactivate_super(pinned_sb);
+        }
+
         return dentry;
 }
 
@@ -3328,7 +3370,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
 
         rcu_read_lock();
         css_for_each_child(child, css) {
-                if (css->flags & CSS_ONLINE) {
+                if (child->flags & CSS_ONLINE) {
                         ret = true;
                         break;
                 }
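
Both mount-path fixes above lean on the same idiom: percpu_ref_tryget_live() probes whether an object is still alive, and when it is found dying the code backs off with a short sleep and restart_syscall() rather than waiting on a queue, since the path is cold. A condensed sketch of that idiom; struct obj and lookup_obj() are placeholders, not cgroup internals:

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/percpu-refcount.h>
#include <linux/sched.h>

struct obj {
        struct percpu_ref refcnt;       /* killed when obj starts dying */
};

/* Hypothetical lookup; stands in for finding an existing cgroup root.
 * Assume it always returns a valid object. */
extern struct obj *lookup_obj(void);

static struct obj *get_live_obj(void)
{
        struct obj *o = lookup_obj();

        if (!percpu_ref_tryget_live(&o->refcnt)) {
                /* Still dying from a previous release: sleep briefly and
                 * make the caller redo the whole syscall instead of
                 * waiting on a queue. */
                msleep(10);
                return ERR_PTR(restart_syscall());
        }
        return o;
}
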
kernel/cpuset.c (19 additions, 1 deletion)
@@ -1181,7 +1181,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 
 int current_cpuset_is_being_rebound(void)
 {
-        return task_cs(current) == cpuset_being_rebound;
+        int ret;
+
+        rcu_read_lock();
+        ret = task_cs(current) == cpuset_being_rebound;
+        rcu_read_unlock();
+
+        return ret;
 }
 
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1617,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
          * resources, wait for the previously scheduled operations before
          * proceeding, so that we don't end up keep removing tasks added
          * after execution capability is restored.
+         *
+         * cpuset_hotplug_work calls back into cgroup core via
+         * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+         * operation like this one can lead to a deadlock through kernfs
+         * active_ref protection. Let's break the protection. Losing the
+         * protection is okay as we check whether @cs is online after
+         * grabbing cpuset_mutex anyway. This only happens on the legacy
+         * hierarchies.
          */
+        css_get(&cs->css);
+        kernfs_break_active_protection(of->kn);
         flush_work(&cpuset_hotplug_work);
 
         mutex_lock(&cpuset_mutex);
@@ -1645,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
         free_trial_cpuset(trialcs);
 out_unlock:
         mutex_unlock(&cpuset_mutex);
+        kernfs_unbreak_active_protection(of->kn);
+        css_put(&cs->css);
         return retval ?: nbytes;
 }
 
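
Reduced to its skeleton, the cpuset fix is a strict nesting discipline: take an extra css reference, break the kernfs active protection, perform the blocking flush that may reenter cgroup core, then undo both in reverse order on the way out. A sketch of that shape, assuming placeholder names some_hotplug_work, some_mutex, and do_write() in place of the cpuset internals:

#include <linux/cgroup.h>
#include <linux/errno.h>
#include <linux/kernfs.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>

extern struct work_struct some_hotplug_work;    /* placeholder */
extern struct mutex some_mutex;                 /* placeholder */
extern ssize_t do_write(char *buf, size_t nbytes);

static ssize_t guarded_write(struct kernfs_open_file *of,
                             struct cgroup_subsys_state *css,
                             char *buf, size_t nbytes)
{
        ssize_t ret;

        css_get(css);                           /* keep the cgroup alive */
        kernfs_break_active_protection(of->kn); /* allow reentry via flush */

        flush_work(&some_hotplug_work);         /* may call back into cgroup */

        mutex_lock(&some_mutex);
        /* losing active protection is fine: recheck online state here */
        ret = (css->flags & CSS_ONLINE) ? do_write(buf, nbytes) : -ENODEV;
        mutex_unlock(&some_mutex);

        kernfs_unbreak_active_protection(of->kn);
        css_put(css);
        return ret;
}
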
mm/mempolicy.c (0 additions, 2 deletions)
@@ -2139,15 +2139,13 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
         } else
                 *new = *old;
 
-        rcu_read_lock();
         if (current_cpuset_is_being_rebound()) {
                 nodemask_t mems = cpuset_mems_allowed(current);
                 if (new->flags & MPOL_F_REBINDING)
                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
                 else
                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
         }
-        rcu_read_unlock();
         atomic_set(&new->refcnt, 1);
         return new;
 }
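
The cpuset/mempolicy fix moves the RCU read-side critical section into the accessor itself, so a caller that may sleep, like __mpol_dup() above, no longer wraps sleeping code in rcu_read_lock(). A sketch of that pattern; rebound_task is a placeholder standing in for the role of cpuset_being_rebound:

#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Placeholder RCU-protected pointer, updated elsewhere with
 * rcu_assign_pointer(); mirrors the role of cpuset_being_rebound. */
static struct task_struct __rcu *rebound_task;

/* The accessor scopes its own read-side section, so callers need not
 * (and, if they can sleep, must not) hold rcu_read_lock() around it. */
static bool current_is_rebound_target(void)
{
        bool ret;

        rcu_read_lock();
        ret = rcu_dereference(rebound_task) == current;
        rcu_read_unlock();

        return ret;
}
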
