Skip to content

Commit

Permalink
fuse: wait for writepages in syncfs
Browse files Browse the repository at this point in the history
commit 660585b upstream.

In case of fuse the MM subsystem doesn't guarantee that page writeback
completes by the time ->sync_fs() is called.  This is because fuse
completes page writeback immediately to prevent DoS of memory reclaim by
the userspace file server.

This means that fuse itself must ensure that writes are synced before
sending the SYNCFS request to the server.

Introduce sync buckets, that hold a counter for the number of outstanding
write requests.  On syncfs replace the current bucket with a new one and
wait until the old bucket's counter goes down to zero.

It is possible to have multiple syncfs calls in parallel, in which case
there could be more than one waited-on buckets.  Descendant buckets must
not complete until the parent completes.  Add a count to the child (new)
bucket until the (parent) old bucket completes.

Use RCU protection to dereference the current bucket and to wake up an
emptied bucket.  Use fc->lock to protect against parallel assignments to
the current bucket.

This leaves just the counter to be a possible scalability issue.  The
fc->num_waiting counter has a similar issue, so both should be addressed at
the same time.

Reported-by: Amir Goldstein <amir73il@gmail.com>
Fixes: 2d82ab2 ("virtiofs: propagate sync() to file server")
Cc: <stable@vger.kernel.org> # v5.14
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
  • Loading branch information
Miklos Szeredi authored and gregkh committed Sep 15, 2021
1 parent 8d573aa commit 10c6ecc
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 0 deletions.
21 changes: 21 additions & 0 deletions fs/fuse/file.c
Expand Up @@ -392,6 +392,7 @@ struct fuse_writepage_args {
struct list_head queue_entry;
struct fuse_writepage_args *next;
struct inode *inode;
struct fuse_sync_bucket *bucket;
};

static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
Expand Down Expand Up @@ -1613,6 +1614,9 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
struct fuse_args_pages *ap = &wpa->ia.ap;
int i;

if (wpa->bucket)
fuse_sync_bucket_dec(wpa->bucket);

for (i = 0; i < ap->num_pages; i++)
__free_page(ap->pages[i]);

Expand Down Expand Up @@ -1876,6 +1880,20 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void)

}

static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
struct fuse_writepage_args *wpa)
{
if (!fc->sync_fs)
return;

rcu_read_lock();
/* Prevent resurrection of dead bucket in unlikely race with syncfs */
do {
wpa->bucket = rcu_dereference(fc->curr_bucket);
} while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
rcu_read_unlock();
}

static int fuse_writepage_locked(struct page *page)
{
struct address_space *mapping = page->mapping;
Expand Down Expand Up @@ -1903,6 +1921,7 @@ static int fuse_writepage_locked(struct page *page)
if (!wpa->ia.ff)
goto err_nofile;

fuse_writepage_add_to_bucket(fc, wpa);
fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);

copy_highpage(tmp_page, page);
Expand Down Expand Up @@ -2153,6 +2172,8 @@ static int fuse_writepages_fill(struct page *page,
__free_page(tmp_page);
goto out_unlock;
}
fuse_writepage_add_to_bucket(fc, wpa);

data->max_pages = 1;

ap = &wpa->ia.ap;
Expand Down
19 changes: 19 additions & 0 deletions fs/fuse/fuse_i.h
Expand Up @@ -515,6 +515,13 @@ struct fuse_fs_context {
void **fudptr;
};

struct fuse_sync_bucket {
/* count is a possible scalability bottleneck */
atomic_t count;
wait_queue_head_t waitq;
struct rcu_head rcu;
};

/**
* A Fuse connection.
*
Expand Down Expand Up @@ -807,6 +814,9 @@ struct fuse_conn {

/** List of filesystems using this connection */
struct list_head mounts;

/* New writepages go into this bucket */
struct fuse_sync_bucket __rcu *curr_bucket;
};

/*
Expand Down Expand Up @@ -910,6 +920,15 @@ static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
descs[i].length = PAGE_SIZE - descs[i].offset;
}

static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
{
/* Need RCU protection to prevent use after free after the decrement */
rcu_read_lock();
if (atomic_dec_and_test(&bucket->count))
wake_up(&bucket->waitq);
rcu_read_unlock();
}

/** Device operations */
extern const struct file_operations fuse_dev_operations;

Expand Down
60 changes: 60 additions & 0 deletions fs/fuse/inode.c
Expand Up @@ -506,6 +506,57 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
return err;
}

static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void)
{
struct fuse_sync_bucket *bucket;

bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL);
if (bucket) {
init_waitqueue_head(&bucket->waitq);
/* Initial active count */
atomic_set(&bucket->count, 1);
}
return bucket;
}

static void fuse_sync_fs_writes(struct fuse_conn *fc)
{
struct fuse_sync_bucket *bucket, *new_bucket;
int count;

new_bucket = fuse_sync_bucket_alloc();
spin_lock(&fc->lock);
bucket = rcu_dereference_protected(fc->curr_bucket, 1);
count = atomic_read(&bucket->count);
WARN_ON(count < 1);
/* No outstanding writes? */
if (count == 1) {
spin_unlock(&fc->lock);
kfree(new_bucket);
return;
}

/*
* Completion of new bucket depends on completion of this bucket, so add
* one more count.
*/
atomic_inc(&new_bucket->count);
rcu_assign_pointer(fc->curr_bucket, new_bucket);
spin_unlock(&fc->lock);
/*
* Drop initial active count. At this point if all writes in this and
* ancestor buckets complete, the count will go to zero and this task
* will be woken up.
*/
atomic_dec(&bucket->count);

wait_event(bucket->waitq, atomic_read(&bucket->count) == 0);

/* Drop temp count on descendant bucket */
fuse_sync_bucket_dec(new_bucket);
kfree_rcu(bucket, rcu);
}

static int fuse_sync_fs(struct super_block *sb, int wait)
{
struct fuse_mount *fm = get_fuse_mount_super(sb);
Expand All @@ -528,6 +579,8 @@ static int fuse_sync_fs(struct super_block *sb, int wait)
if (!fc->sync_fs)
return 0;

fuse_sync_fs_writes(fc);

memset(&inarg, 0, sizeof(inarg));
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
Expand Down Expand Up @@ -763,13 +816,19 @@ void fuse_conn_put(struct fuse_conn *fc)
{
if (refcount_dec_and_test(&fc->count)) {
struct fuse_iqueue *fiq = &fc->iq;
struct fuse_sync_bucket *bucket;

if (IS_ENABLED(CONFIG_FUSE_DAX))
fuse_dax_conn_free(fc);
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
put_user_ns(fc->user_ns);
bucket = rcu_dereference_protected(fc->curr_bucket, 1);
if (bucket) {
WARN_ON(atomic_read(&bucket->count) != 1);
kfree(bucket);
}
fc->release(fc);
}
}
Expand Down Expand Up @@ -1366,6 +1425,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
if (sb->s_flags & SB_MANDLOCK)
goto err;

rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc());
fuse_sb_defaults(sb);

if (ctx->is_bdev) {
Expand Down

0 comments on commit 10c6ecc

Please sign in to comment.