From 848cce0d4102b5b4b26b0987b43e1919d462afe2 Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Tue, 21 Feb 2012 17:04:28 +0800
Subject: [PATCH 01/26] Btrfs: avoid setting ->d_op twice

Follow these instructions, and you'll trigger a warning at the
beginning of d_set_d_op():

  # mkfs.btrfs /dev/loop3
  # mount /dev/loop3 /mnt
  # btrfs sub create /mnt/sub
  # btrfs sub snap /mnt /mnt/snap
  # touch /mnt/snap/sub
  touch: cannot touch `tmp': Permission denied

__d_alloc() set d_op to sb->s_d_op (btrfs_dentry_operations), and then
simple_lookup() reset it to simple_dentry_operations, which triggered
the warning.

Signed-off-by: Li Zefan
---
 fs/btrfs/inode.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1be31368e881a2..a682c267576db4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4069,7 +4069,7 @@ static struct inode *new_simple_dir(struct super_block *s,
 	BTRFS_I(inode)->dummy_inode = 1;
 
 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
-	inode->i_op = &simple_dir_inode_operations;
+	inode->i_op = &btrfs_dir_ro_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -4140,14 +4140,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 static int btrfs_dentry_delete(const struct dentry *dentry)
 {
 	struct btrfs_root *root;
+	struct inode *inode = dentry->d_inode;
 
-	if (!dentry->d_inode && !IS_ROOT(dentry))
-		dentry = dentry->d_parent;
+	if (!inode && !IS_ROOT(dentry))
+		inode = dentry->d_parent->d_inode;
 
-	if (dentry->d_inode) {
-		root = BTRFS_I(dentry->d_inode)->root;
+	if (inode) {
+		root = BTRFS_I(inode)->root;
 		if (btrfs_root_refs(&root->root_item) == 0)
 			return 1;
+
+		if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+			return 1;
 	}
 	return 0;
 }

From 8c9c2bf7a3c4f7e9d158c0be9c49f372fb943ad2 Mon Sep 17 00:00:00 2001
From: Arne Jansen
Date: Sat, 25 Feb 2012 09:09:30 +0100
Subject: [PATCH 02/26] btrfs: fix race in reada

When inserting into the radix tree returns EEXIST, get the existing
entry without giving up the spinlock in between. There was a race for
both the zones trees and the extent tree.

Signed-off-by: Arne Jansen
---
 fs/btrfs/inode.c |  8 +++++++-
 fs/btrfs/reada.c | 35 ++++++++++++++++-------------------
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a682c267576db4..98ee5a51aa29b4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4332,7 +4332,13 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			}
 no_dentry:
 			/* is this a reference to our own snapshot? If so
-			 * skip it
+			 * skip it.
+			 *
+			 * In contrast to old kernels, we insert the snapshot's
+			 * dir item and dir index after it has been created, so
+			 * we won't find a reference to our own snapshot. We
+			 * still keep the following code for backward
+			 * compatibility.
 			 */
 			if (location.type == BTRFS_ROOT_ITEM_KEY &&
 			    location.objectid == root->root_key.objectid) {
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index dc5d33146fdbb9..8dec650099c8a3 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -250,14 +250,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 					  struct btrfs_bio *bbio)
 {
 	int ret;
-	int looped = 0;
 	struct reada_zone *zone;
 	struct btrfs_block_group_cache *cache = NULL;
 	u64 start;
 	u64 end;
 	int i;
 
-again:
 	zone = NULL;
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
@@ -274,9 +272,6 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 		spin_unlock(&fs_info->reada_lock);
 	}
 
-	if (looped)
-		return NULL;
-
 	cache = btrfs_lookup_block_group(fs_info, logical);
 	if (!cache)
 		return NULL;
@@ -307,13 +302,15 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 	ret = radix_tree_insert(&dev->reada_zones,
 				(unsigned long)(zone->end >> PAGE_CACHE_SHIFT),
 				zone);
-	spin_unlock(&fs_info->reada_lock);
 
-	if (ret) {
+	if (ret == -EEXIST) {
 		kfree(zone);
-		looped = 1;
-		goto again;
+		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
+					     logical >> PAGE_CACHE_SHIFT, 1);
+		if (ret == 1)
+			kref_get(&zone->refcnt);
 	}
+	spin_unlock(&fs_info->reada_lock);
 
 	return zone;
 }
@@ -323,8 +320,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 					      struct btrfs_key *top, int level)
 {
 	int ret;
-	int looped = 0;
 	struct reada_extent *re = NULL;
+	struct reada_extent *re_exist = NULL;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct btrfs_bio *bbio = NULL;
@@ -335,14 +332,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	int i;
 	unsigned long index = logical >> PAGE_CACHE_SHIFT;
 
-again:
 	spin_lock(&fs_info->reada_lock);
 	re = radix_tree_lookup(&fs_info->reada_tree, index);
 	if (re)
 		kref_get(&re->refcnt);
 	spin_unlock(&fs_info->reada_lock);
 
-	if (re || looped)
+	if (re)
 		return re;
 
 	re = kzalloc(sizeof(*re), GFP_NOFS);
@@ -398,12 +394,15 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	/* insert extent in reada_tree + all per-device trees, all or nothing */
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
+	if (ret == -EEXIST) {
+		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
+		BUG_ON(!re_exist);
+		kref_get(&re_exist->refcnt);
+		spin_unlock(&fs_info->reada_lock);
+		goto error;
+	}
 	if (ret) {
 		spin_unlock(&fs_info->reada_lock);
-		if (ret != -ENOMEM) {
-			/* someone inserted the extent in the meantime */
-			looped = 1;
-		}
 		goto error;
 	}
 	for (i = 0; i < nzones; ++i) {
@@ -450,9 +449,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	}
 	kfree(bbio);
 	kfree(re);
-	if (looped)
-		goto again;
-	return NULL;
+	return re_exist;
 }
 
 static void reada_kref_dummy(struct kref *kr)

From 207a232ccac0a8cb79d304bd17298dbc96e2e082 Mon Sep 17 00:00:00 2001
From: Arne Jansen
Date: Sat, 25 Feb 2012 09:09:47 +0100
Subject: [PATCH 03/26] btrfs: don't add both copies of DUP to reada extent tree

Normally when there are 2 copies of a block, we add both to the reada
extent tree and prefetch only the one that is easier to reach. This way
we can better utilize multiple devices. In case of DUP this makes no
sense as both copies reside on the same device.
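To illustrate the skip, a minimal standalone C sketch of the loop's
logic (hypothetical userspace code, not the kernel implementation):
the stripes of a DUP chunk point at the same device, so comparing
against the previously seen device is enough to drop the duplicate.

  #include <stdio.h>

  struct stripe { int dev; };

  int main(void)
  {
          /* a DUP chunk: both stripes live on device 1 */
          struct stripe stripes[] = { { 1 }, { 1 } };
          int prev_dev = -1;

          for (int i = 0; i < 2; i++) {
                  if (stripes[i].dev == prev_dev)
                          continue;       /* second DUP copy: skip it */
                  prev_dev = stripes[i].dev;
                  printf("adding extent for device %d\n", stripes[i].dev);
          }
          return 0;
  }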
Signed-off-by: Arne Jansen
---
 fs/btrfs/reada.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 8dec650099c8a3..ac5d010858848d 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -326,6 +326,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct btrfs_bio *bbio = NULL;
 	struct btrfs_device *dev;
+	struct btrfs_device *prev_dev;
 	u32 blocksize;
 	u64 length;
 	int nzones = 0;
@@ -405,8 +406,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 		spin_unlock(&fs_info->reada_lock);
 		goto error;
 	}
+	prev_dev = NULL;
 	for (i = 0; i < nzones; ++i) {
 		dev = bbio->stripes[i].dev;
+		if (dev == prev_dev) {
+			/*
+			 * in case of DUP, just add the first zone. As both
+			 * are on the same device, there's nothing to gain
+			 * from adding both.
+			 * Also, it wouldn't work, as the tree is per device
+			 * and adding would fail with EEXIST
+			 */
+			continue;
+		}
+		prev_dev = dev;
 		ret = radix_tree_insert(&dev->reada_extents, index, re);
 		if (ret) {
 			while (--i >= 0) {

From 8d082fb727ac11930ea20bf1612e334ea7c2b697 Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Tue, 3 Apr 2012 09:56:53 +0800
Subject: [PATCH 04/26] Btrfs: do not mount when we have a sectorsize unequal to PAGE_SIZE

Our code is not ready to cope with a sectorsize that's not equal to
PAGE_SIZE. It will lead to a hang while writing something.

Signed-off-by: Liu Bo
---
 fs/btrfs/disk-io.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 20196f41120698..b9866f27ebd7da 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2254,9 +2254,9 @@ int open_ctree(struct super_block *sb,
 		goto fail_sb_buffer;
 	}
 
-	if (sectorsize < PAGE_SIZE) {
-		printk(KERN_WARNING "btrfs: Incompatible sector size "
-		       "found on %s\n", sb->s_id);
+	if (sectorsize != PAGE_SIZE) {
+		printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) "
+		       "found on %s\n", (unsigned long)sectorsize, sb->s_id);
 		goto fail_sb_buffer;
 	}

From 871383be592ba7e819d27556591e315a0df38cee Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Mon, 2 Apr 2012 18:31:37 +0200
Subject: [PATCH 05/26] btrfs: add missing unlocks to transaction abort paths

Added in commit 49b25e0540904be0bf558b84475c69d72e4de66e
("btrfs: enhance transaction abort infrastructure")

Reported-by: Dan Carpenter
Signed-off-by: David Sterba
---
 fs/btrfs/transaction.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 11b77a59db6299..36422254ef6765 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -73,8 +73,10 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
 
 	cur_trans = root->fs_info->running_transaction;
 	if (cur_trans) {
-		if (cur_trans->aborted)
+		if (cur_trans->aborted) {
+			spin_unlock(&root->fs_info->trans_lock);
 			return cur_trans->aborted;
+		}
 		atomic_inc(&cur_trans->use_count);
 		atomic_inc(&cur_trans->num_writers);
 		cur_trans->num_joined++;
@@ -1400,6 +1402,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = commit_fs_roots(trans, root);
 	if (ret) {
 		mutex_unlock(&root->fs_info->tree_log_mutex);
+		mutex_unlock(&root->fs_info->reloc_mutex);
 		goto cleanup_transaction;
 	}
 
@@ -1411,6 +1414,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = commit_cowonly_roots(trans, root);
 	if (ret) {
 		mutex_unlock(&root->fs_info->tree_log_mutex);
+		mutex_unlock(&root->fs_info->reloc_mutex);
 		goto cleanup_transaction;
 	}

From 8e52acf70459020d7e9e9fda25066be4da520943 Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Mon, 12 Mar 2012 16:39:28 +0800
Subject: [PATCH 06/26] Btrfs: return void from clear_state_bit

Currently it returns a set of bits that were cleared, but this return
value is not used at all. Moreover it doesn't seem to be useful,
because we may clear the bits of a few extent_states, but only the
cleared bits of the last one are returned.

Signed-off-by: Li Zefan
---
 fs/btrfs/extent_io.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4789770f8eaf00..05951bdf72cc82 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -404,18 +404,16 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 
 /*
  * utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1), or
- * forcibly remove the state from the tree (delete == 1).
+ * it will optionally wake up any one waiting on this state (wake == 1)
  *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
-static int clear_state_bit(struct extent_io_tree *tree,
+static void clear_state_bit(struct extent_io_tree *tree,
			    struct extent_state *state,
			    int *bits, int wake)
 {
 	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
-	int ret = state->state & bits_to_clear;
 
 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
@@ -437,7 +435,6 @@ static int clear_state_bit(struct extent_io_tree *tree,
 	} else {
 		merge_state(tree, state);
 	}
-	return ret;
 }
 
 static struct extent_state *

From cdc6a3952558f00b1bc3b6401e1cf98797632fe2 Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Mon, 12 Mar 2012 16:39:48 +0800
Subject: [PATCH 07/26] Btrfs: avoid possible use-after-free in clear_extent_bit()

clear_extent_bit()
{
    next_node = rb_next(&state->rb_node);
    ...
    clear_state_bit(state);   <-- this may free next_node
    if (next_node) {
        state = rb_entry(next_node);
        ...
    }
}

clear_state_bit() calls merge_state() which may free the next node of
the passing extent_state, so clear_extent_bit() may end up referencing
freed memory.

Signed-off-by: Li Zefan
---
 fs/btrfs/extent_io.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 05951bdf72cc82..11eeb81fe69515 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -402,6 +402,15 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 	return 0;
 }
 
+static struct extent_state *next_state(struct extent_state *state)
+{
+	struct rb_node *next = rb_next(&state->rb_node);
+	if (next)
+		return rb_entry(next, struct extent_state, rb_node);
+	else
+		return NULL;
+}
+
 /*
  * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1)
@@ -409,10 +418,11 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
-static void clear_state_bit(struct extent_io_tree *tree,
-			    struct extent_state *state,
-			    int *bits, int wake)
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+					    struct extent_state *state,
+					    int *bits, int wake)
 {
+	struct extent_state *next;
 	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
 
 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
@@ -425,6 +435,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 	if (wake)
 		wake_up(&state->wq);
 	if (state->state == 0) {
+		next = next_state(state);
 		if (state->tree) {
 			rb_erase(&state->rb_node, &tree->state);
 			state->tree = NULL;
@@ -434,7 +445,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 	}
 	} else {
 		merge_state(tree, state);
+		next = next_state(state);
 	}
+	return next;
 }
 
 static struct extent_state *
@@ -473,7 +486,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	struct extent_state *state;
 	struct extent_state *cached;
 	struct extent_state *prealloc = NULL;
-	struct rb_node *next_node;
 	struct rb_node *node;
 	u64 last_end;
 	int err;
@@ -525,14 +537,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	WARN_ON(state->end < start);
 	last_end = state->end;
 
-	if (state->end < end && !need_resched())
-		next_node = rb_next(&state->rb_node);
-	else
-		next_node = NULL;
-
 	/* the state doesn't have the wanted bits, go ahead */
-	if (!(state->state & bits))
+	if (!(state->state & bits)) {
+		state = next_state(state);
 		goto next;
+	}
 
 	/*
 	 * | ---- desired range ---- |
@@ -590,16 +599,13 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		goto out;
 	}
 
-	clear_state_bit(tree, state, &bits, wake);
+	state = clear_state_bit(tree, state, &bits, wake);
next:
 	if (last_end == (u64)-1)
 		goto out;
 	start = last_end + 1;
-	if (start <= end && next_node) {
-		state = rb_entry(next_node, struct extent_state,
-				 rb_node);
+	if (start <= end && state && !need_resched())
 		goto hit_next;
-	}
 	goto search_again;
 
 out:

From 4735fb282830c0966b301dabcccf4753fa6604bb Mon Sep 17 00:00:00 2001
From: Jesper Juhl
Date: Thu, 12 Apr 2012 22:47:52 +0200
Subject: [PATCH 08/26] Btrfs: Make free_ipath() deal gracefully with NULL pointers

Make free_ipath() behave like most other freeing functions in the
kernel and gracefully do nothing when passed a NULL pointer. Besides
making the behaviour consistent with functions such as kfree(),
vfree(), btrfs_free_path() etc etc, it also fixes a real NULL deref
issue in fs/btrfs/ioctl.c::btrfs_ioctl_ino_to_path(). In that function
we have this code:

...
	ipath = init_ipath(size, root, path);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto out;
	}
...
out:
	btrfs_free_path(path);
	free_ipath(ipath);
...

If we ever take the true branch of that 'if' statement we'll end up
passing a NULL pointer to free_ipath() which will subsequently
dereference it and we'll go "Boom" :-(

This patch will avoid that.
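The pattern generalizes beyond btrfs; a standalone C sketch
(hypothetical names, not kernel code) of why a NULL-tolerant destructor
keeps a single cleanup path simple:

  #include <stdlib.h>

  struct thing { char *buf; };

  /* free() accepts NULL, so thing_free() should too */
  static void thing_free(struct thing *t)
  {
          if (!t)
                  return;
          free(t->buf);
          free(t);
  }

  int main(void)
  {
          struct thing *t = NULL;     /* e.g. an allocation that failed */

          thing_free(t);              /* safe even though t is NULL */
          return 0;
  }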
Signed-off-by: Jesper Juhl
---
 fs/btrfs/backref.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f4e90748940abf..b332ff04c5ee6d 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1414,6 +1414,8 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 
 void free_ipath(struct inode_fs_paths *ipath)
 {
+	if (!ipath)
+		return;
 	kfree(ipath->fspath);
 	kfree(ipath);
 }

From aefc1eb13ebbb86c5ffade8a9e2425cd71032d7e Mon Sep 17 00:00:00 2001
From: Jan Schmidt
Date: Fri, 13 Apr 2012 12:28:00 +0200
Subject: [PATCH 09/26] Btrfs: don't call free_extent_buffer twice in iterate_irefs

Avoid calling free_extent_buffer more than once when the iterator
function returns non-zero. The only code that uses this is scrub repair
for corrupted nodatasum blocks.

Signed-off-by: Jan Schmidt
---
 fs/btrfs/backref.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b332ff04c5ee6d..fb56bcc8037703 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1247,7 +1247,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 				struct btrfs_path *path,
 				iterate_irefs_t *iterate, void *ctx)
 {
-	int ret;
+	int ret = 0;
 	int slot;
 	u32 cur;
 	u32 len;
@@ -1259,7 +1259,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	struct btrfs_inode_ref *iref;
 	struct btrfs_key found_key;
 
-	while (1) {
+	while (!ret) {
 		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
 					&found_key);
 		if (ret < 0)
@@ -1288,10 +1288,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 				(unsigned long long)found_key.objectid,
 				(unsigned long long)fs_root->objectid);
 			ret = iterate(parent, iref, eb, ctx);
-			if (ret) {
-				free_extent_buffer(eb);
+			if (ret)
 				break;
-			}
 			len = sizeof(*iref) + name_len;
 			iref = (struct btrfs_inode_ref *)((char *)iref + len);
 		}

From b916a59adfdc875381b68ced258694b434cf43ae Mon Sep 17 00:00:00 2001
From: Jan Schmidt
Date: Fri, 13 Apr 2012 12:28:08 +0200
Subject: [PATCH 10/26] Btrfs: add missing read locks in backref.c

iref_to_path and iterate_irefs both increment the eb's refcount to use
it after releasing the path. Both depend on consistent data remaining
in the extent buffer and need a read lock to protect it.
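The underlying pattern: before dropping the lock that protects the
lookup structure, take both a reference (so the buffer stays allocated)
and a read lock (so its contents stay consistent). A compilable
userspace analogue using pthreads (hypothetical types, not the kernel
API):

  #include <pthread.h>

  struct buffer {
          int refs;                       /* protected by index_lock */
          pthread_rwlock_t lock;          /* protects the contents */
  };

  /* Pin and read-lock a buffer found under index_lock so that it
   * can still be used safely after index_lock is released. */
  static void pin_for_read(pthread_mutex_t *index_lock, struct buffer *b)
  {
          pthread_mutex_lock(index_lock);
          b->refs++;                       /* keeps b alive */
          pthread_rwlock_rdlock(&b->lock); /* keeps contents stable */
          pthread_mutex_unlock(index_lock);
  }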
Signed-off-by: Jan Schmidt
---
 fs/btrfs/backref.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index fb56bcc8037703..bcec06750232e6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -22,6 +22,7 @@
 #include "ulist.h"
 #include "transaction.h"
 #include "delayed-ref.h"
+#include "locking.h"
 
 /*
  * this structure records all encountered refs on the way up to the root
@@ -893,18 +894,22 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 	s64 bytes_left = size - 1;
 	struct extent_buffer *eb = eb_in;
 	struct btrfs_key found_key;
+	int leave_spinning = path->leave_spinning;
 
 	if (bytes_left >= 0)
 		dest[bytes_left] = '\0';
 
+	path->leave_spinning = 1;
 	while (1) {
 		len = btrfs_inode_ref_name_len(eb, iref);
 		bytes_left -= len;
 		if (bytes_left >= 0)
 			read_extent_buffer(eb, dest + bytes_left,
 						(unsigned long)(iref + 1), len);
-		if (eb != eb_in)
+		if (eb != eb_in) {
+			btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
+		}
 		ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
 		if (ret > 0)
 			ret = -ENOENT;
@@ -919,8 +924,11 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 		slot = path->slots[0];
 		eb = path->nodes[0];
 		/* make sure we can use eb after releasing the path */
-		if (eb != eb_in)
+		if (eb != eb_in) {
 			atomic_inc(&eb->refs);
+			btrfs_tree_read_lock(eb);
+			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		}
 		btrfs_release_path(path);
 
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
@@ -931,6 +939,7 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 	}
 
 	btrfs_release_path(path);
+	path->leave_spinning = leave_spinning;
 
 	if (ret)
 		return ERR_PTR(ret);
@@ -1260,6 +1269,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	struct btrfs_key found_key;
 
 	while (!ret) {
+		path->leave_spinning = 1;
 		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
 					&found_key);
 		if (ret < 0)
@@ -1275,6 +1285,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 		eb = path->nodes[0];
 		/* make sure we can use eb after releasing the path */
 		atomic_inc(&eb->refs);
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		btrfs_release_path(path);
 
 		item = btrfs_item_nr(eb, slot);
@@ -1293,6 +1305,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 			len = sizeof(*iref) + name_len;
 			iref = (struct btrfs_inode_ref *)((char *)iref + len);
 		}
+		btrfs_tree_read_unlock_blocking(eb);
 		free_extent_buffer(eb);
 	}

From 37db63a400d3bac467795aa43901065fd8d903b7 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Fri, 13 Apr 2012 17:05:08 +0300
Subject: [PATCH 11/26] Btrfs: fix max chunk size check in chunk allocator

Fix a bug where, in case we need to adjust stripe_size so that the
length of the resulting chunk is less than or equal to max_chunk_size,
DUP chunks turn out to be only half as big as they could be.
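Worked numbers (illustrative): for DUP, ncopies = 2, dev_stripes = 2
and ndevs = 1, so num_stripes = 2. With max_chunk_size = 1 GiB and
plenty of free space, the old check capped stripe_size at
2 GiB / num_stripes = 1 GiB, and the later division by dev_stripes
halved it to 512 MiB, giving a 512 MiB chunk. Dividing by ndevs
instead leaves 2 GiB, which dev_stripes brings down to 1 GiB, the full
max_chunk_size. A standalone C check of the arithmetic:

  #include <stdio.h>

  int main(void)
  {
          unsigned long long gib = 1ULL << 30;
          unsigned long long max_chunk_size = 1 * gib;
          unsigned long long stripe_size = 10 * gib;  /* lots of space */
          int ncopies = 2, dev_stripes = 2, ndevs = 1; /* DUP profile */
          int num_stripes = ndevs * dev_stripes;

          unsigned long long old_ss = stripe_size, new_ss = stripe_size;

          if (old_ss * num_stripes > max_chunk_size * ncopies)
                  old_ss = max_chunk_size * ncopies / num_stripes;
          old_ss /= dev_stripes;

          if (new_ss * ndevs > max_chunk_size * ncopies)
                  new_ss = max_chunk_size * ncopies / ndevs;
          new_ss /= dev_stripes;

          /* chunk length = stripe_size * num_stripes / ncopies */
          printf("old chunk: %llu MiB\n",
                 old_ss * num_stripes / ncopies >> 20);  /* 512 */
          printf("new chunk: %llu MiB\n",
                 new_ss * num_stripes / ncopies >> 20);  /* 1024 */
          return 0;
  }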
Cc: Arne Jansen
Signed-off-by: Ilya Dryomov
---
 fs/btrfs/volumes.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 759d02486d7ccb..ce289af526f086 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3324,12 +3324,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	stripe_size = devices_info[ndevs-1].max_avail;
 	num_stripes = ndevs * dev_stripes;
 
-	if (stripe_size * num_stripes > max_chunk_size * ncopies) {
+	if (stripe_size * ndevs > max_chunk_size * ncopies) {
 		stripe_size = max_chunk_size * ncopies;
-		do_div(stripe_size, num_stripes);
+		do_div(stripe_size, ndevs);
 	}
 
 	do_div(stripe_size, dev_stripes);
+
+	/* align to BTRFS_STRIPE_LEN */
 	do_div(stripe_size, BTRFS_STRIPE_LEN);
 	stripe_size *= BTRFS_STRIPE_LEN;
 

From 8a3db1849e9e2563727ea2dc32737502e0096641 Mon Sep 17 00:00:00 2001
From: Sergei Trofimovich
Date: Mon, 16 Apr 2012 06:44:37 +0300
Subject: [PATCH 12/26] btrfs: fix early abort in 'remount'

The error branches were missing braces, so the 'goto restore' that was
meant to be conditional executed unconditionally and the remount to
read-write aborted early.

Cc: Jeff Mahoney
Cc: Chris Mason
Cc: Josef Bacik
Signed-off-by: Sergei Trofimovich
---
 fs/btrfs/super.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 84571d7da12e93..43aa2dd0bc7dd7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1152,13 +1152,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 			if (ret)
 				goto restore;
 		} else {
-			if (fs_info->fs_devices->rw_devices == 0)
+			if (fs_info->fs_devices->rw_devices == 0) {
 				ret = -EACCES;
 				goto restore;
+			}
 
-			if (btrfs_super_log_root(fs_info->super_copy) != 0)
+			if (btrfs_super_log_root(fs_info->super_copy) != 0) {
 				ret = -EINVAL;
 				goto restore;
+			}
 
 			ret = btrfs_cleanup_fs_roots(fs_info);
 			if (ret)

From 48d282326b3ce5f435835f5fb0e3231c399f4f9a Mon Sep 17 00:00:00 2001
From: Julia Lawall
Date: Sat, 14 Apr 2012 11:24:33 +0200
Subject: [PATCH 13/26] fs/btrfs/volumes.c: add missing free_fs_devices

Free fs_devices as done in the error-handling code just below.

Signed-off-by: Julia Lawall
---
 fs/btrfs/volumes.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ce289af526f086..3b984173d25bc2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4352,8 +4352,10 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 
 	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
 				   root->fs_info->bdev_holder);
-	if (ret)
+	if (ret) {
+		free_fs_devices(fs_devices);
 		goto out;
+	}
 
 	if (!fs_devices->seeding) {
 		__btrfs_close_devices(fs_devices);

From 5cf1ab56133ad7b712673c071b439d4a555a2d1e Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Mon, 16 Apr 2012 09:42:26 -0400
Subject: [PATCH 14/26] Btrfs: always store the mirror we read the eb from

A user reported a panic where we were trying to fix a bad mirror but
the mirror number we were giving was 0, which is invalid. This is
because we don't do the transid verification until after the read, so
as far as the read code is concerned the read was a success. So
instead store the mirror we read from so that if there is some failure
post read we know which mirror to try next and which mirror needs to
be fixed if we find a good copy of the block.
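The retry logic this enables, as a standalone C sketch (simplified
from the loop in btree_read_extent_buffer_pages): mirrors are numbered
from 1, and the mirror recorded at read time is skipped when picking
the next one to try.

  /* pick the next mirror to try, skipping the one that failed;
   * returns 0 when no mirrors are left */
  static int next_mirror(int failed, int cur, int num_copies)
  {
          int next = cur + 1;

          if (next == failed)     /* don't retry the bad mirror */
                  next++;
          if (next > num_copies)
                  return 0;
          return next;
  }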
Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/disk-io.c   | 16 ++++++++--------
 fs/btrfs/extent_io.c | 15 ++++++---------
 fs/btrfs/extent_io.h |  4 ++--
 fs/btrfs/inode.c     |  2 +-
 4 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b9866f27ebd7da..d0c969beaad4d3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -383,17 +383,16 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
 			break;
 
-		if (!failed_mirror) {
-			failed = 1;
-			printk(KERN_ERR "failed mirror was %d\n", eb->failed_mirror);
-			failed_mirror = eb->failed_mirror;
-		}
-
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
 		if (num_copies == 1)
 			break;
 
+		if (!failed_mirror) {
+			failed = 1;
+			failed_mirror = eb->read_mirror;
+		}
+
 		mirror_num++;
 		if (mirror_num == failed_mirror)
 			mirror_num++;
@@ -564,7 +563,7 @@ struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
 }
 
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
-			       struct extent_state *state)
+			       struct extent_state *state, int mirror)
 {
 	struct extent_io_tree *tree;
 	u64 found_start;
@@ -589,6 +588,7 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	if (!reads_done)
 		goto err;
 
+	eb->read_mirror = mirror;
 	if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
 		ret = -EIO;
 		goto err;
@@ -652,7 +652,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
 
 	eb = (struct extent_buffer *)page->private;
 	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
-	eb->failed_mirror = failed_mirror;
+	eb->read_mirror = failed_mirror;
 	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
 		btree_readahead_hook(root, eb, eb->start, -EIO);
 	return -EIO;	/* we fixed nothing */

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 11eeb81fe69515..0c23e57077c6f7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2304,7 +2304,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	u64 start;
 	u64 end;
 	int whole_page;
-	int failed_mirror;
+	int mirror;
 	int ret;
 
 	if (err)
@@ -2343,20 +2343,18 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		}
 		spin_unlock(&tree->lock);
 
+		mirror = (int)(unsigned long)bio->bi_bdev;
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
 			ret = tree->ops->readpage_end_io_hook(page, start, end,
-							      state);
+							      state, mirror);
 			if (ret)
 				uptodate = 0;
 			else
 				clean_io_failure(start, page);
 		}
 
-		if (!uptodate)
-			failed_mirror = (int)(unsigned long)bio->bi_bdev;
-
 		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(page, failed_mirror);
+			ret = tree->ops->readpage_io_failed_hook(page, mirror);
 			if (!ret && !err &&
 			    test_bit(BIO_UPTODATE, &bio->bi_flags))
 				uptodate = 1;
@@ -2371,8 +2369,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			 * can't handle the error it will return -EIO and we
 			 * remain responsible for that page.
 			 */
-			ret = bio_readpage_error(bio, page, start, end,
-						 failed_mirror, NULL);
+			ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -4465,7 +4462,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
-	eb->failed_mirror = 0;
+	eb->read_mirror = 0;
 	atomic_set(&eb->io_pages, num_reads);
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);

diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index faf10eb57f75eb..b516c3b8dec68d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -79,7 +79,7 @@ struct extent_io_ops {
 			      u64 start, u64 end,
 			      struct extent_state *state);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
-				    struct extent_state *state);
+				    struct extent_state *state, int mirror);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
 	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
@@ -135,7 +135,7 @@ struct extent_buffer {
 	spinlock_t refs_lock;
 	atomic_t refs;
 	atomic_t io_pages;
-	int failed_mirror;
+	int read_mirror;
 	struct list_head leak_list;
 	struct rcu_head rcu_head;
 	pid_t lock_owner;

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 98ee5a51aa29b4..d953f8820464e4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1947,7 +1947,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 * extent_io.c will try to find good copies for us.
 */
 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
-			       struct extent_state *state)
+			       struct extent_state *state, int mirror)
 {
 	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
 	struct inode *inode = page->mapping->host;

From 253beebd5a255e07d6a8b65515491f33664e82a2 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Wed, 18 Apr 2012 09:59:03 +0300
Subject: [PATCH 15/26] Btrfs: double unlock bug in error handling

The caller expects this function to return with the lock held and
releases it immediately on error.

Signed-off-by: Dan Carpenter
---
 fs/btrfs/extent-tree.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2b35f8d14bb9a1..a0bb9dcd3c3604 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2301,6 +2301,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 
 				if (ret) {
 					printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
+					spin_lock(&delayed_refs->lock);
 					return ret;
 				}
 
@@ -2331,6 +2332,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 
 		if (ret) {
 			printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
+			spin_lock(&delayed_refs->lock);
 			return ret;
 		}
 

From b9688bb8459b67e42327de6420edb405a9188775 Mon Sep 17 00:00:00 2001
From: Arne Jansen
Date: Wed, 18 Apr 2012 10:27:16 +0200
Subject: [PATCH 16/26] btrfs: don't return EINTR

It is basically a good thing if we are interruptible when waiting for
free space, but the generality in which it is implemented currently
leads to system calls being interruptible that are not documented this
way. For example git can't handle interrupted unlink(), leading to
corrupt repos under space pressure. Instead we raise the bar to only
be interruptible by SIGKILL. Thanks to David Sterba for suggesting
this.
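For comparison, a userspace caller that does expect EINTR has to retry
explicitly; a minimal sketch of the wrapper that git and friends would
otherwise need around every unlink():

  #include <errno.h>
  #include <unistd.h>

  static int unlink_retry(const char *path)
  {
          int ret;

          do {
                  ret = unlink(path);
          } while (ret == -1 && errno == EINTR);
          return ret;
  }

Since unlink() is not documented to fail with EINTR, demanding this of
every caller is unreasonable, hence the switch to wait_event_killable().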
Signed-off-by: Arne Jansen
---
 fs/btrfs/extent-tree.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a0bb9dcd3c3604..84497f8eb04371 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3771,13 +3771,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
 		 */
 		if (current->journal_info)
 			return -EAGAIN;
-		ret = wait_event_interruptible(space_info->wait,
-					       !space_info->flush);
-		/* Must have been interrupted, return */
-		if (ret) {
-			printk(KERN_DEBUG "btrfs: %s returning -EINTR\n", __func__);
+		ret = wait_event_killable(space_info->wait, !space_info->flush);
+		/* Must have been killed, return */
+		if (ret)
 			return -EINTR;
-		}
 
 		spin_lock(&space_info->lock);
 	}

From 99ba55ad696944b37d5557bc5b4816890854fdb9 Mon Sep 17 00:00:00 2001
From: Stefan Behrens
Date: Mon, 19 Mar 2012 16:17:22 +0100
Subject: [PATCH 17/26] Btrfs: fix btrfs_ioctl_dev_info() crash on missing device

When a filesystem is mounted with the degraded option, it is possible
that some of the devices are not there. btrfs_ioctl_dev_info()
crashes in this case because the device name is a NULL pointer. This
ioctl was only used for scrub.

Signed-off-by: Stefan Behrens
---
 fs/btrfs/ioctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 18cc23d164a80b..14f8e1faa46ee0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2262,7 +2262,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
 	di_args->bytes_used = dev->bytes_used;
 	di_args->total_bytes = dev->total_bytes;
 	memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
-	strncpy(di_args->path, dev->name, sizeof(di_args->path));
+	if (dev->name)
+		strncpy(di_args->path, dev->name, sizeof(di_args->path));
+	else
+		di_args->path[0] = '\0';
 
 out:
 	if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))

From 5c84fc3c3914e9adfa6155a167c6c0c2709e6a62 Mon Sep 17 00:00:00 2001
From: Stefan Behrens
Date: Fri, 30 Mar 2012 13:58:31 +0200
Subject: [PATCH 18/26] Btrfs: don't count CRC or header errors twice while scrubbing

Each CRC or header error was counted twice, this is now fixed.

Signed-off-by: Stefan Behrens
---
 fs/btrfs/scrub.c | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 60f0e28db31eea..b679bf68861e3c 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1258,12 +1258,6 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 	if (memcmp(csum, on_disk_csum, sdev->csum_size))
 		fail = 1;
 
-	if (fail) {
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.csum_errors;
-		spin_unlock(&sdev->stat_lock);
-	}
-
 	return fail;
 }
 
@@ -1336,15 +1330,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 	if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
 		++crc_fail;
 
-	if (crc_fail || fail) {
-		spin_lock(&sdev->stat_lock);
-		if (crc_fail)
-			++sdev->stat.csum_errors;
-		if (fail)
-			++sdev->stat.verify_errors;
-		spin_unlock(&sdev->stat_lock);
-	}
-
 	return fail || crc_fail;
 }

From 25cd999e1a685dab65292afbe9fa24d790d8a859 Mon Sep 17 00:00:00 2001
From: Stefan Behrens
Date: Fri, 30 Mar 2012 13:58:32 +0200
Subject: [PATCH 19/26] Btrfs: fix that check_int_data mount option was ignored

The bitfield member mount_opt was too small by one bit to hold the
mount option that makes the integrity checker include data extents.
Since the same issue happened when the BTRFS_MOUNT_PANIC_ON_FATAL_ERROR
option was added (git rebase silently merges so that the increase of
the size of the bitfield member is lost), the bit limit was removed
entirely.

Signed-off-by: Stefan Behrens
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5b8ef8eb35218e..ec42a24e935e83 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1078,7 +1078,7 @@ struct btrfs_fs_info {
 	 * is required instead of the faster short fsync log commits
 	 */
 	u64 last_trans_log_full_commit;
-	unsigned long mount_opt:21;
+	unsigned long mount_opt;
 	unsigned long compress_type:4;
 	u64 max_inline;
 	u64 alloc_start;

From 996d282c7ff470f150a467eb4815b90159d04c47 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Mon, 23 Apr 2012 20:35:03 -0400
Subject: [PATCH 20/26] Btrfs: do not start delalloc inodes during sync

btrfs_start_delalloc_inodes will just walk the list of delalloc inodes
and start writing them out, but it doesn't splice the list or anything
so as long as somebody is doing work on the box you could end up in
this section _forever_. So just remove it; it's not needed anyway,
since sync will start writeback on all inodes and all we need to do is
wait for ordered extents, after which we can commit the transaction.
In my horrible torture test sync goes from taking 4 minutes to about
1.5 minutes.

Thanks,

Signed-off-by: Josef Bacik
Signed-off-by: Chris Mason
---
 fs/btrfs/super.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 43aa2dd0bc7dd7..f267718cbd1a74 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -819,7 +819,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		return 0;
 	}
 
-	btrfs_start_delalloc_inodes(root, 0);
 	btrfs_wait_ordered_extents(root, 0, 0);
 
 	trans = btrfs_start_transaction(root, 0);

From 3e74317ad773ba9df36db1fa32848cba41ac4d1a Mon Sep 17 00:00:00 2001
From: Jan Schmidt
Date: Fri, 27 Apr 2012 12:41:45 -0400
Subject: [PATCH 21/26] Btrfs: fix repair code for RAID10

btrfs_map_block sets mirror_num, so that the repair code knows
eventually which device gave us the read error. For RAID10, mirror_num
must be 1 or 2. Before this fix mirror_num was incorrectly related to
our stripe index.

Signed-off-by: Jan Schmidt
Signed-off-by: Chris Mason
---
 fs/btrfs/volumes.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3b984173d25bc2..1411b99555a4c1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3807,10 +3807,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
 		else {
+			int old_stripe_index = stripe_index;
 			stripe_index = find_live_mirror(map, stripe_index,
 					      map->sub_stripes, stripe_index +
 					      current->pid % map->sub_stripes);
-			mirror_num = stripe_index + 1;
+			mirror_num = stripe_index - old_stripe_index + 1;
 		}
 	} else {
 		/*

From 1daf3540fa77faea2f91d96bcaf07ce48ee827be Mon Sep 17 00:00:00 2001
From: Daniel J Blueman
Date: Fri, 27 Apr 2012 12:41:46 -0400
Subject: [PATCH 22/26] Btrfs: Prevent root_list corruption

I was seeing root_list corruption on unmount during fs resize in
3.4-rc4; add correct locking to address this.
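The shape of the fix, as a standalone C sketch (a pthread mutex
standing in for the trans_lock spinlock): every writer of a shared
list must take the same lock, or a concurrent traversal can observe a
half-updated node.

  #include <pthread.h>

  struct node { struct node *prev, *next; };

  static pthread_mutex_t trans_lock = PTHREAD_MUTEX_INITIALIZER;

  static void list_del_locked(struct node *n)
  {
          pthread_mutex_lock(&trans_lock);
          n->prev->next = n->next;        /* unlink ... */
          n->next->prev = n->prev;
          n->next = n;                    /* ... and re-init, like */
          n->prev = n;                    /* list_del_init() does  */
          pthread_mutex_unlock(&trans_lock);
  }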
Signed-off-by: Daniel J Blueman
Signed-off-by: Chris Mason
---
 fs/btrfs/relocation.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 017281dbb2a71f..5a105a086acfea 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1279,7 +1279,9 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
 		if (rb_node)
 			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
 	} else {
+		spin_lock(&root->fs_info->trans_lock);
 		list_del_init(&root->root_list);
+		spin_unlock(&root->fs_info->trans_lock);
 		kfree(node);
 	}
 	return 0;

From 1f699d38b6556c393ac80f1c23c2053502a51631 Mon Sep 17 00:00:00 2001
From: Stefan Behrens
Date: Fri, 27 Apr 2012 12:41:46 -0400
Subject: [PATCH 23/26] Btrfs: fix block_rsv and space_info lock ordering

may_commit_transaction() calls
	spin_lock(&space_info->lock);
	spin_lock(&delayed_rsv->lock);
and update_global_block_rsv() calls
	spin_lock(&block_rsv->lock);
	spin_lock(&sinfo->lock);

Lockdep complains about this at run time. Everywhere except in
update_global_block_rsv(), the space_info lock is the outer lock,
therefore the locking order in update_global_block_rsv() is changed.

Signed-off-by: Stefan Behrens
Signed-off-by: Chris Mason
---
 fs/btrfs/extent-tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 84497f8eb04371..6fc2e6f5aab859 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4214,8 +4214,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 	num_bytes = calc_global_metadata_size(fs_info);
 
-	spin_lock(&block_rsv->lock);
 	spin_lock(&sinfo->lock);
+	spin_lock(&block_rsv->lock);
 
 	block_rsv->size = num_bytes;
 
@@ -4241,8 +4241,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 		block_rsv->full = 1;
 	}
 
-	spin_unlock(&sinfo->lock);
 	spin_unlock(&block_rsv->lock);
+	spin_unlock(&sinfo->lock);
 }
 
 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)

From 7654b72417e10e294563496e25211200f9b8b6d3 Mon Sep 17 00:00:00 2001
From: Daniel J Blueman
Date: Fri, 27 Apr 2012 12:41:46 -0400
Subject: [PATCH 24/26] Btrfs: Fix space checking during fs resize

Fix out-of-space checking, addressing a warning and potential resource
leak when resizing the filesystem down while allocating blocks.

Signed-off-by: Daniel J Blueman
Reviewed-by: Josef Bacik
Signed-off-by: Chris Mason
---
 fs/btrfs/relocation.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 5a105a086acfea..646ee21bb035d9 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3813,7 +3813,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 		ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv,
 					    5);
 		if (ret < 0) {
-			if (ret != -EAGAIN) {
+			if (ret != -ENOSPC) {
 				err = ret;
 				WARN_ON(1);
 				break;

From fede766f28dd766d4e8feb321fdb19edb21ef6fb Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Fri, 27 Apr 2012 14:23:22 -0400
Subject: [PATCH 25/26] Btrfs: avoid deadlocks from GFP_KERNEL allocations during btrfs_real_readdir

Btrfs has an optimization where it will preallocate dentries during
readdir to fill in enough information to open the inode without an
extra lookup. But we're calling d_alloc, which is doing GFP_KERNEL
allocations, and that leads to deadlocks because our readdir code has
tree locks held.

For now, disable this optimization. We'll fix the gfp mask in the next
merge window.
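A standalone C sketch of the deadlock's shape (a pthread mutex
standing in for the tree lock, and alloc_with_reclaim() standing in
for a GFP_KERNEL allocation whose reclaim path can re-enter the
filesystem):

  #include <pthread.h>
  #include <stdlib.h>

  static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

  /* reclaim may write back dirty data, which needs tree_lock */
  static void reclaim(void)
  {
          pthread_mutex_lock(&tree_lock); /* deadlocks if already held */
          /* ... write back dirty pages ... */
          pthread_mutex_unlock(&tree_lock);
  }

  static void *alloc_with_reclaim(size_t n)
  {
          void *p = malloc(n);

          if (!p) {                       /* memory pressure: */
                  reclaim();              /* GFP_KERNEL may do this */
                  p = malloc(n);
          }
          return p;
  }

  int main(void)
  {
          pthread_mutex_lock(&tree_lock); /* readdir holds tree locks */
          alloc_with_reclaim(1 << 20);    /* may call reclaim() above */
          pthread_mutex_unlock(&tree_lock);
          return 0;
  }

GFP_NOFS tells reclaim not to re-enter the filesystem; removing the
allocation avoids the question entirely.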
Signed-off-by: Chris Mason
---
 fs/btrfs/inode.c | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d953f8820464e4..3ce7805d1117ee 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4192,7 +4192,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	struct btrfs_path *path;
 	struct list_head ins_list;
 	struct list_head del_list;
-	struct qstr q;
 	int ret;
 	struct extent_buffer *leaf;
 	int slot;
@@ -4283,7 +4282,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 
 		while (di_cur < di_total) {
 			struct btrfs_key location;
-			struct dentry *tmp;
 
 			if (verify_dir_item(root, leaf, di))
 				break;
@@ -4304,33 +4302,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
 
-			q.name = name_ptr;
-			q.len = name_len;
-			q.hash = full_name_hash(q.name, q.len);
-			tmp = d_lookup(filp->f_dentry, &q);
-			if (!tmp) {
-				struct btrfs_key *newkey;
-
-				newkey = kzalloc(sizeof(struct btrfs_key),
-						 GFP_NOFS);
-				if (!newkey)
-					goto no_dentry;
-				tmp = d_alloc(filp->f_dentry, &q);
-				if (!tmp) {
-					kfree(newkey);
-					dput(tmp);
-					goto no_dentry;
-				}
-				memcpy(newkey, &location,
-				       sizeof(struct btrfs_key));
-				tmp->d_fsdata = newkey;
-				tmp->d_flags |= DCACHE_NEED_LOOKUP;
-				d_rehash(tmp);
-				dput(tmp);
-			} else {
-				dput(tmp);
-			}
-no_dentry:
+
 			/* is this a reference to our own snapshot? If so
 			 * skip it.
 			 *

From dc7fdde39e4962b1a88741f7eba2a6b3be1285d8 Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Fri, 27 Apr 2012 14:31:29 -0400
Subject: [PATCH 26/26] Btrfs: reduce lock contention during extent insertion

We're spending huge amounts of time on lock contention during end_io
processing because we unconditionally assume we are overwriting an
existing extent in the file for each IO.

This checks to see if we are outside i_size, and if so, it uses a less
expensive readonly search of the btree to look for existing extents.

Signed-off-by: Chris Mason
---
 fs/btrfs/file.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d83260d7498fe2..53bf2d764bbc4f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -567,6 +567,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 	int extent_type;
 	int recow;
 	int ret;
+	int modify_tree = -1;
 
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -575,10 +576,13 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 	if (!path)
 		return -ENOMEM;
 
+	if (start >= BTRFS_I(inode)->disk_i_size)
+		modify_tree = 0;
+
 	while (1) {
 		recow = 0;
 		ret = btrfs_lookup_file_extent(trans, root, path, ino,
-					       search_start, -1);
+					       search_start, modify_tree);
 		if (ret < 0)
 			break;
 		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
@@ -634,7 +638,8 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 		}
 
 		search_start = max(key.offset, start);
-		if (recow) {
+		if (recow || !modify_tree) {
+			modify_tree = -1;
 			btrfs_release_path(path);
 			continue;
 		}
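A closing note on the modify_tree values threaded through here:
btrfs_lookup_file_extent() passes them down as its mod argument, which
in kernels of this vintage is translated roughly as follows (a sketch
of the mapping, not a verbatim quote):

  /* mod < 0: prepare for deletion (ins_len = -1) and COW the path;
   * mod > 0: COW the path without reserving insert/delete space;
   * mod = 0: plain read-only search, no COW at all */
  int ins_len = mod < 0 ? -1 : 0;
  int cow = mod != 0;

So a write entirely beyond disk_i_size, where no existing file extent
can overlap, starts with the cheap read-only search (modify_tree = 0)
and falls back to the COWing search (modify_tree = -1) only when an
overlap is actually found.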