btrfs: rework chunk allocation to avoid exhaustion of the system chun…

…k array commit 79bd371 upstream. Commit eafa4fd ("btrfs: fix exhaustion of the system chunk array due to concurrent allocations") fixed a problem that resulted in exhausting the system chunk array in the superblock when there are many tasks allocating chunks in parallel. Basically too many tasks enter the first phase of chunk allocation without previous tasks having finished their second phase of allocation, resulting in too many system chunks being allocated. That was originally observed when running the fallocate tests of stress-ng on a PowerPC machine, using a node size of 64K. However that commit also introduced a deadlock where a task in phase 1 of the chunk allocation waited for another task that had allocated a system chunk to finish its phase 2, but that other task was waiting on an extent buffer lock held by the first task, therefore resulting in both tasks not making any progress. That change was later reverted by a patch with the subject "btrfs: fix deadlock with concurrent chunk allocations involving system chunks", since there is no simple and short solution to address it and the deadlock is relatively easy to trigger on zoned filesystems, while the system chunk array exhaustion is not so common. This change reworks the chunk allocation to avoid the system chunk array exhaustion. It accomplishes that by making the first phase of chunk allocation do the updates of the device items in the chunk btree and the insertion of the new chunk item in the chunk btree. This is done while under the protection of the chunk mutex (fs_info->chunk_mutex), in the same critical section that checks for available system space, allocates a new system chunk if needed and reserves system chunk space. This way we do not have chunk space reserved until the second phase completes. The same logic is applied to chunk removal as well, since it keeps reserved system space long after it is done updating the chunk btree. For direct allocation of system chunks, the previous behaviour remains, because otherwise we would deadlock on extent buffers of the chunk btree. Changes to the chunk btree are by large done by chunk allocation and chunk removal, which first reserve chunk system space and then later do changes to the chunk btree. The other remaining cases are uncommon and correspond to adding a device, removing a device and resizing a device. All these other cases do not pre-reserve system space, they modify the chunk btree right away, so they don't hold reserved space for a long period like chunk allocation and chunk removal do. The diff of this change is huge, but more than half of it is just addition of comments describing both how things work regarding chunk allocation and removal, including both the new behavior and the parts of the old behavior that did not change. CC: stable@vger.kernel.org # 5.12+ Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com> Tested-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Tested-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
xanmod · Jul 20, 2021 · d6479f0 · d6479f0
1 parent 789b24d
commit d6479f0
Show file tree

Hide file tree

Showing 7 changed files with 546 additions and 184 deletions.
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
@@ -97,6 +97,7 @@ struct btrfs_block_group {
 	unsigned int removed:1;
 	unsigned int to_copy:1;
 	unsigned int relocating_repair:1;
+	unsigned int chunk_item_inserted:1;
 
 	int disk_cache_state;
 
@@ -268,8 +269,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work);
 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
 int btrfs_read_block_groups(struct btrfs_fs_info *info);
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
-			   u64 type, u64 chunk_offset, u64 size);
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+						 u64 bytes_used, u64 type,
+						 u64 chunk_offset, u64 size);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
 			     bool do_chunk_alloc);

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
@@ -364,49 +364,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static struct extent_buffer *alloc_tree_block_no_bg_flush(
-					  struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  u64 parent_start,
-					  const struct btrfs_disk_key *disk_key,
-					  int level,
-					  u64 hint,
-					  u64 empty_size,
-					  enum btrfs_lock_nesting nest)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct extent_buffer *ret;
-
-	/*
-	 * If we are COWing a node/leaf from the extent, chunk, device or free
-	 * space trees, make sure that we do not finish block group creation of
-	 * pending block groups. We do this to avoid a deadlock.
-	 * COWing can result in allocation of a new chunk, and flushing pending
-	 * block groups (btrfs_create_pending_block_groups()) can be triggered
-	 * when finishing allocation of a new chunk. Creation of a pending block
-	 * group modifies the extent, chunk, device and free space trees,
-	 * therefore we could deadlock with ourselves since we are holding a
-	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
-	 * try to COW later.
-	 * For similar reasons, we also need to delay flushing pending block
-	 * groups when splitting a leaf or node, from one of those trees, since
-	 * we are holding a write lock on it and its parent or when inserting a
-	 * new root node for one of those trees.
-	 */
-	if (root == fs_info->extent_root ||
-	    root == fs_info->chunk_root ||
-	    root == fs_info->dev_root ||
-	    root == fs_info->free_space_root)
-		trans->can_flush_pending_bgs = false;
-
-	ret = btrfs_alloc_tree_block(trans, root, parent_start,
-				     root->root_key.objectid, disk_key, level,
-				     hint, empty_size, nest);
-	trans->can_flush_pending_bgs = true;
-
-	return ret;
-}
-
 /*
  * does the dirty work in cow of a single block.  The parent block (if
  * supplied) is updated to point to the new cow copy.  The new buffer is marked
@@ -455,8 +412,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
 		parent_start = parent->start;
 
-	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
-					   level, search_start, empty_size, nest);
+	cow = btrfs_alloc_tree_block(trans, root, parent_start,
+				     root->root_key.objectid, &disk_key, level,
+				     search_start, empty_size, nest);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -2458,9 +2416,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_node_key(lower, &lower_key, 0);
 
-	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
-					 root->node->start, 0,
-					 BTRFS_NESTING_NEW_ROOT);
+	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+				   &lower_key, level, root->node->start, 0,
+				   BTRFS_NESTING_NEW_ROOT);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
@@ -2589,8 +2547,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	mid = (c_nritems + 1) / 2;
 	btrfs_node_key(c, &disk_key, mid);
 
-	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
-					     c->start, 0, BTRFS_NESTING_SPLIT);
+	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+				       &disk_key, level, c->start, 0,
+				       BTRFS_NESTING_SPLIT);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
@@ -3381,10 +3340,10 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
 	 * use BTRFS_NESTING_NEW_ROOT.
 	 */
-	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
-					     l->start, 0, num_doubles ?
-					     BTRFS_NESTING_NEW_ROOT :
-					     BTRFS_NESTING_SPLIT);
+	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+				       &disk_key, 0, l->start, 0,
+				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
+				       BTRFS_NESTING_SPLIT);
 	if (IS_ERR(right))
 		return PTR_ERR(right);
 

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
@@ -254,8 +254,11 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
 }
 
 /*
- * To be called after all the new block groups attached to the transaction
- * handle have been created (btrfs_create_pending_block_groups()).
+ * To be called after doing the chunk btree updates right after allocating a new
+ * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
+ * chunk after all chunk btree updates and after finishing the second phase of
+ * chunk allocation (btrfs_create_pending_block_groups()) in case some block
+ * group had its chunk item insertion delayed to the second phase.
  */
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
 {
@@ -264,8 +267,6 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
 	if (!trans->chunk_bytes_reserved)
 		return;
 
-	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
-
 	btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
 				trans->chunk_bytes_reserved, NULL);
 	trans->chunk_bytes_reserved = 0;
@@ -699,7 +700,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 	h->fs_info = root->fs_info;
 
 	h->type = type;
-	h->can_flush_pending_bgs = true;
 	INIT_LIST_HEAD(&h->new_bgs);
 
 	smp_mb();

diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
@@ -134,7 +134,7 @@ struct btrfs_trans_handle {
 	short aborted;
 	bool adding_csums;
 	bool allocating_chunk;
-	bool can_flush_pending_bgs;
+	bool removing_chunk;
 	bool reloc_reserved;
 	bool in_fsync;
 	struct btrfs_root *root;