Btrfs: add mount -o auto_defrag
This will detect small random writes into files and
queue them up for an auto defrag process.  It isn't well suited to
database workloads yet, but works for smaller files such as rpm, sqlite
or bdb databases.
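
For context, the new behaviour is opt-in at mount time. Below is a minimal
userspace sketch of enabling it via mount(2); the option token is assumed to
be "autodefrag" here, since the super.c hunk that parses it is not shown in
this excerpt:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* device and mount point are placeholders */
	if (mount("/dev/sdb1", "/mnt/btrfs", "btrfs", 0, "autodefrag")) {
		perror("mount");
		return 1;
	}
	return 0;
}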

Signed-off-by: Chris Mason <chris.mason@oracle.com>
chrismason-xx committed May 26, 2011
1 parent d6c0cb3 commit 4cb5300
Showing 8 changed files with 678 additions and 135 deletions.
1 change: 1 addition & 0 deletions fs/btrfs/btrfs_inode.h
@@ -153,6 +153,7 @@ struct btrfs_inode {
unsigned ordered_data_close:1;
unsigned orphan_meta_reserved:1;
unsigned dummy_inode:1;
unsigned in_defrag:1;

/*
* always compress this one file
45 changes: 44 additions & 1 deletion fs/btrfs/ctree.h
@@ -1074,6 +1074,11 @@ struct btrfs_fs_info {
/* all metadata allocations go through this cluster */
struct btrfs_free_cluster meta_alloc_cluster;

/* auto defrag inodes go here */
spinlock_t defrag_inodes_lock;
struct rb_root defrag_inodes;
atomic_t defrag_running;

spinlock_t ref_cache_lock;
u64 total_ref_cache_size;

@@ -1205,6 +1210,38 @@ struct btrfs_root {
struct super_block anon_super;
};

struct btrfs_ioctl_defrag_range_args {
/* start of the defrag operation */
__u64 start;

/* number of bytes to defrag, use (u64)-1 to say all */
__u64 len;

/*
* flags for the operation, which can include turning
* on compression for this one defrag
*/
__u64 flags;

/*
* any extent bigger than this will be considered
* already defragged. Use 0 to take the kernel default
* Use 1 to say every single extent must be rewritten
*/
__u32 extent_thresh;

/*
* which compression method to use if turning on compression
* for this defrag operation. If unspecified, zlib will
* be used
*/
__u32 compress_type;

/* spare for later */
__u32 unused[4];
};
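
This args struct also backs the manual defrag ioctl. A hedged userspace
sketch of a whole-file defrag request follows, assuming the
BTRFS_IOC_DEFRAG_RANGE ioctl number and the uapi linux/btrfs.h header
(neither is shown in this excerpt; on kernels of this vintage the
definitions lived in fs/btrfs/ioctl.h):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int defrag_whole_file(const char *path)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd = open(path, O_RDWR);

	if (fd < 0)
		return -1;

	memset(&range, 0, sizeof(range));
	range.len = (__u64)-1;		/* defrag the whole file */
	range.extent_thresh = 0;	/* take the kernel default */

	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");

	close(fd);
	return 0;
}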


/*
* inode items have the data typically returned from stat and store other
* info about object characteristics. There is one for every file and dir in
@@ -1302,6 +1339,7 @@ struct btrfs_root {
#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)

#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
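
These token-pasting helpers let callers name the flag directly. A short
sketch of how the new bit is set and later tested; btrfs_test_opt is used
further down in this commit but is not defined in this hunk:

/* at mount-option parse time; expands to
 * fs_info->mount_opt |= BTRFS_MOUNT_AUTO_DEFRAG
 */
btrfs_set_opt(fs_info->mount_opt, AUTO_DEFRAG);

/* later, e.g. at the top of btrfs_add_inode_defrag() */
if (!btrfs_test_opt(root, AUTO_DEFRAG))
	return 0;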
@@ -2528,8 +2566,13 @@ extern const struct dentry_operations btrfs_dentry_operations;
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
void btrfs_update_iflags(struct inode *inode);
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);

int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_pages);
/* file.c */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct inode *inode);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, int datasync);
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned);
12 changes: 12 additions & 0 deletions fs/btrfs/disk-io.c
@@ -1475,6 +1475,7 @@ static int cleaner_kthread(void *arg)
btrfs_run_delayed_iputs(root);
btrfs_clean_old_snapshots(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
btrfs_run_defrag_inodes(root->fs_info);
}

if (freezing(current)) {
@@ -1616,6 +1617,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->ref_cache_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);

init_completion(&fs_info->kobj_unregister);
fs_info->tree_root = tree_root;
@@ -1638,9 +1640,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->defrag_running, 0);
fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;

fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
@@ -2501,6 +2505,14 @@ int close_ctree(struct btrfs_root *root)
smp_mb();

btrfs_scrub_cancel(root);

/* wait for any defraggers to finish */
wait_event(fs_info->transaction_wait,
(atomic_read(&fs_info->defrag_running) == 0));

/* clear out the rbtree of defraggable inodes */
btrfs_run_defrag_inodes(root->fs_info);

btrfs_put_block_group_cache(fs_info);

/*
257 changes: 257 additions & 0 deletions fs/btrfs/file.c
@@ -40,6 +40,263 @@
#include "locking.h"
#include "compat.h"

/*
* when auto defrag is enabled we
* queue up these defrag structs to remember which
* inodes need defragging passes
*/
struct inode_defrag {
struct rb_node rb_node;
/* objectid */
u64 ino;
/*
* transid where the defrag was added, we search for
* extents newer than this
*/
u64 transid;

/* root objectid */
u64 root;

/* last offset we were able to defrag */
u64 last_offset;

/* if we've wrapped around back to zero once already */
int cycled;
};

/* insert a record for an inode into the defrag tree. The lock
* must be held already
*
* If you're inserting a record for an older transid than an
* existing record, the transid already in the tree is lowered
*
* If an existing record is found, the defrag item you
* pass in is freed
*/
static int __btrfs_add_inode_defrag(struct inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct inode_defrag *entry;
struct rb_node **p;
struct rb_node *parent = NULL;

p = &root->fs_info->defrag_inodes.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct inode_defrag, rb_node);

if (defrag->ino < entry->ino)
p = &parent->rb_left;
else if (defrag->ino > entry->ino)
p = &parent->rb_right;
else {
/* if we're reinserting an entry for
* an old defrag run, make sure to
* lower the transid of our existing record
*/
if (defrag->transid < entry->transid)
entry->transid = defrag->transid;
if (defrag->last_offset > entry->last_offset)
entry->last_offset = defrag->last_offset;
goto exists;
}
}
BTRFS_I(inode)->in_defrag = 1;
rb_link_node(&defrag->rb_node, parent, p);
rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
return 0;

exists:
kfree(defrag);
return 0;

}

/*
* insert a defrag record for this inode if auto defrag is
* enabled
*/
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct inode_defrag *defrag;
int ret = 0;
u64 transid;

if (!btrfs_test_opt(root, AUTO_DEFRAG))
return 0;

if (root->fs_info->closing)
return 0;

if (BTRFS_I(inode)->in_defrag)
return 0;

if (trans)
transid = trans->transid;
else
transid = BTRFS_I(inode)->root->last_trans;

defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
if (!defrag)
return -ENOMEM;

defrag->ino = inode->i_ino;
defrag->transid = transid;
defrag->root = root->root_key.objectid;

spin_lock(&root->fs_info->defrag_inodes_lock);
if (!BTRFS_I(inode)->in_defrag)
ret = __btrfs_add_inode_defrag(inode, defrag);
spin_unlock(&root->fs_info->defrag_inodes_lock);
return ret;
}

/*
* must be called with the defrag_inodes lock held
*/
struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
struct rb_node **next)
{
struct inode_defrag *entry = NULL;
struct rb_node *p;
struct rb_node *parent = NULL;

p = info->defrag_inodes.rb_node;
while (p) {
parent = p;
entry = rb_entry(parent, struct inode_defrag, rb_node);

if (ino < entry->ino)
p = parent->rb_left;
else if (ino > entry->ino)
p = parent->rb_right;
else
return entry;
}

if (next) {
while (parent && ino > entry->ino) {
parent = rb_next(parent);
entry = rb_entry(parent, struct inode_defrag, rb_node);
}
*next = parent;
}
return NULL;
}

/*
* run through the list of inodes in the FS that need
* defragging
*/
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
struct inode_defrag *defrag;
struct btrfs_root *inode_root;
struct inode *inode;
struct rb_node *n;
struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
u64 first_ino = 0;
int num_defrag;
int defrag_batch = 1024;

memset(&range, 0, sizeof(range));
range.len = (u64)-1;

atomic_inc(&fs_info->defrag_running);
spin_lock(&fs_info->defrag_inodes_lock);
while (1) {
n = NULL;

/* find an inode to defrag */
defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
if (!defrag) {
if (n)
defrag = rb_entry(n, struct inode_defrag, rb_node);
else if (first_ino) {
first_ino = 0;
continue;
} else {
break;
}
}

/* remove it from the rbtree */
first_ino = defrag->ino + 1;
rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);

if (fs_info->closing)
goto next_free;

spin_unlock(&fs_info->defrag_inodes_lock);

/* get the inode */
key.objectid = defrag->root;
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
key.offset = (u64)-1;
inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(inode_root))
goto next;

key.objectid = defrag->ino;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;

inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
if (IS_ERR(inode))
goto next;

/* do a chunk of defrag */
BTRFS_I(inode)->in_defrag = 0;
range.start = defrag->last_offset;
num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
defrag_batch);
/*
* if we filled the whole defrag batch, there
* must be more work to do. Queue this defrag
* again
*/
if (num_defrag == defrag_batch) {
defrag->last_offset = range.start;
__btrfs_add_inode_defrag(inode, defrag);
/*
* we don't want to kfree defrag, we added it back to
* the rbtree
*/
defrag = NULL;
} else if (defrag->last_offset && !defrag->cycled) {
/*
* we didn't fill our defrag batch, but
* we didn't start at zero. Make sure we loop
* around to the start of the file.
*/
defrag->last_offset = 0;
defrag->cycled = 1;
__btrfs_add_inode_defrag(inode, defrag);
defrag = NULL;
}

iput(inode);
next:
spin_lock(&fs_info->defrag_inodes_lock);
next_free:
kfree(defrag);
}
spin_unlock(&fs_info->defrag_inodes_lock);

atomic_dec(&fs_info->defrag_running);

/*
* during unmount, we use the transaction_wait queue to
* wait for the defragger to stop
*/
wake_up(&fs_info->transaction_wait);
return 0;
}
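
The write-path hook that feeds this queue lives in one of the changed files
not shown in this excerpt. A hedged sketch of the shape such a caller takes;
the helper name and the 64k threshold below are illustrative assumptions,
not taken from this diff:

/* illustrative only: name and threshold are assumptions */
static void maybe_queue_defrag(struct btrfs_trans_handle *trans,
			       struct inode *inode, u64 num_bytes)
{
	u64 small_write = 64 * 1024;

	/* small random writes are the signal that a file is fragmenting */
	if (num_bytes < small_write)
		btrfs_add_inode_defrag(trans, inode);
}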

/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code.
