Skip to content

Commit

Permalink
ext4: limit the length of per-inode prealloc list
Browse files Browse the repository at this point in the history
In the scenario of writing sparse files, the per-inode prealloc list may
be very long, resulting in high overhead for ext4_mb_use_preallocated().
To circumvent this problem, we limit the maximum length of per-inode
prealloc list to 512 and allow users to modify it.

After patching, we observed that the sys ratio of cpu has dropped, and
the system throughput has increased significantly. We created a process
to write the sparse file, and the running time of the process on the
fixed kernel was significantly reduced, as follows:

Running time on unfixed kernel:
[root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
real    0m2.051s
user    0m0.008s
sys     0m2.026s

Running time on fixed kernel:
[root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
real    0m0.471s
user    0m0.004s
sys     0m0.395s

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Link: https://lore.kernel.org/r/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
  • Loading branch information
brookxu-cn authored and tytso committed Aug 19, 2020
1 parent 66d5e02 commit 27bc446
Show file tree
Hide file tree
Showing 13 changed files with 104 additions and 29 deletions.
3 changes: 3 additions & 0 deletions Documentation/admin-guide/ext4.rst
Expand Up @@ -482,6 +482,9 @@ Files in /sys/fs/ext4/<devname>:
multiple of this tuning parameter if the stripe size is not set in the
ext4 superblock

mb_max_inode_prealloc
The maximum length of the per-inode ext4_prealloc_space list.

mb_max_to_scan
The maximum number of extents the multiblock allocator will search to
find the best extent.
Expand Down
4 changes: 3 additions & 1 deletion fs/ext4/ext4.h
Expand Up @@ -1070,6 +1070,7 @@ struct ext4_inode_info {
struct timespec64 i_crtime;

/* mballoc */
atomic_t i_prealloc_active;
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;

Expand Down Expand Up @@ -1518,6 +1519,7 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_mb_max_inode_prealloc;
unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
Expand Down Expand Up @@ -2682,7 +2684,7 @@ extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
extern int ext4_mb_reserve_blocks(struct super_block *, int);
extern void ext4_discard_preallocations(struct inode *);
extern void ext4_discard_preallocations(struct inode *, unsigned int);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
Expand Down
10 changes: 5 additions & 5 deletions fs/ext4/extents.c
Expand Up @@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
* i_mutex. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
ext4_discard_preallocations(inode);
ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
Expand Down Expand Up @@ -4266,7 +4266,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free().
*/
ext4_discard_preallocations(inode);
ext4_discard_preallocations(inode, 0);
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
ext4_free_blocks(handle, inode, NULL, newblock,
Expand Down Expand Up @@ -5293,7 +5293,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
}

down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
ext4_discard_preallocations(inode, 0);

ret = ext4_es_remove_extent(inode, punch_start,
EXT_MAX_BLOCKS - punch_start);
Expand All @@ -5307,7 +5307,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
ext4_discard_preallocations(inode);
ext4_discard_preallocations(inode, 0);

ret = ext4_ext_shift_extents(inode, handle, punch_stop,
punch_stop - punch_start, SHIFT_LEFT);
Expand Down Expand Up @@ -5439,7 +5439,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
goto out_stop;

down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
ext4_discard_preallocations(inode, 0);

path = ext4_find_extent(inode, offset_lblk, NULL, 0);
if (IS_ERR(path)) {
Expand Down
2 changes: 1 addition & 1 deletion fs/ext4/file.c
Expand Up @@ -147,7 +147,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
(atomic_read(&inode->i_writecount) == 1) &&
!EXT4_I(inode)->i_reserved_data_blocks) {
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
Expand Down
2 changes: 1 addition & 1 deletion fs/ext4/indirect.c
Expand Up @@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
* i_mutex. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
ext4_discard_preallocations(inode);
ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
Expand Down
6 changes: 3 additions & 3 deletions fs/ext4/inode.c
Expand Up @@ -383,7 +383,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
*/
if ((ei->i_reserved_data_blocks == 0) &&
!inode_is_open_for_write(inode))
ext4_discard_preallocations(inode);
ext4_discard_preallocations(inode, 0);
}

static int __check_block_validity(struct inode *inode, const char *func,
Expand Down Expand Up @@ -4055,7 +4055,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (stop_block > first_block) {

down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
ext4_discard_preallocations(inode, 0);

ret = ext4_es_remove_extent(inode, first_block,
stop_block - first_block);
Expand Down Expand Up @@ -4210,7 +4210,7 @@ int ext4_truncate(struct inode *inode)

down_write(&EXT4_I(inode)->i_data_sem);

ext4_discard_preallocations(inode);
ext4_discard_preallocations(inode, 0);

if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
err = ext4_ext_truncate(handle, inode);
Expand Down
2 changes: 1 addition & 1 deletion fs/ext4/ioctl.c
Expand Up @@ -202,7 +202,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
reset_inode_seed(inode);
reset_inode_seed(inode_bl);

ext4_discard_preallocations(inode);
ext4_discard_preallocations(inode, 0);

err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
Expand Down
74 changes: 66 additions & 8 deletions fs/ext4/mballoc.c
Expand Up @@ -2878,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
/*
* The default group preallocation is 512, which for 4k block
* sizes translates to 2 megabytes. However for bigalloc file
Expand Down Expand Up @@ -3816,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
}

/*
 * Flag @pa as deleted.
 *
 * If the pa is already flagged, a warning describing it is logged and
 * nothing else is changed. Otherwise pa_deleted is set and, for an inode
 * pa (MB_INODE_PA), the i_prealloc_active counter of the owning inode is
 * decremented.
 */
static void ext4_mb_mark_pa_deleted(struct super_block *sb,
				    struct ext4_prealloc_space *pa)
{
	/* marking twice is unexpected: report it and leave the counter alone */
	if (pa->pa_deleted) {
		ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
			     pa->pa_type, pa->pa_pstart, pa->pa_lstart,
			     pa->pa_len);
		return;
	}

	pa->pa_deleted = 1;

	/* the active counter is only adjusted for inode pa's */
	if (pa->pa_type != MB_INODE_PA)
		return;

	atomic_dec(&EXT4_I(pa->pa_inode)->i_prealloc_active);
}

static void ext4_mb_pa_callback(struct rcu_head *head)
{
struct ext4_prealloc_space *pa;
Expand Down Expand Up @@ -3848,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
return;
}

pa->pa_deleted = 1;
ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);

grp_blk = pa->pa_pstart;
Expand Down Expand Up @@ -3972,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
spin_lock(pa->pa_obj_lock);
list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
atomic_inc(&ei->i_prealloc_active);
}

/*
Expand Down Expand Up @@ -4182,7 +4204,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
}

/* seems this one can be freed ... */
pa->pa_deleted = 1;
ext4_mb_mark_pa_deleted(sb, pa);

/* we can trust pa_free ... */
free += pa->pa_free;
Expand Down Expand Up @@ -4245,7 +4267,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
*
* FIXME!! Make sure it is valid at all the call sites
*/
void ext4_discard_preallocations(struct inode *inode)
void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct super_block *sb = inode->i_sb;
Expand All @@ -4263,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode)

mb_debug(sb, "discard preallocation for inode %lu\n",
inode->i_ino);
trace_ext4_discard_preallocations(inode);
trace_ext4_discard_preallocations(inode,
atomic_read(&ei->i_prealloc_active), needed);

INIT_LIST_HEAD(&list);

if (needed == 0)
needed = UINT_MAX;

repeat:
/* first, collect all pa's in the inode */
spin_lock(&ei->i_prealloc_lock);
while (!list_empty(&ei->i_prealloc_list)) {
pa = list_entry(ei->i_prealloc_list.next,
while (!list_empty(&ei->i_prealloc_list) && needed) {
pa = list_entry(ei->i_prealloc_list.prev,
struct ext4_prealloc_space, pa_inode_list);
BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
spin_lock(&pa->pa_lock);
Expand All @@ -4288,10 +4314,11 @@ void ext4_discard_preallocations(struct inode *inode)

}
if (pa->pa_deleted == 0) {
pa->pa_deleted = 1;
ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
list_add(&pa->u.pa_tmp_list, &list);
needed--;
continue;
}

Expand Down Expand Up @@ -4592,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
BUG_ON(pa->pa_type != MB_GROUP_PA);

/* seems this one can be freed ... */
pa->pa_deleted = 1;
ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);

list_del_rcu(&pa->pa_inode_list);
Expand Down Expand Up @@ -4690,11 +4717,30 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
return ;
}

/*
 * If the per-inode prealloc list is too long, trim some PAs.
 *
 * Trimming only starts once the number of active PAs exceeds
 * s_mb_max_inode_prealloc by more than a quarter of that limit plus one;
 * ext4_discard_preallocations() is then asked to drop the excess over the
 * limit.
 */
static void ext4_mb_trim_inode_pa(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	/*
	 * s_mb_max_inode_prealloc is writable through sysfs; take one copy so
	 * that the threshold test and the excess passed down below are both
	 * computed from the same limit.
	 * NOTE(review): consider READ_ONCE() here to force a single load.
	 */
	unsigned int limit = sbi->s_mb_max_inode_prealloc;
	unsigned int slack = (limit >> 2) + 1;
	unsigned int active = atomic_read(&ei->i_prealloc_active);

	/*
	 * Compare the excess against the slack instead of testing
	 * active > limit + slack: that sum wraps around for large limits,
	 * which would make the test pass for almost any list length.
	 */
	if (active <= limit || active - limit <= slack)
		return;

	ext4_discard_preallocations(inode, active - limit);
}

/*
* release all resource we used in allocation
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
struct inode *inode = ac->ac_inode;
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
Expand All @@ -4720,6 +4766,17 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
ext4_mb_add_n_trim(ac);
}
}

if (pa->pa_type == MB_INODE_PA) {
/*
* treat per-inode prealloc list as a lru list, then try
* to trim the least recently used PA.
*/
spin_lock(pa->pa_obj_lock);
list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
}

ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
Expand All @@ -4729,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
ext4_mb_trim_inode_pa(inode);
return 0;
}

Expand Down
4 changes: 4 additions & 0 deletions fs/ext4/mballoc.h
Expand Up @@ -73,6 +73,10 @@
*/
#define MB_DEFAULT_GROUP_PREALLOC 512

/*
* maximum length of inode prealloc list
*/
#define MB_DEFAULT_MAX_INODE_PREALLOC 512

struct ext4_free_data {
/* this links the free block information from sb_info */
Expand Down
4 changes: 2 additions & 2 deletions fs/ext4/move_extent.c
Expand Up @@ -686,8 +686,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,

out:
if (*moved_len) {
ext4_discard_preallocations(orig_inode);
ext4_discard_preallocations(donor_inode);
ext4_discard_preallocations(orig_inode, 0);
ext4_discard_preallocations(donor_inode, 0);
}

ext4_ext_drop_refs(path);
Expand Down
3 changes: 2 additions & 1 deletion fs/ext4/super.c
Expand Up @@ -1127,6 +1127,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
inode_set_iversion(&ei->vfs_inode, 1);
spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
atomic_set(&ei->i_prealloc_active, 0);
spin_lock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
Expand Down Expand Up @@ -1220,7 +1221,7 @@ void ext4_clear_inode(struct inode *inode)
{
invalidate_inode_buffers(inode);
clear_inode(inode);
ext4_discard_preallocations(inode);
ext4_discard_preallocations(inode, 0);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
Expand Down
2 changes: 2 additions & 0 deletions fs/ext4/sysfs.c
Expand Up @@ -218,6 +218,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
Expand Down Expand Up @@ -264,6 +265,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_order2_req),
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
ATTR_LIST(mb_max_inode_prealloc),
ATTR_LIST(max_writeback_mb_bump),
ATTR_LIST(extent_max_zeroout_kb),
ATTR_LIST(trigger_fs_error),
Expand Down
17 changes: 11 additions & 6 deletions include/trace/events/ext4.h
Expand Up @@ -746,24 +746,29 @@ TRACE_EVENT(ext4_mb_release_group_pa,
);

TRACE_EVENT(ext4_discard_preallocations,
TP_PROTO(struct inode *inode),
TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed),

TP_ARGS(inode),
TP_ARGS(inode, len, needed),

TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( dev_t, dev )
__field( ino_t, ino )
__field( unsigned int, len )
__field( unsigned int, needed )

),

TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->len = len;
__entry->needed = needed;
),

TP_printk("dev %d,%d ino %lu",
TP_printk("dev %d,%d ino %lu len: %u needed %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino)
(unsigned long) __entry->ino, __entry->len,
__entry->needed)
);

TRACE_EVENT(ext4_mb_discard_preallocations,
Expand Down

0 comments on commit 27bc446

Please sign in to comment.