Skip to content

Commit

Permalink
vfs: add support for a lazytime mount option
Browse files Browse the repository at this point in the history
Add a new mount option which enables a new "lazytime" mode.  This mode
causes atime, mtime, and ctime updates to only be made to the
in-memory version of the inode.  The on-disk times will only get
updated when (a) if the inode needs to be updated for some non-time
related change, (b) if userspace calls fsync(), syncfs() or sync(), or
(c) just before an undeleted inode is evicted from memory.

This is OK according to POSIX because there are no guarantees after a
crash unless userspace explicitly requests via a fsync(2) call.

For workloads which feature a large number of random write to a
preallocated file, the lazytime mount option significantly reduces
writes to the inode table.  The repeated 4k writes to a single block
will result in undesirable stress on flash devices and SMR disk
drives.  Even on conventional HDD's, the repeated writes to the inode
table block will trigger Adjacent Track Interference (ATI) remediation
latencies, which very negatively impact long tail latencies --- which
is a very big deal for web serving tiers (for example).

Google-Bug-Id: 18297052

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
  • Loading branch information
tytso authored and Al Viro committed Feb 5, 2015
1 parent e36f014 commit 0ae45f6
Show file tree
Hide file tree
Showing 13 changed files with 186 additions and 35 deletions.
6 changes: 6 additions & 0 deletions fs/ext4/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -4840,11 +4840,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* If the inode is marked synchronous, we don't honour that here - doing
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
*
* If only the I_DIRTY_TIME flag is set, we can skip everything. If
* I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need
* to copy into the on-disk inode structure are the timestamp files.
*/
void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;

if (flags == I_DIRTY_TIME)
return;
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
goto out;
Expand Down
62 changes: 51 additions & 11 deletions fs/fs-writeback.c
Original file line number Diff line number Diff line change
Expand Up @@ -247,28 +247,41 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
return ret;
}

#define EXPIRE_DIRTY_ATIME 0x0001

/*
* Move expired (dirtied before work->older_than_this) dirty inodes from
* @delaying_queue to @dispatch_queue.
*/
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
int flags,
struct wb_writeback_work *work)
{
unsigned long *older_than_this = NULL;
unsigned long expire_time;
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
struct inode *inode;
int do_sb_sort = 0;
int moved = 0;

if ((flags & EXPIRE_DIRTY_ATIME) == 0)
older_than_this = work->older_than_this;
else if ((work->reason == WB_REASON_SYNC) == 0) {
expire_time = jiffies - (HZ * 86400);
older_than_this = &expire_time;
}
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
if (work->older_than_this &&
inode_dirtied_after(inode, *work->older_than_this))
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
list_move(&inode->i_wb_list, &tmp);
moved++;
if (flags & EXPIRE_DIRTY_ATIME)
set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
if (sb_is_blkdev_sb(inode->i_sb))
continue;
if (sb && sb != inode->i_sb)
Expand Down Expand Up @@ -309,9 +322,12 @@ static int move_expired_inodes(struct list_head *delaying_queue,
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
int moved;

assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
EXPIRE_DIRTY_ATIME, work);
trace_writeback_queue_io(wb, work, moved);
}

Expand Down Expand Up @@ -435,6 +451,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* updates after data IO completion.
*/
redirty_tail(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) {
list_move(&inode->i_wb_list, &wb->b_dirty_time);
} else {
/* The inode is clean. Remove from writeback lists. */
list_del_init(&inode->i_wb_list);
Expand Down Expand Up @@ -481,7 +499,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_lock(&inode->i_lock);

dirty = inode->i_state & I_DIRTY;
inode->i_state &= ~I_DIRTY;
if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
(inode->i_state & I_DIRTY_TIME)) ||
(inode->i_state & I_DIRTY_TIME_EXPIRED)) {
dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
trace_writeback_lazytime(inode);
}
inode->i_state &= ~dirty;

/*
* Paired with smp_mb() in __mark_inode_dirty(). This allows
Expand All @@ -501,8 +525,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)

spin_unlock(&inode->i_lock);

if (dirty & I_DIRTY_TIME)
mark_inode_dirty_sync(inode);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
if (dirty & ~I_DIRTY_PAGES) {
int err = write_inode(inode, wbc);
if (ret == 0)
ret = err;
Expand Down Expand Up @@ -550,7 +576,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* make sure inode is on some writeback list and leave it there unless
* we have completely cleaned the inode.
*/
if (!(inode->i_state & I_DIRTY) &&
if (!(inode->i_state & I_DIRTY_ALL) &&
(wbc->sync_mode != WB_SYNC_ALL ||
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
goto out;
Expand All @@ -565,7 +591,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* If inode is clean, remove it from writeback lists. Otherwise don't
* touch it. See comment above for explanation.
*/
if (!(inode->i_state & I_DIRTY))
if (!(inode->i_state & I_DIRTY_ALL))
list_del_init(&inode->i_wb_list);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
Expand Down Expand Up @@ -707,7 +733,7 @@ static long writeback_sb_inodes(struct super_block *sb,
wrote += write_chunk - wbc.nr_to_write;
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY))
if (!(inode->i_state & I_DIRTY_ALL))
wrote++;
requeue_inode(inode, wb, &wbc);
inode_sync_complete(inode);
Expand Down Expand Up @@ -1145,40 +1171,52 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
* page->mapping->host, so the page-dirtying time is recorded in the internal
* blockdev inode.
*/
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
struct backing_dev_info *bdi = NULL;
int dirtytime;

trace_writeback_mark_inode_dirty(inode, flags);

/*
* Don't do this for I_DIRTY_PAGES - that doesn't actually
* dirty the inode itself
*/
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
trace_writeback_dirty_inode_start(inode, flags);

if (sb->s_op->dirty_inode)
sb->s_op->dirty_inode(inode, flags);

trace_writeback_dirty_inode(inode, flags);
}
if (flags & I_DIRTY_INODE)
flags &= ~I_DIRTY_TIME;
dirtytime = flags & I_DIRTY_TIME;

/*
* Paired with smp_mb() in __writeback_single_inode() for the
* following lockless i_state test. See there for details.
*/
smp_mb();

if ((inode->i_state & flags) == flags)
if (((inode->i_state & flags) == flags) ||
(dirtytime && (inode->i_state & I_DIRTY_INODE)))
return;

if (unlikely(block_dump))
block_dump___mark_inode_dirty(inode);

spin_lock(&inode->i_lock);
if (dirtytime && (inode->i_state & I_DIRTY_INODE))
goto out_unlock_inode;
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;

if (flags & I_DIRTY_INODE)
inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;

/*
Expand Down Expand Up @@ -1225,8 +1263,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
}

inode->dirtied_when = jiffies;
list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
list_move(&inode->i_wb_list, dirtytime ?
&bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
spin_unlock(&bdi->wb.list_lock);
trace_writeback_dirty_inode_enqueue(inode);

if (wakeup_bdi)
bdi_wakeup_thread_delayed(bdi);
Expand Down
4 changes: 2 additions & 2 deletions fs/gfs2/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
int sync_state = inode->i_state & I_DIRTY;
int sync_state = inode->i_state & I_DIRTY_ALL;
struct gfs2_inode *ip = GFS2_I(inode);
int ret = 0, ret1 = 0;

Expand All @@ -668,7 +668,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
if (!gfs2_is_jdata(ip))
sync_state &= ~I_DIRTY_PAGES;
if (datasync)
sync_state &= ~I_DIRTY_SYNC;
sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);

if (sync_state) {
ret = sync_inode_metadata(inode, 1);
Expand Down
56 changes: 40 additions & 16 deletions fs/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <trace/events/writeback.h>
#include "internal.h"

/*
Expand All @@ -30,7 +31,7 @@
* inode_sb_list_lock protects:
* sb->s_inodes, inode->i_sb_list
* bdi->wb.list_lock protects:
* bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
* bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
* inode_hash_lock protects:
* inode_hashtable, inode->i_hash
*
Expand Down Expand Up @@ -416,7 +417,8 @@ static void inode_lru_list_add(struct inode *inode)
*/
void inode_add_lru(struct inode *inode)
{
if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) &&
if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
I_FREEING | I_WILL_FREE)) &&
!atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
inode_lru_list_add(inode);
}
Expand Down Expand Up @@ -647,7 +649,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
spin_unlock(&inode->i_lock);
continue;
}
if (inode->i_state & I_DIRTY && !kill_dirty) {
if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
spin_unlock(&inode->i_lock);
busy = 1;
continue;
Expand Down Expand Up @@ -1432,11 +1434,20 @@ static void iput_final(struct inode *inode)
*/
void iput(struct inode *inode)
{
if (inode) {
BUG_ON(inode->i_state & I_CLEAR);

if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
iput_final(inode);
if (!inode)
return;
BUG_ON(inode->i_state & I_CLEAR);
retry:
if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
atomic_inc(&inode->i_count);
inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
trace_writeback_lazytime_iput(inode);
mark_inode_dirty_sync(inode);
goto retry;
}
iput_final(inode);
}
}
EXPORT_SYMBOL(iput);
Expand Down Expand Up @@ -1495,14 +1506,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
return 0;
}

/*
* This does the actual work of updating an inodes time or version. Must have
* had called mnt_want_write() before calling this.
*/
static int update_time(struct inode *inode, struct timespec *time, int flags)
int generic_update_time(struct inode *inode, struct timespec *time, int flags)
{
if (inode->i_op->update_time)
return inode->i_op->update_time(inode, time, flags);
int iflags = I_DIRTY_TIME;

if (flags & S_ATIME)
inode->i_atime = *time;
Expand All @@ -1512,9 +1518,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
inode->i_ctime = *time;
if (flags & S_MTIME)
inode->i_mtime = *time;
mark_inode_dirty_sync(inode);

if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
iflags |= I_DIRTY_SYNC;
__mark_inode_dirty(inode, iflags);
return 0;
}
EXPORT_SYMBOL(generic_update_time);

/*
* This does the actual work of updating an inodes time or version. Must have
* had called mnt_want_write() before calling this.
*/
static int update_time(struct inode *inode, struct timespec *time, int flags)
{
int (*update_time)(struct inode *, struct timespec *, int);

update_time = inode->i_op->update_time ? inode->i_op->update_time :
generic_update_time;

return update_time(inode, time, flags);
}

/**
* touch_atime - update the access time
Expand Down
2 changes: 1 addition & 1 deletion fs/jfs/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return rc;

mutex_lock(&inode->i_mutex);
if (!(inode->i_state & I_DIRTY) ||
if (!(inode->i_state & I_DIRTY_ALL) ||
(datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
/* Make sure committed changes hit the disk */
jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
Expand Down
2 changes: 1 addition & 1 deletion fs/libfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,

mutex_lock(&inode->i_mutex);
ret = sync_mapping_buffers(inode->i_mapping);
if (!(inode->i_state & I_DIRTY))
if (!(inode->i_state & I_DIRTY_ALL))
goto out;
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
goto out;
Expand Down
1 change: 1 addition & 0 deletions fs/proc_namespace.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
{ MS_SYNCHRONOUS, ",sync" },
{ MS_DIRSYNC, ",dirsync" },
{ MS_MANDLOCK, ",mand" },
{ MS_LAZYTIME, ",lazytime" },
{ 0, NULL }
};
const struct proc_fs_info *fs_infop;
Expand Down
8 changes: 8 additions & 0 deletions fs/sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
*/
int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;

if (!file->f_op->fsync)
return -EINVAL;
if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
spin_lock(&inode->i_lock);
inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
mark_inode_dirty_sync(inode);
}
return file->f_op->fsync(file, start, end, datasync);
}
EXPORT_SYMBOL(vfs_fsync_range);
Expand Down
1 change: 1 addition & 0 deletions include/linux/backing-dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ struct bdi_writeback {
struct list_head b_dirty; /* dirty inodes */
struct list_head b_io; /* parked for writeback */
struct list_head b_more_io; /* parked for more writeback */
struct list_head b_dirty_time; /* time stamps are dirty */
spinlock_t list_lock; /* protects the b_* lists */
};

Expand Down

0 comments on commit 0ae45f6

Please sign in to comment.