Skip to content

Commit

Permalink
ext4: fix races between page faults and hole punching
Browse files Browse the repository at this point in the history
Currently, page faults and hole punching are completely unsynchronized.
This can result in page fault faulting in a page into a range that we
are punching after truncate_pagecache_range() has been called and thus
we can end up with a page mapped to disk blocks that will be shortly
freed. Filesystem corruption will shortly follow. Note that the same
race is avoided for truncate by checking page fault offset against
i_size but there isn't similar mechanism available for punching holes.

Fix the problem by creating new rw semaphore i_mmap_sem in inode and
grab it for writing over truncate, hole punching, and other functions
removing blocks from extent tree and for read over page faults. We
cannot easily use i_data_sem for this since that ranks below transaction
start and we need something ranking above it so that it can be held over
the whole truncate / hole punching operation. Also remove various
workarounds we had in the code to reduce race window when page fault
could have created pages with stale mapping information.

Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
  • Loading branch information
Jan Kara authored and tytso committed Dec 7, 2015
1 parent f41683a commit ea3d720
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 42 deletions.
10 changes: 10 additions & 0 deletions fs/ext4/ext4.h
Expand Up @@ -910,6 +910,15 @@ struct ext4_inode_info {
* by other means, so we have i_data_sem.
*/
struct rw_semaphore i_data_sem;
/*
* i_mmap_sem is for serializing page faults with truncate / punch hole
* operations. We have to make sure that new page cannot be faulted in
* a section of the inode that is being punched. We cannot easily use
* i_data_sem for this since we need protection for the whole punch
* operation and i_data_sem ranks below transaction start so we have
* to occasionally drop it.
*/
struct rw_semaphore i_mmap_sem;
struct inode vfs_inode;
struct jbd2_inode *jinode;

Expand Down Expand Up @@ -2484,6 +2493,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
Expand Down
54 changes: 30 additions & 24 deletions fs/ext4/extents.c
Expand Up @@ -4770,7 +4770,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
int partial_begin, partial_end;
loff_t start, end;
ext4_lblk_t lblk;
struct address_space *mapping = inode->i_mapping;
unsigned int blkbits = inode->i_blkbits;

trace_ext4_zero_range(inode, offset, len, mode);
Expand All @@ -4785,17 +4784,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
return ret;
}

/*
* Write out all dirty pages to avoid race conditions
* Then release them.
*/
if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
ret = filemap_write_and_wait_range(mapping, offset,
offset + len - 1);
if (ret)
return ret;
}

/*
* Round up offset. This is not fallocate, we neet to zero out
* blocks, so convert interior block aligned part of the range to
Expand Down Expand Up @@ -4856,16 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
EXT4_EX_NOCACHE);

/* Now release the pages and zero block aligned part of pages*/
truncate_pagecache_range(inode, start, end - 1);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);

/* Wait all existing dio workers, newcomers will block on i_mutex */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);

/*
* Prevent page faults from reinstantiating pages we have
* released from page cache.
*/
down_write(&EXT4_I(inode)->i_mmap_sem);
/* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);

ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
up_write(&EXT4_I(inode)->i_mmap_sem);
if (ret)
goto out_dio;
}
Expand Down Expand Up @@ -5524,17 +5518,22 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}

truncate_pagecache(inode, ioffset);

/* Wait for existing dio to complete */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);

/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
down_write(&EXT4_I(inode)->i_mmap_sem);
truncate_pagecache(inode, ioffset);

credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out_dio;
goto out_mmap;
}

down_write(&EXT4_I(inode)->i_data_sem);
Expand Down Expand Up @@ -5573,7 +5572,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)

out_stop:
ext4_journal_stop(handle);
out_dio:
out_mmap:
up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
mutex_unlock(&inode->i_mutex);
Expand Down Expand Up @@ -5660,17 +5660,22 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}

truncate_pagecache(inode, ioffset);

/* Wait for existing dio to complete */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);

/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
down_write(&EXT4_I(inode)->i_mmap_sem);
truncate_pagecache(inode, ioffset);

credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out_dio;
goto out_mmap;
}

/* Expand file to avoid data loss if there is error while shifting */
Expand Down Expand Up @@ -5741,7 +5746,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)

out_stop:
ext4_journal_stop(handle);
out_dio:
out_mmap:
up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
mutex_unlock(&inode->i_mutex);
Expand Down
66 changes: 57 additions & 9 deletions fs/ext4/file.c
Expand Up @@ -209,15 +209,18 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
int result;
handle_t *handle = NULL;
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
struct inode *inode = file_inode(vma->vm_file);
struct super_block *sb = inode->i_sb;
bool write = vmf->flags & FAULT_FLAG_WRITE;

if (write) {
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
}
} else
down_read(&EXT4_I(inode)->i_mmap_sem);

if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
Expand All @@ -228,8 +231,10 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (write) {
if (!IS_ERR(handle))
ext4_journal_stop(handle);
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
}
} else
up_read(&EXT4_I(inode)->i_mmap_sem);

return result;
}
Expand All @@ -246,10 +251,12 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
if (write) {
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
ext4_chunk_trans_blocks(inode,
PMD_SIZE / PAGE_SIZE));
}
} else
down_read(&EXT4_I(inode)->i_mmap_sem);

if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
Expand All @@ -260,30 +267,71 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
if (write) {
if (!IS_ERR(handle))
ext4_journal_stop(handle);
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
}
} else
up_read(&EXT4_I(inode)->i_mmap_sem);

return result;
}

static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext4_get_block_dax,
ext4_end_io_unwritten);
int err;
struct inode *inode = file_inode(vma->vm_file);

sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
down_read(&EXT4_I(inode)->i_mmap_sem);
err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
ext4_end_io_unwritten);
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);

return err;
}

/*
* Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
* handler we check for races agaist truncate. Note that since we cycle through
* i_mmap_sem, we are sure that also any hole punching that began before we
* were called is finished by now and so if it included part of the file we
* are working on, our pte will get unmapped and the check for pte_same() in
* wp_pfn_shared() fails. Thus fault gets retried and things work out as
* desired.
*/
static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
struct vm_fault *vmf)
{
struct inode *inode = file_inode(vma->vm_file);
struct super_block *sb = inode->i_sb;
int ret = VM_FAULT_NOPAGE;
loff_t size;

sb_start_pagefault(sb);
file_update_time(vma->vm_file);
down_read(&EXT4_I(inode)->i_mmap_sem);
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (vmf->pgoff >= size)
ret = VM_FAULT_SIGBUS;
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);

return ret;
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.pmd_fault = ext4_dax_pmd_fault,
.page_mkwrite = ext4_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
.pfn_mkwrite = ext4_dax_pfn_mkwrite,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
.fault = ext4_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
Expand Down
36 changes: 27 additions & 9 deletions fs/ext4/inode.c
Expand Up @@ -3623,6 +3623,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)

}

/* Wait all existing dio workers, newcomers will block on i_mutex */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);

/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
down_write(&EXT4_I(inode)->i_mmap_sem);
first_block_offset = round_up(offset, sb->s_blocksize);
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;

Expand All @@ -3631,10 +3640,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
truncate_pagecache_range(inode, first_block_offset,
last_block_offset);

/* Wait all existing dio workers, newcomers will block on i_mutex */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);

if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
else
Expand Down Expand Up @@ -3680,16 +3685,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);

/* Now release the pages again to reduce race window */
if (last_block_offset > first_block_offset)
truncate_pagecache_range(inode, first_block_offset,
last_block_offset);

inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
ext4_journal_stop(handle);
out_dio:
up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
mutex_unlock(&inode->i_mutex);
Expand Down Expand Up @@ -4823,13 +4824,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
} else
ext4_wait_for_tail_page_commit(inode);
}
down_write(&EXT4_I(inode)->i_mmap_sem);
/*
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
*/
truncate_pagecache(inode, inode->i_size);
if (shrink)
ext4_truncate(inode);
up_write(&EXT4_I(inode)->i_mmap_sem);
}

if (!rc) {
Expand Down Expand Up @@ -5278,6 +5281,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)

sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);

down_read(&EXT4_I(inode)->i_mmap_sem);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
Expand Down Expand Up @@ -5347,6 +5352,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
out_ret:
ret = block_page_mkwrite_return(ret);
out:
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
}

int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = file_inode(vma->vm_file);
int err;

down_read(&EXT4_I(inode)->i_mmap_sem);
err = filemap_fault(vma, vmf);
up_read(&EXT4_I(inode)->i_mmap_sem);

return err;
}
1 change: 1 addition & 0 deletions fs/ext4/super.c
Expand Up @@ -958,6 +958,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
init_rwsem(&ei->i_data_sem);
init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
}

Expand Down
2 changes: 2 additions & 0 deletions fs/ext4/truncate.h
Expand Up @@ -10,8 +10,10 @@
*/
static inline void ext4_truncate_failed_write(struct inode *inode)
{
down_write(&EXT4_I(inode)->i_mmap_sem);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
up_write(&EXT4_I(inode)->i_mmap_sem);
}

/*
Expand Down

0 comments on commit ea3d720

Please sign in to comment.