
Commit ea3d720

Jan Kara authored and Theodore Ts'o (tytso) committed
ext4: fix races between page faults and hole punching
Currently, page faults and hole punching are completely unsynchronized. This can result in a page fault faulting a page into a range that we are punching after truncate_pagecache_range() has been called, and thus we can end up with a page mapped to disk blocks that will shortly be freed. Filesystem corruption will follow shortly. Note that the same race is avoided for truncate by checking the page fault offset against i_size, but there is no similar mechanism available for punching holes.

Fix the problem by creating a new rw semaphore, i_mmap_sem, in the inode and grabbing it for writing over truncate, hole punching, and other functions removing blocks from the extent tree, and for reading over page faults. We cannot easily use i_data_sem for this since it ranks below transaction start and we need something ranking above it so that it can be held over the whole truncate / hole punching operation. Also remove the various workarounds we had in the code to reduce the race window in which a page fault could have created pages with stale mapping information.

Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
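The serialization the patch introduces can be pictured outside the kernel. Below is a minimal, hypothetical userspace sketch using a POSIX rwlock in place of i_mmap_sem; punch_hole(), fault_in_page(), block_allocated[] and page_mapped[] are invented stand-ins for truncate_pagecache_range() plus block freeing and for the fault path, not ext4 APIs. The only point is the shape of the locking: the punch side holds the lock exclusively across unmapping pages and freeing blocks, so a concurrent fault (a shared holder) can never reinstate a page over blocks that are about to be freed.

/* Minimal sketch of the i_mmap_sem idea using a pthreads rwlock.
 * Hypothetical model, not ext4 code: "blocks" stand in for disk blocks,
 * "mapped" for page-cache pages mapped to them. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NBLOCKS 8

static pthread_rwlock_t i_mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
static bool block_allocated[NBLOCKS];
static bool page_mapped[NBLOCKS];

/* Hole punch: writer. Unmap pages, then free blocks, as one unit
 * that no fault can interleave with. */
static void punch_hole(int first, int last)
{
        pthread_rwlock_wrlock(&i_mmap_sem);
        for (int i = first; i <= last; i++)
                page_mapped[i] = false;        /* truncate_pagecache_range() analogue */
        for (int i = first; i <= last; i++)
                block_allocated[i] = false;    /* block-freeing analogue */
        pthread_rwlock_unlock(&i_mmap_sem);
}

/* Page fault: reader. May only map a page while its block still exists,
 * and no punch can run concurrently. */
static void *fault_in_page(void *arg)
{
        int blk = *(int *)arg;

        pthread_rwlock_rdlock(&i_mmap_sem);
        if (block_allocated[blk])
                page_mapped[blk] = true;
        pthread_rwlock_unlock(&i_mmap_sem);
        return NULL;
}

int main(void)
{
        int blk = 3;
        pthread_t t;

        for (int i = 0; i < NBLOCKS; i++)
                block_allocated[i] = true;

        pthread_create(&t, NULL, fault_in_page, &blk);
        punch_hole(2, 5);
        pthread_join(t, NULL);

        /* Invariant: a page is never left mapped over a freed block. */
        printf("block %d: allocated=%d mapped=%d\n",
               blk, block_allocated[blk], page_mapped[blk]);
        return 0;
}

Whichever way the race resolves, the fault either completes before the punch (and the punch then unmaps the page) or runs after it (and sees the block gone); the unsynchronized case the commit message describes, a page mapped over already-freed blocks, cannot occur.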
1 parent f41683a commit ea3d720

File tree: 6 files changed, +127 -42 lines changed


Diff for: fs/ext4/ext4.h

+10
@@ -910,6 +910,15 @@ struct ext4_inode_info {
          * by other means, so we have i_data_sem.
          */
         struct rw_semaphore i_data_sem;
+        /*
+         * i_mmap_sem is for serializing page faults with truncate / punch hole
+         * operations. We have to make sure that new page cannot be faulted in
+         * a section of the inode that is being punched. We cannot easily use
+         * i_data_sem for this since we need protection for the whole punch
+         * operation and i_data_sem ranks below transaction start so we have
+         * to occasionally drop it.
+         */
+        struct rw_semaphore i_mmap_sem;
         struct inode vfs_inode;
         struct jbd2_inode *jinode;

@@ -2484,6 +2493,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                              loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
                                          int used, int quota_claim);
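The comment added above fixes the new lock's place in the hierarchy: i_mmap_sem must be held across the whole punch, so it has to rank above transaction start, unlike i_data_sem, which ranks below it and must occasionally be dropped. Piecing that together with the call sites changed later in this commit gives the nesting sketched below; plain pthread mutexes stand in for the kernel primitives purely to show the order, so this is an illustration inferred from the patch, not kernel code.

/* Lock-ordering sketch inferred from this patch; pthread mutexes are
 * stand-ins for the kernel locks, used only to show nesting order. */
#include <pthread.h>

static pthread_mutex_t i_mutex    = PTHREAD_MUTEX_INITIALIZER; /* inode->i_mutex */
static pthread_mutex_t i_mmap_sem = PTHREAD_MUTEX_INITIALIZER; /* new lock, write mode */
static pthread_mutex_t journal    = PTHREAD_MUTEX_INITIALIZER; /* transaction start */
static pthread_mutex_t i_data_sem = PTHREAD_MUTEX_INITIALIZER; /* ranks below journal */

/* Hole punch / collapse / insert range acquire in this order and release
 * in reverse; page faults take only i_mmap_sem, in read mode. */
int main(void)
{
        pthread_mutex_lock(&i_mutex);
        pthread_mutex_lock(&i_mmap_sem);
        pthread_mutex_lock(&journal);
        pthread_mutex_lock(&i_data_sem);

        /* ... modify the extent tree and remove blocks here ... */

        pthread_mutex_unlock(&i_data_sem);
        pthread_mutex_unlock(&journal);
        pthread_mutex_unlock(&i_mmap_sem);
        pthread_mutex_unlock(&i_mutex);
        return 0;
}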

Diff for: fs/ext4/extents.c

+30 -24
@@ -4770,7 +4770,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
         int partial_begin, partial_end;
         loff_t start, end;
         ext4_lblk_t lblk;
-        struct address_space *mapping = inode->i_mapping;
         unsigned int blkbits = inode->i_blkbits;

         trace_ext4_zero_range(inode, offset, len, mode);
@@ -4785,17 +4784,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
                 return ret;
         }

-        /*
-         * Write out all dirty pages to avoid race conditions
-         * Then release them.
-         */
-        if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-                ret = filemap_write_and_wait_range(mapping, offset,
-                                                   offset + len - 1);
-                if (ret)
-                        return ret;
-        }
-
         /*
          * Round up offset. This is not fallocate, we neet to zero out
          * blocks, so convert interior block aligned part of the range to
@@ -4856,16 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
                 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
                           EXT4_EX_NOCACHE);

-                /* Now release the pages and zero block aligned part of pages*/
-                truncate_pagecache_range(inode, start, end - 1);
-                inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-
                 /* Wait all existing dio workers, newcomers will block on i_mutex */
                 ext4_inode_block_unlocked_dio(inode);
                 inode_dio_wait(inode);

+                /*
+                 * Prevent page faults from reinstantiating pages we have
+                 * released from page cache.
+                 */
+                down_write(&EXT4_I(inode)->i_mmap_sem);
+                /* Now release the pages and zero block aligned part of pages */
+                truncate_pagecache_range(inode, start, end - 1);
+                inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
                 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
                                              flags, mode);
+                up_write(&EXT4_I(inode)->i_mmap_sem);
                 if (ret)
                         goto out_dio;
         }
@@ -5524,17 +5518,22 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
                 goto out_mutex;
         }

-        truncate_pagecache(inode, ioffset);
-
         /* Wait for existing dio to complete */
         ext4_inode_block_unlocked_dio(inode);
         inode_dio_wait(inode);

+        /*
+         * Prevent page faults from reinstantiating pages we have released from
+         * page cache.
+         */
+        down_write(&EXT4_I(inode)->i_mmap_sem);
+        truncate_pagecache(inode, ioffset);
+
         credits = ext4_writepage_trans_blocks(inode);
         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
         if (IS_ERR(handle)) {
                 ret = PTR_ERR(handle);
-                goto out_dio;
+                goto out_mmap;
         }

         down_write(&EXT4_I(inode)->i_data_sem);
@@ -5573,7 +5572,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)

 out_stop:
         ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+        up_write(&EXT4_I(inode)->i_mmap_sem);
         ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
         mutex_unlock(&inode->i_mutex);
@@ -5660,17 +5660,22 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
                 goto out_mutex;
         }

-        truncate_pagecache(inode, ioffset);
-
         /* Wait for existing dio to complete */
         ext4_inode_block_unlocked_dio(inode);
         inode_dio_wait(inode);

+        /*
+         * Prevent page faults from reinstantiating pages we have released from
+         * page cache.
+         */
+        down_write(&EXT4_I(inode)->i_mmap_sem);
+        truncate_pagecache(inode, ioffset);
+
         credits = ext4_writepage_trans_blocks(inode);
         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
         if (IS_ERR(handle)) {
                 ret = PTR_ERR(handle);
-                goto out_dio;
+                goto out_mmap;
         }

         /* Expand file to avoid data loss if there is error while shifting */
@@ -5741,7 +5746,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)

 out_stop:
         ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+        up_write(&EXT4_I(inode)->i_mmap_sem);
         ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
         mutex_unlock(&inode->i_mutex);
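All three functions above follow the same shape: the new down_write(&EXT4_I(inode)->i_mmap_sem) is taken after direct I/O is drained and before the page cache is truncated, and the error label is renamed from out_dio to out_mmap so the semaphore is released even when ext4_journal_start() fails. A stripped-down, hypothetical userspace analogue of that goto-unwind pattern follows; collapse_range_sketch(), start_transaction() and stop_transaction() are invented names, and the DIO-blocking step is omitted.

/* Sketch of the goto-unwind pattern the relabeling preserves: each resource
 * acquired gets a matching label so early failures release exactly what was
 * taken so far. Userspace analogue, not ext4 code. */
#include <pthread.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t i_mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static int start_transaction(void) { return 0; }   /* stand-in; may fail */
static void stop_transaction(void) { }

static int collapse_range_sketch(void)
{
        int ret;

        pthread_mutex_lock(&i_mutex);
        pthread_rwlock_wrlock(&i_mmap_sem);     /* block page faults */
        /* truncate_pagecache() analogue would go here */

        ret = start_transaction();
        if (ret)
                goto out_mmap;                  /* was "out_dio" before the patch */

        /* shift extents, update i_size, ... */

        stop_transaction();
out_mmap:
        pthread_rwlock_unlock(&i_mmap_sem);
        pthread_mutex_unlock(&i_mutex);
        return ret;
}

int main(void)
{
        return collapse_range_sketch();
}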

Diff for: fs/ext4/file.c

+57 -9
@@ -209,15 +209,18 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
         int result;
         handle_t *handle = NULL;
-        struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+        struct inode *inode = file_inode(vma->vm_file);
+        struct super_block *sb = inode->i_sb;
         bool write = vmf->flags & FAULT_FLAG_WRITE;

         if (write) {
                 sb_start_pagefault(sb);
                 file_update_time(vma->vm_file);
+                down_read(&EXT4_I(inode)->i_mmap_sem);
                 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                                 EXT4_DATA_TRANS_BLOCKS(sb));
-        }
+        } else
+                down_read(&EXT4_I(inode)->i_mmap_sem);

         if (IS_ERR(handle))
                 result = VM_FAULT_SIGBUS;
@@ -228,8 +231,10 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
         if (write) {
                 if (!IS_ERR(handle))
                         ext4_journal_stop(handle);
+                up_read(&EXT4_I(inode)->i_mmap_sem);
                 sb_end_pagefault(sb);
-        }
+        } else
+                up_read(&EXT4_I(inode)->i_mmap_sem);

         return result;
 }
@@ -246,10 +251,12 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
         if (write) {
                 sb_start_pagefault(sb);
                 file_update_time(vma->vm_file);
+                down_read(&EXT4_I(inode)->i_mmap_sem);
                 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                 ext4_chunk_trans_blocks(inode,
                                                         PMD_SIZE / PAGE_SIZE));
-        }
+        } else
+                down_read(&EXT4_I(inode)->i_mmap_sem);

         if (IS_ERR(handle))
                 result = VM_FAULT_SIGBUS;
@@ -260,30 +267,71 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
         if (write) {
                 if (!IS_ERR(handle))
                         ext4_journal_stop(handle);
+                up_read(&EXT4_I(inode)->i_mmap_sem);
                 sb_end_pagefault(sb);
-        }
+        } else
+                up_read(&EXT4_I(inode)->i_mmap_sem);

         return result;
 }

 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-        return dax_mkwrite(vma, vmf, ext4_get_block_dax,
-                                ext4_end_io_unwritten);
+        int err;
+        struct inode *inode = file_inode(vma->vm_file);
+
+        sb_start_pagefault(inode->i_sb);
+        file_update_time(vma->vm_file);
+        down_read(&EXT4_I(inode)->i_mmap_sem);
+        err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
+                            ext4_end_io_unwritten);
+        up_read(&EXT4_I(inode)->i_mmap_sem);
+        sb_end_pagefault(inode->i_sb);
+
+        return err;
+}
+
+/*
+ * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
+ * handler we check for races agaist truncate. Note that since we cycle through
+ * i_mmap_sem, we are sure that also any hole punching that began before we
+ * were called is finished by now and so if it included part of the file we
+ * are working on, our pte will get unmapped and the check for pte_same() in
+ * wp_pfn_shared() fails. Thus fault gets retried and things work out as
+ * desired.
+ */
+static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
+                                struct vm_fault *vmf)
+{
+        struct inode *inode = file_inode(vma->vm_file);
+        struct super_block *sb = inode->i_sb;
+        int ret = VM_FAULT_NOPAGE;
+        loff_t size;
+
+        sb_start_pagefault(sb);
+        file_update_time(vma->vm_file);
+        down_read(&EXT4_I(inode)->i_mmap_sem);
+        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        if (vmf->pgoff >= size)
+                ret = VM_FAULT_SIGBUS;
+        up_read(&EXT4_I(inode)->i_mmap_sem);
+        sb_end_pagefault(sb);
+
+        return ret;
 }

 static const struct vm_operations_struct ext4_dax_vm_ops = {
         .fault          = ext4_dax_fault,
         .pmd_fault      = ext4_dax_pmd_fault,
         .page_mkwrite   = ext4_dax_mkwrite,
-        .pfn_mkwrite    = dax_pfn_mkwrite,
+        .pfn_mkwrite    = ext4_dax_pfn_mkwrite,
 };
 #else
 #define ext4_dax_vm_ops ext4_file_vm_ops
 #endif

 static const struct vm_operations_struct ext4_file_vm_ops = {
-        .fault          = filemap_fault,
+        .fault          = ext4_filemap_fault,
         .map_pages      = filemap_map_pages,
         .page_mkwrite   = ext4_page_mkwrite,
 };
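The new ext4_dax_pfn_mkwrite() shows why taking i_mmap_sem in the fault path is enough: the handler re-checks the faulting offset against i_size while holding the semaphore shared, and any truncate or punch that started earlier must already have finished (and unmapped the pte) because it held the same semaphore exclusively. A small userspace model of that check, with invented names (file_size, pfn_mkwrite_ok, truncate_to) rather than ext4 functions:

/* Sketch of the "re-validate under the lock the truncator holds for write"
 * idiom. Userspace model, not ext4 code. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

static pthread_rwlock_t i_mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long file_size = 5 * PAGE_SIZE;   /* i_size analogue */

/* Returns true if the write fault may proceed, false for the SIGBUS case. */
static bool pfn_mkwrite_ok(unsigned long pgoff)
{
        bool ok;

        pthread_rwlock_rdlock(&i_mmap_sem);
        /* round i_size up to whole pages, like the patched handler does */
        ok = pgoff < (file_size + PAGE_SIZE - 1) / PAGE_SIZE;
        pthread_rwlock_unlock(&i_mmap_sem);
        return ok;
}

static void truncate_to(unsigned long new_size)
{
        pthread_rwlock_wrlock(&i_mmap_sem);
        file_size = new_size;
        /* unmap_mapping_range() analogue would run here */
        pthread_rwlock_unlock(&i_mmap_sem);
}

int main(void)
{
        printf("fault at page 3 before truncate: %d\n", pfn_mkwrite_ok(3));
        truncate_to(2 * PAGE_SIZE);
        printf("fault at page 3 after truncate:  %d\n", pfn_mkwrite_ok(3));
        return 0;
}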

Diff for: fs/ext4/inode.c

+27 -9
@@ -3623,6 +3623,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)

         }

+        /* Wait all existing dio workers, newcomers will block on i_mutex */
+        ext4_inode_block_unlocked_dio(inode);
+        inode_dio_wait(inode);
+
+        /*
+         * Prevent page faults from reinstantiating pages we have released from
+         * page cache.
+         */
+        down_write(&EXT4_I(inode)->i_mmap_sem);
         first_block_offset = round_up(offset, sb->s_blocksize);
         last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;

@@ -3631,10 +3640,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
                 truncate_pagecache_range(inode, first_block_offset,
                                          last_block_offset);

-        /* Wait all existing dio workers, newcomers will block on i_mutex */
-        ext4_inode_block_unlocked_dio(inode);
-        inode_dio_wait(inode);
-
         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                 credits = ext4_writepage_trans_blocks(inode);
         else
@@ -3680,16 +3685,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
         if (IS_SYNC(inode))
                 ext4_handle_sync(handle);

-        /* Now release the pages again to reduce race window */
-        if (last_block_offset > first_block_offset)
-                truncate_pagecache_range(inode, first_block_offset,
-                                         last_block_offset);
-
         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
         ext4_mark_inode_dirty(handle, inode);
 out_stop:
         ext4_journal_stop(handle);
 out_dio:
+        up_write(&EXT4_I(inode)->i_mmap_sem);
         ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
         mutex_unlock(&inode->i_mutex);
@@ -4823,13 +4824,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                         } else
                                 ext4_wait_for_tail_page_commit(inode);
                 }
+                down_write(&EXT4_I(inode)->i_mmap_sem);
                 /*
                  * Truncate pagecache after we've waited for commit
                  * in data=journal mode to make pages freeable.
                  */
                 truncate_pagecache(inode, inode->i_size);
                 if (shrink)
                         ext4_truncate(inode);
+                up_write(&EXT4_I(inode)->i_mmap_sem);
         }

         if (!rc) {
@@ -5278,6 +5281,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)

         sb_start_pagefault(inode->i_sb);
         file_update_time(vma->vm_file);
+
+        down_read(&EXT4_I(inode)->i_mmap_sem);
         /* Delalloc case is easy... */
         if (test_opt(inode->i_sb, DELALLOC) &&
             !ext4_should_journal_data(inode) &&
@@ -5347,6 +5352,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 out_ret:
         ret = block_page_mkwrite_return(ret);
 out:
+        up_read(&EXT4_I(inode)->i_mmap_sem);
         sb_end_pagefault(inode->i_sb);
         return ret;
 }
+
+int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+        struct inode *inode = file_inode(vma->vm_file);
+        int err;
+
+        down_read(&EXT4_I(inode)->i_mmap_sem);
+        err = filemap_fault(vma, vmf);
+        up_read(&EXT4_I(inode)->i_mmap_sem);
+
+        return err;
+}
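The new ext4_filemap_fault() and the change to ext4_page_mkwrite() are pure wrappers: take i_mmap_sem shared, call the generic path, release. The same shim pattern in a hypothetical userspace form, with generic_fault() standing in for filemap_fault():

/* Minimal sketch of the lock-wrapping shim, not kernel code. */
#include <pthread.h>

static pthread_rwlock_t i_mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static int generic_fault(void)
{
        /* stand-in for filemap_fault(): reads the page cache and may
         * instantiate pages in the mapping */
        return 0;
}

static int filemap_fault_wrapped(void)
{
        int err;

        pthread_rwlock_rdlock(&i_mmap_sem);     /* hold off hole punching */
        err = generic_fault();
        pthread_rwlock_unlock(&i_mmap_sem);
        return err;
}

int main(void)
{
        return filemap_fault_wrapped();
}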

Diff for: fs/ext4/super.c

+1
@@ -958,6 +958,7 @@ static void init_once(void *foo)
         INIT_LIST_HEAD(&ei->i_orphan);
         init_rwsem(&ei->xattr_sem);
         init_rwsem(&ei->i_data_sem);
+        init_rwsem(&ei->i_mmap_sem);
         inode_init_once(&ei->vfs_inode);
 }

Diff for: fs/ext4/truncate.h

+2
@@ -10,8 +10,10 @@
  */
 static inline void ext4_truncate_failed_write(struct inode *inode)
 {
+        down_write(&EXT4_I(inode)->i_mmap_sem);
         truncate_inode_pages(inode->i_mapping, inode->i_size);
         ext4_truncate(inode);
+        up_write(&EXT4_I(inode)->i_mmap_sem);
 }

 /*
