Skip to content

Commit

Permalink
vfs: atomic f_pos accesses as per POSIX
Browse files Browse the repository at this point in the history
Our write() system call has always been atomic in the sense that you get
the expected thread-safe contiguous write, but we haven't actually
guaranteed that concurrent writes are serialized wrt f_pos accesses, so
threads (or processes) that share a file descriptor and use "write()"
concurrently would quite likely overwrite each others data.

This violates POSIX.1-2008/SUSv4 Section XSI 2.9.7 that says:

 "2.9.7 Thread Interactions with Regular File Operations

  All of the following functions shall be atomic with respect to each
  other in the effects specified in POSIX.1-2008 when they operate on
  regular files or symbolic links: [...]"

and one of the effects is the file position update.

This unprotected file position behavior is not new behavior, and nobody
has ever cared.  Until now.  Yongzhi Pan reported unexpected behavior to
Michael Kerrisk that was due to this.

This resolves the issue with a f_pos-specific lock that is taken by
read/write/lseek on file descriptors that may be shared across threads
or processes.

Reported-by: Yongzhi Pan <panyongzhi@gmail.com>
Reported-by: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
  • Loading branch information
torvalds authored and Al Viro committed Mar 10, 2014
1 parent 1b56e98 commit 9c225f2
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 18 deletions.
1 change: 1 addition & 0 deletions fs/file_table.c
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ struct file *get_empty_filp(void)
atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
mutex_init(&f->f_pos_lock);
eventpoll_init_file(f);
/* f->f_version: 0 */
return f;
Expand Down
2 changes: 1 addition & 1 deletion fs/namei.c
Original file line number Diff line number Diff line change
Expand Up @@ -1884,7 +1884,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,

nd->path = f.file->f_path;
if (flags & LOOKUP_RCU) {
if (f.need_put)
if (f.flags & FDPUT_FPUT)
*fp = f.file;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
rcu_read_lock();
Expand Down
4 changes: 4 additions & 0 deletions fs/open.c
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f,
return 0;
}

/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
if (S_ISREG(inode->i_mode))
f->f_mode |= FMODE_ATOMIC_POS;

f->f_op = fops_get(inode->i_fop);
if (unlikely(WARN_ON(!f->f_op))) {
error = -ENODEV;
Expand Down
54 changes: 40 additions & 14 deletions fs/read_write.c
Original file line number Diff line number Diff line change
Expand Up @@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
}
EXPORT_SYMBOL(vfs_llseek);

/*
* We only lock f_pos if we have threads or if the file might be
* shared with another process. In both cases we'll have an elevated
* file count (done either by fdget() or by fork()).
*/
static inline struct fd fdget_pos(int fd)
{
struct fd f = fdget(fd);
struct file *file = f.file;

if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
if (file_count(file) > 1) {
f.flags |= FDPUT_POS_UNLOCK;
mutex_lock(&file->f_pos_lock);
}
}
return f;
}

static inline void fdput_pos(struct fd f)
{
if (f.flags & FDPUT_POS_UNLOCK)
mutex_unlock(&f.file->f_pos_lock);
fdput(f);
}

SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
off_t retval;
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
if (!f.file)
return -EBADF;

Expand All @@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
}
fdput(f);
fdput_pos(f);
return retval;
}

Expand Down Expand Up @@ -498,31 +524,31 @@ static inline void file_pos_write(struct file *file, loff_t pos)

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_read(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput(f);
fdput_pos(f);
}
return ret;
}

SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_write(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput(f);
fdput_pos(f);
}

return ret;
Expand Down Expand Up @@ -797,15 +823,15 @@ EXPORT_SYMBOL(vfs_writev);
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen)
{
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_readv(f.file, vec, vlen, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput(f);
fdput_pos(f);
}

if (ret > 0)
Expand All @@ -817,15 +843,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen)
{
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_writev(f.file, vec, vlen, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput(f);
fdput_pos(f);
}

if (ret > 0)
Expand Down Expand Up @@ -968,7 +994,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
const struct compat_iovec __user *,vec,
compat_ulong_t, vlen)
{
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
ssize_t ret;
loff_t pos;

Expand All @@ -978,7 +1004,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
ret = compat_readv(f.file, vec, vlen, &pos);
if (ret >= 0)
f.file->f_pos = pos;
fdput(f);
fdput_pos(f);
return ret;
}

Expand Down Expand Up @@ -1035,7 +1061,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
const struct compat_iovec __user *, vec,
compat_ulong_t, vlen)
{
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
ssize_t ret;
loff_t pos;

Expand All @@ -1045,7 +1071,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
ret = compat_writev(f.file, vec, vlen, &pos);
if (ret >= 0)
f.file->f_pos = pos;
fdput(f);
fdput_pos(f);
return ret;
}

Expand Down
6 changes: 4 additions & 2 deletions include/linux/file.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,14 @@ static inline void fput_light(struct file *file, int fput_needed)

struct fd {
struct file *file;
int need_put;
unsigned int flags;
};
#define FDPUT_FPUT 1
#define FDPUT_POS_UNLOCK 2

static inline void fdput(struct fd fd)
{
if (fd.need_put)
if (fd.flags & FDPUT_FPUT)
fput(fd.file);
}

Expand Down
6 changes: 5 additions & 1 deletion include/linux/fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH ((__force fmode_t)0x4000)

/* File needs atomic accesses to f_pos */
#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000)

/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x1000000)

Expand Down Expand Up @@ -780,13 +783,14 @@ struct file {
const struct file_operations *f_op;

/*
* Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR.
* Protects f_ep_links, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
atomic_long_t f_count;
unsigned int f_flags;
fmode_t f_mode;
struct mutex f_pos_lock;
loff_t f_pos;
struct fown_struct f_owner;
const struct cred *f_cred;
Expand Down

0 comments on commit 9c225f2

Please sign in to comment.