Skip to content

Commit e16b3fc

Browse files
amotin authored and behlendorf committed
Illumos 5008 - lock contention (rrw_exit) while running a read only load
5008 lock contention (rrw_exit) while running a read only load

Reviewed by: Matthew Ahrens <matthew.ahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Alex Reece <alex.reece@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Richard Yao <ryao@gentoo.org>
Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com>
Approved by: Garrett D'Amore <garrett@damore.org>

Porting notes:

This patch ported perfectly cleanly to ZoL. During testing 100% cached small-block reads, extreme contention was noticed on rrl->rr_lock from rrw_exit() due to the frequent entering and leaving ZPL. Illumos picked up this patch from FreeBSD and it also helps under Linux.

On a 1-minute 4K cached read test with 10 fio processes pinned to a single socket on a 4-socket (10 threads per socket) NUMA system, contentions on rrl->rr_lock were reduced from 508799 to 43085.

Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3555
1 parent 4bda3bd commit e16b3fc

File tree

6 files changed

+126
-13
lines changed

6 files changed

+126
-13
lines changed

include/sys/rrwlock.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,31 @@ void rrw_tsd_destroy(void *arg);
8383
#define RRW_LOCK_HELD(x) \
8484
(rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER))
8585

86+
/*
87+
* A reader-mostly lock implementation, tuning above reader-writer locks
88+
* for highly parallel read acquisitions, pessimizing write acquisitions.
89+
*
90+
* RRM_NUM_LOCKS should be a prime number. See comment in rrwlock.c near
91+
* RRM_TD_LOCK() for details.
92+
*/
93+
#define RRM_NUM_LOCKS 17
94+
typedef struct rrmlock {
95+
rrwlock_t locks[RRM_NUM_LOCKS];
96+
} rrmlock_t;
97+
98+
void rrm_init(rrmlock_t *rrl, boolean_t track_all);
99+
void rrm_destroy(rrmlock_t *rrl);
100+
void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag);
101+
void rrm_enter_read(rrmlock_t *rrl, void *tag);
102+
void rrm_enter_write(rrmlock_t *rrl);
103+
void rrm_exit(rrmlock_t *rrl, void *tag);
104+
boolean_t rrm_held(rrmlock_t *rrl, krw_t rw);
105+
106+
#define RRM_READ_HELD(x) rrm_held(x, RW_READER)
107+
#define RRM_WRITE_HELD(x) rrm_held(x, RW_WRITER)
108+
#define RRM_LOCK_HELD(x) \
109+
(rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER))
110+
86111
#ifdef __cplusplus
87112
}
88113
#endif

include/sys/zfs_vfsops.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ typedef struct zfs_sb {
6767
boolean_t z_atime; /* enable atimes mount option */
6868
boolean_t z_relatime; /* enable relatime mount option */
6969
boolean_t z_unmounted; /* unmounted */
70-
rrwlock_t z_teardown_lock;
70+
rrmlock_t z_teardown_lock;
7171
krwlock_t z_teardown_inactive_lock;
7272
list_t z_all_znodes; /* all znodes in the fs */
7373
uint64_t z_nr_znodes; /* number of znodes in the fs */

include/sys/zfs_znode.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ typedef struct znode {
250250
/* Called on entry to each ZFS vnode and vfs operation */
251251
#define ZFS_ENTER(zsb) \
252252
{ \
253-
rrw_enter_read(&(zsb)->z_teardown_lock, FTAG); \
253+
rrm_enter_read(&(zsb)->z_teardown_lock, FTAG); \
254254
if ((zsb)->z_unmounted) { \
255255
ZFS_EXIT(zsb); \
256256
return (EIO); \
@@ -260,7 +260,7 @@ typedef struct znode {
260260
/* Must be called before exiting the vop */
261261
#define ZFS_EXIT(zsb) \
262262
{ \
263-
rrw_exit(&(zsb)->z_teardown_lock, FTAG); \
263+
rrm_exit(&(zsb)->z_teardown_lock, FTAG); \
264264
}
265265

266266
/* Verifies the znode is valid */

module/zfs/rrwlock.c

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,3 +305,91 @@ rrw_tsd_destroy(void *arg)
305305
(void *)curthread, (void *)rn->rn_rrl);
306306
}
307307
}
308+
309+
/*
310+
* A reader-mostly lock implementation, tuning above reader-writer locks
311+
* for highly parallel read acquisitions, while pessimizing writes.
312+
*
313+
* The idea is to split a single busy lock into an array of locks, so that
314+
* each reader can lock only one of them for read, depending on the result
315+
* of a simple hash function. That proportionally reduces lock congestion.
316+
* A writer, in turn, has to sequentially acquire write on all the locks.
317+
* That makes write acquisition proportionally slower, but in places where
318+
* it is used (filesystem unmount) performance is not critical.
319+
*
320+
* All the functions below are direct wrappers around functions above.
321+
*/
322+
void
323+
rrm_init(rrmlock_t *rrl, boolean_t track_all)
324+
{
325+
int i;
326+
327+
for (i = 0; i < RRM_NUM_LOCKS; i++)
328+
rrw_init(&rrl->locks[i], track_all);
329+
}
330+
331+
void
332+
rrm_destroy(rrmlock_t *rrl)
333+
{
334+
int i;
335+
336+
for (i = 0; i < RRM_NUM_LOCKS; i++)
337+
rrw_destroy(&rrl->locks[i]);
338+
}
339+
340+
void
341+
rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
342+
{
343+
if (rw == RW_READER)
344+
rrm_enter_read(rrl, tag);
345+
else
346+
rrm_enter_write(rrl);
347+
}
348+
349+
/*
350+
* This maps the current thread to a specific lock. Note that the lock
351+
* must be released by the same thread that acquired it. We do this
352+
* mapping by taking the thread pointer mod a prime number. We examine
353+
* only the low 32 bits of the thread pointer, because 32-bit division
354+
* is faster than 64-bit division, and the high 32 bits have little
355+
* entropy anyway.
356+
*/
357+
#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
358+
359+
void
360+
rrm_enter_read(rrmlock_t *rrl, void *tag)
361+
{
362+
rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
363+
}
364+
365+
void
366+
rrm_enter_write(rrmlock_t *rrl)
367+
{
368+
int i;
369+
370+
for (i = 0; i < RRM_NUM_LOCKS; i++)
371+
rrw_enter_write(&rrl->locks[i]);
372+
}
373+
374+
void
375+
rrm_exit(rrmlock_t *rrl, void *tag)
376+
{
377+
int i;
378+
379+
if (rrl->locks[0].rr_writer == curthread) {
380+
for (i = 0; i < RRM_NUM_LOCKS; i++)
381+
rrw_exit(&rrl->locks[i], tag);
382+
} else {
383+
rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
384+
}
385+
}
386+
387+
boolean_t
388+
rrm_held(rrmlock_t *rrl, krw_t rw)
389+
{
390+
if (rw == RW_WRITER) {
391+
return (rrw_held(&rrl->locks[0], rw));
392+
} else {
393+
return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
394+
}
395+
}

module/zfs/zfs_ioctl.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1451,15 +1451,15 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
14511451
if (get_zfs_sb(name, zsbp) != 0)
14521452
error = zfs_sb_create(name, zsbp);
14531453
if (error == 0) {
1454-
rrw_enter(&(*zsbp)->z_teardown_lock, (writer) ? RW_WRITER :
1454+
rrm_enter(&(*zsbp)->z_teardown_lock, (writer) ? RW_WRITER :
14551455
RW_READER, tag);
14561456
if ((*zsbp)->z_unmounted) {
14571457
/*
14581458
* XXX we could probably try again, since the unmounting
14591459
* thread should be just about to disassociate the
14601460
* objset from the zsb.
14611461
*/
1462-
rrw_exit(&(*zsbp)->z_teardown_lock, tag);
1462+
rrm_exit(&(*zsbp)->z_teardown_lock, tag);
14631463
return (SET_ERROR(EBUSY));
14641464
}
14651465
}
@@ -1469,7 +1469,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
14691469
static void
14701470
zfs_sb_rele(zfs_sb_t *zsb, void *tag)
14711471
{
1472-
rrw_exit(&zsb->z_teardown_lock, tag);
1472+
rrm_exit(&zsb->z_teardown_lock, tag);
14731473

14741474
if (zsb->z_sb) {
14751475
deactivate_super(zsb->z_sb);

module/zfs/zfs_vfsops.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -771,7 +771,7 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
771771
mutex_init(&zsb->z_lock, NULL, MUTEX_DEFAULT, NULL);
772772
list_create(&zsb->z_all_znodes, sizeof (znode_t),
773773
offsetof(znode_t, z_link_node));
774-
rrw_init(&zsb->z_teardown_lock, B_FALSE);
774+
rrm_init(&zsb->z_teardown_lock, B_FALSE);
775775
rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
776776
rw_init(&zsb->z_fuid_lock, NULL, RW_DEFAULT, NULL);
777777

@@ -890,7 +890,7 @@ zfs_sb_free(zfs_sb_t *zsb)
890890
mutex_destroy(&zsb->z_znodes_lock);
891891
mutex_destroy(&zsb->z_lock);
892892
list_destroy(&zsb->z_all_znodes);
893-
rrw_destroy(&zsb->z_teardown_lock);
893+
rrm_destroy(&zsb->z_teardown_lock);
894894
rw_destroy(&zsb->z_teardown_inactive_lock);
895895
rw_destroy(&zsb->z_fuid_lock);
896896
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1221,7 +1221,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
12211221
}
12221222
}
12231223

1224-
rrw_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG);
1224+
rrm_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG);
12251225

12261226
if (!unmounting) {
12271227
/*
@@ -1252,7 +1252,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
12521252
*/
12531253
if (!unmounting && (zsb->z_unmounted || zsb->z_os == NULL)) {
12541254
rw_exit(&zsb->z_teardown_inactive_lock);
1255-
rrw_exit(&zsb->z_teardown_lock, FTAG);
1255+
rrm_exit(&zsb->z_teardown_lock, FTAG);
12561256
return (SET_ERROR(EIO));
12571257
}
12581258

@@ -1280,7 +1280,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
12801280
*/
12811281
if (unmounting) {
12821282
zsb->z_unmounted = B_TRUE;
1283-
rrw_exit(&zsb->z_teardown_lock, FTAG);
1283+
rrm_exit(&zsb->z_teardown_lock, FTAG);
12841284
rw_exit(&zsb->z_teardown_inactive_lock);
12851285
}
12861286

@@ -1599,7 +1599,7 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
15991599
znode_t *zp;
16001600
uint64_t sa_obj = 0;
16011601

1602-
ASSERT(RRW_WRITE_HELD(&zsb->z_teardown_lock));
1602+
ASSERT(RRM_WRITE_HELD(&zsb->z_teardown_lock));
16031603
ASSERT(RW_WRITE_HELD(&zsb->z_teardown_inactive_lock));
16041604

16051605
/*
@@ -1663,7 +1663,7 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
16631663
bail:
16641664
/* release the VFS ops */
16651665
rw_exit(&zsb->z_teardown_inactive_lock);
1666-
rrw_exit(&zsb->z_teardown_lock, FTAG);
1666+
rrm_exit(&zsb->z_teardown_lock, FTAG);
16671667

16681668
if (err) {
16691669
/*

0 commit comments

Comments
 (0)