Skip to content

Commit 2bce804

Browse files
ahrensbehlendorf
authored andcommitted
OpenZFS 7004 - dmu_tx_hold_zap() does dnode_hold() 7x on same object
Using a benchmark which has 32 threads creating 2 million files in the same directory, on a machine with 16 CPU cores, I observed poor performance. I noticed that dmu_tx_hold_zap() was using about 30% of all CPU, and doing dnode_hold() 7 times on the same object (the ZAP object that is being held). dmu_tx_hold_zap() keeps a hold on the dnode_t the entire time it is running, in dmu_tx_hold_t:txh_dnode, so it would be nice to use the dnode_t that we already have in hand, rather than repeatedly calling dnode_hold(). To do this, we need to pass the dnode_t down through all the intermediate calls that dmu_tx_hold_zap() makes, making these routines take the dnode_t* rather than an objset_t* and a uint64_t object number. In particular, the following routines will need to have analogous *_by_dnode() variants created: dmu_buf_hold_noread() dmu_buf_hold() zap_lookup() zap_lookup_norm() zap_count_write() zap_lockdir() zap_count_write() This can improve performance on the benchmark described above by 100%, from 30,000 file creations per second to 60,000. (This improvement is on top of that provided by working around the object allocation issue. Peak performance of ~90,000 creations per second was observed with 8 CPUs; adding CPUs past that decreased performance due to lock contention.) The CPU used by dmu_tx_hold_zap() was reduced by 88%, from 340 CPU-seconds to 40 CPU-seconds. Sponsored by: Intel Corp. Signed-off-by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Ned Bass <bass6@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> OpenZFS-issue: https://www.illumos.org/issues/7004 OpenZFS-commit: openzfs/openzfs#109 Closes #4641 Closes #4972
1 parent 8bea981 commit 2bce804

File tree

8 files changed

+146
-19
lines changed

8 files changed

+146
-19
lines changed

include/sys/dmu.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
23+
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
2424
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
2525
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
2626
* Copyright 2014 HybridCluster. All rights reserved.
@@ -73,6 +73,7 @@ struct sa_handle;
7373
typedef struct objset objset_t;
7474
typedef struct dmu_tx dmu_tx_t;
7575
typedef struct dsl_dir dsl_dir_t;
76+
typedef struct dnode dnode_t;
7677

7778
typedef enum dmu_object_byteswap {
7879
DMU_BSWAP_UINT8,
@@ -420,7 +421,7 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
420421
#define WP_DMU_SYNC 0x2
421422
#define WP_SPILL 0x4
422423

423-
void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
424+
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
424425
struct zio_prop *zp);
425426
/*
426427
* The bonus data is accessed more or less like a regular buffer.
@@ -446,7 +447,7 @@ int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
446447
*/
447448

448449
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
449-
int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
450+
int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
450451
void *tag, dmu_buf_t **dbp);
451452
int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
452453

@@ -466,6 +467,8 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
466467
*/
467468
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
468469
void *tag, dmu_buf_t **, int flags);
470+
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
471+
void *tag, dmu_buf_t **dbp, int flags);
469472

470473
/*
471474
* Add a reference to a dmu buffer that has already been held via
@@ -620,6 +623,8 @@ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
620623
void *dmu_buf_get_user(dmu_buf_t *db);
621624

622625
objset_t *dmu_buf_get_objset(dmu_buf_t *db);
626+
dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
627+
void dmu_buf_dnode_exit(dmu_buf_t *db);
623628

624629
/* Block until any in-progress dmu buf user evictions complete. */
625630
void dmu_buf_user_evict_wait(void);
@@ -799,7 +804,7 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
799804
int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
800805
void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
801806
/* Like dmu_object_info, but faster if you have a held dnode in hand. */
802-
void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
807+
void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
803808
/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
804809
void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
805810
/*

include/sys/dnode.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
23+
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
2424
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
2525
*/
2626

@@ -187,7 +187,7 @@ typedef struct dnode_phys {
187187
#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \
188188
(((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))
189189

190-
typedef struct dnode {
190+
struct dnode {
191191
/*
192192
* Protects the structure of the dnode, including the number of levels
193193
* of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
@@ -286,7 +286,7 @@ typedef struct dnode {
286286

287287
/* holds prefetch structure */
288288
struct zfetch dn_zfetch;
289-
} dnode_t;
289+
};
290290

291291
/*
292292
* Adds a level of indirection between the dbuf and the dnode to avoid

include/sys/zap.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,14 @@ int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name);
236236
int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
237237
int key_numints);
238238

239-
int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
239+
int zap_lookup_by_dnode(dnode_t *dn, const char *name,
240+
uint64_t integer_size, uint64_t num_integers, void *buf);
241+
int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
242+
uint64_t integer_size, uint64_t num_integers, void *buf,
243+
matchtype_t mt, char *realname, int rn_len,
244+
boolean_t *ncp);
245+
246+
int zap_count_write_by_dnode(dnode_t *dn, const char *name,
240247
int add, uint64_t *towrite, uint64_t *tooverwrite);
241248

242249
/*

module/zfs/dbuf.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
2323
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24-
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24+
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
2525
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
2626
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
2727
*/
@@ -2843,6 +2843,21 @@ dmu_buf_get_objset(dmu_buf_t *db)
28432843
return (dbi->db_objset);
28442844
}
28452845

2846+
dnode_t *
2847+
dmu_buf_dnode_enter(dmu_buf_t *db)
2848+
{
2849+
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2850+
DB_DNODE_ENTER(dbi);
2851+
return (DB_DNODE(dbi));
2852+
}
2853+
2854+
void
2855+
dmu_buf_dnode_exit(dmu_buf_t *db)
2856+
{
2857+
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2858+
DB_DNODE_EXIT(dbi);
2859+
}
2860+
28462861
static void
28472862
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
28482863
{

module/zfs/dmu.c

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,26 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
127127
{ zfs_acl_byteswap, "acl" }
128128
};
129129

130+
int
131+
dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
132+
void *tag, dmu_buf_t **dbp)
133+
{
134+
uint64_t blkid;
135+
dmu_buf_impl_t *db;
136+
137+
blkid = dbuf_whichblock(dn, 0, offset);
138+
rw_enter(&dn->dn_struct_rwlock, RW_READER);
139+
db = dbuf_hold(dn, blkid, tag);
140+
rw_exit(&dn->dn_struct_rwlock);
141+
142+
if (db == NULL) {
143+
*dbp = NULL;
144+
return (SET_ERROR(EIO));
145+
}
146+
147+
*dbp = &db->db;
148+
return (0);
149+
}
130150
int
131151
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
132152
void *tag, dmu_buf_t **dbp)
@@ -154,6 +174,29 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
154174
return (err);
155175
}
156176

177+
int
178+
dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
179+
void *tag, dmu_buf_t **dbp, int flags)
180+
{
181+
int err;
182+
int db_flags = DB_RF_CANFAIL;
183+
184+
if (flags & DMU_READ_NO_PREFETCH)
185+
db_flags |= DB_RF_NOPREFETCH;
186+
187+
err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
188+
if (err == 0) {
189+
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
190+
err = dbuf_read(db, NULL, db_flags);
191+
if (err != 0) {
192+
dbuf_rele(db, tag);
193+
*dbp = NULL;
194+
}
195+
}
196+
197+
return (err);
198+
}
199+
157200
int
158201
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
159202
void *tag, dmu_buf_t **dbp, int flags)

module/zfs/dmu_tx.c

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
2323
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24-
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24+
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
2525
*/
2626

2727
#include <sys/dmu.h>
@@ -794,15 +794,14 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
794794
* access the name in this fat-zap so that we'll check
795795
* for i/o errors to the leaf blocks, etc.
796796
*/
797-
err = zap_lookup(dn->dn_objset, dn->dn_object, name,
798-
8, 0, NULL);
797+
err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
799798
if (err == EIO) {
800799
tx->tx_err = err;
801800
return;
802801
}
803802
}
804803

805-
err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
804+
err = zap_count_write_by_dnode(dn, name, add,
806805
&txh->txh_space_towrite, &txh->txh_space_tooverwrite);
807806

808807
/*

module/zfs/zap.c

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -270,15 +270,23 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
270270
uint64_t blk, off;
271271
int err;
272272
dmu_buf_t *db;
273+
dnode_t *dn;
273274
int bs = FZAP_BLOCK_SHIFT(zap);
274275

275276
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
276277

277278
blk = idx >> (bs-3);
278279
off = idx & ((1<<(bs-3))-1);
279280

280-
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
281+
/*
282+
* Note: this is equivalent to dmu_buf_hold(), but we use
283+
* _dnode_enter / _by_dnode because it's faster because we don't
284+
* have to hold the dnode.
285+
*/
286+
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
287+
err = dmu_buf_hold_by_dnode(dn,
281288
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
289+
dmu_buf_dnode_exit(zap->zap_dbuf);
282290
if (err)
283291
return (err);
284292
*valp = ((uint64_t *)db->db_data)[off];
@@ -292,9 +300,11 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
292300
*/
293301
blk = (idx*2) >> (bs-3);
294302

295-
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
303+
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
304+
err = dmu_buf_hold_by_dnode(dn,
296305
(tbl->zt_nextblk + blk) << bs, FTAG, &db,
297306
DMU_READ_NO_PREFETCH);
307+
dmu_buf_dnode_exit(zap->zap_dbuf);
298308
if (err == 0)
299309
dmu_buf_rele(db, FTAG);
300310
}
@@ -502,6 +512,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
502512
zap_leaf_t *l;
503513
int bs = FZAP_BLOCK_SHIFT(zap);
504514
int err;
515+
dnode_t *dn;
505516

506517
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
507518

@@ -515,8 +526,10 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
515526
if (blkid == 0)
516527
return (ENOENT);
517528

518-
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
529+
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
530+
err = dmu_buf_hold_by_dnode(dn,
519531
blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
532+
dmu_buf_dnode_exit(zap->zap_dbuf);
520533
if (err)
521534
return (err);
522535

module/zfs/zap_micro.c

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,24 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
536536
return (0);
537537
}
538538

539+
static int
540+
zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
541+
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
542+
{
543+
dmu_buf_t *db;
544+
int err;
545+
546+
err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
547+
if (err != 0) {
548+
return (err);
549+
}
550+
err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
551+
if (err != 0) {
552+
dmu_buf_rele(db, tag);
553+
}
554+
return (err);
555+
}
556+
539557
int
540558
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
541559
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
@@ -927,6 +945,33 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
927945
return (err);
928946
}
929947

948+
int
949+
zap_lookup_by_dnode(dnode_t *dn, const char *name,
950+
uint64_t integer_size, uint64_t num_integers, void *buf)
951+
{
952+
return (zap_lookup_norm_by_dnode(dn, name, integer_size,
953+
num_integers, buf, MT_EXACT, NULL, 0, NULL));
954+
}
955+
956+
int
957+
zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
958+
uint64_t integer_size, uint64_t num_integers, void *buf,
959+
matchtype_t mt, char *realname, int rn_len,
960+
boolean_t *ncp)
961+
{
962+
zap_t *zap;
963+
int err;
964+
965+
err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
966+
FTAG, &zap);
967+
if (err != 0)
968+
return (err);
969+
err = zap_lookup_impl(zap, name, integer_size,
970+
num_integers, buf, mt, realname, rn_len, ncp);
971+
zap_unlockdir(zap, FTAG);
972+
return (err);
973+
}
974+
930975
int
931976
zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
932977
int key_numints)
@@ -1460,7 +1505,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
14601505
}
14611506

14621507
int
1463-
zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
1508+
zap_count_write_by_dnode(dnode_t *dn, const char *name, int add,
14641509
uint64_t *towrite, uint64_t *tooverwrite)
14651510
{
14661511
zap_t *zap;
@@ -1488,7 +1533,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
14881533
* At present we are just evaluating the possibility of this operation
14891534
* and hence we do not want to trigger an upgrade.
14901535
*/
1491-
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE,
1536+
err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
14921537
FTAG, &zap);
14931538
if (err != 0)
14941539
return (err);
@@ -1552,7 +1597,7 @@ EXPORT_SYMBOL(zap_lookup_uint64);
15521597
EXPORT_SYMBOL(zap_contains);
15531598
EXPORT_SYMBOL(zap_prefetch);
15541599
EXPORT_SYMBOL(zap_prefetch_uint64);
1555-
EXPORT_SYMBOL(zap_count_write);
1600+
EXPORT_SYMBOL(zap_count_write_by_dnode);
15561601
EXPORT_SYMBOL(zap_add);
15571602
EXPORT_SYMBOL(zap_add_uint64);
15581603
EXPORT_SYMBOL(zap_update);

0 commit comments

Comments
 (0)