Skip to content

Commit 755065f

Browse files
amotin authored and behlendorf committed
OpenZFS 6322 - ZFS indirect block predictive prefetch
For quite some time I was thinking about the possibility of prefetching ZFS indirection tables while doing sequential reads or writes. Recent changes in the predictive prefetcher made that much easier to do. My tests on a zvol with 16KB block size on a 5x striped and 2x mirrored pool of 10 disks show almost double throughput on sequential read, and almost triple on sequential rewrite. While for reads a similar effect can be achieved by increasing the maximal prefetch distance (though at a higher memory cost), for rewrite there is no other solution so far. Authored by: Alexander Motin <mav@freebsd.org> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Paul Dagnelie <pcd@delphix.com> Approved by: Robert Mustacchi <rm@joyent.com> Ported-by: kernelOfTruth kerneloftruth@gmail.com Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> OpenZFS-issue: https://www.illumos.org/issues/6322 OpenZFS-commit: illumos/illumos-gate@cb92f413 Closes #5040 Porting notes: - Change from upstream in module/zfs/dbuf.c in 'int dbuf_read' due to commit 5f6d0b6 'Handle block pointers with a corrupt logical size' - Difference from upstream in module/zfs/dmu_zfetch.c: uint32_t zfetch_max_idistance -> unsigned int zfetch_max_idistance - Variables have been initialized at the beginning of the function (void dmu_zfetch) to resemble the order of occurrence and account for C99/C11 mode errors.
1 parent 98ace73 commit 755065f

File tree

5 files changed

+91
-26
lines changed

5 files changed

+91
-26
lines changed

include/sys/dmu_zfetch.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,13 @@ struct dnode; /* so we can reference dnode */
4343
typedef struct zstream {
4444
uint64_t zs_blkid; /* expect next access at this blkid */
4545
uint64_t zs_pf_blkid; /* next block to prefetch */
46+
47+
/*
48+
* We will next prefetch the L1 indirect block of this level-0
49+
* block id.
50+
*/
51+
uint64_t zs_ipf_blkid;
52+
4653
kmutex_t zs_lock; /* protects stream */
4754
hrtime_t zs_atime; /* time last prefetch issued */
4855
list_node_t zs_node; /* link for zf_stream */
@@ -59,7 +66,7 @@ void zfetch_fini(void);
5966

6067
void dmu_zfetch_init(zfetch_t *, struct dnode *);
6168
void dmu_zfetch_fini(zfetch_t *);
62-
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t);
69+
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t);
6370

6471

6572
#ifdef __cplusplus

include/sys/dnode.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,15 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
349349
void dnode_evict_dbufs(dnode_t *dn);
350350
void dnode_evict_bonus(dnode_t *dn);
351351

352+
#define DNODE_IS_CACHEABLE(_dn) \
353+
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
354+
(DMU_OT_IS_METADATA((_dn)->dn_type) && \
355+
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
356+
357+
#define DNODE_META_IS_CACHEABLE(_dn) \
358+
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
359+
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
360+
352361
#ifdef ZFS_DEBUG
353362

354363
/*

module/zfs/dbuf.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -844,7 +844,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
844844
if (db->db_state == DB_CACHED) {
845845
mutex_exit(&db->db_mtx);
846846
if (prefetch)
847-
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
847+
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
848848
if ((flags & DB_RF_HAVESTRUCT) == 0)
849849
rw_exit(&dn->dn_struct_rwlock);
850850
DB_DNODE_EXIT(db);
@@ -859,7 +859,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
859859
/* dbuf_read_impl has dropped db_mtx for us */
860860

861861
if (!err && prefetch)
862-
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
862+
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
863863

864864
if ((flags & DB_RF_HAVESTRUCT) == 0)
865865
rw_exit(&dn->dn_struct_rwlock);
@@ -878,7 +878,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
878878
*/
879879
mutex_exit(&db->db_mtx);
880880
if (prefetch)
881-
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
881+
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
882882
if ((flags & DB_RF_HAVESTRUCT) == 0)
883883
rw_exit(&dn->dn_struct_rwlock);
884884
DB_DNODE_EXIT(db);

module/zfs/dmu.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -485,9 +485,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
485485
dbp[i] = &db->db;
486486
}
487487

488-
if ((flags & DMU_READ_NO_PREFETCH) == 0 && read &&
489-
length <= zfetch_array_rd_sz) {
490-
dmu_zfetch(&dn->dn_zfetch, blkid, nblks);
488+
if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
489+
DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
490+
dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
491+
read && DNODE_IS_CACHEABLE(dn));
491492
}
492493
rw_exit(&dn->dn_struct_rwlock);
493494

module/zfs/dmu_zfetch.c

Lines changed: 67 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ unsigned int zfetch_max_streams = 8;
5050
unsigned int zfetch_min_sec_reap = 2;
5151
/* max bytes to prefetch per stream (default 8MB) */
5252
unsigned int zfetch_max_distance = 8 * 1024 * 1024;
53+
/* max bytes to prefetch indirects for per stream (default 64MB) */
54+
unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
5355
/* max number of bytes in an array_read in which we allow prefetching (1MB) */
5456
unsigned long zfetch_array_rd_sz = 1024 * 1024;
5557

@@ -189,23 +191,30 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
189191
zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
190192
zs->zs_blkid = blkid;
191193
zs->zs_pf_blkid = blkid;
194+
zs->zs_ipf_blkid = blkid;
192195
zs->zs_atime = gethrtime();
193196
mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
194197

195198
list_insert_head(&zf->zf_stream, zs);
196199
}
197200

198201
/*
199-
* This is the prefetch entry point. It calls all of the other dmu_zfetch
200-
* routines to create, delete, find, or operate upon prefetch streams.
202+
* This is the predictive prefetch entry point. It associates dnode access
203+
* specified with blkid and nblks arguments with prefetch stream, predicts
204+
* further accesses based on that stats and initiates speculative prefetch.
205+
* fetch_data argument specifies whether actual data blocks should be fetched:
206+
* FALSE -- prefetch only indirect blocks for predicted data blocks;
207+
* TRUE -- prefetch predicted data blocks plus following indirect blocks.
201208
*/
202209
void
203-
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
210+
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
204211
{
205212
zstream_t *zs;
206-
int64_t pf_start;
207-
int pf_nblks;
208-
int i;
213+
int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
214+
int64_t pf_ahead_blks, max_blks, iblk;
215+
int epbs, max_dist_blks, pf_nblks, ipf_nblks, i;
216+
uint64_t end_of_access_blkid;
217+
end_of_access_blkid = blkid + nblks;
209218

210219
if (zfs_prefetch_disable)
211220
return;
@@ -242,7 +251,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
242251
*/
243252
ZFETCHSTAT_BUMP(zfetchstat_misses);
244253
if (rw_tryupgrade(&zf->zf_rwlock))
245-
dmu_zfetch_stream_create(zf, blkid + nblks);
254+
dmu_zfetch_stream_create(zf, end_of_access_blkid);
246255
rw_exit(&zf->zf_rwlock);
247256
return;
248257
}
@@ -254,36 +263,75 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
254263
* Normally, we start prefetching where we stopped
255264
* prefetching last (zs_pf_blkid). But when we get our first
256265
* hit on this stream, zs_pf_blkid == zs_blkid, we don't
257-
* want to prefetch to block we just accessed. In this case,
266+
* want to prefetch the block we just accessed. In this case,
258267
* start just after the block we just accessed.
259268
*/
260-
pf_start = MAX(zs->zs_pf_blkid, blkid + nblks);
269+
pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
261270

262271
/*
263272
* Double our amount of prefetched data, but don't let the
264273
* prefetch get further ahead than zfetch_max_distance.
265274
*/
266-
pf_nblks =
267-
MIN((int64_t)zs->zs_pf_blkid - zs->zs_blkid + nblks,
268-
zs->zs_blkid + nblks +
269-
(zfetch_max_distance >> zf->zf_dnode->dn_datablkshift) - pf_start);
275+
if (fetch_data) {
276+
max_dist_blks =
277+
zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
278+
/*
279+
* Previously, we were (zs_pf_blkid - blkid) ahead. We
280+
* want to now be double that, so read that amount again,
281+
* plus the amount we are catching up by (i.e. the amount
282+
* read just now).
283+
*/
284+
pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
285+
max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
286+
pf_nblks = MIN(pf_ahead_blks, max_blks);
287+
} else {
288+
pf_nblks = 0;
289+
}
270290

271291
zs->zs_pf_blkid = pf_start + pf_nblks;
272-
zs->zs_atime = gethrtime();
273-
zs->zs_blkid = blkid + nblks;
274292

275293
/*
276-
* dbuf_prefetch() issues the prefetch i/o
277-
* asynchronously, but it may need to wait for an
278-
* indirect block to be read from disk. Therefore
279-
* we do not want to hold any locks while we call it.
294+
* Do the same for indirects, starting from where we stopped last,
295+
* or where we will stop reading data blocks (and the indirects
296+
* that point to them).
280297
*/
298+
ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
299+
max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
300+
/*
301+
* We want to double our distance ahead of the data prefetch
302+
* (or reader, if we are not prefetching data). Previously, we
303+
* were (zs_ipf_blkid - blkid) ahead. To double that, we read
304+
* that amount again, plus the amount we are catching up by
305+
* (i.e. the amount read now + the amount of data prefetched now).
306+
*/
307+
pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
308+
max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
309+
ipf_nblks = MIN(pf_ahead_blks, max_blks);
310+
zs->zs_ipf_blkid = ipf_start + ipf_nblks;
311+
312+
epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
313+
ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
314+
ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
315+
316+
zs->zs_atime = gethrtime();
317+
zs->zs_blkid = end_of_access_blkid;
281318
mutex_exit(&zs->zs_lock);
282319
rw_exit(&zf->zf_rwlock);
320+
321+
/*
322+
* dbuf_prefetch() is asynchronous (even when it needs to read
323+
* indirect blocks), but we still prefer to drop our locks before
324+
* calling it to reduce the time we hold them.
325+
*/
326+
283327
for (i = 0; i < pf_nblks; i++) {
284328
dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
285329
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
286330
}
331+
for (iblk = ipf_istart; iblk < ipf_iend; iblk++) {
332+
dbuf_prefetch(zf->zf_dnode, 1, iblk,
333+
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
334+
}
287335
ZFETCHSTAT_BUMP(zfetchstat_hits);
288336
}
289337

0 commit comments

Comments
 (0)