Skip to content

Commit f1512ee

Browse files
ahrens authored and behlendorf committed
Illumos 5027 - zfs large block support
5027 zfs large block support Reviewed by: Alek Pinchuk <pinchuk.alek@gmail.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com> Reviewed by: Richard Elling <richard.elling@richardelling.com> Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Approved by: Dan McDonald <danmcd@omniti.com> References: https://www.illumos.org/issues/5027 illumos/illumos-gate@b515258 Porting Notes: * Included in this patch is a tiny ISP2() cleanup in zio_init() from Illumos 5255. * Unlike the upstream Illumos commit, this patch does not impose an arbitrary 128K block size limit on volumes. Volumes, like filesystems, are limited by the zfs_max_recordsize=1M module option. * By default the maximum record size is limited to 1M by the module option zfs_max_recordsize. This value may be safely increased up to 16M which is the largest block size supported by the on-disk format. At the moment, 1M blocks clearly offer a significant performance improvement but the benefits of going beyond this for the majority of workloads are less clear. * The illumos version of this patch increased DMU_MAX_ACCESS to 32M. This was determined not to be large enough when using 16M blocks because the zfs_make_xattrdir() function will fail (EFBIG) when assigning a TX. This was immediately observed under Linux because all newly created files must have a security xattr created and that was failing. Therefore, we've set DMU_MAX_ACCESS to 64M. * On 32-bit platforms a hard limit of 1M is set for blocks due to the limited virtual address space. We should be able to relax this once the ABD patches are merged. Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #354
1 parent 3df2934 commit f1512ee

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+613
-155
lines changed

cmd/zdb/zdb.c

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2185,6 +2185,8 @@ dump_label(const char *dev)
21852185
(void) close(fd);
21862186
}
21872187

2188+
static uint64_t num_large_blocks;
2189+
21882190
/*ARGSUSED*/
21892191
static int
21902192
dump_one_dir(const char *dsname, void *arg)
@@ -2197,6 +2199,8 @@ dump_one_dir(const char *dsname, void *arg)
21972199
(void) printf("Could not open %s, error %d\n", dsname, error);
21982200
return (0);
21992201
}
2202+
if (dmu_objset_ds(os)->ds_large_blocks)
2203+
num_large_blocks++;
22002204
dump_dir(os);
22012205
dmu_objset_disown(os, FTAG);
22022206
fuid_table_destroy();
@@ -2207,7 +2211,7 @@ dump_one_dir(const char *dsname, void *arg)
22072211
/*
22082212
* Block statistics.
22092213
*/
2210-
#define PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1)
2214+
#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
22112215
typedef struct zdb_blkstats {
22122216
uint64_t zb_asize;
22132217
uint64_t zb_lsize;
@@ -2273,7 +2277,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
22732277
zb->zb_lsize += BP_GET_LSIZE(bp);
22742278
zb->zb_psize += BP_GET_PSIZE(bp);
22752279
zb->zb_count++;
2276-
zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;
2280+
2281+
/*
2282+
* The histogram is only big enough to record blocks up to
2283+
* SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
2284+
* "other", bucket.
2285+
*/
2286+
int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
2287+
idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
2288+
zb->zb_psize_histogram[idx]++;
22772289

22782290
zb->zb_gangs += BP_COUNT_GANG(bp);
22792291

@@ -2979,6 +2991,7 @@ dump_zpool(spa_t *spa)
29792991
dump_metaslab_groups(spa);
29802992

29812993
if (dump_opt['d'] || dump_opt['i']) {
2994+
uint64_t refcount;
29822995
dump_dir(dp->dp_meta_objset);
29832996
if (dump_opt['d'] >= 3) {
29842997
dump_bpobj(&spa->spa_deferred_bpobj,
@@ -2998,8 +3011,21 @@ dump_zpool(spa_t *spa)
29983011
}
29993012
(void) dmu_objset_find(spa_name(spa), dump_one_dir,
30003013
NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
3014+
3015+
(void) feature_get_refcount(spa,
3016+
&spa_feature_table[SPA_FEATURE_LARGE_BLOCKS], &refcount);
3017+
if (num_large_blocks != refcount) {
3018+
(void) printf("large_blocks feature refcount mismatch: "
3019+
"expected %lld != actual %lld\n",
3020+
(longlong_t)num_large_blocks,
3021+
(longlong_t)refcount);
3022+
rc = 2;
3023+
} else {
3024+
(void) printf("Verified large_blocks feature refcount "
3025+
"is correct (%llu)\n", (longlong_t)refcount);
3026+
}
30013027
}
3002-
if (dump_opt['b'] || dump_opt['c'])
3028+
if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
30033029
rc = dump_block_stats(spa);
30043030

30053031
if (rc == 0)

cmd/zfs/zfs_main.c

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -258,9 +258,9 @@ get_usage(zfs_help_t idx)
258258
case HELP_ROLLBACK:
259259
return (gettext("\trollback [-rRf] <snapshot>\n"));
260260
case HELP_SEND:
261-
return (gettext("\tsend [-DnPpRrve] [-[iI] snapshot] "
261+
return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
262262
"<snapshot>\n"
263-
"\tsend [-e] [-i snapshot|bookmark] "
263+
"\tsend [-Le] [-i snapshot|bookmark] "
264264
"<filesystem|volume|snapshot>\n"));
265265
case HELP_SET:
266266
return (gettext("\tset <property=value> "
@@ -3683,7 +3683,7 @@ zfs_do_send(int argc, char **argv)
36833683
boolean_t extraverbose = B_FALSE;
36843684

36853685
/* check options */
3686-
while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) {
3686+
while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) {
36873687
switch (c) {
36883688
case 'i':
36893689
if (fromname)
@@ -3718,6 +3718,9 @@ zfs_do_send(int argc, char **argv)
37183718
case 'n':
37193719
flags.dryrun = B_TRUE;
37203720
break;
3721+
case 'L':
3722+
flags.largeblock = B_TRUE;
3723+
break;
37213724
case 'e':
37223725
flags.embed_data = B_TRUE;
37233726
break;
@@ -3774,6 +3777,8 @@ zfs_do_send(int argc, char **argv)
37743777
if (zhp == NULL)
37753778
return (1);
37763779

3780+
if (flags.largeblock)
3781+
lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
37773782
if (flags.embed_data)
37783783
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
37793784

cmd/zstreamdump/zstreamdump.c

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ uint64_t total_stream_len = 0;
5656
FILE *send_stream = 0;
5757
boolean_t do_byteswap = B_FALSE;
5858
boolean_t do_cksum = B_TRUE;
59-
#define INITIAL_BUFLEN (1<<20)
6059

6160
static void
6261
usage(void)
@@ -69,6 +68,18 @@ usage(void)
6968
exit(1);
7069
}
7170

71+
static void *
72+
safe_malloc(size_t size)
73+
{
74+
void *rv = malloc(size);
75+
if (rv == NULL) {
76+
(void) fprintf(stderr, "ERROR; failed to allocate %u bytes\n",
77+
(unsigned)size);
78+
abort();
79+
}
80+
return (rv);
81+
}
82+
7283
/*
7384
* ssread - send stream read.
7485
*
@@ -160,7 +171,7 @@ print_block(char *buf, int length)
160171
int
161172
main(int argc, char *argv[])
162173
{
163-
char *buf = malloc(INITIAL_BUFLEN);
174+
char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
164175
uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
165176
uint64_t total_records = 0;
166177
dmu_replay_record_t thedrr;
@@ -308,9 +319,9 @@ main(int argc, char *argv[])
308319
nvlist_t *nv;
309320
int sz = drr->drr_payloadlen;
310321

311-
if (sz > INITIAL_BUFLEN) {
322+
if (sz > SPA_MAXBLOCKSIZE) {
312323
free(buf);
313-
buf = malloc(sz);
324+
buf = safe_malloc(sz);
314325
}
315326
(void) ssread(buf, sz, &zc);
316327
if (ferror(send_stream))

cmd/ztest/ztest.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1040,9 +1040,14 @@ ztest_spa_get_ashift(void) {
10401040
static int
10411041
ztest_random_blocksize(void)
10421042
{
1043-
// Choose a block size >= the ashift.
1044-
uint64_t block_shift =
1045-
ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1);
1043+
/*
1044+
* Choose a block size >= the ashift.
1045+
* If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
1046+
*/
1047+
int maxbs = SPA_OLD_MAXBLOCKSHIFT;
1048+
if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
1049+
maxbs = 20;
1050+
uint64_t block_shift = ztest_random(maxbs - ztest_spa_get_ashift() + 1);
10461051
return (1 << (SPA_MINBLOCKSHIFT + block_shift));
10471052
}
10481053

@@ -4972,7 +4977,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
49724977
char *path0;
49734978
char *pathrand;
49744979
size_t fsize;
4975-
int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
4980+
int bshift = SPA_OLD_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
49764981
int iters = 1000;
49774982
int maxfaults;
49784983
int mirror_save;

include/libzfs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,9 @@ typedef struct sendflags {
617617
/* show progress (ie. -v) */
618618
boolean_t progress;
619619

620+
/* large blocks (>128K) are permitted */
621+
boolean_t largeblock;
622+
620623
/* WRITE_EMBEDDED records of type DATA are permitted */
621624
boolean_t embed_data;
622625
} sendflags_t;

include/libzfs_core.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ int lzc_release(nvlist_t *, nvlist_t **);
5353
int lzc_get_holds(const char *, nvlist_t **);
5454

5555
enum lzc_send_flags {
56-
LZC_SEND_FLAG_EMBED_DATA = 1 << 0
56+
LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
57+
LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1
5758
};
5859

5960
int lzc_send(const char *, const char *, int, enum lzc_send_flags);

include/sys/dmu.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
245245
* The maximum number of bytes that can be accessed as part of one
246246
* operation, including metadata.
247247
*/
248-
#define DMU_MAX_ACCESS (10<<20) /* 10MB */
248+
#define DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */
249249
#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
250250

251251
#define DMU_USERUSED_OBJECT (-1ULL)
@@ -732,6 +732,7 @@ void xuio_stat_wbuf_copied(void);
732732
void xuio_stat_wbuf_nocopy(void);
733733

734734
extern int zfs_prefetch_disable;
735+
extern int zfs_max_recordsize;
735736

736737
/*
737738
* Asynchronously try to read in the data.

include/sys/dmu_objset.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ struct objset {
9999
zfs_cache_type_t os_secondary_cache;
100100
zfs_sync_type_t os_sync;
101101
zfs_redundant_metadata_type_t os_redundant_metadata;
102+
int os_recordsize;
102103

103104
/* no lock needed: */
104105
struct dmu_tx *os_synctx; /* XXX sketchy */

include/sys/dmu_send.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,14 @@ struct dsl_dataset;
3737
struct drr_begin;
3838
struct avl_tree;
3939

40-
int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
40+
int dmu_send(const char *tosnap, const char *fromsnap,
41+
boolean_t embedok, boolean_t large_block_ok,
4142
int outfd, struct vnode *vp, offset_t *off);
4243
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
4344
uint64_t *sizep);
4445
int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
45-
boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
46+
boolean_t embedok, boolean_t large_block_ok,
47+
int outfd, struct vnode *vp, offset_t *off);
4648

4749
typedef struct dmu_recv_cookie {
4850
struct dsl_dataset *drc_ds;

include/sys/dsl_dataset.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,13 @@ struct dsl_pool;
8383
*/
8484
#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
8585

86+
/*
87+
* This field is present (with value=0) if this dataset may contain large
88+
* blocks (>128KB). If it is present, then this dataset
89+
* is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature.
90+
*/
91+
#define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"
92+
8693
/*
8794
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
8895
* name lookups should be performed case-insensitively.
@@ -138,6 +145,8 @@ typedef struct dsl_dataset {
138145
/* only used in syncing context, only valid for non-snapshots: */
139146
struct dsl_dataset *ds_prev;
140147
uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */
148+
boolean_t ds_large_blocks;
149+
boolean_t ds_need_large_blocks;
141150

142151
/* has internal locking: */
143152
dsl_deadlist_t ds_deadlist;
@@ -252,6 +261,8 @@ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
252261
int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
253262
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
254263
boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
264+
int dsl_dataset_activate_large_blocks(const char *dsname);
265+
void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx);
255266

256267
int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
257268

0 commit comments

Comments
 (0)