Skip to content

Commit

Permalink
Illumos 5027 - zfs large block support
Browse files Browse the repository at this point in the history
5027 zfs large block support
Reviewed by: Alek Pinchuk <pinchuk.alek@gmail.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Richard Elling <richard.elling@richardelling.com>
Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/5027
  illumos/illumos-gate@b515258

Porting Notes:

* Included in this patch is a tiny ISP2() cleanup in zio_init() from
Illumos 5255.

* Unlike the upstream Illumos commit this patch does not impose an
arbitrary 128K block size limit on volumes.  Volumes, like filesystems,
are limited by the zfs_max_recordsize=1M module option.

* By default the maximum record size is limited to 1M by the module
option zfs_max_recordsize.  This value may be safely increased up to
16M which is the largest block size supported by the on-disk format.
At the moment, 1M blocks clearly offer a significant performance
improvement but the benefits of going beyond this for the majority
of workloads are less clear.

* The illumos version of this patch increased DMU_MAX_ACCESS to 32M.
This was determined not to be large enough when using 16M blocks
because the zfs_make_xattrdir() function will fail (EFBIG) when
assigning a TX.  This was immediately observed under Linux because
all newly created files must have a security xattr created and
that was failing.  Therefore, we've set DMU_MAX_ACCESS to 64M.

* On 32-bit platforms a hard limit of 1M is set for blocks due
to the limited virtual address space.  We should be able to relax
this once the ABD patches are merged.

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #354
  • Loading branch information
ahrens authored and behlendorf committed May 11, 2015
1 parent 3df2934 commit f1512ee
Show file tree
Hide file tree
Showing 55 changed files with 613 additions and 155 deletions.
32 changes: 29 additions & 3 deletions cmd/zdb/zdb.c
Expand Up @@ -2185,6 +2185,8 @@ dump_label(const char *dev)
(void) close(fd);
}

static uint64_t num_large_blocks;

/*ARGSUSED*/
static int
dump_one_dir(const char *dsname, void *arg)
Expand All @@ -2197,6 +2199,8 @@ dump_one_dir(const char *dsname, void *arg)
(void) printf("Could not open %s, error %d\n", dsname, error);
return (0);
}
if (dmu_objset_ds(os)->ds_large_blocks)
num_large_blocks++;
dump_dir(os);
dmu_objset_disown(os, FTAG);
fuid_table_destroy();
Expand All @@ -2207,7 +2211,7 @@ dump_one_dir(const char *dsname, void *arg)
/*
* Block statistics.
*/
#define PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1)
#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
typedef struct zdb_blkstats {
uint64_t zb_asize;
uint64_t zb_lsize;
Expand Down Expand Up @@ -2273,7 +2277,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
zb->zb_lsize += BP_GET_LSIZE(bp);
zb->zb_psize += BP_GET_PSIZE(bp);
zb->zb_count++;
zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;

/*
* The histogram is only big enough to record blocks up to
* SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
* "other", bucket.
*/
int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
zb->zb_psize_histogram[idx]++;

zb->zb_gangs += BP_COUNT_GANG(bp);

Expand Down Expand Up @@ -2979,6 +2991,7 @@ dump_zpool(spa_t *spa)
dump_metaslab_groups(spa);

if (dump_opt['d'] || dump_opt['i']) {
uint64_t refcount;
dump_dir(dp->dp_meta_objset);
if (dump_opt['d'] >= 3) {
dump_bpobj(&spa->spa_deferred_bpobj,
Expand All @@ -2998,8 +3011,21 @@ dump_zpool(spa_t *spa)
}
(void) dmu_objset_find(spa_name(spa), dump_one_dir,
NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);

(void) feature_get_refcount(spa,
&spa_feature_table[SPA_FEATURE_LARGE_BLOCKS], &refcount);
if (num_large_blocks != refcount) {
(void) printf("large_blocks feature refcount mismatch: "
"expected %lld != actual %lld\n",
(longlong_t)num_large_blocks,
(longlong_t)refcount);
rc = 2;
} else {
(void) printf("Verified large_blocks feature refcount "
"is correct (%llu)\n", (longlong_t)refcount);
}
}
if (dump_opt['b'] || dump_opt['c'])
if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
rc = dump_block_stats(spa);

if (rc == 0)
Expand Down
11 changes: 8 additions & 3 deletions cmd/zfs/zfs_main.c
Expand Up @@ -258,9 +258,9 @@ get_usage(zfs_help_t idx)
case HELP_ROLLBACK:
return (gettext("\trollback [-rRf] <snapshot>\n"));
case HELP_SEND:
return (gettext("\tsend [-DnPpRrve] [-[iI] snapshot] "
return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
"<snapshot>\n"
"\tsend [-e] [-i snapshot|bookmark] "
"\tsend [-Le] [-i snapshot|bookmark] "
"<filesystem|volume|snapshot>\n"));
case HELP_SET:
return (gettext("\tset <property=value> "
Expand Down Expand Up @@ -3683,7 +3683,7 @@ zfs_do_send(int argc, char **argv)
boolean_t extraverbose = B_FALSE;

/* check options */
while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) {
while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) {
switch (c) {
case 'i':
if (fromname)
Expand Down Expand Up @@ -3718,6 +3718,9 @@ zfs_do_send(int argc, char **argv)
case 'n':
flags.dryrun = B_TRUE;
break;
case 'L':
flags.largeblock = B_TRUE;
break;
case 'e':
flags.embed_data = B_TRUE;
break;
Expand Down Expand Up @@ -3774,6 +3777,8 @@ zfs_do_send(int argc, char **argv)
if (zhp == NULL)
return (1);

if (flags.largeblock)
lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
if (flags.embed_data)
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;

Expand Down
19 changes: 15 additions & 4 deletions cmd/zstreamdump/zstreamdump.c
Expand Up @@ -56,7 +56,6 @@ uint64_t total_stream_len = 0;
FILE *send_stream = 0;
boolean_t do_byteswap = B_FALSE;
boolean_t do_cksum = B_TRUE;
#define INITIAL_BUFLEN (1<<20)

static void
usage(void)
Expand All @@ -69,6 +68,18 @@ usage(void)
exit(1);
}

/*
 * safe_malloc - allocate memory or terminate the program.
 *
 * Wraps malloc() so callers never have to handle a NULL result.  On
 * allocation failure a diagnostic naming the requested size is written
 * to stderr and the process is terminated with abort().
 *
 * size: number of bytes to allocate.
 *
 * Returns the pointer obtained from malloc(); the caller releases it
 * with free().
 */
static void *
safe_malloc(size_t size)
{
	void *rv = malloc(size);

	if (rv == NULL) {
		/*
		 * Print the size with %zu so that the full size_t value is
		 * reported; narrowing it to unsigned would misreport
		 * requests of 4 GiB or more on LP64 platforms.
		 */
		(void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n",
		    size);
		abort();
	}
	return (rv);
}

/*
* ssread - send stream read.
*
Expand Down Expand Up @@ -160,7 +171,7 @@ print_block(char *buf, int length)
int
main(int argc, char *argv[])
{
char *buf = malloc(INITIAL_BUFLEN);
char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
uint64_t total_records = 0;
dmu_replay_record_t thedrr;
Expand Down Expand Up @@ -308,9 +319,9 @@ main(int argc, char *argv[])
nvlist_t *nv;
int sz = drr->drr_payloadlen;

if (sz > INITIAL_BUFLEN) {
if (sz > SPA_MAXBLOCKSIZE) {
free(buf);
buf = malloc(sz);
buf = safe_malloc(sz);
}
(void) ssread(buf, sz, &zc);
if (ferror(send_stream))
Expand Down
13 changes: 9 additions & 4 deletions cmd/ztest/ztest.c
Expand Up @@ -1040,9 +1040,14 @@ ztest_spa_get_ashift(void) {
static int
ztest_random_blocksize(void)
{
// Choose a block size >= the ashift.
uint64_t block_shift =
ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1);
/*
* Choose a block size >= the ashift.
* If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
*/
int maxbs = SPA_OLD_MAXBLOCKSHIFT;
if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
maxbs = 20;
uint64_t block_shift = ztest_random(maxbs - ztest_spa_get_ashift() + 1);
return (1 << (SPA_MINBLOCKSHIFT + block_shift));
}

Expand Down Expand Up @@ -4972,7 +4977,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
char *path0;
char *pathrand;
size_t fsize;
int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
int bshift = SPA_OLD_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
int iters = 1000;
int maxfaults;
int mirror_save;
Expand Down
3 changes: 3 additions & 0 deletions include/libzfs.h
Expand Up @@ -617,6 +617,9 @@ typedef struct sendflags {
/* show progress (ie. -v) */
boolean_t progress;

/* large blocks (>128K) are permitted */
boolean_t largeblock;

/* WRITE_EMBEDDED records of type DATA are permitted */
boolean_t embed_data;
} sendflags_t;
Expand Down
3 changes: 2 additions & 1 deletion include/libzfs_core.h
Expand Up @@ -53,7 +53,8 @@ int lzc_release(nvlist_t *, nvlist_t **);
int lzc_get_holds(const char *, nvlist_t **);

enum lzc_send_flags {
LZC_SEND_FLAG_EMBED_DATA = 1 << 0
LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1
};

int lzc_send(const char *, const char *, int, enum lzc_send_flags);
Expand Down
3 changes: 2 additions & 1 deletion include/sys/dmu.h
Expand Up @@ -245,7 +245,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
* The maximum number of bytes that can be accessed as part of one
* operation, including metadata.
*/
#define DMU_MAX_ACCESS (10<<20) /* 10MB */
#define DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */
#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */

#define DMU_USERUSED_OBJECT (-1ULL)
Expand Down Expand Up @@ -732,6 +732,7 @@ void xuio_stat_wbuf_copied(void);
void xuio_stat_wbuf_nocopy(void);

extern int zfs_prefetch_disable;
extern int zfs_max_recordsize;

/*
* Asynchronously try to read in the data.
Expand Down
1 change: 1 addition & 0 deletions include/sys/dmu_objset.h
Expand Up @@ -99,6 +99,7 @@ struct objset {
zfs_cache_type_t os_secondary_cache;
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;
int os_recordsize;

/* no lock needed: */
struct dmu_tx *os_synctx; /* XXX sketchy */
Expand Down
6 changes: 4 additions & 2 deletions include/sys/dmu_send.h
Expand Up @@ -37,12 +37,14 @@ struct dsl_dataset;
struct drr_begin;
struct avl_tree;

int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
int dmu_send(const char *tosnap, const char *fromsnap,
boolean_t embedok, boolean_t large_block_ok,
int outfd, struct vnode *vp, offset_t *off);
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
uint64_t *sizep);
int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
boolean_t embedok, boolean_t large_block_ok,
int outfd, struct vnode *vp, offset_t *off);

typedef struct dmu_recv_cookie {
struct dsl_dataset *drc_ds;
Expand Down
11 changes: 11 additions & 0 deletions include/sys/dsl_dataset.h
Expand Up @@ -83,6 +83,13 @@ struct dsl_pool;
*/
#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"

/*
* This field is present (with value=0) if this dataset may contain large
* blocks (>128KB). If it is present, then this dataset
* is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature.
*/
#define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"

/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
Expand Down Expand Up @@ -138,6 +145,8 @@ typedef struct dsl_dataset {
/* only used in syncing context, only valid for non-snapshots: */
struct dsl_dataset *ds_prev;
uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */
boolean_t ds_large_blocks;
boolean_t ds_need_large_blocks;

/* has internal locking: */
dsl_deadlist_t ds_deadlist;
Expand Down Expand Up @@ -252,6 +261,8 @@ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
int dsl_dataset_activate_large_blocks(const char *dsname);
void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx);

int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);

Expand Down
1 change: 1 addition & 0 deletions include/sys/fs/zfs.h
Expand Up @@ -200,6 +200,7 @@ typedef enum {
ZPOOL_PROP_FREEING,
ZPOOL_PROP_FRAGMENTATION,
ZPOOL_PROP_LEAKED,
ZPOOL_PROP_MAXBLOCKSIZE,
ZPOOL_PROP_TNAME,
ZPOOL_NUM_PROPS
} zpool_prop_t;
Expand Down
22 changes: 16 additions & 6 deletions include/sys/spa.h
Expand Up @@ -98,17 +98,26 @@ _NOTE(CONSTCOND) } while (0)
_NOTE(CONSTCOND) } while (0)

/*
* We currently support nine block sizes, from 512 bytes to 128K.
* We could go higher, but the benefits are near-zero and the cost
* of COWing a giant block to modify one byte would become excessive.
* We currently support block sizes from 512 bytes to 16MB.
* The benefits of larger blocks, and thus larger IO, need to be weighed
* against the cost of COWing a giant block to modify one byte, and the
* large latency of reading or writing a large block.
*
* Note that although blocks up to 16MB are supported, the recordsize
* property can not be set larger than zfs_max_recordsize (default 1MB).
* See the comment near zfs_max_recordsize in dsl_dataset.c for details.
*
* Note that although the LSIZE field of the blkptr_t can store sizes up
* to 32MB, the dnode's dn_datablkszsec can only store sizes up to
* 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
*/
#define SPA_MINBLOCKSHIFT 9
#define SPA_MAXBLOCKSHIFT 17
#define SPA_OLD_MAXBLOCKSHIFT 17
#define SPA_MAXBLOCKSHIFT 24
#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT)
#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)

#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)

/*
* Size of block to hold the configuration data (a packed nvlist)
*/
Expand Down Expand Up @@ -830,6 +839,7 @@ extern boolean_t spa_has_slogs(spa_t *spa);
extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
extern boolean_t spa_has_pending_synctask(spa_t *spa);
extern int spa_maxblocksize(spa_t *spa);
extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);

extern int spa_mode(spa_t *spa);
Expand Down
3 changes: 1 addition & 2 deletions include/sys/zap_impl.h
Expand Up @@ -42,8 +42,7 @@ extern int fzap_default_block_shift;

#define MZAP_ENT_LEN 64
#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE

#define ZAP_NEED_CD (-1U)

Expand Down
5 changes: 4 additions & 1 deletion include/sys/zfs_ioctl.h
Expand Up @@ -96,13 +96,16 @@ typedef enum drr_headertype {
/* flags #3 - #15 are reserved for incompatible closed-source implementations */
#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16)
#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17)
/* flag #18 is reserved for a Delphix feature */
#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1<<19)

/*
* Mask of all supported backup features
*/
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)
DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
DMU_BACKUP_FEATURE_LARGE_BLOCKS)

/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
Expand Down
2 changes: 1 addition & 1 deletion include/sys/zfs_sa.h
Expand Up @@ -129,7 +129,7 @@ typedef struct znode_phys {
#ifdef _KERNEL

#define DXATTR_MAX_ENTRY_SIZE (32768)
#define DXATTR_MAX_SA_SIZE (SPA_MAXBLOCKSIZE >> 1)
#define DXATTR_MAX_SA_SIZE (SPA_OLD_MAXBLOCKSIZE >> 1)

int zfs_sa_readlink(struct znode *, uio_t *);
void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
Expand Down
2 changes: 0 additions & 2 deletions include/sys/zfs_znode.h
Expand Up @@ -137,8 +137,6 @@ extern "C" {
#define ZFS_SHARES_DIR "SHARES"
#define ZFS_SA_ATTRS "SA_ATTRS"

#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)

/*
* Path component length
*
Expand Down
1 change: 0 additions & 1 deletion include/sys/zil.h
Expand Up @@ -90,7 +90,6 @@ typedef struct zil_chain {
} zil_chain_t;

#define ZIL_MIN_BLKSZ 4096ULL
#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE

/*
* The words of a log block checksum.
Expand Down
2 changes: 1 addition & 1 deletion include/sys/zil_impl.h
Expand Up @@ -140,7 +140,7 @@ typedef struct zil_bp_node {
avl_node_t zn_node;
} zil_bp_node_t;

#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
sizeof (lr_write_t))

#ifdef __cplusplus
Expand Down

0 comments on commit f1512ee

Please sign in to comment.