From c598d91dcb0c7e95abdacb2711898ae14ab52ca1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Apr 2018 19:19:09 -0400 Subject: [PATCH] Update bcachefs sources to edf5f38218 bcachefs: Refactor superblock code --- .bcachefs_revision | 2 +- cmd_key.c | 6 +- cmd_migrate.c | 1 + include/linux/bug.h | 2 +- include/trace/events/bcachefs.h | 2 +- libbcachefs.c | 158 ++-- libbcachefs/alloc.c | 397 +++++---- libbcachefs/alloc.h | 12 +- libbcachefs/alloc_types.h | 5 +- libbcachefs/bcachefs.h | 11 +- libbcachefs/bcachefs_format.h | 5 +- libbcachefs/bkey_methods.c | 26 +- libbcachefs/bkey_methods.h | 2 +- libbcachefs/btree_gc.c | 14 +- libbcachefs/btree_iter.c | 21 +- libbcachefs/btree_iter.h | 30 +- libbcachefs/btree_types.h | 2 +- libbcachefs/btree_update_interior.c | 14 + libbcachefs/btree_update_interior.h | 2 + libbcachefs/btree_update_leaf.c | 12 + libbcachefs/buckets.c | 51 +- libbcachefs/buckets.h | 8 +- libbcachefs/buckets_types.h | 7 +- libbcachefs/chardev.c | 20 +- libbcachefs/checksum.c | 16 +- libbcachefs/checksum.h | 1 + libbcachefs/clock_types.h | 2 +- libbcachefs/compress.c | 2 +- libbcachefs/debug.c | 13 +- libbcachefs/dirent.c | 12 +- libbcachefs/dirent.h | 9 +- libbcachefs/disk_groups.c | 462 +++++++++++ libbcachefs/disk_groups.h | 99 +++ libbcachefs/extents.c | 230 +++--- libbcachefs/extents.h | 33 +- libbcachefs/fs-io.c | 5 +- libbcachefs/inode.c | 12 +- libbcachefs/inode.h | 8 +- libbcachefs/io.c | 91 +- libbcachefs/io.h | 2 - libbcachefs/journal.c | 143 ++-- libbcachefs/journal.h | 2 +- libbcachefs/migrate.c | 1 + libbcachefs/move.c | 1 + libbcachefs/movinggc.c | 19 +- libbcachefs/opts.c | 1 + libbcachefs/quota.c | 45 +- libbcachefs/quota.h | 11 +- libbcachefs/replicas.c | 698 ++++++++++++++++ libbcachefs/replicas.h | 51 ++ libbcachefs/super-io.c | 1195 +++------------------------ libbcachefs/super-io.h | 148 +--- libbcachefs/super.c | 487 +++++------ libbcachefs/super.h | 1 - libbcachefs/super_types.h | 6 +- libbcachefs/sysfs.c | 111 +-- libbcachefs/tier.c | 1 + libbcachefs/xattr.c | 12 +- libbcachefs/xattr.h | 9 +- 59 files changed, 2530 insertions(+), 2219 deletions(-) create mode 100644 libbcachefs/disk_groups.c create mode 100644 libbcachefs/disk_groups.h create mode 100644 libbcachefs/replicas.c create mode 100644 libbcachefs/replicas.h diff --git a/.bcachefs_revision b/.bcachefs_revision index 641ae5fe..a7c36b9e 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -9fc6ccd8659598d4ca885220a795889071b619f4 +edf5f38218f699e53913a549465f35d36c4418f7 diff --git a/cmd_key.c b/cmd_key.c index 0ca591c5..6052cb00 100644 --- a/cmd_key.c +++ b/cmd_key.c @@ -86,7 +86,7 @@ int cmd_set_passphrase(int argc, char *argv[]) if (IS_ERR(c)) die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c))); - struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb); + struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb); if (!crypt) die("Filesystem does not have encryption enabled"); @@ -100,7 +100,7 @@ int cmd_set_passphrase(int argc, char *argv[]) char *new_passphrase = read_passphrase_twice("Enter new passphrase: "); struct bch_key passphrase_key = derive_passphrase(crypt, new_passphrase); - if (bch2_chacha_encrypt_key(&passphrase_key, __bch2_sb_key_nonce(c->disk_sb), + if (bch2_chacha_encrypt_key(&passphrase_key, __bch2_sb_key_nonce(c->disk_sb.sb), &new_key, sizeof(new_key))) die("error encrypting key"); crypt->key = new_key; @@ -123,7 +123,7 @@ int cmd_remove_passphrase(int argc, char *argv[]) if (IS_ERR(c)) die("Error opening %s: %s", 
argv[1], strerror(-PTR_ERR(c))); - struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb); + struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb); if (!crypt) die("Filesystem does not have encryption enabled"); diff --git a/cmd_migrate.c b/cmd_migrate.c index a42d11ec..db20b71c 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -31,6 +31,7 @@ #include "libbcachefs/fs.h" #include "libbcachefs/inode.h" #include "libbcachefs/io.h" +#include "libbcachefs/replicas.h" #include "libbcachefs/str_hash.h" #include "libbcachefs/super.h" #include "libbcachefs/xattr.h" diff --git a/include/linux/bug.h b/include/linux/bug.h index e25568c8..f8929688 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -15,7 +15,7 @@ #define BUG_ON(cond) assert(!(cond)) #define WARN_ON_ONCE(cond) ({ bool _r = (cond); if (_r) assert(0); _r; }) -#define WARN_ONCE(cond, msg) ({ bool _r = (cond); if (_r) assert(0); _r; }) +#define WARN_ONCE(cond, ...) ({ bool _r = (cond); if (_r) assert(0); _r; }) #define __WARN() assert(0) #define __WARN_printf(arg...) assert(0) diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index a7be2d82..a34574ca 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -319,7 +319,7 @@ TRACE_EVENT(btree_gc_coalesce_fail, TP_fast_assign( __entry->reason = reason; - memcpy(__entry->uuid, c->disk_sb->user_uuid.b, 16); + memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16); ), TP_printk("%pU: %u", __entry->uuid, __entry->reason) diff --git a/libbcachefs.c b/libbcachefs.c index 052ca35b..a6eb9889 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -14,12 +14,14 @@ #include -#include "libbcachefs/bcachefs_format.h" -#include "libbcachefs/checksum.h" -#include "crypto.h" #include "libbcachefs.h" +#include "crypto.h" +#include "libbcachefs/bcachefs_format.h" #include "libbcachefs/btree_cache.h" +#include "libbcachefs/checksum.h" +#include "libbcachefs/disk_groups.h" #include "libbcachefs/opts.h" +#include "libbcachefs/replicas.h" #include "libbcachefs/super-io.h" #define NSEC_PER_SEC 1000000000L @@ -124,8 +126,8 @@ void bch2_pick_bucket_size(struct format_opts opts, struct dev_opts *dev) } -static unsigned parse_target(struct dev_opts *devs, size_t nr_devs, - struct bch_sb_field_disk_groups *gi, +static unsigned parse_target(struct bch_sb_handle *sb, + struct dev_opts *devs, size_t nr_devs, const char *s) { struct dev_opts *i; @@ -138,7 +140,7 @@ static unsigned parse_target(struct dev_opts *devs, size_t nr_devs, if (!strcmp(s, i->path)) return dev_to_target(i - devs); - idx = __bch2_disk_group_find(gi, s); + idx = bch2_disk_path_find(sb, s); if (idx >= 0) return group_to_target(idx); @@ -149,11 +151,9 @@ static unsigned parse_target(struct dev_opts *devs, size_t nr_devs, struct bch_sb *bch2_format(struct format_opts opts, struct dev_opts *devs, size_t nr_devs) { - struct bch_sb *sb; + struct bch_sb_handle sb = { NULL }; struct dev_opts *i; struct bch_sb_field_members *mi; - struct bch_sb_field_disk_groups *gi = NULL; - unsigned u64s; /* calculate block size: */ if (!opts.block_size) @@ -184,58 +184,51 @@ struct bch_sb *bch2_format(struct format_opts opts, if (uuid_is_null(opts.uuid.b)) uuid_generate(opts.uuid.b); - sb = calloc(1, sizeof(*sb) + - sizeof(struct bch_sb_field_members) + - sizeof(struct bch_member) * nr_devs + - sizeof(struct bch_sb_field_disk_groups) + - sizeof(struct bch_disk_group) * nr_devs + - sizeof(struct bch_sb_field_crypt)); + if (bch2_sb_realloc(&sb, 0)) + die("insufficient memory"); - sb->version = 
cpu_to_le64(BCH_SB_VERSION_MAX); - sb->magic = BCACHE_MAGIC; - sb->block_size = cpu_to_le16(opts.block_size); - sb->user_uuid = opts.uuid; - sb->nr_devices = nr_devs; + sb.sb->version = cpu_to_le64(BCH_SB_VERSION_MAX); + sb.sb->magic = BCACHE_MAGIC; + sb.sb->block_size = cpu_to_le16(opts.block_size); + sb.sb->user_uuid = opts.uuid; + sb.sb->nr_devices = nr_devs; - uuid_generate(sb->uuid.b); + uuid_generate(sb.sb->uuid.b); if (opts.label) - strncpy((char *) sb->label, opts.label, sizeof(sb->label)); - - SET_BCH_SB_CSUM_TYPE(sb, opts.meta_csum_type); - SET_BCH_SB_META_CSUM_TYPE(sb, opts.meta_csum_type); - SET_BCH_SB_DATA_CSUM_TYPE(sb, opts.data_csum_type); - SET_BCH_SB_COMPRESSION_TYPE(sb, opts.compression_type); - SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(sb, opts.background_compression_type); - - SET_BCH_SB_BTREE_NODE_SIZE(sb, opts.btree_node_size); - SET_BCH_SB_GC_RESERVE(sb, 8); - SET_BCH_SB_META_REPLICAS_WANT(sb, opts.meta_replicas); - SET_BCH_SB_META_REPLICAS_REQ(sb, opts.meta_replicas_required); - SET_BCH_SB_DATA_REPLICAS_WANT(sb, opts.data_replicas); - SET_BCH_SB_DATA_REPLICAS_REQ(sb, opts.data_replicas_required); - SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action); - SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH); - SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, ilog2(opts.encoded_extent_max)); - - SET_BCH_SB_POSIX_ACL(sb, 1); + strncpy((char *) sb.sb->label, opts.label, sizeof(sb.sb->label)); + + SET_BCH_SB_CSUM_TYPE(sb.sb, opts.meta_csum_type); + SET_BCH_SB_META_CSUM_TYPE(sb.sb, opts.meta_csum_type); + SET_BCH_SB_DATA_CSUM_TYPE(sb.sb, opts.data_csum_type); + SET_BCH_SB_COMPRESSION_TYPE(sb.sb, opts.compression_type); + SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(sb.sb, + opts.background_compression_type); + + SET_BCH_SB_BTREE_NODE_SIZE(sb.sb, opts.btree_node_size); + SET_BCH_SB_GC_RESERVE(sb.sb, 8); + SET_BCH_SB_META_REPLICAS_WANT(sb.sb, opts.meta_replicas); + SET_BCH_SB_META_REPLICAS_REQ(sb.sb, opts.meta_replicas_required); + SET_BCH_SB_DATA_REPLICAS_WANT(sb.sb, opts.data_replicas); + SET_BCH_SB_DATA_REPLICAS_REQ(sb.sb, opts.data_replicas_required); + SET_BCH_SB_ERROR_ACTION(sb.sb, opts.on_error_action); + SET_BCH_SB_STR_HASH_TYPE(sb.sb, BCH_STR_HASH_SIPHASH); + SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb.sb,ilog2(opts.encoded_extent_max)); + + SET_BCH_SB_POSIX_ACL(sb.sb, 1); struct timespec now; if (clock_gettime(CLOCK_REALTIME, &now)) die("error getting current time: %m"); - sb->time_base_lo = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec); - sb->time_precision = cpu_to_le32(1); - - mi = vstruct_end(sb); - u64s = (sizeof(struct bch_sb_field_members) + - sizeof(struct bch_member) * nr_devs) / sizeof(u64); - - le32_add_cpu(&sb->u64s, u64s); - le32_add_cpu(&mi->field.u64s, u64s); - mi->field.type = BCH_SB_FIELD_members; + sb.sb->time_base_lo = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec); + sb.sb->time_precision = cpu_to_le32(1); /* Member info: */ + mi = bch2_sb_resize_members(&sb, + (sizeof(*mi) + sizeof(struct bch_member) * + nr_devs) / sizeof(u64)); + for (i = devs; i < devs + nr_devs; i++) { struct bch_member *m = mi->members + (i - devs); @@ -253,63 +246,38 @@ struct bch_sb *bch2_format(struct format_opts opts, /* Disk groups */ for (i = devs; i < devs + nr_devs; i++) { struct bch_member *m = mi->members + (i - devs); - struct bch_disk_group *g; - size_t len; int idx; if (!i->group) continue; - len = min_t(size_t, strlen(i->group) + 1, BCH_SB_LABEL_SIZE); - - if (!gi) { - gi = vstruct_end(sb); - u64s = sizeof(*gi) / sizeof(u64); - le32_add_cpu(&sb->u64s, u64s); - 
le32_add_cpu(&gi->field.u64s, u64s); - gi->field.type = BCH_SB_FIELD_disk_groups; - } - - idx = __bch2_disk_group_find(gi, i->group); - if (idx >= 0) { - g = gi->entries + idx; - } else { - u64s = sizeof(*g) / sizeof(u64); - g = vstruct_end(&gi->field); - le32_add_cpu(&sb->u64s, u64s); - le32_add_cpu(&gi->field.u64s, u64s); - memcpy(g->label, i->group, len); - SET_BCH_GROUP_DATA_ALLOWED(g, ~0); - } + idx = bch2_disk_path_find_or_create(&sb, i->group); + if (idx < 0) + die("error creating disk path: %i", idx); - SET_BCH_MEMBER_GROUP(m, (g - gi->entries) + 1); + SET_BCH_MEMBER_GROUP(m, idx + 1); } - SET_BCH_SB_FOREGROUND_TARGET(sb, - parse_target(devs, nr_devs, gi, opts.foreground_target)); - SET_BCH_SB_BACKGROUND_TARGET(sb, - parse_target(devs, nr_devs, gi, opts.background_target)); - SET_BCH_SB_PROMOTE_TARGET(sb, - parse_target(devs, nr_devs, gi, opts.promote_target)); + SET_BCH_SB_FOREGROUND_TARGET(sb.sb, + parse_target(&sb, devs, nr_devs, opts.foreground_target)); + SET_BCH_SB_BACKGROUND_TARGET(sb.sb, + parse_target(&sb, devs, nr_devs, opts.background_target)); + SET_BCH_SB_PROMOTE_TARGET(sb.sb, + parse_target(&sb, devs, nr_devs, opts.promote_target)); /* Crypt: */ if (opts.encrypted) { - struct bch_sb_field_crypt *crypt = vstruct_end(sb); - - u64s = sizeof(struct bch_sb_field_crypt) / sizeof(u64); - - le32_add_cpu(&sb->u64s, u64s); - crypt->field.u64s = cpu_to_le32(u64s); - crypt->field.type = BCH_SB_FIELD_crypt; + struct bch_sb_field_crypt *crypt = + bch2_sb_resize_crypt(&sb, sizeof(*crypt) / sizeof(u64)); - bch_sb_crypt_init(sb, crypt, opts.passphrase); - SET_BCH_SB_ENCRYPTION_TYPE(sb, 1); + bch_sb_crypt_init(sb.sb, crypt, opts.passphrase); + SET_BCH_SB_ENCRYPTION_TYPE(sb.sb, 1); } for (i = devs; i < devs + nr_devs; i++) { - sb->dev_idx = i - devs; + sb.sb->dev_idx = i - devs; - init_layout(&sb->layout, opts.block_size, + init_layout(&sb.sb->layout, opts.block_size, i->sb_offset, i->sb_end); if (i->sb_offset == BCH_SB_SECTOR) { @@ -319,11 +287,11 @@ struct bch_sb *bch2_format(struct format_opts opts, xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0); } - bch2_super_write(i->fd, sb); + bch2_super_write(i->fd, sb.sb); close(i->fd); } - return sb; + return sb.sb; } void bch2_super_write(int fd, struct bch_sb *sb) @@ -553,11 +521,11 @@ static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f, typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units); -struct bch_sb_field_ops { +struct bch_sb_field_toolops { sb_field_print_fn print; }; -static const struct bch_sb_field_ops bch2_sb_field_ops[] = { +static const struct bch_sb_field_toolops bch2_sb_field_ops[] = { #define x(f, nr) \ [BCH_SB_FIELD_##f] = { \ .print = bch2_sb_print_##f, \ diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index ede44f73..16bdc48c 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -58,11 +58,13 @@ #include "btree_cache.h" #include "btree_io.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "btree_gc.h" #include "buckets.h" #include "checksum.h" #include "clock.h" #include "debug.h" +#include "disk_groups.h" #include "error.h" #include "extents.h" #include "io.h" @@ -79,7 +81,7 @@ #include #include -static void bch2_recalc_min_prio(struct bch_fs *, struct bch_dev *, int); +static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); /* Ratelimiting/PD controllers */ @@ -130,8 +132,7 @@ static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) return DIV_ROUND_UP(bytes, sizeof(u64)); } -static const char 
*bch2_alloc_invalid(const struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) { if (k.k->p.inode >= c->sb.nr_devices || !c->devs[k.k->p.inode]) @@ -152,8 +153,8 @@ static const char *bch2_alloc_invalid(const struct bch_fs *c, return NULL; } -static void bch2_alloc_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_alloc_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { buf[0] = '\0'; @@ -163,11 +164,6 @@ static void bch2_alloc_to_text(struct bch_fs *c, char *buf, } } -const struct bkey_ops bch2_bkey_alloc_ops = { - .key_invalid = bch2_alloc_invalid, - .val_to_text = bch2_alloc_to_text, -}; - static inline unsigned get_alloc_field(const u8 **p, unsigned bytes) { unsigned v; @@ -236,9 +232,9 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) d = a.v->data; if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - g->prio[READ] = get_alloc_field(&d, 2); + g->io_time[READ] = get_alloc_field(&d, 2); if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - g->prio[WRITE] = get_alloc_field(&d, 2); + g->io_time[WRITE] = get_alloc_field(&d, 2); lg_local_unlock(&c->usage_lock); } @@ -270,21 +266,21 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) bch2_alloc_read_key(c, bkey_i_to_s_c(k)); } - mutex_lock(&c->prio_clock[READ].lock); + mutex_lock(&c->bucket_clock[READ].lock); for_each_member_device(ca, c, i) { down_read(&ca->bucket_lock); - bch2_recalc_min_prio(c, ca, READ); + bch2_recalc_oldest_io(c, ca, READ); up_read(&ca->bucket_lock); } - mutex_unlock(&c->prio_clock[READ].lock); + mutex_unlock(&c->bucket_clock[READ].lock); - mutex_lock(&c->prio_clock[WRITE].lock); + mutex_lock(&c->bucket_clock[WRITE].lock); for_each_member_device(ca, c, i) { down_read(&ca->bucket_lock); - bch2_recalc_min_prio(c, ca, WRITE); + bch2_recalc_oldest_io(c, ca, WRITE); up_read(&ca->bucket_lock); } - mutex_unlock(&c->prio_clock[WRITE].lock); + mutex_unlock(&c->bucket_clock[WRITE].lock); return 0; } @@ -320,9 +316,9 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, d = a->v.data; if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - put_alloc_field(&d, 2, g->prio[READ]); + put_alloc_field(&d, 2, g->io_time[READ]); if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - put_alloc_field(&d, 2, g->prio[WRITE]); + put_alloc_field(&d, 2, g->io_time[WRITE]); lg_local_unlock(&c->usage_lock); ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, @@ -395,38 +391,34 @@ int bch2_alloc_write(struct bch_fs *c) /* Bucket IO clocks: */ -static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw) +static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) { - struct prio_clock *clock = &c->prio_clock[rw]; + struct bucket_clock *clock = &c->bucket_clock[rw]; struct bucket_array *buckets = bucket_array(ca); struct bucket *g; - u16 max_delta = 1; + u16 max_last_io = 0; unsigned i; - lockdep_assert_held(&c->prio_clock[rw].lock); + lockdep_assert_held(&c->bucket_clock[rw].lock); - /* Determine min prio for this particular device */ + /* Recalculate max_last_io for this device: */ for_each_bucket(g, buckets) - max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw])); + max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); - ca->min_prio[rw] = clock->hand - max_delta; + ca->max_last_bucket_io[rw] = max_last_io; - /* - * This may possibly increase the min prio for the whole device, check - * that as well. 
- */ - max_delta = 1; + /* Recalculate global max_last_io: */ + max_last_io = 0; for_each_member_device(ca, c, i) - max_delta = max(max_delta, - (u16) (clock->hand - ca->min_prio[rw])); + max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); - clock->min_prio = clock->hand - max_delta; + clock->max_last_io = max_last_io; } -static void bch2_rescale_prios(struct bch_fs *c, int rw) +static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) { - struct prio_clock *clock = &c->prio_clock[rw]; + struct bucket_clock *clock = &c->bucket_clock[rw]; struct bucket_array *buckets; struct bch_dev *ca; struct bucket *g; @@ -439,10 +431,10 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw) buckets = bucket_array(ca); for_each_bucket(g, buckets) - g->prio[rw] = clock->hand - - (clock->hand - g->prio[rw]) / 2; + g->io_time[rw] = clock->hand - + bucket_last_io(c, g, rw) / 2; - bch2_recalc_min_prio(c, ca, rw); + bch2_recalc_oldest_io(c, ca, rw); up_read(&ca->bucket_lock); } @@ -450,19 +442,26 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw) static void bch2_inc_clock_hand(struct io_timer *timer) { - struct prio_clock *clock = container_of(timer, - struct prio_clock, rescale); + struct bucket_clock *clock = container_of(timer, + struct bucket_clock, rescale); struct bch_fs *c = container_of(clock, - struct bch_fs, prio_clock[clock->rw]); + struct bch_fs, bucket_clock[clock->rw]); + struct bch_dev *ca; u64 capacity; + unsigned i; mutex_lock(&clock->lock); - clock->hand++; - /* if clock cannot be advanced more, rescale prio */ - if (clock->hand == (u16) (clock->min_prio - 1)) - bch2_rescale_prios(c, clock->rw); + if (clock->max_last_io >= U16_MAX - 2) + bch2_rescale_bucket_io_times(c, clock->rw); + + BUG_ON(clock->max_last_io >= U16_MAX - 2); + + for_each_member_device(ca, c, i) + ca->max_last_bucket_io[clock->rw]++; + clock->max_last_io++; + clock->hand++; mutex_unlock(&clock->lock); @@ -484,9 +483,9 @@ static void bch2_inc_clock_hand(struct io_timer *timer) bch2_io_timer_add(&c->io_clock[clock->rw], timer); } -static void bch2_prio_timer_init(struct bch_fs *c, int rw) +static void bch2_bucket_clock_init(struct bch_fs *c, int rw) { - struct prio_clock *clock = &c->prio_clock[rw]; + struct bucket_clock *clock = &c->bucket_clock[rw]; clock->hand = 1; clock->rw = rw; @@ -536,7 +535,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) while (1) { set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) { - ret = -1; + ret = 1; break; } @@ -635,13 +634,14 @@ static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark m) { + unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); + unsigned max_last_io = ca->max_last_bucket_io[READ]; + /* * Time since last read, scaled to [0, 8) where larger value indicates * more recently read data: */ - unsigned long hotness = - (bucket(ca, b)->prio[READ] - ca->min_prio[READ]) * 7 / - (c->prio_clock[READ].hand - ca->min_prio[READ]); + unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; /* How much we want to keep the data in this bucket: */ unsigned long data_wantness = @@ -659,23 +659,25 @@ static inline int bucket_alloc_cmp(alloc_heap *h, struct alloc_heap_entry l, struct alloc_heap_entry r) { - return (l.key > r.key) - (l.key < r.key); + return (l.key > r.key) - (l.key < r.key) ?: + (l.nr < r.nr) - (l.nr > r.nr) ?: + (l.bucket > r.bucket) - (l.bucket < r.bucket); } static 
void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; - struct alloc_heap_entry e; + struct alloc_heap_entry e = { 0 }; size_t b; ca->alloc_heap.used = 0; - mutex_lock(&c->prio_clock[READ].lock); + mutex_lock(&c->bucket_clock[READ].lock); down_read(&ca->bucket_lock); buckets = bucket_array(ca); - bch2_recalc_min_prio(c, ca, READ); + bch2_recalc_oldest_io(c, ca, READ); /* * Find buckets with lowest read priority, by building a maxheap sorted @@ -684,30 +686,45 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) */ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + unsigned long key = bucket_sort_key(c, ca, b, m); if (!bch2_can_invalidate_bucket(ca, b, m)) continue; - e = (struct alloc_heap_entry) { - .bucket = b, - .key = bucket_sort_key(c, ca, b, m) - }; + if (e.nr && e.bucket + e.nr == b && e.key == key) { + e.nr++; + } else { + if (e.nr) + heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); + + e = (struct alloc_heap_entry) { + .bucket = b, + .nr = 1, + .key = key, + }; + } - heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); + cond_resched(); } + if (e.nr) + heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); + up_read(&ca->bucket_lock); - mutex_unlock(&c->prio_clock[READ].lock); + mutex_unlock(&c->bucket_clock[READ].lock); heap_resort(&ca->alloc_heap, bucket_alloc_cmp); - /* - * If we run out of buckets to invalidate, bch2_allocator_thread() will - * kick stuff and retry us - */ - while (!fifo_full(&ca->free_inc) && - heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) - bch2_invalidate_one_bucket(c, ca, e.bucket); + while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) { + for (b = e.bucket; + b < e.bucket + e.nr; + b++) { + if (fifo_full(&ca->free_inc)) + return; + + bch2_invalidate_one_bucket(c, ca, b); + } + } } static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) @@ -729,6 +746,8 @@ static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) if (bch2_can_invalidate_bucket(ca, b, m)) bch2_invalidate_one_bucket(c, ca, b); + + cond_resched(); } } @@ -749,6 +768,8 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca if (bch2_can_invalidate_bucket(ca, b, m)) bch2_invalidate_one_bucket(c, ca, b); + + cond_resched(); } } @@ -850,7 +871,7 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { - ret = -1; + ret = 1; break; } @@ -880,7 +901,7 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) ca->mi.bucket_size, GFP_NOIO, 0); if (push_invalidated_bucket(c, ca, bucket)) - return -1; + return 1; } return 0; @@ -905,17 +926,32 @@ static int bch2_allocator_thread(void *arg) while (1) { while (1) { + cond_resched(); + + pr_debug("discarding %zu invalidated buckets", + ca->nr_invalidated); + ret = discard_invalidated_buckets(c, ca); if (ret) - return 0; + goto stop; if (fifo_empty(&ca->free_inc)) break; + pr_debug("invalidating %zu buckets", + fifo_used(&ca->free_inc)); + journal_seq = 0; ret = bch2_invalidate_free_inc(c, ca, &journal_seq, SIZE_MAX); - if (ret) - return 0; + if (ret) { + bch_err(ca, "error invalidating buckets: %i", ret); + goto stop; + } + + if (!ca->nr_invalidated) { + bch_err(ca, "allocator thread unable to make forward progress!"); + goto stop; + } if (ca->allocator_invalidating_data) ret = 
bch2_journal_flush_seq(&c->journal, journal_seq); @@ -927,22 +963,29 @@ static int bch2_allocator_thread(void *arg) * journal error - buckets haven't actually been * invalidated, can't discard them: */ - if (ret) - return 0; + if (ret) { + bch_err(ca, "journal error: %i", ret); + goto stop; + } } + pr_debug("free_inc now empty"); + /* Reset front/back so we can easily sort fifo entries later: */ ca->free_inc.front = ca->free_inc.back = 0; ca->allocator_journal_seq_flush = 0; ca->allocator_invalidating_data = false; down_read(&c->gc_lock); - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { - up_read(&c->gc_lock); - return 0; - } - while (1) { + size_t prev = fifo_used(&ca->free_inc); + + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { + up_read(&c->gc_lock); + bch_err(ca, "gc failure"); + goto stop; + } + /* * Find some buckets that we can invalidate, either * they're completely unused, or only contain clean data @@ -950,7 +993,14 @@ static int bch2_allocator_thread(void *arg) * another cache tier */ + pr_debug("scanning for reclaimable buckets"); + find_reclaimable_buckets(c, ca); + + pr_debug("found %zu buckets (free_inc %zu/%zu)", + fifo_used(&ca->free_inc) - prev, + fifo_used(&ca->free_inc), ca->free_inc.size); + trace_alloc_batch(ca, fifo_used(&ca->free_inc), ca->free_inc.size); @@ -977,15 +1027,20 @@ static int bch2_allocator_thread(void *arg) ca->allocator_blocked = true; closure_wake_up(&c->freelist_wait); - if (wait_buckets_available(c, ca)) { + ret = wait_buckets_available(c, ca); + if (ret) { up_read(&c->gc_lock); - return 0; + goto stop; } } ca->allocator_blocked = false; up_read(&c->gc_lock); + pr_debug("free_inc now %zu/%zu", + fifo_used(&ca->free_inc), + ca->free_inc.size); + sort_free_inc(c, ca); /* @@ -993,6 +1048,10 @@ static int bch2_allocator_thread(void *arg) * write out the new bucket gens: */ } + +stop: + pr_debug("alloc thread stopping (ret %i)", ret); + return 0; } /* Allocation */ @@ -1046,8 +1105,8 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } -/* _only_ for allocating the journal and btree roots on a brand new fs: */ -int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca) +/* _only_ for allocating the journal on a new device: */ +long bch2_bucket_alloc_new_fs(struct bch_dev *ca) { struct bucket_array *buckets; ssize_t b; @@ -1056,14 +1115,8 @@ int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca) buckets = bucket_array(ca); for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) - if (is_available_bucket(buckets->b[b].mark)) { - bch2_mark_alloc_bucket(c, ca, b, true, - gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - set_bit(b, ca->buckets_dirty); + if (is_available_bucket(buckets->b[b].mark)) goto success; - } b = -1; success: rcu_read_unlock(); @@ -1135,9 +1188,8 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, break; } - if (unlikely(test_bit(BCH_FS_BRAND_NEW_FS, &c->flags)) && - (bucket = bch2_bucket_alloc_startup(c, ca)) >= 0) - goto out; + if (cl) + closure_wait(&c->freelist_wait, cl); spin_unlock(&c->freelist_lock); @@ -1218,7 +1270,7 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, *v = *v < scale ? 
0 : *v - scale; } -static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c, +static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, unsigned nr_replicas, enum alloc_reserve reserve, @@ -1284,52 +1336,22 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c, break; } } + rcu_read_unlock(); EBUG_ON(reserve == RESERVE_MOVINGGC && ret != ALLOC_SUCCESS && ret != OPEN_BUCKETS_EMPTY); - rcu_read_unlock(); - return ret; -} - -static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, - unsigned nr_replicas, - enum alloc_reserve reserve, - struct bch_devs_mask *devs, - struct closure *cl) -{ - bool waiting = false; - - while (1) { - switch (__bch2_bucket_alloc_set(c, wp, nr_replicas, - reserve, devs, cl)) { - case ALLOC_SUCCESS: - if (waiting) - closure_wake_up(&c->freelist_wait); - - return 0; - - case NO_DEVICES: - if (waiting) - closure_wake_up(&c->freelist_wait); - return -EROFS; - - case FREELIST_EMPTY: - if (!cl) - return -ENOSPC; - if (waiting) - return -EAGAIN; - - /* Retry allocation after adding ourself to waitlist: */ - closure_wait(&c->freelist_wait, cl); - waiting = true; - break; - case OPEN_BUCKETS_EMPTY: - return cl ? -EAGAIN : -ENOSPC; - default: - BUG(); - } + switch (ret) { + case ALLOC_SUCCESS: + return 0; + case NO_DEVICES: + return -EROFS; + case FREELIST_EMPTY: + case OPEN_BUCKETS_EMPTY: + return cl ? -EAGAIN : -ENOSPC; + default: + BUG(); } } @@ -1530,11 +1552,12 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, nr_ptrs_have = wp->first_ptr; /* does writepoint have ptrs we don't want to use? */ - writepoint_for_each_ptr(wp, ob, i) - if (!dev_idx_in_target(c, ob->ptr.dev, target)) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - } + if (target) + writepoint_for_each_ptr(wp, ob, i) + if (!dev_idx_in_target(c, ob->ptr.dev, target)) { + swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); + wp->first_ptr++; + } if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) { ret = open_bucket_add_buckets(c, target, wp, devs_have, @@ -1551,7 +1574,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, nr_replicas, reserve, cl); } - if (ret) + if (ret && ret != -EROFS) goto err; alloc_done: /* check for more than one cache: */ @@ -1584,6 +1607,13 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, nr_ptrs_effective += ca->mi.durability; } + if (ret == -EROFS && + nr_ptrs_effective >= nr_replicas_required) + ret = 0; + + if (ret) + goto err; + if (nr_ptrs_effective > nr_replicas) { writepoint_for_each_ptr(wp, ob, i) { ca = bch_dev_bkey_exists(c, ob->ptr.dev); @@ -1749,14 +1779,14 @@ void bch2_recalc_capacity(struct bch_fs *c) if (c->capacity) { bch2_io_timer_add(&c->io_clock[READ], - &c->prio_clock[READ].rescale); + &c->bucket_clock[READ].rescale); bch2_io_timer_add(&c->io_clock[WRITE], - &c->prio_clock[WRITE].rescale); + &c->bucket_clock[WRITE].rescale); } else { bch2_io_timer_del(&c->io_clock[READ], - &c->prio_clock[READ].rescale); + &c->bucket_clock[READ].rescale); bch2_io_timer_del(&c->io_clock[WRITE], - &c->prio_clock[WRITE].rescale); + &c->bucket_clock[WRITE].rescale); } /* Wake up case someone was waiting for buckets */ @@ -1889,7 +1919,8 @@ int bch2_dev_allocator_start(struct bch_dev *ca) if (ca->alloc_thread) return 0; - p = kthread_create(bch2_allocator_thread, ca, "bcache_allocator"); + p = kthread_create(bch2_allocator_thread, ca, + "bch_alloc[%s]", ca->name); if (IS_ERR(p)) return PTR_ERR(p); @@ -1923,7 +1954,7 @@ static void 
allocator_start_issue_discards(struct bch_fs *c) static int __bch2_fs_allocator_start(struct bch_fs *c) { struct bch_dev *ca; - size_t bu, i, devs_have_enough = 0; + size_t bu, i; unsigned dev_iter; u64 journal_seq = 0; bool invalidating_data = false; @@ -1964,16 +1995,21 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) /* did we find enough buckets? */ for_each_rw_member(ca, c, dev_iter) - devs_have_enough += (fifo_used(&ca->free_inc) >= - ca->free[RESERVE_BTREE].size); + if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) { + percpu_ref_put(&ca->io_ref); + goto not_enough; + } - if (devs_have_enough >= c->opts.metadata_replicas) - return 0; + return 0; +not_enough: + pr_debug("did not find enough empty buckets; issuing discards"); /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */ for_each_rw_member(ca, c, dev_iter) discard_invalidated_buckets(c, ca); + pr_debug("scanning for reclaimable buckets"); + for_each_rw_member(ca, c, dev_iter) { BUG_ON(!fifo_empty(&ca->free_inc)); ca->free_inc.front = ca->free_inc.back = 0; @@ -1988,6 +2024,8 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) break; } + pr_debug("done scanning for reclaimable buckets"); + /* * We're moving buckets to freelists _before_ they've been marked as * invalidated on disk - we have to so that we can allocate new btree @@ -1997,10 +2035,13 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) * have cached data in them, which is live until they're marked as * invalidated on disk: */ - if (invalidating_data) + if (invalidating_data) { + pr_debug("invalidating existing data"); set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); - else + } else { + pr_debug("issuing discards"); allocator_start_issue_discards(c); + } /* * XXX: it's possible for this to deadlock waiting on journal reclaim, @@ -2017,13 +2058,15 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) } if (invalidating_data) { + pr_debug("flushing journal"); + ret = bch2_journal_flush_seq(&c->journal, journal_seq); if (ret) return ret; - } - if (invalidating_data) + pr_debug("issuing discards"); allocator_start_issue_discards(c); + } for_each_rw_member(ca, c, dev_iter) while (ca->nr_invalidated) { @@ -2038,19 +2081,43 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) struct bucket_table *tbl; struct rhash_head *pos; struct btree *b; + bool flush_updates; + size_t nr_pending_updates; clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); again: + pr_debug("flushing dirty btree nodes"); + cond_resched(); + + flush_updates = false; + nr_pending_updates = bch2_btree_interior_updates_nr_pending(c); + + rcu_read_lock(); for_each_cached_btree(b, c, tbl, i, pos) if (btree_node_dirty(b) && (!b->written || b->level)) { - rcu_read_unlock(); - six_lock_read(&b->lock); - bch2_btree_node_write(c, b, SIX_LOCK_read); - six_unlock_read(&b->lock); - goto again; + if (btree_node_may_write(b)) { + rcu_read_unlock(); + six_lock_read(&b->lock); + bch2_btree_node_write(c, b, SIX_LOCK_read); + six_unlock_read(&b->lock); + goto again; + } else { + flush_updates = true; + } } rcu_read_unlock(); + + /* + * This is ugly, but it's needed to flush btree node writes + * without spinning... 
+ */ + if (flush_updates) { + closure_wait_event(&c->btree_interior_update_wait, + bch2_btree_interior_updates_nr_pending(c) < + nr_pending_updates); + goto again; + } } return 0; @@ -2087,8 +2154,8 @@ void bch2_fs_allocator_init(struct bch_fs *c) mutex_init(&c->write_points_hash_lock); spin_lock_init(&c->freelist_lock); - bch2_prio_timer_init(c, READ); - bch2_prio_timer_init(c, WRITE); + bch2_bucket_clock_init(c, READ); + bch2_bucket_clock_init(c, WRITE); /* open bucket 0 is a sentinal NULL: */ spin_lock_init(&c->open_buckets[0].lock); diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index f914dbd5..372cc047 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -9,6 +9,14 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; +const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); + +#define bch2_bkey_alloc_ops (struct bkey_ops) { \ + .key_invalid = bch2_alloc_invalid, \ + .val_to_text = bch2_alloc_to_text, \ +} + struct dev_alloc_list { unsigned nr; u8 devs[BCH_SB_MEMBERS_MAX]; @@ -30,6 +38,8 @@ enum bucket_alloc_ret { NO_DEVICES = -3, /* -EROFS */ }; +long bch2_bucket_alloc_new_fs(struct bch_dev *); + int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool, struct closure *); @@ -127,6 +137,4 @@ int bch2_alloc_write(struct bch_fs *); int bch2_fs_allocator_start(struct bch_fs *); void bch2_fs_allocator_init(struct bch_fs *); -extern const struct bkey_ops bch2_bkey_alloc_ops; - #endif /* _BCACHEFS_ALLOC_H */ diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index f3bd4701..8a71a376 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -8,7 +8,7 @@ #include "fifo.h" /* There's two of these clocks, one for reads and one for writes: */ -struct prio_clock { +struct bucket_clock { /* * "now" in (read/write) IO time - incremented whenever we do X amount * of reads or writes. @@ -23,7 +23,7 @@ struct prio_clock { * consistent. */ u16 hand; - u16 min_prio; + u16 max_last_io; int rw; @@ -80,6 +80,7 @@ struct write_point_specifier { struct alloc_heap_entry { size_t bucket; + size_t nr; unsigned long key; }; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 369d078c..bc10324f 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -384,7 +384,7 @@ struct bch_dev { alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; spinlock_t freelist_lock; - unsigned nr_invalidated; + size_t nr_invalidated; u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; unsigned open_buckets_partial_nr; @@ -392,7 +392,7 @@ struct bch_dev { size_t fifo_last_bucket; /* last calculated minimum prio */ - u16 min_prio[2]; + u16 max_last_bucket_io[2]; atomic_long_t saturated_count; size_t inc_gen_needs_gc; @@ -431,11 +431,11 @@ struct bch_dev { */ enum { /* startup: */ - BCH_FS_BRAND_NEW_FS, BCH_FS_ALLOC_READ_DONE, BCH_FS_ALLOCATOR_STARTED, BCH_FS_INITIAL_GC_DONE, BCH_FS_FSCK_DONE, + BCH_FS_STARTED, /* shutdown: */ BCH_FS_EMERGENCY_RO, @@ -519,8 +519,7 @@ struct bch_fs { u64 features; } sb; - struct bch_sb *disk_sb; - unsigned disk_sb_order; + struct bch_sb_handle disk_sb; unsigned short block_bits; /* ilog2(block_size) */ @@ -595,7 +594,7 @@ struct bch_fs { * those together consistently we keep track of the smallest nonzero * priority of any bucket. 
*/ - struct prio_clock prio_clock[2]; + struct bucket_clock bucket_clock[2]; struct io_clock io_clock[2]; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index d89f7781..eed6fb85 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -955,8 +955,9 @@ struct bch_disk_group { __le64 flags[2]; }; -LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) +LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) +LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) +LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) struct bch_sb_field_disk_groups { struct bch_sb_field field; diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 84cdf662..e4f62f90 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -10,20 +10,20 @@ #include "quota.h" #include "xattr.h" -const struct bkey_ops *bch2_bkey_ops[] = { - [BKEY_TYPE_EXTENTS] = &bch2_bkey_extent_ops, - [BKEY_TYPE_INODES] = &bch2_bkey_inode_ops, - [BKEY_TYPE_DIRENTS] = &bch2_bkey_dirent_ops, - [BKEY_TYPE_XATTRS] = &bch2_bkey_xattr_ops, - [BKEY_TYPE_ALLOC] = &bch2_bkey_alloc_ops, - [BKEY_TYPE_QUOTAS] = &bch2_bkey_quota_ops, - [BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops, +const struct bkey_ops bch2_bkey_ops[] = { + [BKEY_TYPE_EXTENTS] = bch2_bkey_extent_ops, + [BKEY_TYPE_INODES] = bch2_bkey_inode_ops, + [BKEY_TYPE_DIRENTS] = bch2_bkey_dirent_ops, + [BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops, + [BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops, + [BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops, + [BKEY_TYPE_BTREE] = bch2_bkey_btree_ops, }; const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type, struct bkey_s_c k) { - const struct bkey_ops *ops = bch2_bkey_ops[type]; + const struct bkey_ops *ops = &bch2_bkey_ops[type]; switch (k.k->type) { case KEY_TYPE_DELETED: @@ -51,7 +51,7 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type, const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, struct bkey_s_c k) { - const struct bkey_ops *ops = bch2_bkey_ops[type]; + const struct bkey_ops *ops = &bch2_bkey_ops[type]; if (k.k->u64s < BKEY_U64s) return "u64s too small"; @@ -100,7 +100,7 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { enum bkey_type type = btree_node_type(b); - const struct bkey_ops *ops = bch2_bkey_ops[type]; + const struct bkey_ops *ops = &bch2_bkey_ops[type]; const char *invalid; BUG_ON(!k.k->u64s); @@ -141,7 +141,7 @@ int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k) int bch2_val_to_text(struct bch_fs *c, enum bkey_type type, char *buf, size_t size, struct bkey_s_c k) { - const struct bkey_ops *ops = bch2_bkey_ops[type]; + const struct bkey_ops *ops = &bch2_bkey_ops[type]; char *out = buf, *end = buf + size; switch (k.k->type) { @@ -182,7 +182,7 @@ void bch2_bkey_swab(enum bkey_type type, const struct bkey_format *f, struct bkey_packed *k) { - const struct bkey_ops *ops = bch2_bkey_ops[type]; + const struct bkey_ops *ops = &bch2_bkey_ops[type]; bch2_bkey_swab_key(f, k); diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index 59db3037..9e2c90d5 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -81,6 +81,6 @@ int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type, void bch2_bkey_swab(enum bkey_type, const 
struct bkey_format *, struct bkey_packed *); -extern const struct bkey_ops *bch2_bkey_ops[]; +extern const struct bkey_ops bch2_bkey_ops[]; #endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index f2e9c10e..ad51f29c 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -18,6 +18,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "replicas.h" #include "super-io.h" #include @@ -317,7 +318,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, unsigned i; u64 b; - lockdep_assert_held(&c->sb_lock); + if (c) + lockdep_assert_held(&c->sb_lock); for (i = 0; i < layout->nr_superblocks; i++) { u64 offset = le64_to_cpu(layout->sb_offset[i]); @@ -331,7 +333,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, BCH_DATA_SB, flags); } - spin_lock(&c->journal.lock); + if (c) + spin_lock(&c->journal.lock); for (i = 0; i < ca->journal.nr; i++) { b = ca->journal.buckets[i]; @@ -340,7 +343,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, gc_phase(GC_PHASE_SB), flags); } - spin_unlock(&c->journal.lock); + if (c) + spin_unlock(&c->journal.lock); } static void bch2_mark_superblocks(struct bch_fs *c) @@ -1034,8 +1038,8 @@ static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal) int ret; mutex_lock(&c->sb_lock); - if (!bch2_sb_get_replicas(c->disk_sb)) { - if (BCH_SB_INITIALIZED(c->disk_sb)) + if (!bch2_sb_get_replicas(c->disk_sb.sb)) { + if (BCH_SB_INITIALIZED(c->disk_sb.sb)) bch_info(c, "building replicas info"); set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index cc5bcbb2..465aadba 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1290,16 +1290,19 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { + iter->pos = btree_type_successor(iter->btree_id, iter->k.p); + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - struct bkey_s_c k; + /* + * XXX: when we just need to relock we should be able to avoid + * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK + * for that to work + */ + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - k = bch2_btree_iter_peek_slot(iter); - if (btree_iter_err(k)) - return k; + return bch2_btree_iter_peek_slot(iter); } - iter->pos = btree_type_successor(iter->btree_id, iter->k.p); - if (!bkey_deleted(&iter->k)) __btree_iter_advance(&iter->l[0]); @@ -1318,6 +1321,8 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, iter->c = c; iter->pos = pos; + bkey_init(&iter->k); + iter->k.p = pos; iter->flags = flags; iter->uptodate = BTREE_ITER_NEED_TRAVERSE; iter->btree_id = btree_id; @@ -1330,6 +1335,10 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, iter->l[iter->level].b = BTREE_ITER_NOT_END; iter->next = iter; + if (unlikely((flags & BTREE_ITER_IS_EXTENTS) && + !bkey_cmp(pos, POS_MAX))) + iter->uptodate = BTREE_ITER_END; + prefetch(c->btree_roots[btree_id].b); } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 318b0424..95191ba2 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -231,6 +231,20 @@ static inline int btree_iter_cmp(const struct btree_iter *l, return __btree_iter_cmp(l->btree_id, l->pos, r); } +/* + * Unlocks before scheduling + * Note: does not revalidate iterator + */ +static inline void bch2_btree_iter_cond_resched(struct 
btree_iter *iter) +{ + if (need_resched()) { + bch2_btree_iter_unlock(iter); + schedule(); + } else if (race_fault()) { + bch2_btree_iter_unlock(iter); + } +} + #define __for_each_btree_node(_iter, _c, _btree_id, _start, \ _locks_want, _depth, _flags, _b) \ for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \ @@ -253,6 +267,8 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, unsigned flags) { + bch2_btree_iter_cond_resched(iter); + return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_next_slot(iter) : bch2_btree_iter_next(iter); @@ -275,18 +291,4 @@ static inline int btree_iter_err(struct bkey_s_c k) return PTR_ERR_OR_ZERO(k.k); } -/* - * Unlocks before scheduling - * Note: does not revalidate iterator - */ -static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter) -{ - if (need_resched()) { - bch2_btree_iter_unlock(iter); - schedule(); - } else if (race_fault()) { - bch2_btree_iter_unlock(iter); - } -} - #endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index e86c6bce..8854305d 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -299,7 +299,7 @@ static inline enum bkey_type btree_node_type(struct btree *b) static inline const struct bkey_ops *btree_node_ops(struct btree *b) { - return bch2_bkey_ops[btree_node_type(b)]; + return &bch2_bkey_ops[btree_node_type(b)]; } static inline bool btree_node_has_ptrs(struct btree *b) diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index f42239da..63696920 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -13,6 +13,7 @@ #include "extents.h" #include "journal.h" #include "keylist.h" +#include "replicas.h" #include "super-io.h" #include @@ -2116,3 +2117,16 @@ ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) return out - buf; } + +size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) +{ + size_t ret = 0; + struct list_head *i; + + mutex_lock(&c->btree_interior_update_lock); + list_for_each(i, &c->btree_interior_update_list) + ret++; + mutex_unlock(&c->btree_interior_update_lock); + + return ret; +} diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 0b58ccc9..3e66d69e 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -343,4 +343,6 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans, ssize_t bch2_btree_updates_print(struct bch_fs *, char *); +size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 007aa5ef..53b39de5 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -443,8 +443,20 @@ int __bch2_btree_insert_at(struct btree_insert *trans) * potentially blocks the allocator: */ ret = bch2_btree_split_leaf(c, split, trans->flags); + + /* + * This can happen when we insert part of an extent - with an update + * with multiple keys, we don't want to redo the entire update - that's + * just too confusing: + */ + if (!ret && + (trans->flags & BTREE_INSERT_ATOMIC) && + trans->did_work) + ret = -EINTR; + if (ret) goto err; + /* * if the split didn't have to drop locks the insert will still be * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked() diff --git 
a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 864de940..1f944cb8 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -309,7 +309,7 @@ static bool bucket_became_unavailable(struct bch_fs *c, { return is_available_bucket(old) && !is_available_bucket(new) && - c && c->gc_pos.phase == GC_PHASE_DONE; + (!c || c->gc_pos.phase == GC_PHASE_DONE); } void bch2_fs_usage_apply(struct bch_fs *c, @@ -351,12 +351,16 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, { struct bch_dev_usage *dev_usage; - lockdep_assert_held(&c->usage_lock); + if (c) + lockdep_assert_held(&c->usage_lock); - bch2_fs_inconsistent_on(old.data_type && new.data_type && - old.data_type != new.data_type, c, + if (old.data_type && new.data_type && + old.data_type != new.data_type) { + BUG_ON(!c); + bch2_fs_inconsistent(c, "different types of data in same bucket: %u, %u", old.data_type, new.data_type); + } dev_usage = this_cpu_ptr(ca->usage_percpu); @@ -466,21 +470,29 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(!type); - lg_local_lock(&c->usage_lock); - g = bucket(ca, b); + if (likely(c)) { + lg_local_lock(&c->usage_lock); - if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) { - lg_local_unlock(&c->usage_lock); - return; + if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && + gc_will_visit(c, pos)) { + lg_local_unlock(&c->usage_lock); + return; + } } + preempt_disable(); + + g = bucket(ca, b); old = bucket_data_cmpxchg(c, ca, g, new, ({ saturated_add(ca, new.dirty_sectors, sectors, GC_MAX_SECTORS_USED); new.data_type = type; })); - lg_local_unlock(&c->usage_lock); + + preempt_enable(); + + if (likely(c)) + lg_local_unlock(&c->usage_lock); BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && bucket_became_unavailable(c, old, new)); @@ -859,9 +871,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bch2_copygc_stop(ca); - down_write(&c->gc_lock); - down_write(&ca->bucket_lock); - lg_global_lock(&c->usage_lock); + if (resize) { + down_write(&c->gc_lock); + down_write(&ca->bucket_lock); + lg_global_lock(&c->usage_lock); + } old_buckets = bucket_array(ca); @@ -885,7 +899,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) swap(ca->oldest_gens, oldest_gens); swap(ca->buckets_dirty, buckets_dirty); - lg_global_unlock(&c->usage_lock); + if (resize) + lg_global_unlock(&c->usage_lock); spin_lock(&c->freelist_lock); for (i = 0; i < RESERVE_NR; i++) { @@ -904,8 +919,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) nbuckets = ca->mi.nbuckets; - up_write(&ca->bucket_lock); - up_write(&c->gc_lock); + if (resize) { + up_write(&ca->bucket_lock); + up_write(&c->gc_lock); + } if (start_copygc && bch2_copygc_start(c, ca)) diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index fda7fd70..399a853c 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -31,6 +31,7 @@ static inline struct bucket_array *bucket_array(struct bch_dev *ca) { return rcu_dereference_check(ca->buckets, + !ca->fs || lockdep_is_held(&ca->fs->usage_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); @@ -47,7 +48,12 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, size_t b, int rw) { - bucket(ca, b)->prio[rw] = c->prio_clock[rw].hand; + bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand; +} + +static inline u16 bucket_last_io(struct 
bch_fs *c, struct bucket *g, int rw) +{ + return c->bucket_clock[rw].hand - g->io_time[rw]; } /* diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index a0256e13..28bd2c59 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -31,12 +31,12 @@ struct bucket_mark { }; struct bucket { - u16 prio[2]; - union { struct bucket_mark _mark; const struct bucket_mark mark; }; + + u16 io_time[2]; }; struct bucket_array { @@ -85,8 +85,9 @@ struct disk_reservation { }; struct copygc_heap_entry { + u8 gen; + u32 sectors; u64 offset; - struct bucket_mark mark; }; typedef HEAP(struct copygc_heap_entry) copygc_heap; diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index ab6dc665..8403bae6 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -372,6 +372,9 @@ static long bch2_ioctl_usage(struct bch_fs *c, unsigned i, j; int ret; + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; + if (copy_from_user(&arg, user_arg, sizeof(arg))) return -EFAULT; @@ -460,7 +463,7 @@ static long bch2_ioctl_read_super(struct bch_fs *c, sb = ca->disk_sb.sb; } else { - sb = c->disk_sb; + sb = c->disk_sb.sb; } if (vstruct_bytes(sb) > arg.size) { @@ -535,13 +538,22 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - /* ioctls that do require admin cap: */ switch (cmd) { case BCH_IOCTL_START: BCH_IOCTL(start, struct bch_ioctl_start); case BCH_IOCTL_STOP: return bch2_ioctl_stop(c); + case BCH_IOCTL_READ_SUPER: + BCH_IOCTL(read_super, struct bch_ioctl_read_super); + case BCH_IOCTL_DISK_GET_IDX: + BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); + } + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; + + /* ioctls that do require admin cap: */ + switch (cmd) { case BCH_IOCTL_DISK_ADD: BCH_IOCTL(disk_add, struct bch_ioctl_disk); case BCH_IOCTL_DISK_REMOVE: @@ -554,10 +566,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); case BCH_IOCTL_DATA: BCH_IOCTL(data, struct bch_ioctl_data); - case BCH_IOCTL_READ_SUPER: - BCH_IOCTL(read_super, struct bch_ioctl_read_super); - case BCH_IOCTL_DISK_GET_IDX: - BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); case BCH_IOCTL_DISK_RESIZE: BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index 56bd99fd..6d8543eb 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -569,7 +569,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c, if (!bch2_key_is_encrypted(&sb_key)) goto out; - ret = bch2_request_key(c->disk_sb, &user_key); + ret = bch2_request_key(c->disk_sb.sb, &user_key); if (ret) { bch_err(c, "error requesting encryption key: %i", ret); goto err; @@ -623,7 +623,7 @@ int bch2_disable_encryption(struct bch_fs *c) mutex_lock(&c->sb_lock); - crypt = bch2_sb_get_crypt(c->disk_sb); + crypt = bch2_sb_get_crypt(c->disk_sb.sb); if (!crypt) goto out; @@ -639,7 +639,7 @@ int bch2_disable_encryption(struct bch_fs *c) crypt->key.magic = BCH_KEY_MAGIC; crypt->key.key = key; - SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 0); + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); bch2_write_super(c); out: mutex_unlock(&c->sb_lock); @@ -657,7 +657,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) mutex_lock(&c->sb_lock); /* Do we already have an encryption key? 
*/ - if (bch2_sb_get_crypt(c->disk_sb)) + if (bch2_sb_get_crypt(c->disk_sb.sb)) goto err; ret = bch2_alloc_ciphers(c); @@ -668,7 +668,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) get_random_bytes(&key.key, sizeof(key.key)); if (keyed) { - ret = bch2_request_key(c->disk_sb, &user_key); + ret = bch2_request_key(c->disk_sb.sb, &user_key); if (ret) { bch_err(c, "error requesting encryption key: %i", ret); goto err; @@ -685,7 +685,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) if (ret) goto err; - crypt = bch2_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64)); + crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); if (!crypt) { ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ goto err; @@ -694,7 +694,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) crypt->key = key; /* write superblock */ - SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1); + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); bch2_write_super(c); err: mutex_unlock(&c->sb_lock); @@ -728,7 +728,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) goto out; } - crypt = bch2_sb_get_crypt(c->disk_sb); + crypt = bch2_sb_get_crypt(c->disk_sb.sb); if (!crypt) goto out; diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index 7862294b..2690cc4b 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -117,6 +117,7 @@ static const unsigned bch_crc_bytes[] = { [BCH_CSUM_CHACHA20_POLY1305_128] = 16, }; +/* returns true if not equal */ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) { /* diff --git a/libbcachefs/clock_types.h b/libbcachefs/clock_types.h index bfd4b303..df404b6d 100644 --- a/libbcachefs/clock_types.h +++ b/libbcachefs/clock_types.h @@ -3,7 +3,7 @@ #include "util.h" -#define NR_IO_TIMERS 8 +#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) /* * Clocks/timers in units of sectors of IO: diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 18c94598..1af62621 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -500,7 +500,7 @@ int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) return ret; } - c->disk_sb->features[0] |= cpu_to_le64(f); + c->disk_sb.sb->features[0] |= cpu_to_le64(f); bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 00e0de16..7190990d 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -212,17 +212,20 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) return i->ret; - for_each_btree_key(&iter, i->c, i->id, i->from, - BTREE_ITER_PREFETCH, k) { - i->from = iter.pos; + bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH); + k = bch2_btree_iter_peek(&iter); + while (k.k && !(err = btree_iter_err(k))) { bch2_bkey_val_to_text(i->c, bkey_type(0, i->id), - i->buf, sizeof(i->buf), k); + i->buf, sizeof(i->buf), k); i->bytes = strlen(i->buf); BUG_ON(i->bytes >= PAGE_SIZE); i->buf[i->bytes] = '\n'; i->bytes++; + k = bch2_btree_iter_next(&iter); + i->from = iter.pos; + err = flush_buf(i); if (err) break; @@ -230,7 +233,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) break; } - err = bch2_btree_iter_unlock(&iter) ?: err; + bch2_btree_iter_unlock(&iter); return err < 0 ? 
err : i->ret; } diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 6bdece3a..df9913f8 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -79,8 +79,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .cmp_bkey = dirent_cmp_bkey, }; -static const char *bch2_dirent_invalid(const struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_dirent d; unsigned len; @@ -116,8 +115,8 @@ static const char *bch2_dirent_invalid(const struct bch_fs *c, } } -static void bch2_dirent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_dirent_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { struct bkey_s_c_dirent d; size_t n = 0; @@ -136,11 +135,6 @@ static void bch2_dirent_to_text(struct bch_fs *c, char *buf, } } -const struct bkey_ops bch2_bkey_dirent_ops = { - .key_invalid = bch2_dirent_invalid, - .val_to_text = bch2_dirent_to_text, -}; - static struct bkey_i_dirent *dirent_create_key(u8 type, const struct qstr *name, u64 dst) { diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 98405b5b..5d066af1 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -4,7 +4,14 @@ #include "str_hash.h" extern const struct bch_hash_desc bch2_dirent_hash_desc; -extern const struct bkey_ops bch2_bkey_dirent_ops; + +const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); + +#define bch2_bkey_dirent_ops (struct bkey_ops) { \ + .key_invalid = bch2_dirent_invalid, \ + .val_to_text = bch2_dirent_to_text, \ +} struct qstr; struct file; diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c new file mode 100644 index 00000000..c129a33e --- /dev/null +++ b/libbcachefs/disk_groups.c @@ -0,0 +1,462 @@ +#include "bcachefs.h" +#include "disk_groups.h" +#include "super-io.h" + +#include + +static int group_cmp(const void *_l, const void *_r) +{ + const struct bch_disk_group *l = _l; + const struct bch_disk_group *r = _r; + + return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - + (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: + ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - + (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: + strncmp(l->label, r->label, sizeof(l->label)); +} + +const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_disk_groups *groups = + field_to_type(f, disk_groups); + struct bch_disk_group *g, *sorted = NULL; + struct bch_sb_field_members *mi; + struct bch_member *m; + unsigned i, nr_groups, len; + const char *err = NULL; + + mi = bch2_sb_get_members(sb); + groups = bch2_sb_get_disk_groups(sb); + nr_groups = disk_groups_nr(groups); + + for (m = mi->members; + m < mi->members + sb->nr_devices; + m++) { + unsigned g; + + if (!BCH_MEMBER_GROUP(m)) + continue; + + g = BCH_MEMBER_GROUP(m) - 1; + + if (g >= nr_groups || + BCH_GROUP_DELETED(&groups->entries[g])) + return "disk has invalid group"; + } + + if (!nr_groups) + return NULL; + + for (g = groups->entries; + g < groups->entries + nr_groups; + g++) { + if (BCH_GROUP_DELETED(g)) + continue; + + len = strnlen(g->label, sizeof(g->label)); + if (!len) { + err = "group with empty label"; + goto err; + } + } + + sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); + if (!sorted) + return "cannot allocate memory"; + + memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); + sort(sorted, nr_groups, sizeof(*sorted), 
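
The validate hook here detects duplicate groups by sorting a scratch copy with group_cmp() - which orders by deleted flag, then parent, then label - and comparing adjacent entries, so any two live groups with the same parent and label end up side by side. The same shape in miniature, as a self-contained userspace demo (a hypothetical int array stands in for bch_disk_group):

	#include <stdbool.h>
	#include <stdlib.h>

	static int cmp_int(const void *l, const void *r)
	{
		int a = *(const int *) l, b = *(const int *) r;

		return (a > b) - (a < b);
	}

	static bool has_duplicates(int *v, size_t nr)
	{
		size_t i;

		qsort(v, nr, sizeof(*v), cmp_int);

		/* after sorting, any duplicates must be adjacent: */
		for (i = 0; i + 1 < nr; i++)
			if (!cmp_int(&v[i], &v[i + 1]))
				return true;
		return false;
	}
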
group_cmp, NULL); + + for (i = 0; i + 1 < nr_groups; i++) + if (!BCH_GROUP_DELETED(sorted + i) && + !group_cmp(sorted + i, sorted + i + 1)) { + err = "duplicate groups"; + goto err; + } + + err = NULL; +err: + kfree(sorted); + return err; +} + +static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + char *out = buf, *end = buf + size; + struct bch_sb_field_disk_groups *groups = + field_to_type(f, disk_groups); + struct bch_disk_group *g; + unsigned nr_groups = disk_groups_nr(groups); + + for (g = groups->entries; + g < groups->entries + nr_groups; + g++) { + if (g != groups->entries) + out += scnprintf(out, end - out, " "); + + if (BCH_GROUP_DELETED(g)) + out += scnprintf(out, end - out, "[deleted]"); + else + out += scnprintf(out, end - out, + "[parent %llu name %s]", + BCH_GROUP_PARENT(g), + g->label); + } + + return out - buf; +} + +const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { + .validate = bch2_sb_disk_groups_validate, + .to_text = bch2_sb_disk_groups_to_text +}; + +int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) +{ + struct bch_sb_field_members *mi; + struct bch_sb_field_disk_groups *groups; + struct bch_disk_groups_cpu *cpu_g, *old_g; + unsigned i, g, nr_groups; + + lockdep_assert_held(&c->sb_lock); + + mi = bch2_sb_get_members(c->disk_sb.sb); + groups = bch2_sb_get_disk_groups(c->disk_sb.sb); + nr_groups = disk_groups_nr(groups); + + if (!groups) + return 0; + + cpu_g = kzalloc(sizeof(*cpu_g) + + sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); + if (!cpu_g) + return -ENOMEM; + + cpu_g->nr = nr_groups; + + for (i = 0; i < nr_groups; i++) { + struct bch_disk_group *src = &groups->entries[i]; + struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; + + dst->deleted = BCH_GROUP_DELETED(src); + dst->parent = BCH_GROUP_PARENT(src); + } + + for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + struct bch_disk_group_cpu *dst = + &cpu_g->entries[BCH_MEMBER_GROUP(m)]; + + if (!bch2_member_exists(m)) + continue; + + g = BCH_MEMBER_GROUP(m); + while (g) { + dst = &cpu_g->entries[g - 1]; + __set_bit(i, dst->devs.d); + g = dst->parent; + } + } + + old_g = c->disk_groups; + rcu_assign_pointer(c->disk_groups, cpu_g); + if (old_g) + kfree_rcu(old_g, rcu); + + return 0; +} + +const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) +{ + struct target t = target_decode(target); + + switch (t.type) { + case TARGET_DEV: { + struct bch_dev *ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + return ca ? &ca->self : NULL; + } + case TARGET_GROUP: { + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + + return t.group < g->nr && !g->entries[t.group].deleted + ? 
&g->entries[t.group].devs + : NULL; + } + default: + BUG(); + } +} + +static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, + unsigned parent, + const char *name, unsigned namelen) +{ + unsigned i, nr_groups = disk_groups_nr(groups); + + if (!namelen || namelen > BCH_SB_LABEL_SIZE) + return -EINVAL; + + for (i = 0; i < nr_groups; i++) { + struct bch_disk_group *g = groups->entries + i; + + if (BCH_GROUP_DELETED(g)) + continue; + + if (!BCH_GROUP_DELETED(g) && + BCH_GROUP_PARENT(g) == parent && + strnlen(g->label, sizeof(g->label)) == namelen && + !memcmp(name, g->label, namelen)) + return i; + } + + return -1; +} + +static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, + const char *name, unsigned namelen) +{ + struct bch_sb_field_disk_groups *groups = + bch2_sb_get_disk_groups(sb->sb); + unsigned i, nr_groups = disk_groups_nr(groups); + struct bch_disk_group *g; + + if (!namelen || namelen > BCH_SB_LABEL_SIZE) + return -EINVAL; + + for (i = 0; + i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); + i++) + ; + + if (i == nr_groups) { + unsigned u64s = + (sizeof(struct bch_sb_field_disk_groups) + + sizeof(struct bch_disk_group) * (nr_groups + 1)) / + sizeof(u64); + + groups = bch2_sb_resize_disk_groups(sb, u64s); + if (!groups) + return -ENOSPC; + + nr_groups = disk_groups_nr(groups); + } + + BUG_ON(i >= nr_groups); + + g = &groups->entries[i]; + + memcpy(g->label, name, namelen); + if (namelen < sizeof(g->label)) + g->label[namelen] = '\0'; + SET_BCH_GROUP_DELETED(g, 0); + SET_BCH_GROUP_PARENT(g, parent); + SET_BCH_GROUP_DATA_ALLOWED(g, ~0); + + return i; +} + +int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) +{ + struct bch_sb_field_disk_groups *groups = + bch2_sb_get_disk_groups(sb->sb); + int v = -1; + + do { + const char *next = strchrnul(name, '.'); + unsigned len = next - name; + + if (*next == '.') + next++; + + v = __bch2_disk_group_find(groups, v + 1, name, len); + name = next; + } while (*name && v >= 0); + + return v; +} + +int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) +{ + struct bch_sb_field_disk_groups *groups; + unsigned parent = 0; + int v = -1; + + do { + const char *next = strchrnul(name, '.'); + unsigned len = next - name; + + if (*next == '.') + next++; + + groups = bch2_sb_get_disk_groups(sb->sb); + + v = __bch2_disk_group_find(groups, parent, name, len); + if (v < 0) + v = __bch2_disk_group_add(sb, parent, name, len); + if (v < 0) + return v; + + parent = v + 1; + name = next; + } while (*name && v >= 0); + + return v; +} + +int bch2_disk_path_print(struct bch_sb_handle *sb, + char *buf, size_t len, unsigned v) +{ + char *out = buf, *end = out + len; + struct bch_sb_field_disk_groups *groups = + bch2_sb_get_disk_groups(sb->sb); + struct bch_disk_group *g; + unsigned nr = 0; + u16 path[32]; + + while (1) { + if (nr == ARRAY_SIZE(path)) + goto inval; + + if (v >= disk_groups_nr(groups)) + goto inval; + + g = groups->entries + v; + + if (BCH_GROUP_DELETED(g)) + goto inval; + + path[nr++] = v; + + if (!BCH_GROUP_PARENT(g)) + break; + + v = BCH_GROUP_PARENT(g) - 1; + } + + while (nr) { + unsigned b = 0; + + v = path[--nr]; + g = groups->entries + v; + + if (end != out) + b = min_t(size_t, end - out, + strnlen(g->label, sizeof(g->label))); + memcpy(out, g->label, b); + if (b < end - out) + out[b] = '\0'; + out += b; + + if (nr) + out += scnprintf(out, end - out, "."); + } + + return out - buf; +inval: + return scnprintf(buf, len, "invalid group %u", v); +} + +int 
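
The three path helpers above treat '.' as a hierarchy separator: each component is looked up under the parent resolved so far (parent indices are stored as index + 1, with 0 meaning "no parent"), and the find_or_create variant adds any missing component as it goes. A usage sketch - ensure_group and the "ssd.fast" label are hypothetical, and callers such as bch2_dev_group_set() below take c->sb_lock around these calls:

	static int ensure_group(struct bch_sb_handle *sb)
	{
		/* resolves "ssd", then "fast" nested under it, creating both if needed: */
		int v = bch2_disk_path_find_or_create(sb, "ssd.fast");

		if (v < 0)
			return v;	/* -EINVAL (bad name) or -ENOSPC from the helpers above */

		return v;		/* index of "fast" in the disk_groups section */
	}
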
bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +{ + struct bch_member *mi; + int v = -1; + + mutex_lock(&c->sb_lock); + + if (!strlen(name) || !strcmp(name, "none")) + goto write_sb; + + v = bch2_disk_path_find_or_create(&c->disk_sb, name); + if (v < 0) { + mutex_unlock(&c->sb_lock); + return v; + } + +write_sb: + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + SET_BCH_MEMBER_GROUP(mi, v + 1); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) +{ + struct bch_dev *ca; + int g; + + if (!strlen(buf) || !strcmp(buf, "none")) { + *v = 0; + return 0; + } + + /* Is it a device? */ + ca = bch2_dev_lookup(c, buf); + if (!IS_ERR(ca)) { + *v = dev_to_target(ca->dev_idx); + percpu_ref_put(&ca->ref); + return 0; + } + + mutex_lock(&c->sb_lock); + g = bch2_disk_path_find(&c->disk_sb, buf); + mutex_unlock(&c->sb_lock); + + if (g >= 0) { + *v = group_to_target(g); + return 0; + } + + return -EINVAL; +} + +int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v) +{ + struct target t = target_decode(v); + int ret; + + switch (t.type) { + case TARGET_NULL: + return scnprintf(buf, len, "none"); + case TARGET_DEV: { + struct bch_dev *ca; + + rcu_read_lock(); + ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + + if (ca && percpu_ref_tryget(&ca->io_ref)) { + char b[BDEVNAME_SIZE]; + + ret = scnprintf(buf, len, "/dev/%s", + bdevname(ca->disk_sb.bdev, b)); + percpu_ref_put(&ca->io_ref); + } else if (ca) { + ret = scnprintf(buf, len, "offline device %u", t.dev); + } else { + ret = scnprintf(buf, len, "invalid device %u", t.dev); + } + + rcu_read_unlock(); + break; + } + case TARGET_GROUP: + mutex_lock(&c->sb_lock); + ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group); + mutex_unlock(&c->sb_lock); + break; + default: + BUG(); + } + + return ret; +} diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h new file mode 100644 index 00000000..9da9805a --- /dev/null +++ b/libbcachefs/disk_groups.h @@ -0,0 +1,99 @@ +#ifndef _BCACHEFS_DISK_GROUPS_H +#define _BCACHEFS_DISK_GROUPS_H + +extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; + +static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) +{ + return groups + ? 
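
For reference, the u16 target encoding defined just below packs three cases into one integer; a couple of worked values:

	/*
	 * target 0   -> TARGET_NULL
	 * target 5   -> TARGET_DEV,   dev   = 5   - TARGET_DEV_START   = 4
	 * target 300 -> TARGET_GROUP, group = 300 - TARGET_GROUP_START = 43
	 *
	 * (target_decode() assigns .group even in the TARGET_DEV case; that is
	 * harmless because .dev and .group share the unnamed union.)
	 */
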
(vstruct_end(&groups->field) - + (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) + : 0; +} + +struct target { + enum { + TARGET_NULL, + TARGET_DEV, + TARGET_GROUP, + } type; + union { + unsigned dev; + unsigned group; + }; +}; + +#define TARGET_DEV_START 1 +#define TARGET_GROUP_START (256 + TARGET_DEV_START) + +static inline u16 dev_to_target(unsigned dev) +{ + return TARGET_DEV_START + dev; +} + +static inline u16 group_to_target(unsigned group) +{ + return TARGET_GROUP_START + group; +} + +static inline struct target target_decode(unsigned target) +{ + if (target >= TARGET_GROUP_START) + return (struct target) { + .type = TARGET_GROUP, + .group = target - TARGET_GROUP_START + }; + + if (target >= TARGET_DEV_START) + return (struct target) { + .type = TARGET_DEV, + .group = target - TARGET_DEV_START + }; + + return (struct target) { .type = TARGET_NULL }; +} + +static inline bool dev_in_target(struct bch_dev *ca, unsigned target) +{ + struct target t = target_decode(target); + + switch (t.type) { + case TARGET_NULL: + return false; + case TARGET_DEV: + return ca->dev_idx == t.dev; + case TARGET_GROUP: + return ca->mi.group && ca->mi.group - 1 == t.group; + default: + BUG(); + } +} + +static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target) +{ + bool ret; + + rcu_read_lock(); + ret = dev_in_target(rcu_dereference(c->devs[dev]), target); + rcu_read_unlock(); + + return ret; +} + +const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); + +int bch2_disk_path_find(struct bch_sb_handle *, const char *); +int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); +int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned); + +int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); +int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64); + +int bch2_sb_disk_groups_to_cpu(struct bch_fs *); + +int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); + +const char *bch2_sb_validate_disk_groups(struct bch_sb *, + struct bch_sb_field *); + +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index f73e7562..c5d1e7cb 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -14,10 +14,12 @@ #include "checksum.h" #include "debug.h" #include "dirent.h" +#include "disk_groups.h" #include "error.h" #include "extents.h" #include "inode.h" #include "journal.h" +#include "replicas.h" #include "super.h" #include "super-io.h" #include "util.h" @@ -25,9 +27,6 @@ #include -static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *, - struct bkey_i *, struct bkey_i *); - static void sort_key_next(struct btree_node_iter_large *iter, struct btree *b, struct btree_node_iter_set *i) @@ -160,9 +159,13 @@ bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned targ { const struct bch_extent_ptr *ptr; - extent_for_each_ptr(e, ptr) - if (dev_in_target(c->devs[ptr->dev], target)) + extent_for_each_ptr(e, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (dev_in_target(ca, target) && + (!ptr->cached || !ptr_stale(ca, ptr))) return ptr; + } return NULL; } @@ -356,11 +359,25 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, return true; } +/* returns true if not equal */ +static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, + struct bch_extent_crc_unpacked r) +{ + return (l.csum_type != r.csum_type || + l.compression_type != r.compression_type || + l.compressed_size 
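
Note why this helper (and the matching change in bch2_extent_crc_append() further down) compares fields individually instead of the old memcmp(): memcmp also compares any padding bytes in struct bch_extent_crc_unpacked, which need not be initialized, so two logically equal values could miscompare - and prev_u now gets a zero initializer, presumably to keep the first-pass comparison well defined. A self-contained illustration with a hypothetical struct:

	#include <stdio.h>
	#include <string.h>

	struct padded {
		char	a;	/* compilers typically insert padding after this */
		int	b;
	};

	int main(void)
	{
		struct padded x, y;

		memset(&x, 0x00, sizeof(x));
		memset(&y, 0xff, sizeof(y));

		x.a = y.a = 1;
		x.b = y.b = 2;

		printf("fields equal: %d\n", x.a == y.a && x.b == y.b);	/* 1 */
		printf("memcmp equal: %d\n", !memcmp(&x, &y, sizeof(x)));	/* usually 0 */
		return 0;
	}
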
!= r.compressed_size || + l.uncompressed_size != r.uncompressed_size || + l.offset != r.offset || + l.live_size != r.live_size || + l.nonce != r.nonce || + bch2_crc_cmp(l.csum, r.csum)); +} + void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) { union bch_extent_entry *entry = e.v->start; union bch_extent_crc *crc, *prev = NULL; - struct bch_extent_crc_unpacked u, prev_u; + struct bch_extent_crc_unpacked u, prev_u = { 0 }; while (entry != extent_entry_last(e)) { union bch_extent_entry *next = extent_entry_next(entry); @@ -382,7 +399,7 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) goto drop; } - if (prev && !memcmp(&u, &prev_u, sizeof(u))) { + if (prev && !bch2_crc_unpacked_cmp(u, prev_u)) { /* identical to previous crc entry: */ goto drop; } @@ -439,13 +456,12 @@ static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) bch2_extent_drop_redundant_crcs(e); } -static bool bch2_ptr_normalize(struct bch_fs *c, struct btree *bk, - struct bkey_s k) +bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k) { return bch2_extent_normalize(c, k); } -static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) +void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) { switch (k->type) { case BCH_EXTENT: @@ -628,8 +644,7 @@ static void extent_pick_read_device(struct bch_fs *c, /* Btree ptrs */ -static const char *bch2_btree_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) { if (bkey_extent_is_cached(k.k)) return "cached"; @@ -671,8 +686,8 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c, } } -static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, + struct bkey_s_c k) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; @@ -727,8 +742,8 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, mark.gen, (unsigned) mark.counter); } -static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { char *out = buf, *end = buf + size; const char *invalid; @@ -756,13 +771,6 @@ bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, return pick; } -const struct bkey_ops bch2_bkey_btree_ops = { - .key_invalid = bch2_btree_ptr_invalid, - .key_debugcheck = btree_ptr_debugcheck, - .val_to_text = bch2_btree_ptr_to_text, - .swab = bch2_ptr_swab, -}; - /* Extents */ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) @@ -1436,7 +1444,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, } static enum btree_insert_ret -bch2_delete_fixup_extent(struct extent_insert_state *s) +__bch2_delete_fixup_extent(struct extent_insert_state *s) { struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; @@ -1450,8 +1458,7 @@ bch2_delete_fixup_extent(struct extent_insert_state *s) EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - s->whiteout = *insert; - s->do_journal = false; + s->whiteout = *insert; while (bkey_cmp(s->committed, insert->k.p) < 0 && (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && @@ -1474,12 +1481,12 @@ bch2_delete_fixup_extent(struct extent_insert_state *s) overlap = bch2_extent_overlap(&insert->k, k.k); ret = extent_insert_check_split_compressed(s, k.s_c, overlap); - 
if (ret != BTREE_INSERT_OK) - goto stop; + if (ret) + break; ret = extent_insert_advance_pos(s, k.s_c); if (ret) - goto stop; + break; s->do_journal = true; @@ -1520,25 +1527,65 @@ bch2_delete_fixup_extent(struct extent_insert_state *s) bch2_btree_iter_set_pos_same_leaf(iter, s->committed); } - if (ret == BTREE_INSERT_OK && - bkey_cmp(s->committed, insert->k.p) < 0) - ret = extent_insert_advance_pos(s, bkey_s_c_null); -stop: - extent_insert_committed(s); + return ret; +} - bch2_fs_usage_apply(c, &s->stats, s->trans->disk_res, - gc_pos_btree_node(b)); +static enum btree_insert_ret +__bch2_insert_fixup_extent(struct extent_insert_state *s) +{ + struct btree_iter *iter = s->insert->iter; + struct btree_iter_level *l = &iter->l[0]; + struct btree *b = l->b; + struct btree_node_iter *node_iter = &l->iter; + struct bkey_packed *_k; + struct bkey unpacked; + struct bkey_i *insert = s->insert->k; + enum btree_insert_ret ret = BTREE_INSERT_OK; - EBUG_ON(bkey_cmp(iter->pos, s->committed)); - EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != - !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF)); + while (bkey_cmp(s->committed, insert->k.p) < 0 && + (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && + (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { + struct bset_tree *t = bch2_bkey_to_bset(b, _k); + struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); + enum bch_extent_overlap overlap; - bch2_cut_front(iter->pos, insert); + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); + EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); - if (insert->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF)) - ret = BTREE_INSERT_NEED_TRAVERSE; + if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + break; + + overlap = bch2_extent_overlap(&insert->k, k.k); + + ret = extent_insert_check_split_compressed(s, k.s_c, overlap); + if (ret) + break; + + if (!k.k->size) + goto squash; + + /* + * Only call advance pos & call hook for nonzero size extents: + */ + ret = extent_insert_advance_pos(s, k.s_c); + if (ret) + break; - EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK); + if (k.k->size && + (k.k->needs_whiteout || bset_written(b, bset(b, t)))) + insert->k.needs_whiteout = true; + + if (overlap == BCH_EXTENT_OVERLAP_ALL && + bkey_whiteout(k.k) && + k.k->needs_whiteout) { + unreserve_whiteout(b, t, _k); + _k->needs_whiteout = false; + } +squash: + ret = extent_squash(s, insert, t, _k, k, overlap); + if (ret != BTREE_INSERT_OK) + break; + } return ret; } @@ -1590,9 +1637,6 @@ bch2_insert_fixup_extent(struct btree_insert *trans, struct btree_iter *iter = insert->iter; struct btree_iter_level *l = &iter->l[0]; struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - struct bkey_packed *_k; - struct bkey unpacked; enum btree_insert_ret ret = BTREE_INSERT_OK; struct extent_insert_state s = { @@ -1605,9 +1649,6 @@ bch2_insert_fixup_extent(struct btree_insert *trans, EBUG_ON(iter->level); EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size); - if (s.deleting) - return bch2_delete_fixup_extent(&s); - /* * As we process overlapping extents, we advance @iter->pos both to * signal to our caller (btree_insert_key()) how much of @insert->k has @@ -1616,67 +1657,32 @@ bch2_insert_fixup_extent(struct btree_insert *trans, */ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + if (!s.deleting && + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) bch2_add_sectors(&s, bkey_i_to_s_c(insert->k), bkey_start_offset(&insert->k->k), insert->k->k.size); - 
while (bkey_cmp(s.committed, insert->k->k.p) < 0 && - (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK && - (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { - struct bset_tree *t = bch2_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0) - break; - - overlap = bch2_extent_overlap(&insert->k->k, k.k); - - ret = extent_insert_check_split_compressed(&s, k.s_c, overlap); - if (ret != BTREE_INSERT_OK) - goto stop; - - if (!k.k->size) - goto squash; - - /* - * Only call advance pos & call hook for nonzero size extents: - */ - ret = extent_insert_advance_pos(&s, k.s_c); - if (ret != BTREE_INSERT_OK) - goto stop; - - if (k.k->size && - (k.k->needs_whiteout || bset_written(b, bset(b, t)))) - insert->k->k.needs_whiteout = true; - - if (overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(b, t, _k); - _k->needs_whiteout = false; - } -squash: - ret = extent_squash(&s, insert->k, t, _k, k, overlap); - if (ret != BTREE_INSERT_OK) - goto stop; - } + ret = !s.deleting + ? __bch2_insert_fixup_extent(&s) + : __bch2_delete_fixup_extent(&s); if (ret == BTREE_INSERT_OK && bkey_cmp(s.committed, insert->k->k.p) < 0) ret = extent_insert_advance_pos(&s, bkey_s_c_null); -stop: + extent_insert_committed(&s); + + if (s.deleting) + bch2_cut_front(iter->pos, insert->k); + /* * Subtract any remaining sectors from @insert, if we bailed out early * and didn't fully insert @insert: */ - if (insert->k->k.size && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + if (!s.deleting && + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && + insert->k->k.size) bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k), bkey_start_offset(&insert->k->k), insert->k->k.size); @@ -1692,13 +1698,13 @@ bch2_insert_fixup_extent(struct btree_insert *trans, if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF)) ret = BTREE_INSERT_NEED_TRAVERSE; - EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK); + WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0), + "ret %u insert->k.size %u", ret, insert->k->k.size); return ret; } -static const char *bch2_extent_invalid(const struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) { if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) return "value too big"; @@ -1865,8 +1871,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, return; } -static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, - struct bkey_s_c k) +void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { switch (k.k->type) { case BCH_EXTENT: @@ -1880,8 +1885,8 @@ static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, } } -static void bch2_extent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_extent_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { char *out = buf, *end = buf + size; const char *invalid; @@ -1963,7 +1968,7 @@ void bch2_extent_crc_append(struct bkey_i_extent *e, extent_for_each_crc(extent_i_to_s(e), crc, i) ; - if (!memcmp(&crc, &new, sizeof(crc))) + if (!bch2_crc_unpacked_cmp(crc, new)) return; bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); @@ -2089,9 +2094,8 @@ void 
bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, } } -static enum merge_result bch2_extent_merge(struct bch_fs *c, - struct btree *bk, - struct bkey_i *l, struct bkey_i *r) +enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b, + struct bkey_i *l, struct bkey_i *r) { struct bkey_s_extent el, er; union bch_extent_entry *en_l, *en_r; @@ -2410,13 +2414,3 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) return ret; } - -const struct bkey_ops bch2_bkey_extent_ops = { - .key_invalid = bch2_extent_invalid, - .key_debugcheck = bch2_extent_debugcheck, - .val_to_text = bch2_extent_to_text, - .swab = bch2_ptr_swab, - .key_normalize = bch2_ptr_normalize, - .key_merge = bch2_extent_merge, - .is_extents = true, -}; diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 376e51c9..8dc15484 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -15,6 +15,36 @@ struct extent_insert_hook; struct bch_devs_mask; union bch_extent_crc; +const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, + struct bkey_s_c); +void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); + +#define bch2_bkey_btree_ops (struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ + .key_debugcheck = bch2_btree_ptr_debugcheck, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ +} + +const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); +void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s); +enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *, + struct bkey_i *, struct bkey_i *); + +#define bch2_bkey_extent_ops (struct bkey_ops) { \ + .key_invalid = bch2_extent_invalid, \ + .key_debugcheck = bch2_extent_debugcheck, \ + .val_to_text = bch2_extent_to_text, \ + .swab = bch2_ptr_swab, \ + .key_normalize = bch2_ptr_normalize, \ + .key_merge = bch2_extent_merge, \ + .is_extents = true, \ +} + struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *, struct btree *, struct btree_node_iter_large *); @@ -23,9 +53,6 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct btree *, struct btree_node_iter_large *); -extern const struct bkey_ops bch2_bkey_btree_ops; -extern const struct bkey_ops bch2_bkey_extent_ops; - struct extent_pick_ptr bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, struct bch_devs_mask *avoid); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index cb90738c..d1473f2a 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -468,7 +468,10 @@ static int bchfs_write_index_update(struct bch_write_op *wop) } BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k))); - BUG_ON(!ret != !k->k.size); + + if (WARN_ONCE(!ret != !k->k.size, + "ret %i k->size %u", ret, k->k.size)) + ret = k->k.size ? 
-EINTR : 0; err: if (ret == -EINTR) continue; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 797aa2a9..3ae5ac97 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -175,8 +175,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, return 0; } -static const char *bch2_inode_invalid(const struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) { if (k.k->p.offset) return "nonzero offset"; @@ -224,8 +223,8 @@ static const char *bch2_inode_invalid(const struct bch_fs *c, } } -static void bch2_inode_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_inode_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { char *out = buf, *end = out + size; struct bkey_s_c_inode inode; @@ -247,11 +246,6 @@ static void bch2_inode_to_text(struct bch_fs *c, char *buf, } } -const struct bkey_ops bch2_bkey_inode_ops = { - .key_invalid = bch2_inode_invalid, - .val_to_text = bch2_inode_to_text, -}; - void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct bch_inode_unpacked *parent) diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 5c7aeadc..26461063 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -5,7 +5,13 @@ #include -extern const struct bkey_ops bch2_bkey_inode_ops; +const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); + +#define bch2_bkey_inode_ops (struct bkey_ops) { \ + .key_invalid = bch2_inode_invalid, \ + .val_to_text = bch2_inode_to_text, \ +} struct bch_inode_unpacked { u64 bi_inum; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 7ee9c392..27e45081 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -20,6 +20,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "replicas.h" #include "super.h" #include "super-io.h" #include "tier.h" @@ -196,8 +197,6 @@ static void bch2_write_done(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - BUG_ON(!(op->flags & BCH_WRITE_DONE)); - if (!op->error && (op->flags & BCH_WRITE_FLUSH)) op->error = bch2_journal_error(&op->c->journal); @@ -205,7 +204,6 @@ static void bch2_write_done(struct closure *cl) bch2_disk_reservation_put(op->c, &op->res); percpu_ref_put(&op->c->writes); bch2_keylist_free(&op->insert_keys, op->inline_keys); - op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED); closure_return(cl); } @@ -232,9 +230,8 @@ int bch2_write_index_default(struct bch_write_op *op) /** * bch_write_index - after a write, update index to point to new data */ -static void bch2_write_index(struct closure *cl) +static void __bch2_write_index(struct bch_write_op *op) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; struct bkey_s_extent e; @@ -242,8 +239,6 @@ static void bch2_write_index(struct closure *cl) struct bkey_i *src, *dst = keys->keys, *n, *k; int ret; - op->flags |= BCH_WRITE_LOOPED; - for (src = keys->keys; src != keys->top; src = n) { n = bkey_next(src); bkey_copy(dst, src); @@ -292,9 +287,19 @@ static void bch2_write_index(struct closure *cl) } out: bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets); + return; +err: + keys->top = keys->keys; + op->error = ret; + goto out; +} - if (!(op->flags & BCH_WRITE_DONE)) - continue_at(cl, __bch2_write, op->io_wq); +static void 
bch2_write_index(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + + __bch2_write_index(op); if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { bch2_journal_flush_seq_async(&c->journal, @@ -304,12 +309,6 @@ static void bch2_write_index(struct closure *cl) } else { continue_at_nobarrier(cl, bch2_write_done, NULL); } - return; -err: - keys->top = keys->keys; - op->error = ret; - op->flags |= BCH_WRITE_DONE; - goto out; } static void bch2_write_endio(struct bio *bio) @@ -730,18 +729,18 @@ static void __bch2_write(struct closure *cl) struct bch_fs *c = op->c; struct write_point *wp; int ret; - +again: do { /* +1 for possible cache device: */ if (op->open_buckets_nr + op->nr_replicas + 1 > ARRAY_SIZE(op->open_buckets)) - continue_at(cl, bch2_write_index, index_update_wq(op)); + goto flush_io; if (bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), BKEY_EXTENT_U64s_MAX)) - continue_at(cl, bch2_write_index, index_update_wq(op)); + goto flush_io; wp = bch2_alloc_sectors_start(c, op->target, @@ -760,33 +759,7 @@ static void __bch2_write(struct closure *cl) goto err; } - /* - * If we already have some keys, must insert them first - * before allocating another open bucket. We only hit - * this case if open_bucket_nr > 1. - */ - if (!bch2_keylist_empty(&op->insert_keys)) - continue_at(cl, bch2_write_index, - index_update_wq(op)); - - /* - * If we've looped, we're running out of a workqueue - - * not the bch2_write() caller's context - and we don't - * want to block the workqueue: - */ - if (op->flags & BCH_WRITE_LOOPED) - continue_at(cl, __bch2_write, op->io_wq); - - /* - * Otherwise, we do want to block the caller on alloc - * failure instead of letting it queue up more and more - * writes: - * XXX: this technically needs a try_to_freeze() - - * except that that's not safe because caller may have - * issued other IO... hmm.. - */ - closure_sync(cl); - continue; + goto flush_io; } ret = bch2_write_extent(op, wp); @@ -802,28 +775,24 @@ static void __bch2_write(struct closure *cl) goto err; } while (ret); - op->flags |= BCH_WRITE_DONE; continue_at(cl, bch2_write_index, index_update_wq(op)); err: - /* - * Right now we can only error here if we went RO - the - * allocation failed, but we already checked for -ENOSPC when we - * got our reservation. - * - * XXX capacity might have changed, but we don't check for that - * yet: - */ op->error = ret; - op->flags |= BCH_WRITE_DONE; - /* - * No reason not to insert keys for whatever data was successfully - * written (especially for a cmpxchg operation that's moving data - * around) - */ continue_at(cl, !bch2_keylist_empty(&op->insert_keys) ? 
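
	/*
	 * Control flow after this refactor, as far as these hunks show:
	 * __bch2_write() loops allocating and issuing extents; when it runs
	 * out of open buckets or keylist space it jumps to flush_io below,
	 * which waits for in-flight IO (closure_sync()), inserts whatever
	 * keys have accumulated via __bch2_write_index(), and retries from
	 * `again` - replacing the removed BCH_WRITE_DONE / BCH_WRITE_LOOPED
	 * flags and the old bounce through op->io_wq.
	 */
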
bch2_write_index : bch2_write_done, index_update_wq(op)); +flush_io: + closure_sync(cl); + + if (!bch2_keylist_empty(&op->insert_keys)) { + __bch2_write_index(op); + + if (op->error) + continue_at_nobarrier(cl, bch2_write_done, NULL); + } + + goto again; } /** @@ -969,7 +938,7 @@ static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e, if (percpu_ref_is_dying(&c->writes)) return false; - return bch2_extent_has_target(c, e, target); + return bch2_extent_has_target(c, e, target) == NULL; } /* Read */ diff --git a/libbcachefs/io.h b/libbcachefs/io.h index bf0b17e1..a0c795ab 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -36,8 +36,6 @@ enum bch_write_flags { /* Internal: */ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), - BCH_WRITE_DONE = (1 << 10), - BCH_WRITE_LOOPED = (1 << 11), }; static inline u64 *op_journal_seq(struct bch_write_op *op) diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index e5000767..b525a85c 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -19,6 +19,7 @@ #include "io.h" #include "keylist.h" #include "journal.h" +#include "replicas.h" #include "super-io.h" #include "vstructs.h" @@ -1582,40 +1583,19 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) return ret; } -/* - * Allocate more journal space at runtime - not currently making use if it, but - * the code works: - */ -static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, - unsigned nr) +static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + bool new_fs, struct closure *cl) { - struct journal *j = &c->journal; + struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets; - struct disk_reservation disk_res = { 0, 0 }; - struct closure cl; u64 *new_bucket_seq = NULL, *new_buckets = NULL; int ret = 0; - closure_init_stack(&cl); - /* don't handle reducing nr of buckets yet: */ if (nr <= ja->nr) return 0; - /* - * note: journal buckets aren't really counted as _sectors_ used yet, so - * we don't need the disk reservation to avoid the BUG_ON() in buckets.c - * when space used goes up without a reservation - but we do need the - * reservation to ensure we'll actually be able to allocate: - */ - - if (bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0)) - return -ENOSPC; - - mutex_lock(&c->sb_lock); - ret = -ENOMEM; new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); @@ -1627,29 +1607,41 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, if (!journal_buckets) goto err; - spin_lock(&j->lock); + if (c) + spin_lock(&c->journal.lock); + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); swap(new_buckets, ja->buckets); swap(new_bucket_seq, ja->bucket_seq); - spin_unlock(&j->lock); + + if (c) + spin_unlock(&c->journal.lock); while (ja->nr < nr) { - struct open_bucket *ob; - size_t bucket; - int ob_idx; + struct open_bucket *ob = NULL; + long bucket; - ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl); - if (ob_idx < 0) { - if (!closure_wait(&c->freelist_wait, &cl)) - closure_sync(&cl); - continue; + if (new_fs) { + bucket = bch2_bucket_alloc_new_fs(ca); + if (bucket < 0) { + ret = -ENOSPC; + goto err; + } + } else { + int ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl); + if (ob_idx < 0) { + ret = cl ? 
-EAGAIN : -ENOSPC; + goto err; + } + + ob = c->open_buckets + ob_idx; + bucket = sector_to_bucket(ca, ob->ptr.offset); } - ob = c->open_buckets + ob_idx; - bucket = sector_to_bucket(ca, ob->ptr.offset); + if (c) + spin_lock(&c->journal.lock); - spin_lock(&j->lock); __array_insert_item(ja->buckets, ja->nr, ja->last_idx); __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx); __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx); @@ -1664,34 +1656,77 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, ja->last_idx++; } ja->nr++; - spin_unlock(&j->lock); - bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), 0); + if (c) + spin_unlock(&c->journal.lock); - bch2_open_bucket_put(c, ob); + bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), + new_fs ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE + : 0); + + if (!new_fs) + bch2_open_bucket_put(c, ob); } - bch2_write_super(c); - ret = 0; err: - mutex_unlock(&c->sb_lock); - kfree(new_bucket_seq); kfree(new_buckets); - bch2_disk_reservation_put(c, &disk_res); - if (!ret) - bch2_dev_allocator_add(c, ca); + return ret; +} + +/* + * Allocate more journal space at runtime - not currently making use of it, but + * the code works: + */ +int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + unsigned nr) +{ + struct journal_device *ja = &ca->journal; + struct closure cl; + unsigned current_nr; + int ret; + + closure_init_stack(&cl); + + do { + struct disk_reservation disk_res = { 0, 0 }; + + closure_sync(&cl); + + mutex_lock(&c->sb_lock); + current_nr = ja->nr; + + /* + * note: journal buckets aren't really counted as _sectors_ used yet, so + * we don't need the disk reservation to avoid the BUG_ON() in buckets.c + * when space used goes up without a reservation - but we do need the + * reservation to ensure we'll actually be able to allocate: + */ + + if (bch2_disk_reservation_get(c, &disk_res, + bucket_to_sector(ca, nr - ja->nr), 1, 0)) { + mutex_unlock(&c->sb_lock); + return -ENOSPC; + } + + ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); + + bch2_disk_reservation_put(c, &disk_res); - closure_sync(&cl); + if (ja->nr != current_nr) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } while (ret == -EAGAIN); return ret; } -int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca) +int bch2_dev_journal_alloc(struct bch_dev *ca) { unsigned nr; @@ -1707,7 +1742,7 @@ int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca) min(1 << 10, (1 << 20) / ca->mi.bucket_size)); - return bch2_set_nr_journal_buckets(c, ca, nr); + return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); } /* Journalling */ @@ -2320,8 +2355,8 @@ static void journal_write(struct closure *cl) journal_write_compact(jset); - jset->read_clock = cpu_to_le16(c->prio_clock[READ].hand); - jset->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand); + jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); + jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); jset->magic = cpu_to_le64(jset_magic(c)); jset->version = cpu_to_le32(BCACHE_JSET_VERSION); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 46ae8f0d..cf5cc9ba 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -400,7 +400,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) ssize_t bch2_journal_print_debug(struct journal *, char *); ssize_t bch2_journal_print_pins(struct journal *, char *); 
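
As far as these hunks show, the journal-bucket allocator is now split in two: __bch2_set_nr_journal_buckets() does the work and is shared between format time and runtime. With new_fs set it uses bch2_bucket_alloc_new_fs() and skips the journal lock (hence the `if (c)` guards - ca->fs may not exist yet), while the runtime wrapper adds the disk reservation and retries on -EAGAIN, which the worker returns when the allocator needs the caller to wait on the closure. Roughly, the two entry points:

	/* format time - no bch_fs yet, no locking, no reservation: */
	bch2_dev_journal_alloc(ca);	/* -> __bch2_set_nr_journal_buckets(ca, nr, true, NULL) */

	/* runtime - reserve space, then retry until the allocator keeps up: */
	bch2_set_nr_journal_buckets(c, ca, nr);
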
-int bch2_dev_journal_alloc(struct bch_fs *, struct bch_dev *); +int bch2_dev_journal_alloc(struct bch_dev *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 1bc0e714..ea519102 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -11,6 +11,7 @@ #include "keylist.h" #include "migrate.h" #include "move.h" +#include "replicas.h" #include "super-io.h" static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e, diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 07d2e2c8..87e6e80c 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -6,6 +6,7 @@ #include "inode.h" #include "io.h" #include "move.h" +#include "replicas.h" #include "super-io.h" #include "keylist.h" diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 3b4a5292..28dabca7 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -9,6 +9,7 @@ #include "btree_update.h" #include "buckets.h" #include "clock.h" +#include "disk_groups.h" #include "extents.h" #include "eytzinger.h" #include "io.h" @@ -51,7 +52,7 @@ static inline int sectors_used_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) { - return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark); + return (l.sectors > r.sectors) - (l.sectors < r.sectors); } static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) @@ -78,7 +79,7 @@ static bool __copygc_pred(struct bch_dev *ca, return (i >= 0 && ptr->offset < h->data[i].offset + ca->mi.bucket_size && - ptr->gen == h->data[i].mark.gen); + ptr->gen == h->data[i].gen); } return false; @@ -154,8 +155,9 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) continue; e = (struct copygc_heap_entry) { - .offset = bucket_to_sector(ca, b), - .mark = m + .gen = m.gen, + .sectors = bucket_sectors_used(m), + .offset = bucket_to_sector(ca, b), }; heap_add_or_replace(h, e, -sectors_used_cmp); } @@ -163,11 +165,11 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) up_read(&c->gc_lock); for (i = h->data; i < h->data + h->used; i++) - sectors_to_move += bucket_sectors_used(i->mark); + sectors_to_move += i->sectors; while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { BUG_ON(!heap_pop(h, e, -sectors_used_cmp)); - sectors_to_move -= bucket_sectors_used(e.mark); + sectors_to_move -= e.sectors; } buckets_to_move = h->used; @@ -191,7 +193,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) size_t b = sector_to_bucket(ca, i->offset); struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - if (i->mark.gen == m.gen && bucket_sectors_used(m)) { + if (i->gen == m.gen && bucket_sectors_used(m)) { sectors_not_moved += bucket_sectors_used(m); buckets_not_moved++; } @@ -284,7 +286,8 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) if (bch2_fs_init_fault("copygc_start")) return -ENOMEM; - t = kthread_create(bch2_copygc_thread, ca, "bch_copygc"); + t = kthread_create(bch2_copygc_thread, ca, + "bch_copygc[%s]", ca->name); if (IS_ERR(t)) return PTR_ERR(t); diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 326b8ad9..8db8096e 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -2,6 +2,7 @@ #include #include "bcachefs.h" +#include "disk_groups.h" #include "opts.h" #include "super-io.h" #include "util.h" diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index d28f1333..bb03d83a 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -4,7 +4,22 @@ #include 
"quota.h" #include "super-io.h" -static const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) +static const char *bch2_sb_validate_quota(struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_quota *q = field_to_type(f, quota); + + if (vstruct_bytes(&q->field) != sizeof(*q)) + return "invalid field quota: wrong size"; + + return NULL; +} + +const struct bch_sb_field_ops bch_sb_field_ops_quota = { + .validate = bch2_sb_validate_quota, +}; + +const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_quota dq; @@ -30,8 +45,8 @@ static const char * const bch2_quota_counters[] = { "inodes", }; -static void bch2_quota_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_quota_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { char *out = buf, *end= buf + size; struct bkey_s_c_quota dq; @@ -50,11 +65,6 @@ static void bch2_quota_to_text(struct bch_fs *c, char *buf, } } -const struct bkey_ops bch2_bkey_quota_ops = { - .key_invalid = bch2_quota_invalid, - .val_to_text = bch2_quota_to_text, -}; - #ifdef CONFIG_BCACHEFS_QUOTA #include @@ -399,7 +409,7 @@ static void bch2_sb_quota_read(struct bch_fs *c) struct bch_sb_field_quota *sb_quota; unsigned i, j; - sb_quota = bch2_sb_get_quota(c->disk_sb); + sb_quota = bch2_sb_get_quota(c->disk_sb.sb); if (!sb_quota) return; @@ -476,13 +486,13 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) mutex_lock(&c->sb_lock); if (uflags & FS_QUOTA_UDQ_ENFD) - SET_BCH_SB_USRQUOTA(c->disk_sb, true); + SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); if (uflags & FS_QUOTA_GDQ_ENFD) - SET_BCH_SB_GRPQUOTA(c->disk_sb, true); + SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); if (uflags & FS_QUOTA_PDQ_ENFD) - SET_BCH_SB_PRJQUOTA(c->disk_sb, true); + SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -499,13 +509,13 @@ static int bch2_quota_disable(struct super_block *sb, unsigned uflags) mutex_lock(&c->sb_lock); if (uflags & FS_QUOTA_UDQ_ENFD) - SET_BCH_SB_USRQUOTA(c->disk_sb, false); + SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); if (uflags & FS_QUOTA_GDQ_ENFD) - SET_BCH_SB_GRPQUOTA(c->disk_sb, false); + SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); if (uflags & FS_QUOTA_PDQ_ENFD) - SET_BCH_SB_PRJQUOTA(c->disk_sb, false); + SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -616,9 +626,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type, q = &c->quotas[type]; mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_quota(c->disk_sb); + sb_quota = bch2_sb_get_quota(c->disk_sb.sb); if (!sb_quota) { - sb_quota = bch2_fs_sb_resize_quota(c, sizeof(*sb_quota) / sizeof(u64)); + sb_quota = bch2_sb_resize_quota(&c->disk_sb, + sizeof(*sb_quota) / sizeof(u64)); if (!sb_quota) return -ENOSPC; } diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h index 509b7f0e..0b24f22c 100644 --- a/libbcachefs/quota.h +++ b/libbcachefs/quota.h @@ -1,9 +1,18 @@ #ifndef _BCACHEFS_QUOTA_H #define _BCACHEFS_QUOTA_H +#include "inode.h" #include "quota_types.h" -extern const struct bkey_ops bch2_bkey_quota_ops; +extern const struct bch_sb_field_ops bch_sb_field_ops_quota; + +const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); + +#define bch2_bkey_quota_ops (struct bkey_ops) { \ + .key_invalid = bch2_quota_invalid, \ + .val_to_text = bch2_quota_to_text, \ +} enum quota_acct_mode { 
BCH_QUOTA_PREALLOC, diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c new file mode 100644 index 00000000..6c52d1d4 --- /dev/null +++ b/libbcachefs/replicas.c @@ -0,0 +1,698 @@ + +#include "bcachefs.h" +#include "replicas.h" +#include "super-io.h" + +static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, + struct bch_replicas_cpu *); + +/* Replicas tracking - in memory: */ + +#define for_each_cpu_replicas_entry(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ + _i = (void *) (_i) + (_r)->entry_size) + +static inline struct bch_replicas_cpu_entry * +cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) +{ + return (void *) r->entries + r->entry_size * i; +} + +static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) +{ + eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); +} + +static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e, + unsigned dev) +{ + return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0; +} + +static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e, + unsigned dev) +{ + e->devs[dev >> 3] |= 1 << (dev & 7); +} + +static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r) +{ + return (r->entry_size - + offsetof(struct bch_replicas_cpu_entry, devs)) * 8; +} + +int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r, + char *buf, size_t size) +{ + char *out = buf, *end = out + size; + struct bch_replicas_cpu_entry *e; + bool first = true; + unsigned i; + + for_each_cpu_replicas_entry(r, e) { + bool first_e = true; + + if (!first) + out += scnprintf(out, end - out, " "); + first = false; + + out += scnprintf(out, end - out, "%u: [", e->data_type); + + for (i = 0; i < replicas_dev_slots(r); i++) + if (replicas_test_dev(e, i)) { + if (!first_e) + out += scnprintf(out, end - out, " "); + first_e = false; + out += scnprintf(out, end - out, "%u", i); + } + out += scnprintf(out, end - out, "]"); + } + + return out - buf; +} + +static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e, + enum bch_data_type data_type, + struct bch_replicas_cpu_entry *r, + unsigned *max_dev) +{ + const struct bch_extent_ptr *ptr; + unsigned nr = 0; + + BUG_ON(!data_type || + data_type == BCH_DATA_SB || + data_type >= BCH_DATA_NR); + + memset(r, 0, sizeof(*r)); + r->data_type = data_type; + + *max_dev = 0; + + extent_for_each_ptr(e, ptr) + if (!ptr->cached) { + *max_dev = max_t(unsigned, *max_dev, ptr->dev); + replicas_set_dev(r, ptr->dev); + nr++; + } + return nr; +} + +static inline void devlist_to_replicas(struct bch_devs_list devs, + enum bch_data_type data_type, + struct bch_replicas_cpu_entry *r, + unsigned *max_dev) +{ + unsigned i; + + BUG_ON(!data_type || + data_type == BCH_DATA_SB || + data_type >= BCH_DATA_NR); + + memset(r, 0, sizeof(*r)); + r->data_type = data_type; + + *max_dev = 0; + + for (i = 0; i < devs.nr; i++) { + *max_dev = max_t(unsigned, *max_dev, devs.devs[i]); + replicas_set_dev(r, devs.devs[i]); + } +} + +static struct bch_replicas_cpu * +cpu_replicas_add_entry(struct bch_replicas_cpu *old, + struct bch_replicas_cpu_entry new_entry, + unsigned max_dev) +{ + struct bch_replicas_cpu *new; + unsigned i, nr, entry_size; + + entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + + DIV_ROUND_UP(max_dev + 1, 8); + entry_size = max(entry_size, old->entry_size); + nr = old->nr + 1; + + new = kzalloc(sizeof(struct bch_replicas_cpu) + + nr * entry_size, GFP_NOIO); + if (!new) + return NULL; + + new->nr = nr; + new->entry_size = 
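
Worth spelling out the layout used throughout this file: each in-memory replicas entry ends in a variable-length device bitmap, so entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + DIV_ROUND_UP(max_dev + 1, 8) - e.g. max_dev = 10 needs two bitmap bytes, and device 10 lands in byte 10 >> 3 = 1, bit 10 & 7 = 2. The two bit helpers above, restated as a self-contained userspace snippet:

	#include <stdbool.h>
	#include <stdint.h>

	static inline bool test_dev(const uint8_t *devs, unsigned dev)
	{
		return (devs[dev >> 3] & (1 << (dev & 7))) != 0;
	}

	static inline void set_dev(uint8_t *devs, unsigned dev)
	{
		devs[dev >> 3] |= 1 << (dev & 7);
	}
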
entry_size; + + for (i = 0; i < old->nr; i++) + memcpy(cpu_replicas_entry(new, i), + cpu_replicas_entry(old, i), + min(new->entry_size, old->entry_size)); + + memcpy(cpu_replicas_entry(new, old->nr), + &new_entry, + new->entry_size); + + bch2_cpu_replicas_sort(new); + return new; +} + +static bool replicas_has_entry(struct bch_replicas_cpu *r, + struct bch_replicas_cpu_entry search, + unsigned max_dev) +{ + return max_dev < replicas_dev_slots(r) && + eytzinger0_find(r->entries, r->nr, + r->entry_size, + memcmp, &search) < r->nr; +} + +noinline +static int bch2_mark_replicas_slowpath(struct bch_fs *c, + struct bch_replicas_cpu_entry new_entry, + unsigned max_dev) +{ + struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL; + int ret = -ENOMEM; + + mutex_lock(&c->sb_lock); + + old_gc = rcu_dereference_protected(c->replicas_gc, + lockdep_is_held(&c->sb_lock)); + if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) { + new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev); + if (!new_gc) + goto err; + } + + old_r = rcu_dereference_protected(c->replicas, + lockdep_is_held(&c->sb_lock)); + if (!replicas_has_entry(old_r, new_entry, max_dev)) { + new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev); + if (!new_r) + goto err; + + ret = bch2_cpu_replicas_to_sb_replicas(c, new_r); + if (ret) + goto err; + } + + /* allocations done, now commit: */ + + if (new_r) + bch2_write_super(c); + + /* don't update in memory replicas until changes are persistent */ + + if (new_gc) { + rcu_assign_pointer(c->replicas_gc, new_gc); + kfree_rcu(old_gc, rcu); + } + + if (new_r) { + rcu_assign_pointer(c->replicas, new_r); + kfree_rcu(old_r, rcu); + } + + mutex_unlock(&c->sb_lock); + return 0; +err: + mutex_unlock(&c->sb_lock); + if (new_gc) + kfree(new_gc); + if (new_r) + kfree(new_r); + return ret; +} + +int bch2_mark_replicas(struct bch_fs *c, + enum bch_data_type data_type, + struct bch_devs_list devs) +{ + struct bch_replicas_cpu_entry search; + struct bch_replicas_cpu *r, *gc_r; + unsigned max_dev; + bool marked; + + if (!devs.nr) + return 0; + + BUG_ON(devs.nr >= BCH_REPLICAS_MAX); + + devlist_to_replicas(devs, data_type, &search, &max_dev); + + rcu_read_lock(); + r = rcu_dereference(c->replicas); + gc_r = rcu_dereference(c->replicas_gc); + marked = replicas_has_entry(r, search, max_dev) && + (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev)); + rcu_read_unlock(); + + return likely(marked) ? 
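
	/*
	 * The slowpath above is the usual RCU replace pattern, with the
	 * ordering its comments call out: the enlarged tables are written to
	 * the superblock (bch2_write_super()) *before* rcu_assign_pointer()
	 * publishes them, so the in-memory replicas table never claims an
	 * entry that isn't persistent yet; the old tables are then freed with
	 * kfree_rcu() once readers drain, and lookups like the one below only
	 * need rcu_read_lock().
	 */
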
0 + : bch2_mark_replicas_slowpath(c, search, max_dev); +} + +int bch2_mark_bkey_replicas(struct bch_fs *c, + enum bch_data_type data_type, + struct bkey_s_c k) +{ + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; + int ret; + + for (i = 0; i < cached.nr; i++) + if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED, + bch2_dev_list_single(cached.devs[i])))) + return ret; + + return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k)); +} + +int bch2_replicas_gc_end(struct bch_fs *c, int err) +{ + struct bch_replicas_cpu *new_r, *old_r; + int ret = 0; + + lockdep_assert_held(&c->replicas_gc_lock); + + mutex_lock(&c->sb_lock); + + new_r = rcu_dereference_protected(c->replicas_gc, + lockdep_is_held(&c->sb_lock)); + + if (err) { + rcu_assign_pointer(c->replicas_gc, NULL); + kfree_rcu(new_r, rcu); + goto err; + } + + if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) { + ret = -ENOSPC; + goto err; + } + + old_r = rcu_dereference_protected(c->replicas, + lockdep_is_held(&c->sb_lock)); + + rcu_assign_pointer(c->replicas, new_r); + rcu_assign_pointer(c->replicas_gc, NULL); + kfree_rcu(old_r, rcu); + + bch2_write_super(c); +err: + mutex_unlock(&c->sb_lock); + return ret; +} + +int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) +{ + struct bch_replicas_cpu *dst, *src; + struct bch_replicas_cpu_entry *e; + + lockdep_assert_held(&c->replicas_gc_lock); + + mutex_lock(&c->sb_lock); + BUG_ON(c->replicas_gc); + + src = rcu_dereference_protected(c->replicas, + lockdep_is_held(&c->sb_lock)); + + dst = kzalloc(sizeof(struct bch_replicas_cpu) + + src->nr * src->entry_size, GFP_NOIO); + if (!dst) { + mutex_unlock(&c->sb_lock); + return -ENOMEM; + } + + dst->nr = 0; + dst->entry_size = src->entry_size; + + for_each_cpu_replicas_entry(src, e) + if (!((1 << e->data_type) & typemask)) + memcpy(cpu_replicas_entry(dst, dst->nr++), + e, dst->entry_size); + + bch2_cpu_replicas_sort(dst); + + rcu_assign_pointer(c->replicas_gc, dst); + mutex_unlock(&c->sb_lock); + + return 0; +} + +/* Replicas tracking - superblock: */ + +static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r, + unsigned *nr, + unsigned *bytes, + unsigned *max_dev) +{ + struct bch_replicas_entry *i; + unsigned j; + + *nr = 0; + *bytes = sizeof(*r); + *max_dev = 0; + + if (!r) + return; + + for_each_replicas_entry(r, i) { + for (j = 0; j < i->nr; j++) + *max_dev = max_t(unsigned, *max_dev, i->devs[j]); + (*nr)++; + } + + *bytes = (void *) i - (void *) r; +} + +static struct bch_replicas_cpu * +__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) +{ + struct bch_replicas_cpu *cpu_r; + unsigned i, nr, bytes, max_dev, entry_size; + + bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev); + + entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + + DIV_ROUND_UP(max_dev + 1, 8); + + cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + + nr * entry_size, GFP_NOIO); + if (!cpu_r) + return NULL; + + cpu_r->nr = nr; + cpu_r->entry_size = entry_size; + + if (nr) { + struct bch_replicas_cpu_entry *dst = + cpu_replicas_entry(cpu_r, 0); + struct bch_replicas_entry *src = sb_r->entries; + + while (dst < cpu_replicas_entry(cpu_r, nr)) { + dst->data_type = src->data_type; + for (i = 0; i < src->nr; i++) + replicas_set_dev(dst, src->devs[i]); + + src = replicas_entry_next(src); + dst = (void *) dst + entry_size; + } + } + + bch2_cpu_replicas_sort(cpu_r); + return cpu_r; +} + +int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) +{ + struct bch_sb_field_replicas *sb_r; + struct bch_replicas_cpu 
*cpu_r, *old_r; + + sb_r = bch2_sb_get_replicas(c->disk_sb.sb); + cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); + if (!cpu_r) + return -ENOMEM; + + old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock)); + rcu_assign_pointer(c->replicas, cpu_r); + if (old_r) + kfree_rcu(old_r, rcu); + + return 0; +} + +static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, + struct bch_replicas_cpu *r) +{ + struct bch_sb_field_replicas *sb_r; + struct bch_replicas_entry *sb_e; + struct bch_replicas_cpu_entry *e; + size_t i, bytes; + + bytes = sizeof(struct bch_sb_field_replicas); + + for_each_cpu_replicas_entry(r, e) { + bytes += sizeof(struct bch_replicas_entry); + for (i = 0; i < r->entry_size - 1; i++) + bytes += hweight8(e->devs[i]); + } + + sb_r = bch2_sb_resize_replicas(&c->disk_sb, + DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64))); + if (!sb_r) + return -ENOSPC; + + memset(&sb_r->entries, 0, + vstruct_end(&sb_r->field) - + (void *) &sb_r->entries); + + sb_e = sb_r->entries; + for_each_cpu_replicas_entry(r, e) { + sb_e->data_type = e->data_type; + + for (i = 0; i < replicas_dev_slots(r); i++) + if (replicas_test_dev(e, i)) + sb_e->devs[sb_e->nr++] = i; + + sb_e = replicas_entry_next(sb_e); + + BUG_ON((void *) sb_e > vstruct_end(&sb_r->field)); + } + + return 0; +} + +static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) +{ + struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + struct bch_replicas_cpu *cpu_r = NULL; + struct bch_replicas_entry *e; + const char *err; + unsigned i; + + for_each_replicas_entry(sb_r, e) { + err = "invalid replicas entry: invalid data type"; + if (e->data_type >= BCH_DATA_NR) + goto err; + + err = "invalid replicas entry: no devices"; + if (!e->nr) + goto err; + + err = "invalid replicas entry: too many devices"; + if (e->nr >= BCH_REPLICAS_MAX) + goto err; + + err = "invalid replicas entry: invalid device"; + for (i = 0; i < e->nr; i++) + if (!bch2_dev_exists(sb, mi, e->devs[i])) + goto err; + } + + err = "cannot allocate memory"; + cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); + if (!cpu_r) + goto err; + + sort_cmp_size(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + memcmp, NULL); + + for (i = 0; i + 1 < cpu_r->nr; i++) { + struct bch_replicas_cpu_entry *l = + cpu_replicas_entry(cpu_r, i); + struct bch_replicas_cpu_entry *r = + cpu_replicas_entry(cpu_r, i + 1); + + BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); + + err = "duplicate replicas entry"; + if (!memcmp(l, r, cpu_r->entry_size)) + goto err; + } + + err = NULL; +err: + kfree(cpu_r); + return err; +} + +const struct bch_sb_field_ops bch_sb_field_ops_replicas = { + .validate = bch2_sb_validate_replicas, +}; + +int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size) +{ + char *out = buf, *end = out + size; + struct bch_replicas_entry *e; + bool first = true; + unsigned i; + + if (!r) { + out += scnprintf(out, end - out, "(no replicas section found)"); + return out - buf; + } + + for_each_replicas_entry(r, e) { + if (!first) + out += scnprintf(out, end - out, " "); + first = false; + + out += scnprintf(out, end - out, "%u: [", e->data_type); + + for (i = 0; i < e->nr; i++) + out += scnprintf(out, end - out, + i ? 
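The conversion above depends on the variable-length on-disk encoding: each superblock entry is a data type, a device count, and nr one-byte device indices packed back to back, which is why the size estimate adds one byte per set bitmap bit (hweight8) on top of a fixed header per entry. Roughly the layout assumed here (the authoritative definition lives in bcachefs_format.h):

struct bch_replicas_entry {     /* illustration of the assumed layout */
        __u8    data_type;
        __u8    nr;
        __u8    devs[];         /* nr device indices, packed */
};

/* hence the stride used by replicas_entry_next() in replicas.h:  */
/* (void *) e + offsetof(struct bch_replicas_entry, devs) + e->nr */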
" %u" : "%u", e->devs[i]); + out += scnprintf(out, end - out, "]"); + } + + return out - buf; +} + +/* Query replicas: */ + +bool bch2_replicas_marked(struct bch_fs *c, + enum bch_data_type data_type, + struct bch_devs_list devs) +{ + struct bch_replicas_cpu_entry search; + unsigned max_dev; + bool ret; + + if (!devs.nr) + return true; + + devlist_to_replicas(devs, data_type, &search, &max_dev); + + rcu_read_lock(); + ret = replicas_has_entry(rcu_dereference(c->replicas), + search, max_dev); + rcu_read_unlock(); + + return ret; +} + +bool bch2_bkey_replicas_marked(struct bch_fs *c, + enum bch_data_type data_type, + struct bkey_s_c k) +{ + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; + + for (i = 0; i < cached.nr; i++) + if (!bch2_replicas_marked(c, BCH_DATA_CACHED, + bch2_dev_list_single(cached.devs[i]))) + return false; + + return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k)); +} + +struct replicas_status __bch2_replicas_status(struct bch_fs *c, + struct bch_devs_mask online_devs) +{ + struct bch_sb_field_members *mi; + struct bch_replicas_cpu_entry *e; + struct bch_replicas_cpu *r; + unsigned i, dev, dev_slots, nr_online, nr_offline; + struct replicas_status ret; + + memset(&ret, 0, sizeof(ret)); + + for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) + ret.replicas[i].nr_online = UINT_MAX; + + mi = bch2_sb_get_members(c->disk_sb.sb); + rcu_read_lock(); + + r = rcu_dereference(c->replicas); + dev_slots = replicas_dev_slots(r); + + for_each_cpu_replicas_entry(r, e) { + if (e->data_type >= ARRAY_SIZE(ret.replicas)) + panic("e %p data_type %u\n", e, e->data_type); + + nr_online = nr_offline = 0; + + for (dev = 0; dev < dev_slots; dev++) { + if (!replicas_test_dev(e, dev)) + continue; + + BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev)); + + if (test_bit(dev, online_devs.d)) + nr_online++; + else + nr_offline++; + } + + ret.replicas[e->data_type].nr_online = + min(ret.replicas[e->data_type].nr_online, + nr_online); + + ret.replicas[e->data_type].nr_offline = + max(ret.replicas[e->data_type].nr_offline, + nr_offline); + } + + rcu_read_unlock(); + + return ret; +} + +struct replicas_status bch2_replicas_status(struct bch_fs *c) +{ + return __bch2_replicas_status(c, bch2_online_devs(c)); +} + +static bool have_enough_devs(struct replicas_status s, + enum bch_data_type type, + bool force_if_degraded, + bool force_if_lost) +{ + return (!s.replicas[type].nr_offline || force_if_degraded) && + (s.replicas[type].nr_online || force_if_lost); +} + +bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) +{ + return (have_enough_devs(s, BCH_DATA_JOURNAL, + flags & BCH_FORCE_IF_METADATA_DEGRADED, + flags & BCH_FORCE_IF_METADATA_LOST) && + have_enough_devs(s, BCH_DATA_BTREE, + flags & BCH_FORCE_IF_METADATA_DEGRADED, + flags & BCH_FORCE_IF_METADATA_LOST) && + have_enough_devs(s, BCH_DATA_USER, + flags & BCH_FORCE_IF_DATA_DEGRADED, + flags & BCH_FORCE_IF_DATA_LOST)); +} + +unsigned bch2_replicas_online(struct bch_fs *c, bool meta) +{ + struct replicas_status s = bch2_replicas_status(c); + + return meta + ? 
min(s.replicas[BCH_DATA_JOURNAL].nr_online, + s.replicas[BCH_DATA_BTREE].nr_online) + : s.replicas[BCH_DATA_USER].nr_online; +} + +unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +{ + struct bch_replicas_cpu_entry *e; + struct bch_replicas_cpu *r; + unsigned ret = 0; + + rcu_read_lock(); + r = rcu_dereference(c->replicas); + + if (ca->dev_idx >= replicas_dev_slots(r)) + goto out; + + for_each_cpu_replicas_entry(r, e) + if (replicas_test_dev(e, ca->dev_idx)) + ret |= 1 << e->data_type; +out: + rcu_read_unlock(); + + return ret; +} diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h new file mode 100644 index 00000000..49f114b0 --- /dev/null +++ b/libbcachefs/replicas.h @@ -0,0 +1,51 @@ +#ifndef _BCACHEFS_REPLICAS_H +#define _BCACHEFS_REPLICAS_H + +bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type, + struct bch_devs_list); +bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type, + struct bkey_s_c); +int bch2_mark_replicas(struct bch_fs *, enum bch_data_type, + struct bch_devs_list); +int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type, + struct bkey_s_c); + +int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t); +int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t); + +struct replicas_status { + struct { + unsigned nr_online; + unsigned nr_offline; + } replicas[BCH_DATA_NR]; +}; + +struct replicas_status __bch2_replicas_status(struct bch_fs *, + struct bch_devs_mask); +struct replicas_status bch2_replicas_status(struct bch_fs *); +bool bch2_have_enough_devs(struct replicas_status, unsigned); + +unsigned bch2_replicas_online(struct bch_fs *, bool); +unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); + +int bch2_replicas_gc_end(struct bch_fs *, int); +int bch2_replicas_gc_start(struct bch_fs *, unsigned); + +/* iterate over superblock replicas - used by userspace tools: */ + +static inline struct bch_replicas_entry * +replicas_entry_next(struct bch_replicas_entry *i) +{ + return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr; +} + +#define for_each_replicas_entry(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ + (_i) = replicas_entry_next(_i)) + +int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); + +extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; + +#endif /* _BCACHEFS_REPLICAS_H */ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 69101f3a..a2b981a3 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -1,8 +1,11 @@ #include "bcachefs.h" #include "checksum.h" +#include "disk_groups.h" #include "error.h" #include "io.h" +#include "replicas.h" +#include "quota.h" #include "super-io.h" #include "super.h" #include "vstructs.h" @@ -10,13 +13,6 @@ #include #include -static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); -static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, - struct bch_replicas_cpu *); -static int bch2_sb_disk_groups_to_cpu(struct bch_fs *); - -/* superblock fields (optional/variable size sections: */ - const char * const bch2_sb_fields[] = { #define x(name, nr) #name, BCH_SB_FIELDS() @@ -24,34 +20,8 @@ const char * const bch2_sb_fields[] = { NULL }; -#define x(f, nr) \ -static const char *bch2_sb_validate_##f(struct bch_sb *, struct bch_sb_field *); - BCH_SB_FIELDS() -#undef x - -struct bch_sb_field_ops { - const char * (*validate)(struct bch_sb *, struct bch_sb_field *); -}; - -static const struct bch_sb_field_ops 
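replicas.h above deliberately exports for_each_replicas_entry() for userspace; a tool built against these headers can walk the raw section with nothing else. A hypothetical dump routine, mirroring the format bch2_sb_replicas_to_text() produces:

#include <stdio.h>

static void dump_replicas(struct bch_sb_field_replicas *r)
{
        struct bch_replicas_entry *e;
        unsigned i;

        if (!r) {
                printf("(no replicas section found)\n");
                return;
        }

        for_each_replicas_entry(r, e) {
                printf("%u: [", e->data_type);
                for (i = 0; i < e->nr; i++)
                        printf(i ? " %u" : "%u", e->devs[i]);
                printf("]\n");
        }
}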
bch2_sb_field_ops[] = { -#define x(f, nr) \ - [BCH_SB_FIELD_##f] = { \ - .validate = bch2_sb_validate_##f, \ - }, - BCH_SB_FIELDS() -#undef x -}; - -static const char *bch2_sb_field_validate(struct bch_sb *sb, - struct bch_sb_field *f) - -{ - unsigned type = le32_to_cpu(f->type); - - return type < BCH_SB_FIELD_NR - ? bch2_sb_field_ops[type].validate(sb, f) - : NULL; -} +static const char *bch2_sb_field_validate(struct bch_sb *, + struct bch_sb_field *); struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, enum bch_sb_field_type type) @@ -66,14 +36,18 @@ struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, return NULL; } -static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb *sb, - struct bch_sb_field *f, - unsigned u64s) +static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, + struct bch_sb_field *f, + unsigned u64s) { unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; + unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; + + BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > + sb->page_order); if (!f) { - f = vstruct_last(sb); + f = vstruct_last(sb->sb); memset(f, 0, sizeof(u64) * u64s); f->u64s = cpu_to_le32(u64s); f->type = 0; @@ -84,13 +58,13 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb *sb, f->u64s = cpu_to_le32(u64s); dst = vstruct_end(f); - memmove(dst, src, vstruct_end(sb) - src); + memmove(dst, src, vstruct_end(sb->sb) - src); if (dst > src) memset(src, 0, dst - src); } - le32_add_cpu(&sb->u64s, u64s - old_u64s); + sb->sb->u64s = cpu_to_le32(sb_u64s); return f; } @@ -108,26 +82,42 @@ void bch2_free_super(struct bch_sb_handle *sb) memset(sb, 0, sizeof(*sb)); } -static int __bch2_super_realloc(struct bch_sb_handle *sb, unsigned order) +int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) { + size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); + unsigned order = get_order(new_bytes); struct bch_sb *new_sb; struct bio *bio; + if (sb->have_layout) { + u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; + + if (new_bytes > max_bytes) { + char buf[BDEVNAME_SIZE]; + + pr_err("%s: superblock too big: want %zu but have %llu", + bdevname(sb->bdev, buf), new_bytes, max_bytes); + return -ENOSPC; + } + } + if (sb->page_order >= order && sb->sb) return 0; if (dynamic_fault("bcachefs:add:super_realloc")) return -ENOMEM; - bio = bio_kmalloc(GFP_KERNEL, 1 << order); - if (!bio) - return -ENOMEM; + if (sb->have_bio) { + bio = bio_kmalloc(GFP_KERNEL, 1 << order); + if (!bio) + return -ENOMEM; - if (sb->bio) - bio_put(sb->bio); - sb->bio = bio; + if (sb->bio) + bio_put(sb->bio); + sb->bio = bio; + } - new_sb = (void *) __get_free_pages(GFP_KERNEL, order); + new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); if (!new_sb) return -ENOMEM; @@ -142,45 +132,6 @@ static int __bch2_super_realloc(struct bch_sb_handle *sb, unsigned order) return 0; } -static int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) -{ - u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s); - u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; - - if (new_bytes > max_bytes) { - char buf[BDEVNAME_SIZE]; - - pr_err("%s: superblock too big: want %llu but have %llu", - bdevname(sb->bdev, buf), new_bytes, max_bytes); - return -ENOSPC; - } - - return __bch2_super_realloc(sb, get_order(new_bytes)); -} - -static int bch2_fs_sb_realloc(struct bch_fs *c, unsigned u64s) -{ - u64 bytes = __vstruct_bytes(struct bch_sb, u64s); - struct bch_sb *sb; - unsigned order = get_order(bytes); - - if (c->disk_sb && order <= 
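A worked example of the have_layout bound enforced by the new bch2_sb_realloc(): with layout.sb_max_size_bits == 11, the limit is 512 << 11 bytes, i.e. 1 MiB, and any resize whose __vstruct_bytes() result exceeds it fails with -ENOSPC before anything is allocated. The check is deferred until the layout has actually been read from disk, which is what the have_layout flag records. A hypothetical helper naming the bound:

static inline u64 sb_layout_max_bytes(struct bch_sb *sb)
{
        return 512ULL << sb->layout.sb_max_size_bits;   /* 11 -> 1 MiB */
}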
c->disk_sb_order) - return 0; - - sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); - if (!sb) - return -ENOMEM; - - if (c->disk_sb) - memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order); - - free_pages((unsigned long) c->disk_sb, c->disk_sb_order); - - c->disk_sb = sb; - c->disk_sb_order = order; - return 0; -} - struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, enum bch_sb_field_type type, unsigned u64s) @@ -192,38 +143,26 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) return NULL; - f = __bch2_sb_field_resize(sb->sb, f, u64s); - f->type = cpu_to_le32(type); - return f; -} - -struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c, - enum bch_sb_field_type type, - unsigned u64s) -{ - struct bch_sb_field *f = bch2_sb_field_get(c->disk_sb, type); - ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; - ssize_t d = -old_u64s + u64s; - struct bch_dev *ca; - unsigned i; - - lockdep_assert_held(&c->sb_lock); + if (sb->fs_sb) { + struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); + struct bch_dev *ca; + unsigned i; - if (bch2_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d)) - return NULL; + lockdep_assert_held(&c->sb_lock); - /* XXX: we're not checking that offline device have enough space */ + /* XXX: we're not checking that offline device have enough space */ - for_each_online_member(ca, c, i) { - struct bch_sb_handle *sb = &ca->disk_sb; + for_each_online_member(ca, c, i) { + struct bch_sb_handle *sb = &ca->disk_sb; - if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { - percpu_ref_put(&ca->ref); - return NULL; + if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { + percpu_ref_put(&ca->ref); + return NULL; + } } } - f = __bch2_sb_field_resize(c->disk_sb, f, u64s); + f = __bch2_sb_field_resize(sb, f, u64s); f->type = cpu_to_le32(type); return f; } @@ -384,7 +323,7 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) static void bch2_sb_update(struct bch_fs *c) { - struct bch_sb *src = c->disk_sb; + struct bch_sb *src = c->disk_sb.sb; struct bch_sb_field_members *mi = bch2_sb_get_members(src); struct bch_dev *ca; unsigned i; @@ -407,9 +346,10 @@ static void bch2_sb_update(struct bch_fs *c) } /* doesn't copy member info */ -static void __copy_super(struct bch_sb *dst, struct bch_sb *src) +static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) { struct bch_sb_field *src_f, *dst_f; + struct bch_sb *dst = dst_handle->sb; dst->version = src->version; dst->seq = src->seq; @@ -433,8 +373,8 @@ static void __copy_super(struct bch_sb *dst, struct bch_sb *src) continue; dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type)); - dst_f = __bch2_sb_field_resize(dst, dst_f, - le32_to_cpu(src_f->u64s)); + dst_f = __bch2_sb_field_resize(dst_handle, dst_f, + le32_to_cpu(src_f->u64s)); memcpy(dst_f, src_f, vstruct_bytes(src_f)); } @@ -451,11 +391,12 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) lockdep_assert_held(&c->sb_lock); - ret = bch2_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s); + ret = bch2_sb_realloc(&c->disk_sb, + le32_to_cpu(src->u64s) - journal_u64s); if (ret) return ret; - __copy_super(c->disk_sb, src); + __copy_super(&c->disk_sb, src); ret = bch2_sb_replicas_to_cpu_replicas(c); if (ret) @@ -471,7 +412,7 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) { - struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb; + struct bch_sb *src = 
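This hunk is where the two old resize paths merge: bch2_sb_field_resize() takes the unified handle, and when the handle is the filesystem's (fs_sb set) it also grows every online member's superblock so the field can be mirrored to each device. The handle fields all of this relies on, as inferred from their use in this patch (the authoritative struct is in super_types.h):

struct bch_sb_handle {                  /* sketch, not the real definition */
        struct bch_sb           *sb;
        struct block_device     *bdev;
        struct bio              *bio;   /* only meaningful when have_bio */
        unsigned                page_order;
        fmode_t                 mode;
        unsigned                have_layout:1;  /* size bound enforceable */
        unsigned                have_bio:1;     /* resize must regrow the bio */
        unsigned                fs_sb:1;        /* c->disk_sb: fan out to members */
};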
c->disk_sb.sb, *dst = ca->disk_sb.sb; struct bch_sb_field_journal *journal_buckets = bch2_sb_get_journal(dst); unsigned journal_u64s = journal_buckets @@ -484,7 +425,7 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) if (ret) return ret; - __copy_super(dst, src); + __copy_super(&ca->disk_sb, src); return 0; } @@ -494,7 +435,6 @@ static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) { struct bch_csum csum; size_t bytes; - unsigned order; reread: bio_reset(sb->bio); bio_set_dev(sb->bio, sb->bdev); @@ -518,9 +458,8 @@ static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) if (bytes > 512 << sb->sb->layout.sb_max_size_bits) return "Bad superblock: too big"; - order = get_order(bytes); - if (order > sb->page_order) { - if (__bch2_super_realloc(sb, order)) + if (get_order(bytes) > sb->page_order) { + if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) return "cannot allocate memory"; goto reread; } @@ -550,7 +489,8 @@ int bch2_read_super(const char *path, struct bch_opts *opts, pr_verbose_init(*opts, ""); memset(sb, 0, sizeof(*sb)); - sb->mode = FMODE_READ; + sb->mode = FMODE_READ; + sb->have_bio = true; if (!opt_get(*opts, noexcl)) sb->mode |= FMODE_EXCL; @@ -575,7 +515,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts, } err = "cannot allocate memory"; - ret = __bch2_super_realloc(sb, 0); + ret = bch2_sb_realloc(sb, 0); if (ret) goto err; @@ -644,6 +584,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts, bdev_get_queue(sb->bdev)->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; ret = 0; + sb->have_layout = true; out: pr_verbose_init(*opts, "ret %i", ret); return ret; @@ -711,7 +652,7 @@ void bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); - le64_add_cpu(&c->disk_sb->seq, 1); + le64_add_cpu(&c->disk_sb.sb->seq, 1); for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); @@ -837,6 +778,10 @@ static const char *bch2_sb_validate_journal(struct bch_sb *sb, return err; } +static const struct bch_sb_field_ops bch_sb_field_ops_journal = { + .validate = bch2_sb_validate_journal, +}; + /* BCH_SB_FIELD_members: */ static const char *bch2_sb_validate_members(struct bch_sb *sb, @@ -880,6 +825,10 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb, return NULL; } +static const struct bch_sb_field_ops bch_sb_field_ops_members = { + .validate = bch2_sb_validate_members, +}; + /* BCH_SB_FIELD_crypt: */ static const char *bch2_sb_validate_crypt(struct bch_sb *sb, @@ -896,980 +845,42 @@ static const char *bch2_sb_validate_crypt(struct bch_sb *sb, return NULL; } -/* BCH_SB_FIELD_replicas: */ - -/* Replicas tracking - in memory: */ - -#define for_each_cpu_replicas_entry(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ - _i = (void *) (_i) + (_r)->entry_size) - -static inline struct bch_replicas_cpu_entry * -cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) -{ - return (void *) r->entries + r->entry_size * i; -} - -static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) -{ - eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); -} - -static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e, - unsigned dev) -{ - return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0; -} - -static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e, - unsigned dev) -{ - e->devs[dev >> 3] |= 1 << (dev & 7); -} - -static inline unsigned replicas_dev_slots(struct bch_replicas_cpu 
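Both the copy being removed here and the new one in replicas.c keep the table in eytzinger (breadth-first) order, so replicas_has_entry() is a cache-friendly binary search via eytzinger0_find(). A sketch of what such a lookup is assumed to do (the in-tree eytzinger0_find() may differ in detail):

static size_t eytzinger0_find_sketch(const void *base, size_t nr, size_t size,
                                     const void *search)
{
        size_t i = 0;

        while (i < nr) {
                int cmp = memcmp(base + i * size, search, size);

                if (!cmp)
                        return i;               /* found */
                i = cmp < 0 ? 2 * i + 2         /* entry < search: go right */
                            : 2 * i + 1;        /* entry > search: go left */
        }
        return nr;      /* absent: >= nr, exactly what the callers test */
}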
*r) -{ - return (r->entry_size - - offsetof(struct bch_replicas_cpu_entry, devs)) * 8; -} - -int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r, - char *buf, size_t size) -{ - char *out = buf, *end = out + size; - struct bch_replicas_cpu_entry *e; - bool first = true; - unsigned i; - - for_each_cpu_replicas_entry(r, e) { - bool first_e = true; - - if (!first) - out += scnprintf(out, end - out, " "); - first = false; - - out += scnprintf(out, end - out, "%u: [", e->data_type); - - for (i = 0; i < replicas_dev_slots(r); i++) - if (replicas_test_dev(e, i)) { - if (!first_e) - out += scnprintf(out, end - out, " "); - first_e = false; - out += scnprintf(out, end - out, "%u", i); - } - out += scnprintf(out, end - out, "]"); - } - - return out - buf; -} - -static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e, - enum bch_data_type data_type, - struct bch_replicas_cpu_entry *r, - unsigned *max_dev) -{ - const struct bch_extent_ptr *ptr; - unsigned nr = 0; - - BUG_ON(!data_type || - data_type == BCH_DATA_SB || - data_type >= BCH_DATA_NR); - - memset(r, 0, sizeof(*r)); - r->data_type = data_type; - - *max_dev = 0; - - extent_for_each_ptr(e, ptr) - if (!ptr->cached) { - *max_dev = max_t(unsigned, *max_dev, ptr->dev); - replicas_set_dev(r, ptr->dev); - nr++; - } - return nr; -} - -static inline void devlist_to_replicas(struct bch_devs_list devs, - enum bch_data_type data_type, - struct bch_replicas_cpu_entry *r, - unsigned *max_dev) -{ - unsigned i; - - BUG_ON(!data_type || - data_type == BCH_DATA_SB || - data_type >= BCH_DATA_NR); - - memset(r, 0, sizeof(*r)); - r->data_type = data_type; - - *max_dev = 0; - - for (i = 0; i < devs.nr; i++) { - *max_dev = max_t(unsigned, *max_dev, devs.devs[i]); - replicas_set_dev(r, devs.devs[i]); - } -} - -static struct bch_replicas_cpu * -cpu_replicas_add_entry(struct bch_replicas_cpu *old, - struct bch_replicas_cpu_entry new_entry, - unsigned max_dev) -{ - struct bch_replicas_cpu *new; - unsigned i, nr, entry_size; - - entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + - DIV_ROUND_UP(max_dev + 1, 8); - entry_size = max(entry_size, old->entry_size); - nr = old->nr + 1; - - new = kzalloc(sizeof(struct bch_replicas_cpu) + - nr * entry_size, GFP_NOIO); - if (!new) - return NULL; - - new->nr = nr; - new->entry_size = entry_size; - - for (i = 0; i < old->nr; i++) - memcpy(cpu_replicas_entry(new, i), - cpu_replicas_entry(old, i), - min(new->entry_size, old->entry_size)); - - memcpy(cpu_replicas_entry(new, old->nr), - &new_entry, - new->entry_size); - - bch2_cpu_replicas_sort(new); - return new; -} - -static bool replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_cpu_entry search, - unsigned max_dev) -{ - return max_dev < replicas_dev_slots(r) && - eytzinger0_find(r->entries, r->nr, - r->entry_size, - memcmp, &search) < r->nr; -} - -noinline -static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_cpu_entry new_entry, - unsigned max_dev) -{ - struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL; - int ret = -ENOMEM; - - mutex_lock(&c->sb_lock); - - old_gc = rcu_dereference_protected(c->replicas_gc, - lockdep_is_held(&c->sb_lock)); - if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) { - new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev); - if (!new_gc) - goto err; - } - - old_r = rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock)); - if (!replicas_has_entry(old_r, new_entry, max_dev)) { - new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev); - 
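The in-memory encoding being moved out of this file packs devices as a bitmap: replicas_set_dev() sets bit (dev & 7) of byte devs[dev >> 3], and replicas_dev_slots() is just that bitmap's width in bits. Worked example: max_dev == 10 yields entry_size = offsetof(devs) + DIV_ROUND_UP(11, 8), a two-byte bitmap reporting 16 slots; marking devices 1 and 10 looks like this (illustration only):

static void mark_devs_example(struct bch_replicas_cpu_entry *e)
{
        replicas_set_dev(e, 1);         /* devs[0] |= 0x02 */
        replicas_set_dev(e, 10);        /* devs[1] |= 0x04 */
}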
if (!new_r) - goto err; - - ret = bch2_cpu_replicas_to_sb_replicas(c, new_r); - if (ret) - goto err; - } - - /* allocations done, now commit: */ - - if (new_r) - bch2_write_super(c); - - /* don't update in memory replicas until changes are persistent */ - - if (new_gc) { - rcu_assign_pointer(c->replicas_gc, new_gc); - kfree_rcu(old_gc, rcu); - } - - if (new_r) { - rcu_assign_pointer(c->replicas, new_r); - kfree_rcu(old_r, rcu); - } - - mutex_unlock(&c->sb_lock); - return 0; -err: - mutex_unlock(&c->sb_lock); - if (new_gc) - kfree(new_gc); - if (new_r) - kfree(new_r); - return ret; -} - -int bch2_mark_replicas(struct bch_fs *c, - enum bch_data_type data_type, - struct bch_devs_list devs) -{ - struct bch_replicas_cpu_entry search; - struct bch_replicas_cpu *r, *gc_r; - unsigned max_dev; - bool marked; - - if (!devs.nr) - return 0; - - BUG_ON(devs.nr >= BCH_REPLICAS_MAX); - - devlist_to_replicas(devs, data_type, &search, &max_dev); - - rcu_read_lock(); - r = rcu_dereference(c->replicas); - gc_r = rcu_dereference(c->replicas_gc); - marked = replicas_has_entry(r, search, max_dev) && - (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev)); - rcu_read_unlock(); - - return likely(marked) ? 0 - : bch2_mark_replicas_slowpath(c, search, max_dev); -} - -int bch2_mark_bkey_replicas(struct bch_fs *c, - enum bch_data_type data_type, - struct bkey_s_c k) -{ - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; - int ret; - - for (i = 0; i < cached.nr; i++) - if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i])))) - return ret; - - return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k)); -} - -int bch2_replicas_gc_end(struct bch_fs *c, int err) -{ - struct bch_replicas_cpu *new_r, *old_r; - int ret = 0; - - lockdep_assert_held(&c->replicas_gc_lock); - - mutex_lock(&c->sb_lock); - - new_r = rcu_dereference_protected(c->replicas_gc, - lockdep_is_held(&c->sb_lock)); - - if (err) { - rcu_assign_pointer(c->replicas_gc, NULL); - kfree_rcu(new_r, rcu); - goto err; - } - - if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) { - ret = -ENOSPC; - goto err; - } - - old_r = rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock)); - - rcu_assign_pointer(c->replicas, new_r); - rcu_assign_pointer(c->replicas_gc, NULL); - kfree_rcu(old_r, rcu); - - bch2_write_super(c); -err: - mutex_unlock(&c->sb_lock); - return ret; -} - -int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) -{ - struct bch_replicas_cpu *dst, *src; - struct bch_replicas_cpu_entry *e; - - lockdep_assert_held(&c->replicas_gc_lock); - - mutex_lock(&c->sb_lock); - BUG_ON(c->replicas_gc); - - src = rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock)); - - dst = kzalloc(sizeof(struct bch_replicas_cpu) + - src->nr * src->entry_size, GFP_NOIO); - if (!dst) { - mutex_unlock(&c->sb_lock); - return -ENOMEM; - } - - dst->nr = 0; - dst->entry_size = src->entry_size; - - for_each_cpu_replicas_entry(src, e) - if (!((1 << e->data_type) & typemask)) - memcpy(cpu_replicas_entry(dst, dst->nr++), - e, dst->entry_size); - - bch2_cpu_replicas_sort(dst); - - rcu_assign_pointer(c->replicas_gc, dst); - mutex_unlock(&c->sb_lock); - - return 0; -} - -/* Replicas tracking - superblock: */ - -static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r, - unsigned *nr, - unsigned *bytes, - unsigned *max_dev) -{ - struct bch_replicas_entry *i; - unsigned j; - - *nr = 0; - *bytes = sizeof(*r); - *max_dev = 0; - - if (!r) - return; - - for_each_replicas_entry(r, i) { 
- for (j = 0; j < i->nr; j++) - *max_dev = max_t(unsigned, *max_dev, i->devs[j]); - (*nr)++; - } - - *bytes = (void *) i - (void *) r; -} - -static struct bch_replicas_cpu * -__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) -{ - struct bch_replicas_cpu *cpu_r; - unsigned i, nr, bytes, max_dev, entry_size; - - bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev); - - entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + - DIV_ROUND_UP(max_dev + 1, 8); - - cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + - nr * entry_size, GFP_NOIO); - if (!cpu_r) - return NULL; - - cpu_r->nr = nr; - cpu_r->entry_size = entry_size; - - if (nr) { - struct bch_replicas_cpu_entry *dst = - cpu_replicas_entry(cpu_r, 0); - struct bch_replicas_entry *src = sb_r->entries; - - while (dst < cpu_replicas_entry(cpu_r, nr)) { - dst->data_type = src->data_type; - for (i = 0; i < src->nr; i++) - replicas_set_dev(dst, src->devs[i]); - - src = replicas_entry_next(src); - dst = (void *) dst + entry_size; - } - } - - bch2_cpu_replicas_sort(cpu_r); - return cpu_r; -} - -static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) -{ - struct bch_sb_field_replicas *sb_r; - struct bch_replicas_cpu *cpu_r, *old_r; - - sb_r = bch2_sb_get_replicas(c->disk_sb); - cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); - if (!cpu_r) - return -ENOMEM; - - old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock)); - rcu_assign_pointer(c->replicas, cpu_r); - if (old_r) - kfree_rcu(old_r, rcu); - - return 0; -} - -static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, - struct bch_replicas_cpu *r) -{ - struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry *sb_e; - struct bch_replicas_cpu_entry *e; - size_t i, bytes; - - bytes = sizeof(struct bch_sb_field_replicas); - - for_each_cpu_replicas_entry(r, e) { - bytes += sizeof(struct bch_replicas_entry); - for (i = 0; i < r->entry_size - 1; i++) - bytes += hweight8(e->devs[i]); - } - - sb_r = bch2_fs_sb_resize_replicas(c, - DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64))); - if (!sb_r) - return -ENOSPC; - - memset(&sb_r->entries, 0, - vstruct_end(&sb_r->field) - - (void *) &sb_r->entries); - - sb_e = sb_r->entries; - for_each_cpu_replicas_entry(r, e) { - sb_e->data_type = e->data_type; - - for (i = 0; i < replicas_dev_slots(r); i++) - if (replicas_test_dev(e, i)) - sb_e->devs[sb_e->nr++] = i; - - sb_e = replicas_entry_next(sb_e); - - BUG_ON((void *) sb_e > vstruct_end(&sb_r->field)); - } - - return 0; -} - -static const char *bch2_sb_validate_replicas(struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_replicas_cpu *cpu_r = NULL; - struct bch_replicas_entry *e; - const char *err; - unsigned i; - - for_each_replicas_entry(sb_r, e) { - err = "invalid replicas entry: invalid data type"; - if (e->data_type >= BCH_DATA_NR) - goto err; - - err = "invalid replicas entry: no devices"; - if (!e->nr) - goto err; - - err = "invalid replicas entry: too many devices"; - if (e->nr >= BCH_REPLICAS_MAX) - goto err; - - err = "invalid replicas entry: invalid device"; - for (i = 0; i < e->nr; i++) - if (!bch2_dev_exists(sb, mi, e->devs[i])) - goto err; - } - - err = "cannot allocate memory"; - cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); - if (!cpu_r) - goto err; - - sort_cmp_size(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - memcmp, NULL); - - for (i = 0; i + 1 < cpu_r->nr; i++) { - struct 
bch_replicas_cpu_entry *l = - cpu_replicas_entry(cpu_r, i); - struct bch_replicas_cpu_entry *r = - cpu_replicas_entry(cpu_r, i + 1); - - BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); - - err = "duplicate replicas entry"; - if (!memcmp(l, r, cpu_r->entry_size)) - goto err; - } - - err = NULL; -err: - kfree(cpu_r); - return err; -} - -int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size) -{ - char *out = buf, *end = out + size; - struct bch_replicas_entry *e; - bool first = true; - unsigned i; - - if (!r) { - out += scnprintf(out, end - out, "(no replicas section found)"); - return out - buf; - } - - for_each_replicas_entry(r, e) { - if (!first) - out += scnprintf(out, end - out, " "); - first = false; - - out += scnprintf(out, end - out, "%u: [", e->data_type); - - for (i = 0; i < e->nr; i++) - out += scnprintf(out, end - out, - i ? " %u" : "%u", e->devs[i]); - out += scnprintf(out, end - out, "]"); - } - - return out - buf; -} - -/* Query replicas: */ - -bool bch2_replicas_marked(struct bch_fs *c, - enum bch_data_type data_type, - struct bch_devs_list devs) -{ - struct bch_replicas_cpu_entry search; - unsigned max_dev; - bool ret; - - if (!devs.nr) - return true; - - devlist_to_replicas(devs, data_type, &search, &max_dev); - - rcu_read_lock(); - ret = replicas_has_entry(rcu_dereference(c->replicas), - search, max_dev); - rcu_read_unlock(); - - return ret; -} - -bool bch2_bkey_replicas_marked(struct bch_fs *c, - enum bch_data_type data_type, - struct bkey_s_c k) -{ - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; - - for (i = 0; i < cached.nr; i++) - if (!bch2_replicas_marked(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i]))) - return false; - - return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k)); -} - -struct replicas_status __bch2_replicas_status(struct bch_fs *c, - struct bch_devs_mask online_devs) -{ - struct bch_sb_field_members *mi; - struct bch_replicas_cpu_entry *e; - struct bch_replicas_cpu *r; - unsigned i, dev, dev_slots, nr_online, nr_offline; - struct replicas_status ret; - - memset(&ret, 0, sizeof(ret)); - - for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) - ret.replicas[i].nr_online = UINT_MAX; - - mi = bch2_sb_get_members(c->disk_sb); - rcu_read_lock(); - - r = rcu_dereference(c->replicas); - dev_slots = replicas_dev_slots(r); - - for_each_cpu_replicas_entry(r, e) { - if (e->data_type >= ARRAY_SIZE(ret.replicas)) - panic("e %p data_type %u\n", e, e->data_type); - - nr_online = nr_offline = 0; - - for (dev = 0; dev < dev_slots; dev++) { - if (!replicas_test_dev(e, dev)) - continue; - - BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev)); - - if (test_bit(dev, online_devs.d)) - nr_online++; - else - nr_offline++; - } - - ret.replicas[e->data_type].nr_online = - min(ret.replicas[e->data_type].nr_online, - nr_online); - - ret.replicas[e->data_type].nr_offline = - max(ret.replicas[e->data_type].nr_offline, - nr_offline); - } - - rcu_read_unlock(); - - return ret; -} - -struct replicas_status bch2_replicas_status(struct bch_fs *c) -{ - return __bch2_replicas_status(c, bch2_online_devs(c)); -} - -static bool have_enough_devs(struct replicas_status s, - enum bch_data_type type, - bool force_if_degraded, - bool force_if_lost) -{ - return (!s.replicas[type].nr_offline || force_if_degraded) && - (s.replicas[type].nr_online || force_if_lost); -} - -bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) -{ - return (have_enough_devs(s, BCH_DATA_JOURNAL, - flags & BCH_FORCE_IF_METADATA_DEGRADED, - flags & 
BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_BTREE, - flags & BCH_FORCE_IF_METADATA_DEGRADED, - flags & BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_USER, - flags & BCH_FORCE_IF_DATA_DEGRADED, - flags & BCH_FORCE_IF_DATA_LOST)); -} - -unsigned bch2_replicas_online(struct bch_fs *c, bool meta) -{ - struct replicas_status s = bch2_replicas_status(c); - - return meta - ? min(s.replicas[BCH_DATA_JOURNAL].nr_online, - s.replicas[BCH_DATA_BTREE].nr_online) - : s.replicas[BCH_DATA_USER].nr_online; -} - -unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) -{ - struct bch_replicas_cpu_entry *e; - struct bch_replicas_cpu *r; - unsigned ret = 0; - - rcu_read_lock(); - r = rcu_dereference(c->replicas); - - if (ca->dev_idx >= replicas_dev_slots(r)) - goto out; - - for_each_cpu_replicas_entry(r, e) - if (replicas_test_dev(e, ca->dev_idx)) - ret |= 1 << e->data_type; -out: - rcu_read_unlock(); - - return ret; -} +static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { + .validate = bch2_sb_validate_crypt, +}; -/* Quotas: */ +static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { +#define x(f, nr) \ + [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, + BCH_SB_FIELDS() +#undef x +}; -static const char *bch2_sb_validate_quota(struct bch_sb *sb, +static const char *bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f) { - struct bch_sb_field_quota *q = field_to_type(f, quota); - - if (vstruct_bytes(&q->field) != sizeof(*q)) - return "invalid field quota: wrong size"; - - return NULL; -} - -/* Disk groups: */ - -static int strcmp_void(const void *l, const void *r) -{ - return strcmp(l, r); -} - -static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_disk_groups *groups = - field_to_type(f, disk_groups); - struct bch_disk_group *g; - struct bch_sb_field_members *mi; - struct bch_member *m; - unsigned i, nr_groups, nr_live = 0, len; - char **labels, *l; - const char *err = NULL; - - mi = bch2_sb_get_members(sb); - groups = bch2_sb_get_disk_groups(sb); - nr_groups = disk_groups_nr(groups); - - for (m = mi->members; - m < mi->members + sb->nr_devices; - m++) { - unsigned g; - - if (!BCH_MEMBER_GROUP(m)) - continue; - - g = BCH_MEMBER_GROUP(m) - 1; - - if (g >= nr_groups || - BCH_GROUP_DELETED(&groups->entries[g])) - return "disk has invalid group"; - } - - if (!nr_groups) - return NULL; - - labels = kcalloc(nr_groups, sizeof(char *), GFP_KERNEL); - if (!labels) - return "cannot allocate memory"; - - for (g = groups->entries; - g < groups->entries + nr_groups; - g++) { - if (BCH_GROUP_DELETED(g)) - continue; - - len = strnlen(g->label, sizeof(g->label)); - - labels[nr_live++] = l = kmalloc(len + 1, GFP_KERNEL); - if (!l) { - err = "cannot allocate memory"; - goto err; - } - - memcpy(l, g->label, len); - l[len] = '\0'; - } - - sort(labels, nr_live, sizeof(labels[0]), strcmp_void, NULL); - - for (i = 0; i + 1 < nr_live; i++) - if (!strcmp(labels[i], labels[i + 1])) { - err = "duplicate group labels"; - goto err; - } - - err = NULL; -err: - for (i = 0; i < nr_live; i++) - kfree(labels[i]); - kfree(labels); - return err; -} - -static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) -{ - struct bch_sb_field_members *mi; - struct bch_sb_field_disk_groups *groups; - struct bch_disk_groups_cpu *cpu_g, *old_g; - unsigned i, nr_groups; - - lockdep_assert_held(&c->sb_lock); - - mi = bch2_sb_get_members(c->disk_sb); - groups = bch2_sb_get_disk_groups(c->disk_sb); - nr_groups = 
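The ops table just introduced is the core of the refactor: validation dispatch becomes data-driven, each section's ops live next to its code (bch_sb_field_ops_replicas in replicas.c, journal/members/crypt here), and the x-macro assembles the array by field number. Adding a section now reduces to defining ops and listing the field in BCH_SB_FIELDS(); a hypothetical example following the quota/crypt pattern:

static const char *bch2_sb_validate_example(struct bch_sb *sb,
                                            struct bch_sb_field *f)
{
        /* "example" is a made-up fixed-size section */
        struct bch_sb_field_example *e = field_to_type(f, example);

        return vstruct_bytes(&e->field) != sizeof(*e)
                ? "invalid field example: wrong size"
                : NULL;
}

static const struct bch_sb_field_ops bch_sb_field_ops_example = {
        .validate       = bch2_sb_validate_example,
};
/* ...plus an x(example, <nr>) entry in BCH_SB_FIELDS() */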
disk_groups_nr(groups); - - if (!groups) - return 0; - - cpu_g = kzalloc(sizeof(*cpu_g) + - sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); - if (!cpu_g) - return -ENOMEM; - - cpu_g->nr = nr_groups; - - for (i = 0; i < nr_groups; i++) { - struct bch_disk_group *src = &groups->entries[i]; - struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; - - dst->deleted = BCH_GROUP_DELETED(src); - } - - for (i = 0; i < c->disk_sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; - struct bch_disk_group_cpu *dst = - &cpu_g->entries[BCH_MEMBER_GROUP(m)]; - - if (!bch2_member_exists(m)) - continue; - - dst = BCH_MEMBER_GROUP(m) - ? &cpu_g->entries[BCH_MEMBER_GROUP(m) - 1] - : NULL; - if (dst) - __set_bit(i, dst->devs.d); - } - - old_g = c->disk_groups; - rcu_assign_pointer(c->disk_groups, cpu_g); - if (old_g) - kfree_rcu(old_g, rcu); - - return 0; -} - -const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) -{ - struct target t = target_decode(target); - - switch (t.type) { - case TARGET_DEV: { - struct bch_dev *ca = t.dev < c->sb.nr_devices - ? rcu_dereference(c->devs[t.dev]) - : NULL; - return ca ? &ca->self : NULL; - } - case TARGET_GROUP: { - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - - return t.group < g->nr && !g->entries[t.group].deleted - ? &g->entries[t.group].devs - : NULL; - } - default: - BUG(); - } -} - -int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, - const char *name) -{ - unsigned i, nr_groups = disk_groups_nr(groups); - unsigned len = strlen(name); - - for (i = 0; i < nr_groups; i++) { - struct bch_disk_group *g = groups->entries + i; - - if (BCH_GROUP_DELETED(g)) - continue; - - if (strnlen(g->label, sizeof(g->label)) == len && - !memcmp(name, g->label, len)) - return i; - } - - return -1; -} - -static int bch2_disk_group_find(struct bch_fs *c, const char *name) -{ - int ret; - - mutex_lock(&c->sb_lock); - ret = __bch2_disk_group_find(bch2_sb_get_disk_groups(c->disk_sb), name); - mutex_unlock(&c->sb_lock); + unsigned type = le32_to_cpu(f->type); - return ret; + return type < BCH_SB_FIELD_NR + ? bch2_sb_field_ops[type]->validate(sb, f) + : NULL; } -int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) +size_t bch2_sb_field_to_text(char *buf, size_t size, + struct bch_sb *sb, struct bch_sb_field *f) { - struct bch_dev *ca; - int g; - - if (!strlen(buf) || !strcmp(buf, "none")) { - *v = 0; - return 0; - } - - /* Is it a device? */ - ca = bch2_dev_lookup(c, buf); - if (!IS_ERR(ca)) { - *v = dev_to_target(ca->dev_idx); - percpu_ref_put(&ca->ref); - return 0; - } + unsigned type = le32_to_cpu(f->type); + size_t (*to_text)(char *, size_t, struct bch_sb *, + struct bch_sb_field *) = + type < BCH_SB_FIELD_NR + ? bch2_sb_field_ops[type]->to_text + : NULL; - g = bch2_disk_group_find(c, buf); - if (g >= 0) { - *v = group_to_target(g); + if (!to_text) { + if (size) + buf[0] = '\0'; return 0; } - return -EINVAL; -} - -int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v) -{ - struct target t = target_decode(v); - int ret; - - switch (t.type) { - case TARGET_NULL: - return scnprintf(buf, len, "none"); - case TARGET_DEV: { - struct bch_dev *ca; - - rcu_read_lock(); - ca = t.dev < c->sb.nr_devices - ? 
rcu_dereference(c->devs[t.dev]) - : NULL; - - if (ca && percpu_ref_tryget(&ca->io_ref)) { - char b[BDEVNAME_SIZE]; - - ret = scnprintf(buf, len, "/dev/%s", - bdevname(ca->disk_sb.bdev, b)); - percpu_ref_put(&ca->io_ref); - } else if (ca) { - ret = scnprintf(buf, len, "offline device %u", t.dev); - } else { - ret = scnprintf(buf, len, "invalid device %u", t.dev); - } - - rcu_read_unlock(); - break; - } - case TARGET_GROUP: { - struct bch_sb_field_disk_groups *groups; - struct bch_disk_group *g; - - mutex_lock(&c->sb_lock); - groups = bch2_sb_get_disk_groups(c->disk_sb); - - g = t.group < disk_groups_nr(groups) - ? groups->entries + t.group - : NULL; - - if (g && !BCH_GROUP_DELETED(g)) { - ret = len ? min(len - 1, strnlen(g->label, sizeof(g->label))) : 0; - - memcpy(buf, g->label, ret); - if (len) - buf[ret] = '\0'; - } else { - ret = scnprintf(buf, len, "invalid group %u", t.group); - } - - mutex_unlock(&c->sb_lock); - break; - } - default: - BUG(); - } - - return ret; + return to_text(buf, size, sb, f); } diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 2514ac8a..f407c205 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -11,8 +11,6 @@ struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, enum bch_sb_field_type, unsigned); -struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *, - enum bch_sb_field_type, unsigned); #define field_to_type(_f, _name) \ container_of_or_null(_f, struct bch_sb_field_##_name, field) @@ -30,13 +28,6 @@ bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ { \ return field_to_type(bch2_sb_field_resize(sb, \ BCH_SB_FIELD_##_name, u64s), _name); \ -} \ - \ -static inline struct bch_sb_field_##_name * \ -bch2_fs_sb_resize_##_name(struct bch_fs *c, unsigned u64s) \ -{ \ - return field_to_type(bch2_fs_sb_field_resize(c, \ - BCH_SB_FIELD_##_name, u64s), _name); \ } BCH_SB_FIELDS() @@ -44,6 +35,12 @@ BCH_SB_FIELDS() extern const char * const bch2_sb_fields[]; +struct bch_sb_field_ops { + const char * (*validate)(struct bch_sb *, struct bch_sb_field *); + size_t (*to_text)(char *, size_t, struct bch_sb *, + struct bch_sb_field *); +}; + static inline bool bch2_sb_test_feature(struct bch_sb *sb, enum bch_sb_features f) { @@ -90,7 +87,7 @@ int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); void bch2_free_super(struct bch_sb_handle *); -int bch2_super_realloc(struct bch_sb_handle *, unsigned); +int bch2_sb_realloc(struct bch_sb_handle *, unsigned); const char *bch2_sb_validate(struct bch_sb_handle *); @@ -139,135 +136,4 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) }; } -/* BCH_SB_FIELD_replicas: */ - -bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type, - struct bch_devs_list); -bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type, - struct bkey_s_c); -int bch2_mark_replicas(struct bch_fs *, enum bch_data_type, - struct bch_devs_list); -int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type, - struct bkey_s_c); - -int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t); -int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t); - -struct replicas_status { - struct { - unsigned nr_online; - unsigned nr_offline; - } replicas[BCH_DATA_NR]; -}; - -struct replicas_status __bch2_replicas_status(struct bch_fs *, - struct bch_devs_mask); -struct replicas_status 
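With bch2_fs_sb_field_resize() gone, the BCH_SB_FIELDS() x-macro in super-io.h emits a single typed resize helper per section, taking the handle. Written out by hand for one field (the get_ variant is assumed from its uses elsewhere in this patch):

static inline struct bch_sb_field_replicas *
bch2_sb_get_replicas(struct bch_sb *sb)
{
        return field_to_type(bch2_sb_field_get(sb, BCH_SB_FIELD_replicas),
                             replicas);
}

static inline struct bch_sb_field_replicas *
bch2_sb_resize_replicas(struct bch_sb_handle *sb, unsigned u64s)
{
        return field_to_type(bch2_sb_field_resize(sb, BCH_SB_FIELD_replicas,
                                                  u64s), replicas);
}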
bch2_replicas_status(struct bch_fs *); -bool bch2_have_enough_devs(struct replicas_status, unsigned); - -unsigned bch2_replicas_online(struct bch_fs *, bool); -unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); - -int bch2_replicas_gc_end(struct bch_fs *, int); -int bch2_replicas_gc_start(struct bch_fs *, unsigned); - -/* iterate over superblock replicas - used by userspace tools: */ - -static inline struct bch_replicas_entry * -replicas_entry_next(struct bch_replicas_entry *i) -{ - return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr; -} - -#define for_each_replicas_entry(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ - (_i) = replicas_entry_next(_i)) - -/* disk groups: */ - -static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) -{ - return groups - ? (vstruct_end(&groups->field) - - (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) - : 0; -} - -struct target { - enum { - TARGET_NULL, - TARGET_DEV, - TARGET_GROUP, - } type; - union { - unsigned dev; - unsigned group; - }; -}; - -#define TARGET_DEV_START 1 -#define TARGET_GROUP_START (256 + TARGET_DEV_START) - -static inline u16 dev_to_target(unsigned dev) -{ - return TARGET_DEV_START + dev; -} - -static inline u16 group_to_target(unsigned group) -{ - return TARGET_GROUP_START + group; -} - -static inline struct target target_decode(unsigned target) -{ - if (target >= TARGET_GROUP_START) - return (struct target) { - .type = TARGET_GROUP, - .group = target - TARGET_GROUP_START - }; - - if (target >= TARGET_DEV_START) - return (struct target) { - .type = TARGET_DEV, - .group = target - TARGET_DEV_START - }; - - return (struct target) { .type = TARGET_NULL }; -} - -static inline bool dev_in_target(struct bch_dev *ca, unsigned target) -{ - struct target t = target_decode(target); - - switch (t.type) { - case TARGET_NULL: - return false; - case TARGET_DEV: - return ca->dev_idx == t.dev; - case TARGET_GROUP: - return ca->mi.group && ca->mi.group - 1 == t.group; - default: - BUG(); - } -} - -static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target) -{ - bool ret; - - rcu_read_lock(); - ret = dev_in_target(rcu_dereference(c->devs[dev]), target); - rcu_read_unlock(); - - return ret; -} - -const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); - -int __bch2_disk_group_find(struct bch_sb_field_disk_groups *, const char *); - -int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); -int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64); - #endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 77670ea6..05910c40 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -18,6 +18,7 @@ #include "clock.h" #include "compress.h" #include "debug.h" +#include "disk_groups.h" #include "error.h" #include "fs.h" #include "fs-io.h" @@ -30,6 +31,7 @@ #include "migrate.h" #include "movinggc.h" #include "quota.h" +#include "replicas.h" #include "super.h" #include "super-io.h" #include "sysfs.h" @@ -122,7 +124,7 @@ static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid) lockdep_assert_held(&bch_fs_list_lock); list_for_each_entry(c, &bch_fs_list, list) - if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) + if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le))) return c; return NULL; @@ -203,23 +205,12 @@ static void bch_fs_mark_clean(struct bch_fs *c) !test_bit(BCH_FS_ERROR, &c->flags) && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) 
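The target helpers leaving this header (for disk_groups.h) share one u16 namespace: 0 means none, values in [1, 256] are devices, 257 and up are groups, so dev_to_target(3) == 4 and group_to_target(2) == 259. Note the removed TARGET_DEV branch of target_decode() assigned .group rather than .dev, harmless only because the two share a union. A round-trip illustration (hypothetical test function):

static void target_encoding_example(void)
{
        struct target t = target_decode(group_to_target(2));    /* 257 + 2 */

        BUG_ON(t.type != TARGET_GROUP || t.group != 2);
        BUG_ON(target_decode(dev_to_target(3)).type != TARGET_DEV);
}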
{ mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb, true); + SET_BCH_SB_CLEAN(c->disk_sb.sb, true); bch2_write_super(c); mutex_unlock(&c->sb_lock); } } -static bool btree_interior_updates_done(struct bch_fs *c) -{ - bool ret; - - mutex_lock(&c->btree_interior_update_lock); - ret = list_empty(&c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); - - return ret; -} - static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; @@ -251,7 +242,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) * fully complete: */ closure_wait_event(&c->btree_interior_update_wait, - btree_interior_updates_done(c)); + !bch2_btree_interior_updates_nr_pending(c)); if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) bch2_btree_verify_flushed(c); @@ -433,7 +424,8 @@ static void bch2_fs_free(struct bch_fs *c) if (c->wq) destroy_workqueue(c->wq); - free_pages((unsigned long) c->disk_sb, c->disk_sb_order); + free_pages((unsigned long) c->disk_sb.sb, + c->disk_sb.page_order); kvpfree(c, sizeof(*c)); module_put(THIS_MODULE); } @@ -501,11 +493,54 @@ void bch2_fs_stop(struct bch_fs *c) kobject_put(&c->kobj); } +static const char *bch2_fs_online(struct bch_fs *c) +{ + struct bch_dev *ca; + const char *err = NULL; + unsigned i; + int ret; + + lockdep_assert_held(&bch_fs_list_lock); + + if (!list_empty(&c->list)) + return NULL; + + if (__bch2_uuid_to_fs(c->sb.uuid)) + return "filesystem UUID already open"; + + ret = bch2_fs_chardev_init(c); + if (ret) + return "error creating character device"; + + bch2_fs_debug_init(c); + + if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || + kobject_add(&c->internal, &c->kobj, "internal") || + kobject_add(&c->opts_dir, &c->kobj, "options") || + kobject_add(&c->time_stats, &c->kobj, "time_stats") || + bch2_opts_create_sysfs_files(&c->opts_dir)) + return "error creating sysfs objects"; + + mutex_lock(&c->state_lock); + + err = "error creating sysfs objects"; + __for_each_member_device(ca, c, i, NULL) + if (bch2_dev_sysfs_online(c, ca)) + goto err; + + list_add(&c->list, &bch_fs_list); + err = NULL; +err: + mutex_unlock(&c->state_lock); + return err; +} + static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { struct bch_sb_field_members *mi; struct bch_fs *c; unsigned i, iter_size; + const char *err; pr_verbose_init(opts, ""); @@ -516,6 +551,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) __module_get(THIS_MODULE); c->minor = -1; + c->disk_sb.fs_sb = true; mutex_init(&c->state_lock); mutex_init(&c->sb_lock); @@ -627,9 +663,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_fsio_init(c)) goto err; - mi = bch2_sb_get_members(c->disk_sb); + mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) - if (bch2_dev_exists(c->disk_sb, mi, i) && + if (bch2_dev_exists(c->disk_sb.sb, mi, i) && bch2_dev_alloc(c, i)) goto err; @@ -644,6 +680,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) kobject_init(&c->internal, &bch2_fs_internal_ktype); kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); + + mutex_lock(&bch_fs_list_lock); + err = bch2_fs_online(c); + mutex_unlock(&bch_fs_list_lock); + if (err) { + bch_err(c, "bch2_fs_online() error: %s", err); + goto err; + } out: pr_verbose_init(opts, "ret %i", c ? 
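Folding bch2_fs_online() into bch2_fs_alloc() changes the lifecycle: sysfs, debugfs and the chardev are registered as soon as allocation succeeds, leaving bch2_fs_start() to do journal replay and initialization only. Roughly the resulting open sequence (a sketch; bch2_fs_open() in this file is the assumed real caller, and error handling is abbreviated):

static int open_example(struct bch_sb *sb, struct bch_opts opts)
{
        struct bch_fs *c = bch2_fs_alloc(sb, opts); /* registers sysfs/chardev too */
        const char *err;

        if (!c)
                return -ENOMEM;

        err = bch2_fs_start(c);         /* journal replay / initialization only */
        if (err) {
                bch2_fs_stop(c);
                return -EINVAL;
        }
        return 0;
}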
0 : -ENOMEM); return c; @@ -653,60 +697,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto out; } -static const char *__bch2_fs_online(struct bch_fs *c) -{ - struct bch_dev *ca; - const char *err = NULL; - unsigned i; - int ret; - - lockdep_assert_held(&bch_fs_list_lock); - - if (!list_empty(&c->list)) - return NULL; - - if (__bch2_uuid_to_fs(c->sb.uuid)) - return "filesystem UUID already open"; - - ret = bch2_fs_chardev_init(c); - if (ret) - return "error creating character device"; - - bch2_fs_debug_init(c); - - if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || - kobject_add(&c->internal, &c->kobj, "internal") || - kobject_add(&c->opts_dir, &c->kobj, "options") || - kobject_add(&c->time_stats, &c->kobj, "time_stats") || - bch2_opts_create_sysfs_files(&c->opts_dir)) - return "error creating sysfs objects"; - - mutex_lock(&c->state_lock); - - err = "error creating sysfs objects"; - __for_each_member_device(ca, c, i, NULL) - if (bch2_dev_sysfs_online(c, ca)) - goto err; - - list_add(&c->list, &bch_fs_list); - err = NULL; -err: - mutex_unlock(&c->state_lock); - return err; -} - -static const char *bch2_fs_online(struct bch_fs *c) -{ - const char *err; - - mutex_lock(&bch_fs_list_lock); - err = __bch2_fs_online(c); - mutex_unlock(&bch_fs_list_lock); - - return err; -} - -static const char *__bch2_fs_start(struct bch_fs *c) +const char *bch2_fs_start(struct bch_fs *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; @@ -730,15 +721,15 @@ static const char *__bch2_fs_start(struct bch_fs *c) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - if (BCH_SB_INITIALIZED(c->disk_sb)) { + if (BCH_SB_INITIALIZED(c->disk_sb.sb)) { ret = bch2_journal_read(c, &journal); if (ret) goto err; j = &list_entry(journal.prev, struct journal_replay, list)->j; - c->prio_clock[READ].hand = le16_to_cpu(j->read_clock); - c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock); + c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock); for (i = 0; i < BTREE_ID_NR; i++) { unsigned level; @@ -824,21 +815,18 @@ static const char *__bch2_fs_start(struct bch_fs *c) bch_notice(c, "initializing new filesystem"); set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - set_bit(BCH_FS_BRAND_NEW_FS, &c->flags); ret = bch2_initial_gc(c, &journal); if (ret) goto err; err = "unable to allocate journal buckets"; - for_each_rw_member(ca, c, i) - if (bch2_dev_journal_alloc(c, ca)) { + for_each_online_member(ca, c, i) + if (bch2_dev_journal_alloc(ca)) { percpu_ref_put(&ca->io_ref); goto err; } - clear_bit(BCH_FS_BRAND_NEW_FS, &c->flags); - for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); @@ -889,18 +877,20 @@ static const char *__bch2_fs_start(struct bch_fs *c) } mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb); + mi = bch2_sb_get_members(c->disk_sb.sb); now = ktime_get_seconds(); for_each_member_device(ca, c, i) mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); - SET_BCH_SB_INITIALIZED(c->disk_sb, true); - SET_BCH_SB_CLEAN(c->disk_sb, false); + SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); bch2_write_super(c); mutex_unlock(&c->sb_lock); + set_bit(BCH_FS_STARTED, &c->flags); + err = NULL; out: mutex_unlock(&c->state_lock); @@ -939,11 +929,6 @@ static const char *__bch2_fs_start(struct bch_fs *c) goto out; } -const char *bch2_fs_start(struct bch_fs *c) -{ - return __bch2_fs_start(c) ?: bch2_fs_online(c); -} - static const char 
*bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) { struct bch_sb_field_members *sb_mi; @@ -956,7 +941,7 @@ static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) return "mismatched block size"; if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < - BCH_SB_BTREE_NODE_SIZE(c->disk_sb)) + BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) return "new cache bucket size is too small"; return NULL; @@ -1082,28 +1067,19 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) return 0; } -static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) +static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + struct bch_member *member) { - struct bch_member *member; - struct bch_dev *ca = NULL; - int ret = 0; - - pr_verbose_init(c->opts, ""); - - if (bch2_fs_init_fault("dev_alloc")) - goto err; + struct bch_dev *ca; ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) - goto err; + return NULL; kobject_init(&ca->kobj, &bch2_dev_ktype); init_completion(&ca->ref_completion); init_completion(&ca->io_ref_completion); - ca->dev_idx = dev_idx; - __set_bit(ca->dev_idx, ca->self.d); - init_rwsem(&ca->bucket_lock); writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); @@ -1113,14 +1089,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) INIT_WORK(&ca->io_error_work, bch2_io_error_work); - if (bch2_fs_init_fault("dev_alloc")) - goto err; - - member = bch2_sb_get_members(c->disk_sb)->members + dev_idx; - ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; - scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL) || @@ -1132,11 +1102,43 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) !(ca->io_done = alloc_percpu(*ca->io_done))) goto err; + return ca; +err: + bch2_dev_free(ca); + return NULL; +} + +static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, + unsigned dev_idx) +{ + ca->dev_idx = dev_idx; + __set_bit(ca->dev_idx, ca->self.d); + scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); + ca->fs = c; rcu_assign_pointer(c->devs[ca->dev_idx], ca); if (bch2_dev_sysfs_online(c, ca)) pr_warn("error creating sysfs objects"); +} + +static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) +{ + struct bch_member *member = + bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; + struct bch_dev *ca = NULL; + int ret = 0; + + pr_verbose_init(c->opts, ""); + + if (bch2_fs_init_fault("dev_alloc")) + goto err; + + ca = __bch2_dev_alloc(c, member); + if (!ca) + goto err; + + bch2_dev_attach(c, ca, dev_idx); out: pr_verbose_init(c->opts, "ret %i", ret); return ret; @@ -1147,21 +1149,9 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) goto out; } -static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb) +static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) { - struct bch_dev *ca; - int ret; - - lockdep_assert_held(&c->state_lock); - - if (le64_to_cpu(sb->sb->seq) > - le64_to_cpu(c->disk_sb->seq)) - bch2_sb_to_fs(c, sb->sb); - - BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || - !c->devs[sb->sb->dev_idx]); - - ca = bch_dev_locked(c, sb->sb->dev_idx); + unsigned ret; if (bch2_dev_is_online(ca)) { bch_err(ca, "already have device online in slot %u", @@ -1179,7 +1169,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb) if (get_capacity(sb->bdev->bd_disk) < ca->mi.bucket_size * ca->mi.nbuckets) { - bch_err(c, "device too small"); + bch_err(ca, "device too small"); return -EINVAL; } @@ 
-1187,35 +1177,50 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; - /* - * Increase journal write timeout if flushes to this device are - * expensive: - */ - if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) && - journal_flushes_device(ca)) - c->journal.write_delay_ms = - max(c->journal.write_delay_ms, 1000U); - /* Commit: */ ca->disk_sb = *sb; if (sb->mode & FMODE_EXCL) ca->disk_sb.bdev->bd_holder = ca; memset(sb, 0, sizeof(*sb)); + if (ca->fs) + mutex_lock(&ca->fs->sb_lock); + + bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + + if (ca->fs) + mutex_unlock(&ca->fs->sb_lock); + + percpu_ref_reinit(&ca->io_ref); + + return 0; +} + +static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) +{ + struct bch_dev *ca; + int ret; + + lockdep_assert_held(&c->state_lock); + + if (le64_to_cpu(sb->sb->seq) > + le64_to_cpu(c->disk_sb.sb->seq)) + bch2_sb_to_fs(c, sb->sb); + + BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || + !c->devs[sb->sb->dev_idx]); + + ca = bch_dev_locked(c, sb->sb->dev_idx); + + ret = __bch2_dev_attach_bdev(ca, sb); + if (ret) + return ret; + if (c->sb.nr_devices == 1) bdevname(ca->disk_sb.bdev, c->name); bdevname(ca->disk_sb.bdev, ca->name); - mutex_lock(&c->sb_lock); - bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); - mutex_unlock(&c->sb_lock); - - if (ca->mi.state == BCH_MEMBER_STATE_RW) - bch2_dev_allocator_add(c, ca); - rebalance_wakeup(c); - - percpu_ref_reinit(&ca->io_ref); return 0; } @@ -1289,10 +1294,10 @@ static bool bch2_fs_may_start(struct bch_fs *c) if (!c->opts.degraded) { mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb); + mi = bch2_sb_get_members(c->disk_sb.sb); - for (i = 0; i < c->disk_sb->nr_devices; i++) { - if (!bch2_dev_exists(c->disk_sb, mi, i)) + for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) continue; ca = bch_dev_locked(c, i); @@ -1360,7 +1365,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, bch_notice(ca, "%s", bch2_dev_state[new_state]); mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb); + mi = bch2_sb_get_members(c->disk_sb.sb); SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1470,7 +1475,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * this device must be gone: */ mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb); + mi = bch2_sb_get_members(c->disk_sb.sb); memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); bch2_write_super(c); @@ -1492,8 +1497,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_sb_handle sb; const char *err; struct bch_dev *ca = NULL; - struct bch_sb_field_members *mi, *dev_mi; - struct bch_member saved_mi; + struct bch_sb_field_members *mi; + struct bch_member dev_mi; unsigned dev_idx, nr_devices, u64s; int ret; @@ -1505,24 +1510,52 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (err) return -EINVAL; + dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; + err = bch2_dev_may_add(sb.sb, c); if (err) return -EINVAL; + ca = __bch2_dev_alloc(c, &dev_mi); + if (!ca) { + bch2_free_super(&sb); + return -ENOMEM; + } + + ret = __bch2_dev_attach_bdev(ca, &sb); + if (ret) { + bch2_dev_free(ca); + return ret; + } + + err = "journal alloc failed"; + ret = bch2_dev_journal_alloc(ca); + if (ret) + goto err; + mutex_lock(&c->state_lock); mutex_lock(&c->sb_lock); - /* Grab 
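Note that __bch2_dev_attach_bdev() now works whether or not the device belongs to a running filesystem: ca->fs is NULL while a device is still being constructed (the bch2_dev_add() case), so the superblock-bucket marking takes sb_lock only conditionally. The shape of that conditional locking, as a standalone sketch (pthread mutex standing in for the kernel mutex; toy types):

    #include <pthread.h>
    #include <stdio.h>

    struct fs  { pthread_mutex_t sb_lock; };
    struct dev { struct fs *fs; };

    static void mark_superblock(struct dev *d)
    {
        printf("marking superblock buckets of dev %p\n", (void *) d);
    }

    /* d->fs == NULL means the device isn't attached to a filesystem yet,
     * so nothing else can observe it and no lock is needed */
    static void attach_bdev(struct dev *d)
    {
        if (d->fs)
            pthread_mutex_lock(&d->fs->sb_lock);

        mark_superblock(d);

        if (d->fs)
            pthread_mutex_unlock(&d->fs->sb_lock);
    }

    int main(void)
    {
        struct fs  f = { .sb_lock = PTHREAD_MUTEX_INITIALIZER };
        struct dev attached   = { .fs = &f };
        struct dev standalone = { .fs = NULL };

        attach_bdev(&standalone);   /* no locking */
        attach_bdev(&attached);     /* takes f.sb_lock */
        return 0;
    }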
member info for new disk: */ - dev_mi = bch2_sb_get_members(sb.sb); - saved_mi = dev_mi->members[sb.sb->dev_idx]; - saved_mi.last_mount = cpu_to_le64(ktime_get_seconds()); + err = "insufficient space in new superblock"; + ret = bch2_sb_from_fs(c, ca); + if (ret) + goto err_unlock; + + mi = bch2_sb_get_members(ca->disk_sb.sb); + + if (!bch2_sb_resize_members(&ca->disk_sb, + le32_to_cpu(mi->field.u64s) + + sizeof(dev_mi) / sizeof(u64))) { + ret = -ENOSPC; + goto err_unlock; + } if (dynamic_fault("bcachefs:add:no_slot")) goto no_slot; - mi = bch2_sb_get_members(c->disk_sb); + mi = bch2_sb_get_members(c->disk_sb.sb); for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) - if (!bch2_dev_exists(c->disk_sb, mi, dev_idx)) + if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) goto have_slot; no_slot: err = "no slots available in superblock"; @@ -1533,64 +1566,47 @@ int bch2_dev_add(struct bch_fs *c, const char *path) nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); u64s = (sizeof(struct bch_sb_field_members) + sizeof(struct bch_member) * nr_devices) / sizeof(u64); - err = "no space in superblock for member info"; - dev_mi = bch2_sb_resize_members(&sb, u64s); - if (!dev_mi) - goto err_unlock; + err = "no space in superblock for member info"; + ret = -ENOSPC; - mi = bch2_fs_sb_resize_members(c, u64s); + mi = bch2_sb_resize_members(&c->disk_sb, u64s); if (!mi) goto err_unlock; - memcpy(dev_mi, mi, u64s * sizeof(u64)); - dev_mi->members[dev_idx] = saved_mi; + /* success: */ - sb.sb->uuid = c->disk_sb->uuid; - sb.sb->dev_idx = dev_idx; - sb.sb->nr_devices = nr_devices; + mi->members[dev_idx] = dev_mi; + mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_seconds()); + c->disk_sb.sb->nr_devices = nr_devices; - /* commit new member info */ - memcpy(mi, dev_mi, u64s * sizeof(u64)); - c->disk_sb->nr_devices = nr_devices; - c->sb.nr_devices = nr_devices; + ca->disk_sb.sb->dev_idx = dev_idx; + bch2_dev_attach(c, ca, dev_idx); bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (bch2_dev_alloc(c, dev_idx)) { - err = "cannot allocate memory"; - ret = -ENOMEM; - goto err; - } - - if (__bch2_dev_online(c, &sb)) { - err = "bch2_dev_online() error"; - ret = -ENOMEM; - goto err; - } - - ca = bch_dev_locked(c, dev_idx); if (ca->mi.state == BCH_MEMBER_STATE_RW) { err = __bch2_dev_read_write(c, ca); if (err) - goto err; - - err = "journal alloc failed"; - if (bch2_dev_journal_alloc(c, ca)) - goto err; + goto err_late; } mutex_unlock(&c->state_lock); return 0; + err_unlock: mutex_unlock(&c->sb_lock); -err: mutex_unlock(&c->state_lock); +err: + if (ca) + bch2_dev_free(ca); bch2_free_super(&sb); - bch_err(c, "Unable to add device: %s", err); - return ret ?: -EINVAL; + return ret; +err_late: + bch_err(c, "Error going rw after adding device: %s", err); + return -EINVAL; } /* Hot add existing device to running filesystem: */ @@ -1613,12 +1629,12 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; - err = bch2_dev_in_fs(c->disk_sb, sb.sb); + err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); if (err) goto err; - if (__bch2_dev_online(c, &sb)) { - err = "__bch2_dev_online() error"; + if (bch2_dev_attach_bdev(c, &sb)) { + err = "bch2_dev_attach_bdev() error"; goto err; } @@ -1688,7 +1704,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) } mutex_lock(&c->sb_lock); - mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; mi->nbuckets = cpu_to_le64(nbuckets); bch2_write_super(c); @@ 
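bch2_dev_add() now copies the new member entry out of the incoming superblock up front, rebuilds the device's own superblock from the filesystem's with bch2_sb_from_fs(), and only then hunts for a free member slot — one where bch2_dev_exists() reports no device. The remove path above tombstones a slot by zeroing its uuid, so a plausible standalone sketch of the scan looks like this (toy member type; the real bch2_dev_exists() check isn't shown in this patch):

    #include <stdbool.h>
    #include <stdio.h>

    struct member { unsigned char uuid[16]; };

    /* a zeroed uuid marks a tombstoned slot -- mirroring how
     * bch2_dev_remove() clears mi->members[dev_idx].uuid above */
    static bool member_exists(const struct member *m)
    {
        for (int i = 0; i < 16; i++)
            if (m->uuid[i])
                return true;
        return false;
    }

    static int find_free_slot(const struct member *members, unsigned nr)
    {
        for (unsigned i = 0; i < nr; i++)
            if (!member_exists(&members[i]))
                return (int) i;
        return -1;  /* caller grows the member array or fails with ENOSPC */
    }

    int main(void)
    {
        struct member m[4] = {
            [0] = { .uuid = { 1 } },
            [2] = { .uuid = { 2 } },
        };
        printf("first free slot: %d\n", find_free_slot(m, 4));  /* 1 */
        return 0;
    }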
-1721,74 +1737,6 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) return ca; } -int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *label) -{ - struct bch_sb_field_disk_groups *groups; - struct bch_disk_group *g; - struct bch_member *mi; - unsigned i, v, nr_groups; - int ret; - - if (strlen(label) > BCH_SB_LABEL_SIZE) - return -EINVAL; - - mutex_lock(&c->sb_lock); - groups = bch2_sb_get_disk_groups(c->disk_sb); - nr_groups = disk_groups_nr(groups); - - if (!strcmp(label, "none")) { - v = 0; - goto write_sb; - } - - ret = __bch2_disk_group_find(groups, label); - if (ret >= 0) { - v = ret + 1; - goto write_sb; - } - - /* not found - create a new disk group: */ - - for (i = 0; - i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); - i++) - ; - - if (i == nr_groups) { - unsigned u64s = - (sizeof(struct bch_sb_field_disk_groups) + - sizeof(struct bch_disk_group) * (nr_groups + 1)) / - sizeof(u64); - - groups = bch2_fs_sb_resize_disk_groups(c, u64s); - if (!groups) { - mutex_unlock(&c->sb_lock); - return -ENOSPC; - } - - nr_groups = disk_groups_nr(groups); - } - - BUG_ON(i >= nr_groups); - - g = &groups->entries[i]; - v = i + 1; - - memcpy(g->label, label, strlen(label)); - if (strlen(label) < sizeof(g->label)) - g->label[strlen(label)] = '\0'; - SET_BCH_GROUP_DELETED(g, 0); - SET_BCH_GROUP_DATA_ALLOWED(g, ~0); -write_sb: - mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx]; - SET_BCH_MEMBER_GROUP(mi, v); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return 0; -} - /* Filesystem open: */ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, @@ -1845,7 +1793,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, err = "bch2_dev_online() error"; mutex_lock(&c->state_lock); for (i = 0; i < nr_devices; i++) - if (__bch2_dev_online(c, &sb[i])) { + if (bch2_dev_attach_bdev(c, &sb[i])) { mutex_unlock(&c->state_lock); goto err_print; } @@ -1856,15 +1804,10 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err_print; if (!c->opts.nostart) { - err = __bch2_fs_start(c); + err = bch2_fs_start(c); if (err) goto err_print; } - - err = bch2_fs_online(c); - if (err) - goto err_print; - out: kfree(sb); module_put(THIS_MODULE); @@ -1900,7 +1843,7 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, if (c) { closure_get(&c->cl); - err = bch2_dev_in_fs(c->disk_sb, sb->sb); + err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); if (err) goto err; } else { @@ -1915,22 +1858,18 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, err = "bch2_dev_online() error"; mutex_lock(&c->sb_lock); - if (__bch2_dev_online(c, sb)) { + if (bch2_dev_attach_bdev(c, sb)) { mutex_unlock(&c->sb_lock); goto err; } mutex_unlock(&c->sb_lock); if (!c->opts.nostart && bch2_fs_may_start(c)) { - err = __bch2_fs_start(c); + err = bch2_fs_start(c); if (err) goto err; } - err = __bch2_fs_online(c); - if (err) - goto err; - closure_put(&c->cl); mutex_unlock(&bch_fs_list_lock); diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 652a572f..a52ee3bb 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -195,7 +195,6 @@ int bch2_dev_online(struct bch_fs *, const char *); int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); -int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); bool 
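The deleted bch2_dev_group_set() (relocated to the new disk_groups.c, per the diffstat) is a compact example of the find-or-create pattern the label code uses: look the label up, otherwise reuse the first deleted slot, otherwise grow the array; and because 0 means "no group", stored values are the entry index plus one. A standalone sketch of that encoding (illustrative flat array; the relocated code also gains a parent field for nesting, per the super_types.h hunk below):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct group { char label[32]; bool deleted; };

    /* returns the on-disk encoding: 0 = no group, otherwise index + 1 */
    static unsigned group_find_or_create(struct group *groups, unsigned nr,
                                         const char *label)
    {
        unsigned i;

        if (!strcmp(label, "none"))
            return 0;

        for (i = 0; i < nr; i++)
            if (!groups[i].deleted && !strcmp(groups[i].label, label))
                return i + 1;

        /* scan past live entries to the first reusable (deleted) slot */
        for (i = 0; i < nr && !groups[i].deleted; i++)
            ;
        if (i == nr)
            return 0;   /* caller must resize the array; elided here */

        strncpy(groups[i].label, label, sizeof(groups[i].label) - 1);
        groups[i].label[sizeof(groups[i].label) - 1] = '\0';
        groups[i].deleted = false;
        return i + 1;
    }

    int main(void)
    {
        struct group g[2] = { { "ssd", false }, { "", true } };
        printf("%u\n", group_find_or_create(g, 2, "hdd"));  /* reuses slot 1, prints 2 */
        return 0;
    }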
bch2_fs_emergency_read_only(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index f5468182..ab83ade9 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -7,6 +7,9 @@ struct bch_sb_handle { struct bio *bio; unsigned page_order; fmode_t mode; + unsigned have_layout:1; + unsigned have_bio:1; + unsigned fs_sb:1; }; struct bch_devs_mask { @@ -44,8 +47,9 @@ struct bch_replicas_cpu { }; struct bch_disk_group_cpu { - struct bch_devs_mask devs; bool deleted; + u16 parent; + struct bch_devs_mask devs; }; struct bch_disk_groups_cpu { diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 82457348..e8089db9 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -18,11 +18,13 @@ #include "btree_update_interior.h" #include "btree_gc.h" #include "buckets.h" +#include "disk_groups.h" #include "inode.h" #include "journal.h" #include "keylist.h" #include "move.h" #include "opts.h" +#include "replicas.h" #include "super-io.h" #include "tier.h" @@ -140,10 +142,10 @@ read_attribute(first_bucket); read_attribute(nbuckets); read_attribute(durability); read_attribute(iostats); -read_attribute(read_priority_stats); -read_attribute(write_priority_stats); -read_attribute(fragmentation_stats); -read_attribute(oldest_gen_stats); +read_attribute(last_read_quantiles); +read_attribute(last_write_quantiles); +read_attribute(fragmentation_quantiles); +read_attribute(oldest_gen_quantiles); read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); @@ -167,7 +169,7 @@ rw_attribute(journal_reclaim_delay_ms); rw_attribute(discard); rw_attribute(cache_replacement_policy); -rw_attribute(group); +rw_attribute(label); rw_attribute(copy_gc_enabled); sysfs_pd_controller_attribute(copy_gc); @@ -546,7 +548,7 @@ STORE(bch2_fs_opts_dir) if (opt->set_sb != SET_NO_SB_OPT) { mutex_lock(&c->sb_lock); - opt->set_sb(c->disk_sb, v); + opt->set_sb(c->disk_sb.sb, v); bch2_write_super(c); mutex_unlock(&c->sb_lock); } @@ -621,36 +623,41 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -typedef unsigned (bucket_map_fn)(struct bch_dev *, size_t, void *); +typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, + size_t, void *); -static unsigned bucket_priority_fn(struct bch_dev *ca, size_t b, - void *private) +static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, + size_t b, void *private) { - struct bucket *g = bucket(ca, b); int rw = (private ? 
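Two details worth noting in the sysfs hunk above: the old sort() comparator was a GCC nested function (which needs an executable-stack trampoline when its address is taken), and it compared by subtraction, which is wrong for unsigned values because the difference wraps rather than going negative; it also sorted descending, where the new file-scope unsigned_cmp() sorts ascending using the standard overflow-safe idiom (l > r) - (l < r). Standalone demonstration with qsort():

    #include <stdio.h>
    #include <stdlib.h>

    static int unsigned_cmp(const void *_l, const void *_r)
    {
        unsigned l = *(const unsigned *) _l;
        unsigned r = *(const unsigned *) _r;

        /* returns -1, 0 or 1; `l - r` would be wrong here, e.g.
         * 0u - 1u wraps to UINT_MAX, which reads as "greater than" */
        return (l > r) - (l < r);
    }

    int main(void)
    {
        unsigned v[] = { 3, 0, 4000000000u, 7 };

        qsort(v, sizeof(v) / sizeof(v[0]), sizeof(v[0]), unsigned_cmp);
        for (unsigned i = 0; i < 4; i++)
            printf("%u\n", v[i]);       /* 0 3 7 4000000000 */
        return 0;
    }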
1 : 0); - return ca->fs->prio_clock[rw].hand - g->prio[rw]; + return bucket_last_io(c, bucket(ca, b), rw); } -static unsigned bucket_sectors_used_fn(struct bch_dev *ca, size_t b, - void *private) +static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, + size_t b, void *private) { struct bucket *g = bucket(ca, b); return bucket_sectors_used(g->mark); } -static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, size_t b, - void *private) +static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, + size_t b, void *private) { return bucket_gc_gen(ca, b); } -static ssize_t show_quantiles(struct bch_dev *ca, char *buf, - bucket_map_fn *fn, void *private) +static int unsigned_cmp(const void *_l, const void *_r) { - int cmp(const void *l, const void *r) - { return *((unsigned *) r) - *((unsigned *) l); } + unsigned l = *((unsigned *) _l); + unsigned r = *((unsigned *) _r); + + return (l > r) - (l < r); +} +static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, + char *buf, bucket_map_fn *fn, void *private) +{ size_t i, n; /* Compute 31 quantiles */ unsigned q[31], *p; @@ -666,9 +673,9 @@ static ssize_t show_quantiles(struct bch_dev *ca, char *buf, } for (i = ca->mi.first_bucket; i < n; i++) - p[i] = fn(ca, i, private); + p[i] = fn(c, ca, i, private); - sort(p, n, sizeof(unsigned), cmp, NULL); + sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); up_read(&ca->bucket_lock); while (n && @@ -804,24 +811,18 @@ SHOW(bch2_dev) sysfs_print(durability, ca->mi.durability); sysfs_print(discard, ca->mi.discard); - if (attr == &sysfs_group) { - struct bch_sb_field_disk_groups *groups; - struct bch_disk_group *g; - unsigned len; - - if (!ca->mi.group) - return scnprintf(out, end - out, "none\n"); - - mutex_lock(&c->sb_lock); - groups = bch2_sb_get_disk_groups(c->disk_sb); - - g = &groups->entries[ca->mi.group - 1]; - len = strnlen(g->label, sizeof(g->label)); - memcpy(buf, g->label, len); - mutex_unlock(&c->sb_lock); + if (attr == &sysfs_label) { + if (ca->mi.group) { + mutex_lock(&c->sb_lock); + out += bch2_disk_path_print(&c->disk_sb, out, end - out, + ca->mi.group - 1); + mutex_unlock(&c->sb_lock); + } else { + out += scnprintf(out, end - out, "none"); + } - buf[len++] = '\n'; - return len; + out += scnprintf(out, end - out, "\n"); + return out - buf; } if (attr == &sysfs_has_data) { @@ -852,14 +853,16 @@ SHOW(bch2_dev) if (attr == &sysfs_iostats) return show_dev_iostats(ca, buf); - if (attr == &sysfs_read_priority_stats) - return show_quantiles(ca, buf, bucket_priority_fn, (void *) 0); - if (attr == &sysfs_write_priority_stats) - return show_quantiles(ca, buf, bucket_priority_fn, (void *) 1); - if (attr == &sysfs_fragmentation_stats) - return show_quantiles(ca, buf, bucket_sectors_used_fn, NULL); - if (attr == &sysfs_oldest_gen_stats) - return show_quantiles(ca, buf, bucket_oldest_gen_fn, NULL); + + if (attr == &sysfs_last_read_quantiles) + return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0); + if (attr == &sysfs_last_write_quantiles) + return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1); + if (attr == &sysfs_fragmentation_quantiles) + return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL); + if (attr == &sysfs_oldest_gen_quantiles) + return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL); + if (attr == &sysfs_reserve_stats) return show_reserve_stats(ca, buf); if (attr == &sysfs_alloc_debug) @@ -880,7 +883,7 @@ STORE(bch2_dev) bool v = strtoul_or_return(buf); mutex_lock(&c->sb_lock); - mi = 
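show_quantiles() keeps its overall shape — sample one value per bucket, sort, report a 31-point summary — but now threads struct bch_fs through to the per-bucket callbacks, since bucket_last_io() needs filesystem-wide clock state rather than the per-device prio hands this patch removes. The quantile-extraction step itself isn't visible in this hunk; one conventional way to pick evenly spaced quantiles from n sorted samples (an assumption for illustration, not necessarily the exact arithmetic bcachefs uses):

    #include <stdio.h>

    #define NQ 31   /* "Compute 31 quantiles", per the comment in show_quantiles() */

    /* pick NQ evenly spaced order statistics from n sorted samples */
    static void quantiles(const unsigned *sorted, size_t n, unsigned out[NQ])
    {
        for (size_t i = 0; i < NQ; i++)
            out[i] = sorted[(i + 1) * n / (NQ + 1)];
    }

    int main(void)
    {
        unsigned sorted[100], out[NQ];

        for (unsigned i = 0; i < 100; i++)
            sorted[i] = i;

        quantiles(sorted, 100, out);
        printf("median ~ %u\n", out[NQ / 2]);   /* out[15] = sorted[50] = 50 */
        return 0;
    }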
&bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; if (v != BCH_MEMBER_DISCARD(mi)) { SET_BCH_MEMBER_DISCARD(mi, v); @@ -896,7 +899,7 @@ STORE(bch2_dev) return v; mutex_lock(&c->sb_lock); - mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { SET_BCH_MEMBER_REPLACEMENT(mi, v); @@ -905,7 +908,7 @@ STORE(bch2_dev) mutex_unlock(&c->sb_lock); } - if (attr == &sysfs_group) { + if (attr == &sysfs_label) { char *tmp; int ret; @@ -938,16 +941,16 @@ struct attribute *bch2_dev_files[] = { &sysfs_discard, &sysfs_cache_replacement_policy, &sysfs_state_rw, - &sysfs_group, + &sysfs_label, &sysfs_has_data, &sysfs_iostats, /* alloc info - other stats: */ - &sysfs_read_priority_stats, - &sysfs_write_priority_stats, - &sysfs_fragmentation_stats, - &sysfs_oldest_gen_stats, + &sysfs_last_read_quantiles, + &sysfs_last_write_quantiles, + &sysfs_fragmentation_quantiles, + &sysfs_oldest_gen_quantiles, &sysfs_reserve_stats, /* debug: */ diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c index 211a844c..a15a0fa9 100644 --- a/libbcachefs/tier.c +++ b/libbcachefs/tier.c @@ -4,6 +4,7 @@ #include "btree_iter.h" #include "buckets.h" #include "clock.h" +#include "disk_groups.h" #include "extents.h" #include "io.h" #include "move.h" diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 81e942e5..79a98f75 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -86,8 +86,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { .cmp_bkey = xattr_cmp_bkey, }; -static const char *bch2_xattr_invalid(const struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) { const struct xattr_handler *handler; struct bkey_s_c_xattr xattr; @@ -126,8 +125,8 @@ static const char *bch2_xattr_invalid(const struct bch_fs *c, } } -static void bch2_xattr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_xattr_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { const struct xattr_handler *handler; struct bkey_s_c_xattr xattr; @@ -159,11 +158,6 @@ static void bch2_xattr_to_text(struct bch_fs *c, char *buf, } } -const struct bkey_ops bch2_bkey_xattr_ops = { - .key_invalid = bch2_xattr_invalid, - .val_to_text = bch2_xattr_to_text, -}; - int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, const char *name, void *buffer, size_t size, int type) { diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h index 9c815a2d..a58e7e30 100644 --- a/libbcachefs/xattr.h +++ b/libbcachefs/xattr.h @@ -4,7 +4,14 @@ #include "str_hash.h" extern const struct bch_hash_desc bch2_xattr_hash_desc; -extern const struct bkey_ops bch2_bkey_xattr_ops; + +const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); + +#define bch2_bkey_xattr_ops (struct bkey_ops) { \ + .key_invalid = bch2_xattr_invalid, \ + .val_to_text = bch2_xattr_to_text, \ +} struct dentry; struct xattr_handler;
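Finally, the xattr key methods are no longer packaged as a const struct object in xattr.c: the two functions go public and xattr.h wraps them in a macro expanding to a C99 compound literal with designated initializers, so each user of bch2_bkey_xattr_ops gets the ops table by value (presumably so bkey_methods.c, also touched by this patch, can aggregate the per-type tables into its own array). A standalone illustration of the construct, with toy types in place of bkey_ops:

    #include <stdio.h>

    struct ops {
        const char *(*key_invalid)(int key);
        void        (*to_text)(char *buf, int key);
    };

    static const char *toy_invalid(int key)
    {
        return key < 0 ? "negative key" : NULL;
    }

    static void toy_to_text(char *buf, int key)
    {
        sprintf(buf, "key %d", key);
    }

    /* expands to an unnamed struct value at each use site; members not
     * named in the designated initializer are zeroed */
    #define toy_ops (struct ops) {          \
        .key_invalid = toy_invalid,         \
        .to_text     = toy_to_text,         \
    }

    int main(void)
    {
        struct ops o = toy_ops;
        char buf[32];

        o.to_text(buf, 42);
        printf("%s; invalid(-1): %s\n", buf, o.key_invalid(-1));
        return 0;
    }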