New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
VSR: Ping releases #1670
VSR: Ping releases #1670
Changes from 5 commits
0319566
8a058d8
e1c4aff
5add306
135ba32
5beb02c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -291,7 +291,7 @@ pub const Header = extern struct { | |
checksum_body_padding: u128 = 0, | ||
nonce_reserved: u128 = 0, | ||
cluster: u128, | ||
size: u32 = @sizeOf(Header), | ||
size: u32, | ||
epoch: u32 = 0, | ||
// NB: unlike every other message, pings and pongs use on disk view, rather than in-memory | ||
// view, to avoid disrupting clock synchronization while the view is being updated. | ||
|
@@ -308,13 +308,23 @@ pub const Header = extern struct { | |
checkpoint_op: u64, | ||
|
||
ping_timestamp_monotonic: u64, | ||
release_count: u16, | ||
|
||
reserved: [96]u8 = [_]u8{0} ** 96, | ||
reserved: [94]u8 = [_]u8{0} ** 94, | ||
|
||
fn invalid_header(self: *const @This()) ?[]const u8 { | ||
assert(self.command == .ping); | ||
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)"; | ||
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected"; | ||
if (self.size <= @sizeOf(Header)) return "size <= @sizeOf(Header)"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the code to validate the body of a ping is missing? I.e, checking that the releases are strictly sorted. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function doesn't have access to the message body. We do assert the order in |
||
if (self.size > @sizeOf(Header) + @sizeOf(u16) * constants.vsr_releases_max) { | ||
return "size > limit"; | ||
} | ||
if (self.release_count == 0) return "release_count == 0"; | ||
if (self.release_count > constants.vsr_releases_max) { | ||
return "release_count > vsr_releases_max"; | ||
} | ||
if (self.size != @sizeOf(Header) + self.release_count * @sizeOf(u16)) { | ||
return "size != @sizeOf(Header) + release_count * @sizeOf(u16)"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe add a separate check that I think the code is correct, but if, eg the |
||
} | ||
if (self.release == 0) return "release == 0"; | ||
if (!vsr.Checkpoint.valid(self.checkpoint_op)) return "checkpoint_op invalid"; | ||
if (self.ping_timestamp_monotonic == 0) return "ping_timestamp_monotonic != expected"; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -210,6 +210,14 @@ pub fn ReplicaType( | |
/// It should never be modified by a running replica. | ||
release_client_min: u16, | ||
|
||
/// A list of all versions of code that are available in the current binary. | ||
/// Includes the current version, newer versions, and older versions. | ||
/// Ordered from lowest/oldest to highest/newest. | ||
/// | ||
/// Note that this is a property (rather than a constant) for the purpose of testing. | ||
/// It should never be modified for a running replica. | ||
releases_bundled: vsr.ReleaseList, | ||
|
||
/// A globally unique integer generated by a crypto rng during replica process startup. | ||
/// Presently, it is used to detect outdated start view messages in recovering head status. | ||
nonce: Nonce, | ||
|
@@ -266,6 +274,17 @@ pub fn ReplicaType( | |
/// - If syncing≠idle then sync_tables=null. | ||
sync_tables: ?ForestTableIterator = null, | ||
|
||
/// The latest release list from every other replica. (Constructed from pings.) | ||
/// | ||
/// Invariants: | ||
/// - upgrade_targets[self.replica] = null | ||
/// - upgrade_targets[*].releases > release | ||
upgrade_targets: [constants.replicas_max]?struct { | ||
checkpoint: u64, | ||
view: u32, | ||
releases: vsr.ReleaseList, | ||
} = .{null} ** constants.replicas_max, | ||
|
||
/// The current view. | ||
/// Initialized from the superblock's VSRState. | ||
/// | ||
|
@@ -487,6 +506,7 @@ pub fn ReplicaType( | |
grid_cache_blocks_count: u32 = Grid.Cache.value_count_max_multiple, | ||
release: u16, | ||
release_client_min: u16, | ||
releases_bundled: []const u16, | ||
}; | ||
|
||
/// Initializes and opens the provided replica using the options. | ||
|
@@ -548,6 +568,7 @@ pub fn ReplicaType( | |
.grid_cache_blocks_count = options.grid_cache_blocks_count, | ||
.release = options.release, | ||
.release_client_min = options.release_client_min, | ||
.releases_bundled = options.releases_bundled, | ||
}); | ||
|
||
// Disable all dynamic allocation from this point onwards. | ||
|
@@ -847,6 +868,7 @@ pub fn ReplicaType( | |
grid_cache_blocks_count: u32, | ||
release: u16, | ||
release_client_min: u16, | ||
releases_bundled: []const u16, | ||
}; | ||
|
||
/// NOTE: self.superblock must be initialized and opened prior to this call. | ||
|
@@ -889,6 +911,9 @@ pub fn ReplicaType( | |
// Flexible quorums are safe if these two quorums intersect so that this relation holds: | ||
assert(quorum_replication + quorum_view_change > replica_count); | ||
|
||
vsr.verify_release_list(options.releases_bundled); | ||
assert(std.mem.indexOfScalar(u16, options.releases_bundled, options.release) != null); | ||
|
||
self.time = options.time; | ||
|
||
// The clock is special-cased for standbys. We want to balance two concerns: | ||
|
@@ -976,6 +1001,9 @@ pub fn ReplicaType( | |
.quorum_majority = quorum_majority, | ||
.release = options.release, | ||
.release_client_min = options.release_client_min, | ||
.releases_bundled = vsr.ReleaseList.from_slice( | ||
options.releases_bundled, | ||
) catch unreachable, | ||
.nonce = options.nonce, | ||
// Copy the (already-initialized) time back, to avoid regressing the monotonic | ||
// clock guard. | ||
|
@@ -1295,6 +1323,29 @@ pub fn ReplicaType( | |
.ping_timestamp_monotonic = message.header.ping_timestamp_monotonic, | ||
.pong_timestamp_wall = @bitCast(self.clock.realtime()), | ||
})); | ||
|
||
if (message.header.replica < self.replica_count) { | ||
const upgrade_targets = &self.upgrade_targets[message.header.replica]; | ||
if (upgrade_targets.* == null or | ||
(upgrade_targets.*.?.checkpoint <= message.header.checkpoint_op and | ||
upgrade_targets.*.?.view <= message.header.view)) | ||
{ | ||
upgrade_targets.* = .{ | ||
.checkpoint = message.header.checkpoint_op, | ||
.view = message.header.view, | ||
.releases = .{}, | ||
}; | ||
|
||
const releases = std.mem.bytesAsSlice(u16, message.body()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. assert that releases are sorted. |
||
assert(releases.len == message.header.release_count); | ||
vsr.verify_release_list(releases); | ||
for (releases) |release| { | ||
if (release > self.release) { | ||
upgrade_targets.*.?.releases.append_assume_capacity(release); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think this can fire? We have a bound for a single message, but different ping messages might have originated from different binaries, so their union might actually exceed what we have here. This is of course and edge case, but there's a related realistic problem: imagine if you run a binary with a new release on one replica, but then retire it in favor of the old binary. The knowledge that that replica has a new release persists until the viewchange/checkpoint boundary, although the actual binary does not support that version... It feels like just replacing the whole set of targets might be safer? But then again, that assumes that the network drops all old messages eventually. Hm, I think I really don't like the union behavior because it is guaranteed to be wrong: if we have two pings with different release sets, then what we support is one release set or the other release set. When we union the two, we are guaranteed to have an unsupported release in there? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We replace the entire list every time, there is no union:
above resets the list to empty before we append anything. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤦 gotta grab one more pu-erh cup as I don't seem to be sufficiently woken up, sorry! |
||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
fn on_pong(self: *Self, message: *const Message.Pong) void { | ||
|
@@ -2570,19 +2621,30 @@ pub fn ReplicaType( | |
fn on_ping_timeout(self: *Self) void { | ||
self.ping_timeout.reset(); | ||
|
||
var ping = Header.Ping{ | ||
const message = self.message_bus.pool.get_message(.ping); | ||
defer self.message_bus.unref(message); | ||
|
||
message.header.* = Header.Ping{ | ||
.command = .ping, | ||
.size = @sizeOf(Header) + @sizeOf(u16) * self.releases_bundled.count_as(u16), | ||
.cluster = self.cluster, | ||
.replica = self.replica, | ||
.view = self.view_durable(), // Don't drop pings while the view is being updated. | ||
.release = self.release, | ||
.checkpoint_id = self.superblock.working.checkpoint_id(), | ||
.checkpoint_op = self.op_checkpoint(), | ||
.ping_timestamp_monotonic = self.clock.monotonic(), | ||
.release_count = self.releases_bundled.count_as(u16), | ||
}; | ||
assert(ping.view <= self.view); | ||
|
||
self.send_header_to_other_replicas_and_standbys(ping.frame_const().*); | ||
const ping_versions = std.mem.bytesAsSlice(u16, message.body()); | ||
stdx.copy_disjoint(.exact, u16, ping_versions, self.releases_bundled.const_slice()); | ||
message.header.set_checksum_body(message.body()); | ||
message.header.set_checksum(); | ||
|
||
assert(message.header.view <= self.view); | ||
|
||
self.send_message_to_other_replicas_and_standbys(message.base()); | ||
} | ||
|
||
fn on_prepare_timeout(self: *Self) void { | ||
|
@@ -6595,6 +6657,15 @@ pub fn ReplicaType( | |
self.send_message_to_other_replicas_base(message.base()); | ||
} | ||
|
||
fn send_message_to_other_replicas_and_standbys(self: *Self, message: *Message) void { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it is due to |
||
var replica: u8 = 0; | ||
while (replica < self.node_count) : (replica += 1) { | ||
if (replica != self.replica) { | ||
self.send_message_to_replica_base(replica, message); | ||
} | ||
} | ||
} | ||
|
||
fn send_message_to_other_replicas_base(self: *Self, message: *Message) void { | ||
var replica: u8 = 0; | ||
while (replica < self.replica_count) : (replica += 1) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
vsr_releases_max
is a new config/constant which defines the maximum number of releases that can be advertised by a replica in aping
message. (So this is effectively the maximum number of releases that can be compiled into a multiversion binary).I set it to
64
right now, which I chose somewhat arbitrarily: We release on a weekly cadence, there are 52 weeks/year, rounded up to the nearest power-of-2 is 64.