Skip to content

Commit

Permalink
xlog: add request details to panic message for broken LSN
Browse files Browse the repository at this point in the history
Aid the debugging of replication issues related to out-of-order
requests. Adds the details of request/tuple to the diagnostic
message whenever possible.

Closes #3105
  • Loading branch information
ztarvos authored and locker committed Aug 29, 2018
1 parent 92d1e02 commit 48f5555
Show file tree
Hide file tree
Showing 12 changed files with 331 additions and 19 deletions.
6 changes: 2 additions & 4 deletions src/box/applier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -336,8 +336,7 @@ applier_join(struct applier *applier)
coio_read_xrow(coio, ibuf, &row);
applier->last_row_time = ev_monotonic_now(loop());
if (iproto_type_is_dml(row.type)) {
vclock_follow(&replicaset.vclock, row.replica_id,
row.lsn);
vclock_follow_xrow(&replicaset.vclock, &row);
xstream_write_xc(applier->subscribe_stream, &row);
} else if (row.type == IPROTO_OK) {
/*
Expand Down Expand Up @@ -503,8 +502,7 @@ applier_subscribe(struct applier *applier)
* the row is skipped when the replication
* is resumed.
*/
vclock_follow(&replicaset.vclock, row.replica_id,
row.lsn);
vclock_follow_xrow(&replicaset.vclock, &row);
struct replica *replica = replica_by_id(row.replica_id);
struct latch *latch = (replica ? &replica->order_latch :
&replicaset.applier.order_latch);
Expand Down
4 changes: 2 additions & 2 deletions src/box/recovery.cc
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ recovery_scan(struct recovery *r, struct vclock *end_vclock)
return;
struct xrow_header row;
while (xlog_cursor_next(&cursor, &row, true) == 0)
vclock_follow(end_vclock, row.replica_id, row.lsn);
vclock_follow_xrow(end_vclock, &row);
xlog_cursor_close(&cursor, false);
}

Expand Down Expand Up @@ -266,7 +266,7 @@ recover_xlog(struct recovery *r, struct xstream *stream,
* in case of forced recovery, when we skip the
* failed row anyway.
*/
vclock_follow(&r->vclock, row.replica_id, row.lsn);
vclock_follow_xrow(&r->vclock, &row);
if (xstream_write(stream, &row) == 0) {
++row_count;
if (row_count % 100000 == 0)
Expand Down
7 changes: 7 additions & 0 deletions src/box/relay.cc
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,13 @@ relay_send_row(struct xstream *stream, struct xrow_header *packet)
if (packet->replica_id != relay->replica->id ||
packet->lsn <= vclock_get(&relay->local_vclock_at_subscribe,
packet->replica_id)) {
struct errinj *inj = errinj(ERRINJ_RELAY_BREAK_LSN,
ERRINJ_INT);
if (inj != NULL && packet->lsn == inj->iparam) {
packet->lsn = inj->iparam - 1;
say_warn("injected broken lsn: %lld",
(long long) packet->lsn);
}
relay_send(relay, packet);
}
}
8 changes: 1 addition & 7 deletions src/box/vclock.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,7 @@ vclock_follow(struct vclock *vclock, uint32_t replica_id, int64_t lsn)
assert(lsn >= 0);
assert(replica_id < VCLOCK_MAX);
int64_t prev_lsn = vclock->lsn[replica_id];
if (lsn <= prev_lsn) {
/* Never confirm LSN out of order. */
panic("LSN for %u is used twice or COMMIT order is broken: "
"confirmed: %lld, new: %lld",
(unsigned) replica_id,
(long long) prev_lsn, (long long) lsn);
}
assert(lsn > prev_lsn);
/* Easier add each time than check. */
vclock->map |= 1 << replica_id;
vclock->lsn[replica_id] = lsn;
Expand Down
8 changes: 8 additions & 0 deletions src/box/vclock.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,14 @@ vclock_sum(const struct vclock *vclock)
return vclock->signature;
}

/**
* Update vclock with the next LSN value for given replica id.
*
* @param vclock Vector clock.
* @param replica_id Replica identifier.
* @param lsn Next lsn.
* @return previous lsn value.
*/
int64_t
vclock_follow(struct vclock *vclock, uint32_t replica_id, int64_t lsn);

Expand Down
14 changes: 9 additions & 5 deletions src/box/wal.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,12 @@ xlog_write_entry(struct xlog *l, struct journal_entry *entry)
struct xrow_header **row = entry->rows;
for (; row < entry->rows + entry->n_rows; row++) {
(*row)->tm = ev_now(loop());
struct errinj *inj = errinj(ERRINJ_WAL_BREAK_LSN, ERRINJ_INT);
if (inj != NULL && inj->iparam == (*row)->lsn) {
(*row)->lsn = inj->iparam - 1;
say_warn("injected broken lsn: %lld",
(long long) (*row)->lsn);
}
if (xlog_write_row(l, *row) < 0) {
/*
* Rollback all un-written rows
Expand Down Expand Up @@ -652,8 +658,7 @@ wal_assign_lsn(struct wal_writer *writer, struct xrow_header **row,
(*row)->lsn = vclock_inc(&writer->vclock, instance_id);
(*row)->replica_id = instance_id;
} else {
vclock_follow(&writer->vclock, (*row)->replica_id,
(*row)->lsn);
vclock_follow_xrow(&writer->vclock, *row);
}
}
}
Expand Down Expand Up @@ -878,9 +883,8 @@ wal_write(struct journal *journal, struct journal_entry *entry)
*/
if (vclock_get(&replicaset.vclock,
instance_id) < (*last)->lsn) {
vclock_follow(&replicaset.vclock,
instance_id,
(*last)->lsn);
vclock_follow_xrow(&replicaset.vclock,
*last);
}
break;
}
Expand Down
31 changes: 31 additions & 0 deletions src/box/xrow.h
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,37 @@ xrow_to_iovec(const struct xrow_header *row, struct iovec *out);
void
xrow_decode_error(struct xrow_header *row);

/**
* Update vclock with the next LSN value for given replica id.
* The function will cause panic if the next LSN happens to be
* out of order. The details of provided row are included into
* diagnostic message.
*
* @param vclock Vector clock.
* @param row Data row.
* @return Previous LSN value.
*/
static inline int64_t
vclock_follow_xrow(struct vclock* vclock, const struct xrow_header *row)
{
assert(row);
assert(row->replica_id < VCLOCK_MAX);
if (row->lsn <= vclock->lsn[row->replica_id]) {
struct request req;
const char *req_str = "n/a";
if (xrow_decode_dml((struct xrow_header *)row, &req, 0) == 0)
req_str = request_str(&req);
/* Never confirm LSN out of order. */
panic("LSN for %u is used twice or COMMIT order is broken: "
"confirmed: %lld, new: %lld, req: %s",
(unsigned) row->replica_id,
(long long) vclock->lsn[row->replica_id],
(long long) row->lsn,
req_str);
}
return vclock_follow(vclock, row->replica_id, row->lsn);
}

#if defined(__cplusplus)
} /* extern "C" */

Expand Down
2 changes: 2 additions & 0 deletions src/errinj.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ struct errinj {
_(ERRINJ_VY_LOG_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_VY_RUN_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_VY_INDEX_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_RELAY_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
_(ERRINJ_WAL_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \

ENUM0(errinj_id, ERRINJ_LIST);
extern struct errinj errinjs[];
Expand Down
4 changes: 4 additions & 0 deletions test/box/errinj.result
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,12 @@ errinj.info()
state: 0
ERRINJ_XLOG_META:
state: false
ERRINJ_WAL_BREAK_LSN:
state: -1
ERRINJ_WAL_WRITE_DISK:
state: false
ERRINJ_RELAY_BREAK_LSN:
state: -1
ERRINJ_VY_LOG_FILE_RENAME:
state: false
ERRINJ_VY_RUN_WRITE:
Expand Down
187 changes: 187 additions & 0 deletions test/xlog/panic_on_broken_lsn.result
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
-- Issue 3105: Test logging of request with broken lsn before panicking
-- Two cases are covered: Recovery and Joining a new replica
env = require('test_run')
---
...
test_run = env.new()
---
...
test_run:cleanup_cluster()
---
...
fio = require('fio')
---
...
test_run:cmd("setopt delimiter ';'")
---
- true
...
function grep_file_tail(filepath, bytes, pattern)
local fh = fio.open(filepath, {'O_RDONLY'})
local size = fh:seek(0, 'SEEK_END')
if size < bytes then
bytes = size
end
fh:seek(-bytes, 'SEEK_END')
local line = fh:read(bytes)
fh:close()
return string.match(line, pattern)
end;
---
...
test_run:cmd("setopt delimiter ''");
---
- true
...
-- Testing case of panic on recovery
test_run:cmd('create server panic with script="xlog/panic.lua"')
---
- true
...
test_run:cmd('start server panic')
---
- true
...
test_run:switch('panic')
---
- true
...
box.space._schema:replace{"t0", "v0"}
---
- ['t0', 'v0']
...
lsn = box.info.vclock[1]
---
...
box.error.injection.set("ERRINJ_WAL_BREAK_LSN", lsn + 1)
---
- ok
...
box.space._schema:replace{"t0", "v1"}
---
- ['t0', 'v1']
...
box.error.injection.set("ERRINJ_WAL_BREAK_LSN", -1)
---
- ok
...
test_run:switch('default')
---
- true
...
test_run:cmd('stop server panic')
---
- true
...
dirname = fio.pathjoin(fio.cwd(), "panic")
---
...
xlogs = fio.glob(dirname .. "/*.xlog")
---
...
fio.unlink(xlogs[#xlogs])
---
- true
...
test_run:cmd('start server panic with crash_expected=True')
---
- false
...
logpath = fio.pathjoin(fio.cwd(), 'panic.log')
---
...
-- Check that log contains the mention of broken LSN and the request printout
grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {.*}")
---
- 'LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {type:
''REPLACE'', lsn: 1, space_id: 272, index_id: 0, tuple: ["t0", "v1"]}'
...
test_run:cmd('cleanup server panic')
---
- true
...
test_run:cmd('delete server panic')
---
- true
...
-- Testing case of panic on joining a new replica
box.schema.user.grant('guest', 'replication')
---
...
_ = box.schema.space.create('test')
---
...
_ = box.space.test:create_index('pk')
---
...
box.space.test:auto_increment{'v0'}
---
- [1, 'v0']
...
lsn = box.info.vclock[1]
---
...
box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", lsn + 1)
---
- ok
...
box.space.test:auto_increment{'v1'}
---
- [2, 'v1']
...
test_run:cmd('create server replica with rpl_master=default, script="xlog/replica.lua"')
---
- true
...
test_run:cmd('start server replica with crash_expected=True')
---
- false
...
fiber = require('fiber')
---
...
while box.info.replication == nil do fiber.sleep(0.001) end
---
...
while box.info.replication[2].downstream.status ~= "stopped" do fiber.sleep(0.001) end
---
...
box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", -1)
---
- ok
...
logpath = fio.pathjoin(fio.cwd(), 'replica.log')
---
...
-- Check that log contains the mention of broken LSN and the request printout
test_run:cmd("push filter 'lsn: "..lsn..", space_id: [0-9]+' to 'lsn: <lsn>, space_id: <space_id>'")
---
- true
...
test_run:cmd("push filter 'confirmed: "..lsn..", new: "..lsn.."' to '<lsn>'")
---
- true
...
grep_file_tail(logpath, 256, "(LSN for 1 is used twice or COMMIT order is broken: confirmed: "..lsn..", new: "..lsn.."), req: ({.*})")
---
- 'LSN for 1 is used twice or COMMIT order is broken: <lsn>'
- '{type: ''INSERT'', lsn: <lsn>, space_id: <space_id>, index_id: 0, tuple: [2, "v1"]}'
...
test_run:cmd("clear filter")
---
- true
...
test_run:cmd('cleanup server replica')
---
- true
...
test_run:cmd('delete server replica')
---
- true
...
box.space.test:drop()
---
...
box.schema.user.revoke('guest', 'replication')
---
...

0 comments on commit 48f5555

Please sign in to comment.