
Commit 6998097

[release-15.0] VReplication Last Error: retry error if it happens after timeout (vitessio#12114) (vitessio#12302)
* Retry error if it happens after timeout
* Address review comments
* Address review comments
* Improve unit tests

Signed-off-by: Rohit Nayak <rohit@planetscale.com>
Co-authored-by: Rohit Nayak <rohit@planetscale.com>
1 parent b632186 commit 6998097
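
The diffs below are easier to follow with the intended behavior in mind: an error should block retries only once it has persisted, without a sufficiently long quiet gap, for maxTimeInError; if the same error recurs after such a gap, retries should resume. The following is a minimal, standalone Go sketch of that behavior, not the actual Vitess code: it compares errors by their string instead of vterrors.Equals, omits the log.Infof calls, and the main() demo and its names are illustrative only.

package main

// Simplified, standalone model of the lastError behavior this commit changes.
// The real type lives in go/vt/vttablet/tabletmanager/vreplication/last_error.go,
// logs via log.Infof, and compares errors with vterrors.Equals.

import (
	"errors"
	"fmt"
	"sync"
	"time"
)

type lastError struct {
	mu             sync.Mutex
	err            error
	firstSeen      time.Time     // when the current error was first recorded
	lastSeen       time.Time     // when the current error was most recently recorded
	maxTimeInError time.Duration // 0 means "always retry"
}

func (le *lastError) record(err error) {
	le.mu.Lock()
	defer le.mu.Unlock()
	if err == nil {
		// Success clears the error and both timestamps.
		le.err, le.firstSeen, le.lastSeen = nil, time.Time{}, time.Time{}
		return
	}
	if le.err == nil || le.err.Error() != err.Error() {
		// A new error starts both clocks.
		le.err, le.firstSeen, le.lastSeen = err, time.Now(), time.Now()
		return
	}
	// Same error again. If it last occurred more than maxTimeInError ago,
	// treat it as a fresh occurrence by resetting firstSeen; this is the
	// behavior change in this commit.
	if time.Since(le.lastSeen) > le.maxTimeInError {
		le.firstSeen = time.Now()
	}
	le.lastSeen = time.Now()
}

func (le *lastError) shouldRetry() bool {
	le.mu.Lock()
	defer le.mu.Unlock()
	if le.maxTimeInError == 0 || le.firstSeen.IsZero() {
		return true
	}
	return time.Since(le.firstSeen) <= le.maxTimeInError
}

func main() {
	le := &lastError{maxTimeInError: 100 * time.Millisecond}
	err := errors.New("transient failure")

	le.record(err)
	fmt.Println(le.shouldRetry()) // true: the error just started

	time.Sleep(150 * time.Millisecond)
	fmt.Println(le.shouldRetry()) // false: it has persisted past maxTimeInError

	// The same error recurring after a quiet gap longer than maxTimeInError
	// is treated as a new occurrence, so retries resume.
	le.record(err)
	fmt.Println(le.shouldRetry()) // true
}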

3 files changed: +80 / -35 lines changed

go/vt/vttablet/tabletmanager/vreplication/controller.go

Lines changed: 7 additions & 7 deletions

@@ -72,13 +72,12 @@ func newController(ctx context.Context, params map[string]string, dbClientFactor
 	}

 	ct := &controller{
-		vre:               vre,
-		dbClientFactory:   dbClientFactory,
-		mysqld:            mysqld,
-		blpStats:          blpStats,
-		done:              make(chan struct{}),
-		source:            &binlogdatapb.BinlogSource{},
-		lastWorkflowError: newLastError("VReplication Controller", maxTimeToRetryError),
+		vre:             vre,
+		dbClientFactory: dbClientFactory,
+		mysqld:          mysqld,
+		blpStats:        blpStats,
+		done:            make(chan struct{}),
+		source:          &binlogdatapb.BinlogSource{},
 	}
 	log.Infof("creating controller with cell: %v, tabletTypes: %v, and params: %v", cell, tabletTypesStr, params)

@@ -89,6 +88,7 @@ func newController(ctx context.Context, params map[string]string, dbClientFactor
 	}
 	ct.id = uint32(id)
 	ct.workflow = params["workflow"]
+	ct.lastWorkflowError = newLastError(fmt.Sprintf("VReplication controller %d for workflow %q", ct.id, ct.workflow), maxTimeToRetryError)

 	state := params["state"]
 	blpStats.State.Set(state)

go/vt/vttablet/tabletmanager/vreplication/last_error.go

Lines changed: 16 additions & 2 deletions

@@ -26,18 +26,20 @@ import (

 /*
  * lastError tracks the most recent error for any ongoing process and how long it has persisted.
- * The err field should be a vterror so as to ensure we have meaningful error codes, causes, stack
+ * The err field should be a vterror to ensure we have meaningful error codes, causes, stack
  * traces, etc.
 */
 type lastError struct {
 	name           string
 	err            error
 	firstSeen      time.Time
+	lastSeen       time.Time
 	mu             sync.Mutex
 	maxTimeInError time.Duration // if error persists for this long, shouldRetry() will return false
 }

 func newLastError(name string, maxTimeInError time.Duration) *lastError {
+	log.Infof("Created last error: %s, with maxTimeInError: %s", name, maxTimeInError)
 	return &lastError{
 		name:           name,
 		maxTimeInError: maxTimeInError,

@@ -48,15 +50,27 @@ func (le *lastError) record(err error) {
 	le.mu.Lock()
 	defer le.mu.Unlock()
 	if err == nil {
+		log.Infof("Resetting last error: %s", le.name)
 		le.err = nil
 		le.firstSeen = time.Time{}
+		le.lastSeen = time.Time{}
 		return
 	}
 	if !vterrors.Equals(err, le.err) {
+		log.Infof("Got new last error %+v for %s, was %+v", err, le.name, le.err)
 		le.firstSeen = time.Now()
+		le.lastSeen = time.Now()
 		le.err = err
+	} else {
+		// same error seen
+		log.Infof("Got the same last error for %q: %+v ; first seen at %s and last seen %dms ago", le.name, le.err, le.firstSeen, int(time.Since(le.lastSeen).Milliseconds()))
+		if time.Since(le.lastSeen) > le.maxTimeInError {
+			// reset firstSeen, since it has been long enough since the last time we saw this error
+			log.Infof("Resetting firstSeen for %s, since it is too long since the last one", le.name)
+			le.firstSeen = time.Now()
+		}
+		le.lastSeen = time.Now()
 	}
-	// The error is unchanged so we don't need to do anything
 }

 func (le *lastError) shouldRetry() bool {
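
The body of shouldRetry() is untouched by this commit and therefore not shown in the diff. A plausible sketch, consistent with the struct fields above and the unit tests below, would look like this; the real implementation may differ in detail:

// Plausible sketch only; not part of this diff.
func (le *lastError) shouldRetry() bool {
	le.mu.Lock()
	defer le.mu.Unlock()
	if le.maxTimeInError == 0 {
		// A zero limit means errors never time out retries (see TestLastErrorZeroMaxTime).
		return true
	}
	if le.firstSeen.IsZero() {
		// No error currently recorded (see TestLastErrorNoError).
		return true
	}
	return time.Since(le.firstSeen) <= le.maxTimeInError
}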

go/vt/vttablet/tabletmanager/vreplication/last_error_test.go

Lines changed: 57 additions & 26 deletions

@@ -24,32 +24,63 @@ import (
 	"github.com/stretchr/testify/require"
 )

-func TestLastError(t *testing.T) {
-	le := newLastError("test", 100*time.Millisecond)
+const shortWait = 1 * time.Millisecond
+const longWait = 150 * time.Millisecond
+const maxTimeInError = 100 * time.Millisecond

-	t.Run("long running error", func(t *testing.T) {
-		err1 := fmt.Errorf("test1")
+// TestLastErrorZeroMaxTime tests maxTimeInError = 0: we should always retry
+func TestLastErrorZeroMaxTime(t *testing.T) {
+	le := newLastError("test", 0)
+	err1 := fmt.Errorf("error1")
+	le.record(err1)
+	require.True(t, le.shouldRetry())
+	time.Sleep(shortWait)
+	require.True(t, le.shouldRetry())
+	time.Sleep(longWait)
+	require.True(t, le.shouldRetry())
+}
+
+// TestLastErrorNoError ensures that an uninitialized lastError always retries
+func TestLastErrorNoError(t *testing.T) {
+	le := newLastError("test", maxTimeInError)
+	require.True(t, le.shouldRetry())
+	err1 := fmt.Errorf("error1")
+	le.record(err1)
+	require.True(t, le.shouldRetry())
+	le.record(nil)
+	require.True(t, le.shouldRetry())
+}
+
+// TestLastErrorOneError validates that we retry an error if it happens within maxTimeInError, but not after
+func TestLastErrorOneError(t *testing.T) {
+	le := newLastError("test", maxTimeInError)
+	err1 := fmt.Errorf("error1")
+	le.record(err1)
+	require.True(t, le.shouldRetry())
+	time.Sleep(shortWait)
+	require.True(t, le.shouldRetry())
+	time.Sleep(shortWait)
+	require.True(t, le.shouldRetry())
+	time.Sleep(longWait)
+	require.False(t, le.shouldRetry())
+}
+
+// TestLastErrorRepeatedError confirms that if the same error is repeated we don't retry
+// unless it happens after maxTimeInError
+func TestLastErrorRepeatedError(t *testing.T) {
+	le := newLastError("test", maxTimeInError)
+	err1 := fmt.Errorf("error1")
+	le.record(err1)
+	require.True(t, le.shouldRetry())
+	for i := 1; i < 10; i++ {
 		le.record(err1)
-		require.True(t, le.shouldRetry())
-		time.Sleep(150 * time.Millisecond)
-		require.False(t, le.shouldRetry())
-	})
-
-	t.Run("new long running error", func(t *testing.T) {
-		err2 := fmt.Errorf("test2")
-		le.record(err2)
-		require.True(t, le.shouldRetry())
-		for i := 1; i < 10; i++ {
-			le.record(err2)
-		}
-		require.True(t, le.shouldRetry())
-		time.Sleep(150 * time.Millisecond)
-		le.record(err2)
-		require.False(t, le.shouldRetry())
-	})
-
-	t.Run("no error", func(t *testing.T) {
-		le.record(nil)
-		require.True(t, le.shouldRetry())
-	})
+		time.Sleep(shortWait)
+	}
+	require.True(t, le.shouldRetry())
+
+	// same error happens after maxTimeInError, so it should retry
+	time.Sleep(longWait)
+	require.False(t, le.shouldRetry())
+	le.record(err1)
+	require.True(t, le.shouldRetry())
 }
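
To exercise just these tests locally (assuming a Vitess checkout and a standard Go toolchain), something like the following should work:

go test -run 'TestLastError' ./go/vt/vttablet/tabletmanager/vreplication/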
