diff --git a/pkg/storage/gc_queue.go b/pkg/storage/gc_queue.go
index 00aac943c3bc..b346466141bd 100644
--- a/pkg/storage/gc_queue.go
+++ b/pkg/storage/gc_queue.go
@@ -24,6 +24,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/storage/engine/enginepb"
 	"github.com/cockroachdb/cockroach/pkg/storage/gc"
+	"github.com/cockroachdb/cockroach/pkg/storage/storagebase"
 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
 	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
@@ -42,8 +43,34 @@ const (
 	// on keys and intents.
 	gcKeyScoreThreshold    = 2
 	gcIntentScoreThreshold = 10
+
+	probablyLargeAbortSpanSysCountThreshold = 10000
+	probablyLargeAbortSpanSysBytesThreshold = 16 * (1 << 20) // 16 MiB
 )
 
+func probablyLargeAbortSpan(ms enginepb.MVCCStats) bool {
+	// If there is "a lot" of data in Sys{Bytes,Count}, then we are likely
+	// experiencing a large abort span. The abort span is not supposed to
+	// become that large, but it does happen and causes stability fallout,
+	// usually due to a combination of shortcomings:
+	//
+	// 1. there's no trigger for GC based on abort span size alone (before
+	//    this code was written)
+	// 2. transaction aborts tended to create unnecessary abort span entries,
+	//    fixed (and 19.2-backported) in:
+	//    https://github.com/cockroachdb/cockroach/pull/42765
+	// 3. aborting transactions in a busy loop:
+	//    https://github.com/cockroachdb/cockroach/issues/38088
+	//    (and we suspect this also happens in user apps occasionally)
+	// 4. large snapshots would never complete due to the queue time limits
+	//    (addressed in https://github.com/cockroachdb/cockroach/pull/44952).
+	//
+	// In an ideal world, we would factor the abort span into this method
+	// directly, but until then the condition guarding this block will do.
+	return ms.SysCount >= probablyLargeAbortSpanSysCountThreshold &&
+		ms.SysBytes >= probablyLargeAbortSpanSysBytesThreshold
+}
+
 // gcQueue manages a queue of replicas slated to be scanned in their
 // entirety using the MVCC versions iterator. The gc queue manages the
 // following tasks:
@@ -147,11 +174,8 @@ func makeGCQueueScore(
 	// have slightly different priorities and even symmetrical workloads don't
 	// trigger GC at the same time.
 	r := makeGCQueueScoreImpl(
-		ctx, int64(repl.RangeID), now, ms, policy,
+		ctx, int64(repl.RangeID), now, ms, policy, gcThreshold,
 	)
-	if (gcThreshold != hlc.Timestamp{}) {
-		r.LikelyLastGC = time.Duration(now.WallTime - gcThreshold.Add(r.TTL.Nanoseconds(), 0).WallTime)
-	}
 	return r
 }
 
@@ -249,9 +273,14 @@ func makeGCQueueScoreImpl(
 	now hlc.Timestamp,
 	ms enginepb.MVCCStats,
 	policy zonepb.GCPolicy,
+	gcThreshold hlc.Timestamp,
 ) gcQueueScore {
 	ms.Forward(now.WallTime)
 	var r gcQueueScore
+	if (gcThreshold != hlc.Timestamp{}) {
+		r.LikelyLastGC = time.Duration(now.WallTime - gcThreshold.Add(r.TTL.Nanoseconds(), 0).WallTime)
+	}
+
 	r.TTL = policy.TTL()
 
 	// Treat a zero TTL as a one-second TTL, which avoids a priority of infinity
@@ -314,6 +343,12 @@ func makeGCQueueScoreImpl(
 	r.ShouldQueue = r.FuzzFactor*valScore > gcKeyScoreThreshold || r.FuzzFactor*r.IntentScore > gcIntentScoreThreshold
 	r.FinalScore = r.FuzzFactor * (valScore + r.IntentScore)
 
+	if probablyLargeAbortSpan(ms) && !r.ShouldQueue &&
+		(r.LikelyLastGC == 0 || r.LikelyLastGC > storagebase.TxnCleanupThreshold) {
+		r.ShouldQueue = true
+		r.FinalScore++
+	}
+
 	return r
 }
 
diff --git a/pkg/storage/gc_queue_test.go b/pkg/storage/gc_queue_test.go
index 10fa71d435f3..1b0da15645b2 100644
--- a/pkg/storage/gc_queue_test.go
+++ b/pkg/storage/gc_queue_test.go
@@ -38,6 +38,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/uuid"
 	"github.com/kr/pretty"
 	"github.com/pkg/errors"
+	"github.com/stretchr/testify/require"
 	"golang.org/x/sync/syncmap"
 )
 
@@ -105,7 +106,7 @@ func TestGCQueueMakeGCScoreInvariantQuick(t *testing.T) {
 			GCBytesAge: gcByteAge,
 		}
 		now := initialNow.Add(timePassed.Nanoseconds(), 0)
-		r := makeGCQueueScoreImpl(ctx, int64(seed), now, ms, zonepb.GCPolicy{TTLSeconds: ttlSec})
+		r := makeGCQueueScoreImpl(ctx, int64(seed), now, ms, zonepb.GCPolicy{TTLSeconds: ttlSec}, hlc.Timestamp{})
 		wouldHaveToDeleteSomething := gcBytes*int64(ttlSec) < ms.GCByteAge(now.WallTime)
 		result := !r.ShouldQueue || wouldHaveToDeleteSomething
 		if !result {
@@ -126,13 +127,47 @@ func TestGCQueueMakeGCScoreAnomalousStats(t *testing.T) {
 			LiveBytes: int64(liveBytes),
 			ValBytes:  int64(valBytes),
 			KeyBytes:  int64(keyBytes),
-		}, zonepb.GCPolicy{TTLSeconds: 60})
+		}, zonepb.GCPolicy{TTLSeconds: 60}, hlc.Timestamp{})
 		return r.DeadFraction >= 0 && r.DeadFraction <= 1
 	}, &quick.Config{MaxCount: 1000}); err != nil {
 		t.Fatal(err)
 	}
 }
 
+func TestGCQueueMakeGCScoreLargeAbortSpan(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	const seed = 1
+	var ms enginepb.MVCCStats
+	ms.SysCount += probablyLargeAbortSpanSysCountThreshold
+	ms.SysBytes += probablyLargeAbortSpanSysBytesThreshold
+
+	thresh := storagebase.TxnCleanupThreshold.Nanoseconds()
+
+	// GC is triggered if the abort span is likely large and should be entirely GC'able.
+	{
+		r := makeGCQueueScoreImpl(
+			context.Background(), seed,
+			hlc.Timestamp{WallTime: thresh + 2},
+			ms, zonepb.GCPolicy{TTLSeconds: 10000},
+			hlc.Timestamp{WallTime: 1},
+		)
+		require.True(t, r.ShouldQueue)
+		require.NotZero(t, r.FinalScore)
+	}
+
+	// The heuristic doesn't fire if the likely last GC falls within TxnCleanupThreshold.
+	{
+		r := makeGCQueueScoreImpl(
+			context.Background(), seed,
+			hlc.Timestamp{WallTime: thresh},
+			ms, zonepb.GCPolicy{TTLSeconds: 10000},
+			hlc.Timestamp{WallTime: 1},
+		)
+		require.False(t, r.ShouldQueue)
+		require.Zero(t, r.FinalScore)
+	}
+}
+
 const cacheFirstLen = 3
 
 type gcTestCacheKey struct {
@@ -248,7 +283,7 @@ func (cws *cachedWriteSimulator) shouldQueue(
 	ts := hlc.Timestamp{}.Add(ms.LastUpdateNanos+after.Nanoseconds(), 0)
 	r := makeGCQueueScoreImpl(context.Background(), 0 /* seed */, ts, ms, zonepb.GCPolicy{
 		TTLSeconds: int32(ttl.Seconds()),
-	})
+	}, hlc.Timestamp{})
 	if fmt.Sprintf("%.2f", r.FinalScore) != fmt.Sprintf("%.2f", prio) || b != r.ShouldQueue {
 		cws.t.Errorf("expected queued=%t (is %t), prio=%.2f, got %.2f: after=%s, ttl=%s:\nms: %+v\nscore: %s",
 			b, r.ShouldQueue, prio, r.FinalScore, after, ttl, ms, r)
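
For reference, here is a minimal, self-contained sketch of how the new heuristic composes, runnable outside the CockroachDB tree. The threshold constants and probablyLargeAbortSpan mirror the patch above; the mvccStats struct, the txnCleanupThreshold value (assumed to be one hour, standing in for storagebase.TxnCleanupThreshold), and the main harness are hypothetical simplifications for illustration only.

package main

import (
	"fmt"
	"time"
)

const (
	// Thresholds as introduced by the patch above.
	probablyLargeAbortSpanSysCountThreshold = 10000
	probablyLargeAbortSpanSysBytesThreshold = 16 * (1 << 20) // 16 MiB

	// Stand-in for storagebase.TxnCleanupThreshold; the value is an assumption.
	txnCleanupThreshold = time.Hour
)

// mvccStats is a pared-down stand-in for enginepb.MVCCStats.
type mvccStats struct {
	SysCount int64
	SysBytes int64
}

// probablyLargeAbortSpan mirrors the patched heuristic: a large amount of
// system-key data on a range is taken as a proxy for a large abort span.
func probablyLargeAbortSpan(ms mvccStats) bool {
	return ms.SysCount >= probablyLargeAbortSpanSysCountThreshold &&
		ms.SysBytes >= probablyLargeAbortSpanSysBytesThreshold
}

func main() {
	ms := mvccStats{SysCount: 20000, SysBytes: 32 << 20}

	// likelyLastGC estimates how long ago GC last ran; zero means unknown.
	likelyLastGC := 2 * time.Hour

	// shouldQueue and finalScore would normally come from the key/intent
	// scores; assume they did not trigger, which is the typical case when
	// only the abort span is large.
	shouldQueue := false
	finalScore := 0.0

	// The patched override: queue the replica anyway when the abort span is
	// probably large and the last GC is unknown or older than the cleanup
	// threshold, bumping the score so the queue has something to order by.
	if probablyLargeAbortSpan(ms) && !shouldQueue &&
		(likelyLastGC == 0 || likelyLastGC > txnCleanupThreshold) {
		shouldQueue = true
		finalScore++
	}

	fmt.Println(shouldQueue, finalScore) // true 1
}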