Skip to content

Commit 4e504a1

Browse files
authored
improvement(staging-mode): Clarify Squash/Rollback behavior for ContainerMessageType.GC (#24774)
GC ops can be made while in Staging Mode, and we need to be very clear with how they're handled. * **Squashing:** They don't need to be squashed, no personal data to protect * **Rollback:** GC ops submitted while in Staging Mode will be dropped if staged changes are discarded. (these are all "Tombstone Loaded" ops, which prevent data loss in case of invalid GC data) * We will log a new error event `GC_OpDiscarded` to highlight this case * We are also adding a prop `inStagingMode` to all ContainerRuntime logs, including `GC_Tombstone_DataStore_Requested` etc, to further aid investigations Also refactor the Resubmit code so it's clear that being able to support squash is an exceptional case - most op types don't need to deal with it.
1 parent bb49437 commit 4e504a1

File tree

3 files changed

+94
-26
lines changed

3 files changed

+94
-26
lines changed

packages/runtime/container-runtime/src/containerRuntime.ts

Lines changed: 81 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -786,8 +786,12 @@ function canStageMessageOfType(
786786
| ContainerMessageType.GC
787787
| ContainerMessageType.DocumentSchemaChange {
788788
return (
789+
// These are user changes coming up from the runtime's DataStores
789790
type === ContainerMessageType.FluidDataStoreOp ||
791+
// GC ops are used to detect issues in the reference graph so all clients can repair their GC state.
792+
// These can be submitted at any time, including while in Staging Mode.
790793
type === ContainerMessageType.GC ||
794+
// These are typically sent shortly after boot and will not be common in Staging Mode, but it's possible.
791795
type === ContainerMessageType.DocumentSchemaChange
792796
);
793797
}
@@ -1182,6 +1186,8 @@ export class ContainerRuntime
11821186

11831187
public readonly clientDetails: IClientDetails;
11841188

1189+
private readonly isSummarizerClient: boolean;
1190+
11851191
public get storage(): IDocumentStorageService {
11861192
return this._storage;
11871193
}
@@ -1528,6 +1534,11 @@ export class ContainerRuntime
15281534
this.mc = createChildMonitoringContext({
15291535
logger: this.baseLogger,
15301536
namespace: "ContainerRuntime",
1537+
properties: {
1538+
all: {
1539+
inStagingMode: this.inStagingMode,
1540+
},
1541+
},
15311542
});
15321543

15331544
// If we support multiple algorithms in the future, then we would need to manage it here carefully.
@@ -1584,7 +1595,7 @@ export class ContainerRuntime
15841595
// Values are generally expected to be set from the runtime side.
15851596
this.options = options ?? {};
15861597
this.clientDetails = clientDetails;
1587-
const isSummarizerClient = this.clientDetails.type === summarizerClientType;
1598+
this.isSummarizerClient = this.clientDetails.type === summarizerClientType;
15881599
this.loadedFromVersionId = context.getLoadedFromVersion()?.id;
15891600
// eslint-disable-next-line unicorn/consistent-destructuring
15901601
this._getClientId = () => context.clientId;
@@ -1625,7 +1636,7 @@ export class ContainerRuntime
16251636
);
16261637

16271638
// In cases of summarizer, we want to dispose instead since consumer doesn't interact with this container
1628-
this.closeFn = isSummarizerClient ? this.disposeFn : closeFn;
1639+
this.closeFn = this.isSummarizerClient ? this.disposeFn : closeFn;
16291640

16301641
let loadSummaryNumber: number;
16311642
// Get the container creation metadata. For new container, we initialize these. For existing containers,
@@ -1786,7 +1797,7 @@ export class ContainerRuntime
17861797
existing,
17871798
metadata,
17881799
createContainerMetadata: this.createContainerMetadata,
1789-
isSummarizerClient,
1800+
isSummarizerClient: this.isSummarizerClient,
17901801
getNodePackagePath: async (nodePath: string) => this.getGCNodePackagePath(nodePath),
17911802
getLastSummaryTimestampMs: () => this.messageAtLastSummary?.timestamp,
17921803
readAndParseBlob: async <T>(id: string) => readAndParse<T>(this.storage, id),
@@ -2151,8 +2162,7 @@ export class ContainerRuntime
21512162
maxOpsSinceLastSummary,
21522163
);
21532164

2154-
const isSummarizerClient = this.clientDetails.type === summarizerClientType;
2155-
if (isSummarizerClient) {
2165+
if (this.isSummarizerClient) {
21562166
// We want to dynamically import any thing inside summaryDelayLoadedModule module only when we are the summarizer client,
21572167
// so that all non summarizer clients don't have to load the code inside this module.
21582168
const module = await import(
@@ -4641,51 +4651,87 @@ export class ContainerRuntime
46414651

46424652
/**
46434653
* Resubmits each message in the batch, and then flushes the outbox.
4654+
* This typically happens when we reconnect and there are pending messages.
46444655
*
4645-
* @remarks - If the "Offline Load" feature is enabled, the batchId is included in the resubmitted messages,
4656+
* @remarks
4657+
* Attempting to resubmit a batch that has been successfully sequenced will not happen due to
4658+
* checks in the ConnectionStateHandler (Loader layer)
4659+
*
4660+
* The only exception to this would be if the Container "forks" due to misuse of the "Offline Load" feature.
4661+
* If the "Offline Load" feature is enabled, the batchId is included in the resubmitted messages,
46464662
* for correlation to detect container forking.
46474663
*/
46484664
private reSubmitBatch(
46494665
batch: PendingMessageResubmitData[],
46504666
{ batchId, staged, squash }: PendingBatchResubmitMetadata,
46514667
): void {
4668+
assert(
4669+
this._summarizer === undefined,
4670+
0x8f2 /* Summarizer never reconnects so should never resubmit */,
4671+
);
4672+
46524673
const resubmitInfo = {
46534674
// Only include Batch ID if "Offline Load" feature is enabled
46544675
// It's only needed to identify batches across container forks arising from misuse of offline load.
46554676
batchId: this.offlineEnabled ? batchId : undefined,
46564677
staged,
46574678
};
46584679

4680+
const resubmitFn = squash
4681+
? this.reSubmitWithSquashing.bind(this)
4682+
: this.reSubmit.bind(this);
4683+
46594684
this.batchRunner.run(() => {
46604685
for (const message of batch) {
4661-
this.reSubmit(message, squash);
4686+
resubmitFn(message);
46624687
}
46634688
}, resubmitInfo);
46644689

46654690
this.flush(resubmitInfo);
46664691
}
46674692

4668-
private reSubmit(message: PendingMessageResubmitData, squash: boolean): void {
4669-
this.reSubmitCore(message.runtimeOp, message.localOpMetadata, message.opMetadata, squash);
4670-
}
4671-
46724693
/**
4673-
* Finds the right store and asks it to resubmit the message. This typically happens when we
4674-
* reconnect and there are pending messages.
4675-
* ! Note: successfully resubmitting an op that has been successfully sequenced is not possible due to checks in the ConnectionStateHandler (Loader layer)
4676-
* @param message - The original LocalContainerRuntimeMessage.
4677-
* @param localOpMetadata - The local metadata associated with the original message.
4694+
* Resubmit the given message as part of a squash rebase upon exiting Staging Mode.
4695+
* How exactly to resubmit the message is up to the subsystem that submitted the op to begin with.
46784696
*/
4679-
private reSubmitCore(
4680-
message: LocalContainerRuntimeMessage,
4681-
localOpMetadata: unknown,
4682-
opMetadata: Record<string, unknown> | undefined,
4683-
squash: boolean,
4684-
): void {
4697+
private reSubmitWithSquashing(resubmitData: PendingMessageResubmitData): void {
4698+
const message = resubmitData.runtimeOp;
46854699
assert(
4686-
this._summarizer === undefined,
4687-
0x8f2 /* Summarizer never reconnects so should never resubmit */,
4700+
canStageMessageOfType(message.type),
4701+
"Expected message type to be compatible with staging",
46884702
);
4703+
switch (message.type) {
4704+
case ContainerMessageType.FluidDataStoreOp: {
4705+
this.channelCollection.reSubmit(
4706+
message.type,
4707+
message.contents,
4708+
resubmitData.localOpMetadata,
4709+
/* squash: */ true,
4710+
);
4711+
break;
4712+
}
4713+
// NOTE: Squash doesn't apply to GC or DocumentSchemaChange ops, fallback to typical resubmit logic.
4714+
case ContainerMessageType.GC:
4715+
case ContainerMessageType.DocumentSchemaChange: {
4716+
this.reSubmit(resubmitData);
4717+
break;
4718+
}
4719+
default: {
4720+
unreachableCase(message.type);
4721+
}
4722+
}
4723+
}
4724+
4725+
/**
4726+
* Resubmit the given message which was previously submitted to the ContainerRuntime but not successfully
4727+
* transmitted to the ordering service (e.g. due to a disconnect, or being in Staging Mode)
4728+
* How to resubmit is up to the subsystem that submitted the op to begin with
4729+
*/
4730+
private reSubmit({
4731+
runtimeOp: message,
4732+
localOpMetadata,
4733+
opMetadata,
4734+
}: PendingMessageResubmitData): void {
46894735
switch (message.type) {
46904736
case ContainerMessageType.FluidDataStoreOp:
46914737
case ContainerMessageType.Attach:
@@ -4696,7 +4742,7 @@ export class ContainerRuntime
46964742
message.type,
46974743
message.contents,
46984744
localOpMetadata,
4699-
squash,
4745+
/* squash: */ false,
47004746
);
47014747
break;
47024748
}
@@ -4752,7 +4798,16 @@ export class ContainerRuntime
47524798
this.channelCollection.rollback(type, contents, localOpMetadata);
47534799
break;
47544800
}
4755-
case ContainerMessageType.GC:
4801+
case ContainerMessageType.GC: {
4802+
// Just drop it, but log an error, this is not expected and not ideal, but not critical failure either.
4803+
// Currently the only expected type here is TombstoneLoaded, which will have been preceded by one of these events as well:
4804+
// GC_Tombstone_DataStore_Requested, GC_Tombstone_SubDataStore_Requested, GC_Tombstone_Blob_Requested
4805+
this.mc.logger.sendErrorEvent({
4806+
eventName: "GC_OpDiscarded",
4807+
details: { subType: contents.type },
4808+
});
4809+
break;
4810+
}
47564811
case ContainerMessageType.DocumentSchemaChange: {
47574812
throw new Error(`Handling ${type} ops in rolled back batch not yet implemented`);
47584813
}

packages/runtime/container-runtime/src/gc/garbageCollection.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,6 +1045,8 @@ export class GarbageCollector implements IGarbageCollector {
10451045

10461046
// Any time we log a Tombstone Loaded error (via Telemetry Tracker),
10471047
// we want to also trigger autorecovery to avoid the object being deleted
1048+
// i.e. this will be preceded by one of these telemetry events;
1049+
// GC_Tombstone_DataStore_Requested, GC_Tombstone_SubDataStore_Requested, GC_Tombstone_Blob_Requested
10481050
// Note: We don't need to trigger on "Changed" because any change will cause the object
10491051
// to be loaded by the Summarizer, and auto-recovery will be triggered then.
10501052
if (isTombstoned && reason === "Loaded") {

packages/runtime/container-runtime/src/pendingStateManager.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,19 @@ export interface IPendingMessage {
5050
* Unless this pending message came from stashed content, in which case this was roundtripped through string
5151
*/
5252
runtimeOp?: LocalContainerRuntimeMessage | undefined; // Undefined for empty batches and initial messages before parsing
53+
/**
54+
* Local Op Metadata that was passed to the ContainerRuntime when the op was submitted.
55+
* This contains state needed when processing the ack, or to resubmit or rollback the op.
56+
*/
5357
localOpMetadata: unknown;
58+
/**
59+
* Metadata that was passed to the ContainerRuntime when the op was submitted.
60+
* This is rarely used, and may be inspected by the service (as opposed to op contents which is opaque)
61+
*/
5462
opMetadata: Record<string, unknown> | undefined;
63+
/**
64+
* Populated upon processing the op's ack, before moving the pending message to savedOps.
65+
*/
5566
sequenceNumber?: number;
5667
/**
5768
* Info about the batch this pending message belongs to, for validation and for computing the batchId on reconnect

0 commit comments

Comments
 (0)