Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[spicedb] Fix intermittent "Error: 4 DEADLINE_EXCEEDED...Waiting for LB pick" #20637

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion components/content-service-api/typescript/package.json
Original file line number Diff line number Diff line change
@@ -11,7 +11,7 @@
"lib"
],
"dependencies": {
"@grpc/grpc-js": "1.10.8",
"@grpc/grpc-js": "1.12.6",
"google-protobuf": "^3.19.1",
"inversify": "^6.0.1",
"opentracing": "^0.14.4"
2 changes: 1 addition & 1 deletion components/gitpod-protocol/package.json
Original file line number Diff line number Diff line change
@@ -10,7 +10,7 @@
"src"
],
"devDependencies": {
"@grpc/grpc-js": "1.10.8",
"@grpc/grpc-js": "1.12.6",
"@testdeck/mocha": "^0.3.3",
"@types/analytics-node": "^3.1.9",
"@types/chai-subset": "^1.3.3",
87 changes: 87 additions & 0 deletions components/gitpod-protocol/src/util/grpc.ts
Original file line number Diff line number Diff line change
@@ -6,6 +6,8 @@

import * as grpc from "@grpc/grpc-js";
import { Status } from "@grpc/grpc-js/build/src/constants";
import { log } from "./logging";
import { TrustedValue } from "./scrubbing";

export const defaultGRPCOptions = {
"grpc.keepalive_timeout_ms": 10000,
@@ -108,6 +110,91 @@ export function createClientCallMetricsInterceptor(metrics: IClientCallMetrics):
};
}

export function createDebugLogInterceptor(additionalContextF: (() => object) | undefined): grpc.Interceptor {
const FAILURE_STATUS_CODES = new Map([
[Status.ABORTED, true],
[Status.CANCELLED, true],
[Status.DATA_LOSS, true],
[Status.DEADLINE_EXCEEDED, true],
[Status.FAILED_PRECONDITION, true],
[Status.INTERNAL, true],
[Status.PERMISSION_DENIED, true],
[Status.RESOURCE_EXHAUSTED, true],
[Status.UNAUTHENTICATED, true],
[Status.UNAVAILABLE, true],
[Status.UNIMPLEMENTED, true],
[Status.UNKNOWN, true],
]);

return (options, nextCall): grpc.InterceptingCall => {
const methodDef = options.method_definition;
const method = methodDef.path.substring(methodDef.path.lastIndexOf("/") + 1);
const service = methodDef.path.substring(1, methodDef.path.length - method.length - 1);
const labels = {
service,
method,
type: getGrpcMethodType(options.method_definition.requestStream, options.method_definition.responseStream),
};
const requester = new grpc.RequesterBuilder()
.withStart((metadata, listener, next) => {
const newListener = new grpc.ListenerBuilder()
.withOnReceiveStatus((status, next) => {
// If given, call the additionalContext function and log the result
let additionalContext = {};
try {
if (additionalContextF) {
additionalContext = additionalContextF();
}
} catch (e) {}

try {
const info = {
labels: new TrustedValue(labels),
metadata: new TrustedValue(metadata.toJSON()),
code: Status[status.code],
details: status.details,
additionalContext: new TrustedValue(additionalContext),
};
if (FAILURE_STATUS_CODES.has(status.code)) {
log.warn(`grpc call failed`, info);
} else {
log.debug(`grpc call status`, info);
}
} finally {
next(status);
}
})
.build();
try {
log.debug(`grpc call started`, {
labels: new TrustedValue(labels),
metadata: new TrustedValue(metadata.toJSON()),
});
} finally {
next(metadata, newListener);
}
})
.withCancel((next) => {
try {
log.debug(`grpc call cancelled`, { labels: new TrustedValue(labels) });
} finally {
next();
}
})
.build();
return new grpc.InterceptingCall(nextCall(options), requester);
};
}

export function isGrpcError(err: any): err is grpc.StatusObject {
return err.code && err.details;
}

export function isConnectionAlive(client: grpc.Client) {
const cs = client.getChannel().getConnectivityState(false);
return (
cs == grpc.connectivityState.CONNECTING ||
cs == grpc.connectivityState.IDLE ||
cs == grpc.connectivityState.READY
);
}
2 changes: 1 addition & 1 deletion components/ide-metrics-api/typescript-grpc/package.json
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@
"lib"
],
"dependencies": {
"@grpc/grpc-js": "1.10.8",
"@grpc/grpc-js": "1.12.6",
"google-protobuf": "^3.19.1"
},
"devDependencies": {
2 changes: 1 addition & 1 deletion components/image-builder-api/typescript/package.json
Original file line number Diff line number Diff line change
@@ -13,7 +13,7 @@
"dependencies": {
"@gitpod/content-service": "0.1.5",
"@gitpod/gitpod-protocol": "0.1.5",
"@grpc/grpc-js": "1.10.8",
"@grpc/grpc-js": "1.12.6",
"google-protobuf": "^3.19.1",
"inversify": "^6.0.1",
"opentracing": "^0.14.4"
9 changes: 2 additions & 7 deletions components/image-builder-api/typescript/src/sugar.ts
Original file line number Diff line number Diff line change
@@ -8,7 +8,7 @@ import { ImageBuilderClient } from "./imgbuilder_grpc_pb";
import { TraceContext } from "@gitpod/gitpod-protocol/lib/util/tracing";
import { Deferred } from "@gitpod/gitpod-protocol/lib/util/deferred";
import { log } from "@gitpod/gitpod-protocol/lib/util/logging";
import { createClientCallMetricsInterceptor, IClientCallMetrics } from "@gitpod/gitpod-protocol/lib/util/grpc";
import { createClientCallMetricsInterceptor, IClientCallMetrics, isConnectionAlive } from "@gitpod/gitpod-protocol/lib/util/grpc";
import * as opentracing from "opentracing";
import { Metadata } from "@grpc/grpc-js";
import {
@@ -130,12 +130,7 @@ export class PromisifiedImageBuilderClient {
) {}

public isConnectionAlive() {
const cs = this.client.getChannel().getConnectivityState(false);
return (
cs == grpc.connectivityState.CONNECTING ||
cs == grpc.connectivityState.IDLE ||
cs == grpc.connectivityState.READY
);
return isConnectionAlive(this.client);
}

public resolveBaseImage(ctx: TraceContext, request: ResolveBaseImageRequest): Promise<ResolveBaseImageResponse> {
2 changes: 1 addition & 1 deletion components/server/package.json
Original file line number Diff line number Diff line change
@@ -45,7 +45,7 @@
"/src"
],
"dependencies": {
"@authzed/authzed-node": "^0.15.0",
"@authzed/authzed-node": "^1.2.2",
"@connectrpc/connect": "1.1.2",
"@connectrpc/connect-express": "1.1.2",
"@gitbeaker/rest": "^39.12.0",
118 changes: 62 additions & 56 deletions components/server/src/authorization/spicedb-authorizer.ts
Original file line number Diff line number Diff line change
@@ -17,46 +17,6 @@ import { ctxTryGetCache, ctxTrySetCache } from "../util/request-context";
import { ApplicationError, ErrorCodes } from "@gitpod/gitpod-protocol/lib/messaging/error";
import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc";

async function tryThree<T>(errMessage: string, code: (attempt: number) => Promise<T>): Promise<T> {
let attempt = 0;
// we do sometimes see INTERNAL errors from SpiceDB, or grpc-js reports DEADLINE_EXCEEDED, so we retry a few times
// last time we checked it was 15 times per day (check logs)
while (attempt++ < 3) {
try {
return await code(attempt);
} catch (err) {
if (
(err.code === grpc.status.INTERNAL ||
err.code === grpc.status.DEADLINE_EXCEEDED ||
err.code === grpc.status.UNAVAILABLE) &&
attempt < 3
) {
let delay = 500 * attempt;
if (err.code === grpc.status.DEADLINE_EXCEEDED) {
// we already waited for timeout, so let's try again immediately
delay = 0;
}

log.warn(errMessage, err, {
attempt,
delay,
code: err.code,
});
await new Promise((resolve) => setTimeout(resolve, delay));
continue;
}

log.error(errMessage, err, {
attempt,
code: err.code,
});
// we don't try again on other errors
throw err;
}
}
throw new Error("unreachable");
}

export function createSpiceDBAuthorizer(clientProvider: SpiceDBClientProvider): SpiceDBAuthorizer {
return new SpiceDBAuthorizer(clientProvider, new RequestLocalZedTokenCache());
}
@@ -71,13 +31,11 @@ interface DeletionResult {
deletedAt?: string;
}

const GRPC_DEADLINE = 10_000;

export class SpiceDBAuthorizer {
constructor(private readonly clientProvider: SpiceDBClientProvider, private readonly tokenCache: ZedTokenCache) {}

private get client(): v1.ZedPromiseClientInterface {
return this.clientProvider.getClient();
}

public async check(req: v1.CheckPermissionRequest, experimentsFields: { userId: string }): Promise<boolean> {
req.consistency = await this.tokenCache.consistency(req.resource);
incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined");
@@ -99,8 +57,8 @@ export class SpiceDBAuthorizer {
const timer = spicedbClientLatency.startTimer();
let error: Error | undefined;
try {
const response = await tryThree("[spicedb] Failed to perform authorization check.", () =>
this.client.checkPermission(req, this.callOptions),
const response = await this.call("[spicedb] Failed to perform authorization check.", (client) =>
client.checkPermission(req, this.callOptions),
);
const permitted = response.permissionship === v1.CheckPermissionResponse_Permissionship.HAS_PERMISSION;
return { permitted, checkedAt: response.checkedAt?.token };
@@ -139,8 +97,8 @@ export class SpiceDBAuthorizer {
const timer = spicedbClientLatency.startTimer();
let error: Error | undefined;
try {
const response = await tryThree("[spicedb] Failed to write relationships.", () =>
this.client.writeRelationships(
const response = await this.call("[spicedb] Failed to write relationships.", (client) =>
client.writeRelationships(
v1.WriteRelationshipsRequest.create({
updates,
}),
@@ -175,16 +133,16 @@ export class SpiceDBAuthorizer {
let error: Error | undefined;
try {
let deletedAt: string | undefined = undefined;
const existing = await tryThree("readRelationships before deleteRelationships failed.", () =>
this.client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
const existing = await this.call("readRelationships before deleteRelationships failed.", (client) =>
client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
);
if (existing.length > 0) {
const response = await tryThree("deleteRelationships failed.", () =>
this.client.deleteRelationships(req, this.callOptions),
const response = await this.call("deleteRelationships failed.", (client) =>
client.deleteRelationships(req, this.callOptions),
);
deletedAt = response.deletedAt?.token;
const after = await tryThree("readRelationships failed.", () =>
this.client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
const after = await this.call("readRelationships failed.", (client) =>
client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
);
if (after.length > 0) {
log.error("[spicedb] Failed to delete relationships.", { existing, after, request: req });
@@ -213,7 +171,55 @@ export class SpiceDBAuthorizer {
async readRelationships(req: v1.ReadRelationshipsRequest): Promise<v1.ReadRelationshipsResponse[]> {
req.consistency = await this.tokenCache.consistency(undefined);
incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined");
return tryThree("readRelationships failed.", () => this.client.readRelationships(req, this.callOptions));
return this.call("readRelationships failed.", (client) => client.readRelationships(req, this.callOptions));
}

/**
* call retrieves a Spicedb client and executes the given code block.
* In addition to the gRPC-level retry mechanisms, it retries on "Waiting for LB pick" errors.
* This is required, because we seem to be running into a grpc/grpc-js bug where a subchannel takes 120s+ to reconnect.
* @param description
* @param code
* @returns
*/
private async call<T>(description: string, code: (client: v1.ZedPromiseClientInterface) => Promise<T>): Promise<T> {
const MAX_ATTEMPTS = 3;
let attempt = 0;
while (attempt++ < 3) {
try {
const checkClient = attempt > 1; // the last client error'd out, so check if we should get a new one
const client = this.clientProvider.getClient(checkClient);
return code(client);
} catch (err) {
// Check: Is this a "no connection to upstream" error? If yes, retry here, to work around grpc/grpc-js bugs introducing high latency for re-tries
if (
(err.code === grpc.status.DEADLINE_EXCEEDED || err.code === grpc.status.UNAVAILABLE) &&
attempt < MAX_ATTEMPTS
) {
let delay = 500 * attempt;
if (err.code === grpc.status.DEADLINE_EXCEEDED) {
// we already waited for timeout, so let's try again immediately
delay = 0;
}

log.warn(description, err, {
attempt,
delay,
code: err.code,
});
await new Promise((resolve) => setTimeout(resolve, delay));
continue;
}

// Some other error: log and rethrow
log.error(description, err, {
attempt,
code: err.code,
});
throw err;
}
}
throw new Error("unreachable");
}

/**
@@ -223,7 +229,7 @@ export class SpiceDBAuthorizer {
*/
private get callOptions(): grpc.Metadata {
return (<grpc.CallOptions>{
deadline: Date.now() + 8000,
deadline: Date.now() + GRPC_DEADLINE,
}) as any as grpc.Metadata;
}
}
Loading
Oops, something went wrong.
Loading
Oops, something went wrong.