Skip to content

Commit

Permalink
[server] Streamline spicedb gRPC client usage and creation options
Browse files Browse the repository at this point in the history
 - instead of doing retries on two levels, rely on the gRPC-level retries
 - to mitigate the loss of insights, introduce createDebugLogInterceptor
  - client options: use sane defaults derived from the documentation instead of the excessive ones we had in place before
  - use "waitForReady" option: it should a) make our calls more responsive on re-connects, while b) - because we keep re-trying on DEADLINE_EXCEEDED - should be as reliable as before

Tool: gitpod/catfood.gitpod.cloud
  • Loading branch information
geropl committed Feb 27, 2025
1 parent 2a69b53 commit 560e4dc
Showing 4 changed files with 113 additions and 65 deletions.
68 changes: 68 additions & 0 deletions components/gitpod-protocol/src/util/grpc.ts
Original file line number Diff line number Diff line change
@@ -6,6 +6,8 @@

import * as grpc from "@grpc/grpc-js";
import { Status } from "@grpc/grpc-js/build/src/constants";
import { log } from "./logging";
import { TrustedValue } from "./scrubbing";

export const defaultGRPCOptions = {
"grpc.keepalive_timeout_ms": 10000,
@@ -108,6 +110,72 @@ export function createClientCallMetricsInterceptor(metrics: IClientCallMetrics):
};
}

/**
 * Builds a gRPC client interceptor that logs the lifecycle of each call:
 * a debug entry when the call starts, a debug entry when it is cancelled,
 * and an entry when the final status arrives ("warn" for failure codes,
 * "debug" for everything else).
 *
 * Logging is strictly best-effort: the call is always forwarded to the next
 * interceptor, even if building or writing the log entry throws.
 *
 * @returns an interceptor to be passed in a gRPC client's interceptor list
 */
export function createDebugLogInterceptor(): grpc.Interceptor {
    // Status codes reported on "warn" level; any other code is only logged on "debug" level.
    const failureCodes = new Set([
        Status.ABORTED,
        Status.CANCELLED,
        Status.DATA_LOSS,
        Status.DEADLINE_EXCEEDED,
        Status.FAILED_PRECONDITION,
        Status.INTERNAL,
        Status.PERMISSION_DENIED,
        Status.RESOURCE_EXHAUSTED,
        Status.UNAUTHENTICATED,
        Status.UNAVAILABLE,
        Status.UNIMPLEMENTED,
        Status.UNKNOWN,
    ]);

    return (options, nextCall): grpc.InterceptingCall => {
        // The path has the shape "/<service>/<method>": split it at the last slash.
        const { path, requestStream, responseStream } = options.method_definition;
        const lastSlash = path.lastIndexOf("/");
        const labels = {
            service: path.substring(1, lastSlash),
            method: path.substring(lastSlash + 1),
            type: getGrpcMethodType(requestStream, responseStream),
        };

        // Writes the log entry for the final status of a call.
        const logStatus = (metadata: grpc.Metadata, status: grpc.StatusObject): void => {
            const info = {
                labels: new TrustedValue(labels),
                metadata: new TrustedValue(metadata.toJSON()),
                code: Status[status.code],
            };
            if (failureCodes.has(status.code)) {
                log.warn(`grpc call failed`, info);
                return;
            }
            log.debug(`grpc call status`, info);
        };

        const requester = new grpc.RequesterBuilder()
            .withStart((metadata, listener, forwardStart) => {
                // Only the status callback is overridden on the listener that is passed on.
                const statusListener = new grpc.ListenerBuilder()
                    .withOnReceiveStatus((status, forwardStatus) => {
                        try {
                            logStatus(metadata, status);
                        } finally {
                            // never swallow the status, even if logging failed
                            forwardStatus(status);
                        }
                    })
                    .build();

                try {
                    log.debug(`grpc call started`, {
                        labels: new TrustedValue(labels),
                        metadata: new TrustedValue(metadata.toJSON()),
                    });
                } finally {
                    // always start the actual call, even if logging failed
                    forwardStart(metadata, statusListener);
                }
            })
            .withCancel((forwardCancel) => {
                try {
                    log.debug(`grpc call cancelled`, { labels: new TrustedValue(labels) });
                } finally {
                    forwardCancel();
                }
            })
            .build();

        return new grpc.InterceptingCall(nextCall(options), requester);
    };
}

/**
 * Type guard that checks whether a caught value looks like a gRPC status object,
 * i.e. it carries both a truthy `code` and truthy `details`.
 *
 * Safe to call with anything a `catch` clause may hand over, including `null`,
 * `undefined` and primitives (those simply yield `false`).
 *
 * @param err the value to inspect
 * @returns `true` iff `err` has a truthy `code` and truthy `details`
 */
export function isGrpcError(err: any): err is grpc.StatusObject {
    // `== null` covers both null and undefined, on which a property access would throw.
    if (err == null) {
        return false;
    }
    // Coerce explicitly so that the predicate yields a real boolean instead of leaking `details`.
    return Boolean(err.code) && Boolean(err.details);
}
82 changes: 26 additions & 56 deletions components/server/src/authorization/spicedb-authorizer.ts
Original file line number Diff line number Diff line change
@@ -17,46 +17,6 @@ import { ctxTryGetCache, ctxTrySetCache } from "../util/request-context";
import { ApplicationError, ErrorCodes } from "@gitpod/gitpod-protocol/lib/messaging/error";
import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc";

/**
 * Runs `code` up to three times and returns the first successful result.
 *
 * A retry only happens for INTERNAL, DEADLINE_EXCEEDED and UNAVAILABLE errors, and only
 * while attempts are left. Any other error, or a failure on the last attempt,
 * is logged on "error" level and re-thrown unchanged.
 *
 * @param errMessage message used for the log entries written on failure
 * @param code the operation to run; receives the 1-based attempt number
 * @returns whatever `code` resolves to
 * @throws the error of the last attempt, or the first non-retryable error
 */
async function tryThree<T>(errMessage: string, code: (attempt: number) => Promise<T>): Promise<T> {
    const maxAttempts = 3;
    // we do sometimes see INTERNAL errors from SpiceDB, or grpc-js reports DEADLINE_EXCEEDED, so we retry a few times
    // last time we checked it was 15 times per day (check logs)
    const retryableCodes = [grpc.status.INTERNAL, grpc.status.DEADLINE_EXCEEDED, grpc.status.UNAVAILABLE];

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return await code(attempt);
        } catch (err) {
            const mayRetry = retryableCodes.includes(err.code) && attempt < maxAttempts;
            if (!mayRetry) {
                // we don't try again on other errors (or once the attempts are used up)
                log.error(errMessage, err, {
                    attempt,
                    code: err.code,
                });
                throw err;
            }

            // linear back-off, except for DEADLINE_EXCEEDED:
            // there we already waited for the timeout, so let's try again immediately
            const delay = err.code === grpc.status.DEADLINE_EXCEEDED ? 0 : 500 * attempt;
            log.warn(errMessage, err, {
                attempt,
                delay,
                code: err.code,
            });
            await new Promise((resolve) => setTimeout(resolve, delay));
        }
    }
    throw new Error("unreachable");
}

/**
 * Factory for a SpiceDBAuthorizer that uses the given client provider
 * together with a newly created RequestLocalZedTokenCache.
 *
 * @param clientProvider source of the SpiceDB client used by the authorizer
 */
export function createSpiceDBAuthorizer(clientProvider: SpiceDBClientProvider): SpiceDBAuthorizer {
    const tokenCache = new RequestLocalZedTokenCache();
    return new SpiceDBAuthorizer(clientProvider, tokenCache);
}
@@ -71,13 +31,11 @@ interface DeletionResult {
deletedAt?: string;
}

const GRPC_DEADLINE = 10_000;

export class SpiceDBAuthorizer {
constructor(private readonly clientProvider: SpiceDBClientProvider, private readonly tokenCache: ZedTokenCache) {}

/** Asks the client provider for the client on every access; nothing is cached here. */
private get client(): v1.ZedPromiseClientInterface {
    const { clientProvider } = this;
    return clientProvider.getClient();
}

public async check(req: v1.CheckPermissionRequest, experimentsFields: { userId: string }): Promise<boolean> {
req.consistency = await this.tokenCache.consistency(req.resource);
incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined");
@@ -99,8 +57,8 @@ export class SpiceDBAuthorizer {
const timer = spicedbClientLatency.startTimer();
let error: Error | undefined;
try {
const response = await tryThree("[spicedb] Failed to perform authorization check.", () =>
this.client.checkPermission(req, this.callOptions),
const response = await this.call("[spicedb] Failed to perform authorization check.", (client) =>
client.checkPermission(req, this.callOptions),
);
const permitted = response.permissionship === v1.CheckPermissionResponse_Permissionship.HAS_PERMISSION;
return { permitted, checkedAt: response.checkedAt?.token };
@@ -139,8 +97,8 @@ export class SpiceDBAuthorizer {
const timer = spicedbClientLatency.startTimer();
let error: Error | undefined;
try {
const response = await tryThree("[spicedb] Failed to write relationships.", () =>
this.client.writeRelationships(
const response = await this.call("[spicedb] Failed to write relationships.", (client) =>
client.writeRelationships(
v1.WriteRelationshipsRequest.create({
updates,
}),
@@ -175,16 +133,16 @@ export class SpiceDBAuthorizer {
let error: Error | undefined;
try {
let deletedAt: string | undefined = undefined;
const existing = await tryThree("readRelationships before deleteRelationships failed.", () =>
this.client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
const existing = await this.call("readRelationships before deleteRelationships failed.", (client) =>
client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
);
if (existing.length > 0) {
const response = await tryThree("deleteRelationships failed.", () =>
this.client.deleteRelationships(req, this.callOptions),
const response = await this.call("deleteRelationships failed.", (client) =>
client.deleteRelationships(req, this.callOptions),
);
deletedAt = response.deletedAt?.token;
const after = await tryThree("readRelationships failed.", () =>
this.client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
const after = await this.call("readRelationships failed.", (client) =>
client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions),
);
if (after.length > 0) {
log.error("[spicedb] Failed to delete relationships.", { existing, after, request: req });
@@ -213,7 +171,19 @@ export class SpiceDBAuthorizer {
async readRelationships(req: v1.ReadRelationshipsRequest): Promise<v1.ReadRelationshipsResponse[]> {
req.consistency = await this.tokenCache.consistency(undefined);
incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined");
return tryThree("readRelationships failed.", () => this.client.readRelationships(req, this.callOptions));
return this.call("readRelationships failed.", (client) => client.readRelationships(req, this.callOptions));
}

/**
 * Runs a single operation against the client obtained from the client provider,
 * logging any failure on "error" level before re-throwing it.
 *
 * No retry happens at this level.
 *
 * @param errMessage message used for the log entry written on failure
 * @param code the operation to run; receives the client to use
 * @returns whatever `code` resolves to
 * @throws the original error, unchanged, after it has been logged
 */
private async call<T>(errMessage: string, code: (client: v1.ZedPromiseClientInterface) => Promise<T>): Promise<T> {
    try {
        const client = this.clientProvider.getClient();
        // `await` is essential: returning the bare promise would let an asynchronous rejection
        // bypass this try/catch, so failed calls would never reach the log statement below.
        return await code(client);
    } catch (err) {
        log.error(errMessage, err, {
            code: err?.code,
        });
        throw err;
    }
}

/**
@@ -223,7 +193,7 @@ export class SpiceDBAuthorizer {
*/
private get callOptions(): grpc.Metadata {
return (<grpc.CallOptions>{
deadline: Date.now() + 8000,
deadline: Date.now() + GRPC_DEADLINE,
}) as any as grpc.Metadata;
}
}
25 changes: 17 additions & 8 deletions components/server/src/authorization/spicedb.ts
Original file line number Diff line number Diff line change
@@ -17,24 +17,33 @@ export interface SpiceDBClientConfig {

export type SpiceDBClient = v1.ZedPromiseClientInterface;
type Client = v1.ZedClientInterface & grpc.Client;

const DEFAULT_FEATURE_FLAG_VALUE = "undefined";
const DefaultClientOptions: grpc.ClientOptions = {
// we ping frequently to check if the connection is still alive
"grpc.keepalive_time_ms": 1000,
"grpc.keepalive_timeout_ms": 1000,
"grpc.keepalive_time_ms": 30_000,
"grpc.keepalive_timeout_ms": 4_000,

"grpc.max_reconnect_backoff_ms": 5_000,
"grpc.initial_reconnect_backoff_ms": 1_000,

"grpc.max_reconnect_backoff_ms": 5000,
"grpc.initial_reconnect_backoff_ms": 500,
// docs on client-side retry support: https://github.com/grpc/grpc-node/blob/0c093b0b7f78f691a4f6e41efc184899d7a2d987/examples/retry/README.md?plain=1#L3
"grpc.service_config_disable_resolution": 1, // don't resolve from external, but guarantee to take this config
"grpc.service_config": JSON.stringify({
methodConfig: [
{
// here is the code that shows how an empty shape matches every method: https://github.com/grpc/grpc-node/blob/bfd87a9bf62ebc438bcf98a7af223d5353f4c8b2/packages/grpc-js/src/resolving-load-balancer.ts#L62-L147
name: [{}],
// docs: https://github.com/grpc/grpc-proto/blob/master/grpc/service_config/service_config.proto#L88C29-L88C43
waitForReady: true,
// docs: https://github.com/grpc/grpc-proto/blob/master/grpc/service_config/service_config.proto#L136
retryPolicy: {
maxAttempts: 10,
initialBackoff: "0.1s",
maxAttempts: 5,
initialBackoff: "1s",
maxBackoff: "5s",
backoffMultiplier: 2.0,
retryableStatusCodes: ["UNAVAILABLE", "DEADLINE_EXCEEDED"],
// validation code: https://github.com/grpc/grpc-node/blob/0c093b0b7f78f691a4f6e41efc184899d7a2d987/packages/grpc-js/src/service-config.ts#L182C1-L197C4
retryableStatusCodes: ["UNAVAILABLE", "DEADLINE_EXCEEDED", "INTERNAL"],
},
},
],
@@ -43,7 +52,7 @@ const DefaultClientOptions: grpc.ClientOptions = {

// Governs how long DNS resolution results are cached (at minimum!)
// default is 30s, which is too long for us during rollouts (where service DNS entries are updated)
"grpc.dns_min_time_between_resolutions_ms": 2000,
"grpc.dns_min_time_between_resolutions_ms": 2_000,
};

export function spiceDBConfigFromEnv(): SpiceDBClientConfig | undefined {
3 changes: 2 additions & 1 deletion components/server/src/container-module.ts
Original file line number Diff line number Diff line change
@@ -15,6 +15,7 @@ import { DebugApp } from "@gitpod/gitpod-protocol/lib/util/debug-app";
import {
IClientCallMetrics,
createClientCallMetricsInterceptor,
createDebugLogInterceptor,
defaultGRPCOptions,
} from "@gitpod/gitpod-protocol/lib/util/grpc";
import { prometheusClientMiddleware } from "@gitpod/gitpod-protocol/lib/util/nice-grpc";
@@ -341,7 +342,7 @@ export const productionContainerModule = new ContainerModule(
const clientCallMetrics = ctx.container.get<IClientCallMetrics>(IClientCallMetrics);
return new SpiceDBClientProvider(
config, //
[createClientCallMetricsInterceptor(clientCallMetrics)],
[createClientCallMetricsInterceptor(clientCallMetrics), createDebugLogInterceptor()],
);
})
.inSingletonScope();

0 comments on commit 560e4dc

Please sign in to comment.