Skip to content

Commit 663a852

Browse files
authored
Additional keepalive diagnostics (#1692)
Motivation:

It's useful to know the state of keepalive when debugging connection issues. gRPC doesn't emit any logs around this at the moment, which makes debugging difficult.

Modifications:

Add additional logs to the idle handler when:
- the idle timeout task is scheduled, cancelled, and fires
- the keepalive timer is scheduled
- the scheduled close timer is fired
- the connection is closed
- GOAWAY frames are sent (already logs on receive)
- PING frames are sent and received

Result:

Better visibility into connection lifecycle
1 parent 02ff057 commit 663a852

File tree

2 files changed

+49
-15
lines changed

2 files changed

+49
-15
lines changed

Sources/GRPC/GRPCIdleHandler.swift

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -98,19 +98,6 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
9898
)
9999
}
100100

101-
private func sendGoAway(lastStreamID streamID: HTTP2StreamID) {
102-
guard let context = self.context else {
103-
return
104-
}
105-
106-
let frame = HTTP2Frame(
107-
streamID: .rootStream,
108-
payload: .goAway(lastStreamID: streamID, errorCode: .noError, opaqueData: nil)
109-
)
110-
111-
context.writeAndFlush(self.wrapOutboundOut(frame), promise: nil)
112-
}
113-
114101
private func perform(operations: GRPCIdleHandlerStateMachine.Operations) {
115102
// Prod the connection manager.
116103
if let event = operations.connectionManagerEvent, let manager = self.mode.connectionManager {
@@ -137,11 +124,17 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
137124
if let idleTask = operations.idleTask {
138125
switch idleTask {
139126
case let .cancel(task):
127+
self.stateMachine.logger.debug("idle timeout task cancelled")
140128
task.cancel()
141129

142130
case .schedule:
143131
if self.idleTimeout != .nanoseconds(.max), let context = self.context {
132+
self.stateMachine.logger.debug(
133+
"scheduling idle timeout task",
134+
metadata: [MetadataKey.delayMs: "\(self.idleTimeout.milliseconds)"]
135+
)
144136
let task = context.eventLoop.scheduleTask(in: self.idleTimeout) {
137+
self.stateMachine.logger.debug("idle timeout task fired")
145138
self.idleTimeoutFired()
146139
}
147140
self.perform(operations: self.stateMachine.scheduledIdleTimeoutTask(task))
@@ -151,6 +144,13 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
151144

152145
// Send a GOAWAY frame.
153146
if let streamID = operations.sendGoAwayWithLastPeerInitiatedStreamID {
147+
self.stateMachine.logger.debug(
148+
"sending GOAWAY frame",
149+
metadata: [
150+
MetadataKey.h2GoAwayLastStreamID: "\(Int(streamID))"
151+
]
152+
)
153+
154154
let goAwayFrame = HTTP2Frame(
155155
streamID: .rootStream,
156156
payload: .goAway(lastStreamID: streamID, errorCode: .noError, opaqueData: nil)
@@ -175,6 +175,7 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
175175
// Close on the next event-loop tick so we don't drop any events which are
176176
// currently being processed.
177177
context.eventLoop.execute {
178+
self.stateMachine.logger.debug("closing connection")
178179
context.close(mode: .all, promise: nil)
179180
}
180181
}
@@ -186,8 +187,12 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
186187
()
187188

188189
case .ack:
189-
// NIO's HTTP2 handler acks for us so this is a no-op.
190-
()
190+
// NIO's HTTP2 handler acks for us so this is a no-op. Log so it doesn't appear that we are
191+
// ignoring pings.
192+
self.stateMachine.logger.debug(
193+
"sending PING frame",
194+
metadata: [MetadataKey.h2PingAck: "true"]
195+
)
191196

192197
case .cancelScheduledTimeout:
193198
self.scheduledClose?.cancel()
@@ -197,6 +202,15 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
197202
self.schedulePing(in: delay, timeout: timeout)
198203

199204
case let .reply(framePayload):
205+
switch framePayload {
206+
case .ping(_, let ack):
207+
self.stateMachine.logger.debug(
208+
"sending PING frame",
209+
metadata: [MetadataKey.h2PingAck: "\(ack)"]
210+
)
211+
default:
212+
()
213+
}
200214
let frame = HTTP2Frame(streamID: .rootStream, payload: framePayload)
201215
self.context?.writeAndFlush(self.wrapOutboundOut(frame), promise: nil)
202216

@@ -210,6 +224,11 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
210224
return
211225
}
212226

227+
self.stateMachine.logger.debug(
228+
"scheduled keepalive pings",
229+
metadata: [MetadataKey.intervalMs: "\(delay.milliseconds)"]
230+
)
231+
213232
self.scheduledPing = self.context?.eventLoop.scheduleRepeatedTask(
214233
initialDelay: delay,
215234
delay: delay
@@ -226,6 +245,7 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
226245

227246
private func scheduleClose(in timeout: TimeAmount) {
228247
self.scheduledClose = self.context?.eventLoop.scheduleTask(in: timeout) {
248+
self.stateMachine.logger.debug("keepalive timer expired")
229249
self.perform(operations: self.stateMachine.shutdownNow())
230250
}
231251
}
@@ -318,6 +338,10 @@ internal final class GRPCIdleHandler: ChannelInboundHandler {
318338
case let .settings(.settings(settings)):
319339
self.perform(operations: self.stateMachine.receiveSettings(settings))
320340
case let .ping(data, ack):
341+
self.stateMachine.logger.debug(
342+
"received PING frame",
343+
metadata: [MetadataKey.h2PingAck: "\(ack)"]
344+
)
321345
self.handlePingAction(self.pingHandler.read(pingData: data, ack: ack))
322346
default:
323347
// We're not interested in other events.
@@ -350,3 +374,9 @@ extension HTTP2SettingsParameter {
350374
}
351375
}
352376
}
377+
378+
extension TimeAmount {
379+
fileprivate var milliseconds: Int64 {
380+
self.nanoseconds / 1_000_000
381+
}
382+
}

Sources/GRPC/Logger.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ enum MetadataKey {
3131
static let h2DataBytes = "h2_data_bytes"
3232
static let h2GoAwayError = "h2_goaway_error"
3333
static let h2GoAwayLastStreamID = "h2_goaway_last_stream_id"
34+
static let h2PingAck = "h2_ping_ack"
35+
36+
static let delayMs = "delay_ms"
37+
static let intervalMs = "interval_ms"
3438

3539
static let error = "error"
3640
}

0 commit comments

Comments
 (0)