From 3a1b4d23e58d772f57733179025f53e819671344 Mon Sep 17 00:00:00 2001 From: RickyMa Date: Mon, 25 Mar 2024 19:03:23 +0800 Subject: [PATCH] [#1598] fix(server) Fix inaccurate used_direct_memory_size metric (#1599) ### What changes were proposed in this pull request? Fix the inaccurate `used_direct_memory_size` metric. The `used_direct_memory_size_by_netty` and `used_direct_memory_size_by_grpc_netty` metrics are also added to provide more detailed indicators for locating and analyzing direct memory issues in production. ### Why are the changes needed? Fix https://github.com/apache/incubator-uniffle/issues/1598. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Tested in our environment. --- .../uniffle/server/NettyDirectMemoryTracker.java | 15 ++++++++++++--- .../uniffle/server/ShuffleServerMetrics.java | 11 +++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/apache/uniffle/server/NettyDirectMemoryTracker.java b/server/src/main/java/org/apache/uniffle/server/NettyDirectMemoryTracker.java index 96206cc659..e9eb17060c 100644 --- a/server/src/main/java/org/apache/uniffle/server/NettyDirectMemoryTracker.java +++ b/server/src/main/java/org/apache/uniffle/server/NettyDirectMemoryTracker.java @@ -53,11 +53,20 @@ public void start() { service.scheduleAtFixedRate( () -> { try { - long usedDirectMemory = PlatformDependent.usedDirectMemory(); + long usedDirectMemoryByNetty = PlatformDependent.usedDirectMemory(); + long usedDirectMemoryByGrpcNetty = + io.grpc.netty.shaded.io.netty.util.internal.PlatformDependent.usedDirectMemory(); if (LOG.isDebugEnabled()) { - LOG.debug("Current usedDirectMemory:{}", usedDirectMemory); + LOG.debug( + "Current usedDirectMemoryByNetty:{}, usedDirectMemoryByGrpcNetty:{}", + usedDirectMemoryByNetty, + usedDirectMemoryByGrpcNetty); } - ShuffleServerMetrics.gaugeUsedDirectMemorySize.set(usedDirectMemory); + ShuffleServerMetrics.gaugeUsedDirectMemorySizeByNetty.set(usedDirectMemoryByNetty); 
ShuffleServerMetrics.gaugeUsedDirectMemorySizeByGrpcNetty.set( + usedDirectMemoryByGrpcNetty); + ShuffleServerMetrics.gaugeUsedDirectMemorySize.set( + usedDirectMemoryByNetty + usedDirectMemoryByGrpcNetty); } catch (Throwable t) { LOG.error("Failed to report direct memory.", t); } diff --git a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java index 274cde0089..f1f37369e6 100644 --- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java +++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java @@ -79,8 +79,9 @@ public class ShuffleServerMetrics { private static final String USED_BUFFER_SIZE = "used_buffer_size"; private static final String READ_USED_BUFFER_SIZE = "read_used_buffer_size"; private static final String USED_DIRECT_MEMORY_SIZE = "used_direct_memory_size"; - private static final String ALLOCATED_DIRECT_MEMORY_SIZE = "allocated_direct_memory_size"; - private static final String PINNED_DIRECT_MEMORY_SIZE = "pinned_direct_memory_size"; + private static final String USED_DIRECT_MEMORY_SIZE_BY_NETTY = "used_direct_memory_size_by_netty"; + private static final String USED_DIRECT_MEMORY_SIZE_BY_GRPC_NETTY = + "used_direct_memory_size_by_grpc_netty"; private static final String TOTAL_FAILED_WRITTEN_EVENT_NUM = "total_failed_written_event_num"; private static final String TOTAL_DROPPED_EVENT_NUM = "total_dropped_event_num"; private static final String TOTAL_HADOOP_WRITE_DATA = "total_hadoop_write_data"; @@ -186,6 +187,8 @@ public class ShuffleServerMetrics { public static Gauge.Child gaugeUsedBufferSize; public static Gauge.Child gaugeReadBufferUsedSize; public static Gauge.Child gaugeUsedDirectMemorySize; + public static Gauge.Child gaugeUsedDirectMemorySizeByNetty; + public static Gauge.Child gaugeUsedDirectMemorySizeByGrpcNetty; public static Gauge.Child gaugeWriteHandler; public static Gauge.Child gaugeEventQueueSize; 
public static Gauge.Child gaugeHadoopFlushThreadPoolQueueSize; @@ -382,6 +385,10 @@ private static void setUpMetrics() { gaugeUsedBufferSize = metricsManager.addLabeledGauge(USED_BUFFER_SIZE); gaugeReadBufferUsedSize = metricsManager.addLabeledGauge(READ_USED_BUFFER_SIZE); gaugeUsedDirectMemorySize = metricsManager.addLabeledGauge(USED_DIRECT_MEMORY_SIZE); + gaugeUsedDirectMemorySizeByNetty = + metricsManager.addLabeledGauge(USED_DIRECT_MEMORY_SIZE_BY_NETTY); + gaugeUsedDirectMemorySizeByGrpcNetty = + metricsManager.addLabeledGauge(USED_DIRECT_MEMORY_SIZE_BY_GRPC_NETTY); gaugeWriteHandler = metricsManager.addLabeledGauge(TOTAL_WRITE_HANDLER); gaugeEventQueueSize = metricsManager.addLabeledGauge(EVENT_QUEUE_SIZE); gaugeHadoopFlushThreadPoolQueueSize =