Skip to content

Commit

Permalink
Use milliseconds for time histogram metrics
Browse files Browse the repository at this point in the history
LongHistogram's default base2 exponentional aggregagtion is optimized
for latency range of 1ms to 100s. Hence we should record time metrics in
milliseconds instead of micros.

Relates: ES-9065
  • Loading branch information
ywangd committed Aug 1, 2024
1 parent b3e7d43 commit 93d6077
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

import static org.elasticsearch.repositories.RepositoriesMetrics.HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM;
import static org.elasticsearch.repositories.RepositoriesMetrics.HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM;
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_HISTOGRAM;
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL;
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_TOTAL;
Expand All @@ -52,6 +53,7 @@
import static org.elasticsearch.rest.RestStatus.TOO_MANY_REQUESTS;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.lessThanOrEqualTo;

@SuppressForbidden(reason = "this test uses a HttpServer to emulate an S3 endpoint")
// Need to set up a new cluster for each test because cluster settings use randomized authentication settings
Expand Down Expand Up @@ -98,6 +100,35 @@ private static BlobContainer getBlobContainer(String dataNodeName, String reposi
return blobStoreRepository.blobStore().blobContainer(BlobPath.EMPTY.add(randomIdentifier()));
}

public void testHttpRequestTimeCaptureInMilliseconds() throws IOException {
final String repository = createRepository(randomRepositoryName());
final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData);
final TestTelemetryPlugin plugin = getPlugin(dataNodeName);
final OperationPurpose purpose = randomFrom(OperationPurpose.values());
final BlobContainer blobContainer = getBlobContainer(dataNodeName, repository);
final String blobName = randomIdentifier();

long before = System.nanoTime();
blobContainer.writeBlob(purpose, blobName, new BytesArray(randomBytes(between(10, 1000))), false);
long elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - before);
assertThat(getLongHistogramValue(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.PUT_OBJECT), lessThanOrEqualTo(elapsed));

plugin.resetMeter();
before = System.nanoTime();
blobContainer.readBlob(purpose, blobName).close();
elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - before);
assertThat(getLongHistogramValue(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.GET_OBJECT), lessThanOrEqualTo(elapsed));

plugin.resetMeter();
before = System.nanoTime();
blobContainer.deleteBlobsIgnoringIfNotExists(purpose, Iterators.single(blobName));
elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - before);
assertThat(
getLongHistogramValue(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.DELETE_OBJECTS),
lessThanOrEqualTo(elapsed)
);
}

public void testMetricsWithErrors() throws IOException {
final String repository = createRepository(randomRepositoryName());

Expand All @@ -121,7 +152,7 @@ public void testMetricsWithErrors() throws IOException {
assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.PUT_OBJECT), equalTo(2L * batch));
assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.PUT_OBJECT), equalTo(batch));
assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.PUT_OBJECT), equalTo(2L * batch));
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.PUT_OBJECT), equalTo(batch));
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.PUT_OBJECT), equalTo(batch));
}

// Get not found
Expand All @@ -141,7 +172,7 @@ public void testMetricsWithErrors() throws IOException {
assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.GET_OBJECT), equalTo(batch));
assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));

// Make sure we don't hit the request range not satisfied counters
assertThat(getLongCounterValue(plugin, METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL, Operation.GET_OBJECT), equalTo(0L));
Expand All @@ -164,7 +195,7 @@ public void testMetricsWithErrors() throws IOException {
assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.LIST_OBJECTS), equalTo(5L * batch));
assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.LIST_OBJECTS), equalTo(batch));
assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.LIST_OBJECTS), equalTo(5L * batch));
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.LIST_OBJECTS), equalTo(batch));
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.LIST_OBJECTS), equalTo(batch));
}

// Delete to clean up
Expand All @@ -176,7 +207,7 @@ public void testMetricsWithErrors() throws IOException {
assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.DELETE_OBJECTS), equalTo(0L));
assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.DELETE_OBJECTS), equalTo(0L));
assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.DELETE_OBJECTS), equalTo(0L));
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.DELETE_OBJECTS), equalTo(1L));
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.DELETE_OBJECTS), equalTo(1L));
}

public void testMetricsForRequestRangeNotSatisfied() {
Expand Down Expand Up @@ -208,7 +239,7 @@ public void testMetricsForRequestRangeNotSatisfied() {
);
assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.GET_OBJECT), equalTo(2 * batch));
assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.GET_OBJECT), equalTo(2 * batch));
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,11 +224,11 @@ private void maybeRecordHttpRequestTime(Request<?> request) {
return;
}

final long totalTimeInMicros = getTotalTimeInMicros(requestTimesIncludingRetries);
if (totalTimeInMicros == 0) {
final long totalTimeInMillis = getTotalTimeInMillis(requestTimesIncludingRetries);
if (totalTimeInMillis == 0) {
logger.warn("Expected HttpRequestTime to be tracked for request [{}] but found no count.", request);
} else {
s3RepositoriesMetrics.common().httpRequestTimeInMicroHistogram().record(totalTimeInMicros, attributes);
s3RepositoriesMetrics.common().httpRequestTimeInMillisHistogram().record(totalTimeInMillis, attributes);
}
}

Expand Down Expand Up @@ -271,18 +271,19 @@ private static long getCountForMetric(TimingInfo info, AWSRequestMetrics.Field f
}
}

private static long getTotalTimeInMicros(List<TimingInfo> requestTimesIncludingRetries) {
// Here we calculate the timing in Microseconds for the sum of the individual subMeasurements with the goal of deriving the TTFB
// (time to first byte). We calculate the time in micros for later use with an APM style counter (exposed as a long), rather than
// using the default double exposed by getTimeTakenMillisIfKnown().
long totalTimeInMicros = 0;
private static long getTotalTimeInMillis(List<TimingInfo> requestTimesIncludingRetries) {
// Here we calculate the timing in Milliseconds for the sum of the individual subMeasurements with the goal of deriving the TTFB
// (time to first byte). We calculate the time in millis for later use with an APM style counter (exposed as a long), rather than
// using the default double exposed by getTimeTakenMillisIfKnown(). We don't need sub-millisecond precision. So no need perform
// the data type castings.
long totalTimeInMillis = 0;
for (TimingInfo timingInfo : requestTimesIncludingRetries) {
var endTimeInNanos = timingInfo.getEndTimeNanoIfKnown();
if (endTimeInNanos != null) {
totalTimeInMicros += TimeUnit.NANOSECONDS.toMicros(endTimeInNanos - timingInfo.getStartTimeNano());
totalTimeInMillis += TimeUnit.NANOSECONDS.toMillis(endTimeInNanos - timingInfo.getStartTimeNano());
}
}
return totalTimeInMicros;
return totalTimeInMillis;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ public record RepositoriesMetrics(
LongCounter unsuccessfulOperationCounter,
LongHistogram exceptionHistogram,
LongHistogram throttleHistogram,
LongHistogram httpRequestTimeInMicroHistogram
LongHistogram httpRequestTimeInMillisHistogram
) {

public static RepositoriesMetrics NOOP = new RepositoriesMetrics(MeterRegistry.NOOP);
Expand All @@ -36,7 +36,7 @@ public record RepositoriesMetrics(
public static final String METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL = "es.repositories.operations.unsuccessful.total";
public static final String METRIC_EXCEPTIONS_HISTOGRAM = "es.repositories.exceptions.histogram";
public static final String METRIC_THROTTLES_HISTOGRAM = "es.repositories.throttles.histogram";
public static final String HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM = "es.repositories.requests.http_request_time.histogram";
public static final String HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM = "es.repositories.requests.http_request_time.histogram";

public RepositoriesMetrics(MeterRegistry meterRegistry) {
this(
Expand All @@ -54,9 +54,9 @@ public RepositoriesMetrics(MeterRegistry meterRegistry) {
meterRegistry.registerLongHistogram(METRIC_EXCEPTIONS_HISTOGRAM, "repository request exception histogram", "unit"),
meterRegistry.registerLongHistogram(METRIC_THROTTLES_HISTOGRAM, "repository request throttle histogram", "unit"),
meterRegistry.registerLongHistogram(
HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM,
"HttpRequestTime in microseconds expressed as as a histogram",
"micros"
HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM,
"HttpRequestTime in milliseconds expressed as as a histogram",
"ms"
)
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ public BlobCacheMetrics(MeterRegistry meterRegistry) {
),
meterRegistry.registerLongHistogram(
"es.blob_cache.cache_miss_load_times.histogram",
"The time in microseconds for populating entries in the blob store resulting from a cache miss, expressed as a histogram.",
"micros"
"The time in milliseconds for populating entries in the blob store resulting from a cache miss, expressed as a histogram.",
"ms"
)
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1116,7 +1116,7 @@ public void fillCacheRange(
IntConsumer progressUpdater
) throws IOException {
writer.fillCacheRange(channel, channelPos, streamFactory, relativePos, length, progressUpdater);
var elapsedTime = TimeUnit.NANOSECONDS.toMicros(relativeTimeInNanosSupplier.getAsLong() - startTime);
var elapsedTime = TimeUnit.NANOSECONDS.toMillis(relativeTimeInNanosSupplier.getAsLong() - startTime);
SharedBlobCacheService.this.blobCacheMetrics.getCacheMissLoadTimes().record(elapsedTime);
SharedBlobCacheService.this.blobCacheMetrics.getCacheMissCounter().increment();
}
Expand Down

0 comments on commit 93d6077

Please sign in to comment.