From ec6557fe9823f1f16cc4c934cd588d67e8ab4dd3 Mon Sep 17 00:00:00 2001
From: Noah Kennedy <Nomaxx117@gmail.com>
Date: Wed, 5 Apr 2023 12:01:58 -0500
Subject: [PATCH] task: add short and long delay metrics (#44)

These new metrics bin the scheduled times of a task into short and long buckets.

Co-authored-by: Hayden Stainsby <hds@caffeineconcepts.com>
Co-authored-by: Jack Wrenn <jack@wrenn.fyi>
---
 README.md   |  21 +++++++
 src/task.rs | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 178 insertions(+)

diff --git a/README.md b/README.md
index a4abf67..684a6b5 100644
--- a/README.md
+++ b/README.md
@@ -67,6 +67,14 @@ loop {
   The total number of times that polling tasks completed slowly.
 - **[`total_slow_poll_duration`]**
   The total duration of slow polls.
+- **[`total_short_delay_count`]**
+  The total count of short scheduling delays.
+- **[`total_short_delay_duration`]**
+  The total duration of short scheduling delays.
+- **[`total_long_delay_count`]**
+  The total count of long scheduling delays.
+- **[`total_long_delay_duration`]**
+  The total duration of long scheduling delays.
 
 #### Derived Metrics
 - **[`mean_first_poll_delay`]**  
@@ -83,6 +91,12 @@ loop {
   The mean duration of fast polls.
 - **[`mean_slow_poll_duration`]**  
   The mean duration of slow polls.
+- **[`long_delay_ratio`]**
+- The ratio between the number of long scheduling delays and the number of total schedules.
+- **[`mean_short_delay_duration`]**
+  The mean duration of short schedules.
+- **[`mean_long_delay_duration`]**
+  The mean duration of long schedules.
 
 [`instrumented_count`]: https://docs.rs/tokio-metrics/0.2.*/tokio_metrics/struct.TaskMetrics.html#structfield.instrumented_count
 [`dropped_count`]: https://docs.rs/tokio-metrics/0.2.*/tokio_metrics/struct.TaskMetrics.html#structfield.dropped_count
@@ -105,6 +119,13 @@ loop {
 [`slow_poll_ratio`]: https://docs.rs/tokio-metrics/0.2.*/tokio_metrics/struct.TaskMetrics.html#method.slow_poll_ratio
 [`mean_fast_poll_duration`]: https://docs.rs/tokio-metrics/0.2.*/tokio_metrics/struct.TaskMetrics.html#method.mean_fast_poll_duration
 [`mean_slow_poll_duration`]: https://docs.rs/tokio-metrics/0.2.*/tokio_metrics/struct.TaskMetrics.html#method.mean_slow_poll_duration
+[`total_short_delay_count`]: https://docs.rs/tokio-metrics/0.2.*/tokio_metrics/struct.TaskMetrics.html#structfield.total_short_delay_count
+[`total_short_delay_duration`]: https://docs.rs/tokio-metrics/0.2.*/tokio_metrics/struct.TaskMetrics.html#structfield.total_short_delay_duration
+[`total_long_delay_count`]: https://docs.rs/tokio-metrics/0.2.*/tokio_metrics/struct.TaskMetrics.html#structfield.total_long_delay_count
+[`total_long_delay_duration`]: https://docs.rs/tokio-metrics/0.2.*/tokio_metrics/struct.TaskMetrics.html#structfield.total_long_delay_duration
+[`long_delay_ratio`]: https://docs.rs/tokio-metrics/0.2.*/tokio_metrics/struct.TaskMetrics.html#method.long_delay_ratio
+[`mean_short_delay_duration`]: https://docs.rs/tokio-metrics/0.2.*/tokio_metrics/struct.TaskMetrics.html#method.mean_short_delay_duration
+[`mean_long_delay_duration`]: https://docs.rs/tokio-metrics/0.2.*/tokio_metrics/struct.TaskMetrics.html#method.mean_long_delay_duration
 
 ## Getting Started With Runtime Metrics
 
diff --git a/src/task.rs b/src/task.rs
index 76cda32..28aadb6 100644
--- a/src/task.rs
+++ b/src/task.rs
@@ -205,6 +205,16 @@ use std::time::{Duration, Instant};
 ///   'scheduled' state of a task is the duration between the instant a task is awoken and the
 ///   instant it is subsequently polled. If this metric increases, it means that, on average, tasks
 ///   spent longer in tokio's queues before being polled.
+/// - **Did [`long_delay_ratio`][TaskMetrics::long_delay_ratio] increase?**
+///   This metric reflects the proportion of scheduling delays which were 'long'. If it increased,
+///   it means that a greater proportion of tasks experienced excessive delays before they could
+///   execute after being woken. This does not necessarily indicate an increase in latency, as this
+///   could be offset by fewer or faster task polls.
+/// - **Did [`mean_long_delay_duration`][TaskMetrics::mean_long_delay_duration] increase?**
+///   This metric reflects the mean duration of long delays. If it increased, it means that, on
+///   average, long delays got even longer. This does not necessarily imply increased task latency:
+///   an increase in average long delay duration could be offset by fewer or faster polls or more
+///   short schedules.
 ///
 /// If so, [*why are my tasks spending more time waiting to be polled?*](#why-are-my-tasks-spending-more-time-waiting-to-be-polled)
 ///
@@ -842,6 +852,10 @@ pub struct TaskMetrics {
     /// The total number of times that tasks were awoken (and then, presumably, scheduled for
     /// execution).
     ///
+    /// ##### Definition
+    /// This metric is equal to [`total_short_delay_duration`][TaskMetrics::total_short_delay_duration]
+    /// + [`total_long_delay_duration`][TaskMetrics::total_long_delay_duration].
+    ///
     /// ##### Derived metrics
     /// - **[`mean_scheduled_duration`][TaskMetrics::mean_scheduled_duration]**   
     ///   The mean duration that tasks spent waiting to be executed after awakening.
@@ -905,10 +919,15 @@ pub struct TaskMetrics {
     ///     assert_eq!(metrics_monitor.cumulative().total_scheduled_count, 5);
     /// }
     /// ```
+    #[doc(alias = "total_delay_count")]
     pub total_scheduled_count: u64,
 
     /// The total duration that tasks spent waiting to be polled after awakening.
     ///
+    /// ##### Definition
+    /// This metric is equal to [`total_short_delay_count`][TaskMetrics::total_short_delay_count]
+    /// + [`total_long_delay_count`][TaskMetrics::total_long_delay_count].
+    ///
     /// ##### Derived metrics
     /// - **[`mean_scheduled_duration`][TaskMetrics::mean_scheduled_duration]**   
     ///   The mean duration that tasks spent waiting to be executed after awakening.
@@ -941,6 +960,7 @@ pub struct TaskMetrics {
     ///     assert!(total_scheduled_duration <= Duration::from_millis(1100));
     /// }
     /// ```
+    #[doc(alias = "total_delay_duration")]
     pub total_scheduled_duration: Duration,
 
     /// The total number of times that tasks were polled.
@@ -1289,6 +1309,49 @@ pub struct TaskMetrics {
     /// }
     /// ```
     pub total_slow_poll_duration: Duration,
+
+    /// The total count of tasks with short scheduling delays.
+    ///
+    /// This is defined as tasks taking strictly less than
+    /// [`long_delay_threshold`][TaskMonitor::long_delay_threshold] to be executed after being
+    /// scheduled.
+    ///
+    /// ##### Derived metrics
+    /// - **[`mean_short_delay_duration`][TaskMetrics::mean_short_delay_duration]**   
+    ///   The mean duration of short scheduling delays.
+    pub total_short_delay_count: u64,
+
+    /// The total count of tasks with long scheduling delays.
+    ///
+    /// This is defined as tasks taking
+    /// [`long_delay_threshold`][TaskMonitor::long_delay_threshold] or longer to be executed
+    /// after being scheduled.
+    ///
+    /// ##### Derived metrics
+    /// - **[`mean_long_delay_duration`][TaskMetrics::mean_long_delay_duration]**   
+    ///   The mean duration of short scheduling delays.
+    pub total_long_delay_count: u64,
+
+    /// The total duration of tasks with short scheduling delays.
+    ///
+    /// This is defined as tasks taking strictly less than
+    /// [`long_delay_threshold`][TaskMonitor::long_delay_threshold] to be executed after being
+    /// scheduled.
+    ///
+    /// ##### Derived metrics
+    /// - **[`mean_short_delay_duration`][TaskMetrics::mean_short_delay_duration]**   
+    ///   The mean duration of short scheduling delays.
+    pub total_short_delay_duration: Duration,
+
+    /// The total number of times that a task had a long scheduling duration.
+    ///
+    /// Here, a long scheduling duration is defined as taking longer to start execution after
+    /// scheduling than [`long_delay_threshold`][TaskMonitor::long_delay_threshold].
+    ///
+    /// ##### Derived metrics
+    /// - **[`mean_long_delay_duration`][TaskMetrics::mean_long_delay_duration]**   
+    ///   The mean duration of short scheduling delays.
+    pub total_long_delay_duration: Duration,
 }
 
 /// Tracks the metrics, shared across the various types.
@@ -1297,6 +1360,9 @@ struct RawMetrics {
     /// A task poll takes longer than this, it is considered a slow poll.
     slow_poll_threshold: Duration,
 
+    /// A scheduling delay of at least this long will be considered a long delay
+    long_delay_threshold: Duration,
+
     /// Total number of instrumented tasks.
     instrumented_count: AtomicU64,
 
@@ -1315,6 +1381,12 @@ struct RawMetrics {
     /// Total number of times tasks were polled slow
     total_slow_poll_count: AtomicU64,
 
+    /// Total number of times tasks had long delay,
+    total_long_delay_count: AtomicU64,
+
+    /// Total number of times tasks had little delay
+    total_short_delay_count: AtomicU64,
+
     /// Total number of times tasks were dropped
     dropped_count: AtomicU64,
 
@@ -1332,6 +1404,12 @@ struct RawMetrics {
 
     /// Total amount of time tasks spent being polled above the slow cut off.
     total_slow_poll_duration: AtomicU64,
+
+    /// Total amount of time tasks spent being polled below the long delay cut off.
+    total_short_delay_duration_ns: AtomicU64,
+
+    /// Total amount of time tasks spent being polled at or above the long delay cut off.
+    total_long_delay_duration_ns: AtomicU64,
 }
 
 #[derive(Debug)]
@@ -1420,6 +1498,7 @@ impl TaskMonitor {
                 total_scheduled_count: AtomicU64::new(0),
                 total_fast_poll_count: AtomicU64::new(0),
                 total_slow_poll_count: AtomicU64::new(0),
+                total_long_delay_count: AtomicU64::new(0),
                 instrumented_count: AtomicU64::new(0),
                 dropped_count: AtomicU64::new(0),
                 total_first_poll_delay_ns: AtomicU64::new(0),
@@ -1427,6 +1506,10 @@ impl TaskMonitor {
                 total_idle_duration_ns: AtomicU64::new(0),
                 total_fast_poll_duration_ns: AtomicU64::new(0),
                 total_slow_poll_duration: AtomicU64::new(0),
+                total_short_delay_duration_ns: AtomicU64::new(0),
+                long_delay_threshold: Default::default(),
+                total_short_delay_count: AtomicU64::new(0),
+                total_long_delay_duration_ns: AtomicU64::new(0),
             }),
         }
     }
@@ -1453,6 +1536,12 @@ impl TaskMonitor {
         self.metrics.slow_poll_threshold
     }
 
+    /// Produces the duration greater-than-or-equal-to at which scheduling delays are categorized
+    /// as long.
+    pub fn long_delay_threshold(&self) -> Duration {
+        self.metrics.long_delay_threshold
+    }
+
     /// Produces an instrumented façade around a given async task.
     ///
     /// ##### Examples
@@ -1677,9 +1766,15 @@ impl TaskMonitor {
                     total_fast_poll_count: latest
                         .total_fast_poll_count
                         .wrapping_sub(previous.total_fast_poll_count),
+                    total_short_delay_count: latest
+                        .total_short_delay_count
+                        .wrapping_sub(previous.total_short_delay_count),
                     total_slow_poll_count: latest
                         .total_slow_poll_count
                         .wrapping_sub(previous.total_slow_poll_count),
+                    total_long_delay_count: latest
+                        .total_long_delay_count
+                        .wrapping_sub(previous.total_long_delay_count),
                     total_first_poll_delay: sub(
                         latest.total_first_poll_delay,
                         previous.total_first_poll_delay,
@@ -1696,10 +1791,18 @@ impl TaskMonitor {
                         latest.total_fast_poll_duration,
                         previous.total_fast_poll_duration,
                     ),
+                    total_short_delay_duration: sub(
+                        latest.total_short_delay_duration,
+                        previous.total_short_delay_duration,
+                    ),
                     total_slow_poll_duration: sub(
                         latest.total_slow_poll_duration,
                         previous.total_slow_poll_duration,
                     ),
+                    total_long_delay_duration: sub(
+                        latest.total_long_delay_duration,
+                        previous.total_long_delay_duration,
+                    ),
                 }
             } else {
                 latest
@@ -1736,6 +1839,8 @@ impl RawMetrics {
             total_scheduled_count: self.total_scheduled_count.load(SeqCst),
             total_fast_poll_count: self.total_fast_poll_count.load(SeqCst),
             total_slow_poll_count: self.total_slow_poll_count.load(SeqCst),
+            total_short_delay_count: self.total_short_delay_count.load(SeqCst),
+            total_long_delay_count: self.total_long_delay_count.load(SeqCst),
             total_first_poll_delay: Duration::from_nanos(
                 self.total_first_poll_delay_ns.load(SeqCst),
             ),
@@ -1749,6 +1854,12 @@ impl RawMetrics {
             total_slow_poll_duration: Duration::from_nanos(
                 self.total_slow_poll_duration.load(SeqCst),
             ),
+            total_short_delay_duration: Duration::from_nanos(
+                self.total_short_delay_duration_ns.load(SeqCst),
+            ),
+            total_long_delay_duration: Duration::from_nanos(
+                self.total_long_delay_duration_ns.load(SeqCst),
+            ),
         }
     }
 }
@@ -2052,6 +2163,15 @@ impl TaskMetrics {
         self.total_slow_poll_count as f64 / self.total_poll_count as f64
     }
 
+    /// The ratio of tasks exceeding [`long_delay_threshold`][TaskMonitor::long_delay_threshold].
+    ///
+    /// ##### Definition
+    /// This metric is derived from [`total_long_delay_count`][TaskMetrics::total_long_delay_count] ÷
+    /// [`total_scheduled_count`][TaskMetrics::total_scheduled_count].
+    pub fn long_delay_ratio(&self) -> f64 {
+        self.total_long_delay_count as f64 / self.total_scheduled_count as f64
+    }
+
     /// The mean duration of fast polls.
     ///
     /// ##### Definition
@@ -2126,6 +2246,20 @@ impl TaskMetrics {
         mean(self.total_fast_poll_duration, self.total_fast_poll_count)
     }
 
+    /// The average time taken for a task with a short scheduling delay to be executed after being
+    /// scheduled.
+    ///
+    /// ##### Definition
+    /// This metric is derived from
+    /// [`total_short_delay_duration`][TaskMetrics::total_short_delay_duration] ÷
+    /// [`total_short_delay_count`][TaskMetrics::total_short_delay_count].
+    pub fn mean_short_delay_duration(&self) -> Duration {
+        mean(
+            self.total_short_delay_duration,
+            self.total_short_delay_count,
+        )
+    }
+
     /// The mean duration of slow polls.
     ///
     /// ##### Definition
@@ -2216,6 +2350,17 @@ impl TaskMetrics {
     pub fn mean_slow_poll_duration(&self) -> Duration {
         mean(self.total_slow_poll_duration, self.total_slow_poll_count)
     }
+
+    /// The average scheduling delay for a task which takes a long time to start executing after
+    /// being scheduled.
+    ///
+    /// ##### Definition
+    /// This metric is derived from
+    /// [`total_long_delay_duration`][TaskMetrics::total_long_delay_duration] ÷
+    /// [`total_long_delay_count`][TaskMetrics::total_long_delay_count].
+    pub fn mean_long_delay_duration(&self) -> Duration {
+        mean(self.total_long_delay_duration, self.total_long_delay_count)
+    }
 }
 
 impl<T: Future> Future for Instrumented<T> {
@@ -2298,6 +2443,18 @@ fn instrument_poll<T, Out>(
             .try_into()
             .unwrap_or(u64::MAX);
 
+        let scheduled = Duration::from_nanos(scheduled_ns);
+
+        let (count_bucket, duration_bucket) = // was the scheduling delay long or short?
+            if scheduled >= metrics.long_delay_threshold {
+                (&metrics.total_long_delay_count, &metrics.total_long_delay_duration_ns)
+            } else {
+                (&metrics.total_short_delay_count, &metrics.total_short_delay_duration_ns)
+            };
+        // update the appropriate bucket
+        count_bucket.fetch_add(1, SeqCst);
+        duration_bucket.fetch_add(scheduled_ns, SeqCst);
+
         // add `scheduled_ns` to the Monitor's total
         metrics
             .total_scheduled_duration_ns