From 986ddb89ac4ee6df14c8c4903bfe9c9427da5b51 Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Tue, 23 Sep 2025 18:20:14 +0200
Subject: [PATCH 1/3] netstacklat: Exclude TCP reads for HOL-blocked segments

The 'tcp-socket-read' hook currently reports the latency for the skb
containing the last TCP segment read from the socket. However, this
segment might have been head-of-line (HOL) blocked by a missing earlier
segment. In that case, netstacklat's reported latency will include
HOL-blocking periods that depend on external factors (such as network
packet loss and network latency, which determine the retransmission
time). As netstacklat is primarily intended to identify issues within
the local host (in the network stack or receiving applications), filter
out any socket reads where the last read skb might have experienced
HOL blocking.

To exclude HOL-blocked reads, track whenever a TCP segment arrives
out-of-order (ooo), i.e. ahead of the expected next sequence number.
These segments are kept in a separate ooo-queue and later merged back
into the socket receive queue once the missing segment gap has been
filled. The ooo-segments are therefore the only segments that may
experience HOL blocking (as non-ooo segments are immediately added to
the socket receive queue).

Specifically, track the right edge (the maximum) of the ooo sequence
range. If the final byte of a socket read is below the right edge of
the ooo sequence range, filter the read out as potentially HOL blocked.
If the last read byte is ahead of the ooo-range, it must have been read
from a segment that arrived in order and therefore hasn't experienced
HOL blocking, so it's safe to include the latency measurement.
Furthermore, invalidate the ooo-range when the last read byte is ahead
of it, to prevent a stale ooo-range value from sticking around until
the sequence number wraps around.

There are some scenarios that may cause this ooo-filtering to fail.

- If the program is started in the middle of an ongoing flow, there may
  be ooo segments that arrived before we could record them in the
  tracked ooo-range. This may cause the latency for some initial
  HOL-blocked reads to be reported.

- If multiple reads are done on the socket concurrently, we may not
  correctly track the last read byte. The kernel does not hold a lock
  on the TCP socket at the time our hooked function
  tcp_recv_timestamp() runs. If two reads are done in parallel, it's
  therefore possible that for both reads we check the last read byte
  (tcp_sock.copied_seq) after the second read has updated it. We may
  then incorrectly conclude that the first read was ahead of the
  ooo-range when it was not, and record its latency when we should
  have excluded it. In practice I believe this issue should be quite
  rare, as most applications will probably not perform multiple reads
  on a single connected TCP socket in parallel (as then you cannot
  know which part of the payload each read will return).

- The tracked ooo-range may be updated concurrently from
  tcp_data_queue_ofo() and tcp_recv_timestamp(), leading to
  inconsistent state.

The last of these issues will be addressed in a later commit that adds
BPF spin locks.
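For illustration, the decision rule applied at read time boils down to the
following minimal, userspace-style C sketch. It mirrors the per-socket state
and the wrap-around-safe comparison that this patch adds to
netstacklat.bpf.c; the typedefs and the function name
read_may_be_hol_blocked() exist only in this sketch, not in the actual code.

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t u32;
typedef int32_t s32;

struct tcp_sock_ooo_range {
	u32 ooo_seq_end; /* right edge (max end_seq) of observed ooo segments */
	bool active;     /* whether ooo_seq_end currently holds a valid value */
};

/* Is a < b, taking u32 wrap-around into account? */
static bool u32_lt(u32 a, u32 b)
{
	return (s32)(a - b) < 0;
}

/*
 * Called after a socket read; copied_seq is the sequence number right
 * after the last byte the application has read. Returns true if the
 * read may include HOL-blocked data and should be filtered out.
 */
static bool read_may_be_hol_blocked(struct tcp_sock_ooo_range *r, u32 copied_seq)
{
	if (!r->active)
		return false;

	if (u32_lt(r->ooo_seq_end, copied_seq)) {
		/* Read ended past the ooo-range: keep the measurement and
		 * invalidate the now stale range. */
		r->active = false;
		return false;
	}

	return true;
}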
Signed-off-by: Simon Sundberg --- headers/vmlinux/vmlinux_net.h | 76 ++++++++++++++++++++++++++++ netstacklat/netstacklat.bpf.c | 94 +++++++++++++++++++++++++++++++++++ netstacklat/netstacklat.c | 3 +- 3 files changed, 172 insertions(+), 1 deletion(-) diff --git a/headers/vmlinux/vmlinux_net.h b/headers/vmlinux/vmlinux_net.h index b0f6476e..3f414089 100644 --- a/headers/vmlinux/vmlinux_net.h +++ b/headers/vmlinux/vmlinux_net.h @@ -161,6 +161,35 @@ struct sk_buff { struct skb_ext *extensions; }; +struct tcp_skb_cb { + __u32 seq; + __u32 end_seq; + union { + struct { + u16 tcp_gso_segs; + u16 tcp_gso_size; + }; + }; + __u8 tcp_flags; + __u8 sacked; + __u8 ip_dsfield; + __u8 txstamp_ack : 1; + __u8 eor : 1; + __u8 has_rxtstamp : 1; + __u8 unused : 5; + __u32 ack_seq; + union { + struct { + __u32 is_app_limited : 1; + __u32 delivered_ce : 20; + __u32 unused : 11; + __u32 delivered; + u64 first_tx_mstamp; + u64 delivered_mstamp; + } tx; + }; +}; + struct nf_conn { unsigned long status; }; @@ -202,4 +231,51 @@ struct sock { u32 sk_rx_dst_cookie; }; +struct inet_sock { + struct sock sk; +}; + +struct inet_connection_sock { + struct inet_sock icsk_inet; +}; + +struct tcp_sock { + struct inet_connection_sock inet_conn; + __u8 __cacheline_group_begin__tcp_sock_read_tx[0]; + u32 max_window; + u32 rcv_ssthresh; + u32 reordering; + u32 notsent_lowat; + u16 gso_segs; + struct sk_buff *lost_skb_hint; + struct sk_buff *retransmit_skb_hint; + __u8 __cacheline_group_end__tcp_sock_read_tx[0]; + __u8 __cacheline_group_begin__tcp_sock_read_txrx[0]; + u32 tsoffset; + u32 snd_wnd; + u32 mss_cache; + u32 snd_cwnd; + u32 prr_out; + u32 lost_out; + u32 sacked_out; + u16 tcp_header_len; + u8 scaling_ratio; + u8 chrono_type : 2; + u8 repair : 1; + u8 tcp_usec_ts : 1; + u8 is_sack_reneg : 1; + u8 is_cwnd_limited : 1; + __u8 __cacheline_group_end__tcp_sock_read_txrx[0]; + __u8 __cacheline_group_begin__tcp_sock_read_rx[0]; + u32 copied_seq; + u32 rcv_tstamp; + u32 snd_wl1; + u32 tlp_high_seq; + u32 rttvar_us; + u32 retrans_out; + u16 advmss; + u16 urg_data; + u32 lost; +}; + #endif /* __VMLINUX_NET_H__ */ diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c index 1a1b0afe..297ee9db 100644 --- a/netstacklat/netstacklat.bpf.c +++ b/netstacklat/netstacklat.bpf.c @@ -11,6 +11,10 @@ #define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) +// Mimic macros from /include/net/tcp.h +#define tcp_sk(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) +#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) + char LICENSE[] SEC("license") = "GPL"; @@ -38,6 +42,12 @@ struct sk_buff___old { __u8 mono_delivery_time: 1; } __attribute__((preserve_access_index)); +struct tcp_sock_ooo_range { + u32 ooo_seq_end; + /* indicates if ooo_seq_end is still valid (as 0 can be valid seq) */ + bool active; +}; + struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64); @@ -66,6 +76,22 @@ struct { __type(value, u64); } netstack_cgroupfilter SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct tcp_sock_ooo_range); +} netstack_tcp_ooo_range SEC(".maps"); + +/* + * Is a < b considering u32 wrap around? 
+ * Based on the before() function in /include/net/tcp.h + */ +static bool u32_lt(u32 a, u32 b) +{ + return (s32)(a - b) < 0; +} + static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key) { u64 zero = 0; @@ -331,6 +357,60 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, record_latency_since(tstamp, &key); } +static void tcp_update_ooo_range(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock_ooo_range *tp_ooo_range; + + tp_ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL, + BPF_SK_STORAGE_GET_F_CREATE); + if (!tp_ooo_range) + return; + + if (tp_ooo_range->active) { + if (u32_lt(tp_ooo_range->ooo_seq_end, TCP_SKB_CB(skb)->end_seq)) + tp_ooo_range->ooo_seq_end = TCP_SKB_CB(skb)->end_seq; + } else { + tp_ooo_range->ooo_seq_end = TCP_SKB_CB(skb)->end_seq; + tp_ooo_range->active = true; + } + +} + +static bool tcp_read_in_ooo_range(struct sock *sk) +{ + struct tcp_sock_ooo_range *tp_ooo_range; + struct tcp_sock *tp = tcp_sk(sk); + u32 last_read_seq; + int err; + + tp_ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL, 0); + if (!tp_ooo_range) + /* no recorded ooo-range for sock, so cannot be in ooo-range */ + return false; + + if (!tp_ooo_range->active) + return false; + + err = bpf_core_read(&last_read_seq, sizeof(last_read_seq), &tp->copied_seq); + if (err) { + /* + * Shouldn't happen. + * Should probably emit some warning if reading copied_seq + * unexpectedly fails. Assume not in ooo-range to avoid + * systematically filtering out ALL values if this does happen. + */ + bpf_printk("failed to read tcp_sock->copied_seq: err=%d", err); + return false; + } + + if (u32_lt(tp_ooo_range->ooo_seq_end, last_read_seq)) { + tp_ooo_range->active = false; + return false; + } else { + return true; + } +} + SEC("fentry/ip_rcv_core") int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block, void *tp, void *res, bool compat_mode) @@ -396,6 +476,11 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { struct timespec64 *ts = &tss->ts[0]; + + /* skip if preceeding sock read ended in ooo-range */ + if (tcp_read_in_ooo_range(sk)) + return 0; + record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, NETSTACKLAT_HOOK_TCP_SOCK_READ); @@ -410,3 +495,12 @@ int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, NETSTACKLAT_HOOK_UDP_SOCK_READ); return 0; } + +/* This program should also be disabled if tcp-socket-read is disabled */ +SEC("fentry/tcp_data_queue_ofo") +int BPF_PROG(netstacklat_tcp_data_queue_ofo, struct sock *sk, + struct sk_buff *skb) +{ + tcp_update_ooo_range(sk, skb); + return 0; +} diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index 70dd4111..b3009b73 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -258,7 +258,8 @@ static void hook_to_progs(struct hook_prog_collection *progs, break; case NETSTACKLAT_HOOK_TCP_SOCK_READ: progs->progs[0] = obj->progs.netstacklat_tcp_recv_timestamp; - progs->nprogs = 1; + progs->progs[1] = obj->progs.netstacklat_tcp_data_queue_ofo; + progs->nprogs = 2; break; case NETSTACKLAT_HOOK_UDP_SOCK_READ: progs->progs[0] = obj->progs.netstacklat_skb_consume_udp; From 4504b758ce7075fc1b76febd7d35806b1b83bbeb Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Mon, 29 Sep 2025 12:01:51 +0200 Subject: [PATCH 2/3] netstacklat: Protect out-of-order tracking with spinlocks Currently there is no synchronization between the updates and 
checks of the per-socket ooo-range used to filter out potentially HOL-blocked measurements. While the kernel holds a socket lock when the ooo-range is updated in tcp_data_queue_ofo(), the kernel does not seem to hold a socket lock by the time we check the ooo-range in tcp_recv_timestamp(). To prevent the check of the ooo-range from reading the ooo-range while it's concurrently updated (resulting in a potentially inconsistent state), add a bpf_spin_lock to protect updates and checks of the ooo-range. Signed-off-by: Simon Sundberg --- netstacklat/netstacklat.bpf.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c index 297ee9db..bb7abfdb 100644 --- a/netstacklat/netstacklat.bpf.c +++ b/netstacklat/netstacklat.bpf.c @@ -43,6 +43,7 @@ struct sk_buff___old { } __attribute__((preserve_access_index)); struct tcp_sock_ooo_range { + struct bpf_spin_lock lock; u32 ooo_seq_end; /* indicates if ooo_seq_end is still valid (as 0 can be valid seq) */ bool active; @@ -366,6 +367,7 @@ static void tcp_update_ooo_range(struct sock *sk, struct sk_buff *skb) if (!tp_ooo_range) return; + bpf_spin_lock(&tp_ooo_range->lock); if (tp_ooo_range->active) { if (u32_lt(tp_ooo_range->ooo_seq_end, TCP_SKB_CB(skb)->end_seq)) tp_ooo_range->ooo_seq_end = TCP_SKB_CB(skb)->end_seq; @@ -373,6 +375,7 @@ static void tcp_update_ooo_range(struct sock *sk, struct sk_buff *skb) tp_ooo_range->ooo_seq_end = TCP_SKB_CB(skb)->end_seq; tp_ooo_range->active = true; } + bpf_spin_unlock(&tp_ooo_range->lock); } @@ -381,6 +384,7 @@ static bool tcp_read_in_ooo_range(struct sock *sk) struct tcp_sock_ooo_range *tp_ooo_range; struct tcp_sock *tp = tcp_sk(sk); u32 last_read_seq; + bool ret; int err; tp_ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL, 0); @@ -388,9 +392,6 @@ static bool tcp_read_in_ooo_range(struct sock *sk) /* no recorded ooo-range for sock, so cannot be in ooo-range */ return false; - if (!tp_ooo_range->active) - return false; - err = bpf_core_read(&last_read_seq, sizeof(last_read_seq), &tp->copied_seq); if (err) { /* @@ -403,12 +404,20 @@ static bool tcp_read_in_ooo_range(struct sock *sk) return false; } - if (u32_lt(tp_ooo_range->ooo_seq_end, last_read_seq)) { - tp_ooo_range->active = false; - return false; - } else { - return true; + bpf_spin_lock(&tp_ooo_range->lock); + if (!tp_ooo_range->active) { + ret = false; + } else { + if (u32_lt(tp_ooo_range->ooo_seq_end, last_read_seq)) { + tp_ooo_range->active = false; + ret = false; + } else { + ret = true; + } } + + bpf_spin_unlock(&tp_ooo_range->lock); + return ret; } SEC("fentry/ip_rcv_core") From 94b14b78034fc7e7f3bf767f397b1b5c6567673e Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Mon, 29 Sep 2025 14:00:43 +0200 Subject: [PATCH 3/3] netstacklat: Make HOL-blocking filtering configurable To make netstacklat only report latency based on the local network and application stack, it by default excludes TCP reads that may be head-of-line (HOL) blocked by a missing segment. However, in some scenarios, it may be desirable to still include these HOL-blocked measurements. For example, in a data center where the full network is under your control, high latency caused by drops in the data center network could still be an issue you wish netstacklat to detect. Therefore, add the -y/--include-tcp-hol-delay option, which disables the mechanism used to filter out potentially HOL-blocked reads. 
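For a libbpf-based loader that consumes netstacklat.bpf.c directly (rather
than through netstacklat's own CLI), the flag is just a read-only constant
that must be set before the object is loaded. A minimal sketch using a
bpftool-generated skeleton; the skeleton name netstacklat_bpf and the header
name are assumptions for this sketch, not taken from this patch:

#include <errno.h>
#include "netstacklat.bpf.skel.h" /* assumed skeleton header name */

static int open_and_load_including_hol(struct netstacklat_bpf **out)
{
	struct netstacklat_bpf *obj;
	int err;

	obj = netstacklat_bpf__open();
	if (!obj)
		return -errno;

	/* user_config lives in rodata, so it must be set before load */
	obj->rodata->user_config.include_hol_blocked = true;

	/* the ooo-range tracking is unused in this mode, so don't load it */
	bpf_program__set_autoload(obj->progs.netstacklat_tcp_data_queue_ofo,
				  false);

	err = netstacklat_bpf__load(obj);
	if (err) {
		netstacklat_bpf__destroy(obj);
		return err;
	}

	*out = obj;
	return 0;
}

netstacklat.c itself takes the equivalent path through its existing option
parsing and autoload handling, as shown in the diff below.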
An alternative to simply disabling the filtering mechanism would be to separate the potentially HOL-blocked measurements into their own histogram (e.g. "tcp-socket-read-hol-blocked"). However, that would further increase the data cardinality (which may be a concern when using it together with ebpf-exporter). Furthermore, simply disabling the filtering instead reduces the overhead a bit and also allows offering the option on systems where the tcp_data_queue_ofo() function cannot be probed. Signed-off-by: Simon Sundberg --- netstacklat/netstacklat.bpf.c | 12 ++++++++++-- netstacklat/netstacklat.c | 33 +++++++++++++++++++++------------ netstacklat/netstacklat.h | 1 + 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c index bb7abfdb..e8a3494f 100644 --- a/netstacklat/netstacklat.bpf.c +++ b/netstacklat/netstacklat.bpf.c @@ -27,6 +27,7 @@ volatile const struct netstacklat_bpf_config user_config = { .filter_cgroup = false, .groupby_ifindex = false, .groupby_cgroup = false, + .include_hol_blocked = false, }; /* @@ -487,7 +488,7 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct timespec64 *ts = &tss->ts[0]; /* skip if preceeding sock read ended in ooo-range */ - if (tcp_read_in_ooo_range(sk)) + if (!user_config.include_hol_blocked && tcp_read_in_ooo_range(sk)) return 0; record_socket_latency(sk, NULL, @@ -510,6 +511,13 @@ SEC("fentry/tcp_data_queue_ofo") int BPF_PROG(netstacklat_tcp_data_queue_ofo, struct sock *sk, struct sk_buff *skb) { - tcp_update_ooo_range(sk, skb); + if (!user_config.include_hol_blocked) + /* + * It's better to not load this program at all if the ooo-range + * tracking isn't needed (like done by netstacklat.c). + * But if an external loader (like ebpf-exporter) is used, + * this should at least minimize the unnecessary overhead. 
+ */ + tcp_update_ooo_range(sk, skb); return 0; } diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index b3009b73..475c14e8 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -83,18 +83,19 @@ struct netstacklat_config { }; static const struct option long_options[] = { - { "help", no_argument, NULL, 'h' }, - { "report-interval", required_argument, NULL, 'r' }, - { "list-probes", no_argument, NULL, 'l' }, - { "enable-probes", required_argument, NULL, 'e' }, - { "disable-probes", required_argument, NULL, 'd' }, - { "pids", required_argument, NULL, 'p' }, - { "interfaces", required_argument, NULL, 'i' }, - { "network-namespace", required_argument, NULL, 'n' }, - { "cgroups", required_argument, NULL, 'c' }, - { "min-queuelength", required_argument, NULL, 'q' }, - { "groupby-interface", no_argument, NULL, 'I' }, - { "groupby-cgroup", no_argument, NULL, 'C' }, + { "help", no_argument, NULL, 'h' }, + { "report-interval", required_argument, NULL, 'r' }, + { "list-probes", no_argument, NULL, 'l' }, + { "enable-probes", required_argument, NULL, 'e' }, + { "disable-probes", required_argument, NULL, 'd' }, + { "pids", required_argument, NULL, 'p' }, + { "interfaces", required_argument, NULL, 'i' }, + { "network-namespace", required_argument, NULL, 'n' }, + { "cgroups", required_argument, NULL, 'c' }, + { "min-queuelength", required_argument, NULL, 'q' }, + { "groupby-interface", no_argument, NULL, 'I' }, + { "groupby-cgroup", no_argument, NULL, 'C' }, + { "include-tcp-hol-delay", no_argument, NULL, 'y' }, { 0, 0, 0, 0 } }; @@ -565,6 +566,7 @@ static int parse_arguments(int argc, char *argv[], conf->bpf_conf.filter_cgroup = false; conf->bpf_conf.groupby_ifindex = false; conf->bpf_conf.groupby_cgroup = false; + conf->bpf_conf.include_hol_blocked = false; for (i = 0; i < NETSTACKLAT_N_HOOKS; i++) // All probes enabled by default @@ -659,6 +661,9 @@ static int parse_arguments(int argc, char *argv[], case 'C': // groupby-cgroup conf->bpf_conf.groupby_cgroup = true; break; + case 'y': // include-tcp-hol-delay + conf->bpf_conf.include_hol_blocked = true; + break; case 'h': // help print_usage(stdout, argv[0]); exit(EXIT_SUCCESS); @@ -1113,6 +1118,10 @@ static void set_programs_to_load(const struct netstacklat_config *conf, bpf_program__set_autoload(progs.progs[i], conf->enabled_hooks[hook]); } + + if (conf->bpf_conf.include_hol_blocked) + bpf_program__set_autoload( + obj->progs.netstacklat_tcp_data_queue_ofo, false); } static int set_map_sizes(const struct netstacklat_config *conf, diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h index d0da8553..d1708ce4 100644 --- a/netstacklat/netstacklat.h +++ b/netstacklat/netstacklat.h @@ -77,6 +77,7 @@ struct netstacklat_bpf_config { bool filter_cgroup; bool groupby_ifindex; bool groupby_cgroup; + bool include_hol_blocked; }; #endif
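As a final sanity check of the wrap-around handling the whole series relies
on, the comparison helper can be exercised in isolation. The snippet below
copies u32_lt() from netstacklat.bpf.c into a standalone userspace test; the
typedefs and the test values are added here purely for illustration and are
not part of the patches:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t u32;
typedef int32_t s32;

/* Same comparison as u32_lt() in netstacklat.bpf.c */
static bool u32_lt(u32 a, u32 b)
{
	return (s32)(a - b) < 0;
}

int main(void)
{
	assert(u32_lt(100, 200));             /* plain case */
	assert(u32_lt(0xffffff00u, 0x100u));  /* "before" across the u32 wrap */
	assert(!u32_lt(0x100u, 0xffffff00u)); /* ...but not the other way */
	assert(!u32_lt(200, 200));            /* strictly less-than */
	return 0;
}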