diff --git a/headers/vmlinux/vmlinux_net.h b/headers/vmlinux/vmlinux_net.h
index b0f6476e..3f414089 100644
--- a/headers/vmlinux/vmlinux_net.h
+++ b/headers/vmlinux/vmlinux_net.h
@@ -161,6 +161,35 @@ struct sk_buff {
 	struct skb_ext *extensions;
 };
 
+struct tcp_skb_cb {
+	__u32 seq;
+	__u32 end_seq;
+	union {
+		struct {
+			u16 tcp_gso_segs;
+			u16 tcp_gso_size;
+		};
+	};
+	__u8 tcp_flags;
+	__u8 sacked;
+	__u8 ip_dsfield;
+	__u8 txstamp_ack : 1;
+	__u8 eor : 1;
+	__u8 has_rxtstamp : 1;
+	__u8 unused : 5;
+	__u32 ack_seq;
+	union {
+		struct {
+			__u32 is_app_limited : 1;
+			__u32 delivered_ce : 20;
+			__u32 unused : 11;
+			__u32 delivered;
+			u64 first_tx_mstamp;
+			u64 delivered_mstamp;
+		} tx;
+	};
+};
+
 struct nf_conn {
 	unsigned long status;
 };
@@ -202,4 +231,51 @@ struct sock {
 	u32 sk_rx_dst_cookie;
 };
 
+struct inet_sock {
+	struct sock sk;
+};
+
+struct inet_connection_sock {
+	struct inet_sock icsk_inet;
+};
+
+struct tcp_sock {
+	struct inet_connection_sock inet_conn;
+	__u8 __cacheline_group_begin__tcp_sock_read_tx[0];
+	u32 max_window;
+	u32 rcv_ssthresh;
+	u32 reordering;
+	u32 notsent_lowat;
+	u16 gso_segs;
+	struct sk_buff *lost_skb_hint;
+	struct sk_buff *retransmit_skb_hint;
+	__u8 __cacheline_group_end__tcp_sock_read_tx[0];
+	__u8 __cacheline_group_begin__tcp_sock_read_txrx[0];
+	u32 tsoffset;
+	u32 snd_wnd;
+	u32 mss_cache;
+	u32 snd_cwnd;
+	u32 prr_out;
+	u32 lost_out;
+	u32 sacked_out;
+	u16 tcp_header_len;
+	u8 scaling_ratio;
+	u8 chrono_type : 2;
+	u8 repair : 1;
+	u8 tcp_usec_ts : 1;
+	u8 is_sack_reneg : 1;
+	u8 is_cwnd_limited : 1;
+	__u8 __cacheline_group_end__tcp_sock_read_txrx[0];
+	__u8 __cacheline_group_begin__tcp_sock_read_rx[0];
+	u32 copied_seq;
+	u32 rcv_tstamp;
+	u32 snd_wl1;
+	u32 tlp_high_seq;
+	u32 rttvar_us;
+	u32 retrans_out;
+	u16 advmss;
+	u16 urg_data;
+	u32 lost;
+};
+
 #endif /* __VMLINUX_NET_H__ */
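Note on the definitions above: these structs are intentionally trimmed mirrors of the kernel's (the real tcp_sock is far larger). Because the headers are compiled with preserve_access_index, libbpf's CO-RE machinery relocates every field access against the running kernel's BTF at load time, so only the fields the BPF programs actually touch need to be declared, and the offsets in this header never have to match the kernel's layout. A standalone sketch of that access pattern; the "vmlinux.h" include name and the read_copied_seq() helper are illustrative assumptions, not part of this patch:

	/* Illustrative only: a CO-RE read through a trimmed struct
	 * definition. "vmlinux.h" stands in for this project's split
	 * vmlinux headers. */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_core_read.h>

	static __always_inline u32 read_copied_seq(struct tcp_sock *tp)
	{
		u32 copied_seq;

		/* the offset of copied_seq is relocated from kernel BTF,
		 * not taken from the trimmed definition in the header */
		if (bpf_core_read(&copied_seq, sizeof(copied_seq),
				  &tp->copied_seq))
			return 0; /* read failed; caller treats as "none" */
		return copied_seq;
	}
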
diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 1a1b0afe..e8a3494f 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -11,6 +11,10 @@
 
 #define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
 
+// Mimic macros from include/net/tcp.h
+#define tcp_sk(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
+#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
+
 
 char LICENSE[] SEC("license") = "GPL";
 
@@ -23,6 +27,7 @@ volatile const struct netstacklat_bpf_config user_config = {
 	.filter_cgroup = false,
 	.groupby_ifindex = false,
 	.groupby_cgroup = false,
+	.include_hol_blocked = false,
 };
 
 /*
@@ -38,6 +43,13 @@
 	__u8 mono_delivery_time: 1;
 } __attribute__((preserve_access_index));
 
+struct tcp_sock_ooo_range {
+	struct bpf_spin_lock lock;
+	u32 ooo_seq_end;
+	/* indicates if ooo_seq_end is still valid (as 0 can be a valid seq) */
+	bool active;
+};
+
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
 	__uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64);
@@ -66,6 +78,22 @@ struct {
 	__type(value, u64);
 } netstack_cgroupfilter SEC(".maps");
 
+struct {
+	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct tcp_sock_ooo_range);
+} netstack_tcp_ooo_range SEC(".maps");
+
+/*
+ * Is a < b, accounting for u32 wraparound?
+ * Based on the before() function in include/net/tcp.h
+ */
+static bool u32_lt(u32 a, u32 b)
+{
+	return (s32)(a - b) < 0;
+}
+
 static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key)
 {
 	u64 zero = 0;
@@ -331,6 +359,68 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
 	record_latency_since(tstamp, &key);
 }
 
+static void tcp_update_ooo_range(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock_ooo_range *tp_ooo_range;
+
+	tp_ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL,
+					  BPF_SK_STORAGE_GET_F_CREATE);
+	if (!tp_ooo_range)
+		return;
+
+	bpf_spin_lock(&tp_ooo_range->lock);
+	if (tp_ooo_range->active) {
+		if (u32_lt(tp_ooo_range->ooo_seq_end, TCP_SKB_CB(skb)->end_seq))
+			tp_ooo_range->ooo_seq_end = TCP_SKB_CB(skb)->end_seq;
+	} else {
+		tp_ooo_range->ooo_seq_end = TCP_SKB_CB(skb)->end_seq;
+		tp_ooo_range->active = true;
+	}
+	bpf_spin_unlock(&tp_ooo_range->lock);
+
+}
+
+static bool tcp_read_in_ooo_range(struct sock *sk)
+{
+	struct tcp_sock_ooo_range *tp_ooo_range;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 last_read_seq;
+	bool ret;
+	int err;
+
+	tp_ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL, 0);
+	if (!tp_ooo_range)
+		/* no recorded ooo-range for sock, so cannot be in ooo-range */
+		return false;
+
+	err = bpf_core_read(&last_read_seq, sizeof(last_read_seq), &tp->copied_seq);
+	if (err) {
+		/*
+		 * Shouldn't happen.
+		 * Should probably emit some warning if reading copied_seq
+		 * unexpectedly fails. Assume not in ooo-range to avoid
+		 * systematically filtering out ALL values if this does happen.
+		 */
+		bpf_printk("failed to read tcp_sock->copied_seq: err=%d", err);
+		return false;
+	}
+
+	bpf_spin_lock(&tp_ooo_range->lock);
+	if (!tp_ooo_range->active) {
+		ret = false;
+	} else {
+		if (u32_lt(tp_ooo_range->ooo_seq_end, last_read_seq)) {
+			tp_ooo_range->active = false;
+			ret = false;
+		} else {
+			ret = true;
+		}
+	}
+
+	bpf_spin_unlock(&tp_ooo_range->lock);
+	return ret;
+}
+
 SEC("fentry/ip_rcv_core")
 int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block,
 	     void *tp, void *res, bool compat_mode)
@@ -396,6 +486,11 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk,
 	       struct scm_timestamping_internal *tss)
 {
 	struct timespec64 *ts = &tss->ts[0];
+
+	/* skip if preceding sock read ended in ooo-range */
+	if (!user_config.include_hol_blocked && tcp_read_in_ooo_range(sk))
+		return 0;
+
 	record_socket_latency(sk, NULL,
 			      (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec,
 			      NETSTACKLAT_HOOK_TCP_SOCK_READ);
@@ -410,3 +505,19 @@ int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb,
 			      NETSTACKLAT_HOOK_UDP_SOCK_READ);
 	return 0;
 }
+
+/* This program should also be disabled if tcp-socket-read is disabled */
+SEC("fentry/tcp_data_queue_ofo")
+int BPF_PROG(netstacklat_tcp_data_queue_ofo, struct sock *sk,
+	     struct sk_buff *skb)
+{
+	if (!user_config.include_hol_blocked)
+		/*
+		 * It's better to not load this program at all if the
+		 * ooo-range tracking isn't needed (as netstacklat.c does).
+		 * But if an external loader (like ebpf-exporter) is used,
+		 * this should at least minimize the unnecessary overhead.
+		 */
+		tcp_update_ooo_range(sk, skb);
+	return 0;
+}
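The u32_lt() helper above inherits the kernel's before() trick: computing a - b in u32 arithmetic and testing the sign of the result gives the right ordering across sequence-number wraparound, provided the two values are within 2^31 of each other (which holds for live TCP sequence space). A hypothetical userspace sanity check of those semantics, not part of the patch:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* same body as the BPF-side u32_lt() */
	static int u32_lt(uint32_t a, uint32_t b)
	{
		return (int32_t)(a - b) < 0;
	}

	int main(void)
	{
		assert(u32_lt(1, 2));			/* plain ordering */
		assert(!u32_lt(2, 1));
		assert(!u32_lt(5, 5));			/* not strictly less */
		assert(u32_lt(0xfffffff0u, 0x10u));	/* 0x10 lies after the wrap */
		assert(!u32_lt(0x10u, 0xfffffff0u));
		puts("u32_lt wraparound semantics OK");
		return 0;
	}
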
diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c
index 70dd4111..475c14e8 100644
--- a/netstacklat/netstacklat.c
+++ b/netstacklat/netstacklat.c
@@ -83,18 +83,19 @@ struct netstacklat_config {
 };
 
 static const struct option long_options[] = {
-	{ "help",              no_argument,       NULL, 'h' },
-	{ "report-interval",   required_argument, NULL, 'r' },
-	{ "list-probes",       no_argument,       NULL, 'l' },
-	{ "enable-probes",     required_argument, NULL, 'e' },
-	{ "disable-probes",    required_argument, NULL, 'd' },
-	{ "pids",              required_argument, NULL, 'p' },
-	{ "interfaces",        required_argument, NULL, 'i' },
-	{ "network-namespace", required_argument, NULL, 'n' },
-	{ "cgroups",           required_argument, NULL, 'c' },
-	{ "min-queuelength",   required_argument, NULL, 'q' },
-	{ "groupby-interface", no_argument,       NULL, 'I' },
-	{ "groupby-cgroup",    no_argument,       NULL, 'C' },
+	{ "help",                  no_argument,       NULL, 'h' },
+	{ "report-interval",       required_argument, NULL, 'r' },
+	{ "list-probes",           no_argument,       NULL, 'l' },
+	{ "enable-probes",         required_argument, NULL, 'e' },
+	{ "disable-probes",        required_argument, NULL, 'd' },
+	{ "pids",                  required_argument, NULL, 'p' },
+	{ "interfaces",            required_argument, NULL, 'i' },
+	{ "network-namespace",     required_argument, NULL, 'n' },
+	{ "cgroups",               required_argument, NULL, 'c' },
+	{ "min-queuelength",       required_argument, NULL, 'q' },
+	{ "groupby-interface",     no_argument,       NULL, 'I' },
+	{ "groupby-cgroup",        no_argument,       NULL, 'C' },
+	{ "include-tcp-hol-delay", no_argument,       NULL, 'y' },
 	{ 0, 0, 0, 0 }
 };
 
@@ -258,7 +259,8 @@ static void hook_to_progs(struct hook_prog_collection *progs,
 		break;
 	case NETSTACKLAT_HOOK_TCP_SOCK_READ:
 		progs->progs[0] = obj->progs.netstacklat_tcp_recv_timestamp;
-		progs->nprogs = 1;
+		progs->progs[1] = obj->progs.netstacklat_tcp_data_queue_ofo;
+		progs->nprogs = 2;
 		break;
 	case NETSTACKLAT_HOOK_UDP_SOCK_READ:
 		progs->progs[0] = obj->progs.netstacklat_skb_consume_udp;
@@ -564,6 +566,7 @@ static int parse_arguments(int argc, char *argv[],
 	conf->bpf_conf.filter_cgroup = false;
 	conf->bpf_conf.groupby_ifindex = false;
 	conf->bpf_conf.groupby_cgroup = false;
+	conf->bpf_conf.include_hol_blocked = false;
 
 	for (i = 0; i < NETSTACKLAT_N_HOOKS; i++)
 		// All probes enabled by default
@@ -658,6 +661,9 @@ static int parse_arguments(int argc, char *argv[],
 		case 'C': // groupby-cgroup
 			conf->bpf_conf.groupby_cgroup = true;
 			break;
+		case 'y': // include-tcp-hol-delay
+			conf->bpf_conf.include_hol_blocked = true;
+			break;
 		case 'h': // help
 			print_usage(stdout, argv[0]);
 			exit(EXIT_SUCCESS);
@@ -1112,6 +1118,10 @@ static void set_programs_to_load(const struct netstacklat_config *conf,
 		bpf_program__set_autoload(progs.progs[i],
 					  conf->enabled_hooks[hook]);
 	}
+
+	if (conf->bpf_conf.include_hol_blocked)
+		bpf_program__set_autoload(
+			obj->progs.netstacklat_tcp_data_queue_ofo, false);
 }
 
 static int set_map_sizes(const struct netstacklat_config *conf,
diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h
index d0da8553..d1708ce4 100644
--- a/netstacklat/netstacklat.h
+++ b/netstacklat/netstacklat.h
@@ -77,6 +77,7 @@ struct netstacklat_bpf_config {
 	bool filter_cgroup;
 	bool groupby_ifindex;
 	bool groupby_cgroup;
+	bool include_hol_blocked;
 };
 
 #endif
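How the pieces above are meant to be driven: the read-only user_config must be set before the BPF object is loaded, and loaders that include head-of-line-blocked samples can skip loading netstacklat_tcp_data_queue_ofo entirely, as set_programs_to_load() does. A rough sketch for an external loader; the netstacklat.bpf.skel.h header and netstacklat_bpf type are assumptions about the generated skeleton's naming, not something this patch defines:

	#include <stdbool.h>
	#include <bpf/libbpf.h>
	#include "netstacklat.bpf.skel.h" /* assumed bpftool-generated name */

	int load_netstacklat(bool include_hol)
	{
		struct netstacklat_bpf *obj = netstacklat_bpf__open();

		if (!obj)
			return -1;

		/* volatile const config lives in .rodata: set before load */
		obj->rodata->user_config.include_hol_blocked = include_hol;

		/* ooo-range tracking is dead weight when HOL samples are kept */
		if (include_hol)
			bpf_program__set_autoload(
				obj->progs.netstacklat_tcp_data_queue_ofo, false);

		if (netstacklat_bpf__load(obj) || netstacklat_bpf__attach(obj)) {
			netstacklat_bpf__destroy(obj);
			return -1;
		}
		return 0;
	}

With the bundled CLI, the equivalent is simply running netstacklat with the new --include-tcp-hol-delay flag.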