Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions headers/vmlinux/vmlinux_net.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,35 @@ struct sk_buff {
struct skb_ext *extensions;
};

/*
 * Local declaration of struct tcp_skb_cb: the TCP control block that the
 * TCP_SKB_CB() macro in netstacklat.bpf.c obtains by casting skb->cb.
 * NOTE(review): presumably a trimmed copy of the kernel's definition (this
 * lives in a vmlinux stub header) - confirm against the target kernel.
 */
struct tcp_skb_cb {
/* Start and end sequence numbers; netstacklat reads end_seq */
__u32 seq;
__u32 end_seq;
union {
struct {
u16 tcp_gso_segs;
u16 tcp_gso_size;
};
};
__u8 tcp_flags;
__u8 sacked;
__u8 ip_dsfield;
/* Four bit-fields that together span 8 bits (1 + 1 + 1 + 5) */
__u8 txstamp_ack : 1;
__u8 eor : 1;
__u8 has_rxtstamp : 1;
__u8 unused : 5;
__u32 ack_seq;
union {
/* Only the tx member of this union is declared here */
struct {
__u32 is_app_limited : 1;
__u32 delivered_ce : 20;
__u32 unused : 11;
__u32 delivered;
u64 first_tx_mstamp;
u64 delivered_mstamp;
} tx;
};
};

/* Minimal declaration of struct nf_conn: only the status member is declared. */
struct nf_conn {
unsigned long status;
};
Expand Down Expand Up @@ -202,4 +231,51 @@ struct sock {
u32 sk_rx_dst_cookie;
};

/*
 * Minimal declaration of struct inet_sock: only the embedded struct sock,
 * placed as the first member, is declared.
 */
struct inet_sock {
struct sock sk;
};

/*
 * Minimal declaration of struct inet_connection_sock: only the embedded
 * struct inet_sock, placed as the first member, is declared.
 */
struct inet_connection_sock {
struct inet_sock icsk_inet;
};

/*
 * Local declaration of struct tcp_sock.
 * The nesting inet_conn.icsk_inet.sk is what the tcp_sk() macro in
 * netstacklat.bpf.c relies on to get from a struct sock pointer to the
 * enclosing struct tcp_sock.
 * NOTE(review): presumably a partial copy of the kernel's definition -
 * confirm member set and order against the target kernel.
 */
struct tcp_sock {
struct inet_connection_sock inet_conn;
/*
 * The __cacheline_group_begin__ / __cacheline_group_end__ members are
 * zero-length arrays that delimit named groups of members.
 */
__u8 __cacheline_group_begin__tcp_sock_read_tx[0];
u32 max_window;
u32 rcv_ssthresh;
u32 reordering;
u32 notsent_lowat;
u16 gso_segs;
struct sk_buff *lost_skb_hint;
struct sk_buff *retransmit_skb_hint;
__u8 __cacheline_group_end__tcp_sock_read_tx[0];
__u8 __cacheline_group_begin__tcp_sock_read_txrx[0];
u32 tsoffset;
u32 snd_wnd;
u32 mss_cache;
u32 snd_cwnd;
u32 prr_out;
u32 lost_out;
u32 sacked_out;
u16 tcp_header_len;
u8 scaling_ratio;
u8 chrono_type : 2;
u8 repair : 1;
u8 tcp_usec_ts : 1;
u8 is_sack_reneg : 1;
u8 is_cwnd_limited : 1;
__u8 __cacheline_group_end__tcp_sock_read_txrx[0];
__u8 __cacheline_group_begin__tcp_sock_read_rx[0];
/* copied_seq is the member netstacklat reads (tcp_read_in_ooo_range) */
u32 copied_seq;
u32 rcv_tstamp;
u32 snd_wl1;
u32 tlp_high_seq;
u32 rttvar_us;
u32 retrans_out;
u16 advmss;
u16 urg_data;
u32 lost;
};

#endif /* __VMLINUX_NET_H__ */
111 changes: 111 additions & 0 deletions netstacklat/netstacklat.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@

/* Access x through a volatile-qualified lvalue of the same type */
#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))

// Mimic macros from /include/net/tcp.h
/*
 * tcp_sk(): given a pointer to the struct sock embedded at
 * inet_conn.icsk_inet.sk, yield a pointer to the enclosing struct tcp_sock.
 */
#define tcp_sk(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
/* TCP_SKB_CB(): view the start of the skb's cb[] array as a struct tcp_skb_cb */
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))

/* License string, placed in the "license" section of the object */
char LICENSE[] SEC("license") = "GPL";


Expand All @@ -23,6 +27,7 @@ volatile const struct netstacklat_bpf_config user_config = {
.filter_cgroup = false,
.groupby_ifindex = false,
.groupby_cgroup = false,
.include_hol_blocked = false,
};

/*
Expand All @@ -38,6 +43,13 @@ struct sk_buff___old {
__u8 mono_delivery_time: 1;
} __attribute__((preserve_access_index));

/*
 * Out-of-order (ooo) range state, used as the value type of the
 * netstack_tcp_ooo_range socket-storage map.
 */
struct tcp_sock_ooo_range {
/* Held around every read/update of the members below */
struct bpf_spin_lock lock;
/* Furthest end_seq recorded by tcp_update_ooo_range() (wrap-aware) */
u32 ooo_seq_end;
/* indicates if ooo_seq_end is still valid (as 0 can be valid seq) */
bool active;
};

struct {
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
__uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64);
Expand Down Expand Up @@ -66,6 +78,22 @@ struct {
__type(value, u64);
} netstack_cgroupfilter SEC(".maps");

/*
 * Socket-storage map (BPF_MAP_TYPE_SK_STORAGE) whose values are
 * struct tcp_sock_ooo_range. Entries are created on demand by
 * tcp_update_ooo_range() and looked up by tcp_read_in_ooo_range().
 */
struct {
__uint(type, BPF_MAP_TYPE_SK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct tcp_sock_ooo_range);
} netstack_tcp_ooo_range SEC(".maps");

/*
* Is a < b considering u32 wrap around?
* Based on the before() function in /include/net/tcp.h
*/
static bool u32_lt(u32 a, u32 b)
{
return (s32)(a - b) < 0;
}

static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key)
{
u64 zero = 0;
Expand Down Expand Up @@ -331,6 +359,68 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
record_latency_since(tstamp, &key);
}

static void tcp_update_ooo_range(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock_ooo_range *tp_ooo_range;

tp_ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL,
BPF_SK_STORAGE_GET_F_CREATE);
if (!tp_ooo_range)
return;

bpf_spin_lock(&tp_ooo_range->lock);
if (tp_ooo_range->active) {
if (u32_lt(tp_ooo_range->ooo_seq_end, TCP_SKB_CB(skb)->end_seq))
tp_ooo_range->ooo_seq_end = TCP_SKB_CB(skb)->end_seq;
} else {
tp_ooo_range->ooo_seq_end = TCP_SKB_CB(skb)->end_seq;
tp_ooo_range->active = true;
}
bpf_spin_unlock(&tp_ooo_range->lock);

}

/*
 * Report whether the current read position of @sk (tcp_sock->copied_seq)
 * still lies within the ooo-range recorded for the socket.
 *
 * Returns false when no range has been recorded, when the range is not
 * active, or when copied_seq cannot be read. Once copied_seq has moved past
 * the recorded end, the range is deactivated and false is returned.
 */
static bool tcp_read_in_ooo_range(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_sock_ooo_range *range;
	bool in_range = false;
	u32 copied_seq;
	int err;

	range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL, 0);
	if (!range)
		return false; /* nothing recorded for this socket */

	err = bpf_core_read(&copied_seq, sizeof(copied_seq), &tp->copied_seq);
	if (err) {
		/*
		 * Not expected to happen. Treat the read as outside the
		 * ooo-range, so that a persistent failure does not filter
		 * out every sample.
		 */
		bpf_printk("failed to read tcp_sock->copied_seq: err=%d", err);
		return false;
	}

	bpf_spin_lock(&range->lock);
	if (range->active) {
		if (u32_lt(range->ooo_seq_end, copied_seq))
			range->active = false; /* read past the range: expire it */
		else
			in_range = true;
	}
	bpf_spin_unlock(&range->lock);

	return in_range;
}

SEC("fentry/ip_rcv_core")
int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block,
void *tp, void *res, bool compat_mode)
Expand Down Expand Up @@ -396,6 +486,11 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk,
struct scm_timestamping_internal *tss)
{
struct timespec64 *ts = &tss->ts[0];

/* skip if preceding sock read ended in ooo-range */
if (!user_config.include_hol_blocked && tcp_read_in_ooo_range(sk))
return 0;

record_socket_latency(sk, NULL,
(ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec,
NETSTACKLAT_HOOK_TCP_SOCK_READ);
Expand All @@ -410,3 +505,19 @@ int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb,
NETSTACKLAT_HOOK_UDP_SOCK_READ);
return 0;
}

/*
 * fentry program on tcp_data_queue_ofo(): records the skb in the socket's
 * ooo-range. This program should also be disabled if tcp-socket-read is
 * disabled.
 */
SEC("fentry/tcp_data_queue_ofo")
int BPF_PROG(netstacklat_tcp_data_queue_ofo, struct sock *sk,
	     struct sk_buff *skb)
{
	/*
	 * When HOL-blocked reads are included, ooo-range tracking is not
	 * needed. Ideally this program is then not loaded at all (as done by
	 * netstacklat.c), but with an external loader (like ebpf-exporter)
	 * bailing out early at least minimizes the unnecessary overhead.
	 */
	if (user_config.include_hol_blocked)
		return 0;

	tcp_update_ooo_range(sk, skb);
	return 0;
}
36 changes: 23 additions & 13 deletions netstacklat/netstacklat.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,18 +83,19 @@ struct netstacklat_config {
};

/*
 * Long command-line options, terminated by an all-zero entry.
 * The last field of each entry is the short-option character that the
 * argument parser switches on (e.g. 'y' for --include-tcp-hol-delay).
 * Each option is listed exactly once.
 */
static const struct option long_options[] = {
	{ "help",                  no_argument,       NULL, 'h' },
	{ "report-interval",       required_argument, NULL, 'r' },
	{ "list-probes",           no_argument,       NULL, 'l' },
	{ "enable-probes",         required_argument, NULL, 'e' },
	{ "disable-probes",        required_argument, NULL, 'd' },
	{ "pids",                  required_argument, NULL, 'p' },
	{ "interfaces",            required_argument, NULL, 'i' },
	{ "network-namespace",     required_argument, NULL, 'n' },
	{ "cgroups",               required_argument, NULL, 'c' },
	{ "min-queuelength",       required_argument, NULL, 'q' },
	{ "groupby-interface",     no_argument,       NULL, 'I' },
	{ "groupby-cgroup",        no_argument,       NULL, 'C' },
	{ "include-tcp-hol-delay", no_argument,       NULL, 'y' },
	{ 0, 0, 0, 0 }
};

Expand Down Expand Up @@ -258,7 +259,8 @@ static void hook_to_progs(struct hook_prog_collection *progs,
break;
case NETSTACKLAT_HOOK_TCP_SOCK_READ:
progs->progs[0] = obj->progs.netstacklat_tcp_recv_timestamp;
progs->nprogs = 1;
progs->progs[1] = obj->progs.netstacklat_tcp_data_queue_ofo;
progs->nprogs = 2;
break;
case NETSTACKLAT_HOOK_UDP_SOCK_READ:
progs->progs[0] = obj->progs.netstacklat_skb_consume_udp;
Expand Down Expand Up @@ -564,6 +566,7 @@ static int parse_arguments(int argc, char *argv[],
conf->bpf_conf.filter_cgroup = false;
conf->bpf_conf.groupby_ifindex = false;
conf->bpf_conf.groupby_cgroup = false;
conf->bpf_conf.include_hol_blocked = false;

for (i = 0; i < NETSTACKLAT_N_HOOKS; i++)
// All probes enabled by default
Expand Down Expand Up @@ -658,6 +661,9 @@ static int parse_arguments(int argc, char *argv[],
case 'C': // groupby-cgroup
conf->bpf_conf.groupby_cgroup = true;
break;
case 'y': // include-tcp-hol-delay
conf->bpf_conf.include_hol_blocked = true;
break;
case 'h': // help
print_usage(stdout, argv[0]);
exit(EXIT_SUCCESS);
Expand Down Expand Up @@ -1112,6 +1118,10 @@ static void set_programs_to_load(const struct netstacklat_config *conf,
bpf_program__set_autoload(progs.progs[i],
conf->enabled_hooks[hook]);
}

if (conf->bpf_conf.include_hol_blocked)
bpf_program__set_autoload(
obj->progs.netstacklat_tcp_data_queue_ofo, false);
}

static int set_map_sizes(const struct netstacklat_config *conf,
Expand Down
1 change: 1 addition & 0 deletions netstacklat/netstacklat.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ struct netstacklat_bpf_config {
bool filter_cgroup;
bool groupby_ifindex;
bool groupby_cgroup;
bool include_hol_blocked;
};

#endif