From fa6d07b04912983cf56be89025503fc477aaa565 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 8 Aug 2016 14:17:48 -0400 Subject: [PATCH] tools: import tcplate tcplate computes traffic shaping latency by reading egress packet timestamps with nflog and packet sockets. SO_TIMESTAMPING offers a more complete timestamping solution for processes that own file descriptors. Tcplate is geared at casual latency monitoring by administrators. --- tools/tcplate/Makefile | 16 ++ tools/tcplate/libnflog.c | 311 ++++++++++++++++++++ tools/tcplate/libnflog.h | 27 ++ tools/tcplate/libpsock.c | 186 ++++++++++++ tools/tcplate/libpsock.h | 36 +++ tools/tcplate/tcplate.c | 604 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 1180 insertions(+) create mode 100644 tools/tcplate/Makefile create mode 100644 tools/tcplate/libnflog.c create mode 100644 tools/tcplate/libnflog.h create mode 100644 tools/tcplate/libpsock.c create mode 100644 tools/tcplate/libpsock.h create mode 100644 tools/tcplate/tcplate.c diff --git a/tools/tcplate/Makefile b/tools/tcplate/Makefile new file mode 100644 index 0000000..4d8928c --- /dev/null +++ b/tools/tcplate/Makefile @@ -0,0 +1,16 @@ + +.PHONY: all clean distclean + +all: tcplate + +tcplate: tcplate.o libnflog.o libpsock.o + gcc -static $+ -o $@ + +%.o: %.c + gcc -c -Wall -Werror $< -o $@ + +clean: + -rm -f *.o + +distclean: clean + -rm -f tcplate diff --git a/tools/tcplate/libnflog.c b/tools/tcplate/libnflog.c new file mode 100644 index 0000000..08f808a --- /dev/null +++ b/tools/tcplate/libnflog.c @@ -0,0 +1,311 @@ +/* + * Copyright 2014 Google Inc. All Rights Reserved. + * Author: willemb@google.com (Willem de Bruijn) + * + * Netlink support library + * Geared at NETLINK_NETFILTER + * + * Read netfilter nflog output, for instance: + * `iptables -A OUTPUT -j NFLOG --nflog-group=10` + * + * To timestamp every packet, use the xt_time match: + * `iptables -A OUTPUT \ + * -m time --timestart 00:00 --timestop 23:59 \ + * -j NFLOG --nflog-group=10` + * or even + * `iptables -A OUTPUT -m time -j NFLOG --nflog-group=10` + * + * TODO(willemb): optimize by using mmapped ring. + */ + +#define _GNU_SOURCE +#define _BSD_SOURCE /* for be64toh */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnflog.h" + +static int config_group = 10; /* nfnetlink group to follow */ +static int config_debug_lvl = 0; + +#define IOVLEN 8 +#define PKTLEN (1 << 11) + +static void __nflog_sendcmd(int fd, uint8_t cmd, void *msg, int msglen, + uint16_t family, uint16_t group_id) +{ + static int seq_id; + char buf[1024] __attribute__((aligned)); + struct nlmsghdr *nh; + struct nfgenmsg *ng; + struct nfattr *nfa; + int ret; + + memset(buf, 0, sizeof(buf)); + + nh = (void *) buf; + ng = (void *) buf + sizeof(*nh); + + nh->nlmsg_len = NLMSG_LENGTH(sizeof(*ng)); + nh->nlmsg_type = (NFNL_SUBSYS_ULOG << 8) | NFULNL_MSG_CONFIG; + nh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nh->nlmsg_pid = 0; + nh->nlmsg_seq = ++seq_id; + + ng->nfgen_family = family; + ng->version = NFNETLINK_V0; + ng->res_id = htons(group_id); + + nfa = (void *) buf + NLMSG_ALIGN(nh->nlmsg_len); + nfa->nfa_type = cmd; + nfa->nfa_len = NFA_LENGTH(msglen); + + memcpy(NFA_DATA(nfa), msg, msglen); + + nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + NFA_ALIGN(nfa->nfa_len); + + if (send(fd, buf, nh->nlmsg_len, 0) != nh->nlmsg_len) + error(1, errno, "sendcmd"); + + /* TODO(willemb): handle EINTR */ + ret = recv(fd, buf, sizeof(buf), 0); + if (ret == -1) + error(1, errno, "recv ctrl: sock error"); + if (ret < NLMSG_OK(nh, ret)) + error(1, 0, "recv ctrl: insufficient length"); + if (nh->nlmsg_type != NLMSG_ERROR) + error(1, 0, "recv ctrl: unexpected type"); + ret = *(int *) NLMSG_DATA(nh); + if (ret) + error(1, ret, "recv ctrl: nflog error"); +} + +static void nflog_sendcmd(int fd, uint8_t cmd, uint16_t family, + uint16_t group_id) +{ + struct nfulnl_msg_config_cmd msg; + + memset(&msg, 0, sizeof(msg)); + msg.command = cmd; + __nflog_sendcmd(fd, NFULA_CFG_CMD, &msg, sizeof(msg), family, group_id); +} + +static void nflog_sendcmd_mode(int fd, uint16_t family, uint16_t group_id, + uint8_t mode, uint32_t value) +{ + struct nfulnl_msg_config_mode msg; + + memset(&msg, 0, sizeof(msg)); + msg.copy_mode = mode; + msg.copy_range = htonl(value); + __nflog_sendcmd(fd, NFULA_CFG_MODE, &msg, sizeof(msg), family, group_id); +} + +static void nflog_attach_inet(int fd, unsigned int snaplen) +{ + nflog_sendcmd(fd, NFULNL_CFG_CMD_PF_UNBIND, AF_INET, 0); + /* TODO: recv ack */ + nflog_sendcmd(fd, NFULNL_CFG_CMD_PF_BIND, AF_INET, 0); + /* TODO: recv ack */ + nflog_sendcmd(fd, NFULNL_CFG_CMD_BIND, AF_UNSPEC, config_group); + /* TODO: recv ack */ + + nflog_sendcmd_mode(fd, AF_UNSPEC, config_group, NFULNL_COPY_PACKET, snaplen); + /* TODO: recv ack */ +} + +int nflog_init(unsigned int snaplen) +{ + struct sockaddr_nl nladdr; + int fd, val; + + if (snaplen > PKTLEN) + error(1, 0, "snaplen exceeds pktlen: can cause drops"); + + fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_NETFILTER); + if (fd == -1) + error(1, errno, "socket"); + + val = 1 << 21; + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val))) + error(1, errno, "setsockopt SO_RCVBUF"); + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + nladdr.nl_groups = 1 << config_group; + + if (bind(fd, (void *) &nladdr, sizeof(nladdr))) + error(1, errno, "bind"); + + nflog_attach_inet(fd, snaplen); + return fd; +} + +void nflog_exit(int fd) +{ + if (close(fd)) + error(1, errno, "close"); +} + +void nflog_parse(const void *data, unsigned int len, log_fn fn) +{ + const struct nlmsghdr *nh; + + for (nh = (void *) data; NLMSG_OK(nh, len); nh = NLMSG_NEXT(nh, len)) { + const struct nfulnl_msg_packet_timestamp *nf_ts; + const struct nfgenmsg *ng; + const struct nfattr *attr; + uint64_t ts_sec = 0, ts_usec = 0; + const char *pkt; + int plen = 0; + int alen; + + if (nh->nlmsg_type == NLMSG_ERROR) + error(1, 0, "netlink error"); + if (nh->nlmsg_type == NLMSG_NOOP) + error(1, 0, "netlink noop"); + if (nh->nlmsg_len < sizeof(*nh) || len < nh->nlmsg_len) { + fprintf(stderr, "message truncated\n"); + continue; + } + + ng = NLMSG_DATA(nh); + if (config_debug_lvl) + fprintf(stderr, "P family=%s version=%d group=%d len=%d type=%hu\n", + ng->nfgen_family == AF_INET ? "INET" : "other", + ng->version, + ntohs(ng->res_id), + nh->nlmsg_len, + nh->nlmsg_type); + + attr = NFM_NFA(ng); + alen = nh->nlmsg_len - NLMSG_LENGTH(NLMSG_ALIGN(sizeof(*ng))); + while (NFA_OK(attr, alen)) { + switch (NFA_TYPE(attr)) { + case NFULA_PAYLOAD: + pkt = NFA_DATA(attr); + plen = NFA_PAYLOAD(attr); + break; + case NFULA_TIMESTAMP: + nf_ts = NFA_DATA(attr); + ts_sec = be64toh(nf_ts->sec); + ts_usec = be64toh(nf_ts->usec); + break; + case NFULA_GID: + case NFULA_PACKET_HDR: + case NFULA_PREFIX: + case NFULA_IFINDEX_OUTDEV: + case NFULA_UID: + default: + if (config_debug_lvl) + fprintf(stderr, " attr @%lu other type=%d\n", + ((unsigned long) attr) - (unsigned long) ng, + NFA_TYPE(attr)); + } + attr = NFA_NEXT(attr, alen); + } + + if (nh->nlmsg_type == NLMSG_DONE) + break; + + fn(pkt, plen, ts_sec, ts_usec); + } +} + +int nflog_read(int fd, log_fn fn) +{ + static char data[IOVLEN][PKTLEN]; + struct mmsghdr msgs[IOVLEN]; + struct iovec iovecs[IOVLEN]; + int i, len; + + memset(msgs, 0, sizeof(msgs)); + for (i = 0; i < IOVLEN; i++) { + iovecs[i].iov_base = data[i]; + iovecs[i].iov_len = PKTLEN; + msgs[i].msg_hdr.msg_iov = &iovecs[i]; + msgs[i].msg_hdr.msg_iovlen = 1; + } + + len = recvmmsg(fd, msgs, IOVLEN, MSG_DONTWAIT, NULL); + if (len == -1) { + if (errno == EAGAIN || errno == EINTR) + return 0; + if (errno == ENOBUFS) { + static int report_overflow; + if (!report_overflow) { + report_overflow = 1; + fprintf(stderr, "nflog: socket overflow detected. some packets will be lost (only warning once).\n"); + } + return 0; + } + error(1, errno, "recvmsg"); + } + + if (config_debug_lvl > 1) + fprintf(stderr, "recvmmsg len=%u\n", len); + + for (i = 0; i < len; i++) + nflog_parse(data[i], msgs[i].msg_len, fn); + + return 1; +} + +static int nflog_wait(int fd) +{ + struct pollfd pollfd[2]; + int len; + + do { + memset(&pollfd, 0, sizeof(pollfd)); + + pollfd[0].events = POLLIN; + pollfd[0].fd = 0; + + pollfd[1].events = POLLIN; + pollfd[1].fd = fd; + + len = poll(pollfd, 2, 50); + if (len == -1) { + if (errno == EINTR) + continue; + error(1, errno, "poll"); + } + if (len && pollfd[0].revents) + return 0; + } while (!len); + + return 1; +} + +void nflog_loop(int fd, log_fn fn) +{ + while (nflog_wait(fd)) { + while (nflog_read(fd, fn)) {} + } +} + +void nflog_all(log_fn fn, unsigned int snaplen) +{ + int fd; + + fd = nflog_init(snaplen); + nflog_loop(fd, fn); + nflog_exit(fd); +} + diff --git a/tools/tcplate/libnflog.h b/tools/tcplate/libnflog.h new file mode 100644 index 0000000..96af568 --- /dev/null +++ b/tools/tcplate/libnflog.h @@ -0,0 +1,27 @@ +/* + * Copyright 2014 Google Inc. All Rights Reserved. + * Author: willemb@google.com (Willem de Bruijn) + * + * Netfilter LOG support library + * + * Only reads outgoing packets + */ + +#ifndef _LIBNFLOG_H_ +#define _LIBNFLOG_H_ + +#include + +/* can be called with len 0 */ +typedef void (*log_fn)(const void *pkt, unsigned int len, + uint64_t ts_sec, uint64_t ts_usec); + +void nflog_all(log_fn fn, unsigned int snaplen); + +int nflog_init(unsigned int snaplen); +int nflog_read(int fd, log_fn fn); +void nflog_loop(int fd, log_fn fn); +void nflog_exit(int fd); + +#endif // _LIBNFLOG_H_ + diff --git a/tools/tcplate/libpsock.c b/tools/tcplate/libpsock.c new file mode 100644 index 0000000..d810e3d --- /dev/null +++ b/tools/tcplate/libpsock.c @@ -0,0 +1,186 @@ +/* + * Copyright 2014 Google Inc. All Rights Reserved. + * Author: willemb@google.com (Willem de Bruijn) + * + * Packet socket support library + * + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libpsock.h" + +static void +psock_init_ring(struct psock *ps) +{ + struct tpacket_req tp; + int frames_per_block; + + if (ps->frame_size & (TPACKET_ALIGNMENT - 1)) + error(1, 0, "illegal frame size"); + + tp.tp_frame_size = ps->frame_size; + tp.tp_frame_nr = ps->frame_count; + + frames_per_block = getpagesize() / ps->frame_size; + tp.tp_block_size = getpagesize(); + tp.tp_block_nr = ps->frame_count / frames_per_block; + + if (setsockopt(ps->fd, SOL_PACKET, PACKET_RX_RING, (void*) &tp, sizeof(tp))) + error(1, errno, "setsockopt() ring"); + + ps->ring = mmap(0, tp.tp_block_size * tp.tp_block_nr, + PROT_READ | PROT_WRITE, MAP_SHARED, ps->fd, 0); + if (!ps->ring) + error(1, 0, "mmap()"); +} + +struct sock_filter egress_filter[] = { + { BPF_LD|BPF_B|BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_PKTTYPE }, + { BPF_JMP|BPF_JEQ, 1, 0, PACKET_OUTGOING }, + { BPF_RET, 0, 0, 0x00000000 }, + { BPF_RET, 0, 0, 0x0000ffff } +}; + +struct sock_fprog egress_fprog = { + .len = sizeof(egress_filter) / sizeof(egress_filter[0]), + .filter = egress_filter, +}; + +void +psock_init(struct psock *ps) +{ + int val; + + ps->fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (ps->fd < 0) + error(1, errno, "socket()"); + + val = TPACKET_V2; + if (setsockopt(ps->fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt() version"); + val = 1; + if (setsockopt(ps->fd, SOL_PACKET, PACKET_TIMESTAMP, &val, sizeof(val))) + error(1, errno, "setsockopt() tstamp"); + if (setsockopt(ps->fd, SOL_SOCKET, SO_ATTACH_FILTER, + &egress_fprog, sizeof(egress_fprog))) + error(1, errno, "setsockopt() filter"); + + if (ps->dev) { + struct sockaddr_ll laddr; + + memset(&laddr, 0, sizeof(laddr)); + laddr.sll_family = AF_PACKET; + laddr.sll_protocol = htons(ETH_P_ALL); /* must be on ptype_all to sniff egress */ + laddr.sll_ifindex = if_nametoindex(ps->dev); + if (!laddr.sll_ifindex) + error(1, errno, "no such device: %s", ps->dev); + if (bind(ps->fd, (void *) &laddr, sizeof(laddr))) + error(1, errno, "bind device: %s (%d)", ps->dev, laddr.sll_ifindex); + } + + psock_init_ring(ps); +} + +static int +psock_wait(struct psock *ps) +{ + struct pollfd pollset[2]; + int ret; + + pollset[0].fd = 0; + pollset[0].events = POLLIN; + pollset[0].revents = 0; + + pollset[1].fd = ps->fd; + pollset[1].events = POLLIN; + pollset[1].revents = 0; + + ret = poll(pollset, 2, 100); + if (ret < 0 && errno != EINTR && errno != EAGAIN) + error(1, errno, "poll()"); + + if (ret > 0 && pollset[0].revents) + return 0; + + return 1; +} + +int +psock_read(struct psock *ps, psock_fn fn) +{ + struct tpacket2_hdr *header; + + header = (void *) ps->ring + (ps->idx_reader * ps->frame_size); + + if (!(header->tp_status & TP_STATUS_USER)) + return 0; + if (header->tp_status & TP_STATUS_COPY) + error(1, 0, "detected incomplete packed"); + if (header->tp_status & TP_STATUS_LOSING) { + static int report_overflow; + if (!report_overflow) { + report_overflow = 1; + fprintf(stderr, "psock: socket overflow detected. some packets will be lost (only warning once).\n"); + } + } + + fn(header, ((void *) header) + header->tp_mac); + + header->tp_status = TP_STATUS_KERNEL; + ps->idx_reader = (ps->idx_reader + 1) & (ps->frame_count - 1); + return 1; +} + +void +psock_loop(struct psock *ps, psock_fn fn) +{ + while (psock_wait(ps)) { + while (psock_read(ps, fn)) {} + } +} + +void +psock_exit(struct psock *ps) +{ + if (munmap(ps->ring, ps->frame_count * ps->frame_size)) + error(1, errno, "munmap"); + + if (close(ps->fd)) + error(1, errno, "close"); +} + +void +psock_all(int frame_count, int frame_size, const char *dev, psock_fn fn) +{ + struct psock ps; + + memset(&ps, 0, sizeof(ps)); + + ps.frame_count = frame_count; + ps.frame_size = frame_size; + if (dev) + ps.dev = dev; + + psock_init(&ps); + psock_loop(&ps, fn); + psock_exit(&ps); +} + diff --git a/tools/tcplate/libpsock.h b/tools/tcplate/libpsock.h new file mode 100644 index 0000000..e86dfeb --- /dev/null +++ b/tools/tcplate/libpsock.h @@ -0,0 +1,36 @@ +/* + * Copyright 2014 Google Inc. All Rights Reserved. + * Author: willemb@google.com (Willem de Bruijn) + * + * Packet socket support library + * + * Only reads outgoing packets + */ + +#ifndef _LIBPSOCK_H_ +#define _LIBPSOCK_H_ + +#include + +struct psock { + int frame_size; + int frame_count; + const char *dev; /* (optional) device to bind to */ + + /* internal */ + int fd; + char *ring; + int idx_reader; +}; + +typedef void (*psock_fn)(struct tpacket2_hdr *tp, void *pkt); + +void psock_all(int frame_count, int frame_size, const char *dev, psock_fn fn); + +void psock_init(struct psock *ps); +int psock_read(struct psock *ps, psock_fn fn); +void psock_loop(struct psock *ps, psock_fn fn); +void psock_exit(struct psock *ps); + +#endif // _LIBPSOCK_H_ + diff --git a/tools/tcplate/tcplate.c b/tools/tcplate/tcplate.c new file mode 100644 index 0000000..77927f1 --- /dev/null +++ b/tools/tcplate/tcplate.c @@ -0,0 +1,604 @@ +/* + * Copyright 2014 Google Inc. + * Author: willemb@google.com (Willem de Bruijn) + * + * Measure tcp latency through the kernel using pcap and nflog + * + * Read TCP/IP packets using pcap and nflog and calculate the + * latency spent within traffic shaping by subtracting timestamp + * of the first occurrence (iptables) from the timestamp of the + * second occurrence (packet socket). + * + * It has two modes: + * normal: latency of traffic shaping from protocol layer to dev: + * this subtracts a tstamp in packetsock on dev (eth0) + * from a tstamp in the ip layer at iptables NFLOG + * bonding: latency of traffic shaping on bonding slaves: + * this reads packets on every device, sees the same + * on both master (e.g., bond0) and slaves. + * + * Testing: + * verified correctness by adding delay at the relevant traffic + * shaping layer with + * `tc qdisc add dev $ETH root est 1sec 4sec netem limit 40000 delay 20ms` + * + * Implementation: + * tcplate uses two datastructures: + * - table: a hashtable to store new TCP segments and their timestamp + * - logs: a circular buffer to store tstamp diff on second viewing + * Logs is double buffered to allow sorting results offline. + * + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnflog.h" +#include "libpsock.h" + +static int log_len = 10000; +static int table_len = 57251; /* prime */ +static int ival = 1; +static int frame_count = (1 << 14); +static int frame_size = 128; +static int bond_mode; +static int debug_mode; +static int show_extended; +static int verbose; +static char dev[IFNAMSIZ + 1] = "eth0"; +static uint8_t tos_mask = UCHAR_MAX; +static bool tos_filter = false; + +/* race condition. TODO: protect */ +static uint64_t collisions; +static uint64_t pktcount; +static uint64_t count_nflog; +static uint64_t count_psock; + +/* double buffered list of observations */ +static int64_t *logs[2]; +static int log_selector; +static int log_head; +static int exit_hard; + +struct table_key_full { + __be32 ip_src; + __be32 ip_dst; + __be16 tcp_src; + __be16 tcp_dst; + __be32 seqno; +} __attribute__((packed)); + +union table_key { + struct table_key_full full; + __int128 cmp; +}; + +struct table_elem { + union table_key key; + int64_t tstamp; +}; + +/* not thread safe */ +struct table_elem *table; + +/* Show how many table elements are in use */ +static int +table_scan(void) +{ + int i, used = 0; + + for (i = 0; i < table_len; i++) + if (table[i].key.cmp) + used++; + + return used; +} + +static void +log_record(int64_t val) +{ + /* do not wrap log_head, to discern a partial from full log */ + logs[log_selector][log_head % log_len] = val; + log_head++; +} + +/* switch between double buffered logs, return number of recorded events */ +static int +log_rotate(void) +{ + int old_head; + + log_selector = (log_selector + 1) & 0x1; + old_head = log_head; + log_head = 0; + + return old_head; +} + +/* qsort comparison callback */ +static int +log_compar(const void *_a, const void *_b) +{ + const int64_t *a = _a, *b = _b; + return *a < *b ? -1 : (*a > *b ? 1 : 0); +} + +static void +log_show(void) +{ + int len, matches, selector; + + matches = log_rotate(); + len = matches < log_len ? matches : log_len; + selector = (log_selector + 1) & 0x1; + + qsort(logs[selector], len, sizeof(logs[0][0]), log_compar); + if (len >= 100) { + fprintf(stderr, " %8ld %8ld %8ld", + logs[selector][len / 2], + logs[selector][(len * 9) / 10], + logs[selector][(len * 99) / 100]); + + if (show_extended) + fprintf(stderr, " %10lu %10u %10lu %10d", + pktcount, matches, collisions, + table_scan()); + if (show_extended && verbose > 0) + fprintf(stderr, " %10lu %10lu", + count_nflog, count_psock); + write(2, "\n", 1); + } else { + write(2, ".\n", 2); + } + + collisions = 0; + pktcount = 0; + count_nflog = 0; + count_psock = 0; +} + +/* From "The Practice of Programming" via + * PERL_HASH in Perl 5.005, which is GPLv1 */ +static int hash_compute(void *_key, int klen) +{ + const unsigned int multiplier = 37; + unsigned char *cur, *key = _key; + unsigned int h = 0; + + for (cur = key; cur - key < klen; cur++) + h = (h * multiplier) + *cur; + return h + (h >> 5); +} + +static void +packet_process(__be32 ip_src, __be32 ip_dst, + __be16 tcp_src, __be16 tcp_dst, + __be32 seqno, int64_t tstamp, + int caller_type) +{ + union table_key key; + unsigned int idx; + + key.full.ip_src = ip_src; + key.full.ip_dst = ip_dst; + key.full.tcp_src = tcp_src; + key.full.tcp_dst = tcp_dst; + key.full.seqno = seqno; + + idx = hash_compute(&key, sizeof(key)); + idx %= table_len; + + /* if key is new, insert new tstamp */ + if (!table[idx].key.cmp) { +insert: + table[idx].key.cmp = key.cmp; + table[idx].tstamp = tstamp; + pktcount++; + } + /* if collision, record and insert */ + else if (table[idx].key.cmp != key.cmp) { + collisions++; + goto insert; + } + /* else log the diff and clear the key */ + else { + tstamp = tstamp - table[idx].tstamp; + if (tstamp < 0) + tstamp = -tstamp; + log_record(tstamp); + table[idx].key.cmp = 0; + } + + if (debug_mode) + fprintf(stderr, "%s %u:%hu > %u:%hu seqno=%u time=%lu\n", + caller_type == 0 ? "nflog" : "psock", + ntohl(ip_src), ntohs(tcp_src), + ntohl(ip_dst), ntohs(tcp_dst), + ntohl(seqno), tstamp); +} + +static bool +tos_match(uint8_t tos) +{ + if (!tos_filter) + return true; + + if (tos & tos_mask) + return true; + + if (tos == tos_mask) + return true; + + return false; +} + +static void +packet_callback(struct tpacket2_hdr *tp, void *pkt) +{ + struct ethhdr *eth; + struct iphdr *iph; + struct tcphdr *tcph; + eth = pkt; + + if (eth->h_proto != htons(ETH_P_IP)) + return; + + iph = pkt + sizeof(*eth); + + /* TODO: support IPv6 */ + if (iph->version != 4) + error(1, 0, "bug in parsing ip header"); + + if (iph->protocol != IPPROTO_TCP) + return; + if (!tos_match(iph->tos)) + return; + + tcph = ((void *) iph) + (iph->ihl << 2); + packet_process(iph->saddr, iph->daddr, + tcph->source, tcph->dest, + tcph->seq, + (1000LL * 1000 * tp->tp_sec) + tp->tp_nsec / 1000, + 1); + + count_psock++; +} + +static void +nflog_callback(const void *data, unsigned int len, + uint64_t ts_sec, uint64_t ts_usec) +{ + const struct iphdr *iph = data; + const struct tcphdr *tcph; + + if (!len) + return; + + if (iph->version != 4) + error(1, 0, "bug in parsing ip header"); + if ((iph->ihl << 2) + sizeof(*tcph) > len) + error(1, 0, "nflog snaplen too small"); + + if (iph->protocol != IPPROTO_TCP) + return; + if (!tos_match(iph->tos)) + return; + + tcph = ((void *) iph) + (iph->ihl << 2); + packet_process(iph->saddr, iph->daddr, + tcph->source, tcph->dest, + tcph->seq, + (1000LL * 1000 * ts_sec) + ts_usec, + 0); + + count_nflog++; +} + +static void +sigalrm_handler(int signum) +{ + log_show(); + alarm(ival); +} + +static void +sigint_handler(int signum) +{ + + if (exit_hard) + exit(1); + + /* first try to exit gracefully based in EINTR in poll */ + exit_hard = 1; +} + +static void +__init(void) +{ + logs[0] = malloc(log_len * sizeof(logs[0][0])); + logs[1] = malloc(log_len * sizeof(logs[0][0])); + table = calloc(table_len, sizeof(struct table_elem)); + if (!logs[0] || !logs[1] || !table) + error(1, 0, "alloc"); +} + +static void +__exit(void) +{ + free(table); + free(logs[1]); + free(logs[0]); +} + +static void __attribute__((noreturn)) +usage(const char *filepath) +{ + fprintf(stderr, "usage: %s [-bdfFhqvx] [-c count] [-i iface] [-l loglen] [-L tbllen] [-t ival]\n" + "\n" + "where\n" + " -b sets bonded mode: latency in slave device tc\n" + " -c sets capture queue length (in packets)\n" + " -d debug mode, displays individual records\n" + " -f filter by TOS bits (pass as base 10 or 16)\n" + " -h to show this message and exits\n" + " -i interface (default: eth0)\n" + " -l sets the timestamp log length\n" + " -L sets the tcp segment hashtable length\n" + " -q quiet: suppresses more output\n" + " -t sets the display interval (secs)\n" + " -v sets the verbose option\n" + " -x show extended stats: #matched, collisions, ..\n", + filepath); + exit(1); +} + +static void +parse_opt(int argc, char **argv) +{ + int c; + + while ((c = getopt (argc, argv, "bc:df:hi:l:L:qt:vx")) != -1) + { + switch (c) { + case 'b': + bond_mode = 1; + break; + case 'c': + frame_count = strtoul(optarg, NULL, 10); + break; + case 'd': + debug_mode = 1; + break; + case 'f': + tos_mask = strtoul(optarg, NULL, 0); + tos_filter = true; + break; + case 'h': + usage(argv[0]); + break; + case 'i': + strncpy(dev, optarg, IFNAMSIZ); + break; + case 'l': + log_len = strtoul(optarg, NULL, 10); + break; + case 'L': + table_len = strtoul(optarg, NULL, 10); + break; + case 'q': + if (verbose > 0) + error(1, 0, "pass -q or -v"); + verbose = -1; + break; + case 't': + ival = strtoul(optarg, NULL, 10); + break; + case 'v': + if (verbose < 0) + error(1, 0, "pass -q or -v"); + verbose = 1; + break; + case 'x': + show_extended = 1; + break; + } + } + + if (verbose > 0) { + fprintf(stderr, "mode: %s\n", bond_mode ? "bond" : dev); + fprintf(stderr, "log_len: %u\n", log_len); + fprintf(stderr, "table_len: %u\n", table_len); + fprintf(stderr, "frame_count: %u\n", frame_count); + fprintf(stderr, "frame_size: %u\n", frame_size); + fprintf(stderr, "interval: %u\n", ival); + if (tos_filter) + fprintf(stderr, "tos mask: 0x%x\n", tos_mask); + } +} + +/* @return 1 if data ready, 0 to exit */ +static int do_wait(int fd1, int fd2) +{ + struct pollfd pollset[3]; + int ret; + + pollset[0].fd = 0; + pollset[0].events = POLLIN; + pollset[0].revents = 0; + + pollset[1].fd = fd1; + pollset[1].events = POLLIN; + pollset[1].revents = 0; + + pollset[2].fd = fd2; + pollset[2].events = POLLIN; + pollset[2].revents = 0; + + /* minor race with entering poll(), below */ + if (exit_hard) + return 0; + + ret = poll(pollset, fd2 >= 0 ? 3 : 2, 100); + if (ret < 0 && errno != EINTR) + error(1, errno, "poll()"); + + if (ret > 0 && pollset[0].revents) + return 0; + + return 1; +} + +#define IPT_RULE " -m time -j NFLOG --nflog-group=10 --nflog-threshold=1" +static void __exit_nflog(void) +{ + if (verbose > 0) + system("iptables -v -nL OUTPUT | grep NFLOG"); + if (system("iptables -D OUTPUT " IPT_RULE)) { + error(1, 0, "error while removing log module"); + } +} + +/* + * System configuration change: insert an iptables rule. + * Ensure rollback with atexit() (though this fails with SIGINT, ..) + */ +static void __init_nflog(void) +{ + int ret; + + ret = system("iptables -L OUTPUT | grep -q NFLOG"); + if (ret == -1) + error(1, 0, "read iptables"); + if (WEXITSTATUS(ret) == 0) + error(1, 0, "log module still loaded? try iptables -L"); + + if (system("iptables -A OUTPUT" IPT_RULE)) { + __exit_nflog(); + error(1, 0, "load log module"); + } + atexit(__exit_nflog); +} + +static void __main(void) +{ + struct psock ps; + int logfd; + + memset(&ps, 0, sizeof(ps)); + ps.frame_count = frame_count; + ps.frame_size = frame_size; + /* + * in normal mode, get timestamp at ip layer and eth0 dequeue + * in bond mode, get timestamp at bond0 and eth0 dequeue. + * + * filter psock on eth0 if calculating latency from ip to eth0. + * else, do not filter to read packet on both master and slave, + * but disable nflog. + */ + if (bond_mode) { + logfd = -1; + } else { + /* + * snaplen must be smaller than PKTLEN in nflog_read + * or packets that are > PKTLEN && <= snaplen are dropped + */ + const int snaplen = 60; + + logfd = nflog_init(snaplen); + ps.dev = dev; + } + + psock_init(&ps); + + while (do_wait(ps.fd, logfd)) { + if (logfd != -1) + while (nflog_read(logfd, nflog_callback)) {} + while (psock_read(&ps, packet_callback)) {} + } + + psock_exit(&ps); + if (logfd != -1) + nflog_exit(logfd); +} + +static void +print_header(void) +{ +#define MAIN_HEADER "latency: 50 90 99 (%% us)" +#define EXTRA_HEADER " #total #matches #collis. #tblkeys" + + fprintf(stderr, "\npress Enter to exit\n" + "\n. indicates insufficient data\n" + "\n"); + + if (show_extended) + fprintf(stderr, MAIN_HEADER EXTRA_HEADER "\n"); + else + fprintf(stderr, MAIN_HEADER "\n"); +} + +int +main(int argc, char **argv) +{ + if (verbose >= 0) + fprintf(stderr, "tcplate v1.2: measure traffic shaping TCP latency\n"); + + parse_opt(argc, argv); + + if (verbose >= 0) + print_header(); + + __init(); + + signal(SIGALRM, &sigalrm_handler); + signal(SIGINT, &sigint_handler); + + alarm(ival); + + __init_nflog(); + __main(); + __exit(); + + return 0; +} +