From bd3ff678b733964c689b52ff1b0d2c838edeb8b8 Mon Sep 17 00:00:00 2001 From: Christian Poessinger Date: Thu, 17 Dec 2020 18:30:16 +0100 Subject: [PATCH] xdp: T2666: initial XDP (generic mode) forwarding support The CLI command 'set interfaces ethernet offload-options xdp' enables the XDP generic mode on the given interface. vyos@vyos:~$ show interfaces ethernet eth1 eth1: mtu 1500 xdpgeneric/id:151 qdisc mq state DOWN group default qlen 1000 link/ether 00:50:56:bf:ef:aa brd ff:ff:ff:ff:ff:ff inet6 fe80::250:56ff:febf:efaa/64 scope link tentative valid_lft forever preferred_lft forever Description: fooa XDP code is thankfully copied from [1], thank you for this nice tutorial. NOTE: this is an experimental feature which might break your forwarding/filtering. [1]: https://medium.com/swlh/building-a-xdp-express-data-path-based-peering-router-20db4995da66 --- Makefile | 8 +- debian/rules | 4 + .../interfaces-ethernet.xml.in | 6 + python/vyos/ifconfig/ethernet.py | 21 ++ src/ebpf/.gitignore | 1 + src/ebpf/Makefile | 16 ++ src/ebpf/xdp_drop_ebpf.c | 97 +++++++++ src/ebpf/xdp_router.c | 202 ++++++++++++++++++ 8 files changed, 354 insertions(+), 1 deletion(-) create mode 100644 src/ebpf/.gitignore create mode 100644 src/ebpf/Makefile create mode 100644 src/ebpf/xdp_drop_ebpf.c create mode 100644 src/ebpf/xdp_router.c diff --git a/Makefile b/Makefile index 1ed4634407..8155b231e5 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ OP_TMPL_DIR := templates-op BUILD_DIR := build DATA_DIR := data SHIM_DIR := src/shim +EBPF_DIR := src/ebpf CC := gcc LIBS := -lzmq CFLAGS := @@ -96,8 +97,12 @@ component_versions: $(BUILD_DIR) $(obj) vyshim: $(MAKE) -C $(SHIM_DIR) +.PHONY: vyebpf +vyebpf: + $(MAKE) -C $(EBPF_DIR) + .PHONY: all -all: clean interface_definitions op_mode_definitions component_versions vyshim +all: clean interface_definitions op_mode_definitions component_versions vyshim vyebpf .PHONY: clean clean: @@ -105,6 +110,7 @@ clean: rm -rf $(TMPL_DIR) rm -rf $(OP_TMPL_DIR) 
$(MAKE) -C $(SHIM_DIR) clean + $(MAKE) -C $(EBPF_DIR) clean .PHONY: test test: diff --git a/debian/rules b/debian/rules index a0cc7a99b5..5995723586 100755 --- a/debian/rules +++ b/debian/rules @@ -78,6 +78,10 @@ override_dh_auto_install: mkdir -p $(DIR)/$(VYOS_DATA_DIR) cp -r data/* $(DIR)/$(VYOS_DATA_DIR) + # Install eBPF plugins + mkdir -p $(DIR)/$(VYOS_DATA_DIR)/ebpf + cp -r src/ebpf/*.o $(DIR)/$(VYOS_DATA_DIR)/ebpf + # Install etc configuration files mkdir -p $(DIR)/etc cp -r src/etc/* $(DIR)/etc diff --git a/interface-definitions/interfaces-ethernet.xml.in b/interface-definitions/interfaces-ethernet.xml.in index 0337c629b0..8bd9b70103 100644 --- a/interface-definitions/interfaces-ethernet.xml.in +++ b/interface-definitions/interfaces-ethernet.xml.in @@ -165,6 +165,12 @@ Must be either 'on' or 'off' + + + Enable eXpress Data Path + + + diff --git a/python/vyos/ifconfig/ethernet.py b/python/vyos/ifconfig/ethernet.py index 12d1ec2654..1bc63eec2e 100644 --- a/python/vyos/ifconfig/ethernet.py +++ b/python/vyos/ifconfig/ethernet.py @@ -251,6 +251,23 @@ def set_ufo(self, state): """ return self.set_interface('ufo', state) + def set_xdp(self, enabled): + """ Attach the XDP router program in generic mode (enabled) or detach XDP (not enabled). + Raises ConfigError when the underlying 'ip link' command fails. """ + ifname = self.config['ifname'] + cmd = f'ip link set dev {ifname} xdp off' + if enabled: + # use 'xdpgeneric' for the time being until we can detect supported + # drivers or have a lookup table of whatever kind. 
This then can be + # replaced by xdpdrv + cmd = f'ip -force link set dev {ifname} xdpgeneric obj /usr/share/vyos/ebpf/xdp_router.o' + try: + return self._cmd(cmd) + except Exception: + from vyos import ConfigError + raise ConfigError(f'Error: Could not change XDP state of interface {ifname}.') + + def set_ring_buffer(self, b_type, b_size): """ Example: @@ -306,6 +323,10 @@ def update(self, config): value = tmp if (tmp != None) else 'off' self.set_ufo(value) + # eXpress Data Path (XDP) offloading + tmp = dict_search('offload_options.xdp', config) + self.set_xdp(tmp != None) # enable or disable + # Set physical interface speed and duplex if {'speed', 'duplex'} <= set(config): speed = config.get('speed') diff --git a/src/ebpf/.gitignore b/src/ebpf/.gitignore new file mode 100644 index 0000000000..5761abcfdf --- /dev/null +++ b/src/ebpf/.gitignore @@ -0,0 +1 @@ +*.o diff --git a/src/ebpf/Makefile b/src/ebpf/Makefile new file mode 100644 index 0000000000..5b80c32d71 --- /dev/null +++ b/src/ebpf/Makefile @@ -0,0 +1,16 @@ +# manual build: clang -target bpf -O2 -c xdp_drop_ebpf.c -o xdp_drop_ebpf.o + +src = $(wildcard *.c) +obj = $(src:.c=.o) +CLANG = clang +CFLAGS = -Wall -Wno-unused-value -Wno-pointer-sign -Wno-compare-distinct-pointer-types -Werror -O2 + +%.o: %.c + $(CLANG) -target bpf $(CFLAGS) -o $@ -c $< + +.PHONY: all +all: $(obj) + +.PHONY: clean +clean: + rm -f *.o diff --git a/src/ebpf/xdp_drop_ebpf.c b/src/ebpf/xdp_drop_ebpf.c new file mode 100644 index 0000000000..a08edf58d7 --- /dev/null +++ b/src/ebpf/xdp_drop_ebpf.c @@ -0,0 +1,97 @@ +#include +#include +#include +#include +#include +#include + +#include + +/* IP flags. 
*/ +#define IP_CE 0x8000 /* Flag: "Congestion" */ +#define IP_DF 0x4000 /* Flag: "Don't Fragment" */ +#define IP_MF 0x2000 /* Flag: "More Fragments" */ +#define IP_OFFSET 0x1FFF /* "Fragment Offset" part */ + +#define SEC(NAME) __attribute__((section(NAME), used)) + +#define htons(x) ((__be16)___constant_swab16((x))) +#define htonl(x) ((__be32)___constant_swab32((x))) + +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; +/* XDP filter: passes truncated frames; drops IPv4 packets whose (daddr >> 24) is 0 or 255, IPv4 fragments, and UDP over IPv4/IPv6 with source or destination port 53; passes everything else. */ +SEC("prog") +int xdp_drop(struct xdp_md *ctx) { + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + + uint64_t nh_off = sizeof(*eth); + if (data + nh_off > data_end) { + return XDP_PASS; + } + + uint16_t h_proto = eth->h_proto; + int i; + + /* Handle double VLAN tagged packet. See https://en.wikipedia.org/wiki/IEEE_802.1ad */ + for (i = 0; i < 2; i++) { + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) { + return XDP_PASS; + } + h_proto = vhdr->h_vlan_encapsulated_proto; + } + } + + if (h_proto == htons(ETH_P_IP)) { + struct iphdr *iph = data + nh_off; + struct udphdr *udph = data + nh_off + sizeof(struct iphdr); + + if (udph + 1 > (struct udphdr *)data_end) { + return XDP_PASS; + } + /* read header fields only after the bounds check above has passed */ + uint32_t hostid = iph->daddr >> 24; + if (hostid == 0 || hostid == 255) { + return XDP_DROP; + } + if (iph->frag_off & htons(IP_MF | IP_OFFSET)) { + return XDP_DROP; + } + if (iph->protocol == IPPROTO_UDP) { + uint16_t dport = htons(udph->dest); + uint16_t sport = htons(udph->source); + + if (dport == 53 || sport == 53) { + return XDP_DROP; + } + } + } else if (h_proto == htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h = data + nh_off; + struct udphdr *udph = data + nh_off + sizeof(struct ipv6hdr); + + if (udph + 1 > (struct udphdr *)data_end) { + return XDP_PASS; + } + if (ip6h->nexthdr == IPPROTO_UDP) { + uint16_t dport = 
htons(udph->dest); + uint16_t sport = htons(udph->source); + + if (dport == 53 || sport == 53) { + return XDP_DROP; + } + } + } + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/src/ebpf/xdp_router.c b/src/ebpf/xdp_router.c new file mode 100644 index 0000000000..4fb5c7cb13 --- /dev/null +++ b/src/ebpf/xdp_router.c @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +// Code thankfully copied from: +// https://medium.com/swlh/building-a-xdp-express-data-path-based-peering-router-20db4995da66 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#ifndef XDP_ACTION_MAX +#define XDP_ACTION_MAX (XDP_REDIRECT + 1) +#endif + +#ifndef memcpy +#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n)) +#endif + +#ifndef AF_INET +#define AF_INET 2 +#endif + +#ifndef AF_INET6 +#define AF_INET6 10 +#endif + +#ifndef IPV6_FLOWINFO_MASK +#define IPV6_FLOWINFO_MASK bpf_htonl(0x0FFFFFFF) +#endif + +/* This is the data record stored in the map */ +struct datarec { + __u64 rx_packets; + __u64 rx_bytes; +}; + +/* Keeps stats per (enum) xdp_action */ +struct bpf_map_def SEC("maps") xdp_stats_map = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct datarec), + .max_entries = XDP_ACTION_MAX, +}; +/* Device map handed to bpf_redirect_map(), keyed by the egress ifindex (at most 256 entries) */ +struct bpf_map_def SEC("maps") tx_port = { + .type = BPF_MAP_TYPE_DEVMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 256, +}; + +/* from include/net/ip.h: decrement the TTL, adjust the header checksum to match, return the new TTL */ +static __always_inline int ip_decrease_ttl(struct iphdr *iph) +{ + __u32 check = iph->check; + check += bpf_htons(0x0100); + iph->check = (__u16)(check + (check >= 0xFFFF)); + return --iph->ttl; +} + +static __always_inline +__u32 xdp_stats_record_action(struct xdp_md *ctx, __u32 action) +{ + if (action >= XDP_ACTION_MAX) + return XDP_ABORTED; + + /* Lookup in kernel BPF-side return pointer to actual data record */ + struct datarec *rec = 
bpf_map_lookup_elem(&xdp_stats_map, &action); + if (!rec) + return XDP_ABORTED; + + /* BPF_MAP_TYPE_PERCPU_ARRAY returns a data record specific to current + * CPU and XDP hooks runs under Softirq, which makes it safe to update + * without atomic operations. + */ + rec->rx_packets++; + rec->rx_bytes += (ctx->data_end - ctx->data); + + return action; +} + +/* xdp_router_func is the XDP program entry point (ELF section "prog"): forwards IPv4/IPv6 packets based on a kernel FIB lookup */ +SEC("prog") +int xdp_router_func(struct xdp_md *ctx) +{ + /* packet boundaries taken from the XDP context */ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct bpf_fib_lookup fib_params = {}; + struct ethhdr *eth = data; + struct ipv6hdr *ip6h; + struct iphdr *iph; + __u16 h_proto; + __u64 nh_off; + int rc; + /* default action is to pass */ + int action = XDP_PASS; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) { + action = XDP_DROP; + goto out; + } + + /* determine if this is IPv4 or IPv6 by looking at the Ethernet protocol field */ + h_proto = eth->h_proto; + if (h_proto == bpf_htons(ETH_P_IP)) { + /* IPv4 part of the code */ + iph = data + nh_off; + + if (iph + 1 > data_end) { + action = XDP_DROP; + goto out; + } + /* as a real router, we need to check the TTL to prevent never ending loops; expired packets keep the default XDP_PASS */ + if (iph->ttl <= 1) + goto out; + + /* populate the fib_params fields to prepare for the lookup */ + fib_params.family = AF_INET; + fib_params.tos = iph->tos; + fib_params.l4_protocol = iph->protocol; + fib_params.sport = 0; + fib_params.dport = 0; + fib_params.tot_len = bpf_ntohs(iph->tot_len); + fib_params.ipv4_src = iph->saddr; + fib_params.ipv4_dst = iph->daddr; + } else if (h_proto == bpf_htons(ETH_P_IPV6)) { + /* IPv6 part of the code */ + struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src; + struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst; + + ip6h = data + nh_off; + if (ip6h + 1 > data_end) { + action = XDP_DROP; + goto out; + } + /* as a real router, we need to check the hop limit to prevent never ending 
loops; expired packets keep the default XDP_PASS */ + if (ip6h->hop_limit <= 1) + goto out; + + /* populate the fib_params fields to prepare for the lookup */ + fib_params.family = AF_INET6; + fib_params.flowinfo = *(__be32 *) ip6h & IPV6_FLOWINFO_MASK; + fib_params.l4_protocol = ip6h->nexthdr; + fib_params.sport = 0; + fib_params.dport = 0; + fib_params.tot_len = bpf_ntohs(ip6h->payload_len); + *src = ip6h->saddr; + *dst = ip6h->daddr; + } else { + goto out; + } + + fib_params.ifindex = ctx->ingress_ifindex; + + /* this is where the FIB lookup happens. If the lookup is successful */ + /* it will populate the fib_params.ifindex with the egress interface index */ + + rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), 0); + switch (rc) { + case BPF_FIB_LKUP_RET_SUCCESS: /* lookup successful */ + /* we are a router, so we need to decrease the TTL (IPv4) or hop limit (IPv6) */ + if (h_proto == bpf_htons(ETH_P_IP)) + ip_decrease_ttl(iph); + else if (h_proto == bpf_htons(ETH_P_IPV6)) + ip6h->hop_limit--; + /* set the correct new source and destination mac addresses */ + /* can be found in fib_params.dmac and fib_params.smac */ + memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); + memcpy(eth->h_source, fib_params.smac, ETH_ALEN); + /* redirect through the tx_port devmap, keyed by the egress ifindex from the lookup. NOTE(review): nothing visible in this patch populates tx_port - confirm the loader fills it */ + action = bpf_redirect_map(&tx_port, fib_params.ifindex, 0); + break; + case BPF_FIB_LKUP_RET_BLACKHOLE: /* dest is blackholed; can be dropped */ + case BPF_FIB_LKUP_RET_UNREACHABLE: /* dest is unreachable; can be dropped */ + case BPF_FIB_LKUP_RET_PROHIBIT: /* dest not allowed; can be dropped */ + action = XDP_DROP; + break; + case BPF_FIB_LKUP_RET_NOT_FWDED: /* packet is not forwarded */ + case BPF_FIB_LKUP_RET_FWD_DISABLED: /* fwding is not enabled on ingress */ + case BPF_FIB_LKUP_RET_UNSUPP_LWT: /* fwd requires encapsulation */ + case BPF_FIB_LKUP_RET_NO_NEIGH: /* no neighbor entry for nh */ + case BPF_FIB_LKUP_RET_FRAG_NEEDED: /* fragmentation required to fwd */ + /* PASS: action keeps its XDP_PASS default */ + break; 
+ } + +out: + /* and done, update stats and return action */ + return xdp_stats_record_action(ctx, action); +} + +char _license[] SEC("license") = "GPL";