Skip to content

Commit

Permalink
bpf: Introduce SK_LOOKUP program type with a dedicated attach point
Browse files Browse the repository at this point in the history
Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type
BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer
when looking up a listening socket for a new connection request for
connection oriented protocols, or when looking up an unconnected socket for
a packet for connection-less protocols.

When called, SK_LOOKUP BPF program can select a socket that will receive
the packet. This serves as a mechanism to overcome the limits of what
bind() API allows to express. Two use-cases driving this work are:

 (1) steer packets destined to an IP range, on fixed port to a socket

     192.0.2.0/24, port 80 -> NGINX socket

 (2) steer packets destined to an IP address, on any port to a socket

     198.51.100.1, any port -> L7 proxy socket

In its run-time context program receives information about the packet that
triggered the socket lookup. Namely IP version, L4 protocol identifier, and
address 4-tuple. Context can be further extended to include ingress
interface identifier.

To select a socket BPF program fetches it from a map holding socket
references, like SOCKMAP or SOCKHASH, and calls bpf_sk_assign(ctx, sk, ...)
helper to record the selection. Transport layer then uses the selected
socket as a result of socket lookup.

In its basic form, SK_LOOKUP acts as a filter and hence must return either
SK_PASS or SK_DROP. If the program returns with SK_PASS, transport should
look for a socket to receive the packet, or use the one selected by the
program if available, while SK_DROP informs the transport layer that the
lookup should fail.

This patch only enables the user to attach an SK_LOOKUP program to a
network namespace. Subsequent patches hook it up to run on local delivery
path in ipv4 and ipv6 stacks.

Suggested-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com
  • Loading branch information
jsitnicki authored and Alexei Starovoitov committed Jul 18, 2020
1 parent ce3aa9c commit e9ddbb7
Show file tree
Hide file tree
Showing 10 changed files with 312 additions and 4 deletions.
3 changes: 3 additions & 0 deletions include/linux/bpf-netns.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
enum netns_bpf_attach_type {
NETNS_BPF_INVALID = -1,
NETNS_BPF_FLOW_DISSECTOR = 0,
NETNS_BPF_SK_LOOKUP,
MAX_NETNS_BPF_ATTACH_TYPE
};

Expand All @@ -17,6 +18,8 @@ to_netns_bpf_attach_type(enum bpf_attach_type attach_type)
switch (attach_type) {
case BPF_FLOW_DISSECTOR:
return NETNS_BPF_FLOW_DISSECTOR;
case BPF_SK_LOOKUP:
return NETNS_BPF_SK_LOOKUP;
default:
return NETNS_BPF_INVALID;
}
Expand Down
1 change: 1 addition & 0 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ enum bpf_arg_type {
ARG_PTR_TO_INT, /* pointer to int */
ARG_PTR_TO_LONG, /* pointer to long */
ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */
ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */
ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */
ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */
ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */
Expand Down
2 changes: 2 additions & 0 deletions include/linux/bpf_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
#ifdef CONFIG_INET
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport,
struct sk_reuseport_md, struct sk_reuseport_kern)
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_LOOKUP, sk_lookup,
struct bpf_sk_lookup, struct bpf_sk_lookup_kern)
#endif
#if defined(CONFIG_BPF_JIT)
BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops,
Expand Down
17 changes: 17 additions & 0 deletions include/linux/filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -1278,4 +1278,21 @@ struct bpf_sockopt_kern {
s32 retval;
};

struct bpf_sk_lookup_kern {
u16 family;
u16 protocol;
struct {
__be32 saddr;
__be32 daddr;
} v4;
struct {
const struct in6_addr *saddr;
const struct in6_addr *daddr;
} v6;
__be16 sport;
u16 dport;
struct sock *selected_sk;
bool no_reuseport;
};

#endif /* __LINUX_FILTER_H__ */
77 changes: 77 additions & 0 deletions include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_STRUCT_OPS,
BPF_PROG_TYPE_EXT,
BPF_PROG_TYPE_LSM,
BPF_PROG_TYPE_SK_LOOKUP,
};

enum bpf_attach_type {
Expand Down Expand Up @@ -228,6 +229,7 @@ enum bpf_attach_type {
BPF_XDP_DEVMAP,
BPF_CGROUP_INET_SOCK_RELEASE,
BPF_XDP_CPUMAP,
BPF_SK_LOOKUP,
__MAX_BPF_ATTACH_TYPE
};

Expand Down Expand Up @@ -3069,6 +3071,10 @@ union bpf_attr {
*
* long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
* Description
* Helper is overloaded depending on BPF program type. This
* description applies to **BPF_PROG_TYPE_SCHED_CLS** and
* **BPF_PROG_TYPE_SCHED_ACT** programs.
*
* Assign the *sk* to the *skb*. When combined with appropriate
* routing configuration to receive the packet towards the socket,
* will cause *skb* to be delivered to the specified socket.
Expand All @@ -3094,6 +3100,56 @@ union bpf_attr {
* **-ESOCKTNOSUPPORT** if the socket type is not supported
* (reuseport).
*
* long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
* Description
* Helper is overloaded depending on BPF program type. This
* description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs.
*
* Select the *sk* as a result of a socket lookup.
*
* For the operation to succeed passed socket must be compatible
* with the packet description provided by the *ctx* object.
*
* L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must
* be an exact match. While IP family (**AF_INET** or
* **AF_INET6**) must be compatible, that is IPv6 sockets
* that are not v6-only can be selected for IPv4 packets.
*
* Only TCP listeners and UDP unconnected sockets can be
* selected. *sk* can also be NULL to reset any previous
* selection.
*
* *flags* argument can combination of following values:
*
* * **BPF_SK_LOOKUP_F_REPLACE** to override the previous
* socket selection, potentially done by a BPF program
* that ran before us.
*
* * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip
* load-balancing within reuseport group for the socket
* being selected.
*
* On success *ctx->sk* will point to the selected socket.
*
* Return
* 0 on success, or a negative errno in case of failure.
*
* * **-EAFNOSUPPORT** if socket family (*sk->family*) is
* not compatible with packet family (*ctx->family*).
*
* * **-EEXIST** if socket has been already selected,
* potentially by another program, and
* **BPF_SK_LOOKUP_F_REPLACE** flag was not specified.
*
* * **-EINVAL** if unsupported flags were specified.
*
* * **-EPROTOTYPE** if socket L4 protocol
* (*sk->protocol*) doesn't match packet protocol
* (*ctx->protocol*).
*
* * **-ESOCKTNOSUPPORT** if socket is not in allowed
* state (TCP listening or UDP unconnected).
*
* u64 bpf_ktime_get_boot_ns(void)
* Description
* Return the time elapsed since system boot, in nanoseconds.
Expand Down Expand Up @@ -3607,6 +3663,12 @@ enum {
BPF_RINGBUF_HDR_SZ = 8,
};

/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */
enum {
BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0),
BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1),
};

/* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET,
Expand Down Expand Up @@ -4349,4 +4411,19 @@ struct bpf_pidns_info {
__u32 pid;
__u32 tgid;
};

/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
struct bpf_sk_lookup {
__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */

__u32 family; /* Protocol family (AF_INET, AF_INET6) */
__u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
__u32 remote_ip4; /* Network byte order */
__u32 remote_ip6[4]; /* Network byte order */
__u32 remote_port; /* Network byte order */
__u32 local_ip4; /* Network byte order */
__u32 local_ip6[4]; /* Network byte order */
__u32 local_port; /* Host byte order */
};

#endif /* _UAPI__LINUX_BPF_H__ */
5 changes: 5 additions & 0 deletions kernel/bpf/net_namespace.c
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,8 @@ static int netns_bpf_max_progs(enum netns_bpf_attach_type type)
switch (type) {
case NETNS_BPF_FLOW_DISSECTOR:
return 1;
case NETNS_BPF_SK_LOOKUP:
return 64;
default:
return 0;
}
Expand Down Expand Up @@ -403,6 +405,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
case NETNS_BPF_FLOW_DISSECTOR:
err = flow_dissector_bpf_prog_attach_check(net, link->prog);
break;
case NETNS_BPF_SK_LOOKUP:
err = 0; /* nothing to check */
break;
default:
err = -EINVAL;
break;
Expand Down
9 changes: 9 additions & 0 deletions kernel/bpf/syscall.c
Original file line number Diff line number Diff line change
Expand Up @@ -2022,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
default:
return -EINVAL;
}
case BPF_PROG_TYPE_SK_LOOKUP:
if (expected_attach_type == BPF_SK_LOOKUP)
return 0;
return -EINVAL;
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
return -EINVAL;
Expand Down Expand Up @@ -2756,6 +2760,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_SK_LOOKUP:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
case BPF_PROG_TYPE_CGROUP_SKB:
if (!capable(CAP_NET_ADMIN))
Expand Down Expand Up @@ -2817,6 +2822,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
case BPF_TRACE_ITER:
return BPF_PROG_TYPE_TRACING;
case BPF_SK_LOOKUP:
return BPF_PROG_TYPE_SK_LOOKUP;
default:
return BPF_PROG_TYPE_UNSPEC;
}
Expand Down Expand Up @@ -2953,6 +2960,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
case BPF_FLOW_DISSECTOR:
case BPF_SK_LOOKUP:
return netns_bpf_prog_query(attr, uattr);
default:
return -EINVAL;
Expand Down Expand Up @@ -3891,6 +3899,7 @@ static int link_create(union bpf_attr *attr)
ret = tracing_bpf_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_SK_LOOKUP:
ret = netns_bpf_link_create(attr, prog);
break;
default:
Expand Down
13 changes: 10 additions & 3 deletions kernel/bpf/verifier.c
Original file line number Diff line number Diff line change
Expand Up @@ -3878,10 +3878,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
}
meta->ref_obj_id = reg->ref_obj_id;
}
} else if (arg_type == ARG_PTR_TO_SOCKET) {
} else if (arg_type == ARG_PTR_TO_SOCKET ||
arg_type == ARG_PTR_TO_SOCKET_OR_NULL) {
expected_type = PTR_TO_SOCKET;
if (type != expected_type)
goto err_type;
if (!(register_is_null(reg) &&
arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) {
if (type != expected_type)
goto err_type;
}
} else if (arg_type == ARG_PTR_TO_BTF_ID) {
expected_type = PTR_TO_BTF_ID;
if (type != expected_type)
Expand Down Expand Up @@ -7354,6 +7358,9 @@ static int check_return_code(struct bpf_verifier_env *env)
return -ENOTSUPP;
}
break;
case BPF_PROG_TYPE_SK_LOOKUP:
range = tnum_range(SK_DROP, SK_PASS);
break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
* depends on the to-be-replaced kernel func or bpf program.
Expand Down
Loading

0 comments on commit e9ddbb7

Please sign in to comment.