Skip to content

Commit

Permalink
bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE
Browse files Browse the repository at this point in the history
Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
call in do_tcp_getsockopt using the on-stack data. This removes
3% overhead for locking/unlocking the socket.

Without this patch:
     3.38%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
            |
             --3.30%--__cgroup_bpf_run_filter_getsockopt
                       |
                        --0.81%--__kmalloc

With the patch applied:
     0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern

Note, exporting uapi/tcp.h requires removing netinet/tcp.h
from test_progs.h because those headers have confliciting
definitions.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210115163501.805133-2-sdf@google.com
  • Loading branch information
fomichev authored and Alexei Starovoitov committed Jan 20, 2021
1 parent 13ca51d commit 9cacf81
Show file tree
Hide file tree
Showing 16 changed files with 506 additions and 7 deletions.
27 changes: 23 additions & 4 deletions include/linux/bpf-cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
int __user *optlen, int max_optlen,
int retval);

int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
int optname, void *optval,
int *optlen, int retval);

static inline enum bpf_cgroup_storage_type cgroup_storage_type(
struct bpf_map *map)
{
Expand Down Expand Up @@ -364,10 +368,23 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
({ \
int __ret = retval; \
if (cgroup_bpf_enabled) \
__ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \
optname, optval, \
optlen, max_optlen, \
retval); \
if (!(sock)->sk_prot->bpf_bypass_getsockopt || \
!INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \
tcp_bpf_bypass_getsockopt, \
level, optname)) \
__ret = __cgroup_bpf_run_filter_getsockopt( \
sock, level, optname, optval, optlen, \
max_optlen, retval); \
__ret; \
})

#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
optlen, retval) \
({ \
int __ret = retval; \
if (cgroup_bpf_enabled) \
__ret = __cgroup_bpf_run_filter_getsockopt_kern( \
sock, level, optname, optval, optlen, retval); \
__ret; \
})

Expand Down Expand Up @@ -452,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
optlen, max_optlen, retval) ({ retval; })
#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
optlen, retval) ({ retval; })
#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
kernel_optval) ({ 0; })

Expand Down
6 changes: 6 additions & 0 deletions include/linux/indirect_call_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,10 @@
#define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__)
#endif

#if IS_ENABLED(CONFIG_INET)
#define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
#else
#define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__)
#endif

#endif
2 changes: 2 additions & 0 deletions include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -1174,6 +1174,8 @@ struct proto {

int (*backlog_rcv) (struct sock *sk,
struct sk_buff *skb);
bool (*bpf_bypass_getsockopt)(int level,
int optname);

void (*release_cb)(struct sock *sk);

Expand Down
1 change: 1 addition & 0 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int tcp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
bool tcp_bpf_bypass_getsockopt(int level, int optname);
int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
void tcp_set_keepalive(struct sock *sk, int val);
Expand Down
46 changes: 46 additions & 0 deletions kernel/bpf/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -1486,6 +1486,52 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
sockopt_free_buf(&ctx);
return ret;
}

int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
int optname, void *optval,
int *optlen, int retval)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
struct bpf_sockopt_kern ctx = {
.sk = sk,
.level = level,
.optname = optname,
.retval = retval,
.optlen = *optlen,
.optval = optval,
.optval_end = optval + *optlen,
};
int ret;

/* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
* user data back into BPF buffer when reval != 0. This is
* done as an optimization to avoid extra copy, assuming
* kernel won't populate the data in case of an error.
* Here we always pass the data and memset() should
* be called if that data shouldn't be "exported".
*/

ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
&ctx, BPF_PROG_RUN);
if (!ret)
return -EPERM;

if (ctx.optlen > *optlen)
return -EFAULT;

/* BPF programs only allowed to set retval to 0, not some
* arbitrary value.
*/
if (ctx.retval != 0 && ctx.retval != retval)
return -EFAULT;

/* BPF programs can shrink the buffer, export the modifications.
*/
if (ctx.optlen != 0)
*optlen = ctx.optlen;

return ctx.retval;
}
#endif

static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
Expand Down
14 changes: 14 additions & 0 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -4099,6 +4099,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
lock_sock(sk);
err = tcp_zerocopy_receive(sk, &zc);
err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
&zc, &len, err);
release_sock(sk);
if (len >= offsetofend(struct tcp_zerocopy_receive, err))
goto zerocopy_rcv_sk_err;
Expand Down Expand Up @@ -4133,6 +4135,18 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return 0;
}

bool tcp_bpf_bypass_getsockopt(int level, int optname)
{
/* TCP do_tcp_getsockopt has optimized getsockopt implementation
* to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
*/
if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
return true;

return false;
}
EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);

int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
int __user *optlen)
{
Expand Down
1 change: 1 addition & 0 deletions net/ipv4/tcp_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -2793,6 +2793,7 @@ struct proto tcp_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
Expand Down
1 change: 1 addition & 0 deletions net/ipv6/tcp_ipv6.c
Original file line number Diff line number Diff line change
Expand Up @@ -2121,6 +2121,7 @@ struct proto tcpv6_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
Expand Down
3 changes: 3 additions & 0 deletions net/socket.c
Original file line number Diff line number Diff line change
Expand Up @@ -2126,6 +2126,9 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
return __sys_setsockopt(fd, level, optname, optval, optlen);
}

INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
int optname));

/*
* Get a socket option. Because we don't know the option lengths we have
* to pass a user mode parameter for the protocols to sort out.
Expand Down

0 comments on commit 9cacf81

Please sign in to comment.