bpf, sockmap: Improved check for empty queue
[ Upstream commit 405df89 ]

We noticed some rare sk_buffs were stepping past the queue when the
system was under memory pressure. The general theory is to skip
enqueueing sk_buffs when it is not necessary, which is the normal case
on a system that is properly provisioned for the task: no memory
pressure and enough CPU assigned.

But, if we fail to allocate memory (ENOMEM) when enqueueing the sk_buff
into the sockmap receive queue, we push it onto a delayed workqueue to
retry later. When a new sk_buff is received we then check whether that
queue is empty. However, simply checking the queue length is not
enough. While a sk_buff is being processed from the ingress queue but
is not yet on the sockmap msg receive queue, it is possible to also
receive a sk_buff through the normal path. That path checks the ingress
queue, sees a length of zero, and skips ahead of the packet still being
processed.

Previously we used the sock lock from both contexts, which made the
problem harder to hit, but not impossible.
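
As a rough illustration, here is a toy userspace model of the race
window (not the kernel code; all names below are hypothetical). With a
dequeue-first backlog worker, the queue length drops to zero while an
element is still being copied, so an "is the backlog empty?" check on
the receive path can let a newly arrived packet overtake the one in
flight:

/* Toy model of the race window (illustrative only, hypothetical names). */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node { int id; struct node *next; };
struct queue { struct node *head; };

static bool queue_empty(const struct queue *q) { return q->head == NULL; }

static struct node *dequeue(struct queue *q)
{
	struct node *n = q->head;

	if (n)
		q->head = n->next;
	return n;
}

int main(void)
{
	struct node pending = { .id = 1, .next = NULL };
	struct queue backlog = { .head = &pending };

	/* Backlog worker pops the element before it has been delivered. */
	struct node *in_flight = dequeue(&backlog);

	/* Receive path runs now: the backlog looks empty, so a newly
	 * arrived packet would be delivered ahead of 'in_flight'.
	 */
	printf("backlog empty while id=%d is in flight: %s\n",
	       in_flight->id, queue_empty(&backlog) ? "yes (reorder)" : "no");
	return 0;
}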

To fix this, instead of popping the skb from the queue entirely, we
peek the skb from the queue and do the copy there. This ensures that
queue length checks see a non-zero length while the skb is being
processed. Then, only once the entire skb has been copied to the user
space queue or to another socket, do we pop it off the queue. This way
the queue length check allows bypassing the queue only after the list
has been completely processed.
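
A self-contained sketch of that fixed ordering, again as a toy model
with hypothetical names rather than the kernel code: peek the element,
copy it out in possibly partial steps (akin to the -EAGAIN retry case),
and dequeue it only once everything has been consumed, so an emptiness
check taken at any point in between still sees a non-empty backlog:

/* Toy model of the fixed ordering (illustrative only, hypothetical names). */
#include <stddef.h>
#include <stdio.h>

struct node { const char *data; size_t len; struct node *next; };
struct queue { struct node *head; };

static struct node *peek(struct queue *q) { return q->head; }

static void dequeue(struct queue *q)
{
	if (q->head)
		q->head = q->head->next;
}

/* Pretend the consumer only takes a few bytes per call. */
static size_t copy_some(const char *src, size_t len, size_t off)
{
	size_t chunk = (len - off > 4) ? 4 : len - off;

	printf("copied %zu bytes at offset %zu: %.*s\n",
	       chunk, off, (int)chunk, src + off);
	return chunk;
}

static void backlog_worker(struct queue *q)
{
	struct node *n;

	while ((n = peek(q))) {		/* stays queued while being copied */
		size_t off = 0;

		while (off < n->len)
			off += copy_some(n->data, n->len, off);
		dequeue(q);		/* queue shrinks only when fully done */
	}
}

int main(void)
{
	struct node msg = { .data = "hello backlog", .len = 13, .next = NULL };
	struct queue backlog = { .head = &msg };

	backlog_worker(&backlog);
	return 0;
}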

To reproduce the issue we run the NGINX compliance test with sockmap
enabled and observe occasional flakes in our testing that we attributed
to this issue.

Fixes: 04919be ("tcp: Introduce tcp_read_skb()")
Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: William Findlay <will@isovalent.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20230523025618.113937-5-john.fastabend@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
jrfastab authored and gregkh committed Jun 5, 2023
1 parent 1e4e379 commit ba4fec5
Showing 2 changed files with 8 additions and 25 deletions.
1 change: 0 additions & 1 deletion include/linux/skmsg.h
@@ -71,7 +71,6 @@ struct sk_psock_link {
 };
 
 struct sk_psock_work_state {
-	struct sk_buff			*skb;
 	u32				len;
 	u32				off;
 };
32 changes: 8 additions & 24 deletions net/core/skmsg.c
@@ -621,16 +621,12 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 
 static void sk_psock_skb_state(struct sk_psock *psock,
 			       struct sk_psock_work_state *state,
-			       struct sk_buff *skb,
			       int len, int off)
 {
 	spin_lock_bh(&psock->ingress_lock);
 	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
-		state->skb = skb;
 		state->len = len;
 		state->off = off;
-	} else {
-		sock_drop(psock->sk, skb);
 	}
 	spin_unlock_bh(&psock->ingress_lock);
 }
@@ -641,23 +637,17 @@ static void sk_psock_backlog(struct work_struct *work)
 	struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
 	struct sk_psock_work_state *state = &psock->work_state;
 	struct sk_buff *skb = NULL;
+	u32 len = 0, off = 0;
 	bool ingress;
-	u32 len, off;
 	int ret;
 
 	mutex_lock(&psock->work_mutex);
-	if (unlikely(state->skb)) {
-		spin_lock_bh(&psock->ingress_lock);
-		skb = state->skb;
+	if (unlikely(state->len)) {
 		len = state->len;
 		off = state->off;
-		state->skb = NULL;
-		spin_unlock_bh(&psock->ingress_lock);
 	}
-	if (skb)
-		goto start;
 
-	while ((skb = skb_dequeue(&psock->ingress_skb))) {
+	while ((skb = skb_peek(&psock->ingress_skb))) {
 		len = skb->len;
 		off = 0;
 		if (skb_bpf_strparser(skb)) {
@@ -666,7 +656,6 @@ static void sk_psock_backlog(struct work_struct *work)
 			off = stm->offset;
 			len = stm->full_len;
 		}
-start:
 		ingress = skb_bpf_ingress(skb);
 		skb_bpf_redirect_clear(skb);
 		do {
@@ -676,8 +665,7 @@ static void sk_psock_backlog(struct work_struct *work)
 						  len, ingress);
 			if (ret <= 0) {
 				if (ret == -EAGAIN) {
-					sk_psock_skb_state(psock, state, skb,
-							   len, off);
+					sk_psock_skb_state(psock, state, len, off);
 
 					/* Delay slightly to prioritize any
 					 * other work that might be here.
@@ -689,15 +677,16 @@ static void sk_psock_backlog(struct work_struct *work)
 				/* Hard errors break pipe and stop xmit. */
 				sk_psock_report_error(psock, ret ? -ret : EPIPE);
 				sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
-				sock_drop(psock->sk, skb);
 				goto end;
 			}
 			off += ret;
 			len -= ret;
 		} while (len);
 
-		if (!ingress)
+		skb = skb_dequeue(&psock->ingress_skb);
+		if (!ingress) {
 			kfree_skb(skb);
+		}
 	}
 end:
 	mutex_unlock(&psock->work_mutex);
@@ -790,11 +779,6 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock)
 		skb_bpf_redirect_clear(skb);
 		sock_drop(psock->sk, skb);
 	}
-	kfree_skb(psock->work_state.skb);
-	/* We null the skb here to ensure that calls to sk_psock_backlog
-	 * do not pick up the free'd skb.
-	 */
-	psock->work_state.skb = NULL;
 	__sk_psock_purge_ingress_msg(psock);
 }

@@ -813,7 +797,6 @@ void sk_psock_stop(struct sk_psock *psock)
 	spin_lock_bh(&psock->ingress_lock);
 	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
 	sk_psock_cork_free(psock);
-	__sk_psock_zap_ingress(psock);
 	spin_unlock_bh(&psock->ingress_lock);
 }

@@ -828,6 +811,7 @@ static void sk_psock_destroy(struct work_struct *work)
 	sk_psock_done_strp(psock);
 
 	cancel_delayed_work_sync(&psock->work);
+	__sk_psock_zap_ingress(psock);
 	mutex_destroy(&psock->work_mutex);
 
 	psock_progs_drop(&psock->progs);