Skip to content

Commit

Permalink
net-tcp: re-generalize TSO sizing in TCP CC module API
Browse files Browse the repository at this point in the history
Reorganize the API for CC modules so that the CC module once again
gets complete control of the TSO sizing decision. This is how the API
was set up around 2016, at the time of the initial BBRv1 upstreaming.
Later Eric Dumazet simplified it. But with wider testing it now seems
that, to avoid CPU regressions, BBR needs to have a different TSO
sizing function.

This is necessary to handle cases where there are many flows
bottlenecked on the sender host's NIC, in which case BBR's pacing rate
is much lower than CUBIC/Reno/DCTCP's. Why does this happen? Because
BBR's pacing rate adapts to the low bandwidth share each flow sees. By
contrast, CUBIC/Reno/DCTCP see no loss or ECN, so they grow a very
large cwnd, and thus large pacing rate and large TSO burst size.

Change-Id: Ic8ccfdbe4010ee8d4bf6a6334c48a2fceb2171ea
  • Loading branch information
nealcardwell authored and xanmod committed Feb 16, 2021
1 parent 4508a81 commit 71b9fc8
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 21 deletions.
4 changes: 2 additions & 2 deletions include/net/tcp.h
Expand Up @@ -1103,8 +1103,8 @@ struct tcp_congestion_ops {
u32 (*undo_cwnd)(struct sock *sk);
/* hook for packet ack accounting (optional) */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
/* override sysctl_tcp_min_tso_segs */
u32 (*min_tso_segs)(struct sock *sk);
/* pick target number of segments per TSO/GSO skb (optional): */
u32 (*tso_segs)(struct sock *sk, unsigned int mss_now);
/* returns the multiplier used in tcp_sndbuf_expand (optional) */
u32 (*sndbuf_expand)(struct sock *sk);
/* react to a specific lost skb (optional) */
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/bpf_tcp_ca.c
Expand Up @@ -16,7 +16,7 @@ static u32 optional_ops[] = {
offsetof(struct tcp_congestion_ops, cwnd_event),
offsetof(struct tcp_congestion_ops, in_ack_event),
offsetof(struct tcp_congestion_ops, pkts_acked),
offsetof(struct tcp_congestion_ops, min_tso_segs),
offsetof(struct tcp_congestion_ops, tso_segs),
offsetof(struct tcp_congestion_ops, sndbuf_expand),
offsetof(struct tcp_congestion_ops, cong_control),
};
Expand Down
38 changes: 26 additions & 12 deletions net/ipv4/tcp_bbr.c
Expand Up @@ -292,26 +292,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
sk->sk_pacing_rate = rate;
}

/* Floor on segments per TSO/GSO skb, replacing sysctl_tcp_min_tso_segs:
 * allow 1-segment skbs only when pacing below bbr_min_tso_rate.
 */
static u32 bbr_min_tso_segs(struct sock *sk)
{
	if (sk->sk_pacing_rate < (bbr_min_tso_rate >> 3))
		return 1;
	return 2;
}

/* Return the number of segments BBR would like in a TSO/GSO skb, given
 * a particular max gso size as a constraint.
 */
static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
				u32 gso_max_size)
{
	u64 burst_bytes;
	u32 nsegs;

	/* Budget a TSO/GSO burst size allowance based on bw (pacing_rate),
	 * then cap it so the skb stays under the gso size limit.
	 */
	burst_bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift;
	burst_bytes = min_t(u32, burst_bytes, gso_max_size - 1 - MAX_TCP_HEADER);

	/* Never go below the CC module's minimum segment count. */
	nsegs = max_t(u32, burst_bytes / mss_now, bbr_min_tso_segs(sk));
	return nsegs;
}

/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
{
	u32 gso_limit = sk->sk_gso_max_size;

	return bbr_tso_segs_generic(sk, mss_now, gso_limit);
}

/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size.
 * Caps the burst at GSO_MAX_SIZE rather than the driver-provided limit.
 */
static u32 bbr_tso_segs_goal(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* The old open-coded sizing (min_t/max_t on sk_pacing_rate and
	 * mss_cache) was removed by this commit; all sizing now funnels
	 * through bbr_tso_segs_generic() so BBR has one TSO policy.
	 */
	return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE);
}

/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
Expand Down Expand Up @@ -1147,7 +1161,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
.undo_cwnd = bbr_undo_cwnd,
.cwnd_event = bbr_cwnd_event,
.ssthresh = bbr_ssthresh,
.min_tso_segs = bbr_min_tso_segs,
.tso_segs = bbr_tso_segs,
.get_info = bbr_get_info,
.set_state = bbr_set_state,
};
Expand Down
11 changes: 5 additions & 6 deletions net/ipv4/tcp_output.c
Expand Up @@ -1989,13 +1989,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
/* Pick the number of segments per TSO/GSO skb: defer to the congestion
 * control module's tso_segs() hook when it provides one, otherwise fall
 * back to tcp_tso_autosize() with the sysctl minimum. The result is
 * always clamped to the device's sk_gso_max_segs.
 */
static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
{
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	u32 tso_segs;

	tso_segs = ca_ops->tso_segs ?
			ca_ops->tso_segs(sk, mss_now) :
			tcp_tso_autosize(sk, mss_now,
					 sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
}

Expand Down

0 comments on commit 71b9fc8

Please sign in to comment.