Permalink
Browse files

Merge branch 'replace-dst_confirm'

Julian Anastasov says:

====================
net: dst_confirm replacement

	This patchset addresses the problem of neighbour
confirmation where received replies from one nexthop
can cause confirmation of different nexthop when using
the same dst. Thanks to YueHaibing <yuehaibing@huawei.com>
for tracking the dst->pending_confirm problem.

	Sockets can obtain cached output route. Such
routes can be to known nexthop (rt_gateway=IP) or to be
used simultaneously for different nexthop IPs by different
subnet prefixes (nh->nh_scope = RT_SCOPE_HOST, rt_gateway=0).

	At first look, there are more problems:

- dst_confirm() sets flag on dst and not on dst->path,
as result, indication is lost when XFRM is used

- DNAT can change the nexthop, so the really used nexthop is
not confirmed

	So, the following solution is to avoid using
dst->pending_confirm.

	The current dst_confirm() usage is as follows:

Protocols confirming dst on received packets:
- TCP (1 dst per socket)
- SCTP (1 dst per transport)
- CXGB*

Protocols supporting sendmsg with MSG_CONFIRM [ | MSG_PROBE ] to
confirm neighbour:
- UDP IPv4/IPv6
- ICMPv4 PING
- RAW IPv4/IPv6
- L2TP/IPv6

MSG_CONFIRM for other purposes (fix not needed):
- CAN

Sending without locking the socket:
- UDP (when no cork)
- RAW (when hdrincl=1)

Redirects from old to new GW:
- rt6_do_redirect

	The patchset includes the following changes:

1. sock: add sk_dst_pending_confirm flag

- used only by TCP with patch 4 to remember the received
indication in sk->sk_dst_pending_confirm

2. net: add dst_pending_confirm flag to skbuff

- skb->dst_pending_confirm will be used by all protocols
in following patches, via skb_{set,get}_dst_pending_confirm

3. sctp: add dst_pending_confirm flag

- SCTP uses per-transport dsts and can not use
sk->sk_dst_pending_confirm like TCP

4. tcp: replace dst_confirm with sk_dst_confirm

5. net: add confirm_neigh method to dst_ops

- IPv4 and IPv6 provision for slow neigh lookups for MSG_PROBE users.
I decided to use neigh lookup only for this case because on
MSG_PROBE the skb may pass MTU checks but it does not reach
the neigh confirmation code. This patch will be used from patch 6.

- xfrm_confirm_neigh: we use the last tunnel address, if present.
When there are only transports, the original dest address is used.

6. net: use dst_confirm_neigh for UDP, RAW, ICMP, L2TP

- dst_confirm conversion for UDP, RAW, ICMP and L2TP/IPv6

- these protocols use MSG_CONFIRM propagated by ip*_append_data
to skb->dst_pending_confirm. sk->sk_dst_pending_confirm is not
used because some sending paths do not lock the socket. For
MSG_PROBE we use the slow lookup (dst_confirm_neigh).

- there are also 2 cases that need the slow lookup:
__ip6_rt_update_pmtu and rt6_do_redirect. I hope
&ipv6_hdr(skb)->saddr is the correct nexthop address to use here.

7. net: pending_confirm is not used anymore

- I failed to understand the CXGB* code, I see dst_confirm()
calls but I'm not sure dst_neigh_output() was called. For now
I just removed the dst->pending_confirm flag and left all
dst_confirm() calls there. Any better idea?

- Now may be old function neigh_output() should be restored
instead of dst_neigh_output?
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information...
davem330 committed Feb 7, 2017
2 parents b08d46b + 51ce8bd commit 29ba6e7400a317725bdfb86a725d1824447dbcd7
View
@@ -378,6 +378,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk,
if (unlikely(!neigh))
neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
if (!IS_ERR(neigh)) {
sock_confirm_neigh(skb, neigh);
ret = dst_neigh_output(dst, neigh, skb);
rcu_read_unlock_bh();
return ret;
@@ -574,8 +575,10 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
if (!IS_ERR(neigh))
if (!IS_ERR(neigh)) {
sock_confirm_neigh(skb, neigh);
ret = dst_neigh_output(dst, neigh, skb);
}
rcu_read_unlock_bh();
err:
View
@@ -612,6 +612,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
* @wifi_acked_valid: wifi_acked was set
* @wifi_acked: whether frame was acked on wifi or not
* @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
* @dst_pending_confirm: need to confirm neighbour
* @napi_id: id of the NAPI struct this skb came from
* @secmark: security marking
* @mark: Generic packet mark
@@ -741,6 +742,7 @@ struct sk_buff {
__u8 csum_level:2;
__u8 csum_bad:1;
__u8 dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
@@ -3698,6 +3700,16 @@ static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
return skb->queue_mapping != 0;
}
static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val)
{
skb->dst_pending_confirm = val;
}
static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb)
{
return skb->dst_pending_confirm != 0;
}
static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
View
@@ -35,6 +35,22 @@ static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32
return n;
}
static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key)
{
struct neighbour *n;
rcu_read_lock_bh();
n = __ipv4_neigh_lookup_noref(dev, key);
if (n) {
unsigned long now = jiffies;
/* avoid dirtying neighbour */
if (n->confirmed != now)
n->confirmed = now;
}
rcu_read_unlock_bh();
}
void arp_init(void);
int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg);
void arp_send(int type, int ptype, __be32 dest_ip,
View
@@ -59,8 +59,6 @@ struct dst_entry {
#define DST_XFRM_QUEUE 0x0100
#define DST_METADATA 0x0200
unsigned short pending_confirm;
short error;
/* A non-zero value of dst->obsolete forces by-hand validation
@@ -78,6 +76,8 @@ struct dst_entry {
#define DST_OBSOLETE_KILL -2
unsigned short header_len; /* more space at head required */
unsigned short trailer_len; /* space to reserve at tail */
unsigned short __pad3;
#ifdef CONFIG_IP_ROUTE_CLASSID
__u32 tclassid;
#else
@@ -440,23 +440,13 @@ static inline void dst_rcu_free(struct rcu_head *head)
static inline void dst_confirm(struct dst_entry *dst)
{
dst->pending_confirm = 1;
}
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
struct sk_buff *skb)
{
const struct hh_cache *hh;
if (dst->pending_confirm) {
unsigned long now = jiffies;
dst->pending_confirm = 0;
/* avoid dirtying neighbour */
if (n->confirmed != now)
n->confirmed = now;
}
hh = &n->hh;
if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
return neigh_hh_output(hh, skb);
@@ -477,6 +467,13 @@ static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst
return IS_ERR(n) ? NULL : n;
}
static inline void dst_confirm_neigh(const struct dst_entry *dst,
const void *daddr)
{
if (dst->ops->confirm_neigh)
dst->ops->confirm_neigh(dst, daddr);
}
static inline void dst_link_failure(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
View
@@ -33,6 +33,8 @@ struct dst_ops {
struct neighbour * (*neigh_lookup)(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr);
void (*confirm_neigh)(const struct dst_entry *dst,
const void *daddr);
struct kmem_cache *kmem_cachep;
View
@@ -391,6 +391,23 @@ static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, cons
return n;
}
static inline void __ipv6_confirm_neigh(struct net_device *dev,
const void *pkey)
{
struct neighbour *n;
rcu_read_lock_bh();
n = __ipv6_neigh_lookup_noref(dev, pkey);
if (n) {
unsigned long now = jiffies;
/* avoid dirtying neighbour */
if (n->confirmed != now)
n->confirmed = now;
}
rcu_read_unlock_bh();
}
int ndisc_init(void);
int ndisc_late_init(void);
View
@@ -593,10 +593,8 @@ static inline void sctp_v4_map_v6(union sctp_addr *addr)
*/
static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *t)
{
if (t->dst && !dst_check(t->dst, t->dst_cookie)) {
dst_release(t->dst);
t->dst = NULL;
}
if (t->dst && !dst_check(t->dst, t->dst_cookie))
sctp_transport_dst_release(t);
return t->dst;
}
@@ -804,6 +804,8 @@ struct sctp_transport {
__u32 burst_limited; /* Holds old cwnd when max.burst is applied */
__u32 dst_pending_confirm; /* need to confirm neighbour */
/* Destination */
struct dst_entry *dst;
/* Source address. */
@@ -950,6 +952,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *);
void sctp_transport_reset(struct sctp_transport *);
void sctp_transport_update_pmtu(struct sock *, struct sctp_transport *, u32);
void sctp_transport_immediate_rtx(struct sctp_transport *);
void sctp_transport_dst_release(struct sctp_transport *t);
void sctp_transport_dst_confirm(struct sctp_transport *t);
/* This is the structure we use to queue packets as they come into
View
@@ -240,6 +240,7 @@ struct sock_common {
* @sk_wq: sock wait queue and async head
* @sk_rx_dst: receive input route used by early demux
* @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
* @sk_receive_queue: incoming packets
* @sk_wmem_alloc: transmit queue bytes committed
@@ -393,6 +394,8 @@ struct sock {
struct sk_buff_head sk_write_queue;
__s32 sk_peek_off;
int sk_write_pending;
__u32 sk_dst_pending_confirm;
/* Note: 32bit hole on 64bit arches */
long sk_sndtimeo;
struct timer_list sk_timer;
__u32 sk_priority;
@@ -1764,6 +1767,7 @@ static inline void dst_negative_advice(struct sock *sk)
if (ndst != dst) {
rcu_assign_pointer(sk->sk_dst_cache, ndst);
sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
}
}
}
@@ -1774,6 +1778,7 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst)
struct dst_entry *old_dst;
sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
/*
* This can be called while sk is owned by the caller only,
* with no state that can be checked in a rcu_dereference_check() cond
@@ -1789,6 +1794,7 @@ sk_dst_set(struct sock *sk, struct dst_entry *dst)
struct dst_entry *old_dst;
sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
dst_release(old_dst);
}
@@ -1809,6 +1815,26 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
static inline void sk_dst_confirm(struct sock *sk)
{
if (!sk->sk_dst_pending_confirm)
sk->sk_dst_pending_confirm = 1;
}
static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
{
if (skb_get_dst_pending_confirm(skb)) {
struct sock *sk = skb->sk;
unsigned long now = jiffies;
/* avoid dirtying neighbour */
if (n->confirmed != now)
n->confirmed = now;
if (sk && sk->sk_dst_pending_confirm)
sk->sk_dst_pending_confirm = 0;
}
}
bool sk_mc_loop(struct sock *sk);
static inline bool sk_can_gso(const struct sock *sk)
View
@@ -190,7 +190,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
dst->__use = 0;
dst->lastuse = jiffies;
dst->flags = flags;
dst->pending_confirm = 0;
dst->next = NULL;
if (!(flags & DST_NOCOUNT))
dst_entries_add(ops, 1);
View
@@ -502,6 +502,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
@@ -1519,6 +1520,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
af_family_clock_key_strings[newsk->sk_family]);
newsk->sk_dst_cache = NULL;
newsk->sk_dst_pending_confirm = 0;
newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0;
atomic_set(&newsk->sk_drops, 0);
View
@@ -222,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
if (!IS_ERR(neigh)) {
int res = dst_neigh_output(dst, neigh, skb);
int res;
sock_confirm_neigh(skb, neigh);
res = dst_neigh_output(dst, neigh, skb);
rcu_read_unlock_bh();
return res;
@@ -886,6 +889,9 @@ static inline int ip_ufo_append_data(struct sock *sk,
skb->csum = 0;
if (flags & MSG_CONFIRM)
skb_set_dst_pending_confirm(skb, 1);
__skb_queue_tail(queue, skb);
} else if (skb_is_gso(skb)) {
goto append;
@@ -1086,6 +1092,9 @@ static int __ip_append_data(struct sock *sk,
exthdrlen = 0;
csummode = CHECKSUM_NONE;
if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1);
/*
* Put the packet on the pending queue.
*/
View
@@ -848,7 +848,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return err;
do_confirm:
dst_confirm(&rt->dst);
if (msg->msg_flags & MSG_PROBE)
dst_confirm_neigh(&rt->dst, &fl4.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
View
@@ -383,6 +383,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
if (flags & MSG_CONFIRM)
skb_set_dst_pending_confirm(skb, 1);
skb->transport_header = skb->network_header;
err = -EFAULT;
if (memcpy_from_msg(iph, msg, length))
@@ -666,7 +669,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return len;
do_confirm:
dst_confirm(&rt->dst);
if (msg->msg_flags & MSG_PROBE)
dst_confirm_neigh(&rt->dst, &fl4.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
View
@@ -154,6 +154,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
@@ -168,6 +169,7 @@ static struct dst_ops ipv4_dst_ops = {
.redirect = ip_do_redirect,
.local_out = __ip_local_out,
.neigh_lookup = ipv4_neigh_lookup,
.confirm_neigh = ipv4_confirm_neigh,
};
#define ECN_OR_COST(class) TC_PRIO_##class
@@ -461,6 +463,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
return neigh_create(&arp_tbl, pkey, dev);
}
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
struct net_device *dev = dst->dev;
const __be32 *pkey = daddr;
const struct rtable *rt;
rt = (const struct rtable *)dst;
if (rt->rt_gateway)
pkey = (const __be32 *)&rt->rt_gateway;
else if (!daddr ||
(rt->rt_flags &
(RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
return;
__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
#define IP_IDENTS_SZ 2048u
static atomic_t *ip_idents __read_mostly;
Oops, something went wrong.

0 comments on commit 29ba6e7

Please sign in to comment.