Skip to content

Commit

Permalink
openvswitch: Introduce per-cpu upcall dispatch
Browse files Browse the repository at this point in the history
The Open vSwitch kernel module uses the upcall mechanism to send
packets from kernel space to user space when it misses in the kernel
space flow table. The upcall sends packets via a Netlink socket.
Currently, a Netlink socket is created for every vport. In this way,
there is a 1:1 mapping between a vport and a Netlink socket.
When a packet is received by a vport, if it needs to be sent to
user space, it is sent via the corresponding Netlink socket.

This mechanism, with various iterations of the corresponding user
space code, has seen some limitations and issues:

* On systems with a large number of vports, there is a correspondingly
large number of Netlink sockets which can limit scaling.
(https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
* Packet reordering on upcalls.
(https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
* A thundering herd issue.
(https://bugzilla.redhat.com/show_bug.cgi?id=1834444)

This patch introduces an alternative, feature-negotiated, upcall
mode using a per-cpu dispatch rather than a per-vport dispatch.

In this mode, the Netlink socket to be used for the upcall is
selected based on the CPU of the thread that is executing the upcall.
In this way, it resolves the issues above as:

a) The number of Netlink sockets scales with the number of CPUs
rather than the number of vports.
b) Ordering per-flow is maintained as packets are distributed to
CPUs based on mechanisms such as RSS and flows are distributed
to a single user space thread.
c) Packets from a flow can only wake up one user space thread.

The corresponding user space code can be found at:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385139.html

Bugzilla: https://bugzilla.redhat.com/1844576
Signed-off-by: Mark Gray <mark.d.gray@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
markdgray authored and davem330 committed Jul 16, 2021
1 parent 919d527 commit b83d23a
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 3 deletions.
8 changes: 8 additions & 0 deletions include/uapi/linux/openvswitch.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ enum ovs_datapath_cmd {
* set on the datapath port (for OVS_ACTION_ATTR_MISS). Only valid on
* %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
* not be sent.
* @OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
* OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
* @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
* datapath. Always present in notifications.
* @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the
Expand All @@ -87,6 +89,9 @@ enum ovs_datapath_attr {
OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */
OVS_DP_ATTR_PAD,
OVS_DP_ATTR_MASKS_CACHE_SIZE,
OVS_DP_ATTR_PER_CPU_PIDS, /* Netlink PIDS to receive upcalls in per-cpu
* dispatch mode
*/
__OVS_DP_ATTR_MAX
};

Expand Down Expand Up @@ -127,6 +132,9 @@ struct ovs_vport_stats {
/* Allow tc offload recirc sharing */
#define OVS_DP_F_TC_RECIRC_SHARING (1 << 2)

/* Allow per-cpu dispatch of upcalls */
#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU (1 << 3)

/* Fixed logical ports. */
#define OVSP_LOCAL ((__u32)0)

Expand Down
6 changes: 5 additions & 1 deletion net/openvswitch/actions.c
Original file line number Diff line number Diff line change
Expand Up @@ -924,7 +924,11 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
break;

case OVS_USERSPACE_ATTR_PID:
upcall.portid = nla_get_u32(a);
if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
upcall.portid =
ovs_dp_get_upcall_portid(dp, smp_processor_id());
else
upcall.portid = nla_get_u32(a);
break;

case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: {
Expand Down
72 changes: 70 additions & 2 deletions net/openvswitch/datapath.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,

static void ovs_dp_masks_rebalance(struct work_struct *work);

static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *);

/* Must be called with rcu_read_lock or ovs_mutex. */
const char *ovs_dp_name(const struct datapath *dp)
{
Expand Down Expand Up @@ -166,6 +168,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
free_percpu(dp->stats_percpu);
kfree(dp->ports);
ovs_meters_exit(dp);
kfree(dp->upcall_portids);
kfree(dp);
}

Expand Down Expand Up @@ -239,7 +242,12 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)

memset(&upcall, 0, sizeof(upcall));
upcall.cmd = OVS_PACKET_CMD_MISS;
upcall.portid = ovs_vport_find_upcall_portid(p, skb);

if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id());
else
upcall.portid = ovs_vport_find_upcall_portid(p, skb);

upcall.mru = OVS_CB(skb)->mru;
error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
if (unlikely(error))
Expand Down Expand Up @@ -1594,16 +1602,67 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb,

DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support);

static int ovs_dp_set_upcall_portids(struct datapath *dp,
const struct nlattr *ids)
{
struct dp_nlsk_pids *old, *dp_nlsk_pids;

if (!nla_len(ids) || nla_len(ids) % sizeof(u32))
return -EINVAL;

old = ovsl_dereference(dp->upcall_portids);

dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids),
GFP_KERNEL);
if (!dp_nlsk_pids)
return -ENOMEM;

dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32);
nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids));

rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids);

kfree_rcu(old, rcu);

return 0;
}

/**
 * ovs_dp_get_upcall_portid - pick the Netlink port ID for a per-cpu upcall
 * @dp: datapath whose 'upcall_portids' table is consulted.
 * @cpu_id: ID of the CPU executing the upcall.
 *
 * The table is read with rcu_dereference(), so the caller is presumably
 * expected to be inside an RCU read-side critical section.
 *
 * Return: the port ID stored for @cpu_id.  If @cpu_id lies beyond the end
 * of the table, the entry at (@cpu_id modulo the table size) is returned
 * instead.  Returns 0 when no table is installed or the table is empty.
 */
u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id)
{
	struct dp_nlsk_pids *dp_nlsk_pids;

	dp_nlsk_pids = rcu_dereference(dp->upcall_portids);

	/* No table, or an empty one: there is no port ID to hand out. */
	if (!dp_nlsk_pids || !dp_nlsk_pids->n_pids)
		return 0;

	if (cpu_id < dp_nlsk_pids->n_pids)
		return dp_nlsk_pids->pids[cpu_id];

	/* The number of Netlink PIDs supplied by user space is smaller than
	 * the CPU ID seen by the kernel.  Log this and wrap the CPU ID onto
	 * the available sockets so that the packet is not dropped.
	 */
	pr_info_ratelimited("cpu_id mismatch with handler threads\n");
	return dp_nlsk_pids->pids[cpu_id % dp_nlsk_pids->n_pids];
}

static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
{
u32 user_features = 0;
int err;

if (a[OVS_DP_ATTR_USER_FEATURES]) {
user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);

if (user_features & ~(OVS_DP_F_VPORT_PIDS |
OVS_DP_F_UNALIGNED |
OVS_DP_F_TC_RECIRC_SHARING))
OVS_DP_F_TC_RECIRC_SHARING |
OVS_DP_F_DISPATCH_UPCALL_PER_CPU))
return -EOPNOTSUPP;

#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
Expand All @@ -1624,6 +1683,15 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])

dp->user_features = user_features;

if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU &&
a[OVS_DP_ATTR_PER_CPU_PIDS]) {
/* Upcall Netlink Port IDs have been updated */
err = ovs_dp_set_upcall_portids(dp,
a[OVS_DP_ATTR_PER_CPU_PIDS]);
if (err)
return err;
}

if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
static_branch_enable(&tc_recirc_sharing_support);
else
Expand Down
20 changes: 20 additions & 0 deletions net/openvswitch/datapath.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,21 @@ struct dp_stats_percpu {
struct u64_stats_sync syncp;
};

/**
* struct dp_nlsk_pids - array of Netlink port IDs for a datapath.
* This is used when OVS_DP_F_DISPATCH_UPCALL_PER_CPU
* is enabled and must be protected by rcu.
* @rcu: RCU callback head for deferred destruction.
* @n_pids: Number of entries in the @pids array.
* @pids: Array storing the Netlink socket PIDs, indexed by CPU ID, that
* receive upcalls (flow table misses and userspace actions) in
* per-cpu dispatch mode.
*/
struct dp_nlsk_pids {
struct rcu_head rcu;
u32 n_pids;
u32 pids[];
};

/**
* struct datapath - datapath for flow-based packet switching
* @rcu: RCU callback head for deferred destruction.
Expand All @@ -61,6 +76,7 @@ struct dp_stats_percpu {
* @net: Reference to net namespace.
* @max_headroom: the maximum headroom of all vports in this datapath; it will
* be used by all the internal vports in this dp.
* @upcall_portids: RCU protected 'struct dp_nlsk_pids'.
*
* Context: See the comment on locking at the top of datapath.c for additional
* locking information.
Expand All @@ -87,6 +103,8 @@ struct datapath {

/* Switch meters. */
struct dp_meter_table meter_tbl;

struct dp_nlsk_pids __rcu *upcall_portids;
};

/**
Expand Down Expand Up @@ -243,6 +261,8 @@ int ovs_dp_upcall(struct datapath *, struct sk_buff *,
const struct sw_flow_key *, const struct dp_upcall_info *,
uint32_t cutlen);

u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id);

const char *ovs_dp_name(const struct datapath *dp);
struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net,
u32 portid, u32 seq, u8 cmd);
Expand Down

0 comments on commit b83d23a

Please sign in to comment.