CP-14765 - Use atomic count of VCPUs on PCPU runqueue to avoid taking spinlock

Credit1 SMP load balancing tries to find work on other PCPUs by looking through
their runqueues. It takes the PCPU runqueue spinlock in order to safely process
the list. This causes lock contention on the PCPU runqueue and cache line bouncing.

Use an atomic count of the items on each PCPU runqueue so that VCPUs are not
stolen from a PCPU in an under-committed situation and so that the runqueue
spinlock is not taken needlessly.

This should help performance as it reduces cache line bouncing of the spinlock
itself and helps keep the caches hot. Additionally, it may improve turbo
behaviour on Intel processors, as cores will be able to idle more often.
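
The patch below is the actual change; as a quick illustration of the pattern it
relies on, here is a minimal standalone C sketch (hypothetical names, C11 atomics
and pthread spinlocks instead of Xen's primitives): keep a racy atomic count
beside each runqueue and have the load balancer skip peers with fewer than two
queued tasks before ever touching their lock.

/* Standalone sketch of the check-before-lock pattern; not the Xen code.
 * Build: cc -std=c11 sketch.c -lpthread
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

struct runqueue {
    pthread_spinlock_t lock;   /* protects the queue contents            */
    atomic_int count;          /* racy hint: number of queued tasks      */
    /* ... list of queued tasks would live here ... */
};

static struct runqueue runq[NR_CPUS];

static void runq_init(void)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++) {
        pthread_spin_init(&runq[cpu].lock, PTHREAD_PROCESS_PRIVATE);
        atomic_init(&runq[cpu].count, 0);   /* mirrors the csched_alloc_pdata() hunk */
    }
}

/* In Xen, __runq_insert()/__runq_remove() already run with the runqueue
 * lock held; the sketch takes it here to stay self-contained. */
static void runq_insert(int cpu)
{
    pthread_spin_lock(&runq[cpu].lock);
    /* ... list_add_tail() of the task in the real code ... */
    atomic_fetch_add(&runq[cpu].count, 1);
    pthread_spin_unlock(&runq[cpu].lock);
}

static void runq_remove(int cpu)
{
    pthread_spin_lock(&runq[cpu].lock);
    /* ... list_del_init() of the task in the real code ... */
    atomic_fetch_sub(&runq[cpu].count, 1);
    pthread_spin_unlock(&runq[cpu].lock);
}

/* Returns true if work was stolen from some peer CPU. */
static bool load_balance(int self)
{
    for (int peer = 0; peer < NR_CPUS; peer++) {
        if (peer == self)
            continue;

        /* Cheap racy check first: a peer with fewer than two queued
         * tasks has nothing worth stealing, so don't contend on its
         * lock.  The race is harmless because the lock is still taken
         * before anything is actually migrated. */
        if (atomic_load(&runq[peer].count) < 2)
            continue;

        if (pthread_spin_trylock(&runq[peer].lock) != 0)
            continue;          /* peer is busy; try the next one */

        /* ... re-check under the lock and migrate one task ... */
        pthread_spin_unlock(&runq[peer].lock);
        return true;
    }
    return false;
}

int main(void)
{
    runq_init();
    runq_insert(1);
    runq_insert(1);            /* CPU 1 now looks worth stealing from */
    printf("stole work: %s\n", load_balance(0) ? "yes" : "no");
    runq_remove(1);
    runq_remove(1);
    return 0;
}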
Malcolm Crossley committed Jan 11, 2016
1 parent 221eb49 commit 0f830b9
Showing 2 changed files with 72 additions and 0 deletions.
71 changes: 71 additions & 0 deletions master/sched-credit1-use-per-pcpu-runqueue-count.patch
@@ -0,0 +1,71 @@
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index 507e957..bf59499 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -238,6 +238,7 @@ struct csched_private {

static void csched_tick(void *_cpu);
static void csched_acct(void *dummy);
+DEFINE_PER_CPU(atomic_t, runqueue_count);

static inline int
__vcpu_on_runq(struct csched_vcpu *svc)
@@ -280,6 +281,7 @@ __runq_insert(unsigned int cpu, struct csched_vcpu *svc)
}

list_add_tail(&svc->runq_elem, iter);
+ atomic_inc(&per_cpu(runqueue_count, cpu));
}

static inline void
@@ -287,6 +289,7 @@ __runq_remove(struct csched_vcpu *svc)
{
BUG_ON( !__vcpu_on_runq(svc) );
list_del_init(&svc->runq_elem);
+ atomic_dec(&per_cpu(runqueue_count, svc->vcpu->processor));
}


@@ -555,6 +558,7 @@ csched_alloc_pdata(const struct scheduler *ops, int cpu)
/* Start off idling... */
BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu)));
cpumask_set_cpu(cpu, prv->idlers);
+ atomic_set(&per_cpu(runqueue_count, cpu), 0);

spin_unlock_irqrestore(&prv->lock, flags);

@@ -1576,6 +1580,17 @@ csched_load_balance(struct csched_private *prv, int cpu,
goto next_node;
do
{
+ spinlock_t *lock;
+
+ /*
+ * Ignore peer cpu with only one task on its runqueue,
+ * this will race with adding/removing tasks but the
+ * lock for the runqueue is still taken below
+ */
+ if (atomic_read(&per_cpu(runqueue_count, peer_cpu)) < 2) {
+ peer_cpu = cpumask_cycle(peer_cpu, &workers);
+ continue;
+ }
/*
* Get ahold of the scheduler lock for this peer CPU.
*
@@ -1583,7 +1598,7 @@ csched_load_balance(struct csched_private *prv, int cpu,
* could cause a deadlock if the peer CPU is also load
* balancing and trying to lock this CPU.
*/
- spinlock_t *lock = pcpu_schedule_trylock(peer_cpu);
+ lock = pcpu_schedule_trylock(peer_cpu);

if ( !lock )
{
@@ -1806,6 +1821,7 @@ csched_dump_pcpu(const struct scheduler *ops, int cpu)
runq = &spc->runq;

cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
+ printk(" qcnt %d", atomic_read(&per_cpu(runqueue_count, cpu)));
printk(" sort=%d, sibling=%s, ", spc->runq_sort_last, cpustr);
cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
printk("core=%s\n", cpustr);
1 change: 1 addition & 0 deletions master/series
@@ -128,6 +128,7 @@ local-cpuid.patch
detect-nehalem-c-state.patch # malcolmc
quirk-hp-gen8-rmrr.patch # malcolmc
quirk-pci-phantom-function-devices.patch # malcolmc
sched-credit1-use-per-pcpu-runqueue-count.patch # malcolmc

# dvrabel - v1 posted
0001-trace-include-timestamp-in-trace-records-added-by-HV.patch
