Skip to content

Commit

Permalink
kernel - Another huge HUGE VM performance improvement for many-cores
Browse files Browse the repository at this point in the history
This requires a bit of explanation.  The last single-point spinlocks in the
VM system were the spinlocks for the inactive and active queue.  Even though
these two spinlocks are only held for a very short period of time they can
create a major point of contention when one has (e.g.) 48 cores all trying
to run a VM fault at the same time.  This is an issue with multi-socket/
many-cores systems and not so much an issue with single-socket systems.

On many-cores systems the global VM fault rate was limited to around
~200-250K zfod faults per second prior to this commit on our 48-core
opteron test box.  Since any single compiler process can run ~35K zfod
faults per second the maximum concurrency topped out at around ~7 concurrent
processes.

With this commit the global VM fault rate was tested to almost 900K zfod
faults per second.  That's 900,000 page faults per second (about 3.5 GBytes
per second).  Typical operation was consistently above 750K zfod faults per
second.  Maximum concurrency at a 35K fault rate per process is thus
increased from 7 processes to over 25 processes, and is probably approaching
the physical memory bus limit considering that one also has to take into
account generic page-fault overhead above and beyond the memory impact on the
page itself.

I can't stress enough how important it is to avoid contention entirely when
possible on a many-cores system.  In this case even though the VM page queue
spinlocks are only held for a very short period of time, the convulsing of
the cache coherency management between physical cpu sockets when all the
cores need to use the spinlock still created an enormous bottleneck.  Fixing
this one spinlock easily doubled concurrent compiler performance on our
48-core opteron.

* Fan-out the PQ_INACTIVE and PQ_ACTIVE page queues from 1 queue to
  256 queues, each with its own spin lock.

* This removes the last major contention point in the VM system.

* -j48 buildkernel test on monster (48-core opteron) now runs in 55 seconds.
  It was originally 167 seconds, and 101 seconds just prior to this commit.

  Concurrent compiles are now three times faster (a +200% improvement) on
  a many-cores box, with virtually no contention at all.
  • Loading branch information
Matthew Dillon committed Oct 28, 2011
1 parent fc9ed34 commit 027193e
Show file tree
Hide file tree
Showing 6 changed files with 400 additions and 272 deletions.
16 changes: 11 additions & 5 deletions sys/vm/vm_contig.c
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ static int
vm_contig_pg_alloc(unsigned long size, vm_paddr_t low, vm_paddr_t high,
unsigned long alignment, unsigned long boundary, int mflags)
{
int i, start, pass;
int i, q, start, pass;
vm_offset_t phys;
vm_page_t pga = vm_page_array;
vm_page_t m;
Expand Down Expand Up @@ -302,8 +302,11 @@ vm_contig_pg_alloc(unsigned long size, vm_paddr_t low, vm_paddr_t high,
* This is quite quick, for now stall all
* callers, even if they've specified M_NOWAIT.
*/
vm_contig_pg_clean(PQ_INACTIVE,
vmstats.v_inactive_count);
for (q = 0; q < PQ_L2_SIZE; ++q) {
vm_contig_pg_clean(PQ_INACTIVE + q,
vmstats.v_inactive_count);
lwkt_yield();
}

/*
* Best effort flush of active pages.
Expand All @@ -316,8 +319,11 @@ vm_contig_pg_alloc(unsigned long size, vm_paddr_t low, vm_paddr_t high,
* will fail in the index < 0 case.
*/
if (pass > 0 && (mflags & M_WAITOK)) {
vm_contig_pg_clean(PQ_ACTIVE,
vmstats.v_active_count);
for (q = 0; q < PQ_L2_SIZE; ++q) {
vm_contig_pg_clean(PQ_ACTIVE + q,
vmstats.v_active_count);
}
lwkt_yield();
}

/*
Expand Down
47 changes: 29 additions & 18 deletions sys/vm/vm_page.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,12 @@ vm_page_queue_init(void)
vm_page_queues[PQ_FREE+i].cnt = &vmstats.v_free_count;
for (i = 0; i < PQ_L2_SIZE; i++)
vm_page_queues[PQ_CACHE+i].cnt = &vmstats.v_cache_count;

vm_page_queues[PQ_INACTIVE].cnt = &vmstats.v_inactive_count;
vm_page_queues[PQ_ACTIVE].cnt = &vmstats.v_active_count;
vm_page_queues[PQ_HOLD].cnt = &vmstats.v_active_count;
for (i = 0; i < PQ_L2_SIZE; i++)
vm_page_queues[PQ_INACTIVE+i].cnt = &vmstats.v_inactive_count;
for (i = 0; i < PQ_L2_SIZE; i++)
vm_page_queues[PQ_ACTIVE+i].cnt = &vmstats.v_active_count;
for (i = 0; i < PQ_L2_SIZE; i++)
vm_page_queues[PQ_HOLD+i].cnt = &vmstats.v_active_count;
/* PQ_NONE has no queue */

for (i = 0; i < PQ_COUNT; i++) {
Expand Down Expand Up @@ -719,7 +721,7 @@ vm_page_hold(vm_page_t m)
if (m->queue - m->pc == PQ_FREE) {
_vm_page_queue_spin_lock(m);
_vm_page_rem_queue_spinlocked(m);
_vm_page_add_queue_spinlocked(m, PQ_HOLD, 0);
_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
_vm_page_queue_spin_unlock(m);
}
vm_page_spin_unlock(m);
Expand All @@ -736,7 +738,7 @@ vm_page_unhold(vm_page_t m)
{
vm_page_spin_lock(m);
atomic_add_int(&m->hold_count, -1);
if (m->hold_count == 0 && m->queue == PQ_HOLD) {
if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
_vm_page_queue_spin_lock(m);
_vm_page_rem_queue_spinlocked(m);
_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 0);
Expand Down Expand Up @@ -1527,7 +1529,7 @@ vm_page_activate(vm_page_t m)
u_short oqueue;

vm_page_spin_lock(m);
if (m->queue != PQ_ACTIVE) {
if (m->queue - m->pc != PQ_ACTIVE) {
_vm_page_queue_spin_lock(m);
oqueue = _vm_page_rem_queue_spinlocked(m);
/* page is left spinlocked, queue is unlocked */
Expand All @@ -1537,7 +1539,7 @@ vm_page_activate(vm_page_t m)
if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
if (m->act_count < ACT_INIT)
m->act_count = ACT_INIT;
_vm_page_add_queue_spinlocked(m, PQ_ACTIVE, 0);
_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
}
_vm_page_and_queue_spin_unlock(m);
if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
Expand Down Expand Up @@ -1667,7 +1669,7 @@ vm_page_free_toq(vm_page_t m)

if (m->hold_count != 0) {
vm_page_flag_clear(m, PG_ZERO);
_vm_page_add_queue_spinlocked(m, PQ_HOLD, 0);
_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
} else {
_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 0);
}
Expand Down Expand Up @@ -1838,13 +1840,14 @@ vm_page_unwire(vm_page_t m, int activate)
;
} else if (activate) {
vm_page_spin_lock(m);
_vm_page_add_queue_spinlocked(m, PQ_ACTIVE, 0);
_vm_page_add_queue_spinlocked(m,
PQ_ACTIVE + m->pc, 0);
_vm_page_and_queue_spin_unlock(m);
} else {
vm_page_spin_lock(m);
vm_page_flag_clear(m, PG_WINATCFLS);
_vm_page_add_queue_spinlocked(m, PQ_INACTIVE,
0);
_vm_page_add_queue_spinlocked(m,
PQ_INACTIVE + m->pc, 0);
++vm_swapcache_inactive_heuristic;
_vm_page_and_queue_spin_unlock(m);
}
Expand All @@ -1871,7 +1874,7 @@ _vm_page_deactivate_locked(vm_page_t m, int athead)
/*
* Ignore if already inactive.
*/
if (m->queue == PQ_INACTIVE)
if (m->queue - m->pc == PQ_INACTIVE)
return;
_vm_page_queue_spin_lock(m);
oqueue = _vm_page_rem_queue_spinlocked(m);
Expand All @@ -1880,7 +1883,7 @@ _vm_page_deactivate_locked(vm_page_t m, int athead)
if (oqueue == PQ_CACHE)
mycpu->gd_cnt.v_reactivated++;
vm_page_flag_clear(m, PG_WINATCFLS);
_vm_page_add_queue_spinlocked(m, PQ_INACTIVE, athead);
_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
if (athead == 0)
++vm_swapcache_inactive_heuristic;
}
Expand Down Expand Up @@ -2094,7 +2097,7 @@ vm_page_dontneed(vm_page_t m)
* occassionally leave the page alone
*/
if ((dnw & 0x01F0) == 0 ||
m->queue == PQ_INACTIVE ||
m->queue - m->pc == PQ_INACTIVE ||
m->queue - m->pc == PQ_CACHE
) {
if (m->act_count >= ACT_INIT)
Expand Down Expand Up @@ -2653,8 +2656,16 @@ DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
}
db_printf("\n");

db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
vm_page_queues[PQ_ACTIVE].lcnt,
vm_page_queues[PQ_INACTIVE].lcnt);
db_printf("PQ_ACTIVE:");
for(i=0;i<PQ_L2_SIZE;i++) {
db_printf(" %d", vm_page_queues[PQ_ACTIVE + i].lcnt);
}
db_printf("\n");

db_printf("PQ_INACTIVE:");
for(i=0;i<PQ_L2_SIZE;i++) {
db_printf(" %d", vm_page_queues[PQ_INACTIVE + i].lcnt);
}
db_printf("\n");
}
#endif /* DDB */
10 changes: 5 additions & 5 deletions sys/vm/vm_page.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,12 +258,12 @@ typedef struct vm_page *vm_page_t;
#endif

#define PQ_NONE 0
#define PQ_FREE 1
#define PQ_FREE (1 + 0*PQ_MAXL2_SIZE)
#define PQ_INACTIVE (1 + 1*PQ_MAXL2_SIZE)
#define PQ_ACTIVE (2 + 1*PQ_MAXL2_SIZE)
#define PQ_CACHE (3 + 1*PQ_MAXL2_SIZE)
#define PQ_HOLD (3 + 2*PQ_MAXL2_SIZE)
#define PQ_COUNT (4 + 2*PQ_MAXL2_SIZE)
#define PQ_ACTIVE (1 + 2*PQ_MAXL2_SIZE)
#define PQ_CACHE (1 + 3*PQ_MAXL2_SIZE)
#define PQ_HOLD (1 + 4*PQ_MAXL2_SIZE)
#define PQ_COUNT (1 + 5*PQ_MAXL2_SIZE)

/*
* Scan support
Expand Down
Loading

0 comments on commit 027193e

Please sign in to comment.