Permalink
Browse files

Support for NUMA interleave policy

Summary:
Feature: NUMA support

Credits for research and implementation: Jeremy Cole and Davi Arnaut

This patch provides startup options:
* flush-caches: Flush and purge buffers/caches
* numa-interleave: Run mysqld with its memory interleaved across all NUMA nodes

It also provides a config option:
* innodb_buffer_pool_populate: pre-allocate (pre-fault) the buffer pool
memory at startup:
  -- Use mmap(MAP_POPULATE) if supported (Linux 2.6.23 and higher)
  -- Otherwise, force pre-allocation by zero-filling the memory with memset

Test Plan: mtr

Reviewers: steaphan, pivanof

Reviewed By: pivanof

CC: MarkCallaghan, jtolmer, jeremycole, flamingcow, andrew-ford, pengt, CalvinSun

Differential Revision: https://reviews.facebook.net/D16965
  • Loading branch information...
1 parent 44ea8ea commit 5c030170c88b9d30165919e903eb6b5f54246b65 @inaam-rana inaam-rana committed Mar 18, 2014
@@ -0,0 +1,12 @@
+CALL mtr.add_suppression(".* Forcing preallocation by faulting in pages.");
+SELECT @@GLOBAL.innodb_buffer_pool_populate;
+@@GLOBAL.innodb_buffer_pool_populate
+1
+1 Expected
+SET @@GLOBAL.innodb_buffer_pool_populate=0;
+ERROR HY000: Variable 'innodb_buffer_pool_populate' is a read only variable
+Expected error 'Read only variable'
+SELECT @@GLOBAL.innodb_buffer_pool_populate;
+@@GLOBAL.innodb_buffer_pool_populate
+1
+1 Expected
@@ -0,0 +1 @@
+--innodb-buffer-pool-populate=true
@@ -0,0 +1,16 @@
+--source include/have_innodb.inc
+
+# When MAP_POPULATE cannot be used, the server prints a warning ending in
+# "Forcing preallocation by faulting in pages." and falls back to touching
+# the memory itself. That warning is expected here, so suppress it.
+CALL mtr.add_suppression(".* Forcing preallocation by faulting in pages.");
+
+# Display the current value of innodb_buffer_pool_populate. The test
+# expects 1, i.e. the server was started with the option enabled
+# (the variable is disabled by default).
+SELECT @@GLOBAL.innodb_buffer_pool_populate;
+--echo 1 Expected
+
+# The variable is read-only: changing it at runtime must be rejected.
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SET @@GLOBAL.innodb_buffer_pool_populate=0;
+--echo Expected error 'Read only variable'
+
+# The rejected SET must not have altered the value.
+SELECT @@GLOBAL.innodb_buffer_pool_populate;
+--echo 1 Expected
+
View
@@ -17,6 +17,8 @@ MYSQLD=
niceness=0
mysqld_ld_preload=
mysqld_ld_library_path=
+flush_caches=0
+numa_interleave=0
# Initial logging status: error log is not open, and not using syslog
logging=init
@@ -82,6 +84,9 @@ Usage: $0 [OPTIONS]
--syslog Log messages to syslog with 'logger'
--skip-syslog Log messages to error log (default)
--syslog-tag=TAG Pass -t "mysqld-TAG" to 'logger'
+ --flush-caches Flush and purge buffers/caches
+ --numa-interleave Run mysqld with its memory interleaved
+ on all CPUs
All other options are passed to the mysqld program.
@@ -227,6 +232,8 @@ parse_arguments() {
--skip-syslog) want_syslog=0 ;;
--syslog-tag=*) syslog_tag="$val" ;;
--timezone=*) TZ="$val"; export TZ; ;;
+ --flush-caches) flush_caches=1 ;;
+ --numa-interleave) numa_interleave=1 ;;
--help) usage ;;
@@ -739,6 +746,41 @@ mysqld daemon not started"
fi
fi
+#
+# Flush and purge buffers/caches.
+#
+
+if @TARGET_LINUX@ && test $flush_caches -eq 1
+then
+ # Locate sync, ensure it exists.
+ if ! my_which sync > /dev/null 2>&1
+ then
+ log_error "sync command not found, required for --flush-caches"
+ exit 1
+ # Flush file system buffers.
+ elif ! sync
+ then
+ # Huh, the sync() function is always successful...
+ log_error "sync failed, check if sync is properly installed"
+ fi
+
+ # Locate sysctl, ensure it exists.
+ if ! my_which sysctl > /dev/null 2>&1
+ then
+ log_error "sysctl command not found, required for --flush-caches"
+ exit 1
+ # Purge page cache, dentries and inodes.
+ elif ! sysctl -q -w vm.drop_caches=3
+ then
+ log_error "sysctl failed, check the error message for details"
+ exit 1
+ fi
+elif test $flush_caches -eq 1
+then
+ log_error "--flush-caches is not supported on this platform"
+ exit 1
+fi
+
#
# Uncomment the following lines if you want all tables to be automatically
# checked and repaired during startup. You should add sensible key_buffer
@@ -759,6 +801,31 @@ fi
cmd="`mysqld_ld_preload_text`$NOHUP_NICENESS"
+#
+# Set mysqld's memory interleave policy (--numa-interleave).
+#
+# When requested, mysqld is launched through "numactl --interleave=all".
+# numactl must be present and a trial invocation must succeed. If either
+# check fails, startup is aborted here, because the very same numactl
+# prefix is about to be added to the launch command.
+#
+
+if @TARGET_LINUX@ && test $numa_interleave -eq 1
+then
+  # Locate numactl, ensure it exists.
+  if ! my_which numactl > /dev/null 2>&1
+  then
+    log_error "numactl command not found, required for --numa-interleave"
+    exit 1
+  # Attempt to run a command, ensure it works.
+  elif ! numactl --interleave=all true
+  then
+    log_error "numactl failed, check if numactl is properly installed"
+    # Do not go on to launch mysqld through a numactl that just failed.
+    exit 1
+  fi
+
+  # Launch mysqld with numactl.
+  cmd="$cmd numactl --interleave=all"
+elif test $numa_interleave -eq 1
+then
+  log_error "--numa-interleave is not supported on this platform"
+  exit 1
+fi
+
for i in "$ledir/$MYSQLD" "$defaults" "--basedir=$MY_BASEDIR_VERSION" \
"--datadir=$DATADIR" "--plugin-dir=$plugin_dir" "$USER_OPTION"
do
@@ -1024,7 +1024,8 @@ buf_chunk_init(
/*===========*/
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
buf_chunk_t* chunk, /*!< out: chunk of buffers */
- ulint mem_size) /*!< in: requested size in bytes */
+ ulint mem_size, /*!< in: requested size in bytes */
+ ibool populate) /*!< in: virtual page preallocation */
{
buf_block_t* block;
byte* frame;
@@ -1038,7 +1039,7 @@ buf_chunk_init(
+ (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
chunk->mem_size = mem_size;
- chunk->mem = os_mem_alloc_large(&chunk->mem_size);
+ chunk->mem = os_mem_alloc_large(&chunk->mem_size, populate);
if (UNIV_UNLIKELY(chunk->mem == NULL)) {
@@ -1236,6 +1237,7 @@ buf_pool_init_instance(
/*===================*/
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
ulint buf_pool_size, /*!< in: size in bytes */
+ ibool populate, /*!< in: virtual page preallocation */
ulint instance_no) /*!< in: id of the instance */
{
ulint i;
@@ -1258,7 +1260,7 @@ buf_pool_init_instance(
UT_LIST_INIT(buf_pool->free);
- if (!buf_chunk_init(buf_pool, chunk, buf_pool_size)) {
+ if (!buf_chunk_init(buf_pool, chunk, buf_pool_size, populate)) {
mem_free(chunk);
mem_free(buf_pool);
@@ -1379,6 +1381,7 @@ dberr_t
buf_pool_init(
/*==========*/
ulint total_size, /*!< in: size of the total pool in bytes */
+ ibool populate, /*!< in: virtual page preallocation */
ulint n_instances) /*!< in: number of instances */
{
ulint i;
@@ -1394,7 +1397,7 @@ buf_pool_init(
for (i = 0; i < n_instances; i++) {
buf_pool_t* ptr = &buf_pool_ptr[i];
- if (buf_pool_init_instance(ptr, size, i) != DB_SUCCESS) {
+ if (buf_pool_init_instance(ptr, size, populate, i) != DB_SUCCESS) {
/* Free all the instances created so far. */
buf_pool_free(i);
@@ -15895,6 +15895,12 @@ static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size,
NULL, NULL, 120, 1, 127, 0);
#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
+/* innodb_buffer_pool_populate: read-only (PLUGIN_VAR_READONLY) boolean
+stored in srv_buf_pool_populate; the default is FALSE. */
+static MYSQL_SYSVAR_BOOL(buffer_pool_populate, srv_buf_pool_populate,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Preallocate (pre-fault) the page frames required for the mapping "
+ "established by the buffer pool memory region. Disabled by default.",
+ NULL, NULL, FALSE);
+
static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
@@ -16368,6 +16374,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(api_bk_commit_interval),
MYSQL_SYSVAR(autoextend_increment),
MYSQL_SYSVAR(buffer_pool_size),
+ MYSQL_SYSVAR(buffer_pool_populate),
MYSQL_SYSVAR(buffer_pool_instances),
MYSQL_SYSVAR(buffer_pool_filename),
MYSQL_SYSVAR(buffer_pool_dump_now),
@@ -229,6 +229,7 @@ dberr_t
buf_pool_init(
/*=========*/
ulint size, /*!< in: Size of the total pool in bytes */
+ ibool populate, /*!< in: Force virtual page preallocation */
ulint n_instances); /*!< in: Number of instances */
/********************************************************************//**
Frees the buffer pool at shutdown. This must not be invoked before
@@ -58,7 +58,8 @@ UNIV_INTERN
void*
os_mem_alloc_large(
/*===============*/
- ulint* n); /*!< in/out: number of bytes */
+ ulint* n, /*!< in/out: number of bytes */
+ ibool populate); /*!< in: virtual page preallocation */
/****************************************************************//**
Frees large pages memory. */
UNIV_INTERN
@@ -276,6 +276,7 @@ extern my_bool srv_use_sys_malloc;
extern ibool srv_use_sys_malloc;
#endif /* UNIV_HOTBACKUP */
extern ulint srv_buf_pool_size; /*!< requested size in bytes */
+extern my_bool srv_buf_pool_populate; /*!< virtual page preallocation */
extern ulint srv_buf_pool_instances; /*!< requested number of buffer pool instances */
extern ulong srv_n_page_hash_locks; /*!< number of locks to
protect buf_pool->page_hash */
@@ -32,6 +32,12 @@ Created 9/30/1995 Heikki Tuuri
#include "ut0mem.h"
#include "ut0byte.h"
+/* Linux release version */
+#if defined(UNIV_LINUX) && defined(_GNU_SOURCE)
+#include <string.h> /* strverscmp() */
+#include <sys/utsname.h> /* uname() */
+#endif
+
/* FreeBSD for example has only MAP_ANON, Linux has MAP_ANONYMOUS and
MAP_ANON but MAP_ANON is marked as deprecated */
#if defined(MAP_ANONYMOUS)
@@ -40,6 +46,13 @@ MAP_ANON but MAP_ANON is marked as deprecated */
#define OS_MAP_ANON MAP_ANON
#endif
+/* Linux's MAP_POPULATE */
+#if defined(MAP_POPULATE)
+#define OS_MAP_POPULATE MAP_POPULATE
+#else
+#define OS_MAP_POPULATE 0
+#endif
+
UNIV_INTERN ibool os_use_large_pages;
/* Large page size. This may be a boot-time option on some platforms */
UNIV_INTERN ulint os_large_page_size;
@@ -62,14 +75,32 @@ os_proc_get_number(void)
#endif
}
+/****************************************************************//**
+Checks whether the running operating system release is at least as new
+as the given one. The running release string is obtained with uname()
+and compared against the argument with strverscmp(). The check is only
+compiled in when both UNIV_LINUX and _GNU_SOURCE are defined; in every
+other build, or when uname() fails, the answer is FALSE.
+@return TRUE if the OS release is equal to, or later than release */
+UNIV_INTERN
+ibool
+os_compare_release(
+/*===============*/
+	const char*	release)	/*!< in: OS release to compare
+					against, e.g. "2.6.23" */
+{
+#if defined(UNIV_LINUX) && defined(_GNU_SOURCE)
+	struct utsname	sys_info;
+
+	if (uname(&sys_info) != 0) {
+		/* The running release is unknown. */
+		return(FALSE);
+	}
+
+	return(strverscmp(sys_info.release, release) >= 0);
+#else
+	return(FALSE);
+#endif
+}
+
/****************************************************************//**
Allocates large pages memory.
@return allocated memory */
UNIV_INTERN
void*
os_mem_alloc_large(
/*===============*/
- ulint* n) /*!< in/out: number of bytes */
+ ulint* n, /*!< in/out: number of bytes */
+ ibool populate) /*!< in: virtual page preallocation */
{
void* ptr;
ulint size;
@@ -155,7 +186,8 @@ os_mem_alloc_large(
ut_ad(ut_is_2pow(size));
size = *n = ut_2pow_round(*n + (size - 1), size);
ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | OS_MAP_ANON, -1, 0);
+ MAP_PRIVATE | OS_MAP_ANON |
+ (populate ? OS_MAP_POPULATE : 0), -1, 0);
if (UNIV_UNLIKELY(ptr == (void*) -1)) {
fprintf(stderr, "InnoDB: mmap(%lu bytes) failed;"
" errno %lu\n",
@@ -168,6 +200,25 @@ os_mem_alloc_large(
UNIV_MEM_ALLOC(ptr, size);
}
#endif
+
+#if OS_MAP_ANON && OS_MAP_POPULATE
+ /* MAP_POPULATE is only supported for private mappings
+ since Linux 2.6.23. */
+ populate = populate && !os_compare_release("2.6.23");
+
+ if (ptr && populate) {
+ fprintf(stderr, "InnoDB: Warning: mmap(MAP_POPULATE) "
+ "is not supported for private mappings. "
+ "Forcing preallocation by faulting in pages.\n");
+ }
+#endif
+
+ /* Initialize the entire buffer to force the allocation
+ of physical memory page frames. */
+ if (ptr && populate) {
+ memset(ptr, '\0', size);
+ }
+
return(ptr);
}
@@ -208,7 +208,7 @@ row_log_block_allocate(
DBUG_ENTER("row_log_block_allocate");
if (log_buf.block == NULL) {
log_buf.size = srv_sort_buf_size;
- log_buf.block = (byte*) os_mem_alloc_large(&log_buf.size);
+ log_buf.block = (byte*) os_mem_alloc_large(&log_buf.size, FALSE);
DBUG_EXECUTE_IF("simulate_row_log_allocation_failure",
if (log_buf.block)
os_mem_free_large(log_buf.block, log_buf.size);
@@ -3461,7 +3461,7 @@ row_merge_build_indexes(
block_size = 3 * srv_sort_buf_size;
block = static_cast<row_merge_block_t*>(
- os_mem_alloc_large(&block_size));
+ os_mem_alloc_large(&block_size, FALSE));
if (block == NULL) {
DBUG_RETURN(DB_OUT_OF_MEMORY);
@@ -215,6 +215,8 @@ UNIV_INTERN const byte* srv_latin1_ordering;
UNIV_INTERN my_bool srv_use_sys_malloc = TRUE;
/* requested size in kilobytes */
UNIV_INTERN ulint srv_buf_pool_size = ULINT_MAX;
+/* force virtual page preallocation (prefault) */
+UNIV_INTERN my_bool srv_buf_pool_populate = FALSE;
/* requested number of buffer pool instances */
UNIV_INTERN ulint srv_buf_pool_instances = 1;
/* number of locks to protect buf_pool->page_hash */
@@ -1902,7 +1902,8 @@ innobase_start_or_create_for_mysql(void)
ib_logf(IB_LOG_LEVEL_INFO,
"Initializing buffer pool, size = %.1f%c", size, unit);
- err = buf_pool_init(srv_buf_pool_size, srv_buf_pool_instances);
+ err = buf_pool_init(srv_buf_pool_size, srv_buf_pool_populate,
+ srv_buf_pool_instances);
if (err != DB_SUCCESS) {
ib_logf(IB_LOG_LEVEL_ERROR,

0 comments on commit 5c03017

Please sign in to comment.