Skip to content

Commit

Permalink
WT-5170 Use mmap for I/O on all data files (#5155)
Browse files Browse the repository at this point in the history
* Simplified the implementation so that it does not support file extension or truncation on mmapped files. Reads and writes that fall outside the mapped buffer fall back to the system call path.

* Added stats to measure how much I/O is done via mmap vs system calls

* Remap the mapped region if the file is extended via a write beyond the end of the file.

* Only mmap files that are of type data or log. Code refactoring. Minor bug fixes.

* Introduce remapping of the mmapped region when the file size is extended via the write system call. To avoid the overhead, remap only occasionally rather than every time we have the opportunity.

* Enable fh_extend and remap the region upon file extension.

* Enable I/O via mmap for all data files through a runtime option, mmap_all.

* Add low-level testing of mmap_all configuration at 5%.

Co-authored-by: Sasha Fedorova <sasha.fedorova@10gen.com>
Co-authored-by: Sasha Fedorova <sasha@mongodb.com>
Co-authored-by: Keith Bostic <keith.bostic@mongodb.com>
  • Loading branch information
4 people committed Mar 16, 2020
1 parent 1648fdd commit a168e98
Show file tree
Hide file tree
Showing 18 changed files with 933 additions and 479 deletions.
5 changes: 4 additions & 1 deletion dist/api_data.py
Expand Up @@ -988,7 +988,10 @@ def __ge__(self, other):
handle''',
min=15, undoc=True),
Config('mmap', 'true', r'''
Use memory mapping to access files when possible''',
Use memory mapping when accessing files in a read-only mode''',
type='boolean'),
Config('mmap_all', 'false', r'''
Use memory mapping to read and write all data files''',
type='boolean'),
Config('multiprocess', 'false', r'''
permit sharing between processes (will automatically start an
Expand Down
6 changes: 6 additions & 0 deletions dist/s_string.ok
Expand Up @@ -88,6 +88,7 @@ CreateFileMappingW
CreateFileW
Crummey
CustomersPhone
DAX
DECL
DECR
DESC
Expand Down Expand Up @@ -258,6 +259,7 @@ Memrata
Metadata
Mewhort
Mitzenmacher
Mmap
MongoDB
MoveFileExW
Multi
Expand Down Expand Up @@ -975,6 +977,7 @@ mT
madvise
majorp
malloc
mappable
marshall
marshalled
maxCLevel
Expand Down Expand Up @@ -1021,6 +1024,7 @@ mytxn
namespace
namespaces
nbits
nbsp
nchunks
nclr
nd
Expand Down Expand Up @@ -1339,6 +1343,8 @@ unistd
unlink
unlinked
unmap
unmapped
unmapping
unmarshall
unmarshalled
unmerged
Expand Down
6 changes: 6 additions & 0 deletions dist/stat_data.py
Expand Up @@ -193,11 +193,17 @@ def __init__(self, name, desc, flags=''):
##########################################
BlockStat('block_byte_map_read', 'mapped bytes read', 'size'),
BlockStat('block_byte_read', 'bytes read', 'size'),
BlockStat('block_byte_read_mmap', 'bytes read via memory map API', 'size'),
BlockStat('block_byte_read_syscall', 'bytes read via system call API', 'size'),
BlockStat('block_byte_write', 'bytes written', 'size'),
BlockStat('block_byte_write_checkpoint', 'bytes written for checkpoint', 'size'),
BlockStat('block_byte_write_mmap', 'bytes written via memory map API', 'size'),
BlockStat('block_byte_write_syscall', 'bytes written via system call API', 'size'),
BlockStat('block_map_read', 'mapped blocks read'),
BlockStat('block_preload', 'blocks pre-loaded'),
BlockStat('block_read', 'blocks read'),
BlockStat('block_remap_file_resize', 'number of times the file was remapped because it changed size via fallocate or truncate'),
BlockStat('block_remap_file_write', 'number of times the region was remapped via write'),
BlockStat('block_write', 'blocks written'),

##########################################
Expand Down
88 changes: 48 additions & 40 deletions src/config/config_def.c
Expand Up @@ -558,7 +558,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{"io_capacity", "category", NULL, NULL, confchk_wiredtiger_open_io_capacity_subconfigs, 1},
{"log", "category", NULL, NULL, confchk_wiredtiger_open_log_subconfigs, 9},
{"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2},
{"mmap", "boolean", NULL, NULL, NULL, 0}, {"multiprocess", "boolean", NULL, NULL, NULL, 0},
{"mmap", "boolean", NULL, NULL, NULL, 0}, {"mmap_all", "boolean", NULL, NULL, NULL, 0},
{"multiprocess", "boolean", NULL, NULL, NULL, 0},
{"operation_timeout_ms", "int", NULL, "min=1", NULL, 0},
{"operation_tracking", "category", NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2},
Expand Down Expand Up @@ -628,7 +629,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{"io_capacity", "category", NULL, NULL, confchk_wiredtiger_open_io_capacity_subconfigs, 1},
{"log", "category", NULL, NULL, confchk_wiredtiger_open_log_subconfigs, 9},
{"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2},
{"mmap", "boolean", NULL, NULL, NULL, 0}, {"multiprocess", "boolean", NULL, NULL, NULL, 0},
{"mmap", "boolean", NULL, NULL, NULL, 0}, {"mmap_all", "boolean", NULL, NULL, NULL, 0},
{"multiprocess", "boolean", NULL, NULL, NULL, 0},
{"operation_timeout_ms", "int", NULL, "min=1", NULL, 0},
{"operation_tracking", "category", NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2},
Expand Down Expand Up @@ -697,7 +699,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
{"io_capacity", "category", NULL, NULL, confchk_wiredtiger_open_io_capacity_subconfigs, 1},
{"log", "category", NULL, NULL, confchk_wiredtiger_open_log_subconfigs, 9},
{"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2},
{"mmap", "boolean", NULL, NULL, NULL, 0}, {"multiprocess", "boolean", NULL, NULL, NULL, 0},
{"mmap", "boolean", NULL, NULL, NULL, 0}, {"mmap_all", "boolean", NULL, NULL, NULL, 0},
{"multiprocess", "boolean", NULL, NULL, NULL, 0},
{"operation_timeout_ms", "int", NULL, "min=1", NULL, 0},
{"operation_tracking", "category", NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2},
Expand Down Expand Up @@ -764,7 +767,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
{"io_capacity", "category", NULL, NULL, confchk_wiredtiger_open_io_capacity_subconfigs, 1},
{"log", "category", NULL, NULL, confchk_wiredtiger_open_log_subconfigs, 9},
{"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2},
{"mmap", "boolean", NULL, NULL, NULL, 0}, {"multiprocess", "boolean", NULL, NULL, NULL, 0},
{"mmap", "boolean", NULL, NULL, NULL, 0}, {"mmap_all", "boolean", NULL, NULL, NULL, 0},
{"multiprocess", "boolean", NULL, NULL, NULL, 0},
{"operation_timeout_ms", "int", NULL, "min=1", NULL, 0},
{"operation_tracking", "category", NULL, NULL,
confchk_wiredtiger_open_operation_tracking_subconfigs, 2},
Expand Down Expand Up @@ -1011,16 +1015,17 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
",log=(archive=true,compressor=,enabled=false,file_max=100MB,"
"os_cache_dirty_pct=0,path=\".\",prealloc=true,recover=on,"
"zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4),"
"mmap=true,multiprocess=false,operation_timeout_ms=0,"
"operation_tracking=(enabled=false,path=\".\"),readonly=false,"
"salvage=false,session_max=100,session_scratch_max=2MB,"
"session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0,"
"reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
",method=fsync),use_environment=true,use_environment_priv=false,"
"mmap=true,mmap_all=false,multiprocess=false,"
"operation_timeout_ms=0,operation_tracking=(enabled=false,"
"path=\".\"),readonly=false,salvage=false,session_max=100,"
"session_scratch_max=2MB,session_table_cache=true,"
"shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
"statistics=none,statistics_log=(json=false,on_close=false,"
"path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"timing_stress_for_test=,transaction_sync=(enabled=false,"
"method=fsync),use_environment=true,use_environment_priv=false,"
"verbose=,write_through=",
confchk_wiredtiger_open, 51},
confchk_wiredtiger_open, 52},
{"wiredtiger_open_all",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
",builtin_extension_config=,cache_cursors=true,"
Expand All @@ -1042,16 +1047,17 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
",log=(archive=true,compressor=,enabled=false,file_max=100MB,"
"os_cache_dirty_pct=0,path=\".\",prealloc=true,recover=on,"
"zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4),"
"mmap=true,multiprocess=false,operation_timeout_ms=0,"
"operation_tracking=(enabled=false,path=\".\"),readonly=false,"
"salvage=false,session_max=100,session_scratch_max=2MB,"
"session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0,"
"reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
",method=fsync),use_environment=true,use_environment_priv=false,"
"mmap=true,mmap_all=false,multiprocess=false,"
"operation_timeout_ms=0,operation_tracking=(enabled=false,"
"path=\".\"),readonly=false,salvage=false,session_max=100,"
"session_scratch_max=2MB,session_table_cache=true,"
"shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
"statistics=none,statistics_log=(json=false,on_close=false,"
"path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"timing_stress_for_test=,transaction_sync=(enabled=false,"
"method=fsync),use_environment=true,use_environment_priv=false,"
"verbose=,version=(major=0,minor=0),write_through=",
confchk_wiredtiger_open_all, 52},
confchk_wiredtiger_open_all, 53},
{"wiredtiger_open_basecfg",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
",builtin_extension_config=,cache_cursors=true,"
Expand All @@ -1071,15 +1077,16 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
"log=(archive=true,compressor=,enabled=false,file_max=100MB,"
"os_cache_dirty_pct=0,path=\".\",prealloc=true,recover=on,"
"zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4),"
"mmap=true,multiprocess=false,operation_timeout_ms=0,"
"operation_tracking=(enabled=false,path=\".\"),readonly=false,"
"salvage=false,session_max=100,session_scratch_max=2MB,"
"session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0,"
"reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
",method=fsync),verbose=,version=(major=0,minor=0),write_through=",
confchk_wiredtiger_open_basecfg, 46},
"mmap=true,mmap_all=false,multiprocess=false,"
"operation_timeout_ms=0,operation_tracking=(enabled=false,"
"path=\".\"),readonly=false,salvage=false,session_max=100,"
"session_scratch_max=2MB,session_table_cache=true,"
"shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
"statistics=none,statistics_log=(json=false,on_close=false,"
"path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"timing_stress_for_test=,transaction_sync=(enabled=false,"
"method=fsync),verbose=,version=(major=0,minor=0),write_through=",
confchk_wiredtiger_open_basecfg, 47},
{"wiredtiger_open_usercfg",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
",builtin_extension_config=,cache_cursors=true,"
Expand All @@ -1099,15 +1106,16 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
"log=(archive=true,compressor=,enabled=false,file_max=100MB,"
"os_cache_dirty_pct=0,path=\".\",prealloc=true,recover=on,"
"zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4),"
"mmap=true,multiprocess=false,operation_timeout_ms=0,"
"operation_tracking=(enabled=false,path=\".\"),readonly=false,"
"salvage=false,session_max=100,session_scratch_max=2MB,"
"session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0,"
"reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
",method=fsync),verbose=,write_through=",
confchk_wiredtiger_open_usercfg, 45},
"mmap=true,mmap_all=false,multiprocess=false,"
"operation_timeout_ms=0,operation_tracking=(enabled=false,"
"path=\".\"),readonly=false,salvage=false,session_max=100,"
"session_scratch_max=2MB,session_table_cache=true,"
"shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
"statistics=none,statistics_log=(json=false,on_close=false,"
"path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"timing_stress_for_test=,transaction_sync=(enabled=false,"
"method=fsync),verbose=,write_through=",
confchk_wiredtiger_open_usercfg, 46},
{NULL, NULL, NULL, 0}};

int
Expand Down
3 changes: 3 additions & 0 deletions src/conn/conn_api.c
Expand Up @@ -2572,6 +2572,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval));
conn->mmap = cval.val != 0;

WT_ERR(__wt_config_gets(session, cfg, "mmap_all", &cval));
conn->mmap_all = cval.val != 0;

WT_ERR(__wt_config_gets(session, cfg, "operation_timeout_ms", &cval));
conn->operation_timeout_us = (uint64_t)(cval.val * WT_THOUSAND);

Expand Down
8 changes: 4 additions & 4 deletions src/include/connection.h
Expand Up @@ -35,7 +35,7 @@ extern WT_PROCESS __wt_process;

/*
* WT_KEYED_ENCRYPTOR --
* An list entry for an encryptor with a unique (name, keyid).
* A list entry for an encryptor with a unique (name, keyid).
*/
struct __wt_keyed_encryptor {
const char *keyid; /* Key id of encryptor */
Expand Down Expand Up @@ -150,8 +150,7 @@ struct __wt_named_extractor {

/*
* WT_CONN_HOTBACKUP_START --
* Macro to set connection data appropriately for when we commence hot
* backup.
* Macro to set connection data appropriately for when we commence hot backup.
*/
#define WT_CONN_HOTBACKUP_START(conn) \
do { \
Expand Down Expand Up @@ -443,7 +442,8 @@ struct __wt_connection_impl {
uint64_t direct_io; /* O_DIRECT, FILE_FLAG_NO_BUFFERING */
uint64_t write_through; /* FILE_FLAG_WRITE_THROUGH */

bool mmap; /* mmap configuration */
bool mmap; /* use mmap when reading checkpoints */
bool mmap_all; /* use mmap for all I/O on data files */
int page_size; /* OS page size for mmap alignment */

/* AUTOMATIC FLAG VALUE GENERATION START */
Expand Down
5 changes: 5 additions & 0 deletions src/include/extern_posix.h
Expand Up @@ -53,10 +53,15 @@ extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, ui
bool (*run_func)(WT_SESSION_IMPL *), bool *signalled);
extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp)
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_map_file(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session);
extern void __wt_prepare_remap_resize_file(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session);
extern void __wt_release_without_remap(WT_FILE_HANDLE *file_handle);
extern void __wt_remap_resize_file(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session);
extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds)
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_stream_set_line_buffer(FILE *fp)
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_stream_set_no_buffer(FILE *fp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_thread_id(uintmax_t *id) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_unmap_file(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session);
extern void __wt_yield(void) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
10 changes: 9 additions & 1 deletion src/include/os.h
Expand Up @@ -11,7 +11,7 @@
/* \
* A call returning 0 indicates success; any call where \
* 0 is not the only successful return must provide an \
* expression evaluating to 0 in all successful cases. \
* expression evaluating to 0 in all successful cases. \
* \
* XXX \
* Casting the call's return to int is because CentOS 7.3.1611 \
Expand Down Expand Up @@ -139,6 +139,14 @@ struct __wt_file_handle_posix {
int fd; /* POSIX file handle */

bool direct_io; /* O_DIRECT configured */

/* The memory buffer and variables if we use mmap for I/O */
uint8_t *mmap_buf;
bool mmap_file_mappable;
int mmap_prot;
volatile uint32_t mmap_resizing;
wt_off_t mmap_size;
volatile uint32_t mmap_usecount;
};
#endif

Expand Down
6 changes: 6 additions & 0 deletions src/include/stat.h
Expand Up @@ -322,10 +322,16 @@ struct __wt_connection_stats {
int64_t block_read;
int64_t block_write;
int64_t block_byte_read;
int64_t block_byte_read_mmap;
int64_t block_byte_read_syscall;
int64_t block_byte_write;
int64_t block_byte_write_checkpoint;
int64_t block_byte_write_mmap;
int64_t block_byte_write_syscall;
int64_t block_map_read;
int64_t block_byte_map_read;
int64_t block_remap_file_resize;
int64_t block_remap_file_write;
int64_t cache_read_app_count;
int64_t cache_read_app_time;
int64_t cache_write_app_count;
Expand Down

0 comments on commit a168e98

Please sign in to comment.