diff --git a/build_win/filelist.win b/build_win/filelist.win index 099451e418d..9d0ee10d305 100644 --- a/build_win/filelist.win +++ b/build_win/filelist.win @@ -45,6 +45,7 @@ src/btree/col_srch.c src/btree/row_key.c src/btree/row_modify.c src/btree/row_srch.c +src/cache/cache_las.c src/config/config.c src/config/config_api.c src/config/config_check.c diff --git a/dist/filelist b/dist/filelist index c3321cf845d..f33f0e9a962 100644 --- a/dist/filelist +++ b/dist/filelist @@ -45,6 +45,7 @@ src/btree/col_srch.c src/btree/row_key.c src/btree/row_modify.c src/btree/row_srch.c +src/cache/cache_las.c src/config/config.c src/config/config_api.c src/config/config_check.c diff --git a/dist/flags.py b/dist/flags.py index e3078dbf317..d98f249335e 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -45,9 +45,10 @@ 'READ_WONT_NEED', ], 'rec_write' : [ + 'EVICT_LOOKASIDE', 'EVICTING', - 'SKIP_UPDATE_ERR', - 'SKIP_UPDATE_RESTORE', + 'EVICT_UPDATE_RESTORE', + 'VISIBILITY_ERR', ], 'txn_log_checkpoint' : [ 'TXN_LOG_CKPT_CLEANUP', @@ -106,16 +107,17 @@ 'session' : [ 'SESSION_CAN_WAIT', 'SESSION_CLEAR_EVICT_WALK', + 'SESSION_INTERNAL', 'SESSION_LOCKED_CHECKPOINT', 'SESSION_LOCKED_HANDLE_LIST', 'SESSION_LOCKED_SCHEMA', 'SESSION_LOCKED_SLOT', 'SESSION_LOCKED_TABLE', - 'SESSION_INTERNAL', 'SESSION_LOGGING_INMEM', + 'SESSION_LOOKASIDE_CURSOR', 'SESSION_NO_CACHE', - 'SESSION_NO_CACHE_CHECK', 'SESSION_NO_DATA_HANDLES', + 'SESSION_NO_EVICTION', 'SESSION_NO_LOGGING', 'SESSION_NO_SCHEMA_LOCK', 'SESSION_QUIET_CORRUPT_FILE', diff --git a/dist/s_string.ok b/dist/s_string.ok index 69baa972d1e..4419662b9c4 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -157,11 +157,13 @@ KVS Kanowski's Kounavis LANGID +LAS LF LLLLLL LLLLLLL LOGREC LOGSCAN +LOOKASIDE LRU LRVv LSB @@ -177,6 +179,7 @@ Levyx Llqr Llqrt LockFile +Lookaside Lookup MALLOC MEM @@ -211,6 +214,7 @@ NetBSD NoAddr Noll Nul +OOB OPTYPE OUTBUFF OVFL @@ -232,6 +236,7 @@ Preload Prepend Qsort RCS +RECNO REF's REFs RET @@ -321,6 +326,7 @@ WiredTiger's WiredTigerCheckpoint WiredTigerException WiredTigerInit +WiredTigerLAS WiredTigerLog WiredTigerPreplog WiredTigerTmplog @@ -506,6 +512,7 @@ dlh dll dlopen dlsym +dmalloc dmsg doxgen doxygen @@ -515,6 +522,7 @@ dsk dsrc dst dstlen +dstrdup dsync dumpcmp dumpfile @@ -651,6 +659,7 @@ kvraw kvs kvsbdb lang +las latencies lbrace lbracket @@ -678,6 +687,7 @@ logread logrec logsize logtest +lookaside lookup lookups lossy @@ -951,6 +961,7 @@ unesc unescaped uninstantiated unistd +unlinked unmap unmarshall unmarshalled diff --git a/dist/stat_data.py b/dist/stat_data.py index 6408128246e..1ec456b4700 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -127,9 +127,9 @@ def __init__(self, name, desc, flags=''): AsyncStat('async_alloc_race', 'number of allocation state races'), AsyncStat('async_alloc_view', 'number of operation slots viewed for allocation'), + AsyncStat('async_cur_queue', 'current work queue length'), AsyncStat('async_flush', 'number of flush calls'), AsyncStat('async_full', 'number of times operation allocation failed'), - AsyncStat('async_cur_queue', 'current work queue length'), AsyncStat('async_max_queue', 'maximum work queue length', 'no_clear,no_scale'), AsyncStat('async_nowork', 'number of times worker found no work'), @@ -156,11 +156,11 @@ def __init__(self, name, desc, flags=''): ########################################## CacheStat('cache_bytes_dirty', 'tracked dirty bytes in the cache', 'no_clear,no_scale'), - CacheStat('cache_bytes_inuse', - 'bytes currently in the cache', 'no_clear,no_scale'), 
CacheStat('cache_bytes_internal', 'tracked bytes belonging to internal pages in the cache', 'no_clear,no_scale'), + CacheStat('cache_bytes_inuse', + 'bytes currently in the cache', 'no_clear,no_scale'), CacheStat('cache_bytes_leaf', 'tracked bytes belonging to leaf pages in the cache', 'no_clear,no_scale'), @@ -172,11 +172,11 @@ def __init__(self, name, desc, flags=''): CacheStat('cache_bytes_read', 'bytes read into cache'), CacheStat('cache_bytes_write', 'bytes written from cache'), CacheStat('cache_eviction_app', 'pages evicted by application threads'), + CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'), CacheStat('cache_eviction_clean', 'unmodified pages evicted'), CacheStat('cache_eviction_deepen', 'page split during eviction deepened the tree'), CacheStat('cache_eviction_dirty', 'modified pages evicted'), - CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'), CacheStat('cache_eviction_fail', 'pages selected for eviction unable to be evicted'), CacheStat('cache_eviction_force', @@ -204,13 +204,21 @@ def __init__(self, name, desc, flags=''): CacheStat('cache_eviction_worker_evicting', 'eviction worker thread evicting pages'), CacheStat('cache_inmem_split', 'in-memory page splits'), + CacheStat('cache_lookaside_insert', 'lookaside table insert calls'), + CacheStat('cache_lookaside_remove', 'lookaside table remove calls'), CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'), CacheStat('cache_pages_dirty', 'tracked dirty pages in the cache', 'no_clear,no_scale'), CacheStat('cache_pages_inuse', 'pages currently held in the cache', 'no_clear,no_scale'), CacheStat('cache_read', 'pages read into cache'), + CacheStat('cache_read_lookaside', + 'pages read into cache requiring lookaside entries'), CacheStat('cache_write', 'pages written from cache'), + CacheStat('cache_write_lookaside', + 'page written requiring lookaside records'), + CacheStat('cache_write_restore', + 'pages written requiring in-memory restoration'), ########################################## # Dhandle statistics @@ -236,8 +244,8 @@ def __init__(self, name, desc, flags=''): LogStat('log_compress_len', 'total size of compressed records'), LogStat('log_compress_mem', 'total in-memory size of compressed records'), LogStat('log_compress_small', 'log records too small to compress'), - LogStat('log_compress_writes', 'log records compressed'), LogStat('log_compress_write_fails', 'log records not compressed'), + LogStat('log_compress_writes', 'log records compressed'), LogStat('log_direct_writes', 'log records written directly'), LogStat('log_max_filesize', 'maximum log file size', 'no_clear,no_scale'), LogStat('log_prealloc_files', 'pre-allocated log files prepared'), @@ -248,18 +256,17 @@ def __init__(self, name, desc, flags=''): LogStat('log_scan_records', 'records processed by log scan'), LogStat('log_scan_rereads', 'log scan records requiring two reads'), LogStat('log_scans', 'log scan operations'), - LogStat('log_sync', 'log sync operations'), - LogStat('log_sync_dir', 'log sync_dir operations'), - LogStat('log_writes', 'log write operations'), - LogStat('log_write_lsn', 'log server thread advances write LSN'), - + LogStat('log_slot_closes', 'consolidated slot closures'), LogStat('log_slot_coalesced', 'written slots coalesced'), LogStat('log_slot_consolidated', 'logging bytes consolidated'), - LogStat('log_slot_closes', 'consolidated slot closures'), LogStat('log_slot_joins', 'consolidated slot joins'), LogStat('log_slot_races', 'consolidated slot join races'), 
LogStat('log_slot_toobig', 'record size exceeded maximum'), LogStat('log_slot_transitions', 'consolidated slot join transitions'), + LogStat('log_sync', 'log sync operations'), + LogStat('log_sync_dir', 'log sync_dir operations'), + LogStat('log_write_lsn', 'log server thread advances write LSN'), + LogStat('log_writes', 'log write operations'), ########################################## # Reconciliation statistics @@ -278,6 +285,8 @@ def __init__(self, name, desc, flags=''): TxnStat('txn_checkpoint', 'transaction checkpoints'), TxnStat('txn_checkpoint_generation', 'transaction checkpoint generation', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_running', + 'transaction checkpoint currently running', 'no_clear,no_scale'), TxnStat('txn_checkpoint_time_max', 'transaction checkpoint max time (msecs)', 'no_clear,no_scale'), TxnStat('txn_checkpoint_time_min', @@ -286,17 +295,16 @@ def __init__(self, name, desc, flags=''): 'transaction checkpoint most recent time (msecs)', 'no_clear,no_scale'), TxnStat('txn_checkpoint_time_total', 'transaction checkpoint total time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_running', - 'transaction checkpoint currently running', 'no_clear,no_scale'), + TxnStat('txn_commit', 'transactions committed'), + TxnStat('txn_fail_cache', + 'transaction failures due to cache overflow'), TxnStat('txn_pinned_checkpoint_range', 'transaction range of IDs currently pinned by a checkpoint', - 'no_clear,no_scale'), + 'no_clear,no_scale'), TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'), - TxnStat('txn_sync', 'transaction sync calls'), - TxnStat('txn_commit', 'transactions committed'), - TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'), TxnStat('txn_rollback', 'transactions rolled back'), + TxnStat('txn_sync', 'transaction sync calls'), ########################################## # LSM statistics @@ -433,9 +441,9 @@ def __init__(self, name, desc, flags=''): ########################################## # Block manager statistics ########################################## - BlockStat('block_alloc', 'blocks allocated'), BlockStat('allocation_size', 'file allocation unit size', 'no_aggregate,no_scale'), + BlockStat('block_alloc', 'blocks allocated'), BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale'), BlockStat('block_extension', 'allocations requiring file extension'), BlockStat('block_free', 'blocks freed'), @@ -465,17 +473,23 @@ def __init__(self, name, desc, flags=''): CacheStat('cache_overflow_value', 'overflow values cached in memory', 'no_scale'), CacheStat('cache_read', 'pages read into cache'), + CacheStat('cache_read_lookaside', + 'pages read into cache requiring lookaside entries'), CacheStat('cache_read_overflow', 'overflow pages read into cache'), CacheStat('cache_write', 'pages written from cache'), + CacheStat('cache_write_lookaside', + 'page written requiring lookaside records'), + CacheStat('cache_write_restore', + 'pages written requiring in-memory restoration'), ########################################## # Compression statistics ########################################## - CompressStat('compress_raw_ok', 'raw compression call succeeded'), CompressStat('compress_raw_fail', 'raw compression call failed, no additional data available'), CompressStat('compress_raw_fail_temporary', 'raw compression call failed, additional data available'), + CompressStat('compress_raw_ok', 'raw compression call succeeded'), CompressStat('compress_read', 'compressed pages read'), 
CompressStat('compress_write', 'compressed pages written'), CompressStat('compress_write_fail', 'page written failed to compress'), @@ -486,21 +500,21 @@ def __init__(self, name, desc, flags=''): # Reconciliation statistics ########################################## RecStat('rec_dictionary', 'dictionary matches'), + RecStat('rec_multiblock_internal', 'internal page multi-block writes'), + RecStat('rec_multiblock_leaf', 'leaf page multi-block writes'), + RecStat('rec_multiblock_max', + 'maximum blocks required for a page', 'max_aggregate,no_scale'), RecStat('rec_overflow_key_internal', 'internal-page overflow keys'), RecStat('rec_overflow_key_leaf', 'leaf-page overflow keys'), RecStat('rec_overflow_value', 'overflow values written'), - RecStat('rec_page_match', 'page checksum matches'), RecStat('rec_page_delete', 'pages deleted'), + RecStat('rec_page_match', 'page checksum matches'), RecStat('rec_pages', 'page reconciliation calls'), RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'), RecStat('rec_prefix_compression', 'leaf page key bytes discarded using prefix compression'), RecStat('rec_suffix_compression', 'internal page key bytes discarded using suffix compression'), - RecStat('rec_multiblock_internal', 'internal page multi-block writes'), - RecStat('rec_multiblock_leaf', 'leaf page multi-block writes'), - RecStat('rec_multiblock_max', - 'maximum blocks required for a page', 'max_aggregate,no_scale'), ########################################## # Transaction statistics diff --git a/examples/c/ex_log.c b/examples/c/ex_log.c index 875f73a8c24..d5a8f32487d 100644 --- a/examples/c/ex_log.c +++ b/examples/c/ex_log.c @@ -214,11 +214,13 @@ walk_log(WT_SESSION *session) /* * If the operation is a put, replay it here on the backup - * connection. Note, we cheat by looking only for fileid 1 - * in this example. The metadata is fileid 0. + * connection. + * + * !!! + * Minor cheat: the metadata is fileid 0, skip its records. */ - if (fileid == 1 && rectype == WT_LOGREC_COMMIT && - optype == WT_LOGOP_ROW_PUT) { + if (fileid != 0 && + rectype == WT_LOGREC_COMMIT && optype == WT_LOGOP_ROW_PUT) { if (!in_txn) { ret = session2->begin_transaction(session2, NULL); diff --git a/src/async/async_op.c b/src/async/async_op.c index 7e1920933c2..469dbc8e615 100644 --- a/src/async/async_op.c +++ b/src/async/async_op.c @@ -237,7 +237,7 @@ __async_op_init(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op, uint32_t id) asyncop->c.set_key = __wt_cursor_set_key; asyncop->c.get_value = __wt_cursor_get_value; asyncop->c.set_value = __wt_cursor_set_value; - asyncop->c.recno = 0; + asyncop->c.recno = WT_RECNO_OOB; memset(asyncop->c.raw_recno_buf, 0, sizeof(asyncop->c.raw_recno_buf)); memset(&asyncop->c.key, 0, sizeof(asyncop->c.key)); memset(&asyncop->c.value, 0, sizeof(asyncop->c.value)); diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index 18f8ca54601..79a52dbcaa3 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -53,12 +53,12 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp) } else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) { /* * The page's modification information can change underfoot if - * the page is being reconciled, lock the page down. + * the page is being reconciled, serialize with reconciliation. 
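The new flag acts as a tiny per-page lock built on the page's atomic flags word: compaction and reconciliation both set WT_PAGE_RECONCILIATION, so whichever thread sets it first serializes the other. A minimal sketch of the pattern, inferred from how F_CAS_ATOMIC and F_CLR_ATOMIC are used elsewhere in this change (the _WAIT variant is assumed to retry until it acquires the flag, and the __wt_yield call is only illustrative); this is not the macro's actual definition:

	int ret;

	for (;;) {
		/* Atomically set the flag; ret is zero only if we set it. */
		F_CAS_ATOMIC(page, WT_PAGE_RECONCILIATION, ret);
		if (ret == 0)
			break;
		/* Another thread owns the flag: spin politely and retry. */
		__wt_yield();
	}
	/* ... examine or update the page's modification information ... */
	F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);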
*/ - WT_PAGE_LOCK(session, page); + F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); ret = bm->compact_page_skip(bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp); - WT_PAGE_UNLOCK(session, page); + F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); WT_RET(ret); } return (0); @@ -73,14 +73,12 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_REF *ref; - int block_manager_begin, evict_reset, skip; + int block_manager_begin, skip; WT_UNUSED(cfg); - conn = S2C(session); btree = S2BT(session); bm = btree->bm; ref = NULL; @@ -118,25 +116,6 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) */ __wt_spin_lock(session, &btree->flush_lock); - /* - * That leaves eviction, we don't want to block eviction. Set a flag - * so reconciliation knows compaction is running. If reconciliation - * sees the flag it locks the page it's writing, we acquire the same - * lock when reading the page's modify information, serializing access. - * The same page lock blocks work on the page, but compaction is an - * uncommon, heavy-weight operation. If it's ever a problem, there's - * no reason we couldn't use an entirely separate lock than the page - * lock. - * - * We also need to ensure we don't race with an on-going reconciliation. - * After we set the flag, wait for eviction of this file to drain, and - * then let eviction continue; - */ - conn->compact_in_memory_pass = 1; - WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); - if (evict_reset) - __wt_evict_file_exclusive_off(session); - /* Start compaction. */ WT_ERR(bm->compact_start(bm, session)); block_manager_begin = 1; @@ -172,11 +151,7 @@ err: if (ref != NULL) if (block_manager_begin) WT_TRET(bm->compact_end(bm, session)); - /* - * Unlock will be a release barrier, use it to update the compaction - * status for reconciliation. - */ - conn->compact_in_memory_pass = 0; + /* Unblock threads writing leaf pages. */ __wt_spin_unlock(session, &btree->flush_lock); return (ret); diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 9f41e3ae684..ecb02941114 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -517,7 +517,7 @@ retry: WT_RET(__cursor_func_init(cbt, 1)); WT_ERR(__cursor_col_search(session, cbt, NULL)); if (F_ISSET(cursor, WT_CURSTD_APPEND)) - cbt->iface.recno = 0; + cbt->iface.recno = WT_RECNO_OOB; /* * If not overwriting, fail if the key exists. Creating a @@ -1152,6 +1152,19 @@ err: if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED)) return (ret); } +/* + * __wt_btcur_init -- + * Initialize an cursor used for internal purposes. + */ +void +__wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + memset(cbt, 0, sizeof(WT_CURSOR_BTREE)); + + cbt->iface.session = &session->iface; + cbt->btree = S2BT(session); +} + /* * __wt_btcur_open -- * Open a btree cursor. @@ -1168,14 +1181,22 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt) * Close a btree cursor. */ int -__wt_btcur_close(WT_CURSOR_BTREE *cbt) +__wt_btcur_close(WT_CURSOR_BTREE *cbt, int lowlevel) { WT_DECL_RET; WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)cbt->iface.session; - ret = __curfile_leave(cbt); + /* + * The in-memory split and lookaside table code creates low-level btree + * cursors to search/modify leaf pages. Those cursors don't hold hazard + * pointers, nor are they counted in the session handle's cursor count. + * Skip the usual cursor tear-down in that case. 
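Roughly, the two call paths for the new lowlevel argument, both visible later in this diff (a usage sketch, not additional code in the change):

	/*
	 * In-memory split and lookaside code: a stack-allocated, low-level
	 * cursor that holds no hazard pointer and isn't counted against the
	 * session's open cursors.
	 */
	WT_CURSOR_BTREE cbt;
	__wt_btcur_init(session, &cbt);
	__wt_btcur_open(&cbt);
	/* ... __wt_row_search / __wt_row_modify on a leaf page ... */
	WT_TRET(__wt_btcur_close(&cbt, 1));

	/* Application-level file cursors (cur_file.c) keep the full tear-down. */
	WT_TRET(__wt_btcur_close(cbt, 0));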
+ */ + if (!lowlevel) + ret = __curfile_leave(cbt); + __wt_buf_free(session, &cbt->_row_key); __wt_buf_free(session, &cbt->_tmp); diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index 77d80cdb3a2..38ef407e160 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -340,6 +340,8 @@ __wt_debug_disk( __dmsg(ds, ", empty-all"); if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE)) __dmsg(ds, ", empty-none"); + if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE)) + __dmsg(ds, ", LAS-update"); __dmsg(ds, ", generation %" PRIu64 "\n", dsk->write_gen); @@ -643,12 +645,10 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) __dmsg(ds, ", disk-mapped"); if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) __dmsg(ds, ", evict-lru"); - if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING)) - __dmsg(ds, ", scanning"); + if (F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION)) + __dmsg(ds, ", reconciliation"); if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) __dmsg(ds, ", split-insert"); - if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED)) - __dmsg(ds, ", split-locked"); if (mod != NULL) switch (F_ISSET(mod, WT_PM_REC_MASK)) { diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index cddfa0ef801..b0d8ce850ae 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -77,7 +77,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp) } (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1); - ret = __wt_evict_page(session, ref); + ret = __wt_evict(session, ref, 0); (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); WT_RET_BUSY_OK(ret); } diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index 060a93f543f..73e6affccd3 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -15,7 +15,6 @@ static void __free_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *); static void __free_skip_array(WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t); static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *); static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t); -static void __free_update_list(WT_SESSION_IMPL *, WT_UPDATE *); /* * __wt_ref_out -- @@ -56,7 +55,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); - WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED)); + WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION)); #ifdef HAVE_DIAGNOSTIC { @@ -160,8 +159,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_free(session, multi->key.ikey); break; } - __wt_free(session, multi->skip); - __wt_free(session, multi->skip_dsk); + __wt_free(session, multi->supd); + __wt_free(session, multi->supd_dsk); __wt_free(session, multi->addr.addr); } __wt_free(session, mod->mod_multi); @@ -235,10 +234,7 @@ __wt_free_ref( * it clean explicitly.) 
*/ if (free_pages && ref->page != NULL) { - if (ref->page->modify != NULL) { - ref->page->modify->write_gen = 0; - __wt_cache_dirty_decr(session, ref->page); - } + __wt_page_modify_clear(session, ref->page); __wt_page_out(session, &ref->page); } @@ -373,7 +369,7 @@ __free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins) WT_INSERT *next; for (; ins != NULL; ins = next) { - __free_update_list(session, ins->upd); + __wt_free_update_list(session, ins->upd); next = WT_SKIP_NEXT(ins); __wt_free(session, ins); } @@ -395,29 +391,23 @@ __free_update( */ for (updp = update_head; entries > 0; --entries, ++updp) if (*updp != NULL) - __free_update_list(session, *updp); + __wt_free_update_list(session, *updp); /* Free the update array. */ __wt_free(session, update_head); } /* - * __free_update_list -- + * __wt_free_update_list -- * Walk a WT_UPDATE forward-linked list and free the per-thread combination * of a WT_UPDATE structure and its associated data. */ -static void -__free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd) +void +__wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd) { WT_UPDATE *next; for (; upd != NULL; upd = next) { - /* Everything we free should be visible to everyone. */ - WT_ASSERT(session, - F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || - upd->txnid == WT_TXN_ABORTED || - __wt_txn_visible_all(session, upd->txnid)); - next = upd->next; __wt_free(session, upd); } diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 1b630e0c99f..6a4243a0fc7 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -255,27 +255,17 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) /* Page sizes */ WT_RET(__btree_page_sizes(session)); - /* - * Set special flags for the metadata file. - * Eviction; the metadata file is never evicted. - * Logging; the metadata file is always logged if possible. - */ - if (WT_IS_METADATA(btree->dhandle)) { + WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval)); + if (cval.val) F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); + else + F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); + + WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); + if (cval.val) F_CLR(btree, WT_BTREE_NO_LOGGING); - } else { - WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval)); - if (cval.val) - F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); - else - F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); - - WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); - if (cval.val) - F_CLR(btree, WT_BTREE_NO_LOGGING); - else - F_SET(btree, WT_BTREE_NO_LOGGING); - } + else + F_SET(btree, WT_BTREE_NO_LOGGING); /* Checksums */ WT_RET(__wt_config_gets(session, cfg, "checksum", &cval)); @@ -370,7 +360,7 @@ __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno) root_ref->page = root; root_ref->state = WT_REF_MEM; - root_ref->key.recno = is_recno ? 1 : 0; + root_ref->key.recno = is_recno ? 1 : WT_RECNO_OOB; root->pg_intl_parent_ref = root_ref; } diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 922dc2892b8..af31e5080d7 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -143,6 +143,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags * done on this file, we're done. 
*/ if (LF_ISSET(WT_READ_NO_EVICT) || + F_ISSET(session, WT_SESSION_NO_EVICTION) || F_ISSET(btree, WT_BTREE_NO_EVICTION)) goto skip_evict; diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index a3ce39b7758..34e32780471 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -8,6 +8,275 @@ #include "wt_internal.h" +/* + * __wt_las_remove_block -- + * Remove all records matching a key prefix from the lookaside store. + */ +int +__wt_las_remove_block(WT_SESSION_IMPL *session, + WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) +{ + WT_DECL_ITEM(las_addr); + WT_DECL_ITEM(las_key); + WT_DECL_RET; + uint64_t las_counter, las_txnid; + uint32_t las_id; + int exact; + + WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); + WT_ERR(__wt_scr_alloc(session, 0, &las_key)); + + /* + * Search for the block's unique prefix and step through all matching + * records, removing them. + */ + las_addr->data = addr; + las_addr->size = addr_size; + las_key->size = 0; + cursor->set_key( + cursor, btree_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); + if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) + ret = cursor->next(cursor); + for (; ret == 0; ret = cursor->next(cursor)) { + WT_ERR(cursor->get_key(cursor, + &las_id, las_addr, &las_txnid, &las_counter, las_key)); + + /* + * Confirm the search using the unique prefix; if not a match, + * we're done searching for records for this page. + */ + if (las_id != btree_id || + las_addr->size != addr_size || + memcmp(las_addr->data, addr, addr_size) != 0) + break; + + /* + * Cursor opened overwrite=true: won't return WT_NOTFOUND should + * another thread remove the record before we do, and the cursor + * remains positioned in that case. + */ + WT_ERR(cursor->remove(cursor)); + } + WT_ERR_NOTFOUND_OK(ret); + +err: __wt_scr_free(session, &las_addr); + __wt_scr_free(session, &las_key); + return (ret); +} + +/* + * __col_instantiate -- + * Update a column-store page entry based on a lookaside table update list. + */ +static int +__col_instantiate(WT_SESSION_IMPL *session, + uint64_t recno, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + /* Search the page and add updates. */ + WT_RET(__wt_col_search(session, recno, ref, cbt)); + WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, 0)); + return (0); +} + +/* + * __row_instantiate -- + * Update a row-store page entry based on a lookaside table update list. + */ +static int +__row_instantiate(WT_SESSION_IMPL *session, + WT_ITEM *key, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + /* Search the page and add updates. */ + WT_RET(__wt_row_search(session, key, ref, cbt, 1)); + WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, 0)); + return (0); +} + +/* + * __las_page_instantiate -- + * Instantiate lookaside update records in a recently read page. 
+ */ +static int +__las_page_instantiate(WT_SESSION_IMPL *session, + WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size) +{ + WT_CURSOR *cursor; + WT_CURSOR_BTREE cbt; + WT_DECL_ITEM(current_key); + WT_DECL_ITEM(las_addr); + WT_DECL_ITEM(las_key); + WT_DECL_ITEM(las_value); + WT_DECL_RET; + WT_PAGE *page; + WT_UPDATE *first_upd, *last_upd, *upd; + size_t incr, total_incr; + uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid; + uint32_t las_id, upd_size, session_flags; + int exact; + const uint8_t *p; + + cursor = NULL; + page = ref->page; + first_upd = last_upd = upd = NULL; + total_incr = 0; + current_recno = recno = WT_RECNO_OOB; + session_flags = 0; /* [-Werror=maybe-uninitialized] */ + + __wt_btcur_init(session, &cbt); + __wt_btcur_open(&cbt); + + WT_ERR(__wt_scr_alloc(session, 0, ¤t_key)); + WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); + WT_ERR(__wt_scr_alloc(session, 0, &las_key)); + WT_ERR(__wt_scr_alloc(session, 0, &las_value)); + + /* Open a lookaside table cursor. */ + WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); + + /* + * The lookaside records are in key and update order, that is, there + * will be a set of in-order updates for a key, then another set of + * in-order updates for a subsequent key. We process all of the updates + * for a key and then insert those updates into the page, then all the + * updates for the next key, and so on. + * + * Search for the block's unique prefix, stepping through any matching + * records. + */ + las_addr->data = addr; + las_addr->size = addr_size; + las_key->size = 0; + cursor->set_key( + cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); + if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) + ret = cursor->next(cursor); + for (; ret == 0; ret = cursor->next(cursor)) { + WT_ERR(cursor->get_key(cursor, + &las_id, las_addr, &las_txnid, &las_counter, las_key)); + + /* + * Confirm the search using the unique prefix; if not a match, + * we're done searching for records for this page. + */ + if (las_id != read_id || + las_addr->size != addr_size || + memcmp(las_addr->data, addr, addr_size) != 0) + break; + + /* + * If the on-page value has become globally visible, this record + * is no longer needed. + */ + if (__wt_txn_visible_all(session, las_txnid)) + continue; + + /* Allocate the WT_UPDATE structure. */ + WT_ERR(cursor->get_value( + cursor, &upd_txnid, &upd_size, las_value)); + WT_ERR(__wt_update_alloc(session, + (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value, + &upd, &incr)); + total_incr += incr; + upd->txnid = upd_txnid; + + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + p = las_key->data; + WT_ERR(__wt_vunpack_uint(&p, 0, &recno)); + if (current_recno == recno) + break; + + if (first_upd != NULL) { + WT_ERR(__col_instantiate(session, + current_recno, ref, &cbt, first_upd)); + first_upd = NULL; + } + current_recno = recno; + break; + case WT_PAGE_ROW_LEAF: + if (current_key->size == las_key->size && + memcmp(current_key->data, + las_key->data, las_key->size) == 0) + break; + + if (first_upd != NULL) { + WT_ERR(__row_instantiate(session, + current_key, ref, &cbt, first_upd)); + first_upd = NULL; + } + WT_ERR(__wt_buf_set(session, + current_key, las_key->data, las_key->size)); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Append the latest update to the list. 
*/ + if (first_upd == NULL) + first_upd = last_upd = upd; + else { + last_upd->next = upd; + last_upd = upd; + } + upd = NULL; + } + WT_ERR_NOTFOUND_OK(ret); + + /* Insert the last set of updates, if any. */ + if (first_upd != NULL) + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + WT_ERR(__col_instantiate(session, + current_recno, ref, &cbt, first_upd)); + first_upd = NULL; + break; + case WT_PAGE_ROW_LEAF: + WT_ERR(__row_instantiate(session, + current_key, ref, &cbt, first_upd)); + first_upd = NULL; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Discard the cursor. */ + WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags)); + + if (total_incr != 0) { + __wt_cache_page_inmem_incr(session, page, total_incr); + + /* + * We've modified/dirtied the page, but that's not necessary and + * if we keep the page clean, it's easier to evict. We leave the + * lookaside table updates in place, so if we evict this page + * without dirtying it, any future instantiation of it will find + * the records it needs. If the page is dirtied before eviction, + * then we'll write any needed lookaside table records for the + * new location of the page. + */ + __wt_page_modify_clear(session, page); + } + +err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + WT_TRET(__wt_btcur_close(&cbt, 1)); + + /* + * On error, upd points to a single unlinked WT_UPDATE structure, + * first_upd points to a list. + */ + if (upd != NULL) + __wt_free(session, upd); + if (first_upd != NULL) + __wt_free_update_list(session, first_upd); + + __wt_scr_free(session, ¤t_key); + __wt_scr_free(session, &las_addr); + __wt_scr_free(session, &las_key); + __wt_scr_free(session, &las_value); + + return (ret); +} + /* * __wt_cache_read -- * Read a page from the file. @@ -15,6 +284,8 @@ int __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) { + const WT_PAGE_HEADER *dsk; + WT_BTREE *btree; WT_DECL_RET; WT_ITEM tmp; WT_PAGE *page; @@ -22,6 +293,7 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) uint32_t previous_state; const uint8_t *addr; + btree = S2BT(session); page = NULL; /* @@ -69,12 +341,29 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) /* If the page was deleted, instantiate that information. */ if (previous_state == WT_REF_DELETED) WT_ERR(__wt_delete_page_instantiate(session, ref)); + + /* + * Instantiate updates from the database's lookaside table. The + * flag might have been set a long time ago, and we only care + * if the lookaside table is currently active, check that before + * doing any work. + */ + dsk = tmp.data; + if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && + __wt_las_is_written(session)) { + WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside); + WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside); + + WT_ERR(__las_page_instantiate( + session, ref, btree->id, addr, addr_size)); + } } WT_ERR(__wt_verbose(session, WT_VERB_READ, "page %p: %s", page, __wt_page_type_string(page->type))); WT_PUBLISH(ref->state, WT_REF_MEM); + return (0); err: /* diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 22d4948e07d..0555374f13d 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1305,7 +1305,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR)); /* Reset the page. 
*/ page->pg_var_d = save_col_var; @@ -2011,7 +2011,7 @@ __slvg_row_build_leaf( /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR)); /* Reset the page. */ page->pg_row_entries += skip_stop; diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index a63eadcaeab..58d90c70c51 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -343,7 +343,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) switch (page->type) { case WT_PAGE_COL_INT: - recno = 0; + recno = 0; /* Less than any valid record number. */ WT_INTL_FOREACH_BEGIN(session, page, ref) { WT_ASSERT(session, ref->key.recno > recno); recno = ref->key.recno; @@ -684,13 +684,11 @@ __split_multi_inmem( WT_DECL_RET; WT_PAGE *page; WT_UPDATE *upd; - WT_UPD_SKIPPED *skip; + WT_SAVE_UPD *supd; uint64_t recno; uint32_t i, slot; - WT_CLEAR(cbt); - cbt.iface.session = &session->iface; - cbt.btree = S2BT(session); + __wt_btcur_init(session, &cbt); __wt_btcur_open(&cbt); /* @@ -704,22 +702,22 @@ __split_multi_inmem( * allocated page on error, when discarding the allocated WT_REF. */ WT_RET(__wt_page_inmem(session, ref, - multi->skip_dsk, ((WT_PAGE_HEADER *)multi->skip_dsk)->mem_size, + multi->supd_dsk, ((WT_PAGE_HEADER *)multi->supd_dsk)->mem_size, WT_PAGE_DISK_ALLOC, &page)); - multi->skip_dsk = NULL; + multi->supd_dsk = NULL; if (orig->type == WT_PAGE_ROW_LEAF) WT_RET(__wt_scr_alloc(session, 0, &key)); /* Re-create each modification we couldn't write. */ - for (i = 0, skip = multi->skip; i < multi->skip_entries; ++i, ++skip) + for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) switch (orig->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: /* Build a key. */ - upd = skip->ins->upd; - skip->ins->upd = NULL; - recno = WT_INSERT_RECNO(skip->ins); + upd = supd->ins->upd; + supd->ins->upd = NULL; + recno = WT_INSERT_RECNO(supd->ins); /* Search the page. */ WT_ERR(__wt_col_search(session, recno, ref, &cbt)); @@ -730,19 +728,19 @@ __split_multi_inmem( break; case WT_PAGE_ROW_LEAF: /* Build a key. */ - if (skip->ins == NULL) { - slot = WT_ROW_SLOT(orig, skip->rip); + if (supd->ins == NULL) { + slot = WT_ROW_SLOT(orig, supd->rip); upd = orig->pg_row_upd[slot]; orig->pg_row_upd[slot] = NULL; WT_ERR(__wt_row_leaf_key( - session, orig, skip->rip, key, 0)); + session, orig, supd->rip, key, 0)); } else { - upd = skip->ins->upd; - skip->ins->upd = NULL; + upd = supd->ins->upd; + supd->ins->upd = NULL; - key->data = WT_INSERT_KEY(skip->ins); - key->size = WT_INSERT_KEY_SIZE(skip->ins); + key->data = WT_INSERT_KEY(supd->ins); + key->size = WT_INSERT_KEY_SIZE(supd->ins); } /* Search the page. */ @@ -765,7 +763,7 @@ __split_multi_inmem( page->modify->first_dirty_txn = WT_TXN_FIRST; err: /* Free any resources that may have been cached in the cursor. */ - WT_TRET(__wt_btcur_close(&cbt)); + WT_TRET(__wt_btcur_close(&cbt, 1)); __wt_scr_free(session, &key); return (ret); @@ -801,7 +799,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, */ ref->home = NULL; - if (multi->skip == NULL) { + if (multi->supd == NULL) { /* * Copy the address: we could simply take the buffer, but that * would complicate error handling, freeing the reference array @@ -830,7 +828,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, break; } - ref->state = multi->skip == NULL ? WT_REF_DISK : WT_REF_MEM; + ref->state = multi->supd == NULL ? 
WT_REF_DISK : WT_REF_MEM; /* * If our caller wants to track the memory allocations, we have a return @@ -878,25 +876,35 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * memory inside of the lock and may want to invest effort in making the * locked period shorter. * - * We could race with another thread deepening our parent. To deal - * with that, read the parent pointer each time we try to lock it, and - * check that it's still correct after it is locked. + * We could race with another thread deepening our parent. To deal with + * that, read the parent pointer each time we try to lock it, and check + * that it's still correct after it is locked. + * + * We use the reconciliation lock here because not only do we have to + * single-thread the split, we have to lock out reconciliation of the + * parent because reconciliation of the parent can't deal with finding + * a split child during internal page traversal. Basically, there's no + * reason to use a different lock if we have to block reconciliation + * anyway. */ for (;;) { parent = ref->home; - F_CAS_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED, ret); + F_CAS_ATOMIC(parent, WT_PAGE_RECONCILIATION, ret); if (ret == 0) { if (parent == ref->home) break; - F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED); + F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION); continue; } /* - * If we're attempting an in-memory split and we can't lock the - * parent while there is a checkpoint in progress, give up. - * This avoids an infinite loop where we are trying to split a - * page while its parent is being checkpointed. + * A checkpoint reconciling this parent page can deadlock with + * our in-memory split. We have an exclusive page lock on the + * child before we acquire the page's reconciliation lock, and + * reconciliation acquires the page's reconciliation lock before + * it will encounter the child's exclusive lock (which causes + * reconciliation to loop until the exclusive lock is resolved). + * If we can't lock the parent, give up to avoid that deadlock. */ if (LF_ISSET(WT_SPLIT_INMEM) && S2BT(session)->checkpointing) return (EBUSY); @@ -1131,7 +1139,7 @@ err: if (!complete) if (next_ref->state == WT_REF_SPLIT) next_ref->state = WT_REF_DELETED; } - F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED); + F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION); if (hazard) WT_TRET(__wt_hazard_clear(session, parent)); @@ -1198,7 +1206,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * The key-instantiation code checks for races, clear the key fields so * we don't trigger them. */ - child->key.recno = 0; + child->key.recno = WT_RECNO_OOB; child->key.ikey = NULL; child->state = WT_REF_MEM; @@ -1390,8 +1398,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * We marked the new page dirty; we're going to discard it, but * first mark it clean and fix up the cache statistics. */ - right->modify->write_gen = 0; - __wt_cache_dirty_decr(session, right); + __wt_page_modify_clear(session, right); WT_ERR(ret); } @@ -1448,8 +1455,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) * Pages with unresolved changes are not marked clean during * reconciliation, do it now. */ - mod->write_gen = 0; - __wt_cache_dirty_decr(session, page); + __wt_page_modify_clear(session, page); __wt_ref_out(session, ref); /* Swap the new page into place. */ @@ -1506,10 +1512,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) * Pages with unresolved changes are not marked clean during * reconciliation, do it now. 
*/ - if (__wt_page_is_modified(page)) { - mod->write_gen = 0; - __wt_cache_dirty_decr(session, page); - } + __wt_page_modify_clear(session, page); __wt_page_out(session, &page); return (0); diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c index 095e439786c..38396facc3d 100644 --- a/src/btree/bt_vrfy_dsk.c +++ b/src/btree/bt_vrfy_dsk.c @@ -71,19 +71,20 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: - if (dsk->recno != 0) + if (dsk->recno != WT_RECNO_OOB) break; WT_RET_VRFY(session, - "%s page at %s has a record number of zero", - __wt_page_type_string(dsk->type), tag); + "%s page at %s has an invalid record number of %d", + __wt_page_type_string(dsk->type), tag, WT_RECNO_OOB); case WT_PAGE_BLOCK_MANAGER: case WT_PAGE_OVFL: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - if (dsk->recno == 0) + if (dsk->recno == WT_RECNO_OOB) break; WT_RET_VRFY(session, - "%s page at %s has a non-zero record number", + "%s page at %s has a record number, which is illegal for " + "this page type", __wt_page_type_string(dsk->type), tag); } @@ -91,8 +92,6 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, flags = dsk->flags; if (LF_ISSET(WT_PAGE_COMPRESSED)) LF_CLR(WT_PAGE_COMPRESSED); - if (LF_ISSET(WT_PAGE_ENCRYPTED)) - LF_CLR(WT_PAGE_ENCRYPTED); if (dsk->type == WT_PAGE_ROW_LEAF) { if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) && LF_ISSET(WT_PAGE_EMPTY_V_NONE)) @@ -105,6 +104,10 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, if (LF_ISSET(WT_PAGE_EMPTY_V_NONE)) LF_CLR(WT_PAGE_EMPTY_V_NONE); } + if (LF_ISSET(WT_PAGE_ENCRYPTED)) + LF_CLR(WT_PAGE_ENCRYPTED); + if (LF_ISSET(WT_PAGE_LAS_UPDATE)) + LF_CLR(WT_PAGE_LAS_UPDATE); if (flags != 0) WT_RET_VRFY(session, "page at %s has invalid flags set: 0x%" PRIx8, diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c index fb7c9a1ce90..f98c62830e5 100644 --- a/src/btree/col_modify.c +++ b/src/btree/col_modify.c @@ -48,10 +48,10 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * There's some chance the application specified a record past * the last record on the page. If that's the case, and we're * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the - * append list, not the update list. In addition, a recno of 0 + * append list, not the update list. Also, an out-of-band recno * implies an append operation, we're allocating a new row. */ - if (recno == 0 || + if (recno == WT_RECNO_OOB || recno > (btree->type == BTREE_COL_VAR ? __col_var_last_recno(page) : __col_fix_last_recno(page))) append = 1; @@ -160,7 +160,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ - if (cbt->ins_stack[0] == NULL || recno == 0) + if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index 49a749b8a02..888c54d1ec9 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -112,6 +112,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * there should only be one update list per key. 
*/ WT_ASSERT(session, *upd_entry == NULL); + /* * Set the "old" entry to the second update in the list * so that the serialization function succeeds in diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c new file mode 100644 index 00000000000..155d806f33b --- /dev/null +++ b/src/cache/cache_las.c @@ -0,0 +1,393 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_las_stats_update -- + * Update the lookaside table statistics for return to the application. + */ +void +__wt_las_stats_update(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_CONNECTION_STATS **cstats; + WT_DSRC_STATS **dstats; + + conn = S2C(session); + + /* + * Lookaside table statistics are copied from the underlying lookaside + * table data-source statistics. If there's no lookaside table, values + * remain 0. In the current system, there's always a lookaside table, + * but there's no reason not to be cautious. + */ + if (conn->las_cursor == NULL) + return; + + /* + * We have a cursor, and we need the underlying data handle; we can get + * to it by way of the underlying btree handle, but it's a little ugly. + */ + cstats = conn->stats; + dstats = ((WT_CURSOR_BTREE *)conn->las_cursor)->btree->dhandle->stats; + + WT_STAT_SET(session, cstats, + cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert)); + WT_STAT_SET(session, cstats, + cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove)); +} + +/* + * __las_cursor_create -- + * Open a new lookaside table cursor. + */ +static int +__las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) +{ + WT_BTREE *btree; + const char *open_cursor_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL }; + + WT_RET(__wt_open_cursor( + session, WT_LAS_URI, NULL, open_cursor_cfg, cursorp)); + + /* + * Set special flags for the lookaside table: the lookaside flag (used, + * for example, to avoid writing records during reconciliation), also + * turn off checkpoints and logging. + * + * Test flags before setting them so updates can't race in subsequent + * opens (the first update is safe because it's single-threaded from + * wiredtiger_open). + */ + btree = S2BT(session); + if (!F_ISSET(btree, WT_BTREE_LOOKASIDE)) + F_SET(btree, WT_BTREE_LOOKASIDE); + if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) + F_SET(btree, WT_BTREE_NO_CHECKPOINT); + if (!F_ISSET(btree, WT_BTREE_NO_LOGGING)) + F_SET(btree, WT_BTREE_NO_LOGGING); + + return (0); +} + +/* + * __wt_las_create -- + * Initialize the database's lookaside store. + */ +int +__wt_las_create(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + const char *drop_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL }; + + conn = S2C(session); + + /* + * Done at startup: we cannot do it on demand because we require the + * schema lock to create and drop the file, and it may not always be + * available. + * + * Open an internal session, used for the shared lookaside cursor. + * + * Sessions associated with a lookaside cursor should never be tapped + * for eviction. + */ + WT_RET(__wt_open_internal_session( + conn, "lookaside table", 1, 1, &conn->las_session)); + session = conn->las_session; + F_SET(session, WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION); + + /* Discard any previous incarnation of the file. 
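Taken together, the cursor calls in this change imply the lookaside record layout: the key is (btree ID, block address, on-page record's transaction ID, counter, record key) and the value is (update transaction ID, update size, update value). A minimal sketch of positioning a cursor at the first record for one block, assuming a cursor already returned by __wt_las_cursor; it mirrors the search-near loops in __wt_las_remove_block and __las_page_instantiate:

	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	int exact;

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	las_addr->data = addr;		/* The block's address cookie. */
	las_addr->size = addr_size;
	las_key->size = 0;		/* Empty key: sorts before any real key. */
	cursor->set_key(
	    cursor, btree_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
	if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
		ret = cursor->next(cursor);
	/* Step forward while the (btree ID, address) prefix still matches. */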
*/ + WT_RET(__wt_session_drop(session, WT_LAS_URI, drop_cfg)); + + /* Re-create the file. */ + WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT)); + + /* Open the shared cursor. */ + WT_WITHOUT_DHANDLE(session, + ret = __las_cursor_create(session, &conn->las_cursor)); + + return (ret); +} + +/* + * __wt_las_destroy -- + * Destroy the database's lookaside store. + */ +int +__wt_las_destroy(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION *wt_session; + + conn = S2C(session); + + if (conn->las_session == NULL) + return (0); + + wt_session = &conn->las_session->iface; + ret = wt_session->close(wt_session, NULL); + + conn->las_cursor = NULL; + conn->las_session = NULL; + + return (ret); +} + +/* + * __wt_las_set_written -- + * Flag that the lookaside table has been written. + */ +void +__wt_las_set_written(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + if (conn->las_written == 0) { + conn->las_written = 1; + + /* + * Push the flag: unnecessary, but from now page reads must deal + * with lookaside table records, and we only do the write once. + */ + WT_FULL_BARRIER(); + } +} + +/* + * __wt_las_is_written -- + * Return if the lookaside table has been written. + */ +int +__wt_las_is_written(WT_SESSION_IMPL *session) +{ + return (S2C(session)->las_written); +} + +/* + * __wt_las_cursor -- + * Return a lookaside cursor. + */ +int +__wt_las_cursor( + WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + *cursorp = NULL; + + /* + * We don't want to get tapped for eviction after we start using the + * lookaside cursor; save a copy of the current eviction state, we'll + * turn eviction off before we return. + * + * Don't cache lookaside table pages, we're here because of eviction + * problems and there's no reason to believe lookaside pages will be + * useful more than once. + */ + *session_flags = + F_ISSET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + + conn = S2C(session); + + /* Eviction and sweep threads have their own lookaside table cursors. */ + if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) { + if (session->las_cursor == NULL) { + WT_WITHOUT_DHANDLE(session, ret = + __las_cursor_create(session, &session->las_cursor)); + WT_RET(ret); + } + + *cursorp = session->las_cursor; + } else { + /* Lock the shared lookaside cursor. */ + __wt_spin_lock(session, &conn->las_lock); + + *cursorp = conn->las_cursor; + } + + /* Turn caching and eviction off. */ + F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + + return (0); +} + +/* + * __wt_las_cursor_close -- + * Discard a lookaside cursor. + */ +int +__wt_las_cursor_close( + WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) +{ + WT_CONNECTION_IMPL *conn; + WT_CURSOR *cursor; + WT_DECL_RET; + + conn = S2C(session); + + if ((cursor = *cursorp) == NULL) + return (0); + *cursorp = NULL; + + /* Reset the cursor. */ + ret = cursor->reset(cursor); + + /* + * We turned off caching and eviction while the lookaside cursor was in + * use, restore the session's flags. + */ + F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + F_SET(session, session_flags); + + /* + * Eviction and sweep threads have their own lookaside table cursors; + * else, unlock the shared lookaside cursor. 
+ */ + if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) + __wt_spin_unlock(session, &conn->las_lock); + + return (ret); +} + +/* + * __wt_las_sweep -- + * Sweep the lookaside table. + */ +int +__wt_las_sweep(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_CURSOR *cursor; + WT_DECL_ITEM(las_addr); + WT_DECL_ITEM(las_key); + WT_DECL_RET; + WT_ITEM *key; + uint64_t cnt, las_counter, las_txnid; + uint32_t las_id, session_flags; + int notused; + + conn = S2C(session); + cursor = NULL; + key = &conn->las_sweep_key; + session_flags = 0; /* [-Werror=maybe-uninitialized] */ + + WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); + WT_ERR(__wt_scr_alloc(session, 0, &las_key)); + + WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); + + /* + * If we're not starting a new sweep, position the cursor using the key + * from the last call (we don't care if we're before or after the key, + * just roughly in the same spot is fine). + */ + if (conn->las_sweep_call != 0 && key->data != NULL) { + __wt_cursor_set_raw_key(cursor, key); + if ((ret = + cursor->search_near(cursor, ¬used)) == WT_NOTFOUND) { + WT_ERR(cursor->reset(cursor)); + return (0); + } + WT_ERR(ret); + } + + /* + * The sweep server wakes up every 10 seconds (by default), it's a slow + * moving thread. Try to review the entire lookaside table once every 5 + * minutes, or every 30 calls. + * + * The reason is because the lookaside table exists because we're seeing + * cache/eviction pressure (it allows us to trade performance and disk + * space for cache space), and it's likely lookaside blocks are being + * evicted, and reading them back in doesn't help things. A trickier, + * but possibly better, alternative might be to review all lookaside + * blocks in the cache in order to get rid of them, and slowly review + * lookaside blocks that have already been evicted. + * + * We can't know for sure how many records are in the lookaside table, + * the cursor insert and remove statistics aren't updated atomically. + * Start with reviewing 100 rows, and if it takes more than the target + * number of calls to finish, increase the number of rows checked on + * each call; if it takes less than the target calls to finish, then + * decrease the number of rows reviewed on each call (but never less + * than 100). + */ +#define WT_SWEEP_LOOKASIDE_MIN_CNT 100 +#define WT_SWEEP_LOOKASIDE_PASS_TARGET 30 + ++conn->las_sweep_call; + if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT) + cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT; + + /* Walk the file. */ + for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) { + /* + * If the loop terminates after completing a work unit, we will + * continue the table sweep next time. Get a local copy of the + * sweep key, we're going to reset the cursor; do so before + * calling cursor.remove, cursor.remove can discard our hazard + * pointer and the page could be evicted from underneath us. + */ + if (cnt == 1) { + WT_ERR(__wt_cursor_get_raw_key(cursor, key)); + if (!WT_DATA_IN_ITEM(key)) + WT_ERR(__wt_buf_set( + session, key, key->data, key->size)); + } + + WT_ERR(cursor->get_key(cursor, + &las_id, las_addr, &las_txnid, &las_counter, las_key)); + + /* + * If the on-page record transaction ID associated with the + * record is globally visible, the record can be discarded. + * + * Cursor opened overwrite=true: won't return WT_NOTFOUND should + * another thread remove the record before we do, and the cursor + * remains positioned in that case. 
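To make the sweep pacing described in the comment above concrete (illustrative arithmetic only, using the defaults in this change):

	/*
	 * Assuming the default 10-second sweep wakeup:
	 *   30 calls x 10 seconds ~= 5 minutes per full pass, the target.
	 *   Pass runs long: after call 31 the batch grows from 100 to 200
	 *   rows, after call 32 to 300, and so on until the pass completes.
	 *   Pass finishes early: the next pass's batch shrinks by 100 rows,
	 *   but never below the 100-row minimum.
	 */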
+ */ + if (__wt_txn_visible_all(session, las_txnid)) + WT_ERR(cursor->remove(cursor)); + } + + /* + * When reaching the lookaside table end or the target number of calls, + * adjust the row count. Decrease/increase the row count depending on + * if the number of calls is less/more than the target. + */ + if (ret == WT_NOTFOUND || + conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) { + if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET && + conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT) + conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT; + if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) + conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT; + } + if (ret == WT_NOTFOUND) + conn->las_sweep_call = 0; + + WT_ERR_NOTFOUND_OK(ret); + + if (0) { +err: __wt_buf_free(session, key); + } + + WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + + __wt_scr_free(session, &las_addr); + __wt_scr_free(session, &las_key); + + return (ret); +} diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index b28fca3a71b..b1155d06826 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -2031,11 +2031,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_turtle_init(session)); WT_ERR(__wt_metadata_open(session)); - /* - * Start the worker threads last. - */ + /* Start the worker threads and run recovery. */ WT_ERR(__wt_connection_workers(session, cfg)); + /* Create the lookaside table. */ + WT_ERR(__wt_las_create(session)); + WT_STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0); *wt_connp = &conn->iface; diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 1c4a631cc59..7a8a6cba838 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -55,6 +55,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); WT_RET(__wt_rwlock_alloc(session, &conn->hot_backup_lock, "hot backup")); + WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema")); WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation")); @@ -140,6 +141,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->encryptor_lock); __wt_spin_destroy(session, &conn->fh_lock); WT_TRET(__wt_rwlock_destroy(session, &conn->hot_backup_lock)); + __wt_spin_destroy(session, &conn->las_lock); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); __wt_spin_destroy(session, &conn->table_lock); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index 397f3ff8c38..8bc69bb3e80 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -111,14 +111,17 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) F_CLR(conn, WT_CONN_SERVER_RUN); WT_TRET(__wt_async_destroy(session)); WT_TRET(__wt_lsm_manager_destroy(session)); + WT_TRET(__wt_sweep_destroy(session)); F_SET(conn, WT_CONN_CLOSING); WT_TRET(__wt_checkpoint_server_destroy(session)); WT_TRET(__wt_statlog_destroy(session, 1)); - WT_TRET(__wt_sweep_destroy(session)); WT_TRET(__wt_evict_destroy(session)); + /* Shut down the lookaside table, after all eviction is complete. */ + WT_TRET(__wt_las_destroy(session)); + /* Close open data handles. */ WT_TRET(__wt_conn_dhandle_discard(session)); @@ -238,9 +241,7 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) /* Run recovery. 
*/ WT_RET(__wt_txn_recover(session)); - /* - * Start the handle sweep thread. - */ + /* Start the handle sweep thread. */ WT_RET(__wt_sweep_create(session)); /* Start the optional async threads. */ diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 80698c536cd..3b188bfd22a 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -50,6 +50,7 @@ __wt_conn_stat_init(WT_SESSION_IMPL *session) __wt_async_stats_update(session); __wt_cache_stats_update(session); + __wt_las_stats_update(session); __wt_txn_stats_update(session); WT_STAT_SET(session, stats, file_open, conn->open_file_count); diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index 3de9347f38f..8da32416242 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -282,6 +282,13 @@ __sweep_server(void *arg) WT_STAT_FAST_CONN_INCR(session, dh_sweeps); + /* + * Sweep the lookaside table. If the lookaside table hasn't yet + * been written, there's no work to do. + */ + if (__wt_las_is_written(session)) + WT_ERR(__wt_las_sweep(session)); + /* * Mark handles with a time of death, and report whether any * handles are marked dead. If sweep_idle_time is 0, handles @@ -359,8 +366,14 @@ __wt_sweep_create(WT_SESSION_IMPL *session) /* * Handle sweep does enough I/O it may be called upon to perform slow * operations for the block manager. + * + * The sweep thread sweeps the lookaside table for outdated records, + * it gets its own cursor for that purpose. + * + * Don't tap the sweep thread for eviction. */ - F_SET(session, WT_SESSION_CAN_WAIT); + F_SET(session, WT_SESSION_CAN_WAIT | + WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION); WT_RET(__wt_cond_alloc( session, "handle sweep server", 0, &conn->sweep_cond)); @@ -399,5 +412,9 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session) conn->sweep_session = NULL; } + + /* Discard any saved lookaside key. */ + __wt_buf_free(session, &conn->las_sweep_key); + return (ret); } diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c index 60d94697189..3d9e5e405e8 100644 --- a/src/cursor/cur_backup.c +++ b/src/cursor/cur_backup.c @@ -514,17 +514,23 @@ static int __backup_list_all_append(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CURSOR_BACKUP *cb; + const char *name; WT_UNUSED(cfg); cb = session->bkp_cursor; + name = session->dhandle->name; /* Ignore files in the process of being bulk-loaded. */ if (F_ISSET(S2BT(session), WT_BTREE_BULK)) return (0); + /* Ignore the lookaside table. */ + if (strcmp(name, WT_LAS_URI) == 0) + return (0); + /* Add the file to the list of files to be copied. 
*/ - return (__backup_list_append(session, cb, session->dhandle->name)); + return (__backup_list_append(session, cb, name)); } /* diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c index c58d6899150..8ee57d24413 100644 --- a/src/cursor/cur_ds.c +++ b/src/cursor/cur_ds.c @@ -510,7 +510,7 @@ __wt_curds_open( source = data_source->source; source->session = (WT_SESSION *)session; memset(&source->q, 0, sizeof(source->q)); - source->recno = 0; + source->recno = WT_RECNO_OOB; memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf)); memset(&source->key, 0, sizeof(source->key)); memset(&source->value, 0, sizeof(source->value)); diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index c3b54460df2..c998565eb75 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -369,7 +369,7 @@ __curfile_close(WT_CURSOR *cursor) __wt_buf_free(session, &cbulk->last); } - WT_TRET(__wt_btcur_close(cbt)); + WT_TRET(__wt_btcur_close(cbt, 0)); /* The URI is owned by the btree handle. */ cursor->internal_uri = NULL; WT_TRET(__wt_cursor_close(cursor)); diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index b7d8be14e5c..701bd845ae9 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -258,9 +258,9 @@ __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap) item->data, item->size, "q", &cursor->recno)); } else cursor->recno = va_arg(ap, uint64_t); - if (cursor->recno == 0) + if (cursor->recno == WT_RECNO_OOB) WT_ERR_MSG(session, EINVAL, - "Record numbers must be greater than zero"); + "%d is an invalid record number", WT_RECNO_OOB); buf->data = &cursor->recno; sz = sizeof(cursor->recno); } else { diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index 35ff0e4329e..66fabe48fb2 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -80,16 +80,13 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) break; case WT_SYNC_DISCARD: /* - * If we see a dirty page in a dead handle, clean the + * Dead handles may reference dirty pages; clean the * page, both to keep statistics correct, and to let * the page-discard function assert no dirty page is * ever discarded. 
*/ - if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) && - __wt_page_is_modified(page)) { - page->modify->write_gen = 0; - __wt_cache_dirty_decr(session, page); - } + if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) + __wt_page_modify_clear(session, page); WT_ASSERT(session, F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 4f0cad6851a..d9f9f89e059 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -10,14 +10,13 @@ static int __evict_clear_all_walks(WT_SESSION_IMPL *); static int __evict_clear_walks(WT_SESSION_IMPL *); -static int __evict_has_work(WT_SESSION_IMPL *, uint32_t *); static int WT_CDECL __evict_lru_cmp(const void *, const void *); static int __evict_lru_pages(WT_SESSION_IMPL *, int); -static int __evict_lru_walk(WT_SESSION_IMPL *, uint32_t); +static int __evict_lru_walk(WT_SESSION_IMPL *); static int __evict_page(WT_SESSION_IMPL *, int); static int __evict_pass(WT_SESSION_IMPL *); -static int __evict_walk(WT_SESSION_IMPL *, uint32_t); -static int __evict_walk_file(WT_SESSION_IMPL *, u_int *, uint32_t); +static int __evict_walk(WT_SESSION_IMPL *); +static int __evict_walk_file(WT_SESSION_IMPL *, u_int *); static WT_THREAD_RET __evict_worker(void *); static int __evict_server_work(WT_SESSION_IMPL *); @@ -248,9 +247,16 @@ __evict_workers_resize(WT_SESSION_IMPL *session) for (i = conn->evict_workers_alloc; i < conn->evict_workers_max; i++) { WT_ERR(__wt_open_internal_session(conn, - "eviction-worker", 0, 0, &workers[i].session)); + "eviction-worker", 1, 0, &workers[i].session)); workers[i].id = i; - F_SET(workers[i].session, WT_SESSION_CAN_WAIT); + + /* + * Eviction worker threads get their own lookaside table cursor. + * Eviction worker threads may be called upon to perform slow + * operations for the block manager. + */ + F_SET(workers[i].session, + WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_CAN_WAIT); if (i < conn->evict_workers_min) { ++conn->evict_workers; @@ -280,7 +286,7 @@ __wt_evict_create(WT_SESSION_IMPL *session) /* We need a session handle because we're reading/writing pages. */ WT_RET(__wt_open_internal_session( - conn, "eviction-server", 0, 0, &conn->evict_session)); + conn, "eviction-server", 1, 0, &conn->evict_session)); session = conn->evict_session; /* @@ -297,6 +303,9 @@ __wt_evict_create(WT_SESSION_IMPL *session) else F_SET(session, WT_SESSION_CAN_WAIT); + /* The eviction server gets its own lookaside table cursor. */ + F_SET(session, WT_SESSION_LOOKASIDE_CURSOR); + /* * Start the primary eviction server thread after the worker threads * have started to avoid it starting additional worker threads before @@ -406,47 +415,62 @@ err: WT_PANIC_MSG(session, ret, "cache eviction worker error"); } /* - * __evict_has_work -- - * Find out if there is eviction work to be done. + * __evict_update_work -- + * Configure eviction work state. */ static int -__evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp) +__evict_update_work(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; - uint32_t flags; - int evict, dirty; + uint64_t bytes_inuse, bytes_max, dirty_inuse; conn = S2C(session); cache = conn->cache; - *flagsp = flags = 0; + + /* Clear previous state. */ + cache->state = 0; if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) return (0); - /* Check to see if the eviction server should run. */ - __wt_cache_status(session, &evict, &dirty); - if (evict) - /* The cache is too small. */ - LF_SET(WT_EVICT_PASS_ALL); - else if (dirty) - /* Too many dirty pages, ignore clean pages. 
*/ - LF_SET(WT_EVICT_PASS_DIRTY); - else if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) { - /* - * Evict pages with oldest generation (which would otherwise - * block application threads) set regardless of whether we have - * reached the eviction trigger. - */ - LF_SET(WT_EVICT_PASS_WOULD_BLOCK); - F_CLR(cache, WT_CACHE_WOULD_BLOCK); + /* + * Page eviction overrides the dirty target and other types of eviction, + * that is, we don't care where we are with respect to the dirty target + * if page eviction is configured. + * + * Avoid division by zero if the cache size has not yet been set in a + * shared cache. + */ + bytes_max = conn->cache_size + 1; + bytes_inuse = __wt_cache_bytes_inuse(cache); + if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) { + FLD_SET(cache->state, WT_EVICT_PASS_ALL); + goto done; } - if (F_ISSET(cache, WT_CACHE_STUCK)) - LF_SET(WT_EVICT_PASS_AGGRESSIVE); + dirty_inuse = __wt_cache_dirty_inuse(cache); + if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) { + FLD_SET(cache->state, WT_EVICT_PASS_DIRTY); + goto done; + } - *flagsp = flags; + /* + * Evict pages with oldest generation (which would otherwise block + * application threads), set regardless of whether we have reached + * the eviction trigger. + */ + if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) { + FLD_SET(cache->state, WT_EVICT_PASS_WOULD_BLOCK); + + F_CLR(cache, WT_CACHE_WOULD_BLOCK); + goto done; + } return (0); + +done: if (F_ISSET(cache, WT_CACHE_STUCK)) + FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE); + return (1); } /* @@ -460,7 +484,6 @@ __evict_pass(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_EVICT_WORKER *worker; uint64_t pages_evicted; - uint32_t flags; int loop; conn = S2C(session); @@ -483,10 +506,10 @@ __evict_pass(WT_SESSION_IMPL *session) } /* - * Increment the shared read generation. We do this - * occasionally even if eviction is not currently required, so - * that pages have some relative read generation when the - * eviction server does need to do some work. + * Increment the shared read generation. Do this occasionally + * even if eviction is not currently required, so that pages + * have some relative read generation when the eviction server + * does need to do some work. */ __wt_cache_read_gen_incr(session); @@ -502,18 +525,17 @@ __evict_pass(WT_SESSION_IMPL *session) */ __wt_txn_update_oldest(session, 1); - WT_RET(__evict_has_work(session, &flags)); - if (flags == 0) + if (!__evict_update_work(session)) break; if (loop > 10) - LF_SET(WT_EVICT_PASS_AGGRESSIVE); + FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE); /* * Start a worker if we have capacity and we haven't reached * the eviction targets. */ - if (LF_ISSET(WT_EVICT_PASS_ALL | + if (FLD_ISSET(cache->state, WT_EVICT_PASS_ALL | WT_EVICT_PASS_DIRTY | WT_EVICT_PASS_WOULD_BLOCK) && conn->evict_workers < conn->evict_workers_max) { WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, @@ -532,7 +554,7 @@ __evict_pass(WT_SESSION_IMPL *session) " In use: %" PRIu64 " Dirty: %" PRIu64, conn->cache_size, cache->bytes_inmem, cache->bytes_dirty)); - WT_RET(__evict_lru_walk(session, flags)); + WT_RET(__evict_lru_walk(session)); WT_RET(__evict_server_work(session)); /* @@ -553,7 +575,8 @@ __evict_pass(WT_SESSION_IMPL *session) * Mark the cache as stuck if we need space * and aren't evicting any pages. 
*/ - if (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK)) { + if (!FLD_ISSET(cache->state, + WT_EVICT_PASS_WOULD_BLOCK)) { F_SET(cache, WT_CACHE_STUCK); WT_STAT_FAST_CONN_INCR( session, cache_eviction_slow); @@ -672,44 +695,6 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session) return (ret); } -/* - * __wt_evict_page -- - * Evict a given page. - */ -int -__wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_DECL_RET; - WT_TXN *txn; - WT_TXN_ISOLATION saved_iso; - - /* - * We have to take care when evicting pages not to write a change that: - * (a) is not yet committed; or - * (b) is committed more recently than an in-progress checkpoint. - * - * We handle both of these cases by setting up the transaction context - * before evicting, using a special "eviction" isolation level, where - * only globally visible updates can be evicted. - */ - __wt_txn_update_oldest(session, 1); - txn = &session->txn; - saved_iso = txn->isolation; - txn->isolation = WT_ISO_EVICTION; - - /* - * Sanity check: if a transaction has updates, its updates should not - * be visible to eviction. - */ - WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_ID) || - !__wt_txn_visible(session, txn->id)); - - ret = __wt_evict(session, ref, 0); - txn->isolation = saved_iso; - - return (ret); -} - /* * __wt_evict_file_exclusive_on -- * Get exclusive eviction access to a file and discard any of the file's @@ -808,7 +793,7 @@ __evict_lru_pages(WT_SESSION_IMPL *session, int is_server) * Add pages to the LRU queue to be evicted from cache. */ static int -__evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) +__evict_lru_walk(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_DECL_RET; @@ -819,7 +804,7 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) cache = S2C(session)->cache; /* Get some more pages to consider for eviction. */ - if ((ret = __evict_walk(session, flags)) != 0) + if ((ret = __evict_walk(session)) != 0) return (ret == EBUSY ? 0 : ret); /* Sort the list into LRU order and restart. */ @@ -851,7 +836,8 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) /* Track the oldest read generation we have in the queue. */ cache->read_gen_oldest = cache->evict[0].ref->page->read_gen; - if (LF_ISSET(WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) + if (FLD_ISSET(cache->state, + WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) /* * Take all candidates if we only gathered pages with an oldest * read generation set. @@ -929,7 +915,7 @@ __evict_server_work(WT_SESSION_IMPL *session) * Fill in the array by walking the next set of pages. */ static int -__evict_walk(WT_SESSION_IMPL *session, uint32_t flags) +__evict_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; @@ -1023,7 +1009,7 @@ retry: while (slot < max_entries && ret == 0) { * stick in cache until we get aggressive. */ if ((btree->checkpointing || btree->evict_priority != 0) && - !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE)) + !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE)) continue; /* Skip files if we have used all available hazard pointers. 
*/ @@ -1055,7 +1041,7 @@ retry: while (slot < max_entries && ret == 0) { */ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { WT_WITH_DHANDLE(session, dhandle, - ret = __evict_walk_file(session, &slot, flags)); + ret = __evict_walk_file(session, &slot)); WT_ASSERT(session, session->split_gen == 0); } @@ -1093,7 +1079,8 @@ retry: while (slot < max_entries && ret == 0) { */ if (!F_ISSET(cache, WT_CACHE_CLEAR_WALKS) && ret == 0 && slot < max_entries && (retries < 2 || - (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && retries < 10 && + (retries < 10 && + !FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) && (slot == cache->evict_entries || slot > start_slot)))) { start_slot = slot; ++retries; @@ -1136,10 +1123,11 @@ __evict_init_candidate( * Get a few page eviction candidates from a single underlying file. */ static int -__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) +__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) { WT_BTREE *btree; WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_EVICT_ENTRY *end, *evict, *start; WT_PAGE *page; @@ -1149,8 +1137,9 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) uint32_t walk_flags; int enough, internal_pages, modified, restarts; + conn = S2C(session); btree = S2BT(session); - cache = S2C(session)->cache; + cache = conn->cache; start = cache->evict + *slotp; end = WT_MIN(start + WT_EVICT_WALK_PER_FILE, cache->evict + cache->evict_slots); @@ -1204,21 +1193,21 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) goto fast; /* Optionally ignore clean pages. */ - if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY)) + if (!modified && FLD_ISSET(cache->state, WT_EVICT_PASS_DIRTY)) continue; /* * If we are only trickling out pages marked for definite * eviction, skip anything that isn't marked. */ - if (LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && + if (FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) && page->read_gen != WT_READGEN_OLDEST) continue; /* Limit internal pages to 50% unless we get aggressive. */ if (WT_PAGE_IS_INTERNAL(page) && ++internal_pages > WT_EVICT_WALK_PER_FILE / 2 && - !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE)) + !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE)) continue; /* @@ -1233,36 +1222,44 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) continue; /* - * If the page is clean but has modifications that appear too - * new to evict, skip it. + * Additional tests if eviction is likely to succeed. * - * Note: take care with ordering: if we detected that the page - * is modified above, we expect mod != NULL. + * If eviction is stuck or we are helping with forced eviction, + * try anyway: maybe a transaction that was running last time + * we wrote the page has since rolled back, or we can help the + * checkpoint complete sooner. Additionally, being stuck will + * configure lookaside table writes in reconciliation, allowing + * us to evict pages we can't usually evict. */ - mod = page->modify; - if (!modified && mod != NULL && !LF_ISSET( - WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) && - !__wt_txn_visible_all(session, mod->rec_max_txn)) - continue; + if (!FLD_ISSET(cache->state, + WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) { + /* + * Note: take care with ordering: if we detected that + * the page is modified above, we expect mod != NULL. + */ + mod = page->modify; - /* - * If the oldest transaction hasn't changed since the last time - * this page was written, it's unlikely that we can make - * progress. 
Similarly, if the most recent update on the page - * is not yet globally visible, eviction will fail. These - * heuristics attempt to avoid repeated attempts to evict the - * same page. - * - * That said, if eviction is stuck, or we are helping with - * forced eviction, try anyway: maybe a transaction that was - * running last time we wrote the page has since rolled back, - * or we can help get the checkpoint completed sooner. - */ - if (modified && !LF_ISSET( - WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) && - (mod->disk_snap_min == S2C(session)->txn_global.oldest_id || - !__wt_txn_visible_all(session, mod->update_txn))) - continue; + /* + * If the page is clean but has modifications that + * appear too new to evict, skip it. + */ + if (!modified && mod != NULL && + !__wt_txn_visible_all(session, mod->rec_max_txn)) + continue; + + /* + * If the oldest transaction hasn't changed since the + * last time this page was written, it's unlikely we + * can make progress. Similarly, if the most recent + * update on the page is not yet globally visible, + * eviction will fail. These heuristics attempt to + * avoid repeated attempts to evict the same page. + */ + if (modified && + (mod->disk_snap_min == conn->txn_global.oldest_id || + !__wt_txn_visible_all(session, mod->update_txn))) + continue; + } WT_ASSERT(session, evict->ref == NULL); __evict_init_candidate(session, evict, ref); @@ -1428,13 +1425,10 @@ __evict_page(WT_SESSION_IMPL *session, int is_server) * page-discard function assert that no dirty pages are ever * discarded. */ - if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD) && - __wt_page_is_modified(page)) { - page->modify->write_gen = 0; - __wt_cache_dirty_decr(session, page); - } + if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD)) + __wt_page_modify_clear(session, page); - WT_WITH_BTREE(session, btree, ret = __wt_evict_page(session, ref)); + WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, 0)); (void)__wt_atomic_subv32(&btree->evict_busy, 1); @@ -1570,29 +1564,31 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full) * NOTE: this function is not called anywhere, it is intended to be called * from a debugger. 
*/ -void -__wt_cache_dump(WT_SESSION_IMPL *session) +int +__wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) { - WT_BTREE *btree; + FILE *fp; WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle; - WT_REF *next_walk; + WT_DATA_HANDLE *dhandle, *saved_dhandle; WT_PAGE *page; + WT_REF *next_walk; uint64_t file_intl_pages, file_leaf_pages; uint64_t file_bytes, file_dirty, total_bytes; conn = S2C(session); total_bytes = 0; + if (ofile == NULL) + fp = stdout; + else + WT_RET(__wt_fopen(session, ofile, WT_FHANDLE_WRITE, 0, &fp)); + + saved_dhandle = session->dhandle; TAILQ_FOREACH(dhandle, &conn->dhqh, q) { if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; - btree = dhandle->handle; - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) - continue; - file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0; next_walk = NULL; session->dhandle = dhandle; @@ -1607,12 +1603,14 @@ __wt_cache_dump(WT_SESSION_IMPL *session) file_bytes += page->memory_footprint; if (__wt_page_is_modified(page)) file_dirty += page->memory_footprint; + (void)__wt_fprintf(fp, + "%" WT_SIZET_FMT ", ", page->memory_footprint); } session->dhandle = NULL; - printf("cache dump: %s%s%s%s:" - " %" PRIu64 " intl pages, %" PRIu64 " leaf pages," - " %" PRIu64 "MB, %" PRIu64 "MB dirty\n", + (void)__wt_fprintf(fp, "\n" "cache dump: %s%s%s%s\n\t" + " %" PRIu64 " internal pages, %" PRIu64 " leaf pages," + " %" PRIu64 "MB, %" PRIu64 "MB dirty\n==============\n", dhandle->name, dhandle->checkpoint == NULL ? "" : " [", dhandle->checkpoint == NULL ? "" : dhandle->checkpoint, @@ -1622,9 +1620,13 @@ __wt_cache_dump(WT_SESSION_IMPL *session) total_bytes += file_bytes; } - printf("cache dump: total found = %" PRIu64 "MB" + session->dhandle = saved_dhandle; + + (void)__wt_fprintf(fp, "cache dump: total found = %" PRIu64 "MB" " vs tracked inuse %" PRIu64 "MB\n", total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20); - fflush(stdout); + if (fp != stdout) + WT_RET(__wt_fclose(&fp, WT_FHANDLE_WRITE)); + return (0); } #endif diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 1e5faf45de2..6afd406b8ba 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -150,17 +150,12 @@ done: if (((inmem_split && ret == 0) || (forced_eviction && ret == EBUSY)) && int __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing) { - int evict; - /* * If doing normal system eviction, but only in the service of reducing * the number of dirty pages, leave the clean page in cache. */ - if (!closing) { - __wt_cache_status(session, &evict, NULL); - if (!evict) - return (EBUSY); - } + if (!closing && __wt_eviction_dirty_target(session)) + return (EBUSY); /* * Discard the page and update the reference structure; if the page has @@ -184,7 +179,6 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing) WT_ADDR *addr; WT_PAGE *parent; WT_PAGE_MODIFY *mod; - int evict; parent = ref->home; mod = ref->page->modify; @@ -229,11 +223,8 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing) * push it out of cache (and read it back in, when needed), we * would rather have more, smaller pages than fewer large pages. */ - if (!closing) { - __wt_cache_status(session, &evict, NULL); - if (!evict) - return (EBUSY); - } + if (!closing && __wt_eviction_dirty_target(session)) + return (EBUSY); /* Discard the parent's address. 
*/ if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) { @@ -309,8 +300,7 @@ __evict_review( { WT_DECL_RET; WT_PAGE *page; - WT_PAGE_MODIFY *mod; - uint32_t reconcile_flags; + uint32_t flags; /* * Get exclusive access to the page if our caller doesn't have the tree @@ -331,7 +321,6 @@ __evict_review( /* Now that we have exclusive access, review the page. */ page = ref->page; - mod = page->modify; /* * Fail if an internal has active children, the children must be evicted @@ -347,6 +336,13 @@ __evict_review( /* Check if the page can be evicted. */ if (!closing) { + /* + * Update the oldest ID to avoid wasted effort should it have + * fallen behind current. + */ + if (__wt_page_is_modified(page)) + __wt_txn_update_oldest(session, 1); + if (!__wt_page_can_evict(session, page, 0, inmem_splitp)) return (EBUSY); @@ -361,13 +357,21 @@ __evict_review( return (__wt_split_insert(session, ref)); } + /* If the page is clean, we're done and we can evict. */ + if (!__wt_page_is_modified(page)) + return (0); + + /* If the page is dirty, reconcile it to decide if we can evict it. */ + flags = WT_EVICTING; + /* - * If the page is dirty and can possibly change state, reconcile it to - * determine the final state. - * * If we have an exclusive lock (we're discarding the tree), assert * there are no updates we cannot read. - * + */ + if (closing) + LF_SET(WT_VISIBILITY_ERR); + + /* * Otherwise, if the page we're evicting is a leaf page marked for * forced eviction, set the update-restore flag, so reconciliation will * write blocks it can write and create a list of skipped updates for @@ -380,27 +384,33 @@ __evict_review( * Don't set the update-restore flag for internal pages, they don't have * updates that can be saved and restored. */ - reconcile_flags = WT_EVICTING; - if (__wt_page_is_modified(page)) { - if (closing) - FLD_SET(reconcile_flags, WT_SKIP_UPDATE_ERR); - else if (!WT_PAGE_IS_INTERNAL(page) && - page->read_gen == WT_READGEN_OLDEST) - FLD_SET(reconcile_flags, WT_SKIP_UPDATE_RESTORE); - WT_RET(__wt_reconcile(session, ref, NULL, reconcile_flags)); - WT_ASSERT(session, - !__wt_page_is_modified(page) || - FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE)); - } + if (!closing && + !WT_PAGE_IS_INTERNAL(page) && page->read_gen == WT_READGEN_OLDEST) + LF_SET(WT_EVICT_UPDATE_RESTORE); /* - * If the page was ever modified, make sure all of the updates - * on the page are old enough they can be discarded from cache. + * Otherwise, if eviction is getting pressed, configure reconciliation + * to write not-yet-globally-visible updates to the lookaside table, + * that allows us to evict pages we'd otherwise have to keep in cache + * to support older transactions. */ - if (!closing && mod != NULL && - !__wt_txn_visible_all(session, mod->rec_max_txn) && - !FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE)) - return (EBUSY); + if (!closing && __wt_eviction_aggressive(session)) + LF_SET(WT_EVICT_LOOKASIDE); + + WT_RET(__wt_reconcile(session, ref, NULL, flags)); + + /* + * Success: assert the page is clean or reconciliation was configured + * for an update/restore split, and if the page is clean, reconciliation + * was configured for a lookaside table or all updates on the page are + * globally visible. 
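+ *
+ * In other words, eviction must not lose an update some running
+ * transaction could still need: any such update was either kept in
+ * memory by the update/restore split or written to the lookaside
+ * table.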
+ */
+ WT_ASSERT(session,
+ LF_ISSET(WT_EVICT_UPDATE_RESTORE) || !__wt_page_is_modified(page));
+ WT_ASSERT(session,
+ LF_ISSET(WT_EVICT_LOOKASIDE) ||
+ __wt_page_is_modified(page) ||
+ __wt_txn_visible_all(session, page->modify->rec_max_txn));
 return (0);
 }
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 4aa2b1c7a7d..9e20a518aed 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -6,6 +6,8 @@
 * See the file LICENSE for redistribution information.
 */
+#define WT_RECNO_OOB 0 /* Illegal record number */
+
 /*
 * WT_PAGE_HEADER --
 * Blocks have a common header, a WT_PAGE_HEADER structure followed by a
@@ -43,6 +45,7 @@ struct __wt_page_header {
 #define WT_PAGE_EMPTY_V_ALL 0x02 /* Page has all zero-length values */
 #define WT_PAGE_EMPTY_V_NONE 0x04 /* Page has no zero-length values */
 #define WT_PAGE_ENCRYPTED 0x08 /* Page is encrypted on disk */
+#define WT_PAGE_LAS_UPDATE 0x10 /* Page updates in lookaside store */
 uint8_t flags; /* 25: flags */
 /*
@@ -167,6 +170,29 @@ struct __wt_ovfl_txnc {
 WT_OVFL_TXNC *next[0]; /* Forward-linked skip list */
 };
+/*
+ * Lookaside table support: when a page is being reconciled for eviction and has
+ * updates that might be required by earlier readers in the system, the updates
+ * are written into a lookaside table, and restored as necessary if the page is
+ * read. The key is a unique marker for the page (a file ID plus an address),
+ * the on-page item's transaction ID (so we can discard any update records from
+ * the lookaside table once the on-page item's transaction is globally visible),
+ * a counter (used to ensure the update records remain in the original order),
+ * and the page key (byte-string for row-store, record number for column-store).
+ * The value is the WT_UPDATE structure's transaction ID, update size and value.
+ *
+ * As the key for the lookaside table is different for row- and column-store, we
+ * store both key types in a WT_ITEM, building/parsing them in the code, because
+ * otherwise we'd need two lookaside files with different key formats. We could
+ * make the lookaside table's key standard by moving the source key into the
+ * lookaside table value, but that doesn't make the coding any simpler, and it
+ * makes the lookaside table's value more likely to overflow the page size when
+ * the row-store key is relatively large.
+ */
+#define WT_LAS_FORMAT \
+ "key_format=" WT_UNCHECKED_STRING(IuQQu) \
+ ",value_format=" WT_UNCHECKED_STRING(QIu)
+
 /*
 * WT_PAGE_MODIFY --
 * When a page is modified, there's additional information to maintain.
@@ -238,15 +264,17 @@ struct __wt_page_modify {
 * Eviction, but block wasn't written: unresolved updates and
 * associated disk image.
 *
- * Skipped updates are either a WT_INSERT, or a row-store leaf
- * page entry.
+ * Saved updates are either a WT_INSERT, or a row-store leaf
+ * page entry; in the case of creating lookaside records, there
+ * is an additional value, the committed item's transaction ID.
 */
- struct __wt_upd_skipped {
+ struct __wt_save_upd {
 WT_INSERT *ins;
 WT_ROW *rip;
- } *skip;
- uint32_t skip_entries;
- void *skip_dsk;
+ uint64_t onpage_txn;
+ } *supd;
+ uint32_t supd_entries;
+ void *supd_dsk;
 /*
 * Block was written: address, size and checksum.
@@ -556,9 +584,8 @@ struct __wt_page { #define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ -#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */ +#define WT_PAGE_RECONCILIATION 0x10 /* Page reconciliation lock */ #define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ -#define WT_PAGE_SPLIT_LOCKED 0x40 /* An internal page is growing */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ /* @@ -869,8 +896,9 @@ WT_PACKED_STRUCT_BEGIN(__wt_update) * store 4GB objects; I'd rather do that than increase the size of this * structure for a flag bit. */ -#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX) -#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX) +#define WT_UPDATE_DELETED_VALUE UINT32_MAX +#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = WT_UPDATE_DELETED_VALUE) +#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == WT_UPDATE_DELETED_VALUE) uint32_t size; /* update length */ /* The untyped value immediately follows the WT_UPDATE structure. */ diff --git a/src/include/btree.h b/src/include/btree.h index deecd8f6d88..98ce4c22c10 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -146,12 +146,14 @@ struct __wt_btree { /* Flags values up to 0xff are reserved for WT_DHANDLE_* */ #define WT_BTREE_BULK 0x00100 /* Bulk-load handle */ #define WT_BTREE_IN_MEMORY 0x00200 /* Cache-resident object */ -#define WT_BTREE_NO_EVICTION 0x00400 /* Disable eviction */ -#define WT_BTREE_NO_LOGGING 0x00800 /* Disable logging */ -#define WT_BTREE_SALVAGE 0x01000 /* Handle is for salvage */ -#define WT_BTREE_SKIP_CKPT 0x02000 /* Handle skipped checkpoint */ -#define WT_BTREE_UPGRADE 0x04000 /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x08000 /* Handle is for verify */ +#define WT_BTREE_LOOKASIDE 0x00400 /* Look-aside table */ +#define WT_BTREE_NO_CHECKPOINT 0x00800 /* Disable checkpoints */ +#define WT_BTREE_NO_EVICTION 0x01000 /* Disable eviction */ +#define WT_BTREE_NO_LOGGING 0x02000 /* Disable logging */ +#define WT_BTREE_SALVAGE 0x04000 /* Handle is for salvage */ +#define WT_BTREE_SKIP_CKPT 0x08000 /* Handle skipped checkpoint */ +#define WT_BTREE_UPGRADE 0x10000 /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x20000 /* Handle is for verify */ uint32_t flags; }; diff --git a/src/include/btree.i b/src/include/btree.i index bcba7d47257..64cb0b8043e 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -358,9 +358,13 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) * have committed in the meantime, and the last_running field * been updated past it. That is all very unlikely, but not * impossible, so we take care to read the global state before - * the atomic increment. If we raced with reconciliation, just - * leave the previous value here: at worst, we will write a - * page in a checkpoint when not absolutely necessary. + * the atomic increment. + * + * If the page was dirty on entry, then last_running == 0. The + * page could have become clean since then, if reconciliation + * completed. In that case, we leave the previous value for + * first_dirty_txn rather than potentially racing to update it, + * at worst, we'll unnecessarily write a page in a checkpoint. 
*/ if (last_running != 0) page->modify->first_dirty_txn = last_running; @@ -371,6 +375,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) page->modify->update_txn = session->txn.id; } +/* + * __wt_page_modify_clear -- + * Clean a modified page. + */ +static inline void +__wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + /* + * The page must be held exclusive when this call is made, this call + * can only be used when the page is owned by a single thread. + * + * Allow the call to be made on clean pages. + */ + if (__wt_page_is_modified(page)) { + page->modify->write_gen = 0; + __wt_cache_dirty_decr(session, page); + } +} + /* * __wt_page_modify_set -- * Mark the page and tree dirty. @@ -533,7 +556,12 @@ __wt_ref_key_instantiated(WT_REF *ref) static inline void __wt_ref_key_clear(WT_REF *ref) { - /* The key union has 2 fields, both of which are 8B. */ + /* + * The key union has 2 8B fields; this is equivalent to: + * + * ref->key.recno = WT_RECNO_OOB; + * ref->key.ikey = NULL; + */ ref->key.recno = 0; } @@ -964,9 +992,6 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * let the threads continue before doing eviction. * * Ignore anything other than large, dirty row-store leaf pages. - * - * XXX KEITH - * Need a better test for append-only workloads. */ if (page->type != WT_PAGE_ROW_LEAF || page->memory_footprint < btree->maxmempage || @@ -1106,7 +1131,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) (void)__wt_atomic_addv32(&btree->evict_busy, 1); too_big = (page->memory_footprint > btree->maxmempage) ? 1 : 0; - if ((ret = __wt_evict_page(session, ref)) == 0) { + if ((ret = __wt_evict(session, ref, 0)) == 0) { if (too_big) WT_STAT_FAST_CONN_INCR(session, cache_eviction_force); else @@ -1157,12 +1182,13 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * memory_page_max setting, when we see many deleted items, and when we * are attempting to scan without trashing the cache. * - * Fast checks if eviction is disabled for this operation or this tree, - * then perform a general check if eviction will be possible. + * Fast checks if eviction is disabled for this handle, operation or + * tree, then perform a general check if eviction will be possible. */ page = ref->page; if (page->read_gen != WT_READGEN_OLDEST || LF_ISSET(WT_READ_NO_EVICT) || + F_ISSET(session, WT_SESSION_NO_EVICTION) || F_ISSET(btree, WT_BTREE_NO_EVICTION) || !__wt_page_can_evict(session, page, 1, NULL)) return (__wt_hazard_clear(session, page)); diff --git a/src/include/cache.h b/src/include/cache.h index 85b83f22fd4..f98483a215f 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -18,11 +18,6 @@ #define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */ #define WT_EVICT_WALK_INCR 100 /* Pages added each walk */ -#define WT_EVICT_PASS_AGGRESSIVE 0x01 -#define WT_EVICT_PASS_ALL 0x02 -#define WT_EVICT_PASS_DIRTY 0x04 -#define WT_EVICT_PASS_WOULD_BLOCK 0x08 - /* * WT_EVICT_ENTRY -- * Encapsulation of an eviction candidate. @@ -119,6 +114,15 @@ struct __wt_cache { uint64_t cp_saved_app_waits; /* User wait count at last review */ uint64_t cp_saved_read; /* Read count at last review */ + /* + * Work state. + */ +#define WT_EVICT_PASS_AGGRESSIVE 0x01 +#define WT_EVICT_PASS_ALL 0x02 +#define WT_EVICT_PASS_DIRTY 0x04 +#define WT_EVICT_PASS_WOULD_BLOCK 0x08 + uint32_t state; + /* * Flags. 
*/ diff --git a/src/include/cache.i b/src/include/cache.i index 86501d757c7..aa46fdd6d20 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -103,48 +103,6 @@ __wt_cache_dirty_inuse(WT_CACHE *cache) return (dirty_inuse); } -/* - * __wt_cache_status -- - * Return if the cache usage exceeds the eviction or dirty targets. - */ -static inline void -__wt_cache_status(WT_SESSION_IMPL *session, int *evictp, int *dirtyp) -{ - WT_CONNECTION_IMPL *conn; - WT_CACHE *cache; - uint64_t bytes_inuse, bytes_max, dirty_inuse; - - conn = S2C(session); - cache = conn->cache; - - /* - * There's an assumption "evict" overrides "dirty", that is, if eviction - * is required, we no longer care where we are with respect to the dirty - * target. - * - * Avoid division by zero if the cache size has not yet been set in a - * shared cache. - */ - bytes_max = conn->cache_size + 1; - if (evictp != NULL) { - bytes_inuse = __wt_cache_bytes_inuse(cache); - if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) { - *evictp = 1; - return; - } - *evictp = 0; - } - if (dirtyp != NULL) { - dirty_inuse = __wt_cache_dirty_inuse(cache); - if (dirty_inuse > - (cache->eviction_dirty_target * bytes_max) / 100) { - *dirtyp = 1; - return; - } - *dirtyp = 0; - } -} - /* * __wt_session_can_wait -- * Return if a session available for a potentially slow operation. @@ -161,17 +119,40 @@ __wt_session_can_wait(WT_SESSION_IMPL *session) return (0); /* - * LSM sets the no-cache-check flag when holding the LSM tree lock, + * LSM sets the no-eviction flag when holding the LSM tree lock, * in that case, or when holding the schema lock, we don't want to * highjack the thread for eviction. */ if (F_ISSET(session, - WT_SESSION_NO_CACHE_CHECK | WT_SESSION_LOCKED_SCHEMA)) + WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA)) return (0); return (1); } +/* + * __wt_eviction_aggressive -- + * Return if the eviction server is running in aggressive mode. + */ +static inline int +__wt_eviction_aggressive(WT_SESSION_IMPL *session) +{ + return (FLD_ISSET( + S2C(session)->cache->state, WT_EVICT_PASS_AGGRESSIVE) ? 1 : 0); +} + +/* + * __wt_eviction_dirty_target -- + * Return if the eviction server is running to reduce the number of dirty + * pages (versus running to discard pages from the cache). + */ +static inline int +__wt_eviction_dirty_target(WT_SESSION_IMPL *session) +{ + return (FLD_ISSET( + S2C(session)->cache->state, WT_EVICT_PASS_DIRTY) ? 1 : 0); +} + /* * __wt_eviction_needed -- * Return if an application thread should do eviction, and the cache full @@ -230,7 +211,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, int busy, int *didworkp) * that case, or when holding the schema or handle list locks (which * block eviction), we don't want to highjack the thread for eviction. 
*/ - if (F_ISSET(session, WT_SESSION_NO_CACHE_CHECK | + if (F_ISSET(session, WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA)) return (0); diff --git a/src/include/cell.i b/src/include/cell.i index 20a4d214015..d7ecfd3bda4 100644 --- a/src/include/cell.i +++ b/src/include/cell.i @@ -182,7 +182,7 @@ __wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, size_t size) p = cell->__chunk + 1; - if (recno == 0) + if (recno == WT_RECNO_OOB) cell->__chunk[0] = cell_type; /* Type */ else { cell->__chunk[0] = cell_type | WT_CELL_64V; diff --git a/src/include/connection.h b/src/include/connection.h index 0f4419308cb..0a5af6d6327 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -294,8 +294,6 @@ struct __wt_connection_impl { uint64_t ckpt_time_recent; /* Checkpoint time recent/total */ uint64_t ckpt_time_total; - int compact_in_memory_pass; /* Compaction serialization */ - #define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */ #define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */ #define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */ @@ -372,6 +370,20 @@ struct __wt_connection_impl { time_t sweep_interval;/* Handle sweep interval */ u_int sweep_handles_min;/* Handle sweep minimum open */ + /* + * Shared lookaside lock, session and cursor, used by threads accessing + * the lookaside table (other than eviction server and worker threads + * and the sweep thread, all of which have their own lookaside cursors). + */ + WT_SPINLOCK las_lock; /* Lookaside table spinlock */ + WT_SESSION_IMPL *las_session; /* Lookaside table session */ + WT_CURSOR *las_cursor; /* Lookaside table cursor */ + int las_written; /* Lookaside table has been written */ + + WT_ITEM las_sweep_key; /* Sweep server's saved key */ + int las_sweep_call;/* Sweep server's call count */ + uint64_t las_sweep_cnt; /* Sweep server's per-call row count */ + /* Locked: collator list */ TAILQ_HEAD(__wt_coll_qh, __wt_named_collator) collqh; diff --git a/src/include/cursor.i b/src/include/cursor.i index 484af0b4a58..e7fed250251 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -32,7 +32,7 @@ __cursor_pos_clear(WT_CURSOR_BTREE *cbt) * and it's a minimal set of things we need to clear. It would be a * lot simpler to clear everything, but we call this function a lot. 
*/ - cbt->recno = 0; + cbt->recno = WT_RECNO_OOB; cbt->ins = NULL; cbt->ins_head = NULL; diff --git a/src/include/extern.h b/src/include/extern.h index 5a3d1913c98..d21e5a1fc5d 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -101,8 +101,9 @@ extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt); extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp); extern int __wt_btcur_equals( WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp); extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop); +extern void __wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); extern void __wt_btcur_open(WT_CURSOR_BTREE *cbt); -extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt); +extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt, int lowlevel); extern int __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v); extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size); extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile); @@ -121,6 +122,7 @@ extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep); extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages); extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages); +extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd); extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]); extern int __wt_btree_close(WT_SESSION_IMPL *session); extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno); @@ -146,6 +148,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags ); extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep); extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep); +extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size); extern int __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]); @@ -179,6 +182,14 @@ extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert); extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); +extern void __wt_las_stats_update(WT_SESSION_IMPL *session); +extern int __wt_las_create(WT_SESSION_IMPL *session); +extern int __wt_las_destroy(WT_SESSION_IMPL *session); +extern void __wt_las_set_written(WT_SESSION_IMPL *session); +extern int __wt_las_is_written(WT_SESSION_IMPL *session); +extern int __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); +extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags); +extern int __wt_las_sweep(WT_SESSION_IMPL *session); extern int __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG 
*conf, const char *str, size_t len); extern int __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str); extern int __wt_config_subinit( WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item); @@ -308,11 +319,10 @@ extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_evict_server_wake(WT_SESSION_IMPL *session); extern int __wt_evict_create(WT_SESSION_IMPL *session); extern int __wt_evict_destroy(WT_SESSION_IMPL *session); -extern int __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, int *evict_resetp); extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full); -extern void __wt_cache_dump(WT_SESSION_IMPL *session); +extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); @@ -579,6 +589,8 @@ extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*f extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, int free_buffers); extern int __wt_session_copy_values(WT_SESSION_IMPL *session); extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config); +extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]); extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, int open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip); diff --git a/src/include/flags.h b/src/include/flags.h index 0bad569670c..ca3c3c38245 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -18,6 +18,8 @@ #define WT_CONN_SERVER_SWEEP 0x00002000 #define WT_CONN_WAS_BACKUP 0x00004000 #define WT_EVICTING 0x00000001 +#define WT_EVICT_LOOKASIDE 0x00000002 +#define WT_EVICT_UPDATE_RESTORE 0x00000004 #define WT_FILE_TYPE_CHECKPOINT 0x00000001 #define WT_FILE_TYPE_DATA 0x00000002 #define WT_FILE_TYPE_DIRECTORY 0x00000004 @@ -49,15 +51,14 @@ #define WT_SESSION_LOCKED_SLOT 0x00000040 #define WT_SESSION_LOCKED_TABLE 0x00000080 #define WT_SESSION_LOGGING_INMEM 0x00000100 -#define WT_SESSION_NO_CACHE 0x00000200 -#define WT_SESSION_NO_CACHE_CHECK 0x00000400 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00000200 +#define WT_SESSION_NO_CACHE 0x00000400 #define WT_SESSION_NO_DATA_HANDLES 0x00000800 -#define WT_SESSION_NO_LOGGING 0x00001000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00002000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00004000 -#define WT_SESSION_SERVER_ASYNC 0x00008000 -#define WT_SKIP_UPDATE_ERR 0x00000002 -#define WT_SKIP_UPDATE_RESTORE 0x00000004 +#define WT_SESSION_NO_EVICTION 0x00001000 +#define WT_SESSION_NO_LOGGING 0x00002000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00004000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00008000 +#define WT_SESSION_SERVER_ASYNC 0x00010000 #define WT_SYNC_CHECKPOINT 0x00000001 #define WT_SYNC_CLOSE 0x00000002 
#define WT_SYNC_DISCARD 0x00000004 @@ -91,6 +92,7 @@ #define WT_VERB_VERIFY 0x00200000 #define WT_VERB_VERSION 0x00400000 #define WT_VERB_WRITE 0x00800000 +#define WT_VISIBILITY_ERR 0x00000008 /* * flags section: END * DO NOT EDIT: automatically built by dist/flags.py. diff --git a/src/include/hardware.h b/src/include/hardware.h index c9b72f8a609..32353072c5b 100644 --- a/src/include/hardware.h +++ b/src/include/hardware.h @@ -50,6 +50,16 @@ &(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \ } while (0) +#define F_CAS_ATOMIC_WAIT(p, mask) do { \ + int __ret; \ + for (;;) { \ + F_CAS_ATOMIC(p, mask, __ret); \ + if (__ret == 0) \ + break; \ + __wt_yield(); \ + } \ +} while (0) + #define F_CLR_ATOMIC(p, mask) do { \ uint8_t __orig; \ do { \ diff --git a/src/include/meta.h b/src/include/meta.h index 66547262417..a5a303f1630 100644 --- a/src/include/meta.h +++ b/src/include/meta.h @@ -21,7 +21,9 @@ #define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */ #define WT_METADATA_URI "metadata:" /* Metadata alias */ -#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata file URI */ +#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata table URI */ + +#define WT_LAS_URI "file:WiredTigerLAS.wt" /* Lookaside table URI*/ /* * Pre computed hash for the metadata file. Used to optimize comparisons diff --git a/src/include/serial.i b/src/include/serial.i index 7b62e66eccb..78ec968953b 100644 --- a/src/include/serial.i +++ b/src/include/serial.i @@ -123,7 +123,7 @@ __col_append_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, * If the application didn't specify a record number, allocate a new one * and set up for an append. */ - if ((recno = WT_INSERT_RECNO(new_ins)) == 0) { + if ((recno = WT_INSERT_RECNO(new_ins)) == WT_RECNO_OOB) { recno = WT_INSERT_RECNO(new_ins) = btree->last_recno + 1; WT_ASSERT(session, WT_SKIP_LAST(ins_head) == NULL || recno > WT_INSERT_RECNO(WT_SKIP_LAST(ins_head))); @@ -292,20 +292,20 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, __wt_page_modify_set(session, page); /* - * If there are subsequent WT_UPDATE structures, we're evicting pages - * and the page-scanning mutex isn't held, discard obsolete WT_UPDATE - * structures. Serialization is needed so only one thread does the - * obsolete check at a time, and to protect updates from disappearing - * under reconciliation. + * If there are subsequent obsolete WT_UPDATE structures, discard them. + * Serialization is needed because reconciliation reads the update list, + * and obsolete updates cannot be discarded while reconciliation is in + * progress. Serialization is also needed so only one thread does the + * obsolete check at a time. */ if (upd->next != NULL && __wt_txn_visible_all(session, page->modify->obsolete_check_txn)) { - F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); + F_CAS_ATOMIC(page, WT_PAGE_RECONCILIATION, ret); /* If we can't lock it, don't scan, that's okay. 
*/ if (ret != 0) return (0); obsolete = __wt_update_obsolete_check(session, page, upd->next); - F_CLR_ATOMIC(page, WT_PAGE_SCANNING); + F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); if (obsolete != NULL) { page->modify->obsolete_check_txn = WT_TXN_NONE; __wt_update_obsolete_free(session, page, obsolete); diff --git a/src/include/session.h b/src/include/session.h index c6c246954f7..a691794fd46 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -76,6 +76,11 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */ WT_COMPACT *compact; /* Compact state */ + /* + * Lookaside table cursor, sweep and eviction worker threads only. + */ + WT_CURSOR *las_cursor; /* Lookaside table cursor */ + WT_DATA_HANDLE *meta_dhandle; /* Metadata file */ void *meta_track; /* Metadata operation tracking */ void *meta_track_next; /* Current position */ diff --git a/src/include/stat.h b/src/include/stat.h index 39e6fa063dc..9252e86ed8c 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -276,11 +276,16 @@ struct __wt_connection_stats { int64_t cache_eviction_walk; int64_t cache_eviction_worker_evicting; int64_t cache_inmem_split; + int64_t cache_lookaside_insert; + int64_t cache_lookaside_remove; int64_t cache_overhead; int64_t cache_pages_dirty; int64_t cache_pages_inuse; int64_t cache_read; + int64_t cache_read_lookaside; int64_t cache_write; + int64_t cache_write_lookaside; + int64_t cache_write_restore; int64_t cond_wait; int64_t cursor_create; int64_t cursor_insert; @@ -426,8 +431,11 @@ struct __wt_dsrc_stats { int64_t cache_inmem_split; int64_t cache_overflow_value; int64_t cache_read; + int64_t cache_read_lookaside; int64_t cache_read_overflow; int64_t cache_write; + int64_t cache_write_lookaside; + int64_t cache_write_restore; int64_t compress_raw_fail; int64_t compress_raw_fail_temporary; int64_t compress_raw_ok; diff --git a/src/include/txn.h b/src/include/txn.h index 0e7be1be6bc..4a325c70a95 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -78,9 +78,8 @@ struct __wt_txn_global { }; typedef enum __wt_txn_isolation { - WT_ISO_EVICTION, /* Internal: eviction context */ - WT_ISO_READ_UNCOMMITTED, WT_ISO_READ_COMMITTED, + WT_ISO_READ_UNCOMMITTED, WT_ISO_SNAPSHOT } WT_TXN_ISOLATION; diff --git a/src/include/txn.i b/src/include/txn.i index 1228893871f..6fe35c38850 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -139,6 +139,16 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) return (checkpoint_pinned); } +/* + * __wt_txn_committed -- + * Return if a transaction has been committed. + */ +static inline int +__wt_txn_committed(WT_SESSION_IMPL *session, uint64_t id) +{ + return (WT_TXNID_LT(id, S2C(session)->txn_global.last_running) ? 1 : 0); +} + /* * __wt_txn_visible_all -- * Check if a given transaction ID is "globally visible". This is, if @@ -175,13 +185,6 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) if (id == WT_TXN_ABORTED) return (0); - /* - * Eviction only sees globally visible updates, or if there is a - * checkpoint transaction running, use its transaction. - */ - if (txn->isolation == WT_ISO_EVICTION) - return (__wt_txn_visible_all(session, id)); - /* * Read-uncommitted transactions see all other changes. 
* diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 8d199a33536..33f35bfc62d 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -3647,198 +3647,208 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1047 /*! cache: in-memory page splits */ #define WT_STAT_CONN_CACHE_INMEM_SPLIT 1048 +/*! cache: lookaside table insert calls */ +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1049 +/*! cache: lookaside table remove calls */ +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1050 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1049 +#define WT_STAT_CONN_CACHE_OVERHEAD 1051 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1050 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1052 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1051 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1053 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1052 +#define WT_STAT_CONN_CACHE_READ 1054 +/*! cache: pages read into cache requiring lookaside entries */ +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1055 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1053 +#define WT_STAT_CONN_CACHE_WRITE 1056 +/*! cache: page written requiring lookaside records */ +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1057 +/*! cache: pages written requiring in-memory restoration */ +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1058 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1054 +#define WT_STAT_CONN_COND_WAIT 1059 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1055 +#define WT_STAT_CONN_CURSOR_CREATE 1060 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1056 +#define WT_STAT_CONN_CURSOR_INSERT 1061 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1057 +#define WT_STAT_CONN_CURSOR_NEXT 1062 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1058 +#define WT_STAT_CONN_CURSOR_PREV 1063 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1059 +#define WT_STAT_CONN_CURSOR_REMOVE 1064 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1060 +#define WT_STAT_CONN_CURSOR_RESET 1065 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1061 +#define WT_STAT_CONN_CURSOR_RESTART 1066 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1062 +#define WT_STAT_CONN_CURSOR_SEARCH 1067 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1063 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1068 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1064 +#define WT_STAT_CONN_CURSOR_UPDATE 1069 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1065 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1070 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1066 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1071 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1067 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1072 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1068 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1073 /*! 
data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1069 +#define WT_STAT_CONN_DH_SWEEP_REF 1074 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1070 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1075 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1071 +#define WT_STAT_CONN_DH_SWEEP_TOD 1076 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1072 +#define WT_STAT_CONN_DH_SWEEPS 1077 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1073 +#define WT_STAT_CONN_FILE_OPEN 1078 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1074 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1079 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1075 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1080 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1076 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1081 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1077 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1082 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1078 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1083 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1079 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1084 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1080 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1085 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1081 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1086 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1082 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1087 /*! log: log records written directly */ -#define WT_STAT_CONN_LOG_DIRECT_WRITES 1083 +#define WT_STAT_CONN_LOG_DIRECT_WRITES 1088 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1084 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1089 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1085 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1090 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1086 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1091 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1087 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1092 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1088 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1093 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1089 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1094 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1090 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1095 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1091 +#define WT_STAT_CONN_LOG_SCANS 1096 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1092 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1097 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1093 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1098 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1094 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1099 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1095 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1100 /*! 
log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1096 +#define WT_STAT_CONN_LOG_SLOT_RACES 1101 /*! log: record size exceeded maximum */ -#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1097 +#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1102 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1098 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1103 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1099 +#define WT_STAT_CONN_LOG_SYNC 1104 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1100 +#define WT_STAT_CONN_LOG_SYNC_DIR 1105 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1101 +#define WT_STAT_CONN_LOG_WRITE_LSN 1106 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1102 +#define WT_STAT_CONN_LOG_WRITES 1107 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1103 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1108 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1104 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1109 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1105 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1110 /*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1106 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1111 /*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1107 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1112 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1108 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1113 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1109 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1114 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1110 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1115 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1111 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1116 /*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1112 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1117 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1113 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1118 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1114 +#define WT_STAT_CONN_MEMORY_FREE 1119 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1115 +#define WT_STAT_CONN_MEMORY_GROW 1120 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1116 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1121 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1117 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1122 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1118 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1123 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1119 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1124 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1120 +#define WT_STAT_CONN_PAGE_SLEEP 1125 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1121 +#define WT_STAT_CONN_READ_IO 1126 /*! 
reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1122 +#define WT_STAT_CONN_REC_PAGES 1127 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1123 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1128 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1124 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1129 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1125 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1130 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1126 +#define WT_STAT_CONN_RWLOCK_READ 1131 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1127 +#define WT_STAT_CONN_RWLOCK_WRITE 1132 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1128 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1133 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1129 +#define WT_STAT_CONN_SESSION_OPEN 1134 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1130 +#define WT_STAT_CONN_TXN_BEGIN 1135 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1131 +#define WT_STAT_CONN_TXN_CHECKPOINT 1136 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1132 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1137 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1133 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1138 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1134 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1139 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1135 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1140 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1136 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1141 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1137 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1142 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1138 +#define WT_STAT_CONN_TXN_COMMIT 1143 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1139 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1144 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1140 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1145 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1141 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1146 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1142 +#define WT_STAT_CONN_TXN_ROLLBACK 1147 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1143 +#define WT_STAT_CONN_TXN_SYNC 1148 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1144 +#define WT_STAT_CONN_WRITE_IO 1149 /*! * @} @@ -3940,96 +3950,102 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2045 /*! cache: pages read into cache */ #define WT_STAT_DSRC_CACHE_READ 2046 +/*! 
cache: pages read into cache requiring lookaside entries */ +#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2047 /*! cache: overflow pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2047 +#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2048 /*! cache: pages written from cache */ -#define WT_STAT_DSRC_CACHE_WRITE 2048 +#define WT_STAT_DSRC_CACHE_WRITE 2049 +/*! cache: page written requiring lookaside records */ +#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2050 +/*! cache: pages written requiring in-memory restoration */ +#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2051 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2049 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2052 /*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2050 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2053 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2051 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2054 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2052 +#define WT_STAT_DSRC_COMPRESS_READ 2055 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2053 +#define WT_STAT_DSRC_COMPRESS_WRITE 2056 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2054 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2057 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2055 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2058 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2056 +#define WT_STAT_DSRC_CURSOR_CREATE 2059 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2057 +#define WT_STAT_DSRC_CURSOR_INSERT 2060 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2058 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2061 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2059 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2062 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2060 +#define WT_STAT_DSRC_CURSOR_NEXT 2063 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2061 +#define WT_STAT_DSRC_CURSOR_PREV 2064 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2062 +#define WT_STAT_DSRC_CURSOR_REMOVE 2065 /*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2063 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2066 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2064 +#define WT_STAT_DSRC_CURSOR_RESET 2067 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2065 +#define WT_STAT_DSRC_CURSOR_RESTART 2068 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2066 +#define WT_STAT_DSRC_CURSOR_SEARCH 2069 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2067 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2070 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2068 +#define WT_STAT_DSRC_CURSOR_UPDATE 2071 /*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2069 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2072 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2070 +#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2073 /*! 
LSM: chunks in the LSM tree */ -#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2071 +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2074 /*! LSM: highest merge generation in the LSM tree */ -#define WT_STAT_DSRC_LSM_GENERATION_MAX 2072 +#define WT_STAT_DSRC_LSM_GENERATION_MAX 2075 /*! LSM: queries that could have benefited from a Bloom filter that did * not exist */ -#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2073 +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2076 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2074 +#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2077 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2075 +#define WT_STAT_DSRC_REC_DICTIONARY 2078 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2076 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2079 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2077 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2080 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2078 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2081 /*! reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2079 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2082 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2080 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2083 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2081 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2084 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2082 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2085 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2083 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2086 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2084 +#define WT_STAT_DSRC_REC_PAGES 2087 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2085 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2088 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2086 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2089 /*! reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2087 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2090 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2088 +#define WT_STAT_DSRC_SESSION_COMPACT 2091 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2089 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2092 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2090 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2093 /*! 
@} */ /* * Statistics section: END diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index 73a06f784e9..4ae12c594db 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -245,6 +245,8 @@ struct __wt_rwlock; typedef struct __wt_rwlock WT_RWLOCK; struct __wt_salvage_cookie; typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE; +struct __wt_save_upd; + typedef struct __wt_save_upd WT_SAVE_UPD; struct __wt_scratch_track; typedef struct __wt_scratch_track WT_SCRATCH_TRACK; struct __wt_session_impl; @@ -265,8 +267,6 @@ struct __wt_txn_op; typedef struct __wt_txn_op WT_TXN_OP; struct __wt_txn_state; typedef struct __wt_txn_state WT_TXN_STATE; -struct __wt_upd_skipped; - typedef struct __wt_upd_skipped WT_UPD_SKIPPED; struct __wt_update; typedef struct __wt_update WT_UPDATE; union __wt_rand_state; diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index 40991f845e4..01a61359949 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -512,7 +512,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) * Don't block if the cache is full: our next unit of work may be to * discard some trees to free space. */ - F_SET(session, WT_SESSION_NO_CACHE_CHECK); + F_SET(session, WT_SESSION_NO_EVICTION); if (create_bloom) { if (ret == 0) @@ -632,6 +632,6 @@ err: if (locked) "Merge failed with %s", __wt_strerror(session, ret, NULL, 0))); } - F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); + F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); } diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index f34f0598261..46db76e099c 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -1144,7 +1144,7 @@ __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * Diagnostic: avoid deadlocks with the schema lock: if we need it for * an operation, we should already have it. */ - F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); return (0); } @@ -1157,7 +1157,7 @@ __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; - F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); if ((ret = __wt_readunlock(session, lsm_tree->rwlock)) != 0) WT_PANIC_RET(session, ret, "Unlocking an LSM tree"); @@ -1177,7 +1177,7 @@ __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * Diagnostic: avoid deadlocks with the schema lock: if we need it for * an operation, we should already have it. */ - F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); return (0); } @@ -1190,7 +1190,7 @@ __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; - F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); if ((ret = __wt_writeunlock(session, lsm_tree->rwlock)) != 0) WT_PANIC_RET(session, ret, "Unlocking an LSM tree"); diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index 0c36c68e9f5..8eba0127b8b 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -301,17 +301,19 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, * Flush the file before checkpointing: this is the expensive part in * terms of I/O. 
* - * Use the special eviction isolation level to avoid interfering with - * an application checkpoint: we have already checked that all of the - * updates in this chunk are globally visible. - * - * !!! We can wait here for checkpoints and fsyncs to complete, which - * can be a long time. + * !!! + * We can wait here for checkpoints and fsyncs to complete, which can + * take a long time. */ if ((ret = __wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)) == 0) { + /* + * Set read-uncommitted: we have already checked that all of the + * updates in this chunk are globally visible, use the cheapest + * possible check in reconciliation. + */ saved_isolation = session->txn.isolation; - session->txn.isolation = WT_ISO_EVICTION; + session->txn.isolation = WT_ISO_READ_UNCOMMITTED; ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); @@ -412,7 +414,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, * ourselves to get stuck creating bloom filters, the entire tree * can stall since there may be no worker threads available to flush. */ - F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); + F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { WT_ERR(src->get_key(src, &key)); WT_ERR(__wt_bloom_insert(bloom, &key)); @@ -446,7 +448,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, err: if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); - F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); + F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); } diff --git a/src/meta/meta_apply.c b/src/meta/meta_apply.c index 6d08ce3aa6a..315621f2ae9 100644 --- a/src/meta/meta_apply.c +++ b/src/meta/meta_apply.c @@ -32,7 +32,7 @@ __wt_meta_btree_apply(WT_SESSION_IMPL *session, WT_ERR(cursor->get_key(cursor, &uri)); if (!WT_PREFIX_MATCH(uri, "file:")) break; - else if (strcmp(uri, WT_METAFILE_URI) == 0) + if (strcmp(uri, WT_METAFILE_URI) == 0) continue; /* diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c index 227d0fa9a6c..1792d722939 100644 --- a/src/meta/meta_table.c +++ b/src/meta/meta_table.c @@ -37,6 +37,8 @@ __metadata_turtle(const char *key) int __wt_metadata_open(WT_SESSION_IMPL *session) { + WT_BTREE *btree; + if (session->meta_dhandle != NULL) return (0); @@ -45,7 +47,24 @@ __wt_metadata_open(WT_SESSION_IMPL *session) session->meta_dhandle = session->dhandle; WT_ASSERT(session, session->meta_dhandle != NULL); - /* The meta_dhandle doesn't need to stay locked -- release it. */ + /* + * Set special flags for the metadata file: eviction (the metadata file + * is in-memory and never evicted), logging (the metadata file is always + * logged if possible). + * + * Test flags before setting them so updates can't race in subsequent + * opens (the first update is safe because it's single-threaded from + * wiredtiger_open). + */ + btree = S2BT(session); + if (!F_ISSET(btree, WT_BTREE_IN_MEMORY)) + F_SET(btree, WT_BTREE_IN_MEMORY); + if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) + F_SET(btree, WT_BTREE_NO_EVICTION); + if (F_ISSET(btree, WT_BTREE_NO_LOGGING)) + F_CLR(btree, WT_BTREE_NO_LOGGING); + + /* The metadata handle doesn't need to stay locked -- release it. 
*/ return (__wt_session_release_btree(session)); } @@ -59,9 +78,9 @@ __wt_metadata_cursor( { WT_DATA_HANDLE *saved_dhandle; WT_DECL_RET; + int is_dead; const char *cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), config, NULL }; - int is_dead; saved_dhandle = session->dhandle; WT_ERR(__wt_metadata_open(session)); diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index 8622bb5b4ca..ef4662aa369 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -213,6 +213,8 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp) fh = *fhp; *fhp = NULL; + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: close", fh->name)); + __wt_spin_lock(session, &conn->fh_lock); if (fh == NULL || fh->ref == 0 || --fh->ref > 0) { __wt_spin_unlock(session, &conn->fh_lock); diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 6b0ca54065e..74bbcc1853a 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -27,18 +27,41 @@ typedef struct { WT_ITEM dsk; /* Temporary disk-image buffer */ - /* Track whether all changes to the page are written. */ + /* + * Track start/stop write generation to decide if all changes to the + * page are written. + */ + uint32_t orig_write_gen; + + /* + * Track maximum transaction ID seen and first unwritten transaction ID. + */ uint64_t max_txn; uint64_t first_dirty_txn; - uint32_t orig_write_gen; /* - * If page updates are skipped because they are as yet unresolved, or - * the page has updates we cannot discard, the page is left "dirty": - * the page cannot be discarded and a subsequent reconciliation will - * be necessary to discard the page. + * When we can't mark the page clean (for example, checkpoint found some + * uncommitted updates), there's a leave-dirty flag. + */ + int leave_dirty; + + /* + * When evicting pages having uncommitted updates or committed updates + * that are not yet globally visible, there are two ways forward: first, + * uncommitted updates can be saved/restored, that is, evict most of the + * page and create a new, smaller page in which we re-instantiate the + * uncommitted updates. If there are no uncommitted updates, but are + * updates not yet visible to all readers in the system, we can write + * those updates into a database side store, restoring them on demand + * if/when the page is read back into memory. + * + * Both are configured from outside of reconciliation: saving/restoring + * updates is the WT_EVICT_UPDATE_RESTORE flag. Writing not-yet-visible + * updates is the WT_EVICT_LOOKASIDE flag. Both may be set, in which + * case the decision is made once we determine if there are uncommitted + * updates on the page. */ - int leave_dirty; + int evict_skipped_updates; /* * Raw compression (don't get me started, as if normal reconciliation @@ -153,18 +176,12 @@ typedef struct { void *dsk; /* Split's disk image */ /* - * When busy pages get large, we need to be able to evict them - * even when they contain unresolved updates, or updates which - * cannot be evicted because of running transactions. In such - * cases, break the page into multiple blocks, write the blocks - * that can be evicted, saving lists of updates for blocks that - * cannot be evicted, then re-instantiate the blocks that cannot - * be evicted as new, in-memory pages, restoring the updates on - * those pages. + * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and + * WT_EVICT_LOOKASIDE configurations. 
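/*
 * A minimal standalone sketch (not WiredTiger source; the ex_ names are
 * invented for illustration) of how the two eviction configurations described
 * above map onto what reconciliation finds on the page: any uncommitted
 * update forces the save/restore path, otherwise updates that are merely not
 * yet globally visible can go to the lookaside table; if the required flag
 * was not passed in, eviction gives up.
 */
#include <stdbool.h>

#define EX_EVICT_LOOKASIDE	0x1	/* stands in for WT_EVICT_LOOKASIDE */
#define EX_EVICT_UPDATE_RESTORE	0x2	/* stands in for WT_EVICT_UPDATE_RESTORE */
#define EX_EBUSY		16	/* eviction must fail and retry later */

static int
ex_choose_eviction_path(unsigned int flags, bool uncommitted_updates)
{
	if (uncommitted_updates)
		return ((flags & EX_EVICT_UPDATE_RESTORE) ? 0 : EX_EBUSY);
	return ((flags & EX_EVICT_LOOKASIDE) ? 0 : EX_EBUSY);
}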
*/ - WT_UPD_SKIPPED *skip; /* Skipped updates */ - uint32_t skip_next; - size_t skip_allocated; + WT_SAVE_UPD *supd; /* Saved updates */ + uint32_t supd_next; + size_t supd_allocated; /* * The key for a row-store page; no column-store key is needed @@ -220,12 +237,12 @@ typedef struct { size_t space_avail; /* Remaining space in this chunk */ /* - * While reviewing updates for each page, we store skipped updates here, + * While reviewing updates for each page, we save WT_UPDATE lists here, * and then move them to per-block areas as the blocks are defined. */ - WT_UPD_SKIPPED *skip; /* Skipped updates */ - uint32_t skip_next; - size_t skip_allocated; + WT_SAVE_UPD *supd; /* Saved updates */ + uint32_t supd_next; + size_t supd_allocated; /* * We don't need to keep the 0th key around on internal pages, the @@ -277,6 +294,9 @@ typedef struct { WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */ + int cache_write_lookaside; /* Used the lookaside table */ + int cache_write_restore; /* Used update/restoration */ + uint32_t tested_ref_state; /* Debugging information */ } WT_RECONCILE; @@ -318,8 +338,11 @@ static int __rec_split_row_promote( WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t); static int __rec_split_write(WT_SESSION_IMPL *, WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int); +static int __rec_update_las( + WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_BOUNDARY *); static int __rec_write_init(WT_SESSION_IMPL *, WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *); +static int __rec_write_status(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_write_wrapup_err( WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); @@ -338,31 +361,19 @@ int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags) { - WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_RECONCILE *r; - int page_lock, scan_lock, split_lock; - conn = S2C(session); page = ref->page; mod = page->modify; - page_lock = scan_lock = split_lock = 0; - - /* We're shouldn't get called with a clean page, that's an error. */ - if (!__wt_page_is_modified(page)) - WT_RET_MSG(session, WT_ERROR, - "Attempt to reconcile a clean page."); WT_RET(__wt_verbose(session, WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type))); - WT_STAT_FAST_CONN_INCR(session, rec_pages); - WT_STAT_FAST_DATA_INCR(session, rec_pages); - if (LF_ISSET(WT_EVICTING)) { - WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction); - WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction); - } + + /* We shouldn't get called with a clean page, that's an error. */ + WT_ASSERT(session, __wt_page_is_modified(page)); #ifdef HAVE_DIAGNOSTIC { @@ -386,39 +397,15 @@ __wt_reconcile(WT_SESSION_IMPL *session, r = session->reconcile; /* - * The compaction process looks at the page's modification information; - * if compaction is running, acquire the page's lock. + * Reconciliation locks the page for three reasons: + * Reconciliation reads the lists of page updates, obsolete updates + * cannot be discarded while reconciliation is in progress; + * The compaction process reads page modification information, which + * reconciliation modifies; + * In-memory splits: reconciliation of an internal page cannot handle + * a child page splitting during the reconciliation. 
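/*
 * A minimal standalone sketch (not WiredTiger source; the ex_ names are
 * invented, and C11 atomics plus sched_yield stand in for the
 * F_CAS_ATOMIC_WAIT and F_CLR_ATOMIC macros) of acquiring a single page flag
 * bit by compare-and-swap and yielding until it is free, the pattern the old
 * per-purpose locks used and which the single WT_PAGE_RECONCILIATION flag
 * presumably reuses.
 */
#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>

#define EX_PAGE_RECONCILIATION	0x01u	/* stands in for WT_PAGE_RECONCILIATION */

static void
ex_page_reconciliation_lock(_Atomic uint32_t *flagsp)
{
	uint32_t old;

	for (;;) {
		old = atomic_load(flagsp);
		if (!(old & EX_PAGE_RECONCILIATION) &&
		    atomic_compare_exchange_weak(
		    flagsp, &old, old | EX_PAGE_RECONCILIATION))
			return;
		(void)sched_yield();	/* equivalent of __wt_yield() */
	}
}

static void
ex_page_reconciliation_unlock(_Atomic uint32_t *flagsp)
{
	(void)atomic_fetch_and(flagsp, (uint32_t)~EX_PAGE_RECONCILIATION);
}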
*/ - if (conn->compact_in_memory_pass) { - WT_PAGE_LOCK(session, page); - page_lock = 1; - } - - /* - * Reconciliation reads the lists of updates, so obsolete updates cannot - * be discarded while reconciliation is in progress. - */ - for (;;) { - F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); - if (ret == 0) - break; - __wt_yield(); - } - scan_lock = 1; - - /* - * Mark internal pages as splitting to ensure we don't deadlock when - * performing an in-memory split during a checkpoint. - */ - if (WT_PAGE_IS_INTERNAL(page)) { - for (;;) { - F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret); - if (ret == 0) - break; - __wt_yield(); - } - split_lock = 1; - } + F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); /* Reconcile the page. */ switch (page->type) { @@ -445,19 +432,34 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_ILLEGAL_VALUE_SET(session); } + /* Get the final status for the reconciliation. */ + if (ret == 0) + ret = __rec_write_status(session, r, page); + /* Wrap up the page reconciliation. */ if (ret == 0) ret = __rec_write_wrapup(session, r, page); else WT_TRET(__rec_write_wrapup_err(session, r, page)); - /* Release the locks we're holding. */ - if (split_lock) - F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED); - if (scan_lock) - F_CLR_ATOMIC(page, WT_PAGE_SCANNING); - if (page_lock) - WT_PAGE_UNLOCK(session, page); + /* Release the reconciliation lock. */ + F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); + + /* Update statistics. */ + WT_STAT_FAST_CONN_INCR(session, rec_pages); + WT_STAT_FAST_DATA_INCR(session, rec_pages); + if (LF_ISSET(WT_EVICTING)) { + WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction); + WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction); + } + if (r->cache_write_lookaside) { + WT_STAT_FAST_CONN_INCR(session, cache_write_lookaside); + WT_STAT_FAST_DATA_INCR(session, cache_write_lookaside); + } + if (r->cache_write_restore) { + WT_STAT_FAST_CONN_INCR(session, cache_write_restore); + WT_STAT_FAST_DATA_INCR(session, cache_write_restore); + } /* * Clean up the boundary structures: some workloads result in millions @@ -488,6 +490,81 @@ __wt_reconcile(WT_SESSION_IMPL *session, return (__wt_page_parent_modify_set(session, ref, 1)); } +/* + * __rec_write_status -- + * Return the final status for reconciliation. + */ +static int +__rec_write_status(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_PAGE_MODIFY *mod; + + btree = S2BT(session); + mod = page->modify; + + /* + * Return based on how we were called (eviction or checkpoint) and if + * we cleaned the page. + */ + if (r->leave_dirty) { + /* + * Update the page's first unwritten transaction ID. + */ + mod->first_dirty_txn = r->first_dirty_txn; + + /* + * The page remains dirty. + * + * Any checkpoint call cleared the tree's modified flag before + * writing pages, so we must explicitly reset it. We insert a + * barrier after the change for clarity (the requirement is the + * flag be set before a subsequent checkpoint reads it, and + * as the current checkpoint is waiting on this reconciliation + * to complete, there's no risk of that happening) + */ + btree->modified = 1; + WT_FULL_BARRIER(); + + /* If evicting, we've failed. */ + if (F_ISSET(r, WT_EVICTING)) + return (EBUSY); + } else { + /* + * Track the page's maximum transaction ID (used to decide if + * we're likely to be able to evict this page in the future). + */ + mod->rec_max_txn = r->max_txn; + + /* + * Track the tree's maximum transaction ID (used to decide if + * it's safe to discard the tree). 
Reconciliation for eviction + * is multi-threaded, only update the tree's maximum transaction + * ID when doing a checkpoint. That's sufficient, we only care + * about the maximum transaction ID of current updates in the + * tree, and checkpoint visits every dirty page in the tree. + */ + if (!F_ISSET(r, WT_EVICTING) && + WT_TXNID_LT(btree->rec_max_txn, r->max_txn)) + btree->rec_max_txn = r->max_txn; + + /* + * The page only might be clean; if the write generation is + * unchanged since reconciliation started, it's clean. + * + * If the write generation changed, the page has been written + * since reconciliation started and remains dirty (that can't + * happen when evicting, the page is exclusively locked). + */ + if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0)) + __wt_cache_dirty_decr(session, page); + else + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + } + + return (0); +} + /* * __rec_root_write -- * Handle the write of a root page. @@ -648,14 +725,49 @@ __rec_write_init(WT_SESSION_IMPL *session, F_SET(&r->dsk, WT_ITEM_ALIGNED); } + /* Reconciliation is not re-entrant, make sure that doesn't happen. */ + WT_ASSERT(session, r->ref == NULL); + /* Remember the configuration. */ r->ref = ref; r->page = page; + + /* + * Lookaside table eviction is configured when eviction gets aggressive, + * adjust the flags for cases we don't support. + */ + if (LF_ISSET(WT_EVICT_LOOKASIDE)) { + /* + * Saving lookaside table updates into the lookaside table won't + * work. + */ + if (F_ISSET(btree, WT_BTREE_LOOKASIDE)) + LF_CLR(WT_EVICT_LOOKASIDE); + + /* + * We don't yet support fixed-length column-store combined with + * the lookaside table. It's not hard to do, but the underlying + * function that reviews which updates can be written to the + * evicted page and which updates need to be written to the + * lookaside table needs access to the original value from the + * page being evicted, and there's no code path for that in the + * case of fixed-length column-store objects. (Row-store and + * variable-width column-store objects provide a reference to + * the unpacked on-page cell for this purpose, but there isn't + * an on-page cell for fixed-length column-store objects.) For + * now, turn it off. + */ + if (page->type == WT_PAGE_COL_FIX) + LF_CLR(WT_EVICT_LOOKASIDE); + } r->flags = flags; /* Track if the page can be marked clean. */ r->leave_dirty = 0; + /* Track if saving/restoring updates. */ + r->evict_skipped_updates = 0; + /* Raw compression. */ r->raw_compression = __rec_raw_compression_config(session, page, salvage); @@ -668,8 +780,8 @@ __rec_write_init(WT_SESSION_IMPL *session, r->all_empty_value = 1; r->any_empty_value = 0; - /* The list of cached, skipped updates. */ - r->skip_next = 0; + /* The list of saved updates. */ + r->supd_next = 0; /* * Dictionary compression only writes repeated values once. We grow @@ -748,7 +860,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) __rec_bnd_cleanup(session, r, 1); - __wt_free(session, r->skip); + __wt_free(session, r->supd); __wt_buf_free(session, &r->k.buf); __wt_buf_free(session, &r->v.buf); @@ -784,6 +896,9 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy) if (r->bnd == NULL) return; + /* Reconciliation is not re-entrant, make sure that doesn't happen. */ + r->ref = NULL; + /* * Free the boundary structures' memory. 
In the case of normal cleanup, * discard any memory we won't reuse in the next reconciliation; in the @@ -799,7 +914,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy) for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) { __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->dsk); - __wt_free(session, bnd->skip); + __wt_free(session, bnd->supd); __wt_buf_free(session, &bnd->key); } __wt_free(session, r->bnd); @@ -820,66 +935,84 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy) for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) { __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->dsk); - __wt_free(session, bnd->skip); + __wt_free(session, bnd->supd); } } } /* - * __rec_skip_update_save -- - * Save a skipped WT_UPDATE list for later restoration. + * __rec_block_free -- + * Helper function to free a block. + */ +static int +__rec_block_free( + WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) +{ + WT_BM *bm; + WT_BTREE *btree; + + btree = S2BT(session); + bm = btree->bm; + + return (bm->free(bm, session, addr, addr_size)); +} + +/* + * __rec_update_save -- + * Save a WT_UPDATE list for later restoration. */ static int -__rec_skip_update_save( - WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip) +__rec_update_save(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, uint64_t txnid) { WT_RET(__wt_realloc_def( - session, &r->skip_allocated, r->skip_next + 1, &r->skip)); - r->skip[r->skip_next].ins = ins; - r->skip[r->skip_next].rip = rip; - ++r->skip_next; + session, &r->supd_allocated, r->supd_next + 1, &r->supd)); + r->supd[r->supd_next].ins = ins; + r->supd[r->supd_next].rip = rip; + r->supd[r->supd_next].onpage_txn = txnid; + ++r->supd_next; return (0); } /* - * __rec_skip_update_move -- - * Move a skipped WT_UPDATE list from the per-page cache to a specific + * __rec_update_move -- + * Move a saved WT_UPDATE list from the per-page cache to a specific * block's list. */ static int -__rec_skip_update_move( - WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip) +__rec_update_move(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_SAVE_UPD *supd) { WT_RET(__wt_realloc_def( - session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip)); - bnd->skip[bnd->skip_next] = *skip; - ++bnd->skip_next; + session, &bnd->supd_allocated, bnd->supd_next + 1, &bnd->supd)); + bnd->supd[bnd->supd_next] = *supd; + ++bnd->supd_next; - skip->ins = NULL; - skip->rip = NULL; + supd->ins = NULL; + supd->rip = NULL; return (0); } /* * __rec_txn_read -- - * Return the first visible update in a list (or NULL if none are visible), - * set a flag if any updates were skipped, track the maximum transaction ID on - * the page. + * Return the update in a list that should be written (or NULL if none can + * be written). 
*/ -static inline int +static int __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) { + WT_BTREE *btree; WT_DECL_RET; - WT_ITEM ovfl; + WT_DECL_ITEM(tmp); WT_PAGE *page; - WT_UPDATE *upd, *upd_list, *upd_ovfl; + WT_UPDATE *append, *upd, *upd_list; size_t notused; uint64_t max_txn, min_txn, txnid; - int skipped; + int append_origv, skipped; *updp = NULL; + btree = S2BT(session); page = r->page; /* @@ -893,13 +1026,16 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } else upd_list = ins->upd; - skipped = 0; - for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list; - upd != NULL; upd = upd->next) { + for (skipped = 0, + max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, + upd = upd_list; upd != NULL; upd = upd->next) { if ((txnid = upd->txnid) == WT_TXN_ABORTED) continue; - /* Track the largest/smallest transaction IDs on the list. */ + /* + * Track the largest/smallest transaction IDs on the list and + * the smallest not-globally-visible transaction on the page. + */ if (WT_TXNID_LT(max_txn, txnid)) max_txn = txnid; if (WT_TXNID_LT(txnid, min_txn)) @@ -909,132 +1045,230 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, r->first_dirty_txn = txnid; /* - * Record whether any updates were skipped on the way to finding - * the first visible update. - * - * If updates were skipped before the one being written, future - * reads without intervening modifications to the page could - * see a different value; if no updates were skipped, the page - * can safely be marked clean and does not need to be - * reconciled until modified again. + * Find the first update we can use. */ - if (*updp == NULL) { - if (__wt_txn_visible(session, txnid)) - *updp = upd; - else + if (F_ISSET(r, WT_EVICTING)) { + /* + * Eviction can write any committed update. + * + * When reconciling for eviction, track whether any + * uncommitted updates are found. + */ + if (__wt_txn_committed(session, txnid)) { + if (*updp == NULL) + *updp = upd; + } else skipped = 1; + } else { + /* + * Checkpoint can only write updates visible as of its + * snapshot. + * + * When reconciling for a checkpoint, track whether any + * updates were skipped on the way to finding the first + * visible update. + */ + if (*updp == NULL) { + if (__wt_txn_visible(session, txnid)) + *updp = upd; + else + skipped = 1; + } } } /* * Track the maximum transaction ID in the page. We store this in the - * page at the end of reconciliation if no updates are skipped, it's - * used to avoid evicting clean pages from memory with changes required - * to satisfy a snapshot read. + * tree at the end of reconciliation in the service of checkpoints, it + * is used to avoid discarding trees from memory when they have changes + * required to satisfy a snapshot read. */ if (WT_TXNID_LT(r->max_txn, max_txn)) r->max_txn = max_txn; /* - * If no updates were skipped and all updates are globally visible, the - * page can be marked clean and we're done, regardless of whether we're - * evicting or checkpointing. + * If there are no skipped updates and all updates are globally visible, + * the page can be marked clean and we're done, regardless if evicting + * or checkpointing. * * We have to check both: the oldest transaction ID may have moved while - * we were scanning the update list, so it is possible to skip an update - * but then find that by the end of the scan, all updates are stable. 
+ * we were scanning the update list, so it is possible to find a skipped + * update, but then find all updates are stable at the end of the scan. + * + * Skip the visibility check for the lookaside table as a special-case, + * we know there are no older readers of that table. */ - if (!skipped && __wt_txn_visible_all(session, max_txn)) + if (!skipped && + (F_ISSET(btree, WT_BTREE_LOOKASIDE) || + __wt_txn_visible_all(session, max_txn))) return (0); /* - * If some updates are not globally visible, or were skipped, the page - * cannot be marked clean. + * In some cases, there had better not be skipped updates or updates not + * yet globally visible. */ - r->leave_dirty = 1; - - /* If we're not evicting, we're done, we know what we'll write. */ - if (!F_ISSET(r, WT_EVICTING)) - return (0); - - /* In some cases, there had better not be any updates we can't write. */ - if (F_ISSET(r, WT_SKIP_UPDATE_ERR)) + if (F_ISSET(r, WT_VISIBILITY_ERR)) WT_PANIC_RET(session, EINVAL, - "reconciliation illegally skipped an update"); + "reconciliation error, uncommitted update or update not " + "globally visible"); /* - * If evicting and we aren't able to save/restore the not-yet-visible - * updates, the page can't be evicted. + * If not trying to evict the page, we know what we'll write and we're + * done. Because some updates were skipped or are not globally visible, + * the page can't be marked clean. */ - if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE)) - return (EBUSY); + if (!F_ISSET(r, WT_EVICTING)) { + r->leave_dirty = 1; + return (0); + } /* - * Evicting a page with not-yet-visible updates: save and restore the - * list of updates on a newly instantiated page. + * Evicting and there are either uncommitted changes or updates not yet + * globally visible. There are two ways to continue with the eviction, + * based on whether or not there are uncommitted updates. * - * The order of the updates on the list matters so we can't move only - * the unresolved updates, we have to move the entire update list. - * - * Clear the returned update so our caller ignores the key/value pair - * in the case of an insert/append entry (everything we need is in the - * update list), and otherwise writes the original on-page key/value - * pair to which the update list applies. + * First, if there are uncommitted updates, we can evict most of the + * page and create a new, smaller page with just the skipped updates. */ - *updp = NULL; + append_origv = 0; + if (skipped) { + /* + * The save/restore eviction path is only configured if forcibly + * evicting pages (it's intended for large pages that split into + * many smaller pages). If not configured to save/restore the + * updates, fail eviction. + */ + if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) + return (EBUSY); + r->evict_skipped_updates = 1; + + /* + * Clear the returned update so our caller ignores the key/value + * pair in the case of an insert/append list entry (everything + * we need is in the update list), and otherwise writes the + * original on-page key/value pair to which the update list + * applies. + */ + *updp = NULL; + + /* The page can't be marked clean. */ + r->leave_dirty = 1; + + /* + * A special-case for overflow values, where we can't write the + * original on-page value item to disk because it's been updated + * or removed. + * + * What happens is that an overflow value is updated or removed + * and its backing blocks freed. 
If any reader in the system + * might still want the value, a copy was cached in the page + * reconciliation tracking memory, and the page cell set to + * WT_CELL_VALUE_OVFL_RM. Eviction then chose the page and + * we're splitting it up in order to push parts of it out of + * memory. + * + * We could write the original on-page value item to disk... if + * we had a copy. The cache may not have a copy (a globally + * visible update would have kept a value from being cached), or + * an update that subsequently became globally visible could + * cause a cached value to be discarded. Either way, once there + * is a globally visible update, we may not have the original + * value. + * + * Fortunately, if there's a globally visible update we don't + * care about the original version, so we simply ignore it, no + * transaction can ever try and read it. If there isn't a + * globally visible update, there had better be a cached value. + * + * In the latter case, we could write the value out to disk, but + * (1) we are planning on re-instantiating this page in memory, + * it isn't going to disk, and (2) the value item is eventually + * going to be discarded, that seems like a waste of a write. + * Instead, find the cached value and append it to the update + * list we're saving for later restoration. + */ + if (vpack != NULL && + vpack->raw == WT_CELL_VALUE_OVFL_RM && + !__wt_txn_visible_all(session, min_txn)) + append_origv = 1; + } /* - * Handle the case were we don't want to write an original on-page value - * item to disk because it's been updated or removed. - * - * Here's the deal: an overflow value was updated or removed and its - * backing blocks freed. If any transaction in the system might still - * read the value, a copy was cached in page reconciliation tracking - * memory, and the page cell set to WT_CELL_VALUE_OVFL_RM. Eviction - * then chose the page and we're splitting it up in order to push parts - * of it out of memory. - * - * We could write the original on-page value item to disk... if we had - * a copy. The cache may not have a copy (a globally visible update - * would have kept a value from ever being cached), or an update that - * subsequent became globally visible could cause a cached value to be - * discarded. Either way, once there's a globally visible update, we - * may not have the value. - * - * Fortunately, if there's a globally visible update we don't care about - * the original version, so we simply ignore it, no transaction can ever - * try and read it. If there isn't a globally visible update, there had - * better be a cached value. - * - * In the latter case, we could write the value out to disk, but (1) we - * are planning on re-instantiating this page in memory, it isn't going - * to disk, and (2) the value item is eventually going to be discarded, - * that seems like a waste of a write. Instead, find the cached value - * and append it to the update list we're saving for later restoration. + * Second, if we did NOT skip updates, we can save a copy of the update + * list in the lookaside table and proceed with eviction. If/when the + * page is read back into the cache, we'll re-apply the update list (if + * any of the old readers are still around). 
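/*
 * A minimal standalone sketch (not WiredTiger source; the ex_ names are
 * invented for illustration) of when a saved lookaside entry becomes
 * obsolete: each entry carries the transaction ID of the value that was left
 * on the reconciled page (onpage_txn), and once that ID is visible to every
 * possible reader no one can still need the older history, so a sweep can
 * discard the entry.
 */
#include <stdbool.h>
#include <stdint.h>

struct ex_las_entry {
	uint64_t onpage_txn;	/* txn ID of the value written to the page */
};

/* Stand-in for __wt_txn_visible_all(): visible to every reader. */
typedef bool (*ex_visible_all_f)(uint64_t txnid);

static bool
ex_las_entry_obsolete(
    const struct ex_las_entry *entry, ex_visible_all_f visible_all)
{
	return (visible_all(entry->onpage_txn));
}
/*
 * Entries tagged WT_TXN_NONE come from the skipped-update case and take the
 * save/restore path instead, so they never reach the lookaside table.
 */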
*/ - if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM && - !__wt_txn_visible_all(session, min_txn)) { - if ((ret = __wt_ovfl_txnc_search( - page, vpack->data, vpack->size, &ovfl)) != 0) - WT_PANIC_RET(session, ret, - "cached overflow item discarded early"); + if (!skipped) { + /* + * Lookaside table eviction is only configured when eviction is + * getting aggressive. If not configured to write the lookaside + * table, fail eviction. + */ + if (!F_ISSET(r, WT_EVICT_LOOKASIDE)) + return (EBUSY); /* - * Create an update structure with an impossibly low transaction - * ID and append it to the update list we're about to save. - * Restoring that update list when this page is re-instantiated - * creates an update for the key/value pair visible to every - * running transaction in the system, ensuring the on-page value - * will be ignored. + * If at least one update is globally visible, copy the update + * list and ignore the current on-page value. If no update is + * globally visible, readers require the page's original value. + */ + if (!__wt_txn_visible_all(session, min_txn)) + append_origv = 1; + } + + /* + * We need the original on-page value for some reason: get a copy and + * append it to the end of the update list with a transaction ID that + * guarantees its visibility. + */ + if (append_origv) { + /* + * If we don't have a value cell, it's an insert/append list + * key/value pair which simply doesn't exist for some reader; + * place a deleted record at the end of the update list. + */ + if (vpack == NULL || vpack->type == WT_CELL_DEL) + WT_RET(__wt_update_alloc( + session, NULL, &append, ¬used)); + else { + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + if ((ret = __wt_page_cell_data_ref( + session, page, vpack, tmp)) == 0) + ret = __wt_update_alloc( + session, tmp, &append, ¬used); + __wt_scr_free(session, &tmp); + WT_RET(ret); + } + + /* + * Give the entry an impossibly low transaction ID to ensure its + * global visibility, append it to the update list. + * + * Note the change to the actual reader-accessible update list: + * from now on, the original on-page value appears at the end + * of the update list, even if this reconciliation subsequently + * fails. */ - WT_RET(__wt_update_alloc(session, &ovfl, &upd_ovfl, ¬used)); - upd_ovfl->txnid = WT_TXN_NONE; + append->txnid = WT_TXN_NONE; for (upd = upd_list; upd->next != NULL; upd = upd->next) ; - upd->next = upd_ovfl; + upd->next = append; } - return (__rec_skip_update_save(session, r, ins, rip)); + /* + * The order of the updates on the list matters, we can't move only the + * unresolved updates, move the entire update list. + * + * If we skipped updates, the transaction value is never used. If we + * didn't skip updates, the list of updates are eventually written to + * the lookaside table, and associated with each update record is the + * transaction ID of the update we wrote in the reconciled page; once + * that transaction ID is globally visible, we know we no longer need + * the lookaside table records, allowing them to be discarded. + */ + return (__rec_update_save( + session, r, ins, rip, skipped ? WT_TXN_NONE : (*updp)->txnid)); } /* @@ -1234,37 +1468,32 @@ static int __rec_child_deleted( WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, int *statep) { - WT_BM *bm; WT_PAGE_DELETED *page_del; size_t addr_size; const uint8_t *addr; - bm = S2BT(session)->bm; page_del = ref->page_del; /* * Internal pages with child leaf pages in the WT_REF_DELETED state are * a special case during reconciliation. 
First, if the deletion was a * result of a session truncate call, the deletion may not be visible to - * us. In that case, we proceed as with any change that's not visible - * during reconciliation by setting the skipped flag and ignoring the - * change for the purposes of writing the internal page. + * us. In that case, we proceed as with any change not visible during + * reconciliation by ignoring the change for the purposes of writing the + * internal page. * * In this case, there must be an associated page-deleted structure, and * it holds the transaction ID we care about. + * + * In some cases, there had better not be any updates we can't see. */ - if (page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) { - /* - * In some cases, there had better not be any updates we can't - * write. - */ - if (F_ISSET(r, WT_SKIP_UPDATE_ERR)) - WT_PANIC_RET(session, EINVAL, - "reconciliation illegally skipped an update"); - } + if (F_ISSET(r, WT_VISIBILITY_ERR) && + page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) + WT_PANIC_RET(session, EINVAL, + "reconciliation illegally skipped an update"); /* - * The deletion is visible to us, deal with any underlying disk blocks. + * Deal with any underlying disk blocks. * * First, check to see if there is an address associated with this leaf: * if there isn't, we're done, the underlying page is already gone. If @@ -1291,7 +1520,7 @@ __rec_child_deleted( (page_del == NULL || __wt_txn_visible_all(session, page_del->txnid))) { WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); - WT_RET(bm->free(bm, session, addr, addr_size)); + WT_RET(__rec_block_free(session, addr, addr_size)); if (__wt_off_page(ref->home, ref->addr)) { __wt_free(session, ((WT_ADDR *)ref->addr)->addr); @@ -1562,7 +1791,7 @@ static void __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) { bnd->offset = 0; - bnd->recno = 0; + bnd->recno = WT_RECNO_OOB; bnd->entries = 0; __wt_free(session, bnd->addr.addr); @@ -1571,9 +1800,9 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) bnd->cksum = 0; __wt_free(session, bnd->dsk); - __wt_free(session, bnd->skip); - bnd->skip_next = 0; - bnd->skip_allocated = 0; + __wt_free(session, bnd->supd); + bnd->supd_next = 0; + bnd->supd_allocated = 0; /* * Don't touch the key, we re-use that memory in each new @@ -1776,8 +2005,12 @@ __rec_split_init(WT_SESSION_IMPL *session, * Return if we're writing a checkpoint. */ static int -__rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd) +__rec_is_checkpoint(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_BOUNDARY *bnd) { + WT_BTREE *btree; + + btree = S2BT(session); + /* * Check to see if we're going to create a checkpoint. * @@ -1792,7 +2025,8 @@ __rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd) * we don't do checkpoint writes here; clear the boundary information as * a reminder and create the checkpoint during wrapup. */ - if (bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) { + if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT) && + bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) { bnd->addr.addr = NULL; bnd->addr.size = 0; bnd->addr.type = 0; @@ -1841,7 +2075,7 @@ __rec_split_row_promote( WT_DECL_ITEM(update); WT_DECL_RET; WT_ITEM *max; - WT_UPD_SKIPPED *skip; + WT_SAVE_UPD *supd; size_t cnt, len, size; uint32_t i; const uint8_t *pa, *pb; @@ -1892,36 +2126,37 @@ __rec_split_row_promote( * the last key and smaller than the current key. 
*/ max = r->last; - for (i = r->skip_next; i > 0; --i) { - skip = &r->skip[i - 1]; - if (skip->ins == NULL) - WT_ERR(__wt_row_leaf_key( - session, r->page, skip->rip, update, 0)); - else { - update->data = WT_INSERT_KEY(skip->ins); - update->size = WT_INSERT_KEY_SIZE(skip->ins); - } + if (r->evict_skipped_updates) + for (i = r->supd_next; i > 0; --i) { + supd = &r->supd[i - 1]; + if (supd->ins == NULL) + WT_ERR(__wt_row_leaf_key( + session, r->page, supd->rip, update, 0)); + else { + update->data = WT_INSERT_KEY(supd->ins); + update->size = WT_INSERT_KEY_SIZE(supd->ins); + } - /* Compare against the current key, it must be less. */ - WT_ERR(__wt_compare( - session, btree->collator, update, r->cur, &cmp)); - if (cmp >= 0) - continue; + /* Compare against the current key, it must be less. */ + WT_ERR(__wt_compare( + session, btree->collator, update, r->cur, &cmp)); + if (cmp >= 0) + continue; - /* Compare against the last key, it must be greater. */ - WT_ERR(__wt_compare( - session, btree->collator, update, r->last, &cmp)); - if (cmp >= 0) - max = update; + /* Compare against the last key, it must be greater. */ + WT_ERR(__wt_compare( + session, btree->collator, update, r->last, &cmp)); + if (cmp >= 0) + max = update; - /* - * The skipped updates are in key-sort order so the entry we're - * looking for is either the last one or the next-to-last one - * in the list. Once we've compared an entry against the last - * key on the page, we're done. - */ - break; - } + /* + * The saved updates are in key-sort order so the entry + * we're looking for is either the last or the next-to- + * last one in the list. Once we've compared an entry + * against the last key on the page, we're done. + */ + break; + } /* * The largest key on the last block must sort before the current key, @@ -2228,7 +2463,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, * We track the record number at each column-store split point, set an * initial value. */ - recno = 0; + recno = WT_RECNO_OOB; if (dsk->type == WT_PAGE_COL_VAR) recno = last->recno; @@ -2326,10 +2561,8 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RET(compressor->pre_size(compressor, wt_session, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, (size_t)r->raw_offsets[slots], &result_len)); - extra_skip = 0; - if (btree->kencryptor != NULL) - extra_skip = btree->kencryptor->size_const + - WT_ENCRYPT_LEN_SIZE; + extra_skip = btree->kencryptor == NULL ? 0 : + btree->kencryptor->size_const + WT_ENCRYPT_LEN_SIZE; corrected_page_size = result_len + WT_BLOCK_COMPRESS_SKIP; WT_RET(bm->write_size(bm, session, &corrected_page_size)); @@ -2477,7 +2710,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, break; case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - next->recno = 0; + next->recno = WT_RECNO_OOB; if (!last_block) { /* * Confirm there was uncompressed data remaining @@ -2530,7 +2763,8 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, * * If it's not a checkpoint, write the block. */ - if (r->bnd_next == 1 && last_block && __rec_is_checkpoint(r, last)) { + if (r->bnd_next == 1 && + last_block && __rec_is_checkpoint(session, r, last)) { if (write_ref == dst) WT_RET(__wt_buf_set( session, &r->dsk, dst->mem, dst->size)); @@ -2647,13 +2881,29 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * We only arrive here with no entries to write if the page was entirely - * empty, and if the page is empty, we merge it into its parent during - * the parent's reconciliation. A page with skipped updates isn't truly - * empty, continue on. 
+ * We may arrive here with no entries to write if the page was entirely + * empty or if nothing on the page was visible to us. */ - if (r->entries == 0 && r->skip_next == 0) - return (0); + if (r->entries == 0) { + /* + * Pages with skipped or not-yet-globally visible updates aren't + * really empty; otherwise, the page is truly empty and we will + * merge it into its parent during the parent's reconciliation. + */ + if (r->supd_next == 0) + return (0); + + /* + * If the page has skipped updates, continue on the write path, + * it will be saved/restored after we finish. + * + * If the page has not-yet-globally visible updates, we can't + * continue (we need a page to be written, otherwise we won't + * ever find the updates for future reads). For now, quit. + */ + if (!r->evict_skipped_updates) + return (EBUSY); + } /* Set the boundary reference and increment the count. */ bnd = &r->bnd[r->bnd_next++]; @@ -2666,9 +2916,8 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk); /* If this is a checkpoint, we're done, otherwise write the page. */ - return ( - __rec_is_checkpoint(r, bnd) ? 0 : - __rec_split_write(session, r, bnd, &r->dsk, 1)); + return (__rec_is_checkpoint(session, r, bnd) ? + 0 : __rec_split_write(session, r, bnd, &r->dsk, 1)); } /* @@ -2794,7 +3043,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_HEADER *dsk; WT_PAGE_MODIFY *mod; - WT_UPD_SKIPPED *skip; + WT_SAVE_UPD *supd; size_t addr_size; uint32_t bnd_slot, i, j; int cmp; @@ -2837,23 +3086,23 @@ __rec_split_write(WT_SESSION_IMPL *session, bnd->cksum = 0; /* - * Check if we've skipped updates that belong to this block, and move - * any to the per-block structure. Quit as soon as we find a skipped + * Check if we've saved updates that belong to this block, and move + * any to the per-block structure. Quit as soon as we find a saved * update that doesn't belong to the block, they're in sorted order. * * This code requires a key be filled in for the next block (or the * last block flag be set, if there's no next block). */ - for (i = 0, skip = r->skip; i < r->skip_next; ++i, ++skip) { - /* The last block gets all remaining skipped updates. */ + for (i = 0, supd = r->supd; i < r->supd_next; ++i, ++supd) { + /* The last block gets all remaining saved updates. */ if (last_block) { - WT_ERR(__rec_skip_update_move(session, bnd, skip)); + WT_ERR(__rec_update_move(session, bnd, supd)); continue; } /* - * Get the skipped update's key and compare it with this block's - * key range. If the skipped update list belongs with the block + * Get the saved update's key and compare it with this block's + * key range. If the saved update list belongs with the block * we're about to write, move it to the per-block memory. Check * only to the first update that doesn't go with the block, they * must be in sorted order. 
@@ -2861,43 +3110,54 @@ __rec_split_write(WT_SESSION_IMPL *session, switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: - if (WT_INSERT_RECNO(skip->ins) >= (bnd + 1)->recno) - goto skip_check_complete; + if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno) + goto supd_check_complete; break; case WT_PAGE_ROW_LEAF: - if (skip->ins == NULL) + if (supd->ins == NULL) WT_ERR(__wt_row_leaf_key( - session, page, skip->rip, key, 0)); + session, page, supd->rip, key, 0)); else { - key->data = WT_INSERT_KEY(skip->ins); - key->size = WT_INSERT_KEY_SIZE(skip->ins); + key->data = WT_INSERT_KEY(supd->ins); + key->size = WT_INSERT_KEY_SIZE(supd->ins); } WT_ERR(__wt_compare(session, btree->collator, key, &(bnd + 1)->key, &cmp)); if (cmp >= 0) - goto skip_check_complete; + goto supd_check_complete; break; WT_ILLEGAL_VALUE_ERR(session); } - WT_ERR(__rec_skip_update_move(session, bnd, skip)); + WT_ERR(__rec_update_move(session, bnd, supd)); } -skip_check_complete: +supd_check_complete: /* * If there are updates that weren't moved to the block, shuffle them to - * the beginning of the cached list (we maintain the skipped updates in - * sorted order, new skipped updates must be appended to the list). + * the beginning of the cached list (we maintain the saved updates in + * sorted order, new saved updates must be appended to the list). */ - for (j = 0; i < r->skip_next; ++j, ++i) - r->skip[j] = r->skip[i]; - r->skip_next = j; + for (j = 0; i < r->supd_next; ++j, ++i) + r->supd[j] = r->supd[i]; + r->supd_next = j; + + /* + * If we found updates that weren't globally visible when reconciling + * this page, note that in the page header. + */ + if (!r->evict_skipped_updates && bnd->supd != NULL) { + F_SET(dsk, WT_PAGE_LAS_UPDATE); + r->cache_write_lookaside = 1; + } /* * If we had to skip updates in order to build this disk image, we can't * actually write it. Instead, we will re-instantiate the page using the * disk image and the list of updates we skipped. */ - if (bnd->skip != NULL) { + if (r->evict_skipped_updates && bnd->supd != NULL) { + r->cache_write_restore = 1; + /* * If the buffer is compressed (raw compression was configured), * we have to decompress it so we can instantiate it later. It's @@ -2963,11 +3223,146 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr)); bnd->addr.size = (uint8_t)addr_size; + /* + * If we found updates that weren't globally visible when reconciling + * this page, copy those updates into the database's lookaside store. + */ + if (!r->evict_skipped_updates && bnd->supd != NULL) + ret = __rec_update_las(session, r, btree->id, bnd); + done: err: __wt_scr_free(session, &key); return (ret); } +/* + * __rec_update_las -- + * Copy a set of updates into the database's lookaside buffer. + */ +static int +__rec_update_las(WT_SESSION_IMPL *session, + WT_RECONCILE *r, uint32_t btree_id, WT_BOUNDARY *bnd) +{ + WT_CURSOR *cursor; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_ITEM las_addr, las_value; + WT_PAGE *page; + WT_SAVE_UPD *list; + WT_UPDATE *upd; + uint64_t las_counter; + uint32_t i, session_flags, slot; + uint8_t *p; + + cursor = NULL; + WT_CLEAR(las_addr); + WT_CLEAR(las_value); + page = r->page; + + /* + * We're writing lookaside records: start instantiating them on pages + * we read (with the right flag set), and start sweeping the file. + */ + __wt_las_set_written(session); + + WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); + + /* Ensure enough room for a column-store key without checking. 
*/ + WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); + + /* Enter each update in the boundary list into the lookaside store. */ + for (i = 0, list = bnd->supd; i < bnd->supd_next; ++i, ++list) { + /* + * Each key in the lookaside table is associated with a block, + * and those blocks are freed and reallocated to other pages + * as pages in the tree are modified and reconciled. We want + * to be sure we don't add records to the lookaside table, then + * discard the block to which they apply, then write a new + * block to the same address, and then apply the old records + * to the new block when it's read. We don't want to clean old + * records out of the lookaside table every time we free a block + * because that happens a lot and would be costly; instead, we + * clean out the old records when adding new records into the + * lookaside table. This works because we only read from the + * lookaside table for pages marked with the WT_PAGE_LAS_UPDATE + * flag. If we rewrite a block that has no lookaside records, + * the block won't have that flag set and so the lookaside table + * won't be checked when the block is read. If we rewrite a + * block that has lookaside records, we'll run this code which + * cleans out any old records. + */ + WT_ERR(__wt_las_remove_block( + session, cursor, btree_id, bnd->addr.addr, bnd->addr.size)); + + /* Lookaside table key component: block address. */ + las_addr.data = bnd->addr.addr; + las_addr.size = bnd->addr.size; + + /* Lookaside table key component: source key. */ + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + p = key->mem; + WT_ERR( + __wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins))); + key->size = WT_PTRDIFF(p, key->data); + break; + case WT_PAGE_ROW_LEAF: + if (list->ins == NULL) + WT_ERR(__wt_row_leaf_key( + session, page, list->rip, key, 0)); + else { + key->data = WT_INSERT_KEY(list->ins); + key->size = WT_INSERT_KEY_SIZE(list->ins); + } + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Lookaside table value component: update reference. */ + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + upd = list->ins->upd; + break; + case WT_PAGE_ROW_LEAF: + if (list->ins == NULL) { + slot = WT_ROW_SLOT(page, list->rip); + upd = page->pg_row_upd[slot]; + } else + upd = list->ins->upd; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* + * Walk the list of updates, storing each key/value pair into + * the lookaside table. + */ + for (las_counter = 1; + upd != NULL; ++las_counter, upd = upd->next) { + cursor->set_key(cursor, btree_id, + &las_addr, list->onpage_txn, las_counter, key); + + if (WT_UPDATE_DELETED_ISSET(upd)) + las_value.size = 0; + else { + las_value.data = WT_UPDATE_DATA(upd); + las_value.size = upd->size; + } + cursor->set_value( + cursor, upd->txnid, upd->size, &las_value); + + WT_ERR(cursor->insert(cursor)); + } + } + +err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + + __wt_scr_free(session, &key); + return (ret); +} + /* * __wt_bulk_init -- * Bulk insert initialization. @@ -3008,7 +3403,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) recno = 1; break; case BTREE_ROW: - recno = 0; + recno = WT_RECNO_OOB; break; WT_ILLEGAL_VALUE(session); } @@ -3049,6 +3444,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET(__rec_split_finish(session, r)); WT_RET(__rec_write_wrapup(session, r, r->page)); + WT_RET(__rec_write_status(session, r, r->page)); /* Mark the page's parent and the tree dirty. 
*/ parent = r->ref->home; @@ -3824,7 +4220,7 @@ __rec_col_var(WT_SESSION_IMPL *session, * Write a placeholder. */ WT_ASSERT(session, - F_ISSET(r, WT_SKIP_UPDATE_RESTORE)); + F_ISSET(r, WT_EVICT_UPDATE_RESTORE)); data = "@"; size = 1; @@ -4207,7 +4603,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) vtype = state == WT_CHILD_PROXY ? WT_CELL_ADDR_DEL : (u_int)vpack->raw; } - __rec_cell_build_addr(r, p, size, vtype, 0); + __rec_cell_build_addr(r, p, size, vtype, WT_RECNO_OOB); CHILD_RELEASE_ERR(session, hazard, ref); /* @@ -4294,7 +4690,7 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr = &multi->addr; __rec_cell_build_addr( - r, addr->addr, addr->size, __rec_vtype(addr), 0); + r, addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB); /* Boundary: split or write the page. */ if (key->len + val->len > r->space_avail) @@ -4450,7 +4846,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, * Assert the case. */ WT_ASSERT(session, - F_ISSET(r, WT_SKIP_UPDATE_RESTORE)); + F_ISSET(r, WT_EVICT_UPDATE_RESTORE)); /* * If the key is also a removed overflow item, @@ -4777,13 +5173,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) static int __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) { - WT_BM *bm; WT_DECL_RET; WT_PAGE_MODIFY *mod; WT_MULTI *multi; uint32_t i; - bm = S2BT(session)->bm; mod = page->modify; /* @@ -4799,17 +5193,17 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_free(session, multi->key.ikey); break; } - if (multi->skip == NULL) { + if (multi->supd == NULL) { if (multi->addr.reuse) multi->addr.addr = NULL; else { - WT_RET(bm->free(bm, session, + WT_RET(__rec_block_free(session, multi->addr.addr, multi->addr.size)); __wt_free(session, multi->addr.addr); } } else { - __wt_free(session, multi->skip); - __wt_free(session, multi->skip_dsk); + __wt_free(session, multi->supd); + __wt_free(session, multi->supd_dsk); } } __wt_free(session, mod->mod_multi); @@ -4882,7 +5276,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) */ WT_RET(__wt_ref_info( session, ref, &addr, &addr_size, NULL)); - WT_RET(bm->free(bm, session, addr, addr_size)); + WT_RET(__rec_block_free(session, addr, addr_size)); if (__wt_off_page(ref->home, ref->addr)) { __wt_free( session, ((WT_ADDR *)ref->addr)->addr); @@ -4908,7 +5302,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * are checkpoints, and must be explicitly dropped. */ if (!__wt_ref_is_root(ref)) - WT_RET(bm->free(bm, session, + WT_RET(__rec_block_free(session, mod->mod_replace.addr, mod->mod_replace.size)); /* Discard the replacement page's address. */ @@ -4962,14 +5356,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * nothing to write. Allocate, then initialize the array of * replacement blocks. */ - if (bnd->skip != NULL) { + if (r->evict_skipped_updates) { WT_RET(__wt_calloc_def( session, r->bnd_next, &mod->mod_multi)); multi = mod->mod_multi; - multi->skip = bnd->skip; - multi->skip_entries = bnd->skip_next; - bnd->skip = NULL; - multi->skip_dsk = bnd->dsk; + multi->supd = bnd->supd; + multi->supd_entries = bnd->supd_next; + bnd->supd = NULL; + multi->supd_dsk = bnd->dsk; bnd->dsk = NULL; mod->mod_multi_entries = 1; @@ -5068,50 +5462,6 @@ err: __wt_scr_free(session, &tkey); F_SET(mod, WT_PM_REC_MULTIBLOCK); break; } - - /* - * If updates were skipped, the tree isn't clean. 
The checkpoint call - * cleared the tree's modified value before calling the eviction thread, - * so we must explicitly reset the tree's modified flag. We insert a - * barrier after the change for clarity (the requirement is the value - * be set before a subsequent checkpoint reads it, and because the - * current checkpoint is waiting on this reconciliation to complete, - * there's no risk of that happening). - */ - if (r->leave_dirty) { - mod->first_dirty_txn = r->first_dirty_txn; - - btree->modified = 1; - WT_FULL_BARRIER(); - } else { - /* - * If no updates were skipped, we have a new maximum transaction - * written for the page (used to decide if a clean page can be - * evicted). Set the highest transaction ID for the page. - * - * Track the highest transaction ID for the tree (used to decide - * if it's safe to discard all of the pages in the tree without - * further checking). Reconciliation in the service of eviction - * is multi-threaded, only update the tree's maximum transaction - * ID when doing a checkpoint. That's sufficient, we only care - * about the highest transaction ID of any update currently in - * the tree, and checkpoint visits every dirty page in the tree. - */ - mod->rec_max_txn = r->max_txn; - if (!F_ISSET(r, WT_EVICTING) && - WT_TXNID_LT(btree->rec_max_txn, r->max_txn)) - btree->rec_max_txn = r->max_txn; - - /* - * The page only might be clean; if the write generation is - * unchanged since reconciliation started, it's clean. If the - * write generation changed, the page has been written since - * we started reconciliation and remains dirty. - */ - if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0)) - __wt_cache_dirty_decr(session, page); - } - return (0); } @@ -5122,14 +5472,12 @@ err: __wt_scr_free(session, &tkey); static int __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) { - WT_BM *bm; WT_BOUNDARY *bnd; WT_DECL_RET; WT_MULTI *multi; WT_PAGE_MODIFY *mod; uint32_t i; - bm = S2BT(session)->bm; mod = page->modify; /* @@ -5160,7 +5508,7 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) if (bnd->addr.reuse) bnd->addr.addr = NULL; else { - WT_TRET(bm->free(bm, session, + WT_TRET(__rec_block_free(session, bnd->addr.addr, bnd->addr.size)); __wt_free(session, bnd->addr.addr); } @@ -5203,18 +5551,18 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_RET(__wt_row_ikey_alloc(session, 0, bnd->key.data, bnd->key.size, &multi->key.ikey)); - if (bnd->skip == NULL) { + if (r->evict_skipped_updates && bnd->supd != NULL) { + multi->supd = bnd->supd; + multi->supd_entries = bnd->supd_next; + bnd->supd = NULL; + multi->supd_dsk = bnd->dsk; + bnd->dsk = NULL; + } else { multi->addr = bnd->addr; multi->addr.reuse = 0; multi->size = bnd->size; multi->cksum = bnd->cksum; bnd->addr.addr = NULL; - } else { - multi->skip = bnd->skip; - multi->skip_entries = bnd->skip_next; - bnd->skip = NULL; - multi->skip_dsk = bnd->dsk; - bnd->dsk = NULL; } } mod->mod_multi_entries = r->bnd_next; @@ -5243,18 +5591,18 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { multi->key.recno = bnd->recno; - if (bnd->skip == NULL) { + if (r->evict_skipped_updates && bnd->supd != NULL) { + multi->supd = bnd->supd; + multi->supd_entries = bnd->supd_next; + bnd->supd = NULL; + multi->supd_dsk = bnd->dsk; + bnd->dsk = NULL; + } else { multi->addr = bnd->addr; multi->addr.reuse = 0; multi->size = bnd->size; multi->cksum = 
bnd->cksum; bnd->addr.addr = NULL; - } else { - multi->skip = bnd->skip; - multi->skip_entries = bnd->skip_next; - bnd->skip = NULL; - multi->skip_dsk = bnd->dsk; - bnd->dsk = NULL; } } mod->mod_multi_entries = r->bnd_next; diff --git a/src/session/session_api.c b/src/session/session_api.c index 76d91bed3c2..a1f5618a317 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -382,6 +382,22 @@ err: if (cursor != NULL) API_END_RET_NOTFOUND_MAP(session, ret); } +/* + * __wt_session_create -- + * Internal version of WT_SESSION::create. + */ +int +__wt_session_create( + WT_SESSION_IMPL *session, const char *uri, const char *config) +{ + WT_DECL_RET; + + WT_WITH_SCHEMA_LOCK(session, + WT_WITH_TABLE_LOCK(session, + ret = __wt_schema_create(session, uri, config))); + return (ret); +} + /* * __session_create -- * WT_SESSION->create method. @@ -423,9 +439,7 @@ __session_create(WT_SESSION *wt_session, const char *uri, const char *config) WT_ERR_NOTFOUND_OK(ret); } - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, - ret = __wt_schema_create(session, uri, config))); + ret = __wt_session_create(session, uri, config); err: API_END_RET_NOTFOUND_MAP(session, ret); } @@ -528,6 +542,21 @@ __session_compact(WT_SESSION *wt_session, const char *uri, const char *config) return (__wt_session_compact(wt_session, uri, config)); } +/* + * __wt_session_drop -- + * Internal version of WT_SESSION::drop. + */ +int +__wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) +{ + WT_DECL_RET; + + WT_WITH_SCHEMA_LOCK(session, + WT_WITH_TABLE_LOCK(session, + ret = __wt_schema_drop(session, uri, cfg))); + return (ret); +} + /* * __session_drop -- * WT_SESSION->drop method. @@ -544,9 +573,7 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config) /* Disallow objects in the WiredTiger name space. */ WT_ERR(__wt_str_name_check(session, uri)); - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, - ret = __wt_schema_drop(session, uri, cfg))); + ret = __wt_session_drop(session, uri, cfg); err: /* Note: drop operations cannot be unrolled (yet?). */ API_END_RET_NOTFOUND_MAP(session, ret); @@ -1001,7 +1028,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config) * operations, but checkpoint does enough I/O it may be called upon to * perform slow operations for the block manager. 
*/ - F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK); + F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION); /* * Only one checkpoint can be active at a time, and checkpoints must run @@ -1016,7 +1043,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config) WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0); -err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK); +err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION); API_END_RET_NOTFOUND_MAP(session, ret); } diff --git a/src/support/stat.c b/src/support/stat.c index 04c97d4c2d8..e9d199d57e6 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -50,8 +50,11 @@ static const char * const __stats_dsrc_desc[] = { "cache: in-memory page splits", "cache: overflow values cached in memory", "cache: pages read into cache", + "cache: pages read into cache requiring lookaside entries", "cache: overflow pages read into cache", "cache: pages written from cache", + "cache: page written requiring lookaside records", + "cache: pages written requiring in-memory restoration", "compression: raw compression call failed, no additional data available", "compression: raw compression call failed, additional data available", "compression: raw compression call succeeded", @@ -160,9 +163,12 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cache_read_overflow = 0; stats->cache_overflow_value = 0; stats->cache_eviction_deepen = 0; + stats->cache_write_lookaside = 0; stats->cache_read = 0; + stats->cache_read_lookaside = 0; stats->cache_eviction_split = 0; stats->cache_write = 0; + stats->cache_write_restore = 0; stats->cache_eviction_clean = 0; stats->compress_read = 0; stats->compress_write = 0; @@ -272,9 +278,12 @@ __wt_stat_dsrc_aggregate_single( to->cache_read_overflow += from->cache_read_overflow; to->cache_overflow_value += from->cache_overflow_value; to->cache_eviction_deepen += from->cache_eviction_deepen; + to->cache_write_lookaside += from->cache_write_lookaside; to->cache_read += from->cache_read; + to->cache_read_lookaside += from->cache_read_lookaside; to->cache_eviction_split += from->cache_eviction_split; to->cache_write += from->cache_write; + to->cache_write_restore += from->cache_write_restore; to->cache_eviction_clean += from->cache_eviction_clean; to->compress_read += from->compress_read; to->compress_write += from->compress_write; @@ -394,9 +403,13 @@ __wt_stat_dsrc_aggregate( to->cache_overflow_value += WT_STAT_READ(from, cache_overflow_value); to->cache_eviction_deepen += WT_STAT_READ(from, cache_eviction_deepen); + to->cache_write_lookaside += + WT_STAT_READ(from, cache_write_lookaside); to->cache_read += WT_STAT_READ(from, cache_read); + to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split); to->cache_write += WT_STAT_READ(from, cache_write); + to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean); to->compress_read += WT_STAT_READ(from, compress_read); to->compress_write += WT_STAT_READ(from, compress_write); @@ -511,11 +524,16 @@ static const char * const __stats_connection_desc[] = { "cache: pages walked for eviction", "cache: eviction worker thread evicting pages", "cache: in-memory page splits", + "cache: lookaside table insert calls", + "cache: lookaside table remove calls", "cache: percentage overhead", "cache: tracked dirty pages in the cache", "cache: pages currently held in the 
cache", "cache: pages read into cache", + "cache: pages read into cache requiring lookaside entries", "cache: pages written from cache", + "cache: page written requiring lookaside records", + "cache: pages written requiring in-memory restoration", "connection: pthread mutex condition wait calls", "cursor: cursor create calls", "cursor: cursor insert calls", @@ -669,19 +687,24 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_eviction_hazard = 0; stats->cache_inmem_split = 0; stats->cache_eviction_internal = 0; + stats->cache_lookaside_insert = 0; + stats->cache_lookaside_remove = 0; /* not clearing cache_bytes_max */ /* not clearing cache_eviction_maximum_page_size */ stats->cache_eviction_dirty = 0; stats->cache_eviction_deepen = 0; + stats->cache_write_lookaside = 0; /* not clearing cache_pages_inuse */ stats->cache_eviction_force = 0; stats->cache_eviction_force_delete = 0; stats->cache_eviction_app = 0; stats->cache_read = 0; + stats->cache_read_lookaside = 0; stats->cache_eviction_fail = 0; stats->cache_eviction_split = 0; stats->cache_eviction_walk = 0; stats->cache_write = 0; + stats->cache_write_restore = 0; /* not clearing cache_overhead */ /* not clearing cache_bytes_internal */ /* not clearing cache_bytes_leaf */ @@ -838,22 +861,30 @@ __wt_stat_connection_aggregate( to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split); to->cache_eviction_internal += WT_STAT_READ(from, cache_eviction_internal); + to->cache_lookaside_insert += + WT_STAT_READ(from, cache_lookaside_insert); + to->cache_lookaside_remove += + WT_STAT_READ(from, cache_lookaside_remove); to->cache_bytes_max += WT_STAT_READ(from, cache_bytes_max); to->cache_eviction_maximum_page_size += WT_STAT_READ(from, cache_eviction_maximum_page_size); to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty); to->cache_eviction_deepen += WT_STAT_READ(from, cache_eviction_deepen); + to->cache_write_lookaside += + WT_STAT_READ(from, cache_write_lookaside); to->cache_pages_inuse += WT_STAT_READ(from, cache_pages_inuse); to->cache_eviction_force += WT_STAT_READ(from, cache_eviction_force); to->cache_eviction_force_delete += WT_STAT_READ(from, cache_eviction_force_delete); to->cache_eviction_app += WT_STAT_READ(from, cache_eviction_app); to->cache_read += WT_STAT_READ(from, cache_read); + to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail); to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split); to->cache_eviction_walk += WT_STAT_READ(from, cache_eviction_walk); to->cache_write += WT_STAT_READ(from, cache_write); + to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); to->cache_overhead += WT_STAT_READ(from, cache_overhead); to->cache_bytes_internal += WT_STAT_READ(from, cache_bytes_internal); to->cache_bytes_leaf += WT_STAT_READ(from, cache_bytes_leaf); diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index e671ce28ffb..9f59c53314e 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -246,6 +246,10 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, session->dhandle->checkpoint == NULL); WT_ASSERT(session, WT_PREFIX_MATCH(session->dhandle->name, "file:")); + /* Skip files that are never involved in a checkpoint. */ + if (F_ISSET(S2BT(session), WT_BTREE_NO_CHECKPOINT)) + return (0); + /* Make sure there is space for the next entry. 
*/ WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated, session->ckpt_handle_next + 1, &session->ckpt_handle)); @@ -1164,7 +1168,15 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final) btree = S2BT(session); bulk = F_ISSET(btree, WT_BTREE_BULK) ? 1 : 0; - /* If the handle is already dead, discard it. */ + /* + * If the handle is already dead or the file isn't durable, force the + * discard. + * + * If the file isn't durable, mark the handle dead, there are asserts + * later on that only dead handles can have modified pages. + */ + if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) + F_SET(session->dhandle, WT_DHANDLE_DEAD); if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD)); diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c index 3b3a69b7b95..a63720d736f 100644 --- a/src/txn/txn_log.c +++ b/src/txn/txn_log.c @@ -33,18 +33,7 @@ __txn_op_log(WT_SESSION_IMPL *session, * 3) row store remove; or * 4) row store insert/update. */ - if (cbt->btree->type != BTREE_ROW) { - WT_ASSERT(session, cbt->ins != NULL); - recno = WT_INSERT_RECNO(cbt->ins); - WT_ASSERT(session, recno != 0); - - if (WT_UPDATE_DELETED_ISSET(upd)) - WT_ERR(__wt_logop_col_remove_pack(session, logrec, - op->fileid, recno)); - else - WT_ERR(__wt_logop_col_put_pack(session, logrec, - op->fileid, recno, &value)); - } else { + if (cbt->btree->type == BTREE_ROW) { WT_ERR(__wt_cursor_row_leaf_key(cbt, &key)); if (WT_UPDATE_DELETED_ISSET(upd)) @@ -53,6 +42,16 @@ __txn_op_log(WT_SESSION_IMPL *session, else WT_ERR(__wt_logop_row_put_pack(session, logrec, op->fileid, &key, &value)); + } else { + recno = WT_INSERT_RECNO(cbt->ins); + WT_ASSERT(session, recno != WT_RECNO_OOB); + + if (WT_UPDATE_DELETED_ISSET(upd)) + WT_ERR(__wt_logop_col_remove_pack(session, logrec, + op->fileid, recno)); + else + WT_ERR(__wt_logop_col_put_pack(session, logrec, + op->fileid, recno, &value)); } err: __wt_buf_free(session, &key); @@ -419,9 +418,9 @@ __wt_txn_truncate_log( } else { op->type = WT_TXN_OP_TRUNCATE_COL; op->u.truncate_col.start = - (start == NULL) ? 0 : start->recno; + (start == NULL) ? WT_RECNO_OOB : start->recno; op->u.truncate_col.stop = - (stop == NULL) ? 0 : stop->recno; + (stop == NULL) ? WT_RECNO_OOB : stop->recno; } /* Write that operation into the in-memory log. */ diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c index 7936dc0e9ef..240d0a5ffd3 100644 --- a/src/txn/txn_recover.c +++ b/src/txn/txn_recover.c @@ -144,10 +144,10 @@ __txn_op_apply( GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); /* Set up the cursors. */ - if (start_recno == 0) { + if (start_recno == WT_RECNO_OOB) { start = NULL; stop = cursor; - } else if (stop_recno == 0) { + } else if (stop_recno == WT_RECNO_OOB) { start = cursor; stop = NULL; } else { diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c index 1888c7d967b..1d35f2efc72 100644 --- a/src/utilities/util_list.c +++ b/src/utilities/util_list.c @@ -97,12 +97,15 @@ list_print(WT_SESSION *session, const char *name, int cflag, int vflag) } /* - * XXX - * We don't normally say anything about the WiredTiger - * metadata, it's not a normal "object" in the database. I'm - * making an exception for the checkpoint and verbose options. + * !!! + * We don't normally say anything about the WiredTiger metadata + * and lookaside tables, they're not application/user "objects" + * in the database. I'm making an exception for the checkpoint + * and verbose options. 
*/ - if (strcmp(key, WT_METADATA_URI) != 0 || cflag || vflag) + if (cflag || vflag || + (strcmp(key, WT_METADATA_URI) != 0 && + strcmp(key, WT_LAS_URI) != 0)) printf("%s\n", key); if (!cflag && !vflag) diff --git a/test/format/backup.c b/test/format/backup.c index 3b95ea92b5e..5805012e1e0 100644 --- a/test/format/backup.c +++ b/test/format/backup.c @@ -65,8 +65,7 @@ copy_file(const char *name) int ret; len = strlen(g.home) + strlen(g.home_backup) + strlen(name) * 2 + 20; - if ((cmd = malloc(len)) == NULL) - die(errno, "malloc"); + cmd = dmalloc(len); (void)snprintf(cmd, len, "cp %s/%s %s/%s", g.home, name, g.home_backup, name); if ((ret = system(cmd)) != 0) diff --git a/test/format/bulk.c b/test/format/bulk.c index 7cf4ba559dc..203043166a4 100644 --- a/test/format/bulk.c +++ b/test/format/bulk.c @@ -39,6 +39,7 @@ wts_load(void) int is_bulk, ret; conn = g.wts_conn; + keybuf = valbuf = NULL; if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) die(ret, "connection.open_session"); diff --git a/test/format/config.c b/test/format/config.c index 6e767a2c6a2..1f19ecf2cd2 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -36,6 +36,7 @@ static const char *config_file_type(u_int); static CONFIG *config_find(const char *, size_t); static int config_is_perm(const char *); static void config_isolation(void); +static void config_lrt(void); static void config_map_checksum(const char *, u_int *); static void config_map_compression(const char *, u_int *); static void config_map_encryption(const char *, u_int *); @@ -102,8 +103,7 @@ config_setup(void) * our configuration, LSM or KVS devices are "tables", but files are * tested as well. */ - if ((g.uri = malloc(256)) == NULL) - die(errno, "malloc"); + g.uri = dmalloc(256); strcpy(g.uri, DATASOURCE("file") ? "file:" : "table:"); if (DATASOURCE("helium")) strcat(g.uri, "dev1/"); @@ -135,12 +135,6 @@ config_setup(void) if (DATASOURCE("helium") || DATASOURCE("kvsbdb")) g.c_reverse = 0; - config_checksum(); - config_compression("compression"); - config_compression("logging_compression"); - config_encryption(); - config_isolation(); - /* * Periodically, run single-threaded so we can compare the results to * a Berkeley DB copy, as long as the thread-count isn't nailed down. @@ -149,6 +143,13 @@ config_setup(void) if (!g.replay && g.run_cnt % 20 == 19 && !config_is_perm("threads")) g.c_threads = 1; + config_checksum(); + config_compression("compression"); + config_compression("logging_compression"); + config_encryption(); + config_isolation(); + config_lrt(); + /* * Periodically, set the delete percentage to 0 so salvage gets run, * as long as the delete percentage isn't nailed down. @@ -328,6 +329,26 @@ config_isolation(void) } } +/* + * config_lrt -- + * Long-running transaction configuration. + */ +static void +config_lrt(void) +{ + /* + * The underlying engine doesn't support a lookaside file for + * fixed-length column stores. + */ + if (g.type == FIX) { + if (config_is_perm("long_running_txn")) + die(EINVAL, + "long_running_txn not supported with fixed-length " + "column store"); + g.c_long_running_txn = 0; + } +} + /* * config_error -- * Display configuration information on error. 
diff --git a/test/format/format.h b/test/format/format.h index 4ec2734aee9..d82dea5451f 100644 --- a/test/format/format.h +++ b/test/format/format.h @@ -310,6 +310,8 @@ void config_file(const char *); void config_print(int); void config_setup(void); void config_single(const char *, int); +void *dmalloc(size_t); +char *dstrdup(const char *); void fclose_and_clear(FILE **); void key_gen(uint8_t *, size_t *, uint64_t); void key_gen_insert(WT_RAND_STATE *, uint8_t *, size_t *, uint64_t); @@ -317,6 +319,7 @@ void key_gen_setup(uint8_t **); void key_len_setup(void); void *lrt(void *); void path_setup(const char *); +int read_row(WT_CURSOR *, WT_ITEM *, uint64_t, int); uint32_t rng(WT_RAND_STATE *); void track(const char *, uint64_t, TINFO *); void val_gen(WT_RAND_STATE *, uint8_t *, size_t *, uint64_t); diff --git a/test/format/lrt.c b/test/format/lrt.c index a00a4e07879..85b6e29f224 100644 --- a/test/format/lrt.c +++ b/test/format/lrt.c @@ -37,33 +37,120 @@ lrt(void *arg) { WT_CONNECTION *conn; WT_CURSOR *cursor; + WT_ITEM key, value; WT_SESSION *session; + size_t buf_len, buf_size; + uint64_t keyno, saved_keyno; u_int period; int pinned, ret; + uint8_t bitfield, *keybuf; + void *buf; - (void)(arg); + (void)(arg); /* Unused parameter */ + + saved_keyno = 0; /* [-Werror=maybe-uninitialized] */ + + key_gen_setup(&keybuf); + memset(&key, 0, sizeof(key)); + key.data = keybuf; + memset(&value, 0, sizeof(value)); + + buf = NULL; + buf_len = buf_size = 0; /* Open a session and cursor. */ conn = g.wts_conn; - if ((ret = conn->open_session( - conn, NULL, "isolation=snapshot", &session)) != 0) + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) die(ret, "connection.open_session"); if ((ret = session->open_cursor( session, g.uri, NULL, NULL, &cursor)) != 0) die(ret, "session.open_cursor"); for (pinned = 0;;) { - /* - * If we have an open cursor, reset it, releasing our pin, else - * position the cursor, creating a snapshot. - */ if (pinned) { + /* Re-read the record at the end of the table. */ + while ((ret = read_row(cursor, + &key, saved_keyno, 1)) == WT_ROLLBACK) + ; + if (ret != 0) + die(ret, "read_row %" PRIu64, saved_keyno); + + /* Compare the previous value with the current one. */ + if (g.type == FIX) { + ret = cursor->get_value(cursor, &bitfield); + value.data = &bitfield; + value.size = 1; + } else + ret = cursor->get_value(cursor, &value); + if (ret != 0) + die(ret, + "cursor.get_value: %" PRIu64, saved_keyno); + + if (buf_size != value.size || + memcmp(buf, value.data, value.size) != 0) + die(0, "mismatched start/stop values"); + + /* End the transaction. */ + if ((ret = + session->commit_transaction(session, NULL)) != 0) + die(ret, "session.commit_transaction"); + + /* Reset the cursor, releasing our pin. */ if ((ret = cursor->reset(cursor)) != 0) die(ret, "cursor.reset"); pinned = 0; } else { - if ((ret = cursor->next(cursor)) != 0) - die(ret, "cursor.reset"); + /* + * Begin transaction: without an explicit transaction, + * the snapshot is only kept around while a cursor is + * positioned. As soon as the cursor loses its position + * a new snapshot will be allocated. + */ + if ((ret = session->begin_transaction( + session, "isolation=snapshot")) != 0) + die(ret, "session.begin_transaction"); + + /* Read a record at the end of the table. 
*/ + do { + saved_keyno = mmrand(NULL, + (u_int)(g.key_cnt - g.key_cnt / 10), + (u_int)g.key_cnt); + while ((ret = read_row(cursor, + &key, saved_keyno, 1)) == WT_ROLLBACK) + ; + } while (ret == WT_NOTFOUND); + if (ret != 0) + die(ret, "read_row %" PRIu64, saved_keyno); + + /* Copy the cursor's value. */ + if (g.type == FIX) { + ret = cursor->get_value(cursor, &bitfield); + value.data = &bitfield; + value.size = 1; + } else + ret = cursor->get_value(cursor, &value); + if (ret != 0) + die(ret, + "cursor.get_value: %" PRIu64, saved_keyno); + if (buf_len < value.size && + (buf = realloc(buf, buf_len = value.size)) == NULL) + die(errno, "malloc"); + memcpy(buf, value.data, buf_size = value.size); + + /* + * Move the cursor to an early record in the table, + * hopefully allowing the page with the record just + * retrieved to be evicted from memory. + */ + do { + keyno = mmrand(NULL, 1, (u_int)g.key_cnt / 5); + while ((ret = read_row(cursor, + &key, keyno, 1)) == WT_ROLLBACK) + ; + } while (ret == WT_NOTFOUND); + if (ret != 0) + die(ret, "read_row %" PRIu64, keyno); + pinned = 1; } @@ -82,5 +169,8 @@ lrt(void *arg) if ((ret = session->close(session, NULL)) != 0) die(ret, "session.close"); + free(keybuf); + free(buf); + return (NULL); } diff --git a/test/format/ops.c b/test/format/ops.c index d07a6d90234..7c38aec4757 100644 --- a/test/format/ops.c +++ b/test/format/ops.c @@ -33,7 +33,6 @@ static int col_remove(WT_CURSOR *, WT_ITEM *, uint64_t, int *); static int col_update(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t); static int nextprev(WT_CURSOR *, int, int *); static void *ops(void *); -static int read_row(WT_CURSOR *, WT_ITEM *, uint64_t); static int row_insert(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t); static int row_remove(WT_CURSOR *, WT_ITEM *, uint64_t, int *); static int row_update(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t); @@ -240,13 +239,13 @@ ops(void *arg) tinfo = arg; - /* Initialize the per-thread random number generator. */ - __wt_random_init(&tinfo->rnd); - conn = g.wts_conn; keybuf = valbuf = NULL; readonly = 0; /* -Wconditional-uninitialized */ + /* Initialize the per-thread random number generator. */ + __wt_random_init(&tinfo->rnd); + /* Set up the default key and value buffers. */ key_gen_setup(&keybuf); val_gen_setup(&tinfo->rnd, &valbuf); @@ -476,7 +475,7 @@ skip_insert: if (col_update(tinfo, } } else { ++tinfo->search; - if (read_row(cursor, &key, keyno)) + if (read_row(cursor, &key, keyno, 0)) if (intxn) goto deadlock; continue; @@ -499,7 +498,7 @@ skip_insert: if (col_update(tinfo, /* Read to confirm the operation. */ ++tinfo->search; - if (read_row(cursor, &key, keyno)) + if (read_row(cursor, &key, keyno, 0)) goto deadlock; /* Reset the cursor: there is no reason to keep pages pinned. */ @@ -584,7 +583,7 @@ wts_read_scan(void) } key.data = keybuf; - if ((ret = read_row(cursor, &key, cnt)) != 0) + if ((ret = read_row(cursor, &key, cnt, 0)) != 0) die(ret, "read_scan"); } @@ -598,8 +597,8 @@ wts_read_scan(void) * read_row -- * Read and verify a single element in a row- or column-store file. 
*/ -static int -read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno) +int +read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int notfound_err) { static int sn = 0; WT_ITEM value; @@ -635,19 +634,24 @@ read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno) ret = cursor->search(cursor); sn = 1; } - if (ret == 0) { + switch (ret) { + case 0: if (g.type == FIX) { ret = cursor->get_value(cursor, &bitfield); value.data = &bitfield; value.size = 1; - } else { + } else ret = cursor->get_value(cursor, &value); - } - } - if (ret == WT_ROLLBACK) + break; + case WT_ROLLBACK: return (WT_ROLLBACK); - if (ret != 0 && ret != WT_NOTFOUND) + case WT_NOTFOUND: + if (notfound_err) + return (WT_NOTFOUND); + break; + default: die(ret, "read_row: read row %" PRIu64, keyno); + } #ifdef HAVE_BERKELEY_DB if (!SINGLETHREADED) diff --git a/test/format/t.c b/test/format/t.c index d46cfefb025..603706e0ba1 100644 --- a/test/format/t.c +++ b/test/format/t.c @@ -312,6 +312,11 @@ die(int e, const char *fmt, ...) /* Single-thread error handling. */ (void)pthread_rwlock_wrlock(&g.death_lock); + /* Try and turn off tracking so it doesn't obscure the error message. */ + if (g.track) { + g.track = 0; + fprintf(stderr, "\n"); + } if (fmt != NULL) { /* Death message. */ fprintf(stderr, "%s: ", g.progname); va_start(ap, fmt); diff --git a/test/format/util.c b/test/format/util.c index 9d28b7a81bc..0f4f5de7c20 100644 --- a/test/format/util.c +++ b/test/format/util.c @@ -78,8 +78,7 @@ key_gen_setup(uint8_t **keyp) *keyp = NULL; len = MAX(KILOBYTE(100), g.c_key_max); - if ((key = malloc(len)) == NULL) - die(errno, "malloc"); + key = dmalloc(len); for (i = 0; i < len; ++i) key[i] = (uint8_t)("abcdefghijklmnopqrstuvwxyz"[i % 26]); *keyp = key; @@ -139,8 +138,7 @@ val_gen_setup(WT_RAND_STATE *rnd, uint8_t **valp) * data for column-store run-length encoded files. */ len = MAX(KILOBYTE(100), g.c_value_max) + 20; - if ((val = malloc(len)) == NULL) - die(errno, "malloc"); + val = dmalloc(len); for (i = 0; i < len; ++i) val[i] = (uint8_t)("ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % 26]); @@ -257,43 +255,36 @@ path_setup(const char *home) size_t len; /* Home directory. */ - if ((g.home = strdup(home == NULL ? "RUNDIR" : home)) == NULL) - die(errno, "malloc"); + g.home = dstrdup(home == NULL ? "RUNDIR" : home); /* Log file. */ len = strlen(g.home) + strlen("log") + 2; - if ((g.home_log = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_log = dmalloc(len); snprintf(g.home_log, len, "%s/%s", g.home, "log"); /* RNG log file. */ len = strlen(g.home) + strlen("rand") + 2; - if ((g.home_rand = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_rand = dmalloc(len); snprintf(g.home_rand, len, "%s/%s", g.home, "rand"); /* Run file. */ len = strlen(g.home) + strlen("CONFIG") + 2; - if ((g.home_config = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_config = dmalloc(len); snprintf(g.home_config, len, "%s/%s", g.home, "CONFIG"); /* Statistics file. */ len = strlen(g.home) + strlen("stats") + 2; - if ((g.home_stats = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_stats = dmalloc(len); snprintf(g.home_stats, len, "%s/%s", g.home, "stats"); /* Backup directory. */ len = strlen(g.home) + strlen("BACKUP") + 2; - if ((g.home_backup = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_backup = dmalloc(len); snprintf(g.home_backup, len, "%s/%s", g.home, "BACKUP"); /* BDB directory. 
*/ len = strlen(g.home) + strlen("bdb") + 2; - if ((g.home_bdb = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_bdb = dmalloc(len); snprintf(g.home_bdb, len, "%s/%s", g.home, "bdb"); /* @@ -315,8 +306,7 @@ path_setup(const char *home) "mkdir KVS" #endif len = strlen(g.home) * 3 + strlen(CMD) + 1; - if ((g.home_init = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_init = dmalloc(len); snprintf(g.home_init, len, CMD, g.home, g.home, g.home); /* Backup directory initialize command, remove and re-create it. */ @@ -327,8 +317,7 @@ path_setup(const char *home) #define CMD "rm -rf %s && mkdir %s" #endif len = strlen(g.home_backup) * 2 + strlen(CMD) + 1; - if ((g.home_backup_init = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_backup_init = dmalloc(len); snprintf(g.home_backup_init, len, CMD, g.home_backup, g.home_backup); /* @@ -351,8 +340,7 @@ path_setup(const char *home) "cp WiredTiger* wt* slvg.copy/" #endif len = strlen(g.home) + strlen(CMD) + 1; - if ((g.home_salvage_copy = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_salvage_copy = dmalloc(len); snprintf(g.home_salvage_copy, len, CMD, g.home); } @@ -422,3 +410,31 @@ fclose_and_clear(FILE **fpp) die(errno, "fclose"); return; } + +/* + * dmalloc -- + * Call malloc, dying on failure. + */ +void * +dmalloc(size_t len) +{ + void *p; + + if ((p = malloc(len)) == NULL) + die(errno, "malloc"); + return (p); +} + +/* + * dstrdup -- + * Call strdup, dying on failure. + */ +char * +dstrdup(const char *str) +{ + char *p; + + if ((p = strdup(str)) == NULL) + die(errno, "strdup"); + return (p); +} diff --git a/test/format/wts.c b/test/format/wts.c index 3d3b59810e8..23823c20184 100644 --- a/test/format/wts.c +++ b/test/format/wts.c @@ -462,8 +462,7 @@ wts_dump(const char *tag, int dump_bdb) track("dump files and compare", 0ULL, NULL); len = strlen(g.home) + strlen(BERKELEY_DB_PATH) + strlen(g.uri) + 100; - if ((cmd = malloc(len)) == NULL) - die(errno, "malloc"); + cmd = dmalloc(len); (void)snprintf(cmd, len, "sh s_dumpcmp -h %s %s %s %s %s %s", g.home, @@ -564,9 +563,7 @@ wts_stats(void) /* Data source statistics. */ fprintf(fp, "\n\n====== Data source statistics:\n"); - if ((stat_name = - malloc(strlen("statistics:") + strlen(g.uri) + 1)) == NULL) - die(errno, "malloc"); + stat_name = dmalloc(strlen("statistics:") + strlen(g.uri) + 1); sprintf(stat_name, "statistics:%s", g.uri); if ((ret = session->open_cursor( session, stat_name, NULL, NULL, &cursor)) != 0) diff --git a/test/suite/test_sweep01.py b/test/suite/test_sweep01.py index d0474cfa7e6..13422a75a61 100644 --- a/test/suite/test_sweep01.py +++ b/test/suite/test_sweep01.py @@ -192,9 +192,9 @@ def test_ops(self): print "ref1: " + str(ref1) + " ref2: " + str(ref2) print "XX: nfile1: " + str(nfile1) + " nfile2: " + str(nfile2) self.assertEqual(nfile2 < nfile1, True) - # The only files that should be left is the metadata, the lock file - # and the active file. - if (nfile2 != 3): + # The only files that should be left are the metadata, the lookaside + # file, the lock file, and the active file. 
+ if (nfile2 != 4): print "close1: " + str(close1) + " close2: " + str(close2) print "remove1: " + str(remove1) + " remove2: " + str(remove2) print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2) @@ -203,7 +203,7 @@ def test_ops(self): print "tod1: " + str(tod1) + " tod2: " + str(tod2) print "ref1: " + str(ref1) + " ref2: " + str(ref2) print "XX2: nfile1: " + str(nfile1) + " nfile2: " + str(nfile2) - self.assertEqual(nfile2 == 3, True) + self.assertEqual(nfile2 == 4, True) if __name__ == '__main__': wttest.run()
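
The __rec_update_las() hunk above defines the lookaside record layout: the key packs the btree ID, the block address, the on-page transaction ID, a counter that preserves update-list order, and the row's key, while the value packs the update's transaction ID, size, and data. The standalone sketch below shows that layout through the public API and is illustrative only: the "IuQQu"/"QIu" formats are inferred from the set_key()/set_value() calls in the hunk, and the table name, IDs, and payloads are hypothetical stand-ins, since the real lookaside file (WT_LAS_URI) is created internally via the __wt_las_* interfaces rather than through WT_SESSION::create.

/*
 * las_layout_demo.c --
 *	Hedged sketch of the lookaside record layout. The table name and
 *	all IDs/payloads are made up; the key/value formats are inferred
 *	from the cursor->set_key()/set_value() calls in __rec_update_las().
 */
#include <stdlib.h>
#include <string.h>
#include <wiredtiger.h>

int
main(void)
{
	WT_CONNECTION *conn;
	WT_CURSOR *cursor;
	WT_ITEM las_addr, las_key, las_value;
	WT_SESSION *session;
	uint64_t las_counter, onpage_txn, upd_txn;
	uint32_t btree_id, upd_size;

	/* Open (or create) a database in the current working directory. */
	if (wiredtiger_open(NULL, NULL, "create", &conn) != 0)
		return (EXIT_FAILURE);
	if (conn->open_session(conn, NULL, NULL, &session) != 0)
		return (EXIT_FAILURE);

	/* Same key/value formats the lookaside cursor appears to use. */
	if (session->create(session, "table:las_demo",
	    "key_format=IuQQu,value_format=QIu") != 0)
		return (EXIT_FAILURE);
	if (session->open_cursor(
	    session, "table:las_demo", NULL, NULL, &cursor) != 0)
		return (EXIT_FAILURE);

	memset(&las_addr, 0, sizeof(las_addr));
	memset(&las_key, 0, sizeof(las_key));
	memset(&las_value, 0, sizeof(las_value));

	/* Key: file ID, block address cookie, on-page txn, counter, key. */
	btree_id = 7;				/* stand-in file ID */
	las_addr.data = "\x01\x02\x03";		/* stand-in block address */
	las_addr.size = 3;
	onpage_txn = 100;			/* txn of the on-page value */
	las_counter = 1;			/* preserves update order */
	las_key.data = "user-key";
	las_key.size = strlen("user-key");

	/* Value: the saved update's transaction ID, size and data. */
	upd_txn = 102;
	las_value.data = "newer-value";
	las_value.size = strlen("newer-value");
	upd_size = (uint32_t)las_value.size;

	cursor->set_key(cursor,
	    btree_id, &las_addr, onpage_txn, las_counter, &las_key);
	cursor->set_value(cursor, upd_txn, upd_size, &las_value);
	if (cursor->insert(cursor) != 0)
		return (EXIT_FAILURE);

	return (conn->close(conn, NULL) == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}

Because the block address, on-page transaction ID and per-update counter are all part of the key, records for one block sort together and a later reader can recover the saved updates in the same order they appeared on the update list. Build against the library in the usual way (for example, cc las_layout_demo.c -lwiredtiger).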
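
A second, purely illustrative sketch in plain C (no WiredTiger internals): why the reconciliation change above appends the original on-page value to the update list with WT_TXN_NONE. Readers walk the list from newest to oldest and take the first entry whose transaction they can see, so an entry carrying the lowest possible transaction ID remains visible to every reader even after the page image holding the original value has been rewritten. The struct and the simple snapshot comparison below are simplified stand-ins for the real WT_UPDATE list and visibility check.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TXN_NONE	0	/* lowest possible ID: globally visible */

struct upd {
	uint64_t txnid;		/* transaction that made the update */
	const char *value;	/* update payload */
	struct upd *next;	/* newest-to-oldest singly-linked list */
};

/* Simplified visibility: a reader sees IDs at or below its snapshot. */
static const char *
visible_value(const struct upd *list, uint64_t snap_id)
{
	for (; list != NULL; list = list->next)
		if (list->txnid <= snap_id)
			return (list->value);
	return (NULL);
}

int
main(void)
{
	struct upd newest = { 12, "new value", NULL };
	struct upd orig = { TXN_NONE, "original on-page value", NULL };
	struct upd *p;

	/* Append the original value to the end of the update list. */
	for (p = &newest; p->next != NULL; p = p->next)
		;
	p->next = &orig;

	/* A new reader sees the update, an old reader the original value. */
	printf("snapshot 20 sees: %s\n", visible_value(&newest, 20));
	printf("snapshot  5 sees: %s\n", visible_value(&newest, 5));
	return (EXIT_SUCCESS);
}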