diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 586cd4b864284..63bdd03180145 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -25,7 +25,10 @@ files can be found in mm/swap.c.
 Currently, these files are in /proc/sys/vm:
 
 - admin_reserve_kbytes
+- anon_min_kbytes
 - block_dump
+- clean_low_kbytes
+- clean_min_kbytes
 - compact_memory
 - compaction_proactiveness
 - compact_unevictable_allowed
@@ -106,6 +109,23 @@ On x86_64 this is about 128MB.
 Changing this takes effect whenever an application requests memory.
 
 
+anon_min_kbytes
+===============
+
+This knob provides *hard* protection of anonymous pages. The anonymous pages
+on the current node won't be reclaimed under any conditions when their amount
+is below vm.anon_min_kbytes.
+
+This knob may be used to prevent excessive swap thrashing when anonymous
+memory is low (for example, when memory is going to be overfilled by
+compressed data of the zram module).
+
+Setting this value too high (close to MemTotal) can result in an inability
+to swap and can lead to early OOM under memory pressure.
+
+The default value is defined by CONFIG_ANON_MIN_KBYTES.
+
+
 block_dump
 ==========
 
@@ -113,6 +133,44 @@ block_dump enables block I/O debugging when set to a nonzero value. More
 information on block I/O debugging is in
 Documentation/admin-guide/laptops/laptop-mode.rst.
 
+clean_low_kbytes
+================
+
+This knob provides *best-effort* protection of clean file pages. The file pages
+on the current node won't be reclaimed under memory pressure when the amount of
+clean file pages is below vm.clean_low_kbytes *unless* we threaten to OOM.
+
+Protection of clean file pages using this knob may be used, when swapping is
+still possible, to
+  - prevent disk I/O thrashing under memory pressure;
+  - improve performance in disk cache-bound tasks under memory pressure.
+
+Setting it to a high value may result in an early eviction of anonymous pages
+into the swap space by attempting to hold the protected amount of clean file
+pages in memory.
+
+The default value is defined by CONFIG_CLEAN_LOW_KBYTES.
+
+
+clean_min_kbytes
+================
+
+This knob provides *hard* protection of clean file pages. The file pages on the
+current node won't be reclaimed under memory pressure when the amount of clean
+file pages is below vm.clean_min_kbytes.
+
+Hard protection of clean file pages using this knob may be used to
+  - prevent disk I/O thrashing under memory pressure even with no free swap space;
+  - improve performance in disk cache-bound tasks under memory pressure;
+  - avoid high latency and prevent livelock in near-OOM conditions.
+
+Setting it to a high value may result in an early out-of-memory condition due to
+the inability to reclaim the protected amount of clean file pages when other
+types of pages cannot be reclaimed.
+
+The default value is defined by CONFIG_CLEAN_MIN_KBYTES.
+
+
 compact_memory
 ==============
 
@@ -869,6 +927,14 @@ be 133 (x + 2x = 200, 2x = 133.33).
 At 0, the kernel will not initiate swap until the amount of free and
 file-backed pages is less than the high watermark in a zone.
 
+This knob has no effect if the amount of clean file pages on the current
+node is below vm.clean_low_kbytes or vm.clean_min_kbytes. In this case,
+only anonymous pages can be reclaimed.
+
+If the number of anonymous pages on the current node is below
+vm.anon_min_kbytes, then only file pages can be reclaimed with
+any vm.swappiness value.
+
 unprivileged_userfaultfd
 ========================
 
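For illustration only (not part of the patch): on a kernel built with these
changes, the knobs can be set at runtime through procfs like any other vm
sysctl, e.g. sysctl vm.clean_low_kbytes=262144. A minimal userspace sketch in
C follows; write_vm_knob() is a hypothetical helper name and 262144 kB
(256 MiB) is an arbitrary example value.

/*
 * Illustration only, not part of the patch. Assumes a kernel with this
 * patch applied so that /proc/sys/vm/clean_low_kbytes exists; needs root.
 * write_vm_knob() is a hypothetical helper, not a kernel or libc API.
 */
#include <stdio.h>

static int write_vm_knob(const char *knob, unsigned long kbytes)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/vm/%s", knob);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%lu\n", kbytes);
	return fclose(f);
}

int main(void)
{
	/* Ask for best-effort protection of ~256 MiB of clean page cache. */
	return write_vm_knob("clean_low_kbytes", 262144) ? 1 : 0;
}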
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 122ffc83ffbeb..f73dbffd198fd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -213,6 +213,10 @@ static inline void __mm_zero_struct_page(struct page *page)
 
 extern int sysctl_max_map_count;
 
+extern unsigned long sysctl_anon_min_kbytes;
+extern unsigned long sysctl_clean_low_kbytes;
+extern unsigned long sysctl_clean_min_kbytes;
+
 extern unsigned long sysctl_user_reserve_kbytes;
 extern unsigned long sysctl_admin_reserve_kbytes;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e86520438ec8b..c92d171469f70 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3117,6 +3117,27 @@ static struct ctl_table vm_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif
+	{
+		.procname	= "anon_min_kbytes",
+		.data		= &sysctl_anon_min_kbytes,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "clean_low_kbytes",
+		.data		= &sysctl_clean_low_kbytes,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "clean_min_kbytes",
+		.data		= &sysctl_clean_min_kbytes,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
 	{
 		.procname	= "user_reserve_kbytes",
 		.data		= &sysctl_user_reserve_kbytes,
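A side note on the table entries above: since no .extra1/.extra2 bounds are
supplied, proc_doulongvec_minmax() accepts any unsigned long value, and
sanity-checking against MemTotal is left to the administrator, as the
documentation warns. For illustration only (not part of the patch), the
registered knobs can be read back from userspace; read_vm_knob() is a
hypothetical helper name.

/*
 * Illustration only, not part of the patch: read back the three knobs
 * registered above on a kernel that carries this patch.
 */
#include <stdio.h>

static unsigned long read_vm_knob(const char *knob)
{
	char path[64];
	unsigned long val = 0;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/vm/%s", knob);
	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%lu", &val) != 1)
			val = 0;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("anon_min_kbytes:  %lu\n", read_vm_knob("anon_min_kbytes"));
	printf("clean_low_kbytes: %lu\n", read_vm_knob("clean_low_kbytes"));
	printf("clean_min_kbytes: %lu\n", read_vm_knob("clean_min_kbytes"));
	return 0;
}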
diff --git a/mm/Kconfig b/mm/Kconfig
index da125f145bc4b..1c4c718bf2633 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -121,6 +121,69 @@ config SPARSEMEM_VMEMMAP
 	  pfn_to_page and page_to_pfn operations.  This is the most
 	  efficient option when sufficient kernel resources are available.
 
+config ANON_MIN_KBYTES
+	int "Default value for vm.anon_min_kbytes"
+	depends on SYSCTL
+	range 0 4294967295
+	default 0
+	help
+	  This option sets the default value for vm.anon_min_kbytes sysctl knob.
+
+	  The vm.anon_min_kbytes sysctl knob provides *hard* protection of
+	  anonymous pages. The anonymous pages on the current node won't be
+	  reclaimed under any conditions when their amount is below
+	  vm.anon_min_kbytes. This knob may be used to prevent excessive swap
+	  thrashing when anonymous memory is low (for example, when memory is
+	  going to be overfilled by compressed data of the zram module).
+
+	  Setting this value too high (close to MemTotal) can result in an
+	  inability to swap and can lead to early OOM under memory pressure.
+
+config CLEAN_LOW_KBYTES
+	int "Default value for vm.clean_low_kbytes"
+	depends on SYSCTL
+	range 0 4294967295
+	default 0
+	help
+	  This option sets the default value for vm.clean_low_kbytes sysctl knob.
+
+	  The vm.clean_low_kbytes sysctl knob provides *best-effort*
+	  protection of clean file pages. The file pages on the current node
+	  won't be reclaimed under memory pressure when the amount of clean file
+	  pages is below vm.clean_low_kbytes *unless* we threaten to OOM.
+	  Protection of clean file pages using this knob may be used, when
+	  swapping is still possible, to
+	    - prevent disk I/O thrashing under memory pressure;
+	    - improve performance in disk cache-bound tasks under memory
+	      pressure.
+
+	  Setting it to a high value may result in an early eviction of
+	  anonymous pages into the swap space by attempting to hold the
+	  protected amount of clean file pages in memory.
+
+config CLEAN_MIN_KBYTES
+	int "Default value for vm.clean_min_kbytes"
+	depends on SYSCTL
+	range 0 4294967295
+	default 0
+	help
+	  This option sets the default value for vm.clean_min_kbytes sysctl knob.
+
+	  The vm.clean_min_kbytes sysctl knob provides *hard* protection of
+	  clean file pages. The file pages on the current node won't be
+	  reclaimed under memory pressure when the amount of clean file pages is
+	  below vm.clean_min_kbytes. Hard protection of clean file pages using
+	  this knob may be used to
+	    - prevent disk I/O thrashing under memory pressure even with no free
+	      swap space;
+	    - improve performance in disk cache-bound tasks under memory
+	      pressure;
+	    - avoid high latency and prevent livelock in near-OOM conditions.
+
+	  Setting it to a high value may result in an early out-of-memory
+	  condition due to the inability to reclaim the protected amount of
+	  clean file pages when other types of pages cannot be reclaimed.
+
 config HAVE_MEMBLOCK_PHYS_MAP
 	bool
 
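Before the reclaim changes below, one more illustration (not part of the
patch): mm/vmscan.c estimates the protected quantity as clean file pages =
(active file + inactive file + isolated file) - dirty, in kilobytes. A rough
userspace analogue can be computed from /proc/meminfo; note that meminfo is
system-wide and exposes no isolated-pages counter, so this only approximates
the per-node figure the kernel uses.

/*
 * Illustration only, not part of the patch: approximate the "clean file
 * pages" figure the patch protects, using system-wide /proc/meminfo
 * fields. The kernel-side check also counts NR_ISOLATED_FILE and works
 * per node; meminfo exposes neither, so this is only an estimate.
 */
#include <stdio.h>
#include <string.h>

static unsigned long meminfo_kb(const char *key)
{
	char line[128];
	unsigned long val = 0;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 0;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, key, strlen(key))) {
			sscanf(line + strlen(key), "%lu", &val);
			break;
		}
	}
	fclose(f);
	return val;
}

int main(void)
{
	unsigned long file = meminfo_kb("Active(file):") +
			     meminfo_kb("Inactive(file):");
	unsigned long dirty = meminfo_kb("Dirty:");
	unsigned long clean = file > dirty ? file - dirty : 0;

	printf("approximate clean file pages: %lu kB\n", clean);
	return 0;
}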
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 67e6cabc1e01d..f648a8d8f3d5e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -126,6 +126,15 @@ struct scan_control {
 	/* The file pages on the current node are dangerously low */
 	unsigned int file_is_tiny:1;
 
+	/* The anonymous pages on the current node are below vm.anon_min_kbytes */
+	unsigned int anon_below_min:1;
+
+	/* The clean file pages on the current node are below vm.clean_low_kbytes */
+	unsigned int clean_below_low:1;
+
+	/* The clean file pages on the current node are below vm.clean_min_kbytes */
+	unsigned int clean_below_min:1;
+
 	/* Allocation order */
 	s8 order;
 
@@ -172,6 +181,10 @@ struct scan_control {
 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
 #endif
 
+unsigned long sysctl_anon_min_kbytes __read_mostly = CONFIG_ANON_MIN_KBYTES;
+unsigned long sysctl_clean_low_kbytes __read_mostly = CONFIG_CLEAN_LOW_KBYTES;
+unsigned long sysctl_clean_min_kbytes __read_mostly = CONFIG_CLEAN_MIN_KBYTES;
+
 /*
  * From 0 .. 200.  Higher means more swappy.
  */
@@ -2436,6 +2449,54 @@ enum scan_balance {
 	SCAN_FILE,
 };
 
+static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc)
+{
+	/*
+	 * Check the number of anonymous pages to protect them from
+	 * reclaiming if their amount is below the specified threshold.
+	 */
+	if (sysctl_anon_min_kbytes) {
+		unsigned long reclaimable_anon;
+
+		reclaimable_anon =
+			node_page_state(pgdat, NR_ACTIVE_ANON) +
+			node_page_state(pgdat, NR_INACTIVE_ANON) +
+			node_page_state(pgdat, NR_ISOLATED_ANON);
+		reclaimable_anon <<= (PAGE_SHIFT - 10);
+
+		sc->anon_below_min = reclaimable_anon < sysctl_anon_min_kbytes;
+	} else
+		sc->anon_below_min = 0;
+
+	/*
+	 * Check the number of clean file pages to protect them from
+	 * reclaiming if their amount is below the specified threshold.
+	 */
+	if (sysctl_clean_low_kbytes || sysctl_clean_min_kbytes) {
+		unsigned long reclaimable_file, dirty, clean;
+
+		reclaimable_file =
+			node_page_state(pgdat, NR_ACTIVE_FILE) +
+			node_page_state(pgdat, NR_INACTIVE_FILE) +
+			node_page_state(pgdat, NR_ISOLATED_FILE);
+		dirty = node_page_state(pgdat, NR_FILE_DIRTY);
+		/*
+		 * node_page_state() sum can go out of sync since
+		 * all the values are not read at once.
+		 */
+		if (likely(reclaimable_file > dirty))
+			clean = (reclaimable_file - dirty) << (PAGE_SHIFT - 10);
+		else
+			clean = 0;
+
+		sc->clean_below_low = clean < sysctl_clean_low_kbytes;
+		sc->clean_below_min = clean < sysctl_clean_min_kbytes;
+	} else {
+		sc->clean_below_low = 0;
+		sc->clean_below_min = 0;
+	}
+}
+
 static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
 {
 	unsigned long file;
@@ -2534,6 +2595,8 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
 			!(sc->may_deactivate & DEACTIVATE_ANON) &&
 			anon >> sc->priority;
 	}
+
+	prepare_workingset_protection(pgdat, sc);
 }
 
 /*
@@ -2593,6 +2656,15 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 		goto out;
 	}
 
+	/*
+	 * Force-scan anon if the amount of clean file pages is under
+	 * vm.clean_low_kbytes or vm.clean_min_kbytes.
+	 */
+	if (sc->clean_below_low || sc->clean_below_min) {
+		scan_balance = SCAN_ANON;
+		goto out;
+	}
+
 	/*
 	 * If there is enough inactive page cache, we do not reclaim
 	 * anything from the anonymous working set right now.
 	 */
@@ -2737,6 +2809,25 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 			BUG();
 		}
 
+		/*
+		 * Hard protection of the working set.
+		 */
+		if (file) {
+			/*
+			 * Don't reclaim file pages when the amount of
+			 * clean file pages is below vm.clean_min_kbytes.
+			 */
+			if (sc->clean_below_min)
+				scan = 0;
+		} else {
+			/*
+			 * Don't reclaim anonymous pages when their
+			 * amount is below vm.anon_min_kbytes.
+			 */
+			if (sc->anon_below_min)
+				scan = 0;
+		}
+
 		nr[lru] = scan;
 	}
 }
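One last illustration (not part of the patch): the << (PAGE_SHIFT - 10)
shifts above convert page counts to kilobytes, since a page is 2^PAGE_SHIFT
bytes and a kilobyte is 2^10 bytes; with 4 KiB pages (PAGE_SHIFT = 12) that
is a shift by 2, i.e. 4 kB per page. A userspace sketch of the same
arithmetic:

/*
 * Illustration only, not part of the patch: the kernel-side
 * "pages << (PAGE_SHIFT - 10)" is equivalent to multiplying a page
 * count by (page size / 1024) to obtain kilobytes.
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);	/* typically 4096 */
	unsigned long pages = 1000;
	unsigned long kbytes = pages * (page_size / 1024);

	/* With 4 KiB pages: 1000 pages -> 4000 kB (same as pages << 2). */
	printf("%lu pages = %lu kB\n", pages, kbytes);
	return 0;
}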