diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index c59889de122b..468ae7dec1e1 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -25,6 +25,9 @@ files can be found in mm/swap.c. Currently, these files are in /proc/sys/vm: - admin_reserve_kbytes +- anon_min_ratio +- clean_low_ratio +- clean_min_ratio - compact_memory - compaction_proactiveness - compact_unevictable_allowed @@ -106,6 +109,67 @@ On x86_64 this is about 128MB. Changing this takes effect whenever an application requests memory. +anon_min_ratio +============== + +This knob provides *hard* protection of anonymous pages. The anonymous pages +on the current node won't be reclaimed under any conditions when their amount +is below vm.anon_min_ratio. + +This knob may be used to prevent excessive swap thrashing when anonymous +memory is low (for example, when memory is going to be overfilled by +compressed data of the zram module). + +Setting this value too high (close to 100) can result in inability to +swap and can lead to early OOM under memory pressure. + +The unit of measurement is the percentage of the total memory of the node. + +The default value is 15. The knob has no effect unless vm.workingset_protection is enabled. + + +clean_low_ratio +=============== + +This knob provides *best-effort* protection of clean file pages. The file pages +on the current node won't be reclaimed under memory pressure when the amount of +clean file pages is below vm.clean_low_ratio *unless* an OOM is imminent. + +Protection of clean file pages using this knob may be used when swapping is +still possible to + - prevent disk I/O thrashing under memory pressure; + - improve performance in disk cache-bound tasks under memory pressure. + +Setting it to a high value may result in an early eviction of anonymous pages +into the swap space by attempting to hold the protected amount of clean file +pages in memory. + +The unit of measurement is the percentage of the total memory of the node.
+ +The default value is 0. The knob has no effect unless vm.workingset_protection is enabled. + + +clean_min_ratio +=============== + +This knob provides *hard* protection of clean file pages. The file pages on the +current node won't be reclaimed under memory pressure when the amount of clean +file pages is below vm.clean_min_ratio. + +Hard protection of clean file pages using this knob may be used to + - prevent disk I/O thrashing under memory pressure even with no free swap space; + - improve performance in disk cache-bound tasks under memory pressure; + - avoid high latency and prevent livelock in near-OOM conditions. + +Setting it to a high value may result in an early out-of-memory condition due to +the inability to reclaim the protected amount of clean file pages when other +types of pages cannot be reclaimed. + +The unit of measurement is the percentage of the total memory of the node. + +The default value is 15. The knob has no effect unless vm.workingset_protection is enabled. + + compact_memory ============== @@ -910,6 +974,14 @@ be 133 (x + 2x = 200, 2x = 133.33). At 0, the kernel will not initiate swap until the amount of free and file-backed pages is less than the high watermark in a zone. +This knob has no effect if the amount of clean file pages on the current +node is below vm.clean_low_ratio or vm.clean_min_ratio. In this case, +only anonymous pages can be reclaimed. + +If the number of anonymous pages on the current node is below +vm.anon_min_ratio, then only file pages can be reclaimed, regardless +of the vm.swappiness value.
+ unprivileged_userfaultfd ======================== diff --git a/include/linux/mm.h b/include/linux/mm.h index f5a97dec5169..573810f234e4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -195,6 +195,14 @@ static inline void __mm_zero_struct_page(struct page *page) extern int sysctl_max_map_count; +extern bool sysctl_workingset_protection; /* master switch: while false, reclaim clears all three protection flags */ +extern u8 sysctl_anon_min_ratio; /* the three ratios are percentages, limited to 0..100 by their sysctl entries */ +extern u8 sysctl_clean_low_ratio; +extern u8 sysctl_clean_min_ratio; +int vm_workingset_protection_update_handler( + struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 77b99a3eceab..bc57b9978569 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2227,6 +2227,40 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif + { + .procname = "workingset_protection", + .data = &sysctl_workingset_protection, + .maxlen = sizeof(bool), + .mode = 0644, + .proc_handler = &proc_dobool, + }, + { + .procname = "anon_min_ratio", + .data = &sysctl_anon_min_ratio, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = &vm_workingset_protection_update_handler, /* u8 min/max write that also invalidates the thresholds cached in mm/vmscan.c */ + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "clean_low_ratio", + .data = &sysctl_clean_low_ratio, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = &vm_workingset_protection_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "clean_min_ratio", + .data = &sysctl_clean_min_ratio, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = &vm_workingset_protection_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, { .procname = "user_reserve_kbytes", .data = &sysctl_user_reserve_kbytes, diff --git a/mm/Kconfig b/mm/Kconfig index ffc3a2ba3a8c..5c4dd9c7c22d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -486,6 +486,69 @@ config
ARCH_WANT_OPTIMIZE_DAX_VMEMMAP config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP bool +config ANON_MIN_RATIO + int "Default value for vm.anon_min_ratio" + depends on SYSCTL + range 0 100 + default 15 + help + This option sets the default value for vm.anon_min_ratio sysctl knob. + + The vm.anon_min_ratio sysctl knob provides *hard* protection of + anonymous pages. The anonymous pages on the current node won't be + reclaimed under any conditions when their amount is below + vm.anon_min_ratio. This knob may be used to prevent excessive swap + thrashing when anonymous memory is low (for example, when memory is + going to be overfilled by compressed data of the zram module). + + Setting this value too high (close to 100) can result in + inability to swap and can lead to early OOM under memory pressure. + +config CLEAN_LOW_RATIO + int "Default value for vm.clean_low_ratio" + depends on SYSCTL + range 0 100 + default 0 + help + This option sets the default value for vm.clean_low_ratio sysctl knob. + + The vm.clean_low_ratio sysctl knob provides *best-effort* + protection of clean file pages. The file pages on the current node + won't be reclaimed under memory pressure when the amount of clean file + pages is below vm.clean_low_ratio *unless* an OOM is imminent. + Protection of clean file pages using this knob may be used when + swapping is still possible to + - prevent disk I/O thrashing under memory pressure; + - improve performance in disk cache-bound tasks under memory + pressure. + + Setting it to a high value may result in an early eviction of anonymous + pages into the swap space by attempting to hold the protected amount + of clean file pages in memory. + +config CLEAN_MIN_RATIO + int "Default value for vm.clean_min_ratio" + depends on SYSCTL + range 0 100 + default 15 + help + This option sets the default value for vm.clean_min_ratio sysctl knob. + + The vm.clean_min_ratio sysctl knob provides *hard* protection of + clean file pages.
The file pages on the current node won't be + reclaimed under memory pressure when the amount of clean file pages is + below vm.clean_min_ratio. Hard protection of clean file pages using + this knob may be used to + - prevent disk I/O thrashing under memory pressure even with no free + swap space; + - improve performance in disk cache-bound tasks under memory + pressure; + - avoid high latency and prevent livelock in near-OOM conditions. + + Setting it to a high value may result in an early out-of-memory condition + due to the inability to reclaim the protected amount of clean file pages + when other types of pages cannot be reclaimed. + config HAVE_MEMBLOCK_PHYS_MAP bool diff --git a/mm/mm_init.c b/mm/mm_init.c index 2c19f5515e36..4470205bab48 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2749,6 +2749,7 @@ static void __init mem_init_print_info(void) , K(totalhigh_pages()) #endif ); + printk(KERN_INFO "le9 Unofficial (le9uo) working set protection 1.5 by Masahito Suzuki (forked from hakavlad's original le9 patch)\n"); /* newline terminates the record so it is not left open as a continuation candidate */ } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 709a91e0f06b..46dee9f2a58a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -133,6 +133,15 @@ struct scan_control { /* The file folios on the current node are dangerously low */ unsigned int file_is_tiny:1; + /* The anonymous pages on the current node are below vm.anon_min_ratio */ + unsigned int anon_below_min:1; + + /* The clean file pages on the current node are below vm.clean_low_ratio */ + unsigned int clean_below_low:1; + + /* The clean file pages on the current node are below vm.clean_min_ratio */ + unsigned int clean_below_min:1; + /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; @@ -182,6 +191,15 @@ struct scan_control { #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) #endif +bool sysctl_workingset_protection __read_mostly = false; /* master switch for the ratio knobs, off by default */ +u8 sysctl_anon_min_ratio __read_mostly = CONFIG_ANON_MIN_RATIO; +u8 sysctl_clean_low_ratio
__read_mostly = CONFIG_CLEAN_LOW_RATIO; +u8 sysctl_clean_min_ratio __read_mostly = CONFIG_CLEAN_MIN_RATIO; +static u64 sysctl_anon_min_ratio_kb __read_mostly = 0; /* cached thresholds: despite the _kb suffix they hold totalram * ratio / 100 and are compared directly with node_page_state() sums */ +static u64 sysctl_clean_low_ratio_kb __read_mostly = 0; +static u64 sysctl_clean_min_ratio_kb __read_mostly = 0; +static u64 workingset_protection_prev_totalram __read_mostly = 0; /* totalram the cached thresholds were computed for; the sysctl handler resets it to 0 */ + /* * From 0 .. 200. Higher means more swappy. */ @@ -1052,6 +1070,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, folio_mapped(folio) && folio_test_referenced(folio)) goto keep_locked; + if (folio_is_file_lru(folio) ? sc->clean_below_min : sc->anon_below_min) + goto keep_locked; + /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing @@ -2371,6 +2392,15 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, goto out; } + /* + * Force-scan anon if the amount of clean file pages is under + * vm.clean_low_ratio or vm.clean_min_ratio. + */ + if (sc->clean_below_low || sc->clean_below_min) { + scan_balance = SCAN_ANON; + goto out; + } + /* * If there is enough inactive page cache, we do not reclaim * anything from the anonymous working right now. @@ -2515,6 +2545,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, BUG(); } + /* + * Hard protection of the working set. + * Don't reclaim anon/file pages when the amount is + * below the watermark of the same type. + */ + if (file ?
sc->clean_below_min : sc->anon_below_min) + scan = 0; + nr[lru] = scan; } } @@ -3924,6 +3962,23 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc /* to protect the working set of the last N jiffies */ static unsigned long lru_gen_min_ttl __read_mostly; +static void do_invoke_oom(struct scan_control *sc, bool try_memcg) { /* run out_of_memory() for this reclaim context without ever blocking on oom_lock */ + struct oom_control oc = { + .gfp_mask = sc->gfp_mask, + .order = sc->order, + }; + + if (try_memcg && mem_cgroup_oom_synchronize(true)) /* nonzero return skips the global path; presumably the memcg OOM was already handled — confirm */ + return; + + if (!mutex_trylock(&oom_lock)) /* lock is contended: give up instead of waiting */ + return; + out_of_memory(&oc); + mutex_unlock(&oom_lock); +} +#define invoke_oom(sc) do_invoke_oom(sc, true) /* tries mem_cgroup_oom_synchronize() first */ +#define invoke_oom_nomemcg(sc) do_invoke_oom(sc, false) /* goes straight to the oom_lock path */ + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; @@ -3952,14 +4007,96 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) * younger than min_ttl. However, another possibility is all memcgs are * either too small or below min.
*/ - if (mutex_trylock(&oom_lock)) { - struct oom_control oc = { - .gfp_mask = sc->gfp_mask, - }; + invoke_oom_nomemcg(sc); +} - out_of_memory(&oc); +int vm_workingset_protection_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) /* NOTE(review): buffer is forwarded unchanged to proc_dou8vec_minmax() — confirm the __user annotation matches that prototype */ +{ + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) /* reads and failed writes leave the cached thresholds alone */ + return ret; + + workingset_protection_prev_totalram = 0; /* makes the totalram comparison in prepare_workingset_protection() mismatch so the thresholds are recomputed */ - mutex_unlock(&oom_lock); + return 0; +} + +static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc) /* sets sc->anon_below_min, sc->clean_below_low and sc->clean_below_min for this node */ +{ + unsigned long node_mem_total; + struct sysinfo i; + + if (!(sysctl_workingset_protection)) { /* master switch off: clear all three flags and stop */ + sc->anon_below_min = 0; + sc->clean_below_low = 0; + sc->clean_below_min = 0; + return; + } + + if (likely(sysctl_anon_min_ratio || + sysctl_clean_low_ratio || + sysctl_clean_min_ratio)) { +#ifdef CONFIG_NUMA + si_meminfo_node(&i, pgdat->node_id); +#else //CONFIG_NUMA + si_meminfo(&i); +#endif //CONFIG_NUMA + node_mem_total = i.totalram; + + if (unlikely(workingset_protection_prev_totalram != node_mem_total)) { /* NOTE(review): one global cache serves every node — confirm behaviour when nodes report different totalram */ + sysctl_anon_min_ratio_kb = + node_mem_total * sysctl_anon_min_ratio / 100; + sysctl_clean_low_ratio_kb = + node_mem_total * sysctl_clean_low_ratio / 100; + sysctl_clean_min_ratio_kb = + node_mem_total * sysctl_clean_min_ratio / 100; + workingset_protection_prev_totalram = node_mem_total; + } + } + + /* + * Check the number of anonymous pages to protect them from + * reclaiming if their amount is below the configured threshold. + */ + if (sysctl_anon_min_ratio) { + unsigned long reclaimable_anon; + + reclaimable_anon = + node_page_state(pgdat, NR_ACTIVE_ANON) + + node_page_state(pgdat, NR_INACTIVE_ANON) + + node_page_state(pgdat, NR_ISOLATED_ANON); + + sc->anon_below_min = reclaimable_anon < sysctl_anon_min_ratio_kb; + } else + sc->anon_below_min = 0; + + /* + * Check the number of clean file pages to protect them from + * reclaiming if their amount is below the configured threshold.
+ */ + if (sysctl_clean_low_ratio || sysctl_clean_min_ratio) { + unsigned long reclaimable_file, dirty, clean; + + reclaimable_file = + node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_ISOLATED_FILE); + dirty = node_page_state(pgdat, NR_FILE_DIRTY); + /* + * node_page_state() sum can go out of sync since + * all the values are not read at once. + */ + if (likely(reclaimable_file > dirty)) /* guards the unsigned subtraction below against wrapping */ + clean = reclaimable_file - dirty; + else + clean = 0; + + sc->clean_below_low = clean < sysctl_clean_low_ratio_kb; + sc->clean_below_min = clean < sysctl_clean_min_ratio_kb; + } else { + sc->clean_below_low = 0; + sc->clean_below_min = 0; } } @@ -4462,6 +4599,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw */ if (!swappiness) type = LRU_GEN_FILE; + else if (sc->clean_below_min || sc->clean_below_low) /* clean file cache is under a threshold: start with anon */ + type = LRU_GEN_ANON; else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) type = LRU_GEN_ANON; else if (swappiness == 1) @@ -4471,7 +4610,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw else type = get_type_to_scan(lruvec, swappiness, &tier); - for (i = !swappiness; i < ANON_AND_FILE; i++) { + for (i = 0; i < ANON_AND_FILE; i++) { /* NOTE(review): the start index no longer depends on swappiness, so both passes run even when it is 0 — confirm this is intended */ if (tier < 0) tier = get_tier_idx(lruvec, type); @@ -4749,6 +4888,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); + prepare_workingset_protection(pgdat, sc); /* refresh the sc protection flags for this node */ mem_cgroup_calculate_protection(NULL, memcg); if (mem_cgroup_below_min(NULL, memcg)) @@ -5899,6 +6039,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) prepare_scan_control(pgdat, sc); + prepare_workingset_protection(pgdat, sc); /* refresh the sc protection flags for this node */ + shrink_node_memcgs(pgdat, sc); flush_reclaim_state(sc); @@ -5987,6 +6129,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) */ if (reclaimable)
pgdat->kswapd_failures = 0; + else if (sc->clean_below_min && !sc->priority) + invoke_oom(sc); } /*