From 99c7e5a01cc044f73640d2803ff2cf3cfded02d6 Mon Sep 17 00:00:00 2001 From: Alexandre Frade Date: Wed, 10 Apr 2024 17:35:28 +0000 Subject: [PATCH] Revert "mm/vmscan: add sysctl knobs for protecting the working set" This reverts commit d1ff80f39c97a0c6135e2c95da1cc3c51d86aeb7. --- Documentation/admin-guide/sysctl/vm.rst | 72 ---------- include/linux/mm.h | 8 -- kernel/sysctl.c | 34 ----- mm/Kconfig | 63 --------- mm/mm_init.c | 1 - mm/vmscan.c | 170 +----------------------- 6 files changed, 7 insertions(+), 341 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 5cc069c428379..45ba1f4dc0048 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -25,9 +25,6 @@ files can be found in mm/swap.c. Currently, these files are in /proc/sys/vm: - admin_reserve_kbytes -- anon_min_ratio -- clean_low_ratio -- clean_min_ratio - compact_memory - compaction_proactiveness - compact_unevictable_allowed @@ -109,67 +106,6 @@ On x86_64 this is about 128MB. Changing this takes effect whenever an application requests memory. -anon_min_ratio -============== - -This knob provides *hard* protection of anonymous pages. The anonymous pages -on the current node won't be reclaimed under any conditions when their amount -is below vm.anon_min_ratio. - -This knob may be used to prevent excessive swap thrashing when anonymous -memory is low (for example, when memory is going to be overfilled by -compressed data of zram module). - -Setting this value too high (close to 100) can result in inability to -swap and can lead to early OOM under memory pressure. - -The unit of measurement is the percentage of the total memory of the node. - -The default value is 15. - - -clean_low_ratio -================ - -This knob provides *best-effort* protection of clean file pages. The file pages -on the current node won't be reclaimed under memory pressure when the amount of -clean file pages is below vm.clean_low_ratio *unless* we threaten to OOM. - -Protection of clean file pages using this knob may be used when swapping is -still possible to - - prevent disk I/O thrashing under memory pressure; - - improve performance in disk cache-bound tasks under memory pressure. - -Setting it to a high value may result in a early eviction of anonymous pages -into the swap space by attempting to hold the protected amount of clean file -pages in memory. - -The unit of measurement is the percentage of the total memory of the node. - -The default value is 0. - - -clean_min_ratio -================ - -This knob provides *hard* protection of clean file pages. The file pages on the -current node won't be reclaimed under memory pressure when the amount of clean -file pages is below vm.clean_min_ratio. - -Hard protection of clean file pages using this knob may be used to - - prevent disk I/O thrashing under memory pressure even with no free swap space; - - improve performance in disk cache-bound tasks under memory pressure; - - avoid high latency and prevent livelock in near-OOM conditions. - -Setting it to a high value may result in a early out-of-memory condition due to -the inability to reclaim the protected amount of clean file pages when other -types of pages cannot be reclaimed. - -The unit of measurement is the percentage of the total memory of the node. - -The default value is 15. - - compact_memory ============== @@ -974,14 +910,6 @@ be 133 (x + 2x = 200, 2x = 133.33). At 0, the kernel will not initiate swap until the amount of free and file-backed pages is less than the high watermark in a zone. -This knob has no effect if the amount of clean file pages on the current -node is below vm.clean_low_ratio or vm.clean_min_ratio. In this case, -only anonymous pages can be reclaimed. - -If the number of anonymous pages on the current node is below -vm.anon_min_ratio, then only file pages can be reclaimed with -any vm.swappiness value. - unprivileged_userfaultfd ======================== diff --git a/include/linux/mm.h b/include/linux/mm.h index 9e6731543f10f..bf5d0b1b16f43 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -195,14 +195,6 @@ static inline void __mm_zero_struct_page(struct page *page) extern int sysctl_max_map_count; -extern bool sysctl_workingset_protection; -extern u8 sysctl_anon_min_ratio; -extern u8 sysctl_clean_low_ratio; -extern u8 sysctl_clean_min_ratio; -int vm_workingset_protection_update_handler( - struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 128a40e0b5dd0..d37130095aece 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2236,40 +2236,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif - { - .procname = "workingset_protection", - .data = &sysctl_workingset_protection, - .maxlen = sizeof(bool), - .mode = 0644, - .proc_handler = &proc_dobool, - }, - { - .procname = "anon_min_ratio", - .data = &sysctl_anon_min_ratio, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = &vm_workingset_protection_update_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "clean_low_ratio", - .data = &sysctl_clean_low_ratio, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = &vm_workingset_protection_update_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "clean_min_ratio", - .data = &sysctl_clean_min_ratio, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = &vm_workingset_protection_update_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, { .procname = "user_reserve_kbytes", .data = &sysctl_user_reserve_kbytes, diff --git a/mm/Kconfig b/mm/Kconfig index 4c21fdb6ec833..264a2df5ecf5b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -509,69 +509,6 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP bool -config ANON_MIN_RATIO - int "Default value for vm.anon_min_ratio" - depends on SYSCTL - range 0 100 - default 15 - help - This option sets the default value for vm.anon_min_ratio sysctl knob. - - The vm.anon_min_ratio sysctl knob provides *hard* protection of - anonymous pages. The anonymous pages on the current node won't be - reclaimed under any conditions when their amount is below - vm.anon_min_ratio. This knob may be used to prevent excessive swap - thrashing when anonymous memory is low (for example, when memory is - going to be overfilled by compressed data of zram module). - - Setting this value too high (close to MemTotal) can result in - inability to swap and can lead to early OOM under memory pressure. - -config CLEAN_LOW_RATIO - int "Default value for vm.clean_low_ratio" - depends on SYSCTL - range 0 100 - default 0 - help - This option sets the default value for vm.clean_low_ratio sysctl knob. - - The vm.clean_low_ratio sysctl knob provides *best-effort* - protection of clean file pages. The file pages on the current node - won't be reclaimed under memory pressure when the amount of clean file - pages is below vm.clean_low_ratio *unless* we threaten to OOM. - Protection of clean file pages using this knob may be used when - swapping is still possible to - - prevent disk I/O thrashing under memory pressure; - - improve performance in disk cache-bound tasks under memory - pressure. - - Setting it to a high value may result in a early eviction of anonymous - pages into the swap space by attempting to hold the protected amount - of clean file pages in memory. - -config CLEAN_MIN_RATIO - int "Default value for vm.clean_min_ratio" - depends on SYSCTL - range 0 100 - default 15 - help - This option sets the default value for vm.clean_min_ratio sysctl knob. - - The vm.clean_min_ratio sysctl knob provides *hard* protection of - clean file pages. The file pages on the current node won't be - reclaimed under memory pressure when the amount of clean file pages is - below vm.clean_min_ratio. Hard protection of clean file pages using - this knob may be used to - - prevent disk I/O thrashing under memory pressure even with no free - swap space; - - improve performance in disk cache-bound tasks under memory - pressure; - - avoid high latency and prevent livelock in near-OOM conditions. - - Setting it to a high value may result in a early out-of-memory condition - due to the inability to reclaim the protected amount of clean file pages - when other types of pages cannot be reclaimed. - config HAVE_MEMBLOCK_PHYS_MAP bool diff --git a/mm/mm_init.c b/mm/mm_init.c index 363e32b6f7003..77fd04c83d046 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2760,7 +2760,6 @@ static void __init mem_init_print_info(void) , K(totalhigh_pages()) #endif ); - printk(KERN_INFO "le9 Unofficial (le9uo) working set protection 1.3 by Masahito Suzuki (forked from hakavlad's original le9 patch)"); } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 2693f8e078a15..078221bdf47a0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -134,15 +134,6 @@ struct scan_control { /* The file folios on the current node are dangerously low */ unsigned int file_is_tiny:1; - /* The anonymous pages on the current node are below vm.anon_min_ratio */ - unsigned int anon_below_min:1; - - /* The clean file pages on the current node are below vm.clean_low_ratio */ - unsigned int clean_below_low:1; - - /* The clean file pages on the current node are below vm.clean_min_ratio */ - unsigned int clean_below_min:1; - /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; @@ -192,15 +183,6 @@ struct scan_control { #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) #endif -bool sysctl_workingset_protection __read_mostly = true; -u8 sysctl_anon_min_ratio __read_mostly = CONFIG_ANON_MIN_RATIO; -u8 sysctl_clean_low_ratio __read_mostly = CONFIG_CLEAN_LOW_RATIO; -u8 sysctl_clean_min_ratio __read_mostly = CONFIG_CLEAN_MIN_RATIO; -static u64 sysctl_anon_min_ratio_kb __read_mostly = 0; -static u64 sysctl_clean_low_ratio_kb __read_mostly = 0; -static u64 sysctl_clean_min_ratio_kb __read_mostly = 0; -static u64 workingset_protection_prev_totalram __read_mostly = 0; - /* * From 0 .. 200. Higher means more swappy. */ @@ -1770,9 +1752,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, folio_mapped(folio) && folio_test_referenced(folio)) goto keep_locked; - if (folio_is_file_lru(folio) ? sc->clean_below_min : sc->anon_below_min) - goto keep_locked; - /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing @@ -3092,15 +3071,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, goto out; } - /* - * Force-scan anon if clean file pages is under vm.clean_low_ratio - * or vm.clean_min_ratio. - */ - if (sc->clean_below_low || sc->clean_below_min) { - scan_balance = SCAN_ANON; - goto out; - } - /* * If there is enough inactive page cache, we do not reclaim * anything from the anonymous working right now. @@ -3245,25 +3215,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, BUG(); } - /* - * Hard protection of the working set. - */ - if (file) { - /* - * Don't reclaim file pages when the amount of - * clean file pages is below vm.clean_min_ratio. - */ - if (sc->clean_below_min) - scan = 0; - } else { - /* - * Don't reclaim anonymous pages when their - * amount is below vm.anon_min_ratio. - */ - if (sc->anon_below_min) - scan = 0; - } - nr[lru] = scan; } } @@ -4646,23 +4597,6 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc /* to protect the working set of the last N jiffies */ static unsigned long lru_gen_min_ttl __read_mostly; -static void do_invoke_oom(struct scan_control *sc, bool try_memcg) { - struct oom_control oc = { - .gfp_mask = sc->gfp_mask, - .order = sc->order, - }; - - if (try_memcg && mem_cgroup_oom_synchronize(true)) - return; - - if (!mutex_trylock(&oom_lock)) - return; - out_of_memory(&oc); - mutex_unlock(&oom_lock); -} -#define invoke_oom(sc) do_invoke_oom(sc, true) -#define invoke_oom_nomemcg(sc) do_invoke_oom(sc, false) - static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; @@ -4691,96 +4625,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) * younger than min_ttl. However, another possibility is all memcgs are * either too small or below min. */ - invoke_oom_nomemcg(sc); -} - -int vm_workingset_protection_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); - if (ret || !write) - return ret; - - workingset_protection_prev_totalram = 0; - - return 0; -} - -static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc) -{ - unsigned long node_mem_total; - struct sysinfo i; - - if (!(sysctl_workingset_protection)) { - sc->anon_below_min = 0; - sc->clean_below_low = 0; - sc->clean_below_min = 0; - return; - } - - if (likely(sysctl_anon_min_ratio || - sysctl_clean_low_ratio || - sysctl_clean_min_ratio)) { -#ifdef CONFIG_NUMA - si_meminfo_node(&i, pgdat->node_id); -#else //CONFIG_NUMA - si_meminfo(&i); -#endif //CONFIG_NUMA - node_mem_total = i.totalram; - - if (unlikely(workingset_protection_prev_totalram != node_mem_total)) { - sysctl_anon_min_ratio_kb = - node_mem_total * sysctl_anon_min_ratio / 100; - sysctl_clean_low_ratio_kb = - node_mem_total * sysctl_clean_low_ratio / 100; - sysctl_clean_min_ratio_kb = - node_mem_total * sysctl_clean_min_ratio / 100; - workingset_protection_prev_totalram = node_mem_total; - } - } - - /* - * Check the number of anonymous pages to protect them from - * reclaiming if their amount is below the specified. - */ - if (sysctl_anon_min_ratio) { - unsigned long reclaimable_anon; - - reclaimable_anon = - node_page_state(pgdat, NR_ACTIVE_ANON) + - node_page_state(pgdat, NR_INACTIVE_ANON) + - node_page_state(pgdat, NR_ISOLATED_ANON); + if (mutex_trylock(&oom_lock)) { + struct oom_control oc = { + .gfp_mask = sc->gfp_mask, + }; - sc->anon_below_min = reclaimable_anon < sysctl_anon_min_ratio_kb; - } else - sc->anon_below_min = 0; + out_of_memory(&oc); - /* - * Check the number of clean file pages to protect them from - * reclaiming if their amount is below the specified. - */ - if (sysctl_clean_low_ratio || sysctl_clean_min_ratio) { - unsigned long reclaimable_file, dirty, clean; - - reclaimable_file = - node_page_state(pgdat, NR_ACTIVE_FILE) + - node_page_state(pgdat, NR_INACTIVE_FILE) + - node_page_state(pgdat, NR_ISOLATED_FILE); - dirty = node_page_state(pgdat, NR_FILE_DIRTY); - /* - * node_page_state() sum can go out of sync since - * all the values are not read at once. - */ - if (likely(reclaimable_file > dirty)) - clean = reclaimable_file - dirty; - else - clean = 0; - - sc->clean_below_low = clean < sysctl_clean_low_ratio_kb; - sc->clean_below_min = clean < sysctl_clean_min_ratio_kb; - } else { - sc->clean_below_low = 0; - sc->clean_below_min = 0; + mutex_unlock(&oom_lock); } } @@ -5289,8 +5141,6 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw */ if (!swappiness) type = LRU_GEN_FILE; - else if (sc->clean_below_min || sc->clean_below_low) - type = LRU_GEN_ANON; else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) type = LRU_GEN_ANON; else if (swappiness == 1) @@ -5300,7 +5150,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw else type = get_type_to_scan(lruvec, swappiness, &tier); - for (i = 0; i < ANON_AND_FILE; i++) { + for (i = !swappiness; i < ANON_AND_FILE; i++) { if (tier < 0) tier = get_tier_idx(lruvec, type); @@ -5575,7 +5425,6 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); - prepare_workingset_protection(pgdat, sc); mem_cgroup_calculate_protection(NULL, memcg); if (mem_cgroup_below_min(NULL, memcg)) @@ -6723,8 +6572,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) prepare_scan_count(pgdat, sc); - prepare_workingset_protection(pgdat, sc); - shrink_node_memcgs(pgdat, sc); flush_reclaim_state(sc); @@ -6813,9 +6660,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) */ if (reclaimable) pgdat->kswapd_failures = 0; - - if (sc->clean_below_min && pgdat->kswapd_failures && !sc->priority) - invoke_oom(sc); } /*