-
Notifications
You must be signed in to change notification settings - Fork 102
/
vmscan.c
7496 lines (6214 loc) · 206 KB
/
vmscan.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Swap reorganised 29.12.95, Stephen Tweedie.
* kswapd added: 7.1.96 sct
* Removed kswapd_ctl limits, and swap out as many pages as needed
* to bring the system back to freepages.high: 2.4.97, Rik van Riel.
* Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
* Multiqueue VM started 5.8.00, Rik van Riel.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h> /* for buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include <linux/swapops.h>
#include <linux/balloon_compaction.h>
#include <linux/sched/sysctl.h>
#include "internal.h"
#include "swap.h"
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
/*
* Parameters and running state for a single reclaim invocation: the reclaim
* goal, the nodes/cgroup being targeted, which kinds of folios may be
* touched, and counters that are updated as the scan proceeds.
*/
struct scan_control {
/* How many pages shrink_list() should reclaim */
unsigned long nr_to_reclaim;
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
* are scanned.
*/
nodemask_t *nodemask;
/*
* The memory cgroup that hit its limit and as a result is the
* primary target of this reclaim invocation.
*/
struct mem_cgroup *target_mem_cgroup;
/*
* Scan pressure balancing between anon and file LRUs
*/
unsigned long anon_cost;
unsigned long file_cost;
/*
* Can active folios be deactivated as part of reclaim?
* may_deactivate is a two-bit mask built from the DEACTIVATE_* values.
*/
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
unsigned int may_deactivate:2;
/*
* NOTE(review): the users of the next two flags are outside this chunk;
* presumably they request a retry with deactivation forced after a pass
* that skipped it -- confirm against the callers.
*/
unsigned int force_deactivate:1;
unsigned int skipped_deactivate:1;
/* Writepage batching in laptop mode; RECLAIM_WRITE */
unsigned int may_writepage:1;
/* Can mapped folios be reclaimed? */
unsigned int may_unmap:1;
/* Can folios be swapped as part of reclaim? */
unsigned int may_swap:1;
/* Proactive reclaim invoked by userspace through memory.reclaim */
unsigned int proactive:1;
/*
* Cgroup memory below memory.low is protected as long as we
* don't threaten to OOM. If any cgroup is reclaimed at
* reduced force or passed over entirely due to its memory.low
* setting (memcg_low_skipped), and nothing is reclaimed as a
* result, then go back for one more cycle that reclaims the protected
* memory (memcg_low_reclaim) to avert OOM.
*/
unsigned int memcg_low_reclaim:1;
unsigned int memcg_low_skipped:1;
/* NOTE(review): presumably set when reclaiming on behalf of hibernation; setter not visible here */
unsigned int hibernation_mode:1;
/* One of the zones is ready for compaction */
unsigned int compaction_ready:1;
/* There is easily reclaimable cold cache in the current node */
unsigned int cache_trim_mode:1;
/* The file folios on the current node are dangerously low */
unsigned int file_is_tiny:1;
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
/* Allocation order */
s8 order;
/* Scan (total_size >> priority) pages at once */
s8 priority;
/* The highest zone to isolate folios for reclaim from */
s8 reclaim_idx;
/* This context's GFP mask */
gfp_t gfp_mask;
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
/* Number of pages freed so far during a call to shrink_zones() */
unsigned long nr_reclaimed;
/*
* Per-category folio counters. NOTE(review): the code that fills these
* in is outside this chunk; the meanings are implied by the field names
* only.
*/
struct {
unsigned int dirty;
unsigned int unqueued_dirty;
unsigned int congested;
unsigned int writeback;
unsigned int immediate;
unsigned int file_taken;
unsigned int taken;
} nr;
/* for recording the reclaimed slab by now */
struct reclaim_state reclaim_state;
};
/*
 * prefetchw_prev_lru_folio - prefetch a field of a neighbouring LRU folio
 * @_folio: folio whose list neighbour should be prefetched for write
 * @_base:  head of the list @_folio is on
 * @_field: member of struct folio to prefetch
 *
 * Nothing is prefetched when @_folio->lru.prev is @_base itself, i.e. when
 * there is no neighbouring folio in that direction.  On architectures that
 * do not define ARCH_HAS_PREFETCHW the macro expands to an empty statement.
 *
 * Every use of @_folio and @_base is parenthesized so that callers may pass
 * arbitrary expressions (e.g. a dereference or a conditional) without the
 * expansion re-associating operators.
 */
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_folio(_folio, _base, _field)			\
	do {								\
		if ((_folio)->lru.prev != (_base)) {			\
			struct folio *prev;				\
									\
			prev = lru_to_folio(&((_folio)->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
#endif
/*
* Global swappiness setting, from 0 .. 200. Higher means more swappy.
* The built-in default is 60.
*/
int vm_swappiness = 60;
#ifdef CONFIG_MEMCG
/* Returns true for reclaim through cgroup limits or cgroup interfaces. */
static bool cgroup_reclaim(struct scan_control *sc)
{
return sc->target_mem_cgroup;
}
/*
* Returns true for reclaim on the root cgroup. This is true for direct
* allocator reclaim and reclaim through cgroup interfaces on the root cgroup.
*/
static bool root_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
}
/**
 * writeback_throttling_sane - can reclaim rely on regular dirty throttling?
 * @sc: scan_control describing the reclaim pass
 *
 * balance_dirty_pages() does not work with the legacy memcg, so reclaim
 * falls back to direct stalling in shrink_folio_list() there.  That fallback
 * has none of the fairness, adaptive pausing, bandwidth-proportional
 * allocation or configurability of the normal mechanism.
 *
 * Return: true if the reclaim pass in progress may assume that the normal
 * dirty throttling mechanism is operational, false otherwise.
 */
static bool writeback_throttling_sane(struct scan_control *sc)
{
	if (cgroup_reclaim(sc)) {
#ifdef CONFIG_CGROUP_WRITEBACK
		/* cgroup writeback on the default hierarchy throttles properly */
		if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
			return true;
#endif
		return false;
	}

	/* Not cgroup reclaim: the usual throttling applies */
	return true;
}
#else
/*
* Variant built when CONFIG_MEMCG is not defined: reclaim is never reported
* as cgroup reclaim, so this always returns false.
*/
static bool cgroup_reclaim(struct scan_control *sc)
{
return false;
}
/*
* Variant built when CONFIG_MEMCG is not defined: every reclaim pass is
* treated as root reclaim, so this always returns true.
*/
static bool root_reclaim(struct scan_control *sc)
{
return true;
}
/*
* Variant built when CONFIG_MEMCG is not defined: the normal dirty
* throttling mechanism is always assumed to be usable, so this always
* returns true.
*/
static bool writeback_throttling_sane(struct scan_control *sc)
{
return true;
}
#endif
/*
 * Install (@rs != NULL) or clear (@rs == NULL) the reclaim_state pointer of
 * @task.  Warns once if an installed state would be overwritten, and once if
 * an already-cleared pointer is cleared again.
 */
static void set_task_reclaim_state(struct task_struct *task,
				   struct reclaim_state *rs)
{
	struct reclaim_state *installed = task->reclaim_state;

	/* Installing over an existing state */
	WARN_ON_ONCE(rs && installed);

	/* Clearing when nothing is installed */
	WARN_ON_ONCE(!rs && !installed);

	task->reclaim_state = rs;
}
/*
* flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
* scan_control->nr_reclaimed.
*/
static void flush_reclaim_state(struct scan_control *sc)
{
/*
* Currently, reclaim_state->reclaimed includes three types of pages
* freed outside of vmscan:
* (1) Slab pages.
* (2) Clean file pages from pruned inodes (on highmem systems).
* (3) XFS freed buffer pages.
*
* For all of these cases, we cannot universally link the pages to a
* single memcg. For example, a memcg-aware shrinker can free one object
* charged to the target memcg, causing an entire page to be freed.
* If we count the entire page as reclaimed from the memcg, we end up
* overestimating the reclaimed amount (potentially under-reclaiming).
*
* Only count such pages for global reclaim to prevent under-reclaiming
* from the target memcg; preventing unnecessary retries during memcg
* charging and false positives from proactive reclaim.
*
* For uncommon cases where the freed pages were actually mostly
* charged to the target memcg, we end up underestimating the reclaimed
* amount. This should be fine. The freed pages will be uncharged
* anyway, even if they are not counted here properly, and we will be
* able to make forward progress in charging (which is usually in a
* retry loop).
*
* We can go one step further, and report the uncharged objcg pages in
* memcg reclaim, to make reporting more accurate and reduce
* underestimation, but it's probably not worth the complexity for now.
*/
if (current->reclaim_state && root_reclaim(sc)) {
sc->nr_reclaimed += current->reclaim_state->reclaimed;
current->reclaim_state->reclaimed = 0;
}
}
/*
 * Can folios on node @nid be demoted to another node?  Requires NUMA
 * demotion to be enabled, the scan (if @sc is given) not to forbid
 * demotion, and @nid to have a next demotion node.
 */
static bool can_demote(int nid, struct scan_control *sc)
{
	if (!numa_demotion_enabled || (sc && sc->no_demotion))
		return false;

	return next_demotion_node(nid) != NUMA_NO_NODE;
}
/*
 * Can anonymous pages be reclaimed at all?  They can if there is swap room
 * (globally when @memcg is NULL, otherwise within @memcg's swap limit) or,
 * failing that, if they can leave node @nid through demotion.
 */
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
					  int nid,
					  struct scan_control *sc)
{
	long swap_room;

	if (memcg)
		/* Is the memcg below its swap limit? */
		swap_room = mem_cgroup_get_nr_swap_pages(memcg);
	else
		/* Non-memcg reclaim: is there space in any swap device? */
		swap_room = get_nr_swap_pages();

	/* No swap available: demotion is the only remaining way out */
	return swap_room > 0 || can_demote(nid, sc);
}
/*
 * Estimate how many pages of @zone reclaim could work on: file LRU pages
 * always, anon LRU pages only when anon memory is reclaimable.
 *
 * Isolated folios are not included because no counter tracks them.  The
 * result only decides whether reclaim or compaction should continue, and
 * isolated folios are not expected to dominate it.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	unsigned long file_pages;
	unsigned long anon_pages = 0;

	file_pages = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
		     zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);

	if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) {
		anon_pages = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
			     zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
	}

	return file_pages + anon_pages;
}
/**
* lruvec_lru_size - Returns the number of pages on the given LRU list.
* @lruvec: lru vector
* @lru: lru to use
* @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
*/
static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
int zone_idx)
{
unsigned long size = 0;
int zid;
for (zid = 0; zid <= zone_idx; zid++) {
struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
if (!managed_zone(zone))
continue;
if (!mem_cgroup_disabled())
size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
else
size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
}
return size;
}
/*
 * Run the slab shrinkers for node @nid once for every memory cgroup
 * produced by mem_cgroup_iter() and return the total that shrink_slab()
 * reported as freed.  shrink_slab() is invoked at least once, even if the
 * iterator yields NULL straight away.
 */
static unsigned long drop_slab_node(int nid)
{
	struct mem_cgroup *iter = mem_cgroup_iter(NULL, NULL, NULL);
	unsigned long total = 0;

	for (;;) {
		total += shrink_slab(GFP_KERNEL, nid, iter, 0);

		iter = mem_cgroup_iter(NULL, iter, NULL);
		if (!iter)
			break;
	}

	return total;
}
/*
 * Shrink slab caches on every online node, repeating while a pass still
 * frees a meaningful amount.  The bar for "meaningful" doubles with each
 * pass (freed >> pass must exceed 1), which bounds the number of passes.
 * Gives up immediately if the current task has a fatal signal pending.
 */
void drop_slab(void)
{
	int pass;

	for (pass = 0; ; pass++) {
		unsigned long freed = 0;
		int nid;

		for_each_online_node(nid) {
			if (fatal_signal_pending(current))
				return;

			freed += drop_slab_node(nid);
		}

		if ((freed >> pass) <= 1)
			break;
	}
}
/*
 * Offset to add to a *_KSWAPD vm event item to obtain the item for the
 * current reclaimer (kswapd, khugepaged or direct reclaim).  The build-time
 * checks guarantee that the PGSTEAL, PGDEMOTE and PGSCAN families are laid
 * out identically, so one offset serves all three.
 */
static int reclaimer_offset(void)
{
	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
			PGSCAN_DIRECT - PGSCAN_KSWAPD);
	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
			PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
			PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);

	if (current_is_kswapd())
		return 0;

	return current_is_khugepaged() ?
	       PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD :
	       PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
/*
 * A page cache folio is freeable when nobody but the expected holders has a
 * reference: the caller that isolated it, the page cache itself, and,
 * optionally, filesystem private data at folio->private.
 */
static inline int is_page_cache_freeable(struct folio *folio)
{
	/* References left once the optional private-data one is discounted */
	int held_refs = folio_ref_count(folio) - folio_test_private(folio);
	/* One for the isolating caller plus folio_nr_pages() for the cache */
	long expected_refs = 1 + folio_nr_pages(folio);

	return held_refs == expected_refs;
}
/*
* We detected a synchronous write error writing a folio out. Probably
* -ENOSPC. We need to propagate that into the address_space for a subsequent
* fsync(), msync() or close().
*
* The tricky part is that after writepage we cannot touch the mapping: nothing
* prevents it from being freed up. But we have a ref on the folio and once
* that folio is locked, the mapping is pinned.
*
* We're allowed to run sleeping folio_lock() here because we know the caller has
* __GFP_FS.
*/
static void handle_write_error(struct address_space *mapping,
struct folio *folio, int error)
{
folio_lock(folio);
if (folio_mapping(folio) == mapping)
mapping_set_error(mapping, error);
folio_unlock(folio);
}
/*
 * Decide whether a "no progress" throttle on node @pgdat should be skipped.
 * Returns true when kswapd has given up on the node, or when write-pending
 * pages make up no more than half of the reclaimable pages.
 */
static bool skip_throttle_noprogress(pg_data_t *pgdat)
{
	int nr_reclaimable = 0, nr_write_pending = 0;
	int zid;

	/*
	 * kswapd is effectively disabled, so the system is likely close to
	 * OOM: let the caller reschedule if needed, but do not throttle.
	 */
	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
		return true;

	/*
	 * With lots of dirty/writeback folios around, throttling happens
	 * anyway once those folios reach the end of the LRU while still
	 * under writeback, so it need not be done here.
	 */
	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (managed_zone(zone)) {
			nr_reclaimable += zone_reclaimable_pages(zone);
			nr_write_pending += zone_page_state_snapshot(zone,
						NR_ZONE_WRITE_PENDING);
		}
	}

	return 2 * nr_write_pending <= nr_reclaimable;
}
/*
* reclaim_throttle - briefly put the current task to sleep while reclaim stalls
* @pgdat: node whose per-reason wait queue is slept on
* @reason: why reclaim is being throttled; selects the wait queue and timeout
*
* Sleeps uninterruptibly on pgdat->reclaim_wait[reason] for at most a
* reason-specific number of jiffies. User workers and kthreads other than
* kswapd are never put to sleep here; they only get a cond_resched().
* A tracepoint records the requested timeout and the time spent asleep.
*/
void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
{
wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
long timeout, ret;
DEFINE_WAIT(wait);
/*
* Do not throttle user workers, kthreads other than kswapd or
* workqueues. They may be required for reclaim to make
* forward progress (e.g. journalling workqueues or kthreads).
*/
if (!current_is_kswapd() &&
current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
cond_resched();
return;
}
/*
* These figures are pulled out of thin air.
* VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
* parallel reclaimers which is a short-lived event so the timeout is
* short. Failing to make progress or waiting on writeback are
* potentially long-lived events so use a longer timeout. This is shaky
* logic as a failure to make progress could be due to anything from
* writeback to a slow device to excessive referenced folios at the tail
* of the inactive LRU.
*/
switch(reason) {
case VMSCAN_THROTTLE_WRITEBACK:
timeout = HZ/10;
/*
* The first task to become writeback-throttled records the current
* NR_THROTTLED_WRITTEN count as the baseline; __acct_reclaim_writeback()
* measures writeback progress relative to it.
*/
if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
WRITE_ONCE(pgdat->nr_reclaim_start,
node_page_state(pgdat, NR_THROTTLED_WRITTEN));
}
break;
case VMSCAN_THROTTLE_CONGESTED:
fallthrough;
case VMSCAN_THROTTLE_NOPROGRESS:
/* Congested and no-progress share the same skip check and 1-jiffy timeout */
if (skip_throttle_noprogress(pgdat)) {
cond_resched();
return;
}
timeout = 1;
break;
case VMSCAN_THROTTLE_ISOLATED:
timeout = HZ/50;
break;
default:
/* Unknown reason: warn once and fall back to a one second timeout */
WARN_ON_ONCE(1);
timeout = HZ;
break;
}
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = schedule_timeout(timeout);
finish_wait(wqh, &wait);
/* Undo the throttled-task accounting done before sleeping */
if (reason == VMSCAN_THROTTLE_WRITEBACK)
atomic_dec(&pgdat->nr_writeback_throttled);
/*
* ret is schedule_timeout()'s return value (the jiffies left when the
* task woke up), so timeout - ret is the time actually slept.
*/
trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
jiffies_to_usecs(timeout - ret),
reason);
}
/*
 * Account a folio as written while tasks are throttled waiting for dirty
 * folios to be cleaned, and wake the throttled tasks once enough has been
 * written since throttling began (more than SWAP_CLUSTER_MAX pages per
 * throttled task).
 */
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
			      int nr_throttled)
{
	unsigned long written_now, written_at_start, progress;

	node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);

	/*
	 * The read below is inexact because per-cpu deltas may not have been
	 * folded in yet.  Paying for an exact count is not worthwhile while
	 * the system is writeback throttled; in the worst case the throttle
	 * timeout still guarantees forward progress.
	 */
	written_now = node_page_state(pgdat, NR_THROTTLED_WRITTEN);
	written_at_start = READ_ONCE(pgdat->nr_reclaim_start);
	progress = written_now - written_at_start;

	if (progress > SWAP_CLUSTER_MAX * nr_throttled)
		wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
}
/*
* Possible outcomes of pageout(). Each value also tells the caller whether
* the folio is still locked when pageout() returns.
*/
typedef enum {
/* failed to write folio out, folio is locked */
PAGE_KEEP,
/* move folio to the active list, folio is locked */
PAGE_ACTIVATE,
/* folio has been sent to the disk successfully, folio is unlocked */
PAGE_SUCCESS,
/* folio is clean and locked */
PAGE_CLEAN,
} pageout_t;
/*
* pageout is called by shrink_folio_list() for each dirty folio.
* Calls ->writepage().
*
* @folio: the dirty folio to write out
* @mapping: the address_space the folio belongs to, or NULL
* @plug: handed to ->writepage() through writeback_control.swap_plug
*
* Returns one of the pageout_t values:
* PAGE_KEEP - the folio has unexpected references, or it has no mapping
* and its buffers could not be freed
* PAGE_ACTIVATE - the mapping has no ->writepage, or ->writepage returned
* AOP_WRITEPAGE_ACTIVATE
* PAGE_SUCCESS - ->writepage was called and did not ask for activation
* PAGE_CLEAN - the folio turned out not to need writing
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
struct swap_iocb **plug)
{
/*
* If the folio is dirty, only perform writeback if that write
* will be non-blocking. To prevent this allocation from being
* stalled by pagecache activity. But note that there may be
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
* If this process is currently in __generic_file_write_iter() against
* this folio's queue, we can perform writeback even if that
* will block.
*
* If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
*/
/* Somebody else holds a reference: leave the folio alone */
if (!is_page_cache_freeable(folio))
return PAGE_KEEP;
if (!mapping) {
/*
* Some data journaling orphaned folios can have
* folio->mapping == NULL while being dirty with clean buffers.
*/
if (folio_test_private(folio)) {
if (try_to_free_buffers(folio)) {
folio_clear_dirty(folio);
pr_info("%s: orphaned folio\n", __func__);
return PAGE_CLEAN;
}
}
return PAGE_KEEP;
}
/* Nothing can be written from here without a ->writepage method */
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
/* Only write if the dirty state could be cleared for I/O */
if (folio_clear_dirty_for_io(folio)) {
int res;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = SWAP_CLUSTER_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
.for_reclaim = 1,
.swap_plug = plug,
};
/* Mark the folio as being written for reclaim before submitting it */
folio_set_reclaim(folio);
res = mapping->a_ops->writepage(&folio->page, &wbc);
if (res < 0)
handle_write_error(mapping, folio, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
folio_clear_reclaim(folio);
return PAGE_ACTIVATE;
}
if (!folio_test_writeback(folio)) {
/* synchronous write or broken a_ops? */
folio_clear_reclaim(folio);
}
trace_mm_vmscan_write_folio(folio);
node_stat_add_folio(folio, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
return PAGE_CLEAN;
}
/*
* Same as remove_mapping, but if the folio is removed from the mapping, it
* gets returned with a refcount of 0.
*
* @mapping: the address_space the folio must currently belong to
* @folio: the locked folio to remove
* @reclaimed: when true, a workingset shadow entry may be left behind in
* place of the folio (see the conditions at each call below)
* @target_memcg: passed through to workingset_eviction()
*
* Returns 1 if the folio was removed (its refcount is left frozen at 0),
* or 0 if it had unexpected references or was dirty and was left in place.
*
* Locking: for non-swapcache folios the host inode's i_lock is taken first,
* then the i_pages xarray lock with interrupts disabled. Both are released
* on every return path.
*/
static int __remove_mapping(struct address_space *mapping, struct folio *folio,
bool reclaimed, struct mem_cgroup *target_memcg)
{
int refcount;
void *shadow = NULL;
BUG_ON(!folio_test_locked(folio));
BUG_ON(mapping != folio_mapping(folio));
if (!folio_test_swapcache(folio))
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
/*
* The non racy check for a busy folio.
*
* Must be careful with the order of the tests. When someone has
* a ref to the folio, it may be possible that they dirty it then
* drop the reference. So if the dirty flag is tested before the
* refcount here, then the following race may occur:
*
* get_user_pages(&page);
* [user mapping goes away]
* write_to(page);
* !folio_test_dirty(folio) [good]
* folio_set_dirty(folio);
* folio_put(folio);
* !refcount(folio) [good, discard it]
*
* [oops, our write_to data is lost]
*
* Reversing the order of the tests ensures such a situation cannot
* escape unnoticed. The smp_rmb is needed to ensure the folio->flags
* load is not satisfied before that of folio->_refcount.
*
* Note that if the dirty flag is always set via folio_mark_dirty,
* and thus under the i_pages lock, then this ordering is not required.
*/
/*
* Expected count: 1 plus folio_nr_pages(), the same figure
* is_page_cache_freeable() uses for "caller plus page cache".
*/
refcount = 1 + folio_nr_pages(folio);
if (!folio_ref_freeze(folio, refcount))
goto cannot_free;
/* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */
if (unlikely(folio_test_dirty(folio))) {
folio_ref_unfreeze(folio, refcount);
goto cannot_free;
}
if (folio_test_swapcache(folio)) {
swp_entry_t swap = folio->swap;
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
__delete_from_swap_cache(folio, swap, shadow);
mem_cgroup_swapout(folio, swap);
xa_unlock_irq(&mapping->i_pages);
put_swap_folio(folio, swap);
} else {
void (*free_folio)(struct folio *);
/* Fetch the callback now; it is invoked only after both locks are dropped */
free_folio = mapping->a_ops->free_folio;
/*
* Remember a shadow entry for reclaimed file cache in
* order to detect refaults, thus thrashing, later on.
*
* But don't store shadows in an address space that is
* already exiting. This is not just an optimization,
* inode reclaim needs to empty out the radix tree or
* the nodes are lost. Don't plant shadows behind its
* back.
*
* We also don't store shadows for DAX mappings because the
* only page cache folios found in these are zero pages
* covering holes, and because we don't want to mix DAX
* exceptional entries and shadow exceptional entries in the
* same address_space.
*/
if (reclaimed && folio_is_file_lru(folio) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(folio, target_memcg);
__filemap_remove_folio(folio, shadow);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
if (free_folio)
free_folio(folio);
}
return 1;
cannot_free:
/* Release the locks in the reverse order of acquisition */
xa_unlock_irq(&mapping->i_pages);
if (!folio_test_swapcache(folio))
spin_unlock(&mapping->host->i_lock);
return 0;
}
/**
 * remove_mapping() - Try to detach a folio from its mapping.
 * @mapping: The address space.
 * @folio: The folio to remove.
 *
 * Removal fails if the folio is dirty, under writeback, or referenced by
 * anyone besides the caller.
 *
 * Return: The number of pages removed from the mapping, or 0 if the folio
 * could not be removed.
 * Context: The caller should hold exactly one reference on the folio and
 * hold its lock.
 */
long remove_mapping(struct address_space *mapping, struct folio *folio)
{
	if (!__remove_mapping(mapping, folio, false, NULL))
		return 0;

	/*
	 * __remove_mapping() left the refcount frozen.  Unfreezing it to 1
	 * drops the page cache reference in the same step, so no additional
	 * atomic operation is needed.
	 */
	folio_ref_unfreeze(folio, 1);

	return folio_nr_pages(folio);
}
/**
* folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
* @folio: Folio to be returned to an LRU list.
*
* Add previously isolated @folio to appropriate LRU list, then drop the
* reference that was taken when the folio was isolated.
* The folio may still be unevictable for other reasons.
*
* Context: lru_lock must not be held, interrupts must be enabled.
*/
void folio_putback_lru(struct folio *folio)
{
/* Re-add first, while the isolation reference is still held */
folio_add_lru(folio);
folio_put(folio); /* drop ref from isolate */
}
/* Verdicts returned by folio_check_references() */
enum folio_references {
/* No page table references were found: reclaim the folio */
FOLIOREF_RECLAIM,
/*
* No page table references, but a file LRU folio that had its referenced
* flag set: reclaim if clean, defer dirty folios to writeback
*/
FOLIOREF_RECLAIM_CLEAN,
/* Keep the folio where it is for another trip around the list */
FOLIOREF_KEEP,
/* Activate the folio (also used for folios found in a VM_LOCKED vma) */
FOLIOREF_ACTIVATE,
};
/*
 * Decide what reclaim should do with @folio based on how it has been
 * referenced: through page tables (folio_referenced()) and through the
 * folio's own referenced flag, which is cleared as a side effect.
 */
static enum folio_references folio_check_references(struct folio *folio,
						     struct scan_control *sc)
{
	unsigned long vm_flags;
	int pte_refs, had_ref_flag;

	pte_refs = folio_referenced(folio, 1, sc->target_mem_cgroup,
				    &vm_flags);
	had_ref_flag = folio_test_clear_referenced(folio);

	/*
	 * The folio we hoped to reclaim sits in a VM_LOCKED vma.  It is now
	 * marked Mlocked; let it move to the unevictable list.
	 */
	if (vm_flags & VM_LOCKED)
		return FOLIOREF_ACTIVATE;

	/* rmap lock contention: rotate */
	if (pte_refs == -1)
		return FOLIOREF_KEEP;

	if (!pte_refs) {
		/* Reclaim if clean, defer dirty folios to writeback */
		if (had_ref_flag && folio_is_file_lru(folio))
			return FOLIOREF_RECLAIM_CLEAN;

		return FOLIOREF_RECLAIM;
	}

	/*
	 * Every mapped folio begins life with a page table reference from
	 * the fault that instantiated it, so a single reference does not
	 * prove the folio is used more than once: a mapped file/anon folio
	 * has to be looked at twice.
	 *
	 * Mark it and let it go around the inactive list once more; another
	 * page table reference will then get it activated.
	 *
	 * Note: the mark is also set on folios that end up activated, so
	 * that recently deactivated but still used folios are recovered
	 * quickly.
	 */
	folio_set_referenced(folio);

	if (had_ref_flag || pte_refs > 1)
		return FOLIOREF_ACTIVATE;

	/* File-backed executable folios are activated after first usage */
	if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
		return FOLIOREF_ACTIVATE;

	return FOLIOREF_KEEP;
}
/*
 * Report through @dirty and @writeback whether @folio is dirty or under
 * writeback, as far as stalling reclaim is concerned.
 */
static void folio_check_dirty_writeback(struct folio *folio,
					bool *dirty, bool *writeback)
{
	struct address_space *mapping;
	bool flusher_managed;

	/*
	 * Flushers do not handle anonymous folios; those have to be written
	 * from reclaim context, so reclaim must not stall on them.
	 * MADV_FREE anonymous folios live on the inactive file list as well
	 * and could be mistaken for file LRU folios, hence the extra anon
	 * test.
	 */
	flusher_managed = folio_is_file_lru(folio) &&
			  !(folio_test_anon(folio) &&
			    !folio_test_swapbacked(folio));
	if (!flusher_managed) {
		*dirty = false;
		*writeback = false;
		return;
	}

	/* Start from the folio flags, assuming they are accurate */
	*dirty = folio_test_dirty(folio);
	*writeback = folio_test_writeback(folio);

	/* Let the filesystem refine the answer if it is able to */
	if (folio_test_private(folio)) {
		mapping = folio_mapping(folio);
		if (mapping && mapping->a_ops->is_dirty_writeback)
			mapping->a_ops->is_dirty_writeback(folio, dirty,
							   writeback);
	}
}
/*
 * Allocation callback used for demotion.  @private carries a pointer to a
 * struct migration_target_control, which is modified in place: the first
 * attempt is restricted to the control's own node, and only if that fails
 * is the original node mask restored for a second attempt.
 */
static struct folio *alloc_demote_folio(struct folio *src,
					unsigned long private)
{
	struct migration_target_control *mtc =
		(struct migration_target_control *)private;
	nodemask_t *fallback_nodes = mtc->nmask;
	struct folio *dst;

	/*
	 * Try the target node first, letting kswapd demote or reclaim pages
	 * there if it is short on free memory.  Otherwise, whenever a slower
	 * (lower) memory tier has free memory, allocations would spill into
	 * it without ever forcing cold pages out of the target tier, and the
	 * kernel could end up placing hot pages in slower (lower) tiers.
	 */
	mtc->nmask = NULL;
	mtc->gfp_mask |= __GFP_THISNODE;
	dst = alloc_migration_target(src, (unsigned long)mtc);

	if (!dst) {
		/* Target node failed: widen to every allowed node */
		mtc->gfp_mask &= ~__GFP_THISNODE;
		mtc->nmask = fallback_nodes;
		dst = alloc_migration_target(src, (unsigned long)mtc);
	}

	return dst;
}
/*
 * Try to demote the folios on @demote_folios from @pgdat to another node.
 * Whatever could not be demoted stays on @demote_folios.
 *
 * Returns the number of successful demotions as reported by
 * migrate_pages(), or 0 when the list is empty or there is no demotion
 * target.
 */
static unsigned int demote_folio_list(struct list_head *demote_folios,
				      struct pglist_data *pgdat)
{
	int target_nid = next_demotion_node(pgdat->node_id);
	unsigned int nr_demoted;
	nodemask_t allowed_mask;

	struct migration_target_control mtc = {
		/*
		 * Allocate from 'node', or fail quickly and quietly.
		 * On failure the 'page' will most likely be discarded
		 * rather than migrated.
		 */
		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
			__GFP_NOMEMALLOC | GFP_NOWAIT,
		.nid = target_nid,
		.nmask = &allowed_mask
	};

	if (list_empty(demote_folios) || target_nid == NUMA_NO_NODE)
		return 0;

	node_get_allowed_targets(pgdat, &allowed_mask);

	/* Demotion ignores all cpuset and mempolicy settings */
	migrate_pages(demote_folios, alloc_demote_folio, NULL,
		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
		      &nr_demoted);

	__count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_demoted);

	return nr_demoted;
}
/*
 * May reclaim call into the filesystem for @folio under @gfp_mask?
 * Always with __GFP_FS; otherwise only for swap cache folios with __GFP_IO
 * whose swap device does not use SWP_FS_OPS.
 */
static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
{
	bool swap_io_allowed;

	if (gfp_mask & __GFP_FS)
		return true;

	swap_io_allowed = folio_test_swapcache(folio) && (gfp_mask & __GFP_IO);
	if (!swap_io_allowed)
		return false;

	/*
	 * Swap cache can "enter_fs" with only __GFP_IO as long as the swap
	 * device is not SWP_FS_OPS.  ->flags may be updated non-atomically
	 * (scan_swap_map_slots), but such updates never touch SWP_FS_OPS,
	 * so the data_race is safe.
	 */
	return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
}