Skip to content

Commit

Permalink
mm: start tracking VMAs with maple tree
Browse files Browse the repository at this point in the history
Start tracking the VMAs with the new maple tree structure in parallel with
the rb_tree.  Add debug and trace events for maple tree operations and
duplicate the rb_tree that is created on forks into the maple tree.

The maple tree is added to the mm_struct including the mm_init struct,
added support in required mm/mmap functions, added tracking in kernel/fork
for process forking, and used to find the unmapped_area and checked
against what the rbtree finds.

This also moves the mmap_lock() in exit_mmap() since the oom reaper call
does walk the VMAs.  Otherwise lockdep will be unhappy if oom happens.

When splitting a vma fails due to allocations of the maple tree nodes,
the error path in __split_vma() calls new->vm_ops->close(new).  The page
accounting for hugetlb is actually in the close() operation,  so it
accounts for the removal of 1/2 of the VMA which was not adjusted.  This
results in a negative exit value.  To avoid the negative charge, set
vm_start = vm_end and vm_pgoff = 0.

There is also a potential accounting issue in special mappings from
insert_vm_struct() failing to allocate, so reverse the charge there in
the failure scenario.

Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
  • Loading branch information
howlett authored and xanmod committed Oct 3, 2022
1 parent ebce1b5 commit 9ad235c
Show file tree
Hide file tree
Showing 9 changed files with 435 additions and 36 deletions.
1 change: 1 addition & 0 deletions arch/x86/kernel/tboot.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ void __init tboot_probe(void)
static pgd_t *tboot_pg_dir;
static struct mm_struct tboot_mm = {
.mm_rb = RB_ROOT,
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock),
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
Expand Down
1 change: 1 addition & 0 deletions drivers/firmware/efi/efi.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR;

struct mm_struct efi_mm = {
.mm_rb = RB_ROOT,
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock),
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
.write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq),
Expand Down
5 changes: 5 additions & 0 deletions include/linux/mm.h
Original file line number Diff line number Diff line change
Expand Up @@ -2543,6 +2543,8 @@ extern bool arch_has_descending_max_zone_pfns(void);
/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
/* mmap.c */
void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas);

/* interval_tree.c */
void vma_interval_tree_insert(struct vm_area_struct *node,
Expand Down Expand Up @@ -2606,6 +2608,9 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);

void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas);
void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas);

static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
unsigned long start,
Expand Down
3 changes: 3 additions & 0 deletions include/linux/mm_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
Expand Down Expand Up @@ -486,6 +487,7 @@ struct kioctx_table;
struct mm_struct {
struct {
struct vm_area_struct *mmap; /* list of VMAs */
struct maple_tree mm_mt;
struct rb_root mm_rb;
u64 vmacache_seqnum; /* per-thread vmacache */
#ifdef CONFIG_MMU
Expand Down Expand Up @@ -697,6 +699,7 @@ struct mm_struct {
unsigned long cpu_bitmap[];
};

#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
extern struct mm_struct init_mm;

/* Pointer magic because the dynamic array size confuses some compilers. */
Expand Down
73 changes: 73 additions & 0 deletions include/trace/events/mmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,79 @@ TRACE_EVENT(vm_unmapped_area,
__entry->low_limit, __entry->high_limit, __entry->align_mask,
__entry->align_offset)
);

TRACE_EVENT(vma_mas_szero,
TP_PROTO(struct maple_tree *mt, unsigned long start,
unsigned long end),

TP_ARGS(mt, start, end),

TP_STRUCT__entry(
__field(struct maple_tree *, mt)
__field(unsigned long, start)
__field(unsigned long, end)
),

TP_fast_assign(
__entry->mt = mt;
__entry->start = start;
__entry->end = end;
),

TP_printk("mt_mod %p, (NULL), SNULL, %lu, %lu,",
__entry->mt,
(unsigned long) __entry->start,
(unsigned long) __entry->end
)
);

TRACE_EVENT(vma_store,
TP_PROTO(struct maple_tree *mt, struct vm_area_struct *vma),

TP_ARGS(mt, vma),

TP_STRUCT__entry(
__field(struct maple_tree *, mt)
__field(struct vm_area_struct *, vma)
__field(unsigned long, vm_start)
__field(unsigned long, vm_end)
),

TP_fast_assign(
__entry->mt = mt;
__entry->vma = vma;
__entry->vm_start = vma->vm_start;
__entry->vm_end = vma->vm_end - 1;
),

TP_printk("mt_mod %p, (%p), STORE, %lu, %lu,",
__entry->mt, __entry->vma,
(unsigned long) __entry->vm_start,
(unsigned long) __entry->vm_end
)
);


TRACE_EVENT(exit_mmap,
TP_PROTO(struct mm_struct *mm),

TP_ARGS(mm),

TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(struct maple_tree *, mt)
),

TP_fast_assign(
__entry->mm = mm;
__entry->mt = &mm->mm_mt;
),

TP_printk("mt_mod %p, DESTROY\n",
__entry->mt
)
);

#endif

/* This part must be outside protection */
Expand Down
20 changes: 17 additions & 3 deletions kernel/fork.c
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
int retval;
unsigned long charge;
LIST_HEAD(uf);
MA_STATE(mas, &mm->mm_mt, 0, 0);

uprobe_start_dup_mmap();
if (mmap_write_lock_killable(oldmm)) {
Expand Down Expand Up @@ -619,6 +620,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
goto out;
khugepaged_fork(mm, oldmm);

retval = mas_expected_entries(&mas, oldmm->map_count);
if (retval)
goto out;

prev = NULL;
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
struct file *file;
Expand All @@ -634,7 +639,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
*/
if (fatal_signal_pending(current)) {
retval = -EINTR;
goto out;
goto loop_out;
}
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned long len = vma_pages(mpnt);
Expand Down Expand Up @@ -699,6 +704,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
rb_link = &tmp->vm_rb.rb_right;
rb_parent = &tmp->vm_rb;

/* Link the vma into the MT */
mas.index = tmp->vm_start;
mas.last = tmp->vm_end - 1;
mas_store(&mas, tmp);

mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
retval = copy_page_range(tmp, mpnt);
Expand All @@ -707,10 +717,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->vm_ops->open(tmp);

if (retval)
goto out;
goto loop_out;
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
loop_out:
mas_destroy(&mas);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
Expand All @@ -726,7 +738,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
fail_nomem:
retval = -ENOMEM;
vm_unacct_memory(charge);
goto out;
goto loop_out;
}

static inline int mm_alloc_pgd(struct mm_struct *mm)
Expand Down Expand Up @@ -1116,6 +1128,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
{
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
mm->vmacache_seqnum = 0;
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
Expand Down
2 changes: 2 additions & 0 deletions mm/init-mm.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm_types.h>
#include <linux/rbtree.h>
#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/list.h>
Expand Down Expand Up @@ -29,6 +30,7 @@
*/
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock),
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
Expand Down

0 comments on commit 9ad235c

Please sign in to comment.