migration: Map hugetlbfs ramblocks twice, and pre-allocate

Add a RAMBlock.host_mirror for all hugetlbfs-backed guest memory. It is used to map the same region a second time, and that second mapping is used to service page faults with UFFDIO_CONTINUE. To make sure every access to these ranges generates a minor page fault rather than a missing page fault, we need to pre-allocate the files so that the page cache exists from the very beginning. Signed-off-by: Peter Xu <peterx@redhat.com>
xzpeter · Jan 12, 2023 · 6205062 · 6205062
1 parent ec84087
commit 6205062
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 0 deletions.
diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
@@ -28,6 +28,13 @@ struct RAMBlock {
     struct rcu_head rcu;
     struct MemoryRegion *mr;
     uint8_t *host;
+    /*
+     * This is only used for hugetlbfs ramblocks where doublemap is
+     * enabled.  The pointer is managed by dest host migration code, and
+     * should be NULL when migration is finished.  On src host, it should
+     * always be NULL.
+     */
+    uint8_t *host_mirror;
     uint8_t *colo_cache; /* For colo, VM's ram cache */
     ram_addr_t offset;
     ram_addr_t used_length;

diff --git a/migration/ram.c b/migration/ram.c
@@ -3879,6 +3879,57 @@ void colo_release_ram_cache(void)
     ram_state_cleanup(&ram_state);
 }
 
+/*
+ * Set up the mirror ("doublemap") mapping for every hugetlbfs-backed
+ * ramblock on the incoming side of a migration.
+ *
+ * This is a no-op returning 0 when migrate_hugetlb_doublemap() is false.
+ * Otherwise each non-ignored hugetlb ramblock is mapped a second time via
+ * ramblock_file_map(), the new address is stored in rb->host_mirror, and
+ * the whole mirror range is populated.
+ *
+ * Returns 0 on success, or a negative errno value on the first failure.
+ * Mirrors that were already recorded in rb->host_mirror are not unmapped
+ * here; ram_load_cleanup() unmaps any non-NULL host_mirror.
+ */
+static int migrate_hugetlb_doublemap_init(void)
+{
+    RAMBlock *rb;
+    void *addr;
+    int ret;
+
+    if (!migrate_hugetlb_doublemap()) {
+        return 0;
+    }
+
+    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
+        if (qemu_ram_is_hugetlb(rb)) {
+            /*
+             * Firstly, we remap the same ramblock into another range of
+             * virtual addresses, so that we can write to the pages without
+             * touching the page tables that are mapped for the guest.
+             */
+            addr = ramblock_file_map(rb);
+            if (addr == MAP_FAILED) {
+                /* Save errno first: later calls may overwrite it. */
+                ret = -errno;
+                error_report("%s: Duplicate mapping for hugetlb ramblock '%s' "
+                             "failed: %s", __func__, qemu_ram_get_idstr(rb),
+                             strerror(errno));
+                return ret;
+            }
+            rb->host_mirror = addr;
+
+            /*
+             * We need to make sure we pre-allocate the range with
+             * hugetlbfs pages beforehand, so that all the page faults will
+             * always be trapped as MINOR faults, rather than MISSING
+             * faults in userfaultfd.
+             */
+            ret = qemu_madvise(addr, rb->mmap_length, QEMU_MADV_POPULATE_WRITE);
+            if (ret) {
+                /*
+                 * qemu_madvise() follows the madvise(2) convention: it
+                 * returns -1 and leaves the cause in errno, so the return
+                 * value itself must not be decoded with strerror().
+                 */
+                ret = -errno;
+                error_report("Failed to populate hugetlb ramblock '%s': "
+                             "%s", qemu_ram_get_idstr(rb), strerror(-ret));
+                return ret;
+            }
+        }
+    }
+
+    /*
+     * When we reach here, the mirror mapping has been set up for all the
+     * hugetlbfs pages.  Hence when a page fault happens, we'll be able to
+     * resolve it using UFFDIO_CONTINUE for hugetlbfs pages, but we'll keep
+     * using UFFDIO_COPY for anonymous pages.
+     */
+    return 0;
+}
+
 /**
  * ram_load_setup: Setup RAM for migration incoming side
  *
@@ -3893,6 +3944,10 @@ static int ram_load_setup(QEMUFile *f, void *opaque)
         return -1;
     }
 
+    if (migrate_hugetlb_doublemap_init()) {
+        return -1;
+    }
+
     xbzrle_load_setup();
     ramblock_recv_map_init();
 
@@ -3913,6 +3968,10 @@ static int ram_load_cleanup(void *opaque)
     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
         g_free(rb->receivedmap);
         rb->receivedmap = NULL;
+        if (rb->host_mirror) {
+            munmap(rb->host_mirror, rb->mmap_length);
+            rb->host_mirror = NULL;
+        }
     }
 
     return 0;