Skip to content

Commit

Permalink
s390/kvm: support collaborative memory management
Browse files Browse the repository at this point in the history
This patch enables Collaborative Memory Management (CMM) for kvm
on s390. CMM allows the guest to inform the host about page usage
(see arch/s390/mm/cmm.c). The host uses this information to avoid
swapping in unused pages in the page fault handler. Further, a CPU
provided list of unused invalid pages is processed to reclaim swap
space of not yet accessed unused pages.

[ Martin Schwidefsky: patch reordering and cleanup ]

Signed-off-by: Konstantin Weitz <konstantin.weitz@gmail.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
  • Loading branch information
konne88 authored and Martin Schwidefsky committed Feb 21, 2014
1 parent 4596172 commit b31288f
Show file tree
Hide file tree
Showing 6 changed files with 175 additions and 1 deletion.
5 changes: 4 additions & 1 deletion arch/s390/include/asm/kvm_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,9 @@ struct kvm_s390_sie_block {
__u64 gbea; /* 0x0180 */
__u8 reserved188[24]; /* 0x0188 */
__u32 fac; /* 0x01a0 */
__u8 reserved1a4[68]; /* 0x01a4 */
__u8 reserved1a4[20]; /* 0x01a4 */
__u64 cbrlo; /* 0x01b8 */
__u8 reserved1c0[40]; /* 0x01c0 */
__u64 itdba; /* 0x01e8 */
__u8 reserved1f0[16]; /* 0x01f0 */
} __attribute__((packed));
Expand Down Expand Up @@ -155,6 +157,7 @@ struct kvm_vcpu_stat {
u32 instruction_stsi;
u32 instruction_stfl;
u32 instruction_tprot;
u32 instruction_essa;
u32 instruction_sigp_sense;
u32 instruction_sigp_sense_running;
u32 instruction_sigp_external_call;
Expand Down
26 changes: 26 additions & 0 deletions arch/s390/include/asm/pgtable.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ extern unsigned long MODULES_END;
#define _PAGE_READ 0x010 /* SW pte read bit */
#define _PAGE_WRITE 0x020 /* SW pte write bit */
#define _PAGE_SPECIAL 0x040 /* SW associated with special page */
#define _PAGE_UNUSED 0x080 /* SW bit for pgste usage state */
#define __HAVE_ARCH_PTE_SPECIAL

/* Set of bits not changed in pte_modify */
Expand Down Expand Up @@ -394,6 +395,12 @@ extern unsigned long MODULES_END;

#endif /* CONFIG_64BIT */

/* Guest Page State used for virtualization */
#define _PGSTE_GPS_ZERO 0x0000000080000000UL
#define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL
#define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL
#define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL

/*
* A user page table pointer has the space-switch-event bit, the
* private-space-control bit and the storage-alteration-event-control
Expand Down Expand Up @@ -617,6 +624,14 @@ static inline int pte_none(pte_t pte)
return pte_val(pte) == _PAGE_INVALID;
}

static inline int pte_swap(pte_t pte)
{
/* Bit pattern: (pte & 0x603) == 0x402 */
return (pte_val(pte) & (_PAGE_INVALID | _PAGE_PROTECT |
_PAGE_TYPE | _PAGE_PRESENT))
== (_PAGE_INVALID | _PAGE_TYPE);
}

static inline int pte_file(pte_t pte)
{
/* Bit pattern: (pte & 0x601) == 0x600 */
Expand Down Expand Up @@ -821,6 +836,7 @@ unsigned long gmap_translate(unsigned long address, struct gmap *);
unsigned long __gmap_fault(unsigned long address, struct gmap *);
unsigned long gmap_fault(unsigned long address, struct gmap *);
void gmap_discard(unsigned long from, unsigned long to, struct gmap *);
void __gmap_zap(unsigned long address, struct gmap *);

void gmap_register_ipte_notifier(struct gmap_notifier *);
void gmap_unregister_ipte_notifier(struct gmap_notifier *);
Expand Down Expand Up @@ -852,6 +868,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,

if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
pgste_set_key(ptep, pgste, entry);
pgste_set_pte(ptep, entry);
pgste_set_unlock(ptep, pgste);
Expand Down Expand Up @@ -881,6 +898,12 @@ static inline int pte_young(pte_t pte)
return (pte_val(pte) & _PAGE_YOUNG) != 0;
}

#define __HAVE_ARCH_PTE_UNUSED
static inline int pte_unused(pte_t pte)
{
return pte_val(pte) & _PAGE_UNUSED;
}

/*
* pgd/pmd/pte modification functions
*/
Expand Down Expand Up @@ -1196,6 +1219,9 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
pte_val(*ptep) = _PAGE_INVALID;

if (mm_has_pgste(vma->vm_mm)) {
if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
_PGSTE_GPS_USAGE_UNUSED)
pte_val(pte) |= _PAGE_UNUSED;
pgste = pgste_update_all(&pte, pgste);
pgste_set_unlock(ptep, pgste);
}
Expand Down
25 changes: 25 additions & 0 deletions arch/s390/kvm/kvm-s390.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "instruction_storage_key", VCPU_STAT(instruction_storage_key) },
{ "instruction_stsch", VCPU_STAT(instruction_stsch) },
{ "instruction_chsc", VCPU_STAT(instruction_chsc) },
{ "instruction_essa", VCPU_STAT(instruction_essa) },
{ "instruction_stsi", VCPU_STAT(instruction_stsi) },
{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
Expand Down Expand Up @@ -283,7 +284,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
if (kvm_is_ucontrol(vcpu->kvm))
gmap_free(vcpu->arch.gmap);

if (vcpu->arch.sie_block->cbrlo)
__free_page(__pfn_to_page(
vcpu->arch.sie_block->cbrlo >> PAGE_SHIFT));
free_page((unsigned long)(vcpu->arch.sie_block));

kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vcpu);
}
Expand Down Expand Up @@ -390,6 +395,8 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)

int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
struct page *cbrl;

atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
CPUSTAT_SM |
CPUSTAT_STOPPED |
Expand All @@ -401,6 +408,14 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->ecb2 = 8;
vcpu->arch.sie_block->eca = 0xC1002001U;
vcpu->arch.sie_block->fac = (int) (long) vfacilities;
if (kvm_enabled_cmma()) {
cbrl = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (cbrl) {
vcpu->arch.sie_block->ecb2 |= 0x80;
vcpu->arch.sie_block->ecb2 &= ~0x08;
vcpu->arch.sie_block->cbrlo = page_to_phys(cbrl);
}
}
hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
(unsigned long) vcpu);
Expand Down Expand Up @@ -761,6 +776,16 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
return rc;
}

bool kvm_enabled_cmma(void)
{
if (!MACHINE_IS_LPAR)
return false;
/* only enable for z10 and later */
if (!MACHINE_HAS_EDAT1)
return false;
return true;
}

static int __vcpu_run(struct kvm_vcpu *vcpu)
{
int rc, exit_reason;
Expand Down
2 changes: 2 additions & 0 deletions arch/s390/kvm/kvm-s390.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ void s390_vcpu_block(struct kvm_vcpu *vcpu);
void s390_vcpu_unblock(struct kvm_vcpu *vcpu);
void exit_sie(struct kvm_vcpu *vcpu);
void exit_sie_sync(struct kvm_vcpu *vcpu);
/* are we going to support cmma? */
bool kvm_enabled_cmma(void);
/* implemented in diag.c */
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);

Expand Down
41 changes: 41 additions & 0 deletions arch/s390/kvm/priv.c
Original file line number Diff line number Diff line change
Expand Up @@ -636,8 +636,49 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
return 0;
}

static int handle_essa(struct kvm_vcpu *vcpu)
{
/* entries expected to be 1FF */
int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
unsigned long *cbrlo, cbrle;
struct gmap *gmap;
int i;

VCPU_EVENT(vcpu, 5, "cmma release %d pages", entries);
gmap = vcpu->arch.gmap;
vcpu->stat.instruction_essa++;
if (!kvm_enabled_cmma() || !vcpu->arch.sie_block->cbrlo)
return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);

if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);

if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);

/* Rewind PSW to repeat the ESSA instruction */
vcpu->arch.sie_block->gpsw.addr =
__rewind_psw(vcpu->arch.sie_block->gpsw, 4);
vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */
cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
down_read(&gmap->mm->mmap_sem);
for (i = 0; i < entries; ++i) {
cbrle = cbrlo[i];
if (unlikely(cbrle & ~PAGE_MASK || cbrle < 2 * PAGE_SIZE))
/* invalid entry */
break;
/* try to free backing */
__gmap_zap(cbrle, gmap);
}
up_read(&gmap->mm->mmap_sem);
if (i < entries)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
return 0;
}

static const intercept_handler_t b9_handlers[256] = {
[0x8d] = handle_epsw,
[0xab] = handle_essa,
[0xaf] = handle_pfmf,
};

Expand Down
77 changes: 77 additions & 0 deletions arch/s390/mm/pgtable.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
Expand Down Expand Up @@ -594,6 +595,82 @@ unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
}
EXPORT_SYMBOL_GPL(gmap_fault);

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
if (!non_swap_entry(entry))
dec_mm_counter(mm, MM_SWAPENTS);
else if (is_migration_entry(entry)) {
struct page *page = migration_entry_to_page(entry);

if (PageAnon(page))
dec_mm_counter(mm, MM_ANONPAGES);
else
dec_mm_counter(mm, MM_FILEPAGES);
}
free_swap_and_cache(entry);
}

/**
* The mm->mmap_sem lock must be held
*/
static void gmap_zap_unused(struct mm_struct *mm, unsigned long address)
{
unsigned long ptev, pgstev;
spinlock_t *ptl;
pgste_t pgste;
pte_t *ptep, pte;

ptep = get_locked_pte(mm, address, &ptl);
if (unlikely(!ptep))
return;
pte = *ptep;
if (!pte_swap(pte))
goto out_pte;
/* Zap unused and logically-zero pages */
pgste = pgste_get_lock(ptep);
pgstev = pgste_val(pgste);
ptev = pte_val(pte);
if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
gmap_zap_swap_entry(pte_to_swp_entry(pte), mm);
pte_clear(mm, address, ptep);
}
pgste_set_unlock(ptep, pgste);
out_pte:
pte_unmap_unlock(*ptep, ptl);
}

/*
* this function is assumed to be called with mmap_sem held
*/
void __gmap_zap(unsigned long address, struct gmap *gmap)
{
unsigned long *table, *segment_ptr;
unsigned long segment, pgstev, ptev;
struct gmap_pgtable *mp;
struct page *page;

segment_ptr = gmap_table_walk(address, gmap);
if (IS_ERR(segment_ptr))
return;
segment = *segment_ptr;
if (segment & _SEGMENT_ENTRY_INVALID)
return;
page = pfn_to_page(segment >> PAGE_SHIFT);
mp = (struct gmap_pgtable *) page->index;
address = mp->vmaddr | (address & ~PMD_MASK);
/* Page table is present */
table = (unsigned long *)(segment & _SEGMENT_ENTRY_ORIGIN);
table = table + ((address >> 12) & 0xff);
pgstev = table[PTRS_PER_PTE];
ptev = table[0];
/* quick check, checked again with locks held */
if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID)))
gmap_zap_unused(gmap->mm, address);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{

Expand Down

0 comments on commit b31288f

Please sign in to comment.