Skip to content

Commit

Permalink
x86_64, switch_to(): Load TLS descriptors before switching DS and ES
Browse files Browse the repository at this point in the history
Otherwise, if buggy user code points DS or ES into the TLS
array, they would be corrupted after a context switch.

This also significantly improves the comments and documents some
gotchas in the code.

Before this patch, the both tests below failed.  With this
patch, the es test passes, although the gsbase test still fails.

 ----- begin es test -----

/*
 * Copyright (c) 2014 Andy Lutomirski
 * GPL v2
 */

static unsigned short GDT3(int idx)
{
	return (idx << 3) | 3;
}

static int create_tls(int idx, unsigned int base)
{
	struct user_desc desc = {
		.entry_number    = idx,
		.base_addr       = base,
		.limit           = 0xfffff,
		.seg_32bit       = 1,
		.contents        = 0, /* Data, grow-up */
		.read_exec_only  = 0,
		.limit_in_pages  = 1,
		.seg_not_present = 0,
		.useable         = 0,
	};

	if (syscall(SYS_set_thread_area, &desc) != 0)
		err(1, "set_thread_area");

	return desc.entry_number;
}

int main()
{
	int idx = create_tls(-1, 0);
	printf("Allocated GDT index %d\n", idx);

	unsigned short orig_es;
	asm volatile ("mov %%es,%0" : "=rm" (orig_es));

	int errors = 0;
	int total = 1000;
	for (int i = 0; i < total; i++) {
		asm volatile ("mov %0,%%es" : : "rm" (GDT3(idx)));
		usleep(100);

		unsigned short es;
		asm volatile ("mov %%es,%0" : "=rm" (es));
		asm volatile ("mov %0,%%es" : : "rm" (orig_es));
		if (es != GDT3(idx)) {
			if (errors == 0)
				printf("[FAIL]\tES changed from 0x%hx to 0x%hx\n",
				       GDT3(idx), es);
			errors++;
		}
	}

	if (errors) {
		printf("[FAIL]\tES was corrupted %d/%d times\n", errors, total);
		return 1;
	} else {
		printf("[OK]\tES was preserved\n");
		return 0;
	}
}

 ----- end es test -----

 ----- begin gsbase test -----

/*
 * gsbase.c, a gsbase test
 * Copyright (c) 2014 Andy Lutomirski
 * GPL v2
 */

static unsigned char *testptr, *testptr2;

static unsigned char read_gs_testvals(void)
{
	unsigned char ret;
	asm volatile ("movb %%gs:%1, %0" : "=r" (ret) : "m" (*testptr));
	return ret;
}

int main()
{
	int errors = 0;

	testptr = mmap((void *)0x200000000UL, 1, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
	if (testptr == MAP_FAILED)
		err(1, "mmap");

	testptr2 = mmap((void *)0x300000000UL, 1, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
	if (testptr2 == MAP_FAILED)
		err(1, "mmap");

	*testptr = 0;
	*testptr2 = 1;

	if (syscall(SYS_arch_prctl, ARCH_SET_GS,
		    (unsigned long)testptr2 - (unsigned long)testptr) != 0)
		err(1, "ARCH_SET_GS");

	usleep(100);

	if (read_gs_testvals() == 1) {
		printf("[OK]\tARCH_SET_GS worked\n");
	} else {
		printf("[FAIL]\tARCH_SET_GS failed\n");
		errors++;
	}

	asm volatile ("mov %0,%%gs" : : "r" (0));

	if (read_gs_testvals() == 0) {
		printf("[OK]\tWriting 0 to gs worked\n");
	} else {
		printf("[FAIL]\tWriting 0 to gs failed\n");
		errors++;
	}

	usleep(100);

	if (read_gs_testvals() == 0) {
		printf("[OK]\tgsbase is still zero\n");
	} else {
		printf("[FAIL]\tgsbase was corrupted\n");
		errors++;
	}

	return errors == 0 ? 0 : 1;
}

 ----- end gsbase test -----

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: <stable@vger.kernel.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/509d27c9fec78217691c3dad91cec87e1006b34a.1418075657.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
  • Loading branch information
amluto authored and Ingo Molnar committed Dec 11, 2014
1 parent 29258cf commit f647d7c
Showing 1 changed file with 73 additions and 28 deletions.
101 changes: 73 additions & 28 deletions arch/x86/kernel/process_64.c
Expand Up @@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)


fpu = switch_fpu_prepare(prev_p, next_p, cpu); fpu = switch_fpu_prepare(prev_p, next_p, cpu);


/* /* Reload esp0 and ss1. */
* Reload esp0, LDT and the page table pointer:
*/
load_sp0(tss, next); load_sp0(tss, next);


/*
* Switch DS and ES.
* This won't pick up thread selector changes, but I guess that is ok.
*/
savesegment(es, prev->es);
if (unlikely(next->es | prev->es))
loadsegment(es, next->es);

savesegment(ds, prev->ds);
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);


/* We must save %fs and %gs before load_TLS() because /* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS(). * %fs and %gs may be cleared by load_TLS().
* *
Expand All @@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
savesegment(fs, fsindex); savesegment(fs, fsindex);
savesegment(gs, gsindex); savesegment(gs, gsindex);


/*
* Load TLS before restoring any segments so that segment loads
* reference the correct GDT entries.
*/
load_TLS(next, cpu); load_TLS(next, cpu);


/* /*
* Leave lazy mode, flushing any hypercalls made here. * Leave lazy mode, flushing any hypercalls made here. This
* This must be done before restoring TLS segments so * must be done after loading TLS entries in the GDT but before
* the GDT and LDT are properly updated, and must be * loading segments that might reference them, and and it must
* done before math_state_restore, so the TS bit is up * be done before math_state_restore, so the TS bit is up to
* to date. * date.
*/ */
arch_end_context_switch(next_p); arch_end_context_switch(next_p);


/* Switch DS and ES.
*
* Reading them only returns the selectors, but writing them (if
* nonzero) loads the full descriptor from the GDT or LDT. The
* LDT for next is loaded in switch_mm, and the GDT is loaded
* above.
*
* We therefore need to write new values to the segment
* registers on every context switch unless both the new and old
* values are zero.
*
* Note that we don't need to do anything for CS and SS, as
* those are saved and restored as part of pt_regs.
*/
savesegment(es, prev->es);
if (unlikely(next->es | prev->es))
loadsegment(es, next->es);

savesegment(ds, prev->ds);
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);

/* /*
* Switch FS and GS. * Switch FS and GS.
* *
* Segment register != 0 always requires a reload. Also * These are even more complicated than FS and GS: they have
* reload when it has changed. When prev process used 64bit * 64-bit bases are that controlled by arch_prctl. Those bases
* base always reload to avoid an information leak. * only differ from the values in the GDT or LDT if the selector
* is 0.
*
* Loading the segment register resets the hidden base part of
* the register to 0 or the value from the GDT / LDT. If the
* next base address zero, writing 0 to the segment register is
* much faster than using wrmsr to explicitly zero the base.
*
* The thread_struct.fs and thread_struct.gs values are 0
* if the fs and gs bases respectively are not overridden
* from the values implied by fsindex and gsindex. They
* are nonzero, and store the nonzero base addresses, if
* the bases are overridden.
*
* (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
* be impossible.
*
* Therefore we need to reload the segment registers if either
* the old or new selector is nonzero, and we need to override
* the base address if next thread expects it to be overridden.
*
* This code is unnecessarily slow in the case where the old and
* new indexes are zero and the new base is nonzero -- it will
* unnecessarily write 0 to the selector before writing the new
* base address.
*
* Note: This all depends on arch_prctl being the only way that
* user code can override the segment base. Once wrfsbase and
* wrgsbase are enabled, most of this code will need to change.
*/ */
if (unlikely(fsindex | next->fsindex | prev->fs)) { if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex); loadsegment(fs, next->fsindex);

/* /*
* Check if the user used a selector != 0; if yes * If user code wrote a nonzero value to FS, then it also
* clear 64bit base, since overloaded base is always * cleared the overridden base address.
* mapped to the Null selector *
* XXX: if user code wrote 0 to FS and cleared the base
* address itself, we won't notice and we'll incorrectly
* restore the prior base address next time we reschdule
* the process.
*/ */
if (fsindex) if (fsindex)
prev->fs = 0; prev->fs = 0;
} }
/* when next process has a 64bit base use it */
if (next->fs) if (next->fs)
wrmsrl(MSR_FS_BASE, next->fs); wrmsrl(MSR_FS_BASE, next->fs);
prev->fsindex = fsindex; prev->fsindex = fsindex;


if (unlikely(gsindex | next->gsindex | prev->gs)) { if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex); load_gs_index(next->gsindex);

/* This works (and fails) the same way as fsindex above. */
if (gsindex) if (gsindex)
prev->gs = 0; prev->gs = 0;
} }
Expand Down

0 comments on commit f647d7c

Please sign in to comment.