Commit f647d7c

amluto (Andy Lutomirski) authored and Ingo Molnar committed
x86_64, switch_to(): Load TLS descriptors before switching DS and ES
Otherwise, if buggy user code points DS or ES into the TLS array, they
would be corrupted after a context switch.

This also significantly improves the comments and documents some
gotchas in the code.

Before this patch, both tests below failed.  With this patch, the es
test passes, although the gsbase test still fails.

 ----- begin es test -----

/*
 * Copyright (c) 2014 Andy Lutomirski
 * GPL v2
 */

#include <stdio.h>
#include <unistd.h>
#include <err.h>
#include <sys/syscall.h>
#include <asm/ldt.h>

static unsigned short GDT3(int idx)
{
        return (idx << 3) | 3;
}

static int create_tls(int idx, unsigned int base)
{
        struct user_desc desc = {
                .entry_number    = idx,
                .base_addr       = base,
                .limit           = 0xfffff,
                .seg_32bit       = 1,
                .contents        = 0, /* Data, grow-up */
                .read_exec_only  = 0,
                .limit_in_pages  = 1,
                .seg_not_present = 0,
                .useable         = 0,
        };

        if (syscall(SYS_set_thread_area, &desc) != 0)
                err(1, "set_thread_area");

        return desc.entry_number;
}

int main()
{
        int idx = create_tls(-1, 0);
        printf("Allocated GDT index %d\n", idx);

        unsigned short orig_es;
        asm volatile ("mov %%es,%0" : "=rm" (orig_es));

        int errors = 0;
        int total = 1000;
        for (int i = 0; i < total; i++) {
                asm volatile ("mov %0,%%es" : : "rm" (GDT3(idx)));
                usleep(100);

                unsigned short es;
                asm volatile ("mov %%es,%0" : "=rm" (es));
                asm volatile ("mov %0,%%es" : : "rm" (orig_es));
                if (es != GDT3(idx)) {
                        if (errors == 0)
                                printf("[FAIL]\tES changed from 0x%hx to 0x%hx\n",
                                       GDT3(idx), es);
                        errors++;
                }
        }

        if (errors) {
                printf("[FAIL]\tES was corrupted %d/%d times\n", errors, total);
                return 1;
        } else {
                printf("[OK]\tES was preserved\n");
                return 0;
        }
}

 ----- end es test -----

 ----- begin gsbase test -----

/*
 * gsbase.c, a gsbase test
 * Copyright (c) 2014 Andy Lutomirski
 * GPL v2
 */

#include <stdio.h>
#include <unistd.h>
#include <err.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <asm/prctl.h>

static unsigned char *testptr, *testptr2;

static unsigned char read_gs_testvals(void)
{
        unsigned char ret;
        asm volatile ("movb %%gs:%1, %0" : "=r" (ret) : "m" (*testptr));
        return ret;
}

int main()
{
        int errors = 0;

        testptr = mmap((void *)0x200000000UL, 1, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
        if (testptr == MAP_FAILED)
                err(1, "mmap");

        testptr2 = mmap((void *)0x300000000UL, 1, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
        if (testptr2 == MAP_FAILED)
                err(1, "mmap");

        *testptr = 0;
        *testptr2 = 1;

        if (syscall(SYS_arch_prctl, ARCH_SET_GS,
                    (unsigned long)testptr2 - (unsigned long)testptr) != 0)
                err(1, "ARCH_SET_GS");

        usleep(100);

        if (read_gs_testvals() == 1) {
                printf("[OK]\tARCH_SET_GS worked\n");
        } else {
                printf("[FAIL]\tARCH_SET_GS failed\n");
                errors++;
        }

        asm volatile ("mov %0,%%gs" : : "r" (0));

        if (read_gs_testvals() == 0) {
                printf("[OK]\tWriting 0 to gs worked\n");
        } else {
                printf("[FAIL]\tWriting 0 to gs failed\n");
                errors++;
        }

        usleep(100);

        if (read_gs_testvals() == 0) {
                printf("[OK]\tgsbase is still zero\n");
        } else {
                printf("[FAIL]\tgsbase was corrupted\n");
                errors++;
        }

        return errors == 0 ? 0 : 1;
}

 ----- end gsbase test -----

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: <stable@vger.kernel.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/509d27c9fec78217691c3dad91cec87e1006b34a.1418075657.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
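For reference, the GDT3() helper in the es test above builds a standard x86 segment selector: bits 0-1 hold the RPL (3 = ring 3), bit 2 is the table indicator (0 = GDT, 1 = LDT), and bits 3 and up are the descriptor index, so (idx << 3) | 3 means "GDT entry idx, usable from ring 3". Below is a minimal stand-alone decoder, for illustration only; the index 12 is an arbitrary example, whereas the test obtains its real index from set_thread_area().

#include <stdio.h>

/* Same encoding as the test's GDT3() helper: GDT entry idx, ring 3. */
static unsigned short gdt3(int idx)
{
        return (unsigned short)((idx << 3) | 3);
}

int main(void)
{
        unsigned short sel = gdt3(12);  /* 12 is an example index only */

        printf("selector 0x%hx: index=%d TI=%d RPL=%d\n",
               sel, sel >> 3, (sel >> 2) & 1, sel & 3);
        return 0;
}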
1 parent 29258cf commit f647d7c

File tree

1 file changed: +73, -28 lines changed

Diff for: arch/x86/kernel/process_64.c

@@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
         fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
-        /*
-         * Reload esp0, LDT and the page table pointer:
-         */
+        /* Reload esp0 and ss1. */
         load_sp0(tss, next);
 
-        /*
-         * Switch DS and ES.
-         * This won't pick up thread selector changes, but I guess that is ok.
-         */
-        savesegment(es, prev->es);
-        if (unlikely(next->es | prev->es))
-                loadsegment(es, next->es);
-
-        savesegment(ds, prev->ds);
-        if (unlikely(next->ds | prev->ds))
-                loadsegment(ds, next->ds);
-
-
         /* We must save %fs and %gs before load_TLS() because
          * %fs and %gs may be cleared by load_TLS().
          *
@@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         savesegment(fs, fsindex);
         savesegment(gs, gsindex);
 
+        /*
+         * Load TLS before restoring any segments so that segment loads
+         * reference the correct GDT entries.
+         */
         load_TLS(next, cpu);
 
         /*
-         * Leave lazy mode, flushing any hypercalls made here.
-         * This must be done before restoring TLS segments so
-         * the GDT and LDT are properly updated, and must be
-         * done before math_state_restore, so the TS bit is up
-         * to date.
+         * Leave lazy mode, flushing any hypercalls made here. This
+         * must be done after loading TLS entries in the GDT but before
+         * loading segments that might reference them, and and it must
+         * be done before math_state_restore, so the TS bit is up to
+         * date.
          */
         arch_end_context_switch(next_p);
 
+        /* Switch DS and ES.
+         *
+         * Reading them only returns the selectors, but writing them (if
+         * nonzero) loads the full descriptor from the GDT or LDT. The
+         * LDT for next is loaded in switch_mm, and the GDT is loaded
+         * above.
+         *
+         * We therefore need to write new values to the segment
+         * registers on every context switch unless both the new and old
+         * values are zero.
+         *
+         * Note that we don't need to do anything for CS and SS, as
+         * those are saved and restored as part of pt_regs.
+         */
+        savesegment(es, prev->es);
+        if (unlikely(next->es | prev->es))
+                loadsegment(es, next->es);
+
+        savesegment(ds, prev->ds);
+        if (unlikely(next->ds | prev->ds))
+                loadsegment(ds, next->ds);
+
         /*
          * Switch FS and GS.
          *
-         * Segment register != 0 always requires a reload. Also
-         * reload when it has changed. When prev process used 64bit
-         * base always reload to avoid an information leak.
+         * These are even more complicated than FS and GS: they have
+         * 64-bit bases are that controlled by arch_prctl. Those bases
+         * only differ from the values in the GDT or LDT if the selector
+         * is 0.
+         *
+         * Loading the segment register resets the hidden base part of
+         * the register to 0 or the value from the GDT / LDT. If the
+         * next base address zero, writing 0 to the segment register is
+         * much faster than using wrmsr to explicitly zero the base.
+         *
+         * The thread_struct.fs and thread_struct.gs values are 0
+         * if the fs and gs bases respectively are not overridden
+         * from the values implied by fsindex and gsindex. They
+         * are nonzero, and store the nonzero base addresses, if
+         * the bases are overridden.
+         *
+         * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
+         * be impossible.
+         *
+         * Therefore we need to reload the segment registers if either
+         * the old or new selector is nonzero, and we need to override
+         * the base address if next thread expects it to be overridden.
+         *
+         * This code is unnecessarily slow in the case where the old and
+         * new indexes are zero and the new base is nonzero -- it will
+         * unnecessarily write 0 to the selector before writing the new
+         * base address.
+         *
+         * Note: This all depends on arch_prctl being the only way that
+         * user code can override the segment base. Once wrfsbase and
+         * wrgsbase are enabled, most of this code will need to change.
          */
         if (unlikely(fsindex | next->fsindex | prev->fs)) {
                 loadsegment(fs, next->fsindex);
+
                 /*
-                 * Check if the user used a selector != 0; if yes
-                 * clear 64bit base, since overloaded base is always
-                 * mapped to the Null selector
+                 * If user code wrote a nonzero value to FS, then it also
+                 * cleared the overridden base address.
+                 *
+                 * XXX: if user code wrote 0 to FS and cleared the base
+                 * address itself, we won't notice and we'll incorrectly
+                 * restore the prior base address next time we reschdule
+                 * the process.
                  */
                 if (fsindex)
                         prev->fs = 0;
         }
-        /* when next process has a 64bit base use it */
         if (next->fs)
                 wrmsrl(MSR_FS_BASE, next->fs);
         prev->fsindex = fsindex;
 
         if (unlikely(gsindex | next->gsindex | prev->gs)) {
                 load_gs_index(next->gsindex);
+
+                /* This works (and fails) the same way as fsindex above. */
                 if (gsindex)
                         prev->gs = 0;
         }
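The net effect of the hunks above is that every data segment register write now sees a GDT whose TLS slots already belong to next. A condensed sketch of the patched flow, paraphrasing the diff rather than quoting the kernel source literally:

        savesegment(fs, fsindex);               /* save old selectors first; load_TLS() may clear them */
        savesegment(gs, gsindex);

        load_TLS(next, cpu);                    /* install next's TLS descriptors into this CPU's GDT */
        arch_end_context_switch(next_p);        /* flush lazy (paravirt) updates so the GDT is current */

        savesegment(es, prev->es);              /* only now reload DS/ES: writing a segment register  */
        if (unlikely(next->es | prev->es))      /* re-reads its descriptor, so an ES that indexes a   */
                loadsegment(es, next->es);      /* TLS slot must be written after load_TLS(), not     */
                                                /* before it, as the pre-patch code did               */

The DS reload follows the same pattern, and FS/GS additionally restore their arch_prctl-controlled bases via wrmsrl() as described in the new comment block.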

0 commit comments
