@@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
283283
284284 fpu = switch_fpu_prepare (prev_p , next_p , cpu );
285285
286- /*
287- * Reload esp0, LDT and the page table pointer:
288- */
286+ /* Reload esp0 and ss1. */
289287 load_sp0 (tss , next );
290288
291- /*
292- * Switch DS and ES.
293- * This won't pick up thread selector changes, but I guess that is ok.
294- */
295- savesegment (es , prev -> es );
296- if (unlikely (next -> es | prev -> es ))
297- loadsegment (es , next -> es );
298-
299- savesegment (ds , prev -> ds );
300- if (unlikely (next -> ds | prev -> ds ))
301- loadsegment (ds , next -> ds );
302-
303-
304289 /* We must save %fs and %gs before load_TLS() because
305290 * %fs and %gs may be cleared by load_TLS().
306291 *
@@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
309294 savesegment (fs , fsindex );
310295 savesegment (gs , gsindex );
311296
297+ /*
298+ * Load TLS before restoring any segments so that segment loads
299+ * reference the correct GDT entries.
300+ */
312301 load_TLS (next , cpu );
313302
314303 /*
315- * Leave lazy mode, flushing any hypercalls made here.
316- * This must be done before restoring TLS segments so
317- * the GDT and LDT are properly updated , and must be
318- * done before math_state_restore, so the TS bit is up
319- * to date.
304+ * Leave lazy mode, flushing any hypercalls made here. This
305+ * must be done after loading TLS entries in the GDT but before
306+ * loading segments that might reference them, and it must
307+ * be done before math_state_restore, so the TS bit is up to
308+ * date.
320309 */
321310 arch_end_context_switch (next_p );
322311
312+ /* Switch DS and ES.
313+ *
314+ * Reading them only returns the selectors, but writing them (if
315+ * nonzero) loads the full descriptor from the GDT or LDT. The
316+ * LDT for next is loaded in switch_mm, and the GDT is loaded
317+ * above.
318+ *
319+ * We therefore need to write new values to the segment
320+ * registers on every context switch unless both the new and old
321+ * values are zero.
322+ *
323+ * Note that we don't need to do anything for CS and SS, as
324+ * those are saved and restored as part of pt_regs.
325+ */
326+ savesegment (es , prev -> es );
327+ if (unlikely (next -> es | prev -> es ))
328+ loadsegment (es , next -> es );
329+
330+ savesegment (ds , prev -> ds );
331+ if (unlikely (next -> ds | prev -> ds ))
332+ loadsegment (ds , next -> ds );
333+
323334 /*
324335 * Switch FS and GS.
325336 *
326- * Segment register != 0 always requires a reload. Also
327- * reload when it has changed. When prev process used 64bit
328- * base always reload to avoid an information leak.
337+ * These are even more complicated than DS and ES: they have
338+ * 64-bit bases that are controlled by arch_prctl. Those bases
339+ * only differ from the values in the GDT or LDT if the selector
340+ * is 0.
341+ *
342+ * Loading the segment register resets the hidden base part of
343+ * the register to 0 or the value from the GDT / LDT. If the
344+ * next base address is zero, writing 0 to the segment register is
345+ * much faster than using wrmsr to explicitly zero the base.
346+ *
347+ * The thread_struct.fs and thread_struct.gs values are 0
348+ * if the fs and gs bases respectively are not overridden
349+ * from the values implied by fsindex and gsindex. They
350+ * are nonzero, and store the nonzero base addresses, if
351+ * the bases are overridden.
352+ *
353+ * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
354+ * be impossible.
355+ *
356+ * Therefore we need to reload the segment registers if either
357+ * the old or new selector is nonzero, and we need to override
358+ * the base address if next thread expects it to be overridden.
359+ *
360+ * This code is unnecessarily slow in the case where the old and
361+ * new indexes are zero and the new base is nonzero -- it will
362+ * unnecessarily write 0 to the selector before writing the new
363+ * base address.
364+ *
365+ * Note: This all depends on arch_prctl being the only way that
366+ * user code can override the segment base. Once wrfsbase and
367+ * wrgsbase are enabled, most of this code will need to change.
329368 */
330369 if (unlikely (fsindex | next -> fsindex | prev -> fs )) {
331370 loadsegment (fs , next -> fsindex );
371+
332372 /*
333- * Check if the user used a selector != 0; if yes
334- * clear 64bit base, since overloaded base is always
335- * mapped to the Null selector
373+ * If user code wrote a nonzero value to FS, then it also
374+ * cleared the overridden base address.
375+ *
376+ * XXX: if user code wrote 0 to FS and cleared the base
377+ * address itself, we won't notice and we'll incorrectly
378+ * restore the prior base address next time we reschedule
379+ * the process.
336380 */
337381 if (fsindex )
338382 prev -> fs = 0 ;
339383 }
340- /* when next process has a 64bit base use it */
341384 if (next -> fs )
342385 wrmsrl (MSR_FS_BASE , next -> fs );
343386 prev -> fsindex = fsindex ;
344387
345388 if (unlikely (gsindex | next -> gsindex | prev -> gs )) {
346389 load_gs_index (next -> gsindex );
390+
391+ /* This works (and fails) the same way as fsindex above. */
347392 if (gsindex )
348393 prev -> gs = 0 ;
349394 }
0 commit comments