
Commit 71b3c12

amluto authored and Ingo Molnar committed
x86/mm: Add barriers and document switch_mm()-vs-flush synchronization
When switch_mm() activates a new PGD, it also sets a bit that tells other CPUs that the PGD is in use so that TLB flush IPIs will be sent. In order for that to work correctly, the bit needs to be visible prior to loading the PGD and therefore starting to fill the local TLB.

Document all the barriers that make this work correctly and add a couple that were missing.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: stable@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Parent: afd2ff9 · Commit: 71b3c12

2 files changed: +58 -4 lines

Diff for: arch/x86/include/asm/mmu_context.h (+32 -1)

@@ -116,8 +116,34 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 #endif
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		/* Re-load page tables */
+		/*
+		 * Re-load page tables.
+		 *
+		 * This logic has an ordering constraint:
+		 *
+		 *  CPU 0: Write to a PTE for 'next'
+		 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
+		 *  CPU 1: set bit 1 in next's mm_cpumask
+		 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
+		 *
+		 * We need to prevent an outcome in which CPU 1 observes
+		 * the new PTE value and CPU 0 observes bit 1 clear in
+		 * mm_cpumask.  (If that occurs, then the IPI will never
+		 * be sent, and CPU 0's TLB will contain a stale entry.)
+		 *
+		 * The bad outcome can occur if either CPU's load is
+		 * reordered before that CPU's store, so both CPUs must
+		 * execute full barriers to prevent this from happening.
+		 *
+		 * Thus, switch_mm needs a full barrier between the
+		 * store to mm_cpumask and any operation that could load
+		 * from next->pgd.  This barrier synchronizes with
+		 * remote TLB flushers.  Fortunately, load_cr3 is
+		 * serializing and thus acts as a full barrier.
+		 *
+		 */
 		load_cr3(next->pgd);
+
 		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 
 		/* Stop flush ipis for the previous mm */
@@ -156,10 +182,15 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			 * schedule, protecting us from simultaneous changes.
 			 */
 			cpumask_set_cpu(cpu, mm_cpumask(next));
+
 			/*
 			 * We were in lazy tlb mode and leave_mm disabled
 			 * tlb flush IPI delivery. We must reload CR3
 			 * to make sure to use no freed page tables.
+			 *
+			 * As above, this is a barrier that forces
+			 * TLB repopulation to be ordered after the
+			 * store to mm_cpumask.
 			 */
 			load_cr3(next->pgd);
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
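
The ordering problem documented in the new switch_mm() comment is the classic store-buffering pattern: each side stores one flag and then loads the other side's flag, and unless both sides execute a full barrier between their store and their load, both may miss the other's store. Below is a minimal user-space sketch of that pattern using C11 atomics; the names (pte, cpumask_bit, flusher, switcher) are illustrative stand-ins, not kernel code, with the seq_cst fences playing the role of smp_mb() on the flush side and the serializing CR3 write on the switch_mm() side.

/* Store-buffering sketch of the switch_mm() vs. TLB-flush ordering.
 * pte models "CPU 0 writes a PTE"; cpumask_bit models "CPU 1 sets its
 * bit in mm_cpumask". Build with: cc -std=c11 -pthread sb_sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pte;          /* written by the flusher (CPU 0) */
static atomic_int cpumask_bit;  /* set by the switcher (CPU 1)    */
static int r_flusher, r_switcher;

static void *flusher(void *arg)   /* CPU 0: write PTE, then read the cpumask bit */
{
	atomic_store_explicit(&pte, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);   /* smp_mb() analogue */
	r_flusher = atomic_load_explicit(&cpumask_bit, memory_order_relaxed);
	return NULL;
}

static void *switcher(void *arg)  /* CPU 1: set the cpumask bit, then read the PTE */
{
	atomic_store_explicit(&cpumask_bit, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);   /* load_cr3() analogue */
	r_switcher = atomic_load_explicit(&pte, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, flusher, NULL);
	pthread_create(&b, NULL, switcher, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With both fences, at least one side must observe the other's
	 * store: either the flusher sees the bit (and sends the IPI) or
	 * the switcher still sees the old PTE. */
	printf("flusher saw bit=%d, switcher saw pte=%d\n",
	       r_flusher, r_switcher);
	return 0;
}

A single run proves nothing, but with both fences in place the outcome "flusher saw bit=0, switcher saw pte=0" is forbidden by the C11 (and x86) memory model; drop either fence and that outcome becomes legal, which is exactly the missed-IPI-plus-stale-TLB case the comment rules out.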

Diff for: arch/x86/mm/tlb.c (+26 -3)

@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
 	preempt_disable();
 
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+	/* This is an implicit full barrier that synchronizes with switch_mm. */
 	local_flush_tlb();
+
 	trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
 		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
 
 	preempt_disable();
-	if (current->active_mm != mm)
+	if (current->active_mm != mm) {
+		/* Synchronize with switch_mm. */
+		smp_mb();
+
 		goto out;
+	}
 
 	if (!current->mm) {
 		leave_mm(smp_processor_id());
+
+		/* Synchronize with switch_mm. */
+		smp_mb();
+
 		goto out;
 	}
 
 	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
 		base_pages_to_flush = (end - start) >> PAGE_SHIFT;
 
+	/*
+	 * Both branches below are implicit full barriers (MOV to CR or
+	 * INVLPG) that synchronize with switch_mm.
+	 */
 	if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
 		base_pages_to_flush = TLB_FLUSH_ALL;
 		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 	preempt_disable();
 
 	if (current->active_mm == mm) {
-		if (current->mm)
+		if (current->mm) {
+			/*
+			 * Implicit full barrier (INVLPG) that synchronizes
+			 * with switch_mm.
+			 */
 			__flush_tlb_one(start);
-		else
+		} else {
 			leave_mm(smp_processor_id());
+
+			/* Synchronize with switch_mm. */
+			smp_mb();
+		}
 	}
 
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
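
Taken together, the tlb.c hunks apply one rule: every local flush path must contain a full barrier between any earlier PTE update and the read of mm_cpumask() that decides whether to call flush_tlb_others(). On paths that actually touch the TLB the barrier comes for free, since MOV to CR and INVLPG are serializing; only the early-exit paths that skip the TLB operation need an explicit smp_mb(). A condensed, illustrative skeleton of that rule follows (not one of the real functions above, just the shape they all share):

/* Illustrative skeleton only: mirrors the pattern of the functions in
 * the diff above. Every path reaches the mm_cpumask() read with one
 * full barrier behind it, either implicit or explicit. */
static void flush_path_sketch(struct mm_struct *mm, bool does_local_flush)
{
	if (does_local_flush)
		local_flush_tlb();	/* MOV to CR3: serializing, acts as a full barrier */
	else
		smp_mb();		/* no TLB operation on this path: explicit barrier */

	/* This read pairs with the barrier after cpumask_set_cpu() in switch_mm(). */
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
}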
