diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 2c88bdb8ff7cc..4e224ef359406 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -335,7 +335,6 @@ struct drm_i915_gem_object { #define I915_BO_READONLY BIT(7) #define I915_TILING_QUIRK_BIT 8 /* unknown swizzling; do not release! */ #define I915_BO_PROTECTED BIT(9) -#define I915_BO_WAS_BOUND_BIT 10 /** * @mem_flags - Mutable placement-related flags * @@ -598,6 +597,8 @@ struct drm_i915_gem_object { * pages were last acquired. */ bool dirty:1; + + u32 tlb; } mm; struct { diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c index 6835279943df8..8357dbdcab5cb 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c @@ -191,6 +191,18 @@ static void unmap_object(struct drm_i915_gem_object *obj, void *ptr) vunmap(ptr); } +static void flush_tlb_invalidate(struct drm_i915_gem_object *obj) +{ + struct drm_i915_private *i915 = to_i915(obj->base.dev); + struct intel_gt *gt = to_gt(i915); + + if (!obj->mm.tlb) + return; + + intel_gt_invalidate_tlb(gt, obj->mm.tlb); + obj->mm.tlb = 0; +} + struct sg_table * __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj) { @@ -216,14 +228,7 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj) __i915_gem_object_reset_page_iter(obj); obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0; - if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) { - struct drm_i915_private *i915 = to_i915(obj->base.dev); - struct intel_gt *gt = to_gt(i915); - intel_wakeref_t wakeref; - - with_intel_gt_pm_if_awake(gt, wakeref) - intel_gt_invalidate_tlbs(gt); - } + flush_tlb_invalidate(obj); return pages; } diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index 80a6c3c59176d..a47dcf7663aef 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -36,8 +36,6 @@ static void __intel_gt_init_early(struct intel_gt *gt) { spin_lock_init(>->irq_lock); - mutex_init(>->tlb_invalidate_lock); - INIT_LIST_HEAD(>->closed_vma); spin_lock_init(>->closed_lock); @@ -48,6 +46,8 @@ static void __intel_gt_init_early(struct intel_gt *gt) intel_gt_init_reset(gt); intel_gt_init_requests(gt); intel_gt_init_timelines(gt); + mutex_init(>->tlb.invalidate_lock); + seqcount_mutex_init(>->tlb.seqno, >->tlb.invalidate_lock); intel_gt_pm_init_early(gt); intel_uc_init_early(>->uc); @@ -833,6 +833,7 @@ void intel_gt_driver_late_release_all(struct drm_i915_private *i915) intel_gt_fini_requests(gt); intel_gt_fini_reset(gt); intel_gt_fini_timelines(gt); + mutex_destroy(>->tlb.invalidate_lock); intel_engines_free(gt); } } @@ -1165,7 +1166,7 @@ get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, return rb; } -void intel_gt_invalidate_tlbs(struct intel_gt *gt) +static void mmio_invalidate_full(struct intel_gt *gt) { static const i915_reg_t gen8_regs[] = { [RENDER_CLASS] = GEN8_RTCR, @@ -1188,12 +1189,6 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt) const i915_reg_t *regs; unsigned int num = 0; - if (I915_SELFTEST_ONLY(gt->awake == -ENODEV)) - return; - - if (intel_gt_is_wedged(gt)) - return; - if (GRAPHICS_VER(i915) == 12) { regs = gen12_regs; num = ARRAY_SIZE(gen12_regs); @@ -1208,9 +1203,6 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt) "Platform does not implement TLB invalidation!")) return; - GEM_TRACE("\n"); - - mutex_lock(>->tlb_invalidate_lock); intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */ @@ -1230,6 +1222,8 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt) awake |= engine->mask; } + GT_TRACE(gt, "invalidated engines %08x\n", awake); + /* Wa_2207587034:tgl,dg1,rkl,adl-s,adl-p */ if (awake && (IS_TIGERLAKE(i915) || @@ -1269,5 +1263,38 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt) * transitions. */ intel_uncore_forcewake_put_delayed(uncore, FORCEWAKE_ALL); - mutex_unlock(>->tlb_invalidate_lock); +} + +static bool tlb_seqno_passed(const struct intel_gt *gt, u32 seqno) +{ + u32 cur = intel_gt_tlb_seqno(gt); + + /* Only skip if a *full* TLB invalidate barrier has passed */ + return (s32)(cur - ALIGN(seqno, 2)) > 0; +} + +void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno) +{ + intel_wakeref_t wakeref; + + if (I915_SELFTEST_ONLY(gt->awake == -ENODEV)) + return; + + if (intel_gt_is_wedged(gt)) + return; + + if (tlb_seqno_passed(gt, seqno)) + return; + + with_intel_gt_pm_if_awake(gt, wakeref) { + mutex_lock(>->tlb.invalidate_lock); + if (tlb_seqno_passed(gt, seqno)) + goto unlock; + + mmio_invalidate_full(gt); + + write_seqcount_invalidate(>->tlb.seqno); +unlock: + mutex_unlock(>->tlb.invalidate_lock); + } } diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h index 44c6cb63ccbc8..d5a2af76d6a52 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.h +++ b/drivers/gpu/drm/i915/gt/intel_gt.h @@ -123,7 +123,17 @@ void intel_gt_info_print(const struct intel_gt_info *info, void intel_gt_watchdog_work(struct work_struct *work); -void intel_gt_invalidate_tlbs(struct intel_gt *gt); +static inline u32 intel_gt_tlb_seqno(const struct intel_gt *gt) +{ + return seqprop_sequence(>->tlb.seqno); +} + +static inline u32 intel_gt_next_invalidate_tlb_full(const struct intel_gt *gt) +{ + return intel_gt_tlb_seqno(gt) | 1; +} + +void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno); struct resource intel_pci_resource(struct pci_dev *pdev, int bar); diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h index edd7a3cf5f5f5..b1120f73dc8c4 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -76,7 +77,22 @@ struct intel_gt { struct intel_uc uc; struct intel_gsc gsc; - struct mutex tlb_invalidate_lock; + struct { + /* Serialize global tlb invalidations */ + struct mutex invalidate_lock; + + /* + * Batch TLB invalidations + * + * After unbinding the PTE, we need to ensure the TLB + * are invalidated prior to releasing the physical pages. + * But we only need one such invalidation for all unbinds, + * so we track how many TLB invalidations have been + * performed since unbind the PTE and only emit an extra + * invalidate if no full barrier has been passed. + */ + seqcount_mutex_t seqno; + } tlb; struct i915_wa_list wa_list; diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c index d8b94d6385598..2da6c82a8bd2a 100644 --- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c @@ -206,8 +206,12 @@ void ppgtt_bind_vma(struct i915_address_space *vm, void ppgtt_unbind_vma(struct i915_address_space *vm, struct i915_vma_resource *vma_res) { - if (vma_res->allocated) - vm->clear_range(vm, vma_res->start, vma_res->vma_size); + if (!vma_res->allocated) + return; + + vm->clear_range(vm, vma_res->start, vma_res->vma_size); + if (vma_res->tlb) + vma_invalidate_tlb(vm, *vma_res->tlb); } static unsigned long pd_count(u64 size, int shift) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 04d12f278f572..04b9d38ac8f28 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -537,8 +537,6 @@ int i915_vma_bind(struct i915_vma *vma, bind_flags); } - set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags); - atomic_or(bind_flags, &vma->flags); return 0; } @@ -1301,6 +1299,19 @@ I915_SELFTEST_EXPORT int i915_vma_get_pages(struct i915_vma *vma) return err; } +void vma_invalidate_tlb(struct i915_address_space *vm, u32 tlb) +{ + /* + * Before we release the pages that were bound by this vma, we + * must invalidate all the TLBs that may still have a reference + * back to our physical address. It only needs to be done once, + * so after updating the PTE to point away from the pages, record + * the most recent TLB invalidation seqno, and if we have not yet + * flushed the TLBs upon release, perform a full invalidation. + */ + WRITE_ONCE(tlb, intel_gt_next_invalidate_tlb_full(vm->gt)); +} + static void __vma_put_pages(struct i915_vma *vma, unsigned int count) { /* We allocate under vma_get_pages, so beware the shrinker */ @@ -1927,7 +1938,12 @@ struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async) vma->vm->skip_pte_rewrite; trace_i915_vma_unbind(vma); - unbind_fence = i915_vma_resource_unbind(vma_res); + if (async) + unbind_fence = i915_vma_resource_unbind(vma_res, + &vma->obj->mm.tlb); + else + unbind_fence = i915_vma_resource_unbind(vma_res, NULL); + vma->resource = NULL; atomic_and(~(I915_VMA_BIND_MASK | I915_VMA_ERROR | I915_VMA_GGTT_WRITE), @@ -1935,10 +1951,13 @@ struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async) i915_vma_detach(vma); - if (!async && unbind_fence) { - dma_fence_wait(unbind_fence, false); - dma_fence_put(unbind_fence); - unbind_fence = NULL; + if (!async) { + if (unbind_fence) { + dma_fence_wait(unbind_fence, false); + dma_fence_put(unbind_fence); + unbind_fence = NULL; + } + vma_invalidate_tlb(vma->vm, vma->obj->mm.tlb); } /* diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h index 88ca0bd9c9003..5048eed536da4 100644 --- a/drivers/gpu/drm/i915/i915_vma.h +++ b/drivers/gpu/drm/i915/i915_vma.h @@ -213,6 +213,7 @@ bool i915_vma_misplaced(const struct i915_vma *vma, u64 size, u64 alignment, u64 flags); void __i915_vma_set_map_and_fenceable(struct i915_vma *vma); void i915_vma_revoke_mmap(struct i915_vma *vma); +void vma_invalidate_tlb(struct i915_address_space *vm, u32 tlb); struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async); int __i915_vma_unbind(struct i915_vma *vma); int __must_check i915_vma_unbind(struct i915_vma *vma); diff --git a/drivers/gpu/drm/i915/i915_vma_resource.c b/drivers/gpu/drm/i915/i915_vma_resource.c index 27c55027387a0..5a67995ea5fe2 100644 --- a/drivers/gpu/drm/i915/i915_vma_resource.c +++ b/drivers/gpu/drm/i915/i915_vma_resource.c @@ -223,10 +223,13 @@ i915_vma_resource_fence_notify(struct i915_sw_fence *fence, * Return: A refcounted pointer to a dma-fence that signals when unbinding is * complete. */ -struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res) +struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res, + u32 *tlb) { struct i915_address_space *vm = vma_res->vm; + vma_res->tlb = tlb; + /* Reference for the sw fence */ i915_vma_resource_get(vma_res); diff --git a/drivers/gpu/drm/i915/i915_vma_resource.h b/drivers/gpu/drm/i915/i915_vma_resource.h index 5d8427caa2ba2..06923d1816e7e 100644 --- a/drivers/gpu/drm/i915/i915_vma_resource.h +++ b/drivers/gpu/drm/i915/i915_vma_resource.h @@ -67,6 +67,7 @@ struct i915_page_sizes { * taken when the unbind is scheduled. * @skip_pte_rewrite: During ggtt suspend and vm takedown pte rewriting * needs to be skipped for unbind. + * @tlb: pointer for obj->mm.tlb, if async unbind. Otherwise, NULL * * The lifetime of a struct i915_vma_resource is from a binding request to * the actual possible asynchronous unbind has completed. @@ -119,6 +120,8 @@ struct i915_vma_resource { bool immediate_unbind:1; bool needs_wakeref:1; bool skip_pte_rewrite:1; + + u32 *tlb; }; bool i915_vma_resource_hold(struct i915_vma_resource *vma_res, @@ -131,7 +134,8 @@ struct i915_vma_resource *i915_vma_resource_alloc(void); void i915_vma_resource_free(struct i915_vma_resource *vma_res); -struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res); +struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res, + u32 *tlb); void __i915_vma_resource_init(struct i915_vma_resource *vma_res);