Skip to content

Commit

Permalink
[Metal] Support CommandList.CopyBuffer and UpdateBuffer when region i…
Browse files Browse the repository at this point in the history
…s unaligned.

* Metal's blit encoder does not support buffer copies with unaligned regions on macOS.
  An unaligned region is one where the size or offset is not a multiple of 4 bytes.
* When necessary, these copies are now performed with a specialized compute shader, which
  does a byte-by-byte copy of the data from source to destination.
* The compute shader is embedded into Veldrid.dll and lazily loaded at runtime if the
  unaligned copy code path is used.

Validation changes:

* An exception is now thrown if you attempt to create a buffer with BufferUsage.UniformBuffer
  whose size is not a multiple of 16 bytes.
  • Loading branch information
mellinoe committed Mar 13, 2018
1 parent de32772 commit 903ea1c
Show file tree
Hide file tree
Showing 13 changed files with 250 additions and 26 deletions.
4 changes: 4 additions & 0 deletions src/Veldrid.MetalBindings/MTLComputeCommandEncoder.cs
Expand Up @@ -15,6 +15,7 @@ public struct MTLComputeCommandEncoder
private static readonly Selector sel_endEncoding = "endEncoding";
private static readonly Selector sel_setTexture = "setTexture:atIndex:";
private static readonly Selector sel_setSamplerState = "setSamplerState:atIndex:";
private static readonly Selector sel_setBytes = "setBytes:length:atIndex:";

public void setComputePipelineState(MTLComputePipelineState state)
=> objc_msgSend(NativePtr, sel_setComputePipelineState, state.NativePtr);
Expand All @@ -25,6 +26,9 @@ public void setBuffer(MTLBuffer buffer, UIntPtr offset, UIntPtr index)
offset,
index);

public unsafe void setBytes(void* bytes, UIntPtr length, UIntPtr index)
=> objc_msgSend(NativePtr, sel_setBytes, bytes, length, index);

public void dispatchThreadGroups(MTLSize threadgroupsPerGrid, MTLSize threadsPerThreadgroup)
=> objc_msgSend(NativePtr, sel_dispatchThreadgroups0, threadgroupsPerGrid, threadsPerThreadgroup);

Expand Down
1 change: 1 addition & 0 deletions src/Veldrid.MetalBindings/MTLComputePipelineState.cs
Expand Up @@ -6,5 +6,6 @@ public struct MTLComputePipelineState
{
public readonly IntPtr NativePtr;
public MTLComputePipelineState(IntPtr ptr) => NativePtr = ptr;
public bool IsNull => NativePtr == IntPtr.Zero;
}
}
2 changes: 2 additions & 0 deletions src/Veldrid.MetalBindings/ObjectiveCRuntime.cs
Expand Up @@ -31,6 +31,8 @@ public static unsafe class ObjectiveCRuntime
[DllImport(ObjCLibrary, EntryPoint = "objc_msgSend")]
public static extern void objc_msgSend(IntPtr receiver, Selector selector, IntPtr a, UIntPtr b, UIntPtr c);
[DllImport(ObjCLibrary, EntryPoint = "objc_msgSend")]
public static extern void objc_msgSend(IntPtr receiver, Selector selector, void* a, UIntPtr b, UIntPtr c);
[DllImport(ObjCLibrary, EntryPoint = "objc_msgSend")]
public static extern void objc_msgSend(IntPtr receiver, Selector selector, MTLPrimitiveType a, UIntPtr b, UIntPtr c, UIntPtr d, UIntPtr e);
[DllImport(ObjCLibrary, EntryPoint = "objc_msgSend")]
public static extern void objc_msgSend(IntPtr receiver, Selector selector, NSRange a);
Expand Down
98 changes: 98 additions & 0 deletions src/Veldrid.Tests/BufferTests.cs
Expand Up @@ -303,10 +303,108 @@ public void CommandList_Update_Staging()
}
}

[Theory]
[InlineData(
60, BufferUsage.VertexBuffer, 1,
70, BufferUsage.VertexBuffer, 13,
11)]
[InlineData(
60, BufferUsage.Staging, 1,
70, BufferUsage.VertexBuffer, 13,
11)]
[InlineData(
60, BufferUsage.VertexBuffer, 1,
70, BufferUsage.Staging, 13,
11)]
[InlineData(
60, BufferUsage.Staging, 1,
70, BufferUsage.Staging, 13,
11)]
[InlineData(
5, BufferUsage.VertexBuffer, 3,
10, BufferUsage.VertexBuffer, 7,
2)]
public void Copy_UnalignedRegion(
uint srcBufferSize, BufferUsage srcUsage, uint srcCopyOffset,
uint dstBufferSize, BufferUsage dstUsage, uint dstCopyOffset,
uint copySize)
{
DeviceBuffer src = CreateBuffer(srcBufferSize, srcUsage);
DeviceBuffer dst = CreateBuffer(dstBufferSize, dstUsage);

byte[] data = Enumerable.Range(0, (int)srcBufferSize).Select(i => (byte)i).ToArray();
GD.UpdateBuffer(src, 0, data);

CommandList cl = RF.CreateCommandList();
cl.Begin();
cl.CopyBuffer(src, srcCopyOffset, dst, dstCopyOffset, copySize);
cl.End();

GD.SubmitCommands(cl);
GD.WaitForIdle();

DeviceBuffer readback = GetReadback(dst);

MappedResourceView<byte> readView = GD.Map<byte>(readback, MapMode.Read);
for (uint i = 0; i < copySize; i++)
{
byte expected = data[i + srcCopyOffset];
byte actual = readView[i + dstCopyOffset];
Assert.Equal(expected, actual);
}
GD.Unmap(readback);
}

[Theory]
[InlineData(BufferUsage.VertexBuffer, 13, 5, 1)]
[InlineData(BufferUsage.Staging, 13, 5, 1)]
public void CommandList_UpdateNonStaging_Unaligned(BufferUsage usage, uint bufferSize, uint dataSize, uint offset)
{
DeviceBuffer buffer = CreateBuffer(bufferSize, usage);
byte[] data = Enumerable.Range(0, (int)dataSize).Select(i => (byte)i).ToArray();
CommandList cl = RF.CreateCommandList();
cl.Begin();
cl.UpdateBuffer(buffer, offset, data);
cl.End();
GD.SubmitCommands(cl);
GD.WaitForIdle();

DeviceBuffer readback = GetReadback(buffer);
MappedResourceView<byte> readView = GD.Map<byte>(readback, MapMode.Read);
for (uint i = 0; i < dataSize; i++)
{
byte expected = data[i];
byte actual = readView[i + offset];
Assert.Equal(expected, actual);
}
GD.Unmap(readback);
}

private DeviceBuffer CreateBuffer(uint size, BufferUsage usage)
{
return RF.CreateBuffer(new BufferDescription(size, usage));
}

private DeviceBuffer GetReadback(DeviceBuffer buffer)
{
DeviceBuffer readback;
if ((buffer.Usage & BufferUsage.Staging) != 0)
{
readback = buffer;
}
else
{
readback = CreateBuffer(buffer.SizeInBytes, BufferUsage.Staging);
CommandList cl = RF.CreateCommandList();
cl.Begin();
cl.CopyBuffer(buffer, 0, readback, 0, buffer.SizeInBytes);
cl.End();
GD.SubmitCommands(cl);
GD.WaitForIdle();
}

return readback;
}
}

public class OpenGLBufferTests : BufferTestBase<OpenGLDeviceCreator> { }
Expand Down
Binary file not shown.
Binary file not shown.
21 changes: 21 additions & 0 deletions src/Veldrid/MTL/EmbeddedShaders/UnalignedBufferCopy.metal
@@ -0,0 +1,21 @@
#include <metal_stdlib>
using namespace metal;

// This must be kept in sync with MTLUnalignedBufferCopyInfo.cs
struct CopyInfo
{
uint32_t SrcOffset;
uint32_t DstOffset;
uint32_t CopySize;
};

kernel void copy_bytes(
device uint8_t* src [[ buffer(0) ]],
device uint8_t* dst [[ buffer(1) ]],
constant CopyInfo& info [[ buffer(2) ]])
{
for (uint32_t i = 0; i < info.CopySize; i++)
{
dst[i + info.DstOffset] = src[i + info.SrcOffset];
}
}
12 changes: 12 additions & 0 deletions src/Veldrid/MTL/EmbeddedShaders/recompile-shaders.sh
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

xcrun -sdk macosx metal $script_dir/UnalignedBufferCopy.metal -o $script_dir/UnalignedBufferCopy.macos.air
xcrun -sdk macosx metallib $script_dir/UnalignedBufferCopy.macos.air -o $script_dir/MTL_UnalignedBufferCopy_macOS.metallib

xcrun -sdk iphoneos metal $script_dir/UnalignedBufferCopy.metal -o $script_dir/UnalignedBufferCopy.ios.air
xcrun -sdk iphoneos metallib $script_dir/UnalignedBufferCopy.ios.air -o $script_dir/MTL_UnalignedBufferCopy_iOS.metallib

rm $script_dir/UnalignedBufferCopy.macos.air
rm $script_dir/UnalignedBufferCopy.ios.air
69 changes: 43 additions & 26 deletions src/Veldrid/MTL/MTLCommandList.cs
Expand Up @@ -222,6 +222,7 @@ private void PreComputeCommand()
public override void End()
{
EnsureNoBlitEncoder();
EnsureNoComputeEncoder();

if (!_currentFramebufferEverActive && _mtlFramebuffer != null)
{
Expand Down Expand Up @@ -280,29 +281,30 @@ public override void SetViewport(uint index, ref Viewport viewport)

public override void UpdateBuffer(DeviceBuffer buffer, uint bufferOffsetInBytes, IntPtr source, uint sizeInBytes)
{
if (bufferOffsetInBytes % 4 != 0)
{
throw new VeldridException("Metal needs 4-byte-multiple buffer copy size and offset.");
}
if (sizeInBytes % 4 != 0 && bufferOffsetInBytes != 0 && sizeInBytes != buffer.SizeInBytes)
{
throw new VeldridException("Metal needs 4-byte-multiple buffer copy size and offset.");
}

Debug.Assert(bufferOffsetInBytes % 4 == 0);

uint sizeRoundFactor = (4 - (sizeInBytes % 4)) % 4;
bool useComputeCopy = (bufferOffsetInBytes % 4 != 0)
|| (sizeInBytes % 4 != 0 && bufferOffsetInBytes != 0 && sizeInBytes != buffer.SizeInBytes);

MTLBuffer dstMTLBuffer = Util.AssertSubtype<DeviceBuffer, MTLBuffer>(buffer);
// TODO: Cache these, and rely on the command buffer's completion callback to add them back to a shared pool.
MTLBuffer copySrc = Util.AssertSubtype<DeviceBuffer, MTLBuffer>(
_gd.ResourceFactory.CreateBuffer(new BufferDescription(sizeInBytes, BufferUsage.Staging)));
_gd.UpdateBuffer(copySrc, 0, source, sizeInBytes);
EnsureBlitEncoder();
_bce.copy(
copySrc.DeviceBuffer, UIntPtr.Zero,
dstMTLBuffer.DeviceBuffer, (UIntPtr)bufferOffsetInBytes,
(UIntPtr)(sizeInBytes + sizeRoundFactor));

if (useComputeCopy)
{
CopyBufferCore(copySrc, 0, buffer, bufferOffsetInBytes, sizeInBytes);
}
else
{
Debug.Assert(bufferOffsetInBytes % 4 == 0);
uint sizeRoundFactor = (4 - (sizeInBytes % 4)) % 4;
EnsureBlitEncoder();
_bce.copy(
copySrc.DeviceBuffer, UIntPtr.Zero,
dstMTLBuffer.DeviceBuffer, (UIntPtr)bufferOffsetInBytes,
(UIntPtr)(sizeInBytes + sizeRoundFactor));
}

copySrc.Dispose();
}

Expand All @@ -313,18 +315,33 @@ public override void UpdateBuffer(DeviceBuffer buffer, uint bufferOffsetInBytes,
uint destinationOffset,
uint sizeInBytes)
{
MTLBuffer mtlSrc = Util.AssertSubtype<DeviceBuffer, MTLBuffer>(source);
MTLBuffer mtlDst = Util.AssertSubtype<DeviceBuffer, MTLBuffer>(destination);

if (sourceOffset % 4 != 0 || sizeInBytes % 4 != 0)
{
throw new NotImplementedException("Metal needs 4-byte-multiple buffer copy size and offset.");
}
// Unaligned copy -- use special compute shader.
EnsureComputeEncoder();
_cce.setComputePipelineState(_gd.GetUnalignedBufferCopyPipeline());
_cce.setBuffer(mtlSrc.DeviceBuffer, UIntPtr.Zero, (UIntPtr)0);
_cce.setBuffer(mtlDst.DeviceBuffer, UIntPtr.Zero, (UIntPtr)1);

EnsureBlitEncoder();
MTLBuffer mtlSrc = Util.AssertSubtype<DeviceBuffer, MTLBuffer>(source);
MTLBuffer mtlDst = Util.AssertSubtype<DeviceBuffer, MTLBuffer>(destination);
_bce.copy(
mtlSrc.DeviceBuffer, (UIntPtr)sourceOffset,
mtlDst.DeviceBuffer, (UIntPtr)destinationOffset,
(UIntPtr)sizeInBytes);
MTLUnalignedBufferCopyInfo copyInfo;
copyInfo.SourceOffset = sourceOffset;
copyInfo.DestinationOffset = destinationOffset;
copyInfo.CopySize = sizeInBytes;

_cce.setBytes(&copyInfo, (UIntPtr)sizeof(MTLUnalignedBufferCopyInfo), (UIntPtr)2);
_cce.dispatchThreadGroups(new MTLSize(1, 1, 1), new MTLSize(1, 1, 1));
}
else
{
EnsureBlitEncoder();
_bce.copy(
mtlSrc.DeviceBuffer, (UIntPtr)sourceOffset,
mtlDst.DeviceBuffer, (UIntPtr)destinationOffset,
(UIntPtr)sizeInBytes);
}
}

protected override void CopyTextureCore(
Expand Down
47 changes: 47 additions & 0 deletions src/Veldrid/MTL/MTLGraphicsDevice.cs
@@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
Expand All @@ -21,6 +22,12 @@ internal unsafe class MTLGraphicsDevice : GraphicsDevice
private readonly object _resetEventsLock = new object();
private readonly List<ManualResetEvent[]> _resetEvents = new List<ManualResetEvent[]>();

private const string UnalignedBufferCopyPipelineMacOSName = "MTL_UnalignedBufferCopy_macOS";
private const string UnalignedBufferCopyPipelineiOSName = "MTL_UnalignedBufferCopy_iOS";
private readonly object _unalignedBufferCopyPipelineLock = new object();
private MTLShader _unalignedBufferCopyShader;
private MTLComputePipelineState _unalignedBufferCopyPipeline;

public MTLDevice Device => _device;
public MTLCommandQueue CommandQueue => _commandQueue;
public MTLFeatureSupport Features { get; }
Expand Down Expand Up @@ -225,6 +232,11 @@ private MappedResource MapTexture(MTLTexture texture, MapMode mode, uint subreso
protected override void PlatformDispose()
{
WaitForIdle();
if (!_unalignedBufferCopyPipeline.IsNull)
{
_unalignedBufferCopyShader.Dispose();
ObjectiveCRuntime.release(_unalignedBufferCopyPipeline.NativePtr);
}
_mainSwapchain?.Dispose();
ObjectiveCRuntime.release(_commandQueue.NativePtr);
ObjectiveCRuntime.release(_device.NativePtr);
Expand Down Expand Up @@ -294,5 +306,40 @@ public override void ResetFence(Fence fence)
{
Util.AssertSubtype<Fence, MTLFence>(fence).Reset();
}

internal MTLComputePipelineState GetUnalignedBufferCopyPipeline()
{
lock (_unalignedBufferCopyPipelineLock)
{
if (_unalignedBufferCopyPipeline.IsNull)
{
MTLComputePipelineDescriptor descriptor = MTLUtil.AllocInit<MTLComputePipelineDescriptor>(
nameof(MTLComputePipelineDescriptor));
MTLPipelineBufferDescriptor buffer0 = descriptor.buffers[0];
buffer0.mutability = MTLMutability.Mutable;
MTLPipelineBufferDescriptor buffer1 = descriptor.buffers[1];
buffer0.mutability = MTLMutability.Mutable;

Debug.Assert(_unalignedBufferCopyShader == null);
string name = Features.IsMacOS ? UnalignedBufferCopyPipelineMacOSName : UnalignedBufferCopyPipelineiOSName;
using (Stream resourceStream = typeof(MTLGraphicsDevice).Assembly.GetManifestResourceStream(name))
{
byte[] data = new byte[resourceStream.Length];
using (MemoryStream ms = new MemoryStream(data))
{
resourceStream.CopyTo(ms);
ShaderDescription shaderDesc = new ShaderDescription(ShaderStages.Compute, data, "copy_bytes");
_unalignedBufferCopyShader = new MTLShader(ref shaderDesc, this);
}
}

descriptor.computeFunction = _unalignedBufferCopyShader.Function;
_unalignedBufferCopyPipeline = _device.newComputePipelineStateWithDescriptor(descriptor);
ObjectiveCRuntime.release(descriptor.NativePtr);
}

return _unalignedBufferCopyPipeline;
}
}
}
}
9 changes: 9 additions & 0 deletions src/Veldrid/MTL/MTLUnalignedBufferCopyInfo.cs
@@ -0,0 +1,9 @@
namespace Veldrid.MTL
{
internal struct MTLUnalignedBufferCopyInfo
{
public uint SourceOffset;
public uint DestinationOffset;
public uint CopySize;
}
}
4 changes: 4 additions & 0 deletions src/Veldrid/ResourceFactory.cs
Expand Up @@ -165,6 +165,10 @@ public DeviceBuffer CreateBuffer(ref BufferDescription description)
{
throw new VeldridException("Buffers with Staging Usage must not specify any other Usage flags.");
}
if ((description.Usage & BufferUsage.UniformBuffer) != 0 && (description.SizeInBytes % 16) != 0)
{
throw new VeldridException($"Uniform buffer size must be a multiple of 16 bytes.");
}
#endif
return CreateBufferCore(ref description);
}
Expand Down
9 changes: 9 additions & 0 deletions src/Veldrid/Veldrid.csproj
Expand Up @@ -36,6 +36,15 @@
<Compile Remove="OpenGL/**/*" Condition="'$(ExcludeOpenGL)' == 'true'" />
</ItemGroup>

<ItemGroup Condition="'$(ExcludeMetal)' != 'true'">
<EmbeddedResource Include="MTL/EmbeddedShaders/MTL_UnalignedBufferCopy_macOS.metallib">
<LogicalName>MTL_UnalignedBufferCopy_macOS</LogicalName>
</EmbeddedResource>
<EmbeddedResource Include="MTL/EmbeddedShaders/MTL_UnalignedBufferCopy_iOS.metallib">
<LogicalName>MTL_UnalignedBufferCopy_iOS</LogicalName>
</EmbeddedResource>
</ItemGroup>

<PropertyGroup>
<Description>A low-level, hardware-accelerated graphics and compute library for .NET, with backends for Direct3D 11, Vulkan, and OpenGL. Veldrid can be used to create high-performance 2D and 3D games, simulations, tools, and other graphical applications.</Description>
<PackageTags>3D Graphics Direct3D DirectX Vulkan OpenGL Metal Core Standard Game</PackageTags>
Expand Down

0 comments on commit 903ea1c

Please sign in to comment.