From 80d44f908501dc1e44addcefece25c09050ef26d Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Fri, 10 Jul 2015 18:22:11 +0300 Subject: [PATCH 1/3] [dx11] Moved CDVDCodecUtils::CopyDXVA2Picture to WinRenderer and optimize copying a DXVA texture to YUV buffer with sse4. --- project/VS2010Express/XBMC.vcxproj | 1 + project/VS2010Express/XBMC.vcxproj.filters | 3 + xbmc/cores/VideoRenderers/RenderManager.cpp | 4 - xbmc/cores/VideoRenderers/WinRenderer.cpp | 117 +++++++++++++++- xbmc/cores/VideoRenderers/WinRenderer.h | 11 +- .../dvdplayer/DVDCodecs/DVDCodecUtils.cpp | 102 -------------- .../cores/dvdplayer/DVDCodecs/DVDCodecUtils.h | 1 - xbmc/utils/win32/gpu_memcpy_sse4.h | 128 ++++++++++++++++++ 8 files changed, 255 insertions(+), 112 deletions(-) create mode 100644 xbmc/utils/win32/gpu_memcpy_sse4.h diff --git a/project/VS2010Express/XBMC.vcxproj b/project/VS2010Express/XBMC.vcxproj index f876e46f8e27f..764d85cd5631b 100644 --- a/project/VS2010Express/XBMC.vcxproj +++ b/project/VS2010Express/XBMC.vcxproj @@ -1126,6 +1126,7 @@ + diff --git a/project/VS2010Express/XBMC.vcxproj.filters b/project/VS2010Express/XBMC.vcxproj.filters index 79ad351f86b70..66dafd3c347b6 100644 --- a/project/VS2010Express/XBMC.vcxproj.filters +++ b/project/VS2010Express/XBMC.vcxproj.filters @@ -6187,6 +6187,9 @@ video\jobs + + utils\win32 + diff --git a/xbmc/cores/VideoRenderers/RenderManager.cpp b/xbmc/cores/VideoRenderers/RenderManager.cpp index 09327a37c692a..f35e0b3bbdf01 100644 --- a/xbmc/cores/VideoRenderers/RenderManager.cpp +++ b/xbmc/cores/VideoRenderers/RenderManager.cpp @@ -991,10 +991,6 @@ int CXBMCRenderManager::AddVideoPicture(DVDVideoPicture& pic) { CDVDCodecUtils::CopyYUV422PackedPicture(&image, &pic); } - else if(pic.format == RENDER_FMT_DXVA) - { - CDVDCodecUtils::CopyDXVA2Picture(&image, &pic); - } #ifdef HAVE_LIBVDPAU else if(pic.format == RENDER_FMT_VDPAU || pic.format == RENDER_FMT_VDPAU_420) diff --git a/xbmc/cores/VideoRenderers/WinRenderer.cpp b/xbmc/cores/VideoRenderers/WinRenderer.cpp index 9f3bf1325b0e0..98c1a1c16f336 100644 --- a/xbmc/cores/VideoRenderers/WinRenderer.cpp +++ b/xbmc/cores/VideoRenderers/WinRenderer.cpp @@ -30,7 +30,9 @@ #include "settings/MediaSettings.h" #include "settings/Settings.h" #include "threads/SingleLock.h" +#include "utils/CPUInfo.h" #include "utils/log.h" +#include "utils/win32/gpu_memcpy_sse4.h" #include "VideoShaders/WinVideoFilter.h" #include "windowing/WindowingFactory.h" @@ -286,6 +288,18 @@ bool CWinRenderer::AddVideoPicture(DVDVideoPicture* picture, int index) m_frameIdx += 2; return true; } + else if (picture->format == RENDER_FMT_DXVA) + { + int source = index; + if (source < 0 || NextYV12Texture() < 0) + return false; + + YUVBuffer *buf = (YUVBuffer*)m_VideoBuffers[source]; + if (buf->IsReadyToRender()) + return false; + + return buf->CopyFromDXVA(reinterpret_cast(picture->dxva->view)); + } return false; } @@ -1261,6 +1275,7 @@ bool YUVBuffer::Create(ERenderFormat format, unsigned int width, unsigned int he void YUVBuffer::Release() { + SAFE_RELEASE(m_staging); for(unsigned i = 0; i < m_activeplanes; i++) { planes[i].texture.Release(); @@ -1275,9 +1290,9 @@ void YUVBuffer::StartRender() m_locked = false; - for(unsigned i = 0; i < m_activeplanes; i++) + for (unsigned i = 0; i < m_activeplanes; i++) { - if(planes[i].texture.Get() && planes[i].rect.pData) + if (planes[i].texture.Get() && planes[i].rect.pData) if (!planes[i].texture.UnlockRect(0)) CLog::Log(LOGERROR, __FUNCTION__" - failed to unlock texture %d", i); memset(&planes[i].rect, 0, sizeof(planes[i].rect)); @@ -1353,10 +1368,104 @@ void YUVBuffer::Clear() } bool YUVBuffer::IsReadyToRender() +{ + return !m_locked; +} + +bool YUVBuffer::CopyFromDXVA(ID3D11VideoDecoderOutputView* pView) +{ + if (!pView) + return false; + + HRESULT hr = S_OK; + D3D11_VIDEO_DECODER_OUTPUT_VIEW_DESC vpivd; + pView->GetDesc(&vpivd); + ID3D11Resource* resource = nullptr; + pView->GetResource(&resource); + + if (!m_staging) + { + // create staging texture + ID3D11Texture2D* surface = nullptr; + hr = resource->QueryInterface(__uuidof(ID3D11Texture2D), reinterpret_cast(&surface)); + if (SUCCEEDED(hr)) + { + D3D11_TEXTURE2D_DESC tDesc; + surface->GetDesc(&tDesc); + SAFE_RELEASE(surface); + + CD3D11_TEXTURE2D_DESC sDesc(tDesc); + sDesc.ArraySize = 1; + sDesc.Usage = D3D11_USAGE_STAGING; + sDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + sDesc.BindFlags = 0; + + hr = g_Windowing.Get3D11Device()->CreateTexture2D(&sDesc, nullptr, &m_staging); + if (SUCCEEDED(hr)) + m_sDesc = sDesc; + } + } + + if (m_staging) + { + ID3D11DeviceContext* pContext = g_Windowing.GetImmediateContext(); + // copy content from decoder texture to temporary texture. + pContext->CopySubresourceRegion(m_staging, + D3D11CalcSubresource(0, 0, 1), + 0, 0, 0, + resource, + D3D11CalcSubresource(0, vpivd.Texture2D.ArraySlice, 1), + nullptr); + PerformCopy(); + } + SAFE_RELEASE(resource); + + return SUCCEEDED(hr); +} + +void YUVBuffer::PerformCopy() { if (!m_locked) - return true; - return false; + return; + + ID3D11DeviceContext* pContext = g_Windowing.GetImmediateContext(); + D3D11_MAPPED_SUBRESOURCE rectangle; + if (SUCCEEDED(pContext->Map(m_staging, 0, D3D11_MAP_READ, 0, &rectangle))) + { + void* (*copy_func)(void* d, const void* s, size_t size) = + ((g_cpuInfo.GetCPUFeatures() & CPU_FEATURE_SSE4) != 0) ? gpu_memcpy : memcpy; + + uint8_t* s_y = static_cast(rectangle.pData); + uint8_t *s_uv = static_cast(rectangle.pData) + m_sDesc.Height * rectangle.RowPitch; + uint8_t* d_y = static_cast(planes[PLANE_Y].rect.pData); + uint8_t *d_uv = static_cast(planes[PLANE_UV].rect.pData); + + if ( planes[PLANE_Y ].rect.RowPitch == rectangle.RowPitch + && planes[PLANE_UV].rect.RowPitch == rectangle.RowPitch) + { + copy_func(d_y, s_y, rectangle.RowPitch * m_height); + copy_func(d_uv, s_uv, rectangle.RowPitch * m_height >> 1); + } + else + { + for (unsigned y = 0; y < m_sDesc.Height >> 1; ++y) + { + // Copy Y + copy_func(d_y, s_y, planes[PLANE_Y].rect.RowPitch); + s_y += rectangle.RowPitch; + d_y += planes[PLANE_Y].rect.RowPitch; + // Copy Y + copy_func(d_y, s_y, planes[PLANE_Y].rect.RowPitch); + s_y += rectangle.RowPitch; + d_y += planes[PLANE_Y].rect.RowPitch; + // Copy UV + copy_func(d_uv, s_uv, planes[PLANE_UV].rect.RowPitch); + s_uv += rectangle.RowPitch; + d_uv += planes[PLANE_UV].rect.RowPitch; + } + } + pContext->Unmap(m_staging, 0); + } } #endif diff --git a/xbmc/cores/VideoRenderers/WinRenderer.h b/xbmc/cores/VideoRenderers/WinRenderer.h index a693a84e1db36..79fa33e65f3c6 100644 --- a/xbmc/cores/VideoRenderers/WinRenderer.h +++ b/xbmc/cores/VideoRenderers/WinRenderer.h @@ -102,7 +102,10 @@ struct SVideoPlane struct YUVBuffer : SVideoBuffer { - YUVBuffer() : m_width(0), m_height(0), m_format(RENDER_FMT_NONE), m_activeplanes(0), m_locked(false) {} + YUVBuffer() : m_width(0), m_height(0), m_format(RENDER_FMT_NONE), m_activeplanes(0), m_locked(false), m_staging(nullptr) + { + memset(&m_sDesc, 0, sizeof(CD3D11_TEXTURE2D_DESC)); + } ~YUVBuffer(); bool Create(ERenderFormat format, unsigned int width, unsigned int height, bool dynamic); virtual void Release(); @@ -111,16 +114,21 @@ struct YUVBuffer : SVideoBuffer virtual void Clear(); unsigned int GetActivePlanes() { return m_activeplanes; } virtual bool IsReadyToRender(); + bool CopyFromDXVA(ID3D11VideoDecoderOutputView* pView); SVideoPlane planes[MAX_PLANES]; private: + void PerformCopy(); + unsigned int m_width; unsigned int m_height; ERenderFormat m_format; unsigned int m_activeplanes; bool m_locked; D3D11_MAP m_mapType; + ID3D11Texture2D* m_staging; + CD3D11_TEXTURE2D_DESC m_sDesc; }; struct DXVABuffer : SVideoBuffer @@ -189,6 +197,7 @@ class CWinRenderer : public CBaseRenderer void SelectPSVideoFilter(); void UpdatePSVideoFilter(); bool CreateIntermediateRenderTarget(unsigned int width, unsigned int height); + bool CopyDXVA2YUVBuffer(ID3D11VideoDecoderOutputView* pView, YUVBuffer *pBuf); void RenderProcessor(DWORD flags); int m_iYV12RenderBuffer; diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp index 864a5f13818d4..a65af666a9749 100644 --- a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp +++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp @@ -24,10 +24,6 @@ #include "utils/log.h" #include "cores/FFmpeg.h" #include "Util.h" -#ifdef HAS_DX -#include "cores/dvdplayer/DVDCodecs/Video/DXVA.h" -#include "windowing/WindowingFactory.h" -#endif #ifdef TARGET_WINDOWS #pragma comment(lib, "avcodec.lib") @@ -357,104 +353,6 @@ bool CDVDCodecUtils::CopyYUV422PackedPicture(YV12Image* pImage, DVDVideoPicture return true; } -bool CDVDCodecUtils::CopyDXVA2Picture(YV12Image* pImage, DVDVideoPicture *pSrc) -{ -#ifdef HAS_DX - HRESULT hr; - switch (pSrc->extended_format) - { - case DXGI_FORMAT_NV12: // MAKEFOURCC('N', 'V', '1', '2'): - // Future... - //case DXGI_FORMAT_420_OPAQUE: // MAKEFOURCC('Y', 'V', '1', '2'): - //case MAKEFOURCC('Y','V','V','Y'): - what is it? - break; - default: - CLog::Log(LOGWARNING, "CDVDCodecUtils::CopyDXVA2Picture colorspace not supported"); - return false; - } - - // TODO: Optimize this later using shaders/swscale/etc. - ID3D11VideoDecoderOutputView* view = reinterpret_cast(pSrc->dxva->view); - if (!view) - return false; - - ID3D11Resource* resource = nullptr; - ID3D11Texture2D* surface = nullptr; - view->GetResource(&resource); - hr = resource->QueryInterface(__uuidof(ID3D11Texture2D), reinterpret_cast(&surface)); - SAFE_RELEASE(resource); - if (FAILED(hr)) - return false; - - D3D11_VIDEO_DECODER_OUTPUT_VIEW_DESC vpivd; - view->GetDesc(&vpivd); - - D3D11_TEXTURE2D_DESC tDesc; - surface->GetDesc(&tDesc); - - int subresource = D3D11CalcSubresource(0, vpivd.Texture2D.ArraySlice, tDesc.MipLevels); - - // we cannot read from dxva decoder texture so create new one with read access and copy content to it. - CD3D11_TEXTURE2D_DESC sDesc(tDesc); - sDesc.ArraySize = 1; - sDesc.Usage = D3D11_USAGE_STAGING; - sDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; - sDesc.BindFlags = 0; - - ID3D11Texture2D* staging = nullptr; - hr = g_Windowing.Get3D11Device()->CreateTexture2D(&sDesc, nullptr, &staging); - if (FAILED(hr)) - { - SAFE_RELEASE(surface); - return false; - } - - ID3D11DeviceContext* pContext = g_Windowing.GetImmediateContext(); - // copy content from decoder texture to temporary texture. - pContext->CopySubresourceRegion(staging, D3D11CalcSubresource(0, 0, tDesc.MipLevels), 0, 0, 0, surface, subresource, nullptr); - - D3D11_MAPPED_SUBRESOURCE rectangle; - if (FAILED(pContext->Map(staging, 0, D3D11_MAP_READ, 0, &rectangle))) - return false; - - switch (pSrc->extended_format) - { - case DXGI_FORMAT_NV12: - { - uint8_t* s_y = (uint8_t*)(rectangle.pData); - uint8_t* d_y = pImage->plane[0]; - uint8_t *s_uv = ((uint8_t*)(rectangle.pData)) + sDesc.Height * rectangle.RowPitch; - uint8_t *d_uv = pImage->plane[1]; - for (unsigned y = 0; y < pSrc->iHeight >> 1; ++y) - { - // Copy Y - memcpy(d_y, s_y, pSrc->iWidth); - s_y += rectangle.RowPitch; - d_y += pImage->stride[0]; - // Copy Y - memcpy(d_y, s_y, pSrc->iWidth); - s_y += rectangle.RowPitch; - d_y += pImage->stride[0]; - // Copy UV - memcpy(d_uv, s_uv, pSrc->iWidth); - s_uv += rectangle.RowPitch; - d_uv += pImage->stride[1]; - } - } - break; - case DXGI_FORMAT_420_OPAQUE: - // not implemented yet - break; - } - pContext->Unmap(staging, 0); - SAFE_RELEASE(surface); - SAFE_RELEASE(staging); - return true; - -#endif // HAS_DX - return false; -} - bool CDVDCodecUtils::IsVP3CompatibleWidth(int width) { // known hardware limitation of purevideo 3 (VP3). (the Nvidia 9400 is a purevideo 3 chip) diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.h b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.h index c8ed9b9ab160c..13587feabd144 100644 --- a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.h +++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.h @@ -37,7 +37,6 @@ class CDVDCodecUtils static DVDVideoPicture* ConvertToYUV422PackedPicture(DVDVideoPicture *pSrc, ERenderFormat format); static bool CopyNV12Picture(YV12Image* pImage, DVDVideoPicture *pSrc); static bool CopyYUV422PackedPicture(YV12Image* pImage, DVDVideoPicture *pSrc); - static bool CopyDXVA2Picture(YV12Image* pImage, DVDVideoPicture *pSrc); static bool IsVP3CompatibleWidth(int width); diff --git a/xbmc/utils/win32/gpu_memcpy_sse4.h b/xbmc/utils/win32/gpu_memcpy_sse4.h new file mode 100644 index 0000000000000..5fbea9d99d675 --- /dev/null +++ b/xbmc/utils/win32/gpu_memcpy_sse4.h @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2011-2015 Hendrik Leppkes + * http://www.1f0.de + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Taken from the QuickSync decoder by Eric Gur + */ + +#include + +// gpu_memcpy is a memcpy style function that copied data very fast from a +// GPU tiled memory (write back) +// Performance tip: page offset (12 lsb) of both addresses should be different +// optimally use a 2K offset between them. +inline void* gpu_memcpy(void* d, const void* s, size_t size) +{ + static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16 + + if (d == nullptr || s == nullptr) return nullptr; + + // If memory is not aligned, use memcpy + bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0; + if (!isAligned) + { + return memcpy(d, s, size); + } + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; +#ifdef _M_X64 + __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; +#endif + + size_t reminder = size & (regsInLoop * sizeof(xmm0) - 1); // Copy 128 or 256 bytes every loop + size_t end = 0; + + __m128i* pTrg = (__m128i*)d; + __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4); + __m128i* pSrc = (__m128i*)s; + + // Make sure source is synced - doesn't hurt if not needed. + _mm_sfence(); + + while (pTrg < pTrgEnd) + { + // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA + // Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad) + xmm0 = _mm_stream_load_si128(pSrc); + xmm1 = _mm_stream_load_si128(pSrc + 1); + xmm2 = _mm_stream_load_si128(pSrc + 2); + xmm3 = _mm_stream_load_si128(pSrc + 3); + xmm4 = _mm_stream_load_si128(pSrc + 4); + xmm5 = _mm_stream_load_si128(pSrc + 5); + xmm6 = _mm_stream_load_si128(pSrc + 6); + xmm7 = _mm_stream_load_si128(pSrc + 7); +#ifdef _M_X64 // Use all 16 xmm registers + xmm8 = _mm_stream_load_si128(pSrc + 8); + xmm9 = _mm_stream_load_si128(pSrc + 9); + xmm10 = _mm_stream_load_si128(pSrc + 10); + xmm11 = _mm_stream_load_si128(pSrc + 11); + xmm12 = _mm_stream_load_si128(pSrc + 12); + xmm13 = _mm_stream_load_si128(pSrc + 13); + xmm14 = _mm_stream_load_si128(pSrc + 14); + xmm15 = _mm_stream_load_si128(pSrc + 15); +#endif + pSrc += regsInLoop; + // _mm_store_si128 emit the SSE2 intruction MOVDQA (aligned store) + _mm_store_si128(pTrg , xmm0); + _mm_store_si128(pTrg + 1, xmm1); + _mm_store_si128(pTrg + 2, xmm2); + _mm_store_si128(pTrg + 3, xmm3); + _mm_store_si128(pTrg + 4, xmm4); + _mm_store_si128(pTrg + 5, xmm5); + _mm_store_si128(pTrg + 6, xmm6); + _mm_store_si128(pTrg + 7, xmm7); +#ifdef _M_X64 // Use all 16 xmm registers + _mm_store_si128(pTrg + 8, xmm8); + _mm_store_si128(pTrg + 9, xmm9); + _mm_store_si128(pTrg + 10, xmm10); + _mm_store_si128(pTrg + 11, xmm11); + _mm_store_si128(pTrg + 12, xmm12); + _mm_store_si128(pTrg + 13, xmm13); + _mm_store_si128(pTrg + 14, xmm14); + _mm_store_si128(pTrg + 15, xmm15); +#endif + pTrg += regsInLoop; + } + + // Copy in 16 byte steps + if (reminder >= 16) + { + size = reminder; + reminder = size & 15; + end = size >> 4; + for (size_t i = 0; i < end; ++i) + { + pTrg[i] = _mm_stream_load_si128(pSrc + i); + } + } + + // Copy last bytes - shouldn't happen as strides are modulu 16 + if (reminder) + { + __m128i temp = _mm_stream_load_si128(pSrc + end); + + char* ps = (char*)(&temp); + char* pt = (char*)(pTrg + end); + + for (size_t i = 0; i < reminder; ++i) + { + pt[i] = ps[i]; + } + } + + return d; +} \ No newline at end of file From 95d07d3faee82314d94f880421fec87c70d1ca45 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Sat, 18 Jul 2015 13:28:14 +0300 Subject: [PATCH 2/3] [dxva] CProcessorHD::Convert - Optimize method with sse2 instructions. --- project/VS2010Express/XBMC.vcxproj | 1 + project/VS2010Express/XBMC.vcxproj.filters | 3 + xbmc/cores/VideoRenderers/DXVAHD.cpp | 54 ++++++---- xbmc/utils/win32/memcpy_sse2.h | 113 +++++++++++++++++++++ 4 files changed, 149 insertions(+), 22 deletions(-) create mode 100644 xbmc/utils/win32/memcpy_sse2.h diff --git a/project/VS2010Express/XBMC.vcxproj b/project/VS2010Express/XBMC.vcxproj index 764d85cd5631b..199bd8dd20ed2 100644 --- a/project/VS2010Express/XBMC.vcxproj +++ b/project/VS2010Express/XBMC.vcxproj @@ -1127,6 +1127,7 @@ + diff --git a/project/VS2010Express/XBMC.vcxproj.filters b/project/VS2010Express/XBMC.vcxproj.filters index 66dafd3c347b6..4a8cab1925a25 100644 --- a/project/VS2010Express/XBMC.vcxproj.filters +++ b/project/VS2010Express/XBMC.vcxproj.filters @@ -6190,6 +6190,9 @@ utils\win32 + + utils\win32 + diff --git a/xbmc/cores/VideoRenderers/DXVAHD.cpp b/xbmc/cores/VideoRenderers/DXVAHD.cpp index 8e90a3f9c881f..1ca33dbd194f2 100644 --- a/xbmc/cores/VideoRenderers/DXVAHD.cpp +++ b/xbmc/cores/VideoRenderers/DXVAHD.cpp @@ -34,6 +34,7 @@ #include "settings/MediaSettings.h" #include "utils/AutoPtrHandle.h" #include "utils/Log.h" +#include "utils/win32/memcpy_sse2.h" #include "win32/WIN32Util.h" #include "windowing/WindowingFactory.h" @@ -434,9 +435,9 @@ bool CProcessorHD::CreateSurfaces() CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture) { // RENDER_FMT_YUV420P -> DXGI_FORMAT_NV12 - // RENDER_FMT_YUV420P10 -> DXGI_FORMAT_P010/DXGI_FORMAT_Y410 - // RENDER_FMT_YUV420P16 -> DXGI_FORMAT_P016/DXGI_FORMAT_Y416 - if (picture->format != RENDER_FMT_YUV420P + // RENDER_FMT_YUV420P10 -> DXGI_FORMAT_P010 + // RENDER_FMT_YUV420P16 -> DXGI_FORMAT_P016 + if ( picture->format != RENDER_FMT_YUV420P && picture->format != RENDER_FMT_YUV420P10 && picture->format != RENDER_FMT_YUV420P16) { @@ -470,28 +471,38 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture) return nullptr; } - // Convert to NV12 - Luma - // TODO: Optimize this later using shaders/swscale/etc. - uint8_t *s = picture->data[0]; - uint8_t* bits = (uint8_t*)rectangle.pData; - for (unsigned y = 0; y < picture->iHeight; y++) + if (picture->format == RENDER_FMT_YUV420P) { - memcpy(bits, s, picture->iWidth); - s += picture->iLineSize[0]; - bits += rectangle.RowPitch; + uint8_t* pData = static_cast(rectangle.pData); + uint8_t* dst[] = { pData, pData + sDesc.Height * rectangle.RowPitch }; + int dstStride[] = { rectangle.RowPitch, rectangle.RowPitch }; + convert_yuv420_nv12(picture->data, picture->iLineSize, picture->iHeight, picture->iWidth, dst, dstStride); } - - // Convert to NV12 - Chroma - uint8_t *s_u, *s_v, *d_uv; - for (unsigned y = 0; y < picture->iHeight / 2; y++) + else { - s_u = picture->data[1] + y * picture->iLineSize[1]; - s_v = picture->data[2] + y * picture->iLineSize[2]; - d_uv = (uint8_t*)rectangle.pData + (sDesc.Height + y) * rectangle.RowPitch; - for (unsigned x = 0; x < picture->iWidth / 2; x++) + // TODO: Optimize this later using sse2/sse4 + uint16_t * d_y = static_cast(rectangle.pData); + uint16_t * d_uv = d_y + sDesc.Height * rectangle.RowPitch; + // Convert to NV12 - Luma + for (size_t line = 0; line < picture->iHeight; ++line) { - *d_uv++ = *s_u++; - *d_uv++ = *s_v++; + uint16_t * y = (uint16_t*)(picture->data[0] + picture->iLineSize[0] * line); + uint16_t * d = d_y + rectangle.RowPitch * line; + memcpy(d, y, picture->iLineSize[0]); + } + // Convert to NV12 - Chroma + size_t chromaWidth = (picture->iWidth + 1) >> 1; + size_t chromaHeight = picture->iHeight >> 1; + for (size_t line = 0; line < chromaHeight; ++line) + { + uint16_t * u = (uint16_t*)picture->data[1] + line * picture->iLineSize[1]; + uint16_t * v = (uint16_t*)picture->data[2] + line * picture->iLineSize[2]; + uint16_t * d = d_uv + line * rectangle.RowPitch; + for (size_t x = 0; x < chromaWidth; x++) + { + *d++ = *u++; + *d++ = *v++; + } } } pContext->Unmap(texture, subresource); @@ -503,7 +514,6 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture) return pic; } - bool CProcessorHD::ApplyFilter(D3D11_VIDEO_PROCESSOR_FILTER filter, int value, int min, int max, int def) { if (filter >= NUM_FILTERS) diff --git a/xbmc/utils/win32/memcpy_sse2.h b/xbmc/utils/win32/memcpy_sse2.h new file mode 100644 index 0000000000000..c585136547487 --- /dev/null +++ b/xbmc/utils/win32/memcpy_sse2.h @@ -0,0 +1,113 @@ +/* +* Copyright (C) 2005-2015 Team Kodi +* http://kodi.tv +* +* This library is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License as published by the Free Software Foundation; either +* version 2.1 of the License, or (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* Lesser General Public License for more details. +* +* You should have received a copy of the GNU Lesser General Public +* License along with this library; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +* +*/ + +#include + +inline void* memcpy_aligned(void* dst, const void* src, size_t size) +{ + size_t i; + __m128i xmm1, xmm2, xmm3, xmm4; + + // if memory is not aligned, use memcpy + if ((((size_t)(src) | (size_t)(dst)) & 0xF)) + return memcpy(dst, src, size); + + uint8_t* d = (uint8_t*)(dst); + uint8_t* s = (uint8_t*)(src); + + for (i = 0; i < size - 63; i += 64) + { + xmm1 = _mm_load_si128((__m128i*)(s + i + 0)); + xmm2 = _mm_load_si128((__m128i*)(s + i + 16)); + xmm3 = _mm_load_si128((__m128i*)(s + i + 32)); + xmm4 = _mm_load_si128((__m128i*)(s + i + 48)); + _mm_stream_si128((__m128i*)(d + i + 0), xmm1); + _mm_stream_si128((__m128i*)(d + i + 16), xmm2); + _mm_stream_si128((__m128i*)(d + i + 32), xmm3); + _mm_stream_si128((__m128i*)(d + i + 48), xmm4); + } + for (; i < size; i += 16) + { + xmm1 = _mm_load_si128((__m128i*)(s + i)); + _mm_stream_si128((__m128i*)(d + i), xmm1); + } + return dst; +} + +inline void convert_yuv420_nv12(uint8_t *const src[], const int srcStride[], int height, int width, uint8_t *const dst[], const int dstStride[]) +{ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + _mm_sfence(); + + // Convert to NV12 - Luma + if (srcStride[0] == dstStride[0]) + memcpy_aligned(dst[0], src[0], srcStride[0] * height); + else + { + for (size_t line = 0; line < height; ++line) + { + uint8_t * s = src[0] + srcStride[0] * line; + uint8_t * d = dst[0] + dstStride[0] * line; + memcpy_aligned(d, s, srcStride[0]); + } + } + // Convert to NV12 - Chroma + size_t chromaWidth = (width + 1) >> 1; + size_t chromaHeight = height >> 1; + for (size_t line = 0; line < chromaHeight; ++line) + { + size_t i; + uint8_t * u = src[1] + line * srcStride[1]; + uint8_t * v = src[2] + line * srcStride[2]; + uint8_t * d = dst[1] + line * dstStride[1]; + for (i = 0; i < (chromaWidth - 31); i += 32) + { + xmm0 = _mm_load_si128((__m128i*)(v + i)); + xmm1 = _mm_load_si128((__m128i*)(u + i)); + xmm2 = _mm_load_si128((__m128i*)(v + i + 16)); + xmm3 = _mm_load_si128((__m128i*)(u + i + 16)); + + xmm4 = xmm0; + xmm0 = _mm_unpacklo_epi8(xmm1, xmm0); + xmm4 = _mm_unpackhi_epi8(xmm1, xmm4); + + xmm1 = xmm2; + xmm2 = _mm_unpacklo_epi8(xmm3, xmm2); + xmm1 = _mm_unpackhi_epi8(xmm3, xmm1); + + _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0); + _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm4); + _mm_stream_si128((__m128i *)(d + (i << 1) + 32), xmm2); + _mm_stream_si128((__m128i *)(d + (i << 1) + 48), xmm1); + } + for (; i < chromaWidth; i += 16) + { + xmm0 = _mm_load_si128((__m128i*)(v + i)); + xmm1 = _mm_load_si128((__m128i*)(u + i)); + + xmm2 = xmm0; + xmm0 = _mm_unpacklo_epi8(xmm1, xmm0); + xmm2 = _mm_unpackhi_epi8(xmm1, xmm2); + + _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0); + _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm2); + } + } +} From f0a82491eec5848501878d50495e4a0fdf6a52d6 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Mon, 27 Jul 2015 17:21:34 +0300 Subject: [PATCH 3/3] [dx11] CProcessorHD: Get rig of unneeded std::map. --- xbmc/cores/VideoRenderers/DXVAHD.cpp | 67 ++++++++++++---------------- xbmc/cores/VideoRenderers/DXVAHD.h | 2 +- 2 files changed, 30 insertions(+), 39 deletions(-) diff --git a/xbmc/cores/VideoRenderers/DXVAHD.cpp b/xbmc/cores/VideoRenderers/DXVAHD.cpp index 1ca33dbd194f2..fb21b70f2ff6e 100644 --- a/xbmc/cores/VideoRenderers/DXVAHD.cpp +++ b/xbmc/cores/VideoRenderers/DXVAHD.cpp @@ -59,7 +59,6 @@ CProcessorHD::CProcessorHD() g_Windowing.Register(this); m_context = nullptr; - m_mappedResource.clear(); m_width = 0; m_height = 0; } @@ -84,12 +83,6 @@ void CProcessorHD::Close() SAFE_RELEASE(m_pEnumerator); SAFE_RELEASE(m_pVideoProcessor); SAFE_RELEASE(m_context); - std::map::iterator it = m_mappedResource.begin(); - for (; it != m_mappedResource.end(); ++it) - { - if (it->second) it->second->Release(); - } - m_mappedResource.clear(); } bool CProcessorHD::UpdateSize(const DXVA2_VideoDesc& dsc) @@ -125,6 +118,7 @@ bool CProcessorHD::PreInit() return false; } + memset(&m_texDesc, 0, sizeof(D3D11_TEXTURE2D_DESC)); return true; } @@ -382,42 +376,38 @@ bool CProcessorHD::OpenProcessor() bool CProcessorHD::CreateSurfaces() { + HRESULT hr; + size_t idx; ID3D11Device* pD3DDevice = g_Windowing.Get3D11Device(); // we cannot use texture array (like in decoder) for USAGE_DYNAMIC, so create separete textures - CD3D11_TEXTURE2D_DESC desc(m_textureFormat, (m_width + 15) & ~15, (m_height + 15) & ~15, 1, 1, D3D11_BIND_DECODER, D3D11_USAGE_DYNAMIC, D3D11_CPU_ACCESS_WRITE); - D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC pivd = {0}; - pivd.FourCC = 0; - pivd.ViewDimension = D3D11_VPIV_DIMENSION_TEXTURE2D; + CD3D11_TEXTURE2D_DESC texDesc(m_textureFormat, FFALIGN(m_width, 16), FFALIGN(m_height, 16), 1, 1, D3D11_BIND_DECODER, D3D11_USAGE_DYNAMIC, D3D11_CPU_ACCESS_WRITE); + D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC pivd = { 0, D3D11_VPIV_DIMENSION_TEXTURE2D }; pivd.Texture2D.ArraySlice = 0; pivd.Texture2D.MipSlice = 0; - ID3D11Texture2D* resource[32]; - ID3D11VideoProcessorInputView* views[32]; - memset(views, 0, 32 * sizeof(ID3D11VideoProcessorInputView*)); - memset(resource, 0, 32 * sizeof(ID3D11Texture2D*)); - bool needRelease = false; - + ID3D11VideoProcessorInputView* views[32] = { 0 }; CLog::Log(LOGDEBUG, "%s - Creating %d processor surfaces with format %d.", __FUNCTION__, m_size, m_textureFormat); - for (unsigned idx = 0; idx < m_size; idx++) + for (idx = 0; idx < m_size; idx++) { - if ( FAILED(pD3DDevice->CreateTexture2D(&desc, NULL, &resource[idx])) - || FAILED(m_pVideoDevice->CreateVideoProcessorInputView(resource[idx], m_pEnumerator, &pivd, &views[idx]))) - { - SAFE_RELEASE(resource[idx]); - SAFE_RELEASE(views[idx]); - needRelease = true; - } + ID3D11Texture2D* pTexture = nullptr; + hr = pD3DDevice->CreateTexture2D(&texDesc, NULL, &pTexture); + if (FAILED(hr)) + break; + + hr = m_pVideoDevice->CreateVideoProcessorInputView(pTexture, m_pEnumerator, &pivd, &views[idx]); + SAFE_RELEASE(pTexture); + if (FAILED(hr)) + break; } - if (needRelease) + if (idx != m_size) { + // something goes wrong CLog::Log(LOGERROR, "%s - Failed to create processor surfaces.", __FUNCTION__); - for (unsigned idx = 0; idx < m_size; idx++) { - SAFE_RELEASE(resource[idx]); SAFE_RELEASE(views[idx]); } return false; @@ -426,9 +416,10 @@ bool CProcessorHD::CreateSurfaces() m_context = new CSurfaceContext(); for (unsigned int i = 0; i < m_size; i++) { - m_mappedResource[views[i]] = resource[i]; m_context->AddSurface(views[i]); } + + m_texDesc = texDesc; return true; } @@ -453,18 +444,17 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture) } ID3D11VideoProcessorInputView* view = reinterpret_cast(pView); - ID3D11Texture2D* texture = m_mappedResource[view]; + + ID3D11Resource* pResource = nullptr; + view->GetResource(&pResource); D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC vpivd; view->GetDesc(&vpivd); - int subresource = D3D11CalcSubresource(0, vpivd.Texture2D.ArraySlice, 1); - - D3D11_TEXTURE2D_DESC sDesc; - texture->GetDesc(&sDesc); + UINT subresource = D3D11CalcSubresource(0, vpivd.Texture2D.ArraySlice, 1); D3D11_MAPPED_SUBRESOURCE rectangle; ID3D11DeviceContext* pContext = g_Windowing.GetImmediateContext(); - if (FAILED(pContext->Map(texture, subresource, D3D11_MAP_WRITE_DISCARD, 0, &rectangle))) + if (FAILED(pContext->Map(pResource, subresource, D3D11_MAP_WRITE_DISCARD, 0, &rectangle))) { CLog::Log(LOGERROR, "%s - could not lock rect", __FUNCTION__); m_context->ClearReference(view); @@ -474,7 +464,7 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture) if (picture->format == RENDER_FMT_YUV420P) { uint8_t* pData = static_cast(rectangle.pData); - uint8_t* dst[] = { pData, pData + sDesc.Height * rectangle.RowPitch }; + uint8_t* dst[] = { pData, pData + m_texDesc.Height * rectangle.RowPitch }; int dstStride[] = { rectangle.RowPitch, rectangle.RowPitch }; convert_yuv420_nv12(picture->data, picture->iLineSize, picture->iHeight, picture->iWidth, dst, dstStride); } @@ -482,7 +472,7 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture) { // TODO: Optimize this later using sse2/sse4 uint16_t * d_y = static_cast(rectangle.pData); - uint16_t * d_uv = d_y + sDesc.Height * rectangle.RowPitch; + uint16_t * d_uv = d_y + m_texDesc.Height * rectangle.RowPitch; // Convert to NV12 - Luma for (size_t line = 0; line < picture->iHeight; ++line) { @@ -505,7 +495,8 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture) } } } - pContext->Unmap(texture, subresource); + pContext->Unmap(pResource, subresource); + SAFE_RELEASE(pResource); m_context->ClearReference(view); m_context->MarkRender(view); diff --git a/xbmc/cores/VideoRenderers/DXVAHD.h b/xbmc/cores/VideoRenderers/DXVAHD.h index 609c7ee3a8466..154668f12b911 100644 --- a/xbmc/cores/VideoRenderers/DXVAHD.h +++ b/xbmc/cores/VideoRenderers/DXVAHD.h @@ -98,7 +98,7 @@ class CProcessorHD : ID3DResource unsigned int m_procIndex; D3D11_VIDEO_PROCESSOR_RATE_CONVERSION_CAPS m_rateCaps; - std::map m_mappedResource; + D3D11_TEXTURE2D_DESC m_texDesc; }; };