From 80d44f908501dc1e44addcefece25c09050ef26d Mon Sep 17 00:00:00 2001
From: Anton Fedchin <afedchin@ruswizards.com>
Date: Fri, 10 Jul 2015 18:22:11 +0300
Subject: [PATCH 1/3] [dx11] Move CDVDCodecUtils::CopyDXVA2Picture to
 WinRenderer and optimize copying a DXVA texture to the YUV buffer with SSE4.

---
 project/VS2010Express/XBMC.vcxproj            |   1 +
 project/VS2010Express/XBMC.vcxproj.filters    |   3 +
 xbmc/cores/VideoRenderers/RenderManager.cpp   |   4 -
 xbmc/cores/VideoRenderers/WinRenderer.cpp     | 117 +++++++++++++++-
 xbmc/cores/VideoRenderers/WinRenderer.h       |  11 +-
 .../dvdplayer/DVDCodecs/DVDCodecUtils.cpp     | 102 --------------
 .../cores/dvdplayer/DVDCodecs/DVDCodecUtils.h |   1 -
 xbmc/utils/win32/gpu_memcpy_sse4.h            | 128 ++++++++++++++++++
 8 files changed, 255 insertions(+), 112 deletions(-)
 create mode 100644 xbmc/utils/win32/gpu_memcpy_sse4.h
diff --git a/project/VS2010Express/XBMC.vcxproj b/project/VS2010Express/XBMC.vcxproj
index f876e46f8e27f..764d85cd5631b 100644
--- a/project/VS2010Express/XBMC.vcxproj
+++ b/project/VS2010Express/XBMC.vcxproj
@@ -1126,6 +1126,7 @@
     <ClInclude Include="..\..\xbmc\utils\Utf8Utils.h" />
     <ClInclude Include="..\..\xbmc\utils\uXstrings.h" />
     <ClInclude Include="..\..\xbmc\utils\Vector.h" />
+    <ClInclude Include="..\..\xbmc\utils\win32\gpu_memcpy_sse4.h" />
     <ClInclude Include="..\..\xbmc\utils\win32\Win32InterfaceForCLog.h" />
     <ClInclude Include="..\..\xbmc\utils\win32\Win32Log.h" />
     <ClInclude Include="..\..\xbmc\utils\XSLTUtils.h" />
diff --git a/project/VS2010Express/XBMC.vcxproj.filters b/project/VS2010Express/XBMC.vcxproj.filters
index 79ad351f86b70..66dafd3c347b6 100644
--- a/project/VS2010Express/XBMC.vcxproj.filters
+++ b/project/VS2010Express/XBMC.vcxproj.filters
@@ -6187,6 +6187,9 @@
     <ClInclude Include="..\..\xbmc\video\jobs\VideoLibraryRefreshingJob.h">
       <Filter>video\jobs</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\xbmc\utils\win32\gpu_memcpy_sse4.h">
+      <Filter>utils\win32</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ResourceCompile Include="..\..\xbmc\win32\XBMC_PC.rc">
diff --git a/xbmc/cores/VideoRenderers/RenderManager.cpp b/xbmc/cores/VideoRenderers/RenderManager.cpp
index 09327a37c692a..f35e0b3bbdf01 100644
--- a/xbmc/cores/VideoRenderers/RenderManager.cpp
+++ b/xbmc/cores/VideoRenderers/RenderManager.cpp
@@ -991,10 +991,6 @@ int CXBMCRenderManager::AddVideoPicture(DVDVideoPicture& pic)
   {
     CDVDCodecUtils::CopyYUV422PackedPicture(&image, &pic);
   }
-  else if(pic.format == RENDER_FMT_DXVA)
-  {
-    CDVDCodecUtils::CopyDXVA2Picture(&image, &pic);
-  }
 #ifdef HAVE_LIBVDPAU
   else if(pic.format == RENDER_FMT_VDPAU
        || pic.format == RENDER_FMT_VDPAU_420)
diff --git a/xbmc/cores/VideoRenderers/WinRenderer.cpp b/xbmc/cores/VideoRenderers/WinRenderer.cpp
index 9f3bf1325b0e0..98c1a1c16f336 100644
--- a/xbmc/cores/VideoRenderers/WinRenderer.cpp
+++ b/xbmc/cores/VideoRenderers/WinRenderer.cpp
@@ -30,7 +30,9 @@
 #include "settings/MediaSettings.h"
 #include "settings/Settings.h"
 #include "threads/SingleLock.h"
+#include "utils/CPUInfo.h"
 #include "utils/log.h"
+#include "utils/win32/gpu_memcpy_sse4.h"
 #include "VideoShaders/WinVideoFilter.h"
 #include "windowing/WindowingFactory.h"
 
@@ -286,6 +288,18 @@ bool CWinRenderer::AddVideoPicture(DVDVideoPicture* picture, int index)
     m_frameIdx += 2;
     return true;
   }
+  else if (picture->format == RENDER_FMT_DXVA)
+  {
+    int source = index;
+    if (source < 0 || NextYV12Texture() < 0)
+      return false;
+
+    YUVBuffer *buf = (YUVBuffer*)m_VideoBuffers[source];
+    if (buf->IsReadyToRender())
+      return false;
+
+    return buf->CopyFromDXVA(reinterpret_cast<ID3D11VideoDecoderOutputView*>(picture->dxva->view));
+  }
   return false;
 }
 
@@ -1261,6 +1275,7 @@ bool YUVBuffer::Create(ERenderFormat format, unsigned int width, unsigned int he
 
 void YUVBuffer::Release()
 {
+  SAFE_RELEASE(m_staging);
   for(unsigned i = 0; i < m_activeplanes; i++)
   {
     planes[i].texture.Release();
@@ -1275,9 +1290,9 @@ void YUVBuffer::StartRender()
 
   m_locked = false;
 
-  for(unsigned i = 0; i < m_activeplanes; i++)
+  for (unsigned i = 0; i < m_activeplanes; i++)
   {
-    if(planes[i].texture.Get() && planes[i].rect.pData)
+    if (planes[i].texture.Get() && planes[i].rect.pData)
       if (!planes[i].texture.UnlockRect(0))
         CLog::Log(LOGERROR, __FUNCTION__" - failed to unlock texture %d", i);
     memset(&planes[i].rect, 0, sizeof(planes[i].rect));
@@ -1353,10 +1368,104 @@ void YUVBuffer::Clear()
 }
 
 bool YUVBuffer::IsReadyToRender()
+{
+  return !m_locked; // ready only while not locked; StartRender() is what clears m_locked
+}
+
+bool YUVBuffer::CopyFromDXVA(ID3D11VideoDecoderOutputView* pView)
+{
+  // Copies the decoder surface behind pView into this buffer via a CPU readable staging texture
+  // that is created on first use. Returns false for a null view or if that creation fails.
+  if (pView == nullptr)
+    return false;
+
+  D3D11_VIDEO_DECODER_OUTPUT_VIEW_DESC viewDesc;
+  pView->GetDesc(&viewDesc);
+  ID3D11Resource* pSource = nullptr;
+  pView->GetResource(&pSource);
+
+  HRESULT result = S_OK;
+  if (m_staging == nullptr)
+  {
+    // describe the staging texture after the decoder texture
+    ID3D11Texture2D* pDecoderTex = nullptr;
+    result = pSource->QueryInterface(__uuidof(ID3D11Texture2D), reinterpret_cast<void**>(&pDecoderTex));
+    if (SUCCEEDED(result))
+    {
+      D3D11_TEXTURE2D_DESC decoderDesc;
+      pDecoderTex->GetDesc(&decoderDesc);
+      SAFE_RELEASE(pDecoderTex);
+
+      // one slice only, readable by the CPU, no pipeline binding
+      CD3D11_TEXTURE2D_DESC stagingDesc(decoderDesc);
+      stagingDesc.ArraySize = 1;
+      stagingDesc.Usage = D3D11_USAGE_STAGING;
+      stagingDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+      stagingDesc.BindFlags = 0;
+
+      result = g_Windowing.Get3D11Device()->CreateTexture2D(&stagingDesc, nullptr, &m_staging);
+      if (SUCCEEDED(result))
+        m_sDesc = stagingDesc;
+    }
+  }
+
+  if (m_staging != nullptr)
+  {
+    // copy the decoded array slice into the staging texture, then on into the planes
+    const UINT srcSubresource = D3D11CalcSubresource(0, viewDesc.Texture2D.ArraySlice, 1);
+    const UINT dstSubresource = D3D11CalcSubresource(0, 0, 1);
+    g_Windowing.GetImmediateContext()->CopySubresourceRegion(m_staging, dstSubresource, 0, 0, 0,
+                                                             pSource, srcSubresource, nullptr);
+    PerformCopy();
+  }
+  SAFE_RELEASE(pSource);
+
+  return SUCCEEDED(result);
+}
+
+// Copies the picture held in m_staging (Y rows followed by interleaved UV rows) into the Y and UV
+// planes. Does nothing while the buffer is not locked or when the staging texture cannot be mapped.
+void YUVBuffer::PerformCopy()
 {
   if (!m_locked)
-    return true;
-  return false;
+    return;
+
+  ID3D11DeviceContext* pContext = g_Windowing.GetImmediateContext();
+  D3D11_MAPPED_SUBRESOURCE rectangle;
+  if (FAILED(pContext->Map(m_staging, 0, D3D11_MAP_READ, 0, &rectangle)))
+    return;
+
+  // streaming SSE4 copy when the CPU reports SSE4, plain memcpy otherwise
+  void* (*copy_func)(void* d, const void* s, size_t size) =
+      ((g_cpuInfo.GetCPUFeatures() & CPU_FEATURE_SSE4) != 0) ? gpu_memcpy : memcpy;
+
+  // in the staging surface the UV rows start right after all m_sDesc.Height luma rows
+  const unsigned srcPitch = rectangle.RowPitch;
+  const unsigned pitchY = planes[PLANE_Y].rect.RowPitch;
+  const unsigned pitchUV = planes[PLANE_UV].rect.RowPitch;
+  uint8_t* s_y = static_cast<uint8_t*>(rectangle.pData);
+  uint8_t* s_uv = s_y + m_sDesc.Height * srcPitch;
+  uint8_t* d_y = static_cast<uint8_t*>(planes[PLANE_Y].rect.pData);
+  uint8_t* d_uv = static_cast<uint8_t*>(planes[PLANE_UV].rect.pData);
+
+  if (pitchY == srcPitch && pitchUV == srcPitch)
+  {
+    copy_func(d_y, s_y, srcPitch * m_height);
+    copy_func(d_uv, s_uv, srcPitch * m_height >> 1);
+  }
+  else
+  {
+    // Row-by-row copy. Like the fast path it covers m_height rows (not the possibly taller
+    // staging surface) and it moves at most the narrower pitch, so no row is overrun.
+    const size_t bytesY = pitchY < srcPitch ? pitchY : srcPitch;
+    const size_t bytesUV = pitchUV < srcPitch ? pitchUV : srcPitch;
+    for (unsigned y = 0; y < m_height; ++y, s_y += srcPitch, d_y += pitchY)
+      copy_func(d_y, s_y, bytesY);
+    for (unsigned y = 0; y < m_height >> 1; ++y, s_uv += srcPitch, d_uv += pitchUV)
+      copy_func(d_uv, s_uv, bytesUV);
+  }
+
+  pContext->Unmap(m_staging, 0);
 }
 
 #endif
diff --git a/xbmc/cores/VideoRenderers/WinRenderer.h b/xbmc/cores/VideoRenderers/WinRenderer.h
index a693a84e1db36..79fa33e65f3c6 100644
--- a/xbmc/cores/VideoRenderers/WinRenderer.h
+++ b/xbmc/cores/VideoRenderers/WinRenderer.h
@@ -102,7 +102,10 @@ struct SVideoPlane
 
 struct YUVBuffer : SVideoBuffer
 {
-  YUVBuffer() : m_width(0), m_height(0), m_format(RENDER_FMT_NONE), m_activeplanes(0), m_locked(false) {}
+  YUVBuffer() : m_width(0), m_height(0), m_format(RENDER_FMT_NONE), m_activeplanes(0), m_locked(false), m_staging(nullptr)
+  {
+    memset(&m_sDesc, 0, sizeof(CD3D11_TEXTURE2D_DESC));
+  }
   ~YUVBuffer();
   bool Create(ERenderFormat format, unsigned int width, unsigned int height, bool dynamic);
   virtual void Release();
@@ -111,16 +114,21 @@ struct YUVBuffer : SVideoBuffer
   virtual void Clear();
   unsigned int GetActivePlanes() { return m_activeplanes; }
   virtual bool IsReadyToRender();
+  bool CopyFromDXVA(ID3D11VideoDecoderOutputView* pView);
 
   SVideoPlane planes[MAX_PLANES];
 
 private:
+  void PerformCopy();
+
   unsigned int     m_width;
   unsigned int     m_height;
   ERenderFormat    m_format;
   unsigned int     m_activeplanes;
   bool             m_locked;
   D3D11_MAP        m_mapType;
+  ID3D11Texture2D* m_staging;
+  CD3D11_TEXTURE2D_DESC m_sDesc;
 };
 
 struct DXVABuffer : SVideoBuffer
@@ -189,6 +197,7 @@ class CWinRenderer : public CBaseRenderer
   void SelectPSVideoFilter();
   void UpdatePSVideoFilter();
   bool CreateIntermediateRenderTarget(unsigned int width, unsigned int height);
+  bool CopyDXVA2YUVBuffer(ID3D11VideoDecoderOutputView* pView, YUVBuffer *pBuf);
 
   void RenderProcessor(DWORD flags);
   int  m_iYV12RenderBuffer;
diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp
index 864a5f13818d4..a65af666a9749 100644
--- a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp
+++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp
@@ -24,10 +24,6 @@
 #include "utils/log.h"
 #include "cores/FFmpeg.h"
 #include "Util.h"
-#ifdef HAS_DX
-#include "cores/dvdplayer/DVDCodecs/Video/DXVA.h"
-#include "windowing/WindowingFactory.h"
-#endif
 
 #ifdef TARGET_WINDOWS
 #pragma comment(lib, "avcodec.lib")
@@ -357,104 +353,6 @@ bool CDVDCodecUtils::CopyYUV422PackedPicture(YV12Image* pImage, DVDVideoPicture
   return true;
 }
 
-bool CDVDCodecUtils::CopyDXVA2Picture(YV12Image* pImage, DVDVideoPicture *pSrc)
-{
-#ifdef HAS_DX
-  HRESULT hr;
-  switch (pSrc->extended_format)
-  {
-    case DXGI_FORMAT_NV12: // MAKEFOURCC('N', 'V', '1', '2'):
-    // Future...
-    //case DXGI_FORMAT_420_OPAQUE: // MAKEFOURCC('Y', 'V', '1', '2'):
-    //case MAKEFOURCC('Y','V','V','Y'): - what is it?
-      break;
-    default:
-      CLog::Log(LOGWARNING, "CDVDCodecUtils::CopyDXVA2Picture colorspace not supported");
-      return false;
-  }
-
-  // TODO: Optimize this later using shaders/swscale/etc. 
-  ID3D11VideoDecoderOutputView* view = reinterpret_cast<ID3D11VideoDecoderOutputView*>(pSrc->dxva->view);
-  if (!view)
-    return false;
-
-  ID3D11Resource* resource = nullptr;
-  ID3D11Texture2D* surface = nullptr;
-  view->GetResource(&resource);
-  hr = resource->QueryInterface(__uuidof(ID3D11Texture2D), reinterpret_cast<void**>(&surface));
-  SAFE_RELEASE(resource);
-  if (FAILED(hr))
-    return false;
-
-  D3D11_VIDEO_DECODER_OUTPUT_VIEW_DESC vpivd;
-  view->GetDesc(&vpivd);
-
-  D3D11_TEXTURE2D_DESC tDesc;
-  surface->GetDesc(&tDesc);
-
-  int subresource = D3D11CalcSubresource(0, vpivd.Texture2D.ArraySlice, tDesc.MipLevels);
-
-  // we cannot read from dxva decoder texture so create new one with read access and copy content to it.
-  CD3D11_TEXTURE2D_DESC sDesc(tDesc);
-  sDesc.ArraySize = 1;
-  sDesc.Usage = D3D11_USAGE_STAGING;
-  sDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
-  sDesc.BindFlags = 0;
-
-  ID3D11Texture2D* staging = nullptr;
-  hr = g_Windowing.Get3D11Device()->CreateTexture2D(&sDesc, nullptr, &staging);
-  if (FAILED(hr))
-  {
-    SAFE_RELEASE(surface);
-    return false;
-  }
-
-  ID3D11DeviceContext* pContext = g_Windowing.GetImmediateContext();
-  // copy content from decoder texture to temporary texture.
-  pContext->CopySubresourceRegion(staging, D3D11CalcSubresource(0, 0, tDesc.MipLevels), 0, 0, 0, surface, subresource, nullptr);
-
-  D3D11_MAPPED_SUBRESOURCE rectangle;
-  if (FAILED(pContext->Map(staging, 0, D3D11_MAP_READ, 0, &rectangle)))
-    return false;
-
-  switch (pSrc->extended_format)
-  {
-  case DXGI_FORMAT_NV12:
-    {
-      uint8_t* s_y = (uint8_t*)(rectangle.pData);
-      uint8_t* d_y = pImage->plane[0];
-      uint8_t *s_uv = ((uint8_t*)(rectangle.pData)) + sDesc.Height * rectangle.RowPitch;
-      uint8_t *d_uv = pImage->plane[1];
-      for (unsigned y = 0; y < pSrc->iHeight >> 1; ++y)
-      {
-        // Copy Y
-        memcpy(d_y, s_y, pSrc->iWidth);
-        s_y += rectangle.RowPitch;
-        d_y += pImage->stride[0];
-        // Copy Y
-        memcpy(d_y, s_y, pSrc->iWidth);
-        s_y += rectangle.RowPitch;
-        d_y += pImage->stride[0];
-        // Copy UV
-        memcpy(d_uv, s_uv, pSrc->iWidth);
-        s_uv += rectangle.RowPitch;
-        d_uv += pImage->stride[1];
-      }
-    }
-    break;
-  case DXGI_FORMAT_420_OPAQUE:
-    // not implemented yet
-    break;
-  }
-  pContext->Unmap(staging, 0);
-  SAFE_RELEASE(surface);
-  SAFE_RELEASE(staging);
-  return true;
-
-#endif // HAS_DX
-  return false;
-}
-
 bool CDVDCodecUtils::IsVP3CompatibleWidth(int width)
 {
   // known hardware limitation of purevideo 3 (VP3). (the Nvidia 9400 is a purevideo 3 chip)
diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.h b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.h
index c8ed9b9ab160c..13587feabd144 100644
--- a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.h
+++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.h
@@ -37,7 +37,6 @@ class CDVDCodecUtils
   static DVDVideoPicture* ConvertToYUV422PackedPicture(DVDVideoPicture *pSrc, ERenderFormat format);
   static bool CopyNV12Picture(YV12Image* pImage, DVDVideoPicture *pSrc);
   static bool CopyYUV422PackedPicture(YV12Image* pImage, DVDVideoPicture *pSrc);
-  static bool CopyDXVA2Picture(YV12Image* pImage, DVDVideoPicture *pSrc);
 
   static bool IsVP3CompatibleWidth(int width);
 
diff --git a/xbmc/utils/win32/gpu_memcpy_sse4.h b/xbmc/utils/win32/gpu_memcpy_sse4.h
new file mode 100644
index 0000000000000..5fbea9d99d675
--- /dev/null
+++ b/xbmc/utils/win32/gpu_memcpy_sse4.h
@@ -0,0 +1,128 @@
+/*
+ *      Copyright (C) 2011-2015 Hendrik Leppkes
+ *      http://www.1f0.de
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ *  Taken from the QuickSync decoder by Eric Gur
+ */
+
+#pragma once
+
+#include <smmintrin.h> // SSE4.1 (_mm_stream_load_si128); pulls in the SSE2 header as well
+#include <string.h>    // memcpy
+
+// gpu_memcpy is a memcpy style function that copies data very fast from
+// GPU tiled memory (write back).
+// Returns nullptr without copying when d or s is null and falls back to memcpy when
+// either pointer is not 16-byte aligned. A trailing block shorter than 16 bytes is still
+// fetched with one full 16-byte load, so the source has to be readable up to that boundary.
+// Performance tip: page offset (12 lsb) of both addresses should be different
+//  optimally use a 2K offset between them.
+inline void* gpu_memcpy(void* d, const void* s, size_t size)
+{
+    static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16
+
+    if (d == nullptr || s == nullptr) return nullptr;
+
+    // If memory is not aligned, use memcpy
+    const bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0;
+    if (!isAligned)
+        return memcpy(d, s, size);
+
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+#ifdef _M_X64
+    __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+#endif
+
+    size_t remainder = size & (regsInLoop * sizeof(xmm0) - 1); // Copy 128 or 256 bytes every loop
+    size_t end = 0;
+
+    __m128i* pTrg = (__m128i*)d;
+    __m128i* pTrgEnd = pTrg + ((size - remainder) >> 4);
+    __m128i* pSrc = (__m128i*)s;
+
+    // Make sure source is synced - doesn't hurt if not needed.
+    _mm_sfence();
+
+    while (pTrg < pTrgEnd)
+    {
+        // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA
+        // Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad)
+        xmm0  = _mm_stream_load_si128(pSrc);
+        xmm1  = _mm_stream_load_si128(pSrc + 1);
+        xmm2  = _mm_stream_load_si128(pSrc + 2);
+        xmm3  = _mm_stream_load_si128(pSrc + 3);
+        xmm4  = _mm_stream_load_si128(pSrc + 4);
+        xmm5  = _mm_stream_load_si128(pSrc + 5);
+        xmm6  = _mm_stream_load_si128(pSrc + 6);
+        xmm7  = _mm_stream_load_si128(pSrc + 7);
+#ifdef _M_X64 // Use all 16 xmm registers
+        xmm8  = _mm_stream_load_si128(pSrc + 8);
+        xmm9  = _mm_stream_load_si128(pSrc + 9);
+        xmm10 = _mm_stream_load_si128(pSrc + 10);
+        xmm11 = _mm_stream_load_si128(pSrc + 11);
+        xmm12 = _mm_stream_load_si128(pSrc + 12);
+        xmm13 = _mm_stream_load_si128(pSrc + 13);
+        xmm14 = _mm_stream_load_si128(pSrc + 14);
+        xmm15 = _mm_stream_load_si128(pSrc + 15);
+#endif
+        pSrc += regsInLoop;
+        // _mm_store_si128 emits the SSE2 instruction MOVDQA (aligned store)
+        _mm_store_si128(pTrg     , xmm0);
+        _mm_store_si128(pTrg +  1, xmm1);
+        _mm_store_si128(pTrg +  2, xmm2);
+        _mm_store_si128(pTrg +  3, xmm3);
+        _mm_store_si128(pTrg +  4, xmm4);
+        _mm_store_si128(pTrg +  5, xmm5);
+        _mm_store_si128(pTrg +  6, xmm6);
+        _mm_store_si128(pTrg +  7, xmm7);
+#ifdef _M_X64 // Use all 16 xmm registers
+        _mm_store_si128(pTrg +  8, xmm8);
+        _mm_store_si128(pTrg +  9, xmm9);
+        _mm_store_si128(pTrg + 10, xmm10);
+        _mm_store_si128(pTrg + 11, xmm11);
+        _mm_store_si128(pTrg + 12, xmm12);
+        _mm_store_si128(pTrg + 13, xmm13);
+        _mm_store_si128(pTrg + 14, xmm14);
+        _mm_store_si128(pTrg + 15, xmm15);
+#endif
+        pTrg += regsInLoop;
+    }
+
+    // Copy in 16 byte steps
+    if (remainder >= 16)
+    {
+        size = remainder;
+        remainder = size & 15;
+        end = size >> 4;
+        for (size_t i = 0; i < end; ++i)
+            pTrg[i] = _mm_stream_load_si128(pSrc + i);
+    }
+
+    // Copy last bytes - shouldn't happen as strides are modulo 16
+    if (remainder)
+    {
+        __m128i temp = _mm_stream_load_si128(pSrc + end);
+
+        const char* ps = (const char*)(&temp);
+        char* pt = (char*)(pTrg + end);
+
+        for (size_t i = 0; i < remainder; ++i)
+            pt[i] = ps[i];
+    }
+
+    return d;
+}

From 95d07d3faee82314d94f880421fec87c70d1ca45 Mon Sep 17 00:00:00 2001
From: Anton Fedchin <afedchin@ruswizards.com>
Date: Sat, 18 Jul 2015 13:28:14 +0300
Subject: [PATCH 2/3] [dxva] CProcessorHD::Convert - Optimize method with sse2
 instructions.

---
 project/VS2010Express/XBMC.vcxproj         |   1 +
 project/VS2010Express/XBMC.vcxproj.filters |   3 +
 xbmc/cores/VideoRenderers/DXVAHD.cpp       |  54 ++++++----
 xbmc/utils/win32/memcpy_sse2.h             | 113 +++++++++++++++++++++
 4 files changed, 149 insertions(+), 22 deletions(-)
 create mode 100644 xbmc/utils/win32/memcpy_sse2.h

diff --git a/project/VS2010Express/XBMC.vcxproj b/project/VS2010Express/XBMC.vcxproj
index 764d85cd5631b..199bd8dd20ed2 100644
--- a/project/VS2010Express/XBMC.vcxproj
+++ b/project/VS2010Express/XBMC.vcxproj
@@ -1127,6 +1127,7 @@
     <ClInclude Include="..\..\xbmc\utils\uXstrings.h" />
     <ClInclude Include="..\..\xbmc\utils\Vector.h" />
     <ClInclude Include="..\..\xbmc\utils\win32\gpu_memcpy_sse4.h" />
+    <ClInclude Include="..\..\xbmc\utils\win32\memcpy_sse2.h" />
     <ClInclude Include="..\..\xbmc\utils\win32\Win32InterfaceForCLog.h" />
     <ClInclude Include="..\..\xbmc\utils\win32\Win32Log.h" />
     <ClInclude Include="..\..\xbmc\utils\XSLTUtils.h" />
diff --git a/project/VS2010Express/XBMC.vcxproj.filters b/project/VS2010Express/XBMC.vcxproj.filters
index 66dafd3c347b6..4a8cab1925a25 100644
--- a/project/VS2010Express/XBMC.vcxproj.filters
+++ b/project/VS2010Express/XBMC.vcxproj.filters
@@ -6190,6 +6190,9 @@
     <ClInclude Include="..\..\xbmc\utils\win32\gpu_memcpy_sse4.h">
       <Filter>utils\win32</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\xbmc\utils\win32\memcpy_sse2.h">
+      <Filter>utils\win32</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ResourceCompile Include="..\..\xbmc\win32\XBMC_PC.rc">
diff --git a/xbmc/cores/VideoRenderers/DXVAHD.cpp b/xbmc/cores/VideoRenderers/DXVAHD.cpp
index 8e90a3f9c881f..1ca33dbd194f2 100644
--- a/xbmc/cores/VideoRenderers/DXVAHD.cpp
+++ b/xbmc/cores/VideoRenderers/DXVAHD.cpp
@@ -34,6 +34,7 @@
 #include "settings/MediaSettings.h"
 #include "utils/AutoPtrHandle.h"
 #include "utils/Log.h"
+#include "utils/win32/memcpy_sse2.h"
 #include "win32/WIN32Util.h"
 #include "windowing/WindowingFactory.h"
 
@@ -434,9 +435,9 @@ bool CProcessorHD::CreateSurfaces()
 CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
 {
   // RENDER_FMT_YUV420P -> DXGI_FORMAT_NV12
-  // RENDER_FMT_YUV420P10 -> DXGI_FORMAT_P010/DXGI_FORMAT_Y410
-  // RENDER_FMT_YUV420P16 -> DXGI_FORMAT_P016/DXGI_FORMAT_Y416
-  if (picture->format != RENDER_FMT_YUV420P
+  // RENDER_FMT_YUV420P10 -> DXGI_FORMAT_P010
+  // RENDER_FMT_YUV420P16 -> DXGI_FORMAT_P016
+  if ( picture->format != RENDER_FMT_YUV420P
     && picture->format != RENDER_FMT_YUV420P10
     && picture->format != RENDER_FMT_YUV420P16)
   {
@@ -470,28 +471,38 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
     return nullptr;
   }
 
-  // Convert to NV12 - Luma
-  // TODO: Optimize this later using shaders/swscale/etc.
-  uint8_t *s = picture->data[0];
-  uint8_t* bits = (uint8_t*)rectangle.pData;
-  for (unsigned y = 0; y < picture->iHeight; y++)
+  if (picture->format == RENDER_FMT_YUV420P)
   {
-    memcpy(bits, s, picture->iWidth);
-    s += picture->iLineSize[0];
-    bits += rectangle.RowPitch;
+    uint8_t*  pData = static_cast<uint8_t*>(rectangle.pData);
+    uint8_t*  dst[] = { pData, pData + sDesc.Height * rectangle.RowPitch };
+    int dstStride[] = { rectangle.RowPitch, rectangle.RowPitch };
+    convert_yuv420_nv12(picture->data, picture->iLineSize, picture->iHeight, picture->iWidth, dst, dstStride);
   }
-
-  // Convert to NV12 - Chroma
-  uint8_t *s_u, *s_v, *d_uv;
-  for (unsigned y = 0; y < picture->iHeight / 2; y++)
+  else
   {
-    s_u = picture->data[1] + y * picture->iLineSize[1];
-    s_v = picture->data[2] + y * picture->iLineSize[2];
-    d_uv = (uint8_t*)rectangle.pData + (sDesc.Height + y) * rectangle.RowPitch;
-    for (unsigned x = 0; x < picture->iWidth / 2; x++)
+    // TODO: Optimize this later using sse2/sse4
+    uint16_t * d_y = static_cast<uint16_t*>(rectangle.pData);
+    uint16_t * d_uv = d_y + sDesc.Height * rectangle.RowPitch;
+    // Convert to NV12 - Luma
+    for (size_t line = 0; line < picture->iHeight; ++line)
     {
-      *d_uv++ = *s_u++;
-      *d_uv++ = *s_v++;
+      uint16_t * y = (uint16_t*)(picture->data[0] + picture->iLineSize[0] * line);
+      uint16_t * d = (uint16_t*)((uint8_t*)d_y + rectangle.RowPitch * line); // RowPitch is in bytes, step in bytes
+      memcpy(d, y, picture->iLineSize[0]);
+    }
+    // Convert to NV12 - Chroma
+    size_t chromaWidth = (picture->iWidth + 1) >> 1;
+    size_t chromaHeight = picture->iHeight >> 1;
+    for (size_t line = 0; line < chromaHeight; ++line)
+    {
+      uint16_t * u = (uint16_t*)(picture->data[1] + line * picture->iLineSize[1]); // line sizes are byte counts
+      uint16_t * v = (uint16_t*)(picture->data[2] + line * picture->iLineSize[2]);
+      uint16_t * d = (uint16_t*)((uint8_t*)d_uv + line * rectangle.RowPitch); // byte pitch, as for luma
+      for (size_t x = 0; x < chromaWidth; x++)
+      {
+        *d++ = *u++; 
+        *d++ = *v++;
+      }
     }
   }
   pContext->Unmap(texture, subresource);
@@ -503,7 +514,6 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
   return pic;
 }
 
-
 bool CProcessorHD::ApplyFilter(D3D11_VIDEO_PROCESSOR_FILTER filter, int value, int min, int max, int def)
 {
   if (filter >= NUM_FILTERS)
diff --git a/xbmc/utils/win32/memcpy_sse2.h b/xbmc/utils/win32/memcpy_sse2.h
new file mode 100644
index 0000000000000..c585136547487
--- /dev/null
+++ b/xbmc/utils/win32/memcpy_sse2.h
@@ -0,0 +1,113 @@
+/*
+*      Copyright (C) 2005-2015 Team Kodi
+*      http://kodi.tv
+*
+*  This library is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  This library is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with this library; if not, write to the Free Software
+*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*
+*/
+
+#pragma once
+
+#include <emmintrin.h>
+#include <stdint.h>
+#include <string.h>
+
+// Copies size bytes from src to dst with aligned SSE2 loads and non-temporal stores and returns dst.
+// Pointers that are not 16-byte aligned are handed to plain memcpy instead.
+inline void* memcpy_aligned(void* dst, const void* src, size_t size)
+{
+  if ((((size_t)(src) | (size_t)(dst)) & 0xF))
+    return memcpy(dst, src, size);
+  uint8_t* d = (uint8_t*)(dst);
+  const uint8_t* s = (const uint8_t*)(src);
+  size_t i = 0;
+  for (; i + 64 <= size; i += 64) // not "i < size - 63": that wraps around when size < 63
+  {
+    const __m128i xmm1 = _mm_load_si128((const __m128i*)(s + i +  0));
+    const __m128i xmm2 = _mm_load_si128((const __m128i*)(s + i + 16));
+    const __m128i xmm3 = _mm_load_si128((const __m128i*)(s + i + 32));
+    const __m128i xmm4 = _mm_load_si128((const __m128i*)(s + i + 48));
+    _mm_stream_si128((__m128i*)(d + i +  0), xmm1);
+    _mm_stream_si128((__m128i*)(d + i + 16), xmm2);
+    _mm_stream_si128((__m128i*)(d + i + 32), xmm3);
+    _mm_stream_si128((__m128i*)(d + i + 48), xmm4);
+  }
+  for (; i + 16 <= size; i += 16)
+    _mm_stream_si128((__m128i*)(d + i), _mm_load_si128((const __m128i*)(s + i)));
+  if (i < size) // tail shorter than one vector: copy exactly what is left, nothing beyond size
+    memcpy(d + i, s + i, size - i);
+  return dst;
+}
+
+// Converts planar YUV 4:2:0 (src[0..2] = Y, U, V planes) to NV12 (dst[0] = Y, dst[1] = interleaved UV).
+// Strides are in bytes. Chroma rows are interleaved with SSE2 when all three row pointers are
+// 16-byte aligned; unaligned rows and the tail of each row are interleaved byte by byte.
+inline void convert_yuv420_nv12(uint8_t *const src[], const int srcStride[], int height, int width, uint8_t *const dst[], const int dstStride[])
+{
+  // Convert to NV12 - Luma
+  if (srcStride[0] == dstStride[0])
+    memcpy_aligned(dst[0], src[0], srcStride[0] * height);
+  else
+  {
+    // copy no more than the narrower of the two rows so neither buffer is overrun
+    const int rowBytes = srcStride[0] < dstStride[0] ? srcStride[0] : dstStride[0];
+    for (int line = 0; line < height; ++line)
+      memcpy_aligned(dst[0] + dstStride[0] * line, src[0] + srcStride[0] * line, rowBytes);
+  }
+
+  // Convert to NV12 - Chroma
+  const size_t chromaWidth = (width + 1) >> 1;
+  const size_t chromaHeight = height >> 1;
+  for (size_t line = 0; line < chromaHeight; ++line)
+  {
+    const uint8_t * u = src[1] + line * srcStride[1];
+    const uint8_t * v = src[2] + line * srcStride[2];
+    uint8_t * d = dst[1] + line * dstStride[1];
+    size_t i = 0;
+
+    if ((((size_t)(u) | (size_t)(v) | (size_t)(d)) & 0xF) == 0)
+    {
+      // "i + N <= chromaWidth" cannot wrap around for narrow pictures the way "chromaWidth - 31" did
+      for (; i + 32 <= chromaWidth; i += 32)
+      {
+        const __m128i u0 = _mm_load_si128((const __m128i*)(u + i));
+        const __m128i v0 = _mm_load_si128((const __m128i*)(v + i));
+        const __m128i u1 = _mm_load_si128((const __m128i*)(u + i + 16));
+        const __m128i v1 = _mm_load_si128((const __m128i*)(v + i + 16));
+
+        _mm_stream_si128((__m128i *)(d + (i << 1) + 0), _mm_unpacklo_epi8(u0, v0));
+        _mm_stream_si128((__m128i *)(d + (i << 1) + 16), _mm_unpackhi_epi8(u0, v0));
+        _mm_stream_si128((__m128i *)(d + (i << 1) + 32), _mm_unpacklo_epi8(u1, v1));
+        _mm_stream_si128((__m128i *)(d + (i << 1) + 48), _mm_unpackhi_epi8(u1, v1));
+      }
+      for (; i + 16 <= chromaWidth; i += 16)
+      {
+        const __m128i u0 = _mm_load_si128((const __m128i*)(u + i));
+        const __m128i v0 = _mm_load_si128((const __m128i*)(v + i));
+
+        _mm_stream_si128((__m128i *)(d + (i << 1) + 0), _mm_unpacklo_epi8(u0, v0));
+        _mm_stream_si128((__m128i *)(d + (i << 1) + 16), _mm_unpackhi_epi8(u0, v0));
+      }
+    }
+
+    // remaining samples (fewer than 16, or the whole row when it is not aligned)
+    for (; i < chromaWidth; ++i)
+    {
+      d[(i << 1) + 0] = u[i];
+      d[(i << 1) + 1] = v[i];
+    }
+  }
+  _mm_sfence(); // fence after the streaming stores instead of before any of them was issued
+}

From f0a82491eec5848501878d50495e4a0fdf6a52d6 Mon Sep 17 00:00:00 2001
From: Anton Fedchin <afedchin@ruswizards.com>
Date: Mon, 27 Jul 2015 17:21:34 +0300
Subject: [PATCH 3/3] [dx11] CProcessorHD: Get rid of unneeded std::map.

---
 xbmc/cores/VideoRenderers/DXVAHD.cpp | 67 ++++++++++++----------------
 xbmc/cores/VideoRenderers/DXVAHD.h   |  2 +-
 2 files changed, 30 insertions(+), 39 deletions(-)

diff --git a/xbmc/cores/VideoRenderers/DXVAHD.cpp b/xbmc/cores/VideoRenderers/DXVAHD.cpp
index 1ca33dbd194f2..fb21b70f2ff6e 100644
--- a/xbmc/cores/VideoRenderers/DXVAHD.cpp
+++ b/xbmc/cores/VideoRenderers/DXVAHD.cpp
@@ -59,7 +59,6 @@ CProcessorHD::CProcessorHD()
   g_Windowing.Register(this);
 
   m_context = nullptr;
-  m_mappedResource.clear();
   m_width = 0;
   m_height = 0;
 }
@@ -84,12 +83,6 @@ void CProcessorHD::Close()
   SAFE_RELEASE(m_pEnumerator);
   SAFE_RELEASE(m_pVideoProcessor);
   SAFE_RELEASE(m_context);
-  std::map<ID3D11VideoProcessorInputView*, ID3D11Texture2D*>::iterator it = m_mappedResource.begin();
-  for (; it != m_mappedResource.end(); ++it)
-  {
-    if (it->second) it->second->Release();
-  }
-  m_mappedResource.clear();
 }
 
 bool CProcessorHD::UpdateSize(const DXVA2_VideoDesc& dsc)
@@ -125,6 +118,7 @@ bool CProcessorHD::PreInit()
     return false;
   }
 
+  memset(&m_texDesc, 0, sizeof(D3D11_TEXTURE2D_DESC));
   return true;
 }
 
@@ -382,42 +376,38 @@ bool CProcessorHD::OpenProcessor()
 
 bool CProcessorHD::CreateSurfaces()
 {
+  HRESULT hr;
+  size_t idx;
   ID3D11Device* pD3DDevice = g_Windowing.Get3D11Device();
 
   // we cannot use texture array (like in decoder) for USAGE_DYNAMIC, so create separete textures
-  CD3D11_TEXTURE2D_DESC desc(m_textureFormat, (m_width + 15) & ~15, (m_height + 15) & ~15, 1, 1, D3D11_BIND_DECODER, D3D11_USAGE_DYNAMIC, D3D11_CPU_ACCESS_WRITE);
-  D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC pivd = {0};
-  pivd.FourCC = 0;
-  pivd.ViewDimension = D3D11_VPIV_DIMENSION_TEXTURE2D;
+  CD3D11_TEXTURE2D_DESC texDesc(m_textureFormat, FFALIGN(m_width, 16), FFALIGN(m_height, 16), 1, 1, D3D11_BIND_DECODER, D3D11_USAGE_DYNAMIC, D3D11_CPU_ACCESS_WRITE);
+  D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC pivd = { 0, D3D11_VPIV_DIMENSION_TEXTURE2D };
   pivd.Texture2D.ArraySlice = 0;
   pivd.Texture2D.MipSlice = 0;
 
-  ID3D11Texture2D* resource[32];
-  ID3D11VideoProcessorInputView* views[32];
-  memset(views, 0, 32 * sizeof(ID3D11VideoProcessorInputView*));
-  memset(resource, 0, 32 * sizeof(ID3D11Texture2D*));
-  bool needRelease = false;
-
+  ID3D11VideoProcessorInputView* views[32] = { 0 };
   CLog::Log(LOGDEBUG, "%s - Creating %d processor surfaces with format %d.", __FUNCTION__, m_size, m_textureFormat);
 
-  for (unsigned idx = 0; idx < m_size; idx++)
+  for (idx = 0; idx < m_size; idx++)
   {
-    if ( FAILED(pD3DDevice->CreateTexture2D(&desc, NULL, &resource[idx]))
-      || FAILED(m_pVideoDevice->CreateVideoProcessorInputView(resource[idx], m_pEnumerator, &pivd, &views[idx])))
-    {
-      SAFE_RELEASE(resource[idx]);
-      SAFE_RELEASE(views[idx]);
-      needRelease = true;
-    }
+    ID3D11Texture2D* pTexture = nullptr;
+    hr = pD3DDevice->CreateTexture2D(&texDesc, NULL, &pTexture);
+    if (FAILED(hr))
+      break;
+
+    hr = m_pVideoDevice->CreateVideoProcessorInputView(pTexture, m_pEnumerator, &pivd, &views[idx]);
+    SAFE_RELEASE(pTexture);
+    if (FAILED(hr))
+      break;
   }
 
-  if (needRelease) 
+  if (idx != m_size)
   {
+    // something went wrong
     CLog::Log(LOGERROR, "%s - Failed to create processor surfaces.", __FUNCTION__);
-
     for (unsigned idx = 0; idx < m_size; idx++)
     {
-      SAFE_RELEASE(resource[idx]);
       SAFE_RELEASE(views[idx]);
     }
     return false;
@@ -426,9 +416,10 @@ bool CProcessorHD::CreateSurfaces()
   m_context = new CSurfaceContext();
   for (unsigned int i = 0; i < m_size; i++)
   {
-    m_mappedResource[views[i]] = resource[i];
     m_context->AddSurface(views[i]);
   }
+
+  m_texDesc = texDesc;
   return true;
 }
 
@@ -453,18 +444,17 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
   }
 
   ID3D11VideoProcessorInputView* view = reinterpret_cast<ID3D11VideoProcessorInputView*>(pView);
-  ID3D11Texture2D* texture = m_mappedResource[view];
+  ID3D11Resource* pResource = nullptr;
+  view->GetResource(&pResource);
 
   D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC vpivd;
   view->GetDesc(&vpivd);
-  int subresource = D3D11CalcSubresource(0, vpivd.Texture2D.ArraySlice, 1);
-
-  D3D11_TEXTURE2D_DESC sDesc;
-  texture->GetDesc(&sDesc);
+  UINT subresource = D3D11CalcSubresource(0, vpivd.Texture2D.ArraySlice, 1);
 
   D3D11_MAPPED_SUBRESOURCE rectangle;
   ID3D11DeviceContext* pContext = g_Windowing.GetImmediateContext();
-  if (FAILED(pContext->Map(texture, subresource, D3D11_MAP_WRITE_DISCARD, 0, &rectangle)))
+  if (FAILED(pContext->Map(pResource, subresource, D3D11_MAP_WRITE_DISCARD, 0, &rectangle)))
   {
     CLog::Log(LOGERROR, "%s - could not lock rect", __FUNCTION__);
+    SAFE_RELEASE(pResource); // GetResource() handed out a reference; drop it on this early-out too
     m_context->ClearReference(view);
@@ -474,7 +464,7 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
   if (picture->format == RENDER_FMT_YUV420P)
   {
     uint8_t*  pData = static_cast<uint8_t*>(rectangle.pData);
-    uint8_t*  dst[] = { pData, pData + sDesc.Height * rectangle.RowPitch };
+    uint8_t*  dst[] = { pData, pData + m_texDesc.Height * rectangle.RowPitch };
     int dstStride[] = { rectangle.RowPitch, rectangle.RowPitch };
     convert_yuv420_nv12(picture->data, picture->iLineSize, picture->iHeight, picture->iWidth, dst, dstStride);
   }
@@ -482,7 +472,7 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
   {
     // TODO: Optimize this later using sse2/sse4
     uint16_t * d_y = static_cast<uint16_t*>(rectangle.pData);
-    uint16_t * d_uv = d_y + sDesc.Height * rectangle.RowPitch;
+    uint16_t * d_uv = reinterpret_cast<uint16_t*>(static_cast<uint8_t*>(rectangle.pData) + m_texDesc.Height * rectangle.RowPitch); // RowPitch is a byte count
     // Convert to NV12 - Luma
     for (size_t line = 0; line < picture->iHeight; ++line)
     {
@@ -505,7 +495,8 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
       }
     }
   }
-  pContext->Unmap(texture, subresource);
+  pContext->Unmap(pResource, subresource);
+  SAFE_RELEASE(pResource);
 
   m_context->ClearReference(view);
   m_context->MarkRender(view);
diff --git a/xbmc/cores/VideoRenderers/DXVAHD.h b/xbmc/cores/VideoRenderers/DXVAHD.h
index 609c7ee3a8466..154668f12b911 100644
--- a/xbmc/cores/VideoRenderers/DXVAHD.h
+++ b/xbmc/cores/VideoRenderers/DXVAHD.h
@@ -98,7 +98,7 @@ class CProcessorHD : ID3DResource
 
   unsigned int                    m_procIndex;
   D3D11_VIDEO_PROCESSOR_RATE_CONVERSION_CAPS m_rateCaps;
-  std::map<ID3D11VideoProcessorInputView*, ID3D11Texture2D*> m_mappedResource;
+  D3D11_TEXTURE2D_DESC            m_texDesc;
 };
 
 };