diff --git a/project/VS2010Express/XBMC.vcxproj b/project/VS2010Express/XBMC.vcxproj
index f876e46f8e27f..199bd8dd20ed2 100644
--- a/project/VS2010Express/XBMC.vcxproj
+++ b/project/VS2010Express/XBMC.vcxproj
@@ -1126,6 +1126,8 @@
+
+
diff --git a/project/VS2010Express/XBMC.vcxproj.filters b/project/VS2010Express/XBMC.vcxproj.filters
index 79ad351f86b70..4a8cab1925a25 100644
--- a/project/VS2010Express/XBMC.vcxproj.filters
+++ b/project/VS2010Express/XBMC.vcxproj.filters
@@ -6187,6 +6187,12 @@
video\jobs
+
+ utils\win32
+
+
+ utils\win32
+
diff --git a/xbmc/cores/VideoRenderers/DXVAHD.cpp b/xbmc/cores/VideoRenderers/DXVAHD.cpp
index 8e90a3f9c881f..fb21b70f2ff6e 100644
--- a/xbmc/cores/VideoRenderers/DXVAHD.cpp
+++ b/xbmc/cores/VideoRenderers/DXVAHD.cpp
@@ -34,6 +34,7 @@
#include "settings/MediaSettings.h"
#include "utils/AutoPtrHandle.h"
#include "utils/Log.h"
+#include "utils/win32/memcpy_sse2.h"
#include "win32/WIN32Util.h"
#include "windowing/WindowingFactory.h"
@@ -58,7 +59,6 @@ CProcessorHD::CProcessorHD()
g_Windowing.Register(this);
m_context = nullptr;
- m_mappedResource.clear();
m_width = 0;
m_height = 0;
}
@@ -83,12 +83,6 @@ void CProcessorHD::Close()
SAFE_RELEASE(m_pEnumerator);
SAFE_RELEASE(m_pVideoProcessor);
SAFE_RELEASE(m_context);
- std::map::iterator it = m_mappedResource.begin();
- for (; it != m_mappedResource.end(); ++it)
- {
- if (it->second) it->second->Release();
- }
- m_mappedResource.clear();
}
bool CProcessorHD::UpdateSize(const DXVA2_VideoDesc& dsc)
@@ -124,6 +118,7 @@ bool CProcessorHD::PreInit()
return false;
}
+ memset(&m_texDesc, 0, sizeof(D3D11_TEXTURE2D_DESC));
return true;
}
@@ -381,42 +376,38 @@ bool CProcessorHD::OpenProcessor()
bool CProcessorHD::CreateSurfaces()
{
+ HRESULT hr;
+ size_t idx;
ID3D11Device* pD3DDevice = g_Windowing.Get3D11Device();
// we cannot use texture array (like in decoder) for USAGE_DYNAMIC, so create separete textures
- CD3D11_TEXTURE2D_DESC desc(m_textureFormat, (m_width + 15) & ~15, (m_height + 15) & ~15, 1, 1, D3D11_BIND_DECODER, D3D11_USAGE_DYNAMIC, D3D11_CPU_ACCESS_WRITE);
- D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC pivd = {0};
- pivd.FourCC = 0;
- pivd.ViewDimension = D3D11_VPIV_DIMENSION_TEXTURE2D;
+ CD3D11_TEXTURE2D_DESC texDesc(m_textureFormat, FFALIGN(m_width, 16), FFALIGN(m_height, 16), 1, 1, D3D11_BIND_DECODER, D3D11_USAGE_DYNAMIC, D3D11_CPU_ACCESS_WRITE);
+ D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC pivd = { 0, D3D11_VPIV_DIMENSION_TEXTURE2D };
pivd.Texture2D.ArraySlice = 0;
pivd.Texture2D.MipSlice = 0;
- ID3D11Texture2D* resource[32];
- ID3D11VideoProcessorInputView* views[32];
- memset(views, 0, 32 * sizeof(ID3D11VideoProcessorInputView*));
- memset(resource, 0, 32 * sizeof(ID3D11Texture2D*));
- bool needRelease = false;
-
+ ID3D11VideoProcessorInputView* views[32] = { 0 };
CLog::Log(LOGDEBUG, "%s - Creating %d processor surfaces with format %d.", __FUNCTION__, m_size, m_textureFormat);
- for (unsigned idx = 0; idx < m_size; idx++)
+ for (idx = 0; idx < m_size; idx++)
{
- if ( FAILED(pD3DDevice->CreateTexture2D(&desc, NULL, &resource[idx]))
- || FAILED(m_pVideoDevice->CreateVideoProcessorInputView(resource[idx], m_pEnumerator, &pivd, &views[idx])))
- {
- SAFE_RELEASE(resource[idx]);
- SAFE_RELEASE(views[idx]);
- needRelease = true;
- }
+ ID3D11Texture2D* pTexture = nullptr;
+ hr = pD3DDevice->CreateTexture2D(&texDesc, NULL, &pTexture);
+ if (FAILED(hr))
+ break;
+
+ hr = m_pVideoDevice->CreateVideoProcessorInputView(pTexture, m_pEnumerator, &pivd, &views[idx]);
+ SAFE_RELEASE(pTexture);
+ if (FAILED(hr))
+ break;
}
- if (needRelease)
+ if (idx != m_size)
{
+ // something goes wrong
CLog::Log(LOGERROR, "%s - Failed to create processor surfaces.", __FUNCTION__);
-
for (unsigned idx = 0; idx < m_size; idx++)
{
- SAFE_RELEASE(resource[idx]);
SAFE_RELEASE(views[idx]);
}
return false;
@@ -425,18 +416,19 @@ bool CProcessorHD::CreateSurfaces()
m_context = new CSurfaceContext();
for (unsigned int i = 0; i < m_size; i++)
{
- m_mappedResource[views[i]] = resource[i];
m_context->AddSurface(views[i]);
}
+
+ m_texDesc = texDesc;
return true;
}
CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
{
// RENDER_FMT_YUV420P -> DXGI_FORMAT_NV12
- // RENDER_FMT_YUV420P10 -> DXGI_FORMAT_P010/DXGI_FORMAT_Y410
- // RENDER_FMT_YUV420P16 -> DXGI_FORMAT_P016/DXGI_FORMAT_Y416
- if (picture->format != RENDER_FMT_YUV420P
+ // RENDER_FMT_YUV420P10 -> DXGI_FORMAT_P010
+ // RENDER_FMT_YUV420P16 -> DXGI_FORMAT_P016
+ if ( picture->format != RENDER_FMT_YUV420P
&& picture->format != RENDER_FMT_YUV420P10
&& picture->format != RENDER_FMT_YUV420P16)
{
@@ -452,49 +444,59 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
}
ID3D11VideoProcessorInputView* view = reinterpret_cast(pView);
- ID3D11Texture2D* texture = m_mappedResource[view];
+
+ ID3D11Resource* pResource = nullptr;
+ view->GetResource(&pResource);
D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC vpivd;
view->GetDesc(&vpivd);
- int subresource = D3D11CalcSubresource(0, vpivd.Texture2D.ArraySlice, 1);
-
- D3D11_TEXTURE2D_DESC sDesc;
- texture->GetDesc(&sDesc);
+ UINT subresource = D3D11CalcSubresource(0, vpivd.Texture2D.ArraySlice, 1);
D3D11_MAPPED_SUBRESOURCE rectangle;
ID3D11DeviceContext* pContext = g_Windowing.GetImmediateContext();
- if (FAILED(pContext->Map(texture, subresource, D3D11_MAP_WRITE_DISCARD, 0, &rectangle)))
+ if (FAILED(pContext->Map(pResource, subresource, D3D11_MAP_WRITE_DISCARD, 0, &rectangle)))
{
CLog::Log(LOGERROR, "%s - could not lock rect", __FUNCTION__);
m_context->ClearReference(view);
return nullptr;
}
- // Convert to NV12 - Luma
- // TODO: Optimize this later using shaders/swscale/etc.
- uint8_t *s = picture->data[0];
- uint8_t* bits = (uint8_t*)rectangle.pData;
- for (unsigned y = 0; y < picture->iHeight; y++)
+ if (picture->format == RENDER_FMT_YUV420P)
{
- memcpy(bits, s, picture->iWidth);
- s += picture->iLineSize[0];
- bits += rectangle.RowPitch;
+ uint8_t* pData = static_cast(rectangle.pData);
+ uint8_t* dst[] = { pData, pData + m_texDesc.Height * rectangle.RowPitch };
+ int dstStride[] = { rectangle.RowPitch, rectangle.RowPitch };
+ convert_yuv420_nv12(picture->data, picture->iLineSize, picture->iHeight, picture->iWidth, dst, dstStride);
}
-
- // Convert to NV12 - Chroma
- uint8_t *s_u, *s_v, *d_uv;
- for (unsigned y = 0; y < picture->iHeight / 2; y++)
+ else
{
- s_u = picture->data[1] + y * picture->iLineSize[1];
- s_v = picture->data[2] + y * picture->iLineSize[2];
- d_uv = (uint8_t*)rectangle.pData + (sDesc.Height + y) * rectangle.RowPitch;
- for (unsigned x = 0; x < picture->iWidth / 2; x++)
+ // TODO: Optimize this later using sse2/sse4
+ uint16_t * d_y = static_cast(rectangle.pData);
+ uint16_t * d_uv = d_y + m_texDesc.Height * rectangle.RowPitch;
+ // Convert to NV12 - Luma
+ for (size_t line = 0; line < picture->iHeight; ++line)
{
- *d_uv++ = *s_u++;
- *d_uv++ = *s_v++;
+ uint16_t * y = (uint16_t*)(picture->data[0] + picture->iLineSize[0] * line);
+ uint16_t * d = d_y + rectangle.RowPitch * line;
+ memcpy(d, y, picture->iLineSize[0]);
+ }
+ // Convert to NV12 - Chroma
+ size_t chromaWidth = (picture->iWidth + 1) >> 1;
+ size_t chromaHeight = picture->iHeight >> 1;
+ for (size_t line = 0; line < chromaHeight; ++line)
+ {
+ uint16_t * u = (uint16_t*)picture->data[1] + line * picture->iLineSize[1];
+ uint16_t * v = (uint16_t*)picture->data[2] + line * picture->iLineSize[2];
+ uint16_t * d = d_uv + line * rectangle.RowPitch;
+ for (size_t x = 0; x < chromaWidth; x++)
+ {
+ *d++ = *u++;
+ *d++ = *v++;
+ }
}
}
- pContext->Unmap(texture, subresource);
+ pContext->Unmap(pResource, subresource);
+ SAFE_RELEASE(pResource);
m_context->ClearReference(view);
m_context->MarkRender(view);
@@ -503,7 +505,6 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
return pic;
}
-
bool CProcessorHD::ApplyFilter(D3D11_VIDEO_PROCESSOR_FILTER filter, int value, int min, int max, int def)
{
if (filter >= NUM_FILTERS)
diff --git a/xbmc/cores/VideoRenderers/DXVAHD.h b/xbmc/cores/VideoRenderers/DXVAHD.h
index 609c7ee3a8466..154668f12b911 100644
--- a/xbmc/cores/VideoRenderers/DXVAHD.h
+++ b/xbmc/cores/VideoRenderers/DXVAHD.h
@@ -98,7 +98,7 @@ class CProcessorHD : ID3DResource
unsigned int m_procIndex;
D3D11_VIDEO_PROCESSOR_RATE_CONVERSION_CAPS m_rateCaps;
- std::map m_mappedResource;
+ D3D11_TEXTURE2D_DESC m_texDesc;
};
};
diff --git a/xbmc/cores/VideoRenderers/RenderManager.cpp b/xbmc/cores/VideoRenderers/RenderManager.cpp
index 09327a37c692a..f35e0b3bbdf01 100644
--- a/xbmc/cores/VideoRenderers/RenderManager.cpp
+++ b/xbmc/cores/VideoRenderers/RenderManager.cpp
@@ -991,10 +991,6 @@ int CXBMCRenderManager::AddVideoPicture(DVDVideoPicture& pic)
{
CDVDCodecUtils::CopyYUV422PackedPicture(&image, &pic);
}
- else if(pic.format == RENDER_FMT_DXVA)
- {
- CDVDCodecUtils::CopyDXVA2Picture(&image, &pic);
- }
#ifdef HAVE_LIBVDPAU
else if(pic.format == RENDER_FMT_VDPAU
|| pic.format == RENDER_FMT_VDPAU_420)
diff --git a/xbmc/cores/VideoRenderers/WinRenderer.cpp b/xbmc/cores/VideoRenderers/WinRenderer.cpp
index 9f3bf1325b0e0..98c1a1c16f336 100644
--- a/xbmc/cores/VideoRenderers/WinRenderer.cpp
+++ b/xbmc/cores/VideoRenderers/WinRenderer.cpp
@@ -30,7 +30,9 @@
#include "settings/MediaSettings.h"
#include "settings/Settings.h"
#include "threads/SingleLock.h"
+#include "utils/CPUInfo.h"
#include "utils/log.h"
+#include "utils/win32/gpu_memcpy_sse4.h"
#include "VideoShaders/WinVideoFilter.h"
#include "windowing/WindowingFactory.h"
@@ -286,6 +288,18 @@ bool CWinRenderer::AddVideoPicture(DVDVideoPicture* picture, int index)
m_frameIdx += 2;
return true;
}
+ else if (picture->format == RENDER_FMT_DXVA)
+ {
+ int source = index;
+ if (source < 0 || NextYV12Texture() < 0)
+ return false;
+
+ YUVBuffer *buf = (YUVBuffer*)m_VideoBuffers[source];
+ if (buf->IsReadyToRender())
+ return false;
+
+ return buf->CopyFromDXVA(reinterpret_cast(picture->dxva->view));
+ }
return false;
}
@@ -1261,6 +1275,7 @@ bool YUVBuffer::Create(ERenderFormat format, unsigned int width, unsigned int he
void YUVBuffer::Release()
{
+ SAFE_RELEASE(m_staging);
for(unsigned i = 0; i < m_activeplanes; i++)
{
planes[i].texture.Release();
@@ -1275,9 +1290,9 @@ void YUVBuffer::StartRender()
m_locked = false;
- for(unsigned i = 0; i < m_activeplanes; i++)
+ for (unsigned i = 0; i < m_activeplanes; i++)
{
- if(planes[i].texture.Get() && planes[i].rect.pData)
+ if (planes[i].texture.Get() && planes[i].rect.pData)
if (!planes[i].texture.UnlockRect(0))
CLog::Log(LOGERROR, __FUNCTION__" - failed to unlock texture %d", i);
memset(&planes[i].rect, 0, sizeof(planes[i].rect));
@@ -1353,10 +1368,104 @@ void YUVBuffer::Clear()
}
bool YUVBuffer::IsReadyToRender()
+{
+ return !m_locked; // ready exactly when the buffer is not locked
+}
+
+bool YUVBuffer::CopyFromDXVA(ID3D11VideoDecoderOutputView* pView) // copies the texture behind pView into a staging texture and then into this buffer's planes; returns false for a null view or when the staging texture cannot be created
+{
+ if (!pView)
+ return false;
+
+ HRESULT hr = S_OK; // stays S_OK when the staging texture already exists
+ D3D11_VIDEO_DECODER_OUTPUT_VIEW_DESC vpivd;
+ pView->GetDesc(&vpivd);
+ ID3D11Resource* resource = nullptr;
+ pView->GetResource(&resource); // released through SAFE_RELEASE(resource) before returning
+
+ if (!m_staging) // staging texture is created once and then reused until Release()
+ {
+ // create staging texture
+ ID3D11Texture2D* surface = nullptr;
+ hr = resource->QueryInterface(__uuidof(ID3D11Texture2D), reinterpret_cast(&surface));
+ if (SUCCEEDED(hr))
+ {
+ D3D11_TEXTURE2D_DESC tDesc;
+ surface->GetDesc(&tDesc);
+ SAFE_RELEASE(surface); // only the description was needed
+
+ CD3D11_TEXTURE2D_DESC sDesc(tDesc); // start from the decoder texture's description
+ sDesc.ArraySize = 1;
+ sDesc.Usage = D3D11_USAGE_STAGING;
+ sDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+ sDesc.BindFlags = 0;
+
+ hr = g_Windowing.Get3D11Device()->CreateTexture2D(&sDesc, nullptr, &m_staging);
+ if (SUCCEEDED(hr))
+ m_sDesc = sDesc; // PerformCopy() reads m_sDesc.Height to find the UV data
+ }
+ }
+
+ if (m_staging)
+ {
+ ID3D11DeviceContext* pContext = g_Windowing.GetImmediateContext();
+ // copy content from decoder texture to temporary texture.
+ pContext->CopySubresourceRegion(m_staging,
+ D3D11CalcSubresource(0, 0, 1),
+ 0, 0, 0,
+ resource,
+ D3D11CalcSubresource(0, vpivd.Texture2D.ArraySlice, 1), // source subresource is the array slice named by the view
+ nullptr);
+ PerformCopy(); // returns without copying when m_locked is false
+ }
+ SAFE_RELEASE(resource);
+
+ return SUCCEEDED(hr); // NOTE(review): PerformCopy() returns void, so this result does not tell whether plane data was actually written
+}
+
+void YUVBuffer::PerformCopy() // copies the mapped staging texture into planes[PLANE_Y] and planes[PLANE_UV]; does nothing unless the buffer is locked
{
if (!m_locked)
- return true;
- return false;
+ return; // not locked: skip the copy
+
+ ID3D11DeviceContext* pContext = g_Windowing.GetImmediateContext();
+ D3D11_MAPPED_SUBRESOURCE rectangle;
+ if (SUCCEEDED(pContext->Map(m_staging, 0, D3D11_MAP_READ, 0, &rectangle))) // a failed Map is skipped without logging
+ {
+ void* (*copy_func)(void* d, const void* s, size_t size) =
+ ((g_cpuInfo.GetCPUFeatures() & CPU_FEATURE_SSE4) != 0) ? gpu_memcpy : memcpy; // gpu_memcpy when the CPU reports SSE4, plain memcpy otherwise
+
+ uint8_t* s_y = static_cast(rectangle.pData);
+ uint8_t *s_uv = static_cast(rectangle.pData) + m_sDesc.Height * rectangle.RowPitch; // UV data is read from m_sDesc.Height rows after the start of the mapping
+ uint8_t* d_y = static_cast(planes[PLANE_Y].rect.pData);
+ uint8_t *d_uv = static_cast(planes[PLANE_UV].rect.pData);
+
+ if ( planes[PLANE_Y ].rect.RowPitch == rectangle.RowPitch
+ && planes[PLANE_UV].rect.RowPitch == rectangle.RowPitch) // identical pitches: each plane is copied with a single call
+ {
+ copy_func(d_y, s_y, rectangle.RowPitch * m_height);
+ copy_func(d_uv, s_uv, rectangle.RowPitch * m_height >> 1);
+ }
+ else
+ {
+ for (unsigned y = 0; y < m_sDesc.Height >> 1; ++y) // NOTE(review): row count comes from m_sDesc.Height here but from m_height in the single-call path - confirm both are meant to match
+ {
+ // Copy Y
+ copy_func(d_y, s_y, planes[PLANE_Y].rect.RowPitch); // NOTE(review): copies a destination pitch worth of bytes out of a source row of rectangle.RowPitch bytes - confirm the destination pitch never exceeds the source pitch
+ s_y += rectangle.RowPitch;
+ d_y += planes[PLANE_Y].rect.RowPitch;
+ // Copy Y (second luma row of this pass)
+ copy_func(d_y, s_y, planes[PLANE_Y].rect.RowPitch);
+ s_y += rectangle.RowPitch;
+ d_y += planes[PLANE_Y].rect.RowPitch;
+ // Copy UV
+ copy_func(d_uv, s_uv, planes[PLANE_UV].rect.RowPitch);
+ s_uv += rectangle.RowPitch;
+ d_uv += planes[PLANE_UV].rect.RowPitch;
+ }
+ }
+ pContext->Unmap(m_staging, 0);
+ }
}
#endif
diff --git a/xbmc/cores/VideoRenderers/WinRenderer.h b/xbmc/cores/VideoRenderers/WinRenderer.h
index a693a84e1db36..79fa33e65f3c6 100644
--- a/xbmc/cores/VideoRenderers/WinRenderer.h
+++ b/xbmc/cores/VideoRenderers/WinRenderer.h
@@ -102,7 +102,10 @@ struct SVideoPlane
struct YUVBuffer : SVideoBuffer
{
- YUVBuffer() : m_width(0), m_height(0), m_format(RENDER_FMT_NONE), m_activeplanes(0), m_locked(false) {}
+ YUVBuffer() : m_width(0), m_height(0), m_format(RENDER_FMT_NONE), m_activeplanes(0), m_locked(false), m_staging(nullptr)
+ {
+ memset(&m_sDesc, 0, sizeof(CD3D11_TEXTURE2D_DESC));
+ }
~YUVBuffer();
bool Create(ERenderFormat format, unsigned int width, unsigned int height, bool dynamic);
virtual void Release();
@@ -111,16 +114,21 @@ struct YUVBuffer : SVideoBuffer
virtual void Clear();
unsigned int GetActivePlanes() { return m_activeplanes; }
virtual bool IsReadyToRender();
+ bool CopyFromDXVA(ID3D11VideoDecoderOutputView* pView);
SVideoPlane planes[MAX_PLANES];
private:
+ void PerformCopy();
+
unsigned int m_width;
unsigned int m_height;
ERenderFormat m_format;
unsigned int m_activeplanes;
bool m_locked;
D3D11_MAP m_mapType;
+ ID3D11Texture2D* m_staging;
+ CD3D11_TEXTURE2D_DESC m_sDesc;
};
struct DXVABuffer : SVideoBuffer
@@ -189,6 +197,7 @@ class CWinRenderer : public CBaseRenderer
void SelectPSVideoFilter();
void UpdatePSVideoFilter();
bool CreateIntermediateRenderTarget(unsigned int width, unsigned int height);
+ bool CopyDXVA2YUVBuffer(ID3D11VideoDecoderOutputView* pView, YUVBuffer *pBuf);
void RenderProcessor(DWORD flags);
int m_iYV12RenderBuffer;
diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp
index 864a5f13818d4..a65af666a9749 100644
--- a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp
+++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp
@@ -24,10 +24,6 @@
#include "utils/log.h"
#include "cores/FFmpeg.h"
#include "Util.h"
-#ifdef HAS_DX
-#include "cores/dvdplayer/DVDCodecs/Video/DXVA.h"
-#include "windowing/WindowingFactory.h"
-#endif
#ifdef TARGET_WINDOWS
#pragma comment(lib, "avcodec.lib")
@@ -357,104 +353,6 @@ bool CDVDCodecUtils::CopyYUV422PackedPicture(YV12Image* pImage, DVDVideoPicture
return true;
}
-bool CDVDCodecUtils::CopyDXVA2Picture(YV12Image* pImage, DVDVideoPicture *pSrc)
-{
-#ifdef HAS_DX
- HRESULT hr;
- switch (pSrc->extended_format)
- {
- case DXGI_FORMAT_NV12: // MAKEFOURCC('N', 'V', '1', '2'):
- // Future...
- //case DXGI_FORMAT_420_OPAQUE: // MAKEFOURCC('Y', 'V', '1', '2'):
- //case MAKEFOURCC('Y','V','V','Y'): - what is it?
- break;
- default:
- CLog::Log(LOGWARNING, "CDVDCodecUtils::CopyDXVA2Picture colorspace not supported");
- return false;
- }
-
- // TODO: Optimize this later using shaders/swscale/etc.
- ID3D11VideoDecoderOutputView* view = reinterpret_cast(pSrc->dxva->view);
- if (!view)
- return false;
-
- ID3D11Resource* resource = nullptr;
- ID3D11Texture2D* surface = nullptr;
- view->GetResource(&resource);
- hr = resource->QueryInterface(__uuidof(ID3D11Texture2D), reinterpret_cast(&surface));
- SAFE_RELEASE(resource);
- if (FAILED(hr))
- return false;
-
- D3D11_VIDEO_DECODER_OUTPUT_VIEW_DESC vpivd;
- view->GetDesc(&vpivd);
-
- D3D11_TEXTURE2D_DESC tDesc;
- surface->GetDesc(&tDesc);
-
- int subresource = D3D11CalcSubresource(0, vpivd.Texture2D.ArraySlice, tDesc.MipLevels);
-
- // we cannot read from dxva decoder texture so create new one with read access and copy content to it.
- CD3D11_TEXTURE2D_DESC sDesc(tDesc);
- sDesc.ArraySize = 1;
- sDesc.Usage = D3D11_USAGE_STAGING;
- sDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
- sDesc.BindFlags = 0;
-
- ID3D11Texture2D* staging = nullptr;
- hr = g_Windowing.Get3D11Device()->CreateTexture2D(&sDesc, nullptr, &staging);
- if (FAILED(hr))
- {
- SAFE_RELEASE(surface);
- return false;
- }
-
- ID3D11DeviceContext* pContext = g_Windowing.GetImmediateContext();
- // copy content from decoder texture to temporary texture.
- pContext->CopySubresourceRegion(staging, D3D11CalcSubresource(0, 0, tDesc.MipLevels), 0, 0, 0, surface, subresource, nullptr);
-
- D3D11_MAPPED_SUBRESOURCE rectangle;
- if (FAILED(pContext->Map(staging, 0, D3D11_MAP_READ, 0, &rectangle)))
- return false;
-
- switch (pSrc->extended_format)
- {
- case DXGI_FORMAT_NV12:
- {
- uint8_t* s_y = (uint8_t*)(rectangle.pData);
- uint8_t* d_y = pImage->plane[0];
- uint8_t *s_uv = ((uint8_t*)(rectangle.pData)) + sDesc.Height * rectangle.RowPitch;
- uint8_t *d_uv = pImage->plane[1];
- for (unsigned y = 0; y < pSrc->iHeight >> 1; ++y)
- {
- // Copy Y
- memcpy(d_y, s_y, pSrc->iWidth);
- s_y += rectangle.RowPitch;
- d_y += pImage->stride[0];
- // Copy Y
- memcpy(d_y, s_y, pSrc->iWidth);
- s_y += rectangle.RowPitch;
- d_y += pImage->stride[0];
- // Copy UV
- memcpy(d_uv, s_uv, pSrc->iWidth);
- s_uv += rectangle.RowPitch;
- d_uv += pImage->stride[1];
- }
- }
- break;
- case DXGI_FORMAT_420_OPAQUE:
- // not implemented yet
- break;
- }
- pContext->Unmap(staging, 0);
- SAFE_RELEASE(surface);
- SAFE_RELEASE(staging);
- return true;
-
-#endif // HAS_DX
- return false;
-}
-
bool CDVDCodecUtils::IsVP3CompatibleWidth(int width)
{
// known hardware limitation of purevideo 3 (VP3). (the Nvidia 9400 is a purevideo 3 chip)
diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.h b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.h
index c8ed9b9ab160c..13587feabd144 100644
--- a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.h
+++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.h
@@ -37,7 +37,6 @@ class CDVDCodecUtils
static DVDVideoPicture* ConvertToYUV422PackedPicture(DVDVideoPicture *pSrc, ERenderFormat format);
static bool CopyNV12Picture(YV12Image* pImage, DVDVideoPicture *pSrc);
static bool CopyYUV422PackedPicture(YV12Image* pImage, DVDVideoPicture *pSrc);
- static bool CopyDXVA2Picture(YV12Image* pImage, DVDVideoPicture *pSrc);
static bool IsVP3CompatibleWidth(int width);
diff --git a/xbmc/utils/win32/gpu_memcpy_sse4.h b/xbmc/utils/win32/gpu_memcpy_sse4.h
new file mode 100644
index 0000000000000..5fbea9d99d675
--- /dev/null
+++ b/xbmc/utils/win32/gpu_memcpy_sse4.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (C) 2011-2015 Hendrik Leppkes
+ * http://www.1f0.de
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Taken from the QuickSync decoder by Eric Gur
+ */
+
+#include
+
+// gpu_memcpy is a memcpy-style function that copies `size` bytes from `s` to `d`; per its original author it is
+// meant for very fast reads from GPU tiled memory (write back). Returns `d`, or nullptr when either pointer is null.
+// Performance tip: page offset (12 lsb) of both addresses should be different,
+// optimally use a 2K offset between them.
+inline void* gpu_memcpy(void* d, const void* s, size_t size)
+{
+  static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16 registers per loop pass; NOTE(review): must agree with the _M_X64 branches below
+
+  if (d == nullptr || s == nullptr) return nullptr;
+
+  // If memory is not aligned, use memcpy
+  bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0; // both pointers must be 16-byte aligned
+  if (!isAligned)
+  {
+    return memcpy(d, s, size);
+  }
+
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+#ifdef _M_X64
+  __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+#endif
+
+  size_t reminder = size & (regsInLoop * sizeof(xmm0) - 1); // Copy 128 or 256 bytes every loop; this is what is left over after those passes
+  size_t end = 0;
+
+  __m128i* pTrg = (__m128i*)d;
+  __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4); // end of the region handled by the unrolled loop
+  __m128i* pSrc = (__m128i*)s;
+
+  // Make sure source is synced - doesn't hurt if not needed.
+  _mm_sfence();
+
+  while (pTrg < pTrgEnd)
+  {
+    // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA
+    // Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad)
+    xmm0 = _mm_stream_load_si128(pSrc);
+    xmm1 = _mm_stream_load_si128(pSrc + 1);
+    xmm2 = _mm_stream_load_si128(pSrc + 2);
+    xmm3 = _mm_stream_load_si128(pSrc + 3);
+    xmm4 = _mm_stream_load_si128(pSrc + 4);
+    xmm5 = _mm_stream_load_si128(pSrc + 5);
+    xmm6 = _mm_stream_load_si128(pSrc + 6);
+    xmm7 = _mm_stream_load_si128(pSrc + 7);
+#ifdef _M_X64 // Use all 16 xmm registers
+    xmm8 = _mm_stream_load_si128(pSrc + 8);
+    xmm9 = _mm_stream_load_si128(pSrc + 9);
+    xmm10 = _mm_stream_load_si128(pSrc + 10);
+    xmm11 = _mm_stream_load_si128(pSrc + 11);
+    xmm12 = _mm_stream_load_si128(pSrc + 12);
+    xmm13 = _mm_stream_load_si128(pSrc + 13);
+    xmm14 = _mm_stream_load_si128(pSrc + 14);
+    xmm15 = _mm_stream_load_si128(pSrc + 15);
+#endif
+    pSrc += regsInLoop;
+    // _mm_store_si128 emits the SSE2 instruction MOVDQA (aligned store)
+    _mm_store_si128(pTrg , xmm0);
+    _mm_store_si128(pTrg + 1, xmm1);
+    _mm_store_si128(pTrg + 2, xmm2);
+    _mm_store_si128(pTrg + 3, xmm3);
+    _mm_store_si128(pTrg + 4, xmm4);
+    _mm_store_si128(pTrg + 5, xmm5);
+    _mm_store_si128(pTrg + 6, xmm6);
+    _mm_store_si128(pTrg + 7, xmm7);
+#ifdef _M_X64 // Use all 16 xmm registers
+    _mm_store_si128(pTrg + 8, xmm8);
+    _mm_store_si128(pTrg + 9, xmm9);
+    _mm_store_si128(pTrg + 10, xmm10);
+    _mm_store_si128(pTrg + 11, xmm11);
+    _mm_store_si128(pTrg + 12, xmm12);
+    _mm_store_si128(pTrg + 13, xmm13);
+    _mm_store_si128(pTrg + 14, xmm14);
+    _mm_store_si128(pTrg + 15, xmm15);
+#endif
+    pTrg += regsInLoop;
+  }
+
+  // Copy the remaining whole 16-byte blocks
+  if (reminder >= 16)
+  {
+    size = reminder;
+    reminder = size & 15; // bytes that do not fill a whole 16-byte block
+    end = size >> 4; // number of whole 16-byte blocks still to copy
+    for (size_t i = 0; i < end; ++i)
+    {
+      pTrg[i] = _mm_stream_load_si128(pSrc + i);
+    }
+  }
+
+  // Copy last bytes - shouldn't happen as strides are modulo 16
+  if (reminder)
+  {
+    __m128i temp = _mm_stream_load_si128(pSrc + end); // loads a full 16 bytes from the source; only `reminder` of them are written below
+
+    char* ps = (char*)(&temp);
+    char* pt = (char*)(pTrg + end);
+
+    for (size_t i = 0; i < reminder; ++i)
+    {
+      pt[i] = ps[i];
+    }
+  }
+
+  return d;
+}
\ No newline at end of file
diff --git a/xbmc/utils/win32/memcpy_sse2.h b/xbmc/utils/win32/memcpy_sse2.h
new file mode 100644
index 0000000000000..c585136547487
--- /dev/null
+++ b/xbmc/utils/win32/memcpy_sse2.h
@@ -0,0 +1,113 @@
+/*
+* Copyright (C) 2005-2015 Team Kodi
+* http://kodi.tv
+*
+* This library is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with this library; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*
+*/
+
+#include
+
+// Copies `size` bytes from `src` to `dst` and returns `dst`. If both pointers are
+// 16-byte aligned, SSE2 aligned loads and non-temporal (streaming) stores are used;
+// otherwise the call is forwarded to memcpy. No byte beyond `size` is read or written.
+inline void* memcpy_aligned(void* dst, const void* src, size_t size)
+{
+  // if memory is not aligned, use memcpy
+  if ((((size_t)(src) | (size_t)(dst)) & 0xF))
+    return memcpy(dst, src, size);
+
+  uint8_t* d = (uint8_t*)(dst);
+  const uint8_t* s = (const uint8_t*)(src);
+  size_t i = 0;
+  // `i + 64 <= size` instead of `i < size - 63`: the latter wraps around for size < 63
+  for (; i + 64 <= size; i += 64)
+  {
+    const __m128i xmm1 = _mm_load_si128((const __m128i*)(s + i + 0));
+    const __m128i xmm2 = _mm_load_si128((const __m128i*)(s + i + 16));
+    const __m128i xmm3 = _mm_load_si128((const __m128i*)(s + i + 32));
+    const __m128i xmm4 = _mm_load_si128((const __m128i*)(s + i + 48));
+    _mm_stream_si128((__m128i*)(d + i + 0), xmm1);
+    _mm_stream_si128((__m128i*)(d + i + 16), xmm2);
+    _mm_stream_si128((__m128i*)(d + i + 32), xmm3);
+    _mm_stream_si128((__m128i*)(d + i + 48), xmm4);
+  }
+  for (; i + 16 <= size; i += 16) // remaining whole 16-byte blocks
+    _mm_stream_si128((__m128i*)(d + i), _mm_load_si128((const __m128i*)(s + i)));
+  if (i < size) // tail shorter than one vector: plain copy instead of a 16-byte overrun
+    memcpy(d + i, s + i, size - i);
+  return dst;
+}
+
+// Converts planar YUV 4:2:0 (src[0] = Y, src[1] = U, src[2] = V) into NV12
+// (dst[0] = Y, dst[1] = interleaved U/V pairs). Strides are byte distances between rows.
+// The chroma path uses aligned SSE2 loads and streaming stores, so the U, V and dst[1]
+// rows must be 16-byte aligned. A non-positive width or height makes the call a no-op.
+inline void convert_yuv420_nv12(uint8_t *const src[], const int srcStride[], int height, int width, uint8_t *const dst[], const int dstStride[])
+{
+  if (height <= 0 || width <= 0)
+    return; // also keeps the unsigned loop bounds below from wrapping
+  _mm_sfence();
+
+  // Convert to NV12 - Luma
+  if (srcStride[0] == dstStride[0])
+    memcpy_aligned(dst[0], src[0], srcStride[0] * height);
+  else
+  {
+    // clamp the row length so a wider source stride cannot overrun the destination row
+    const int rowBytes = srcStride[0] < dstStride[0] ? srcStride[0] : dstStride[0];
+    for (int line = 0; line < height; ++line)
+      memcpy_aligned(dst[0] + dstStride[0] * line, src[0] + srcStride[0] * line, rowBytes);
+  }
+
+  // Convert to NV12 - Chroma
+  const size_t chromaWidth = ((size_t)width + 1) >> 1;
+  const size_t chromaHeight = (size_t)height >> 1;
+  for (size_t line = 0; line < chromaHeight; ++line)
+  {
+    size_t i = 0;
+    const uint8_t * u = src[1] + line * srcStride[1];
+    const uint8_t * v = src[2] + line * srcStride[2];
+    uint8_t * d = dst[1] + line * dstStride[1];
+
+    // `i + 32 <= chromaWidth` instead of `i < chromaWidth - 31`: the latter wraps for chromaWidth < 31
+    for (; i + 32 <= chromaWidth; i += 32)
+    {
+      const __m128i v0 = _mm_load_si128((const __m128i*)(v + i));
+      const __m128i u0 = _mm_load_si128((const __m128i*)(u + i));
+      const __m128i v1 = _mm_load_si128((const __m128i*)(v + i + 16));
+      const __m128i u1 = _mm_load_si128((const __m128i*)(u + i + 16));
+
+      // unpack(u, v) interleaves the bytes as U0 V0 U1 V1 ...
+      _mm_stream_si128((__m128i *)(d + (i << 1) + 0), _mm_unpacklo_epi8(u0, v0));
+      _mm_stream_si128((__m128i *)(d + (i << 1) + 16), _mm_unpackhi_epi8(u0, v0));
+      _mm_stream_si128((__m128i *)(d + (i << 1) + 32), _mm_unpacklo_epi8(u1, v1));
+      _mm_stream_si128((__m128i *)(d + (i << 1) + 48), _mm_unpackhi_epi8(u1, v1));
+    }
+    for (; i + 16 <= chromaWidth; i += 16)
+    {
+      const __m128i v0 = _mm_load_si128((const __m128i*)(v + i));
+      const __m128i u0 = _mm_load_si128((const __m128i*)(u + i));
+      _mm_stream_si128((__m128i *)(d + (i << 1) + 0), _mm_unpacklo_epi8(u0, v0));
+      _mm_stream_si128((__m128i *)(d + (i << 1) + 16), _mm_unpackhi_epi8(u0, v0));
+    }
+    // scalar tail: fewer than 16 chroma samples left, so a full vector would overrun the rows
+    for (; i < chromaWidth; ++i)
+    {
+      d[2 * i] = u[i];
+      d[2 * i + 1] = v[i];
+    }
+  }
+}