Skip to content

Commit

Permalink
dxva: let renderer control the render buffers
Browse files Browse the repository at this point in the history
  • Loading branch information
FernetMenta committed Sep 19, 2014
1 parent 4f49d56 commit 71f50d3
Show file tree
Hide file tree
Showing 9 changed files with 274 additions and 419 deletions.
3 changes: 2 additions & 1 deletion xbmc/cores/VideoRenderers/BaseRenderer.h
Expand Up @@ -29,7 +29,7 @@

#define MAX_PLANES 3
#define MAX_FIELDS 3
#define NUM_BUFFERS 3
#define NUM_BUFFERS 6

class CSetting;

Expand Down Expand Up @@ -93,6 +93,7 @@ class CBaseRenderer
virtual unsigned int GetMaxBufferSize() { return 0; }
virtual void SetBufferSize(int numBuffers) { }
virtual void ReleaseBuffer(int idx) { }
virtual bool NeedBufferForRef(int idx) { return false; }

virtual bool Supports(ERENDERFEATURE feature) { return false; }

Expand Down
280 changes: 107 additions & 173 deletions xbmc/cores/VideoRenderers/DXVA.cpp
Expand Up @@ -148,11 +148,9 @@ CProcessor::CProcessor()
{
m_service = NULL;
m_process = NULL;
m_time = 0;
g_Windowing.Register(this);

m_context = NULL;
m_index = 0;
m_progressive = true;
}

Expand All @@ -173,12 +171,6 @@ void CProcessor::Close()
{
CSingleLock lock(m_section);
SAFE_RELEASE(m_process);
for(unsigned i = 0; i < m_samples.size(); i++)
{
SAFE_RELEASE(m_samples[i].renderPic);
}
m_samples.clear();

SAFE_RELEASE(m_context);
}

Expand Down Expand Up @@ -362,8 +354,6 @@ bool CProcessor::Open(UINT width, UINT height, unsigned int flags, unsigned int
if (!OpenProcessor())
return false;

m_time = 0;

return true;
}

Expand Down Expand Up @@ -511,114 +501,71 @@ bool CProcessor::CreateSurfaces()
return true;
}

REFERENCE_TIME CProcessor::Add(DVDVideoPicture* picture)
CRenderPicture *CProcessor::Convert(DVDVideoPicture* picture)
{
CSingleLock lock(m_section);

IDirect3DSurface9* surface = NULL;

if (picture->iFlags & DVP_FLAG_DROPPED)
return 0;

switch (picture->format)
if (picture->format != RENDER_FMT_YUV420P)
{
case RENDER_FMT_DXVA:
{
surface = picture->dxva->surface;
break;
}

case RENDER_FMT_YUV420P:
{
surface = m_context->GetAtIndex(m_index);
m_index = (m_index + 1) % m_size;

D3DLOCKED_RECT rectangle;
if (FAILED(surface->LockRect(&rectangle, NULL, 0)))
return 0;

// Convert to NV12 - Luma
// TODO: Optimize this later using shaders/swscale/etc.
uint8_t *s = picture->data[0];
uint8_t* bits = (uint8_t*)(rectangle.pBits);
for (unsigned y = 0; y < picture->iHeight; y++)
{
memcpy(bits, s, picture->iWidth);
s += picture->iLineSize[0];
bits += rectangle.Pitch;
}

D3DSURFACE_DESC desc;
if (FAILED(surface->GetDesc(&desc)))
return 0;

// Convert to NV12 - Chroma
for (unsigned y = 0; y < picture->iHeight/2; y++)
{
uint8_t *s_u = picture->data[1] + (y * picture->iLineSize[1]);
uint8_t *s_v = picture->data[2] + (y * picture->iLineSize[2]);
uint8_t *d_uv = ((uint8_t*)(rectangle.pBits)) + (desc.Height + y) * rectangle.Pitch;
for (unsigned x = 0; x < picture->iWidth/2; x++)
{
*d_uv++ = *s_u++;
*d_uv++ = *s_v++;
}
}

if (FAILED(surface->UnlockRect()))
return 0;

break;
}

default:
{
CLog::Log(LOGWARNING, "DXVA - colorspace not supported by processor, skipping frame");
return 0;
}
CLog::Log(LOGERROR, "%s - colorspace not supported by processor, skipping frame.", __FUNCTION__);
return NULL;
}

IDirect3DSurface9* surface = m_context->GetFree(NULL);
if (!surface)
return 0;

m_time += 2;
{
CLog::Log(LOGERROR, "%s - no free video surface", __FUNCTION__);
return NULL;
}

SVideoSample vs = {};
vs.sample.Start = m_time;
vs.sample.End = 0;
vs.sample.SampleFormat = m_desc.SampleFormat;
vs.renderPic = NULL;
if (picture->format == RENDER_FMT_DXVA)
vs.renderPic = picture->dxva->Acquire();
D3DLOCKED_RECT rectangle;
if (FAILED(surface->LockRect(&rectangle, NULL, 0)))
{
CLog::Log(LOGERROR, "%s - could not lock rect", __FUNCTION__);
m_context->ClearReference(surface);
return NULL;
}

if (picture->iFlags & DVP_FLAG_INTERLACED)
// Convert to NV12 - Luma
// TODO: Optimize this later using shaders/swscale/etc.
uint8_t *s = picture->data[0];
uint8_t* bits = (uint8_t*)(rectangle.pBits);
for (unsigned y = 0; y < picture->iHeight; y++)
{
if (picture->iFlags & DVP_FLAG_TOP_FIELD_FIRST)
vs.sample.SampleFormat.SampleFormat = DXVA2_SampleFieldInterleavedEvenFirst;
else
vs.sample.SampleFormat.SampleFormat = DXVA2_SampleFieldInterleavedOddFirst;
memcpy(bits, s, picture->iWidth);
s += picture->iLineSize[0];
bits += rectangle.Pitch;
}
else
D3DSURFACE_DESC desc;
if (FAILED(surface->GetDesc(&desc)))
{
vs.sample.SampleFormat.SampleFormat = DXVA2_SampleProgressiveFrame;
CLog::Log(LOGERROR, "%s - could not get surface descriptor", __FUNCTION__);
m_context->ClearReference(surface);
return NULL;
}

vs.sample.PlanarAlpha = DXVA2_Fixed32OpaqueAlpha();
vs.sample.SampleData = 0;
vs.sample.SrcSurface = surface;


if(!m_samples.empty())
m_samples.back().sample.End = vs.sample.Start;

m_samples.push_back(vs);
if (m_samples.size() > m_size)
// Convert to NV12 - Chroma
uint8_t *s_u, *s_v, *d_uv;
for (unsigned y = 0; y < picture->iHeight / 2; y++)
{
SAFE_RELEASE(m_samples.front().renderPic);
m_samples.pop_front();
s_u = picture->data[1] + (y * picture->iLineSize[1]);
s_v = picture->data[2] + (y * picture->iLineSize[2]);
d_uv = ((uint8_t*)(rectangle.pBits)) + (desc.Height + y) * rectangle.Pitch;
for (unsigned x = 0; x < picture->iWidth / 2; x++)
{
*d_uv++ = *s_u++;
*d_uv++ = *s_v++;
}
}
if (FAILED(surface->UnlockRect()))
{
CLog::Log(LOGERROR, "%s - failed to unlock surface", __FUNCTION__);
m_context->ClearReference(surface);
return NULL;
}

return m_time;
m_context->ClearReference(surface);
m_context->MarkRender(surface);
CRenderPicture *pic = new CRenderPicture(m_context);
pic->surface = surface;
return pic;
}

static DXVA2_Fixed32 ConvertRange(const DXVA2_ValueRange& range, int value, int min, int max, int def)
Expand All @@ -635,10 +582,13 @@ static DXVA2_Fixed32 ConvertRange(const DXVA2_ValueRange& range, int value, int
return range.DefaultValue;
}

bool CProcessor::Render(CRect src, CRect dst, IDirect3DSurface9* target, REFERENCE_TIME time, DWORD flags)
bool CProcessor::Render(CRect src, CRect dst, IDirect3DSurface9* target, IDirect3DSurface9** source, DWORD flags, UINT frameIdx)
{
CSingleLock lock(m_section);

if (!source[2])
return false;

// With auto deinterlacing, the Ion Gen. 1 drops some frames with deinterlacing processor + progressive flags for progressive material.
// For that GPU (or when specified by an advanced setting), use the progressive processor.
// This is at the expense of the switch speed when video interlacing flags change and a deinterlacing processor is actually required.
Expand All @@ -657,93 +607,77 @@ bool CProcessor::Render(CRect src, CRect dst, IDirect3DSurface9* target, REFEREN
return false;
}

// MinTime and MaxTime are the first and last samples to keep. Delete the rest.
REFERENCE_TIME MinTime = time - m_max_back_refs*2;
REFERENCE_TIME MaxTime = time + m_max_fwd_refs*2;

std::deque<SVideoSample>::iterator it = m_samples.begin();
while (it != m_samples.end())
{
if (it->sample.Start < MinTime)
{
SAFE_RELEASE(it->renderPic);
it = m_samples.erase(it);
}
else
++it;
}

if(m_samples.empty())
return false;

// MinTime and MaxTime are now the first and last samples to feed the processor.
MinTime = time - m_caps.NumBackwardRefSamples*2;
MaxTime = time + m_caps.NumForwardRefSamples*2;

D3DSURFACE_DESC desc;
CHECK(target->GetDesc(&desc));
CRect rectTarget(0, 0, desc.Width, desc.Height);
CWIN32Util::CropSource(src, dst, rectTarget);
RECT sourceRECT = { src.x1, src.y1, src.x2, src.y2 };
RECT dstRECT = { dst.x1, dst.y1, dst.x2, dst.y2 };

// set sample format for progressive and interlaced
UINT sampleFormat = DXVA2_SampleProgressiveFrame;
if (flags & RENDER_FLAG_FIELD0 && flags & RENDER_FLAG_TOP)
sampleFormat = DXVA2_SampleFieldInterleavedEvenFirst;
else if (flags & RENDER_FLAG_FIELD1 && flags & RENDER_FLAG_BOT)
sampleFormat = DXVA2_SampleFieldInterleavedEvenFirst;
if (flags & RENDER_FLAG_FIELD0 && flags & RENDER_FLAG_BOT)
sampleFormat = DXVA2_SampleFieldInterleavedOddFirst;
if (flags & RENDER_FLAG_FIELD1 && flags & RENDER_FLAG_TOP)
sampleFormat = DXVA2_SampleFieldInterleavedOddFirst;

// How to prepare the samples array for VideoProcessBlt
// - always provide current picture + the number of forward and backward references required by the current processor.
// - provide the surfaces in the array in increasing temporal order
// - at the start of playback, there may not be enough samples available. Use SampleFormat.SampleFormat = DXVA2_SampleUnknown for the missing samples.

int count = 1 + m_caps.NumBackwardRefSamples + m_caps.NumForwardRefSamples;
int valid = 0;
auto_aptr<DXVA2_VideoSample> samp(new DXVA2_VideoSample[count]);

for (int i = 0; i < count; i++)
samp[i].SampleFormat.SampleFormat = DXVA2_SampleUnknown;

for(it = m_samples.begin(); it != m_samples.end() && valid < count; ++it)
unsigned int providedPast = 0;
for (int i = 3; i < 8; i++)
{
if (it->sample.Start >= MinTime && it->sample.Start <= MaxTime)
{
DXVA2_VideoSample& vs = samp[(it->sample.Start - MinTime) / 2];
vs = it->sample;
vs.SrcRect = sourceRECT;
vs.DstRect = dstRECT;
if(vs.End == 0)
vs.End = vs.Start + 2;

// Override the sample format when the processor doesn't need to deinterlace or when deinterlacing is forced and flags are missing.
if (m_progressive)
vs.SampleFormat.SampleFormat = DXVA2_SampleProgressiveFrame;
else if (m_deinterlace_mode == VS_DEINTERLACEMODE_FORCE && vs.SampleFormat.SampleFormat == DXVA2_SampleProgressiveFrame)
vs.SampleFormat.SampleFormat = DXVA2_SampleFieldInterleavedEvenFirst;

valid++;
}
if (source[i])
providedPast++;
}

// MS' guidelines above don't work. The blit fails when the processor is given DXVA2_SampleUnknown samples (with ATI at least).
// The ATI driver works with a reduced number of samples though, support that for now.
// Problem is an ambiguity if there are future refs requested by the processor. There are no such implementations at the moment.
int offset = 0;
if(valid < count)
unsigned int providedFuture = 0;
for (int i = 1; i >= 0; i--)
{
CLog::Log(LOGWARNING, __FUNCTION__" - did not find all required samples, adjusting the sample array.");
if (source[i])
providedFuture++;
}
int futureFrames = std::min(providedFuture, m_caps.NumForwardRefSamples);
int pastFrames = std::min(providedPast, m_caps.NumBackwardRefSamples);

for (int i = 0; i < count; i++)
{
if (samp[i].SampleFormat.SampleFormat == DXVA2_SampleUnknown)
offset = i+1;
}
count -= offset;
if (count == 0)
{
CLog::Log(LOGWARNING, __FUNCTION__" - no usable samples.");
return false;
}
int count = 1 + pastFrames + futureFrames;
auto_aptr<DXVA2_VideoSample> samp(new DXVA2_VideoSample[count]);

int start = 2 - futureFrames;
int end = 2 + pastFrames;
int sampIdx = 0;
for (int i = end; i >= start; i--)
{
if (!source[i])
continue;

DXVA2_VideoSample& vs = samp[sampIdx];
vs.SrcSurface = source[i];
vs.SrcRect = sourceRECT;
vs.DstRect = dstRECT;
vs.SampleData = 0;
vs.Start = frameIdx + (sampIdx - pastFrames) * 2;
vs.End = vs.Start + 2;
vs.PlanarAlpha = DXVA2_Fixed32OpaqueAlpha();
vs.SampleFormat.SampleFormat = sampleFormat;

// Override the sample format when the processor doesn't need to deinterlace or when deinterlacing is forced and flags are missing.
if (m_progressive)
vs.SampleFormat.SampleFormat = DXVA2_SampleProgressiveFrame;
else if (m_deinterlace_mode == VS_DEINTERLACEMODE_FORCE && vs.SampleFormat.SampleFormat == DXVA2_SampleProgressiveFrame)
vs.SampleFormat.SampleFormat = DXVA2_SampleFieldInterleavedEvenFirst;

sampIdx++;
}


DXVA2_VideoProcessBltParams blt = {};
blt.TargetFrame = time;
blt.TargetFrame = frameIdx;
if (flags & RENDER_FLAG_FIELD1)
blt.TargetFrame += 1;
blt.TargetRect = dstRECT;
Expand Down Expand Up @@ -776,7 +710,7 @@ bool CProcessor::Render(CRect src, CRect dst, IDirect3DSurface9* target, REFEREN
float verts[2][3]= {};
g_Windowing.Get3DDevice()->DrawPrimitiveUP(D3DPT_TRIANGLEFAN, 1, verts, 3*sizeof(float));

CHECK(m_process->VideoProcessBlt(target, &blt, &samp[offset], count, NULL));
CHECK(m_process->VideoProcessBlt(target, &blt, &samp[0], count, NULL));
return true;
}

Expand Down

4 comments on commit 71f50d3

@Memphiz
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This broke atv2. With this I am not able to watch live TV (SD, software decoded) in a normal way anymore. I get frame drops every second with this. The strange thing is that once I enable the on-screen debug logging text, it works smoothly again. But that's not a solution.

I am not sure how to revert this for a test (it results in conflicts I don't have a clue how to solve). But during the bisect this one was clearly the showstopper.

I think it's a change like this that broke HD / hardware-accelerated playback on atv2 since Gotham (I never figured that one out, though).

Stupid me never notices those issues until I try something shortly before the release, and then I might or might not be able to bisect it.

Everyone working on those critical places in the code: please try to remember that there is an atv2, which has 256 MB of RAM that is already loaded by a lot of Apple stuff :/ (so I can try those code changes before they get merged).

@topfs2 this regression is a blocker for helix imo
@fritsch fyi ping ...

@FernetMenta
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Memphiz you really should test more frequently on atv if you consider this an important platform, which, by the way, the majority of users do not. Don't get me wrong, I am not saying that this should not be fixed, but a platform like atv must not block major progress on the main platforms.
Please post a debug log in the testing section of the forum. Then we can start the investigation.

@FernetMenta
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Memphiz
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I use the atv2 daily for exactly that reason (detecting regressions). The problem is that I was not able to update it for a month or so because of time constraints (iOS 8, Yosemite stuff/testing). I agree that the atv2 should not hold up development, and I still have the goal of dropping atv2 support - but Helix should work on atv2 as its final version.

It's just not possible to be quick enough when technically maintaining 3 platforms. Also, I ignored all the DXVA stuff because DXVA means Windows to me - who would have known that this has an effect on atv2 ;).

It was not meant as bitching - but whenever I bisect something it gets really, really late, and it always happens shortly before a release, which makes me nervous...

Forum thread with debug log in the internal test forum ...

Please sign in to comment.