Skip to content

Commit

Permalink
media: venus: Rework error fail recover logic
Browse files Browse the repository at this point in the history
[ Upstream commit 4cba547 ]

The Venus code has a sort of watchdog that attempts to recover
from IP errors, implemented as a delayed work job, which
calls venus_sys_error_handler().

Right now, it has several issues:

1. It assumes that PM runtime resume never fails

2. It internally runs two while() loops that also assume that
   PM runtime will never fail to go idle:

	while (pm_runtime_active(core->dev_dec) || pm_runtime_active(core->dev_enc))
		msleep(10);

...

	while (core->pmdomains[0] && pm_runtime_active(core->pmdomains[0]))
		usleep_range(1000, 1500);

3. It uses an OR to merge all return codes and then report to the user

4. If the hardware never recovers, it keeps running on every 10ms,
   flooding the syslog with 2 messages (so, up to 200 messages
   per second).

Rework the code, in order to prevent that, by:

1. check the return code from PM runtime resume;
2. don't let the while() loops run forever;
3. store the failed event;
4. use warn ratelimited when it fails to recover.

Fixes: af2c383 ("[media] media: venus: adding core part and helper functions")
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
  • Loading branch information
mchehab authored and gregkh committed Jul 14, 2021
1 parent 08d0aa1 commit e23dc4a
Showing 1 changed file with 47 additions and 13 deletions.
60 changes: 47 additions & 13 deletions drivers/media/platform/qcom/venus/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,52 +48,86 @@ static const struct hfi_core_ops venus_core_ops = {
.event_notify = venus_event_notify,
};

#define RPM_WAIT_FOR_IDLE_MAX_ATTEMPTS 10

static void venus_sys_error_handler(struct work_struct *work)
{
struct venus_core *core =
container_of(work, struct venus_core, work.work);
int ret = 0;

pm_runtime_get_sync(core->dev);
int ret, i, max_attempts = RPM_WAIT_FOR_IDLE_MAX_ATTEMPTS;
const char *err_msg = "";
bool failed = false;

ret = pm_runtime_get_sync(core->dev);
if (ret < 0) {
err_msg = "resume runtime PM";
max_attempts = 0;
failed = true;
}

hfi_core_deinit(core, true);

dev_warn(core->dev, "system error has occurred, starting recovery!\n");

mutex_lock(&core->lock);

while (pm_runtime_active(core->dev_dec) || pm_runtime_active(core->dev_enc))
for (i = 0; i < max_attempts; i++) {
if (!pm_runtime_active(core->dev_dec) && !pm_runtime_active(core->dev_enc))
break;
msleep(10);
}

venus_shutdown(core);

pm_runtime_put_sync(core->dev);

while (core->pmdomains[0] && pm_runtime_active(core->pmdomains[0]))
for (i = 0; i < max_attempts; i++) {
if (!core->pmdomains[0] || !pm_runtime_active(core->pmdomains[0]))
break;
usleep_range(1000, 1500);
}

hfi_reinit(core);

pm_runtime_get_sync(core->dev);
ret = pm_runtime_get_sync(core->dev);
if (ret < 0) {
err_msg = "resume runtime PM";
failed = true;
}

ret = venus_boot(core);
if (ret && !failed) {
err_msg = "boot Venus";
failed = true;
}

ret |= venus_boot(core);
ret |= hfi_core_resume(core, true);
ret = hfi_core_resume(core, true);
if (ret && !failed) {
err_msg = "resume HFI";
failed = true;
}

enable_irq(core->irq);

mutex_unlock(&core->lock);

ret |= hfi_core_init(core);
ret = hfi_core_init(core);
if (ret && !failed) {
err_msg = "init HFI";
failed = true;
}

pm_runtime_put_sync(core->dev);

if (ret) {
if (failed) {
disable_irq_nosync(core->irq);
dev_warn(core->dev, "recovery failed (%d)\n", ret);
dev_warn_ratelimited(core->dev,
"System error has occurred, recovery failed to %s\n",
err_msg);
schedule_delayed_work(&core->work, msecs_to_jiffies(10));
return;
}

dev_warn(core->dev, "system error has occurred (recovered)\n");

mutex_lock(&core->lock);
core->sys_error = false;
mutex_unlock(&core->lock);
Expand Down

0 comments on commit e23dc4a

Please sign in to comment.