Skip to content

Commit

Permalink
accel/habanalabs: add pci health check during heartbeat
Browse files Browse the repository at this point in the history
[ Upstream commit d8b9cea ]

Currently, upon a heartbeat failure, we don't know whether the failure
is due to a firmware hang or to a bad PCI link. Hence, we
read a PCI config space register with a known value (vendor ID)
so that we can tell which of the two possibilities caused the heartbeat
failure.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
  • Loading branch information
ofirbitt authored and gregkh committed Aug 23, 2023
1 parent b7a34e3 commit d7933b9
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 3 deletions.
15 changes: 14 additions & 1 deletion drivers/misc/habanalabs/common/device.c
Expand Up @@ -870,6 +870,18 @@ static void device_early_fini(struct hl_device *hdev)
hdev->asic_funcs->early_fini(hdev);
}

/*
 * is_pci_link_healthy() - probe the PCI link by reading a known config register
 * @hdev: habanalabs device whose PCI link is checked.
 *
 * Reads the vendor ID word from the device's PCI config space and compares it
 * with PCI_VENDOR_ID_HABANALABS.
 *
 * Return: true if the config read succeeded and returned the expected vendor
 *         ID; false if there is no PCI device attached to @hdev, the config
 *         read reported an error, or the value read does not match.
 */
static bool is_pci_link_healthy(struct hl_device *hdev)
{
	/* Initialised so the comparison never sees an indeterminate value */
	u16 vendor_id = 0;

	if (!hdev->pdev)
		return false;

	/* A non-zero return code means the config access itself failed */
	if (pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id))
		return false;

	return vendor_id == PCI_VENDOR_ID_HABANALABS;
}

static void hl_device_heartbeat(struct work_struct *work)
{
struct hl_device *hdev = container_of(work, struct hl_device,
Expand All @@ -882,7 +894,8 @@ static void hl_device_heartbeat(struct work_struct *work)
goto reschedule;

if (hl_device_operational(hdev, NULL))
dev_err(hdev->dev, "Device heartbeat failed!\n");
dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n",
is_pci_link_healthy(hdev) ? "healthy" : "broken");

hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT);

Expand Down
2 changes: 2 additions & 0 deletions drivers/misc/habanalabs/common/habanalabs.h
Expand Up @@ -34,6 +34,8 @@
struct hl_device;
struct hl_fpriv;

/* Expected PCI vendor ID; compared against the value read from config space */
#define PCI_VENDOR_ID_HABANALABS 0x1da3

/* Use upper bits of mmap offset to store habana driver specific information.
* bits[63:59] - Encode mmap type
* bits[45:0] - mmap offset value
Expand Down
2 changes: 0 additions & 2 deletions drivers/misc/habanalabs/common/habanalabs_drv.c
Expand Up @@ -54,8 +54,6 @@ module_param(boot_error_status_mask, ulong, 0444);
MODULE_PARM_DESC(boot_error_status_mask,
"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");

#define PCI_VENDOR_ID_HABANALABS 0x1da3

#define PCI_IDS_GOYA 0x0001
#define PCI_IDS_GAUDI 0x1000
#define PCI_IDS_GAUDI_SEC 0x1010
Expand Down

0 comments on commit d7933b9

Please sign in to comment.