Skip to content

Commit dcaacbe

Browse files
authored
feat: add manual retry for failed async operations (#537)
- API: POST /v1/default/banks/{bank_id}/operations/{operation_id}/retry resets status to pending so the worker re-executes the task
- UI: Retry button on failed operations in the operations view
- Control plane proxy route + ControlPlaneClient.retryOperation()
- Updated OpenAPI spec, all generated clients, and operations docs
1 parent 32a4882 commit dcaacbe

File tree

17 files changed

+1195
-2
lines changed

17 files changed

+1195
-2
lines changed

hindsight-api/hindsight_api/api/http.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,6 +1558,24 @@ class CancelOperationResponse(BaseModel):
15581558
operation_id: str
15591559

15601560

1561+
class RetryOperationResponse(BaseModel):
    """Response model for retry operation endpoint."""

    # NOTE: the docstring above is emitted verbatim as the schema `description`
    # in the generated OpenAPI spec, so rewording it changes the published spec
    # and every generated client.

    # Example payload attached to the JSON schema; it appears as the `example`
    # of RetryOperationResponse in the generated OpenAPI document.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "success": True,
                "message": "Operation 550e8400-e29b-41d4-a716-446655440000 queued for retry",
                "operation_id": "550e8400-e29b-41d4-a716-446655440000",
            }
        }
    )

    # The three fields mirror the keys of the dict returned by the engine's
    # retry_operation(); the endpoint builds this model via `**result`.
    # True when the operation was re-queued.
    success: bool
    # Human-readable confirmation, e.g. "Operation <id> queued for retry".
    message: str
    # ID of the operation that was re-queued, echoed back as a string.
    operation_id: str
1577+
1578+
15611579
class ChildOperationStatus(BaseModel):
15621580
"""Status of a child operation (for batch operations)."""
15631581

@@ -3532,6 +3550,39 @@ async def api_cancel_operation(
35323550
logger.error(f"Error in /v1/default/banks/{bank_id}/operations/{operation_id}: {error_detail}")
35333551
raise HTTPException(status_code=500, detail=str(e))
35343552

3553+
@app.post(
    "/v1/default/banks/{bank_id}/operations/{operation_id}/retry",
    response_model=RetryOperationResponse,
    summary="Retry a failed async operation",
    description="Re-queue a failed async operation so the worker picks it up again",
    operation_id="retry_operation",
    tags=["Operations"],
)
async def api_retry_operation(
    bank_id: str, operation_id: str, request_context: RequestContext = Depends(get_request_context)
):
    """Re-queue a failed async operation on behalf of an HTTP caller.

    Delegates to ``app.state.memory.retry_operation`` and maps its failures
    onto HTTP status codes:

    * 400 - ``operation_id`` is not a valid UUID.
    * 404 - the engine raised ``ValueError`` (e.g. operation not found).
    * ``OperationValidationError.status_code`` - the engine refused the retry.
    * 500 - any other unexpected error (logged with its traceback).

    ``AuthenticationError`` and ``HTTPException`` propagate unchanged.
    """
    # Reject malformed IDs before touching the engine; this 400 must not be
    # confused with the 404 the engine's own ValueError maps to below.
    try:
        uuid.UUID(operation_id)
    except ValueError:
        raise HTTPException(status_code=400, detail=f"Invalid operation_id format: {operation_id}")

    try:
        outcome = await app.state.memory.retry_operation(bank_id, operation_id, request_context=request_context)
        return RetryOperationResponse(**outcome)
    except ValueError as not_found:
        raise HTTPException(status_code=404, detail=str(not_found))
    except OperationValidationError as rejected:
        raise HTTPException(status_code=rejected.status_code, detail=rejected.reason)
    except (AuthenticationError, HTTPException):
        # Already carry the right status; let the framework handle them.
        raise
    except Exception as e:
        import traceback

        error_detail = f"{str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        logger.error(f"Error in POST /v1/default/banks/{bank_id}/operations/{operation_id}/retry: {error_detail}")
        raise HTTPException(status_code=500, detail=str(e))
3585+
35353586
@app.get(
35363587
"/v1/default/banks/{bank_id}/profile",
35373588
response_model=BankProfileResponse,

hindsight-api/hindsight_api/engine/memory_engine.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7118,6 +7118,64 @@ async def cancel_operation(
71187118
"bank_id": bank_id,
71197119
}
71207120

7121+
async def retry_operation(
    self,
    bank_id: str,
    operation_id: str,
    *,
    request_context: "RequestContext",
) -> dict[str, Any]:
    """Re-queue a failed async operation.

    Resets the operation's row in ``async_operations`` to ``'pending'`` and
    clears its error, completion, retry and claim bookkeeping, so that it
    looks like a freshly queued task (presumably the worker then picks it up
    again - the worker side is not visible from this method).

    Args:
        bank_id: Bank that owns the operation; the row must belong to it.
        operation_id: UUID string identifying the operation to retry.
        request_context: Caller context used for tenant authentication and,
            when an operation validator is configured, for bank-write
            validation.

    Returns:
        A dict with ``success`` (always ``True``), a human-readable
        ``message`` and the echoed ``operation_id``.

    Raises:
        ValueError: If ``operation_id`` is not a valid UUID, or no such
            operation exists for ``bank_id``.
        OperationValidationError: With status 409 if the operation is not in
            ``'failed'`` status, including when it left that status while
            this request was in flight.
    """
    await self._authenticate_tenant(request_context)
    from hindsight_api.extensions import OperationValidationError

    if self._operation_validator:
        from hindsight_api.extensions import BankWriteContext

        ctx = BankWriteContext(bank_id=bank_id, operation="retry_operation", request_context=request_context)
        await self._validate_operation(self._operation_validator.validate_bank_write(ctx))
    pool = await self._get_pool()

    op_uuid = uuid.UUID(operation_id)
    operations_table = fq_table("async_operations")

    async with acquire_with_retry(pool) as conn:
        row = await conn.fetchrow(
            f"SELECT bank_id, status FROM {operations_table} WHERE operation_id = $1 AND bank_id = $2",
            op_uuid,
            bank_id,
        )

        if not row:
            raise ValueError(f"Operation {operation_id} not found for bank {bank_id}")

        if row["status"] != "failed":
            raise OperationValidationError(
                f"Operation {operation_id} cannot be retried: status is '{row['status']}', expected 'failed'",
                409,
            )

        # The SELECT above is only good for error reporting: the row can change
        # between the read and the write (a concurrent retry, or a worker that
        # already claimed the re-queued task). Guarding the UPDATE on bank_id
        # and status makes the transition itself atomic, so we never wipe
        # worker_id/claimed_at of an operation that is no longer 'failed'.
        requeued = await conn.fetchrow(
            f"""
            UPDATE {operations_table}
            SET status = 'pending',
                error_message = NULL,
                completed_at = NULL,
                next_retry_at = NULL,
                worker_id = NULL,
                claimed_at = NULL,
                retry_count = 0,
                updated_at = NOW()
            WHERE operation_id = $1
              AND bank_id = $2
              AND status = 'failed'
            RETURNING operation_id
            """,
            op_uuid,
            bank_id,
        )

        if requeued is None:
            # Lost the race: someone else moved the row out of 'failed'.
            raise OperationValidationError(
                f"Operation {operation_id} cannot be retried: it is no longer in 'failed' status",
                409,
            )

    return {
        "success": True,
        "message": f"Operation {operation_id} queued for retry",
        "operation_id": operation_id,
    }
7178+
71217179
async def update_bank(
71227180
self,
71237181
bank_id: str,

hindsight-clients/go/api/openapi.yaml

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1771,6 +1771,51 @@ paths:
17711771
summary: Get operation status
17721772
tags:
17731773
- Operations
1774+
/v1/default/banks/{bank_id}/operations/{operation_id}/retry:
1775+
post:
1776+
description: Re-queue a failed async operation so the worker picks it up again
1777+
operationId: retry_operation
1778+
parameters:
1779+
- explode: false
1780+
in: path
1781+
name: bank_id
1782+
required: true
1783+
schema:
1784+
title: Bank Id
1785+
type: string
1786+
style: simple
1787+
- explode: false
1788+
in: path
1789+
name: operation_id
1790+
required: true
1791+
schema:
1792+
title: Operation Id
1793+
type: string
1794+
style: simple
1795+
- explode: false
1796+
in: header
1797+
name: authorization
1798+
required: false
1799+
schema:
1800+
nullable: true
1801+
type: string
1802+
style: simple
1803+
responses:
1804+
"200":
1805+
content:
1806+
application/json:
1807+
schema:
1808+
$ref: '#/components/schemas/RetryOperationResponse'
1809+
description: Successful Response
1810+
"422":
1811+
content:
1812+
application/json:
1813+
schema:
1814+
$ref: '#/components/schemas/HTTPValidationError'
1815+
description: Validation Error
1816+
summary: Retry a failed async operation
1817+
tags:
1818+
- Operations
17741819
/v1/default/banks/{bank_id}/profile:
17751820
get:
17761821
deprecated: true
@@ -4748,6 +4793,27 @@ components:
47484793
- items_count
47494794
- success
47504795
title: RetainResponse
4796+
RetryOperationResponse:
4797+
description: Response model for retry operation endpoint.
4798+
example:
4799+
message: Operation 550e8400-e29b-41d4-a716-446655440000 queued for retry
4800+
operation_id: 550e8400-e29b-41d4-a716-446655440000
4801+
success: true
4802+
properties:
4803+
success:
4804+
title: Success
4805+
type: boolean
4806+
message:
4807+
title: Message
4808+
type: string
4809+
operation_id:
4810+
title: Operation Id
4811+
type: string
4812+
required:
4813+
- message
4814+
- operation_id
4815+
- success
4816+
title: RetryOperationResponse
47514817
SourceFactsIncludeOptions:
47524818
description: Options for including source facts for observation-type results.
47534819
properties:

hindsight-clients/go/api_operations.go

Lines changed: 126 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)