From e0b82e0fa2cfe5e98be05ca00441a7555f52d7e8 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Mon, 17 Nov 2025 15:35:15 -0500
Subject: [PATCH] Update crawl db object after deleting files

After deleting files (e.g. WACZs uploaded while a crawl was paused) for
canceled or otherwise failed crawls, ensure we also update the crawl
database object.

This fixes a regression introduced by crawl pausing, which resulted in
org storage numbers being incorrect when later deleting the canceled
crawl, because its files were not removed from the database at the same
time as they were deleted from storage.

It also renames the basecrawls method from delete_crawl_files to
delete_failed_crawl_files to make its purpose clearer, as it is only
used by the operator and should only be used for failed crawls.
---
 backend/btrixcloud/basecrawls.py      | 16 +++++++++++++---
 backend/btrixcloud/operator/crawls.py |  2 +-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index d94717786d..2cf09a0f85 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -426,11 +426,21 @@ async def _delete_crawl_files(
 
         return size
 
-    async def delete_crawl_files(self, crawl_id: str, oid: UUID):
-        """Delete crawl files"""
+    async def delete_failed_crawl_files(self, crawl_id: str, oid: UUID):
+        """Delete crawl files for failed crawl"""
         crawl = await self.get_base_crawl(crawl_id)
         org = await self.orgs.get_org_by_id(oid)
-        return await self._delete_crawl_files(crawl, org)
+        await self._delete_crawl_files(crawl, org)
+        await self.crawls.find_one_and_update(
+            {"_id": crawl_id, "oid": oid},
+            {
+                "$set": {
+                    "files": [],
+                    "fileCount": 0,
+                    "fileSize": 0,
+                }
+            },
+        )
 
     async def delete_all_crawl_qa_files(self, crawl_id: str, org: Organization):
         """Delete files for all qa runs in a crawl"""
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
index c7f9861ee6..01d81ca407 100644
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -1707,7 +1707,7 @@ async def do_crawl_finished_tasks(
         )
 
         if state in FAILED_STATES:
-            await self.crawl_ops.delete_crawl_files(crawl.id, crawl.oid)
+            await self.crawl_ops.delete_failed_crawl_files(crawl.id, crawl.oid)
             await self.page_ops.delete_crawl_pages(crawl.id, crawl.oid)
 
         await self.event_webhook_ops.create_crawl_finished_notification(
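
Reviewer note (not part of the patch): below is a minimal sketch of a
regression test for the new behavior, assuming pytest with
pytest-asyncio. The crawl_ops fixture and make_failed_crawl_with_files
helper are hypothetical stand-ins for whatever the test suite provides;
only delete_failed_crawl_files(), get_base_crawl(), and the files /
fileCount / fileSize fields come from the patch itself.

import pytest

@pytest.mark.asyncio
async def test_failed_crawl_files_cleared(crawl_ops, make_failed_crawl_with_files):
    # Hypothetical fixture helper: creates a canceled/failed crawl whose
    # WACZ files exist both in storage and on the crawl db object.
    crawl_id, oid = await make_failed_crawl_with_files()

    # Delete the files from storage; the patched method should now also
    # clear the file metadata on the crawl document.
    await crawl_ops.delete_failed_crawl_files(crawl_id, oid)

    # With the old delete_crawl_files(), these assertions would fail:
    # stale file entries stayed on the crawl document, leaving org
    # storage numbers incorrect when the crawl was later deleted.
    crawl = await crawl_ops.get_base_crawl(crawl_id)
    assert crawl.files == []
    assert crawl.fileCount == 0
    assert crawl.fileSize == 0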