79 changes: 68 additions & 11 deletions backend/btrixcloud/basecrawls.py
@@ -190,28 +190,37 @@ async def update_crawl(
return {"updated": True}

async def delete_crawls(
self, org: Organization, delete_list: DeleteCrawlList, type_=None
self, org: Organization, delete_list: DeleteCrawlList, type_: str
):
"""Delete a list of crawls by id for given org"""
cids_to_update = set()
cids_to_update: dict[str, dict[str, int]] = {}

size = 0

for crawl_id in delete_list.crawl_ids:
crawl = await self.get_crawl_raw(crawl_id, org)
size += await self._delete_crawl_files(crawl, org)
Member:
Can filter by crawl type here, since we have the type available. Files are deleted for all crawls, but then we can do:

deleted_size = await self._delete_crawl_files(crawl, org)
if crawl.type == "crawl":
    crawl_size += deleted_size
else:
    upload_size += deleted_size

Member:

Actually, it looks like we're not even checking the type before deleting crawl files here!
Should probably do:

if type_ and crawl.type != type_:
    continue

This is actually a bug in the current version as well, where it'll delete the files regardless of type, but not the actual archived item object!

Member (Author):

Good catch on the missing type check! That's now been added.

I think there are advantages to handling the complexity of deleting multiple types in delete_crawls_all_types rather than delete_crawls. Everywhere else in our app where we delete content other than through the /all-crawls delete endpoint, we're handling one type at a time. And having delete_crawls handle only one type at a time keeps things a bit simpler, e.g. letting inc_org_bytes_stored only need to worry about one archived item type at a time.

I pushed a change to delete_crawls_all_types to split the delete list into crawls and uploads arrays in one pass rather than two, which should help.

There are other improvements to be made to the delete endpoints that I'm planning on doing in a separate pass in this sprint (see #1208), so I'd expect some of this to change and get optimized there, but I don't want to hold up this PR for something that's creeping a bit out of scope just because I discovered some bugs 😅 If that sounds reasonable to you!
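
For reference, a condensed sketch of that flow (simplified from the delete_crawls_all_types diff shown below; the nothing_to_delete and crawl_not_found checks are omitted): the mixed delete list is split by type in a single pass, then delete_crawls runs once per type, and the per-config counts returned for crawls feed stats_recompute_last.

# Sketch only — simplified from the actual delete_crawls_all_types below
crawls_to_delete, uploads_to_delete = await self._split_delete_list_by_type(delete_list, org)

deleted_count = 0
if crawls_to_delete:
    deleted, cids_to_update, quota_reached = await self.delete_crawls(
        org, DeleteCrawlList(crawl_ids=crawls_to_delete), "crawl"
    )
    deleted_count += deleted
    # each entry tracks how many crawls were removed for a config and their total size
    for cid, counts in cids_to_update.items():
        await self.crawl_configs.stats_recompute_last(cid, -counts["size"], -counts["inc"])

if uploads_to_delete:
    deleted, _, quota_reached = await self.delete_crawls(
        org, DeleteCrawlList(crawl_ids=uploads_to_delete), "upload"
    )
    deleted_count += deleted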

if crawl.get("cid"):
cids_to_update.add(crawl.get("cid"))
if crawl.get("type") != type_:
continue

query = {"_id": {"$in": delete_list.crawl_ids}, "oid": org.id}
if type_:
query["type"] = type_
crawl_size = await self._delete_crawl_files(crawl, org)
size += crawl_size

cid = crawl.get("cid")
if cid:
if cids_to_update.get(cid):
cids_to_update[cid]["inc"] += 1
cids_to_update[cid]["size"] += crawl_size
else:
cids_to_update[cid] = {}
cids_to_update[cid]["inc"] = 1
cids_to_update[cid]["size"] = crawl_size

query = {"_id": {"$in": delete_list.crawl_ids}, "oid": org.id, "type": type_}
res = await self.crawls.delete_many(query)

quota_reached = await self.orgs.inc_org_bytes_stored(org.id, -size)
quota_reached = await self.orgs.inc_org_bytes_stored(org.id, -size, type_)

return res.deleted_count, size, cids_to_update, quota_reached
return res.deleted_count, cids_to_update, quota_reached

async def _delete_crawl_files(self, crawl, org: Organization):
"""Delete files associated with crawl from storage."""
@@ -496,13 +505,61 @@ async def delete_crawls_all_types(
self, delete_list: DeleteCrawlList, org: Organization
):
"""Delete uploaded crawls"""
deleted_count, _, _, quota_reached = await self.delete_crawls(org, delete_list)
if len(delete_list.crawl_ids) == 0:
raise HTTPException(status_code=400, detail="nothing_to_delete")

deleted_count = 0
# Value is set in delete calls, but initialize to keep linter happy.
quota_reached = False

crawls_to_delete, uploads_to_delete = await self._split_delete_list_by_type(
delete_list, org
)

if len(crawls_to_delete) > 0:
crawl_delete_list = DeleteCrawlList(crawl_ids=crawls_to_delete)
deleted, cids_to_update, quota_reached = await self.delete_crawls(
org, crawl_delete_list, "crawl"
)
deleted_count += deleted

for cid, cid_dict in cids_to_update.items():
cid_size = cid_dict["size"]
cid_inc = cid_dict["inc"]
await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc)

if len(uploads_to_delete) > 0:
upload_delete_list = DeleteCrawlList(crawl_ids=uploads_to_delete)
deleted, _, quota_reached = await self.delete_crawls(
org, upload_delete_list, "upload"
)
deleted_count += deleted

if deleted_count < 1:
raise HTTPException(status_code=404, detail="crawl_not_found")

return {"deleted": True, "storageQuotaReached": quota_reached}

async def _split_delete_list_by_type(
self, delete_list: DeleteCrawlList, org: Organization
):
"""Return separate crawl and upload arrays from mixed input"""
crawls: list[str] = []
uploads: list[str] = []

for crawl_id in delete_list.crawl_ids:
try:
crawl_raw = await self.get_crawl_raw(crawl_id, org)
crawl_type = crawl_raw.get("type")
if crawl_type == "crawl":
crawls.append(crawl_id)
elif crawl_type == "upload":
uploads.append(crawl_id)
# pylint: disable=broad-exception-caught
except Exception as err:
print(err, flush=True)
return crawls, uploads

async def get_all_crawl_search_values(
self, org: Organization, type_: Optional[str] = None
):
4 changes: 3 additions & 1 deletion backend/btrixcloud/crawlconfigs.py
@@ -503,7 +503,9 @@ async def get_running_crawl(self, crawlconfig: CrawlConfig):

return None

async def stats_recompute_last(self, cid: uuid.UUID, size: int, inc_crawls=1):
async def stats_recompute_last(
self, cid: uuid.UUID, size: int, inc_crawls: int = 1
):
"""recompute stats by incrementing size counter and number of crawls"""
update_query: dict[str, object] = {
"lastCrawlId": None,
11 changes: 5 additions & 6 deletions backend/btrixcloud/crawls.py
@@ -216,18 +216,17 @@ async def delete_crawls(
):
"""Delete a list of crawls by id for given org"""

count, size, cids_to_update, quota_reached = await super().delete_crawls(
count, cids_to_update, quota_reached = await super().delete_crawls(
org, delete_list, type_
)

if count < 1:
raise HTTPException(status_code=404, detail="crawl_not_found")

for cid in cids_to_update:
if not await self.crawl_configs.stats_recompute_last(cid, -size, -1):
raise HTTPException(
status_code=404, detail=f"crawl_config_not_found: {cid}"
)
for cid, cid_dict in cids_to_update.items():
cid_size = cid_dict["size"]
cid_inc = cid_dict["inc"]
await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc)

return {"deleted": True, "storageQuotaReached": quota_reached}

2 changes: 1 addition & 1 deletion backend/btrixcloud/db.py
@@ -15,7 +15,7 @@
from .migrations import BaseMigration


CURR_DB_VERSION = "0016"
CURR_DB_VERSION = "0017"


# ============================================================================
@@ -7,7 +7,7 @@
MIGRATION_VERSION = "0015"


# pylint: disable=too-many-locals
# pylint: disable=too-many-locals, duplicate-code
class Migration(BaseMigration):
"""Migration class."""

73 changes: 73 additions & 0 deletions backend/btrixcloud/migrations/migration_0017_storage_by_type.py
@@ -0,0 +1,73 @@
"""
Migration 0017 - Calculate and store org storage usage by type
"""
from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0017"


# pylint: disable=too-many-locals, duplicate-code
class Migration(BaseMigration):
"""Migration class."""

def __init__(self, mdb, migration_version=MIGRATION_VERSION):
super().__init__(mdb, migration_version)

async def migrate_up(self):
"""Perform migration up.

Calculate and store org storage usage
"""
mdb_orgs = self.mdb["organizations"]
mdb_crawls = self.mdb["crawls"]
mdb_profiles = self.mdb["profiles"]

orgs = [res async for res in mdb_orgs.find({})]
for org in orgs:
oid = org.get("_id")

bytes_stored_crawls = 0
bytes_stored_uploads = 0
bytes_stored_profiles = 0

crawls = [
res
async for res in mdb_crawls.find(
{"oid": oid, "type": {"$in": [None, "crawl"]}}
)
]
for crawl in crawls:
for crawl_file in crawl.get("files", []):
bytes_stored_crawls += crawl_file.get("size", 0)

uploads = [
res async for res in mdb_crawls.find({"oid": oid, "type": "upload"})
]
for upload in uploads:
for upload_file in upload.get("files", []):
bytes_stored_uploads += upload_file.get("size", 0)

profiles = [res async for res in mdb_profiles.find({"oid": oid})]
for profile in profiles:
profile_file = profile.get("resource")
if profile_file:
bytes_stored_profiles += profile_file.get("size", 0)

try:
res = await mdb_orgs.find_one_and_update(
{"_id": oid},
{
"$set": {
"bytesStoredCrawls": bytes_stored_crawls,
"bytesStoredUploads": bytes_stored_uploads,
"bytesStoredProfiles": bytes_stored_profiles,
}
},
)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Unable to set bytes stored by type for org {oid}: {err}",
flush=True,
)
11 changes: 9 additions & 2 deletions backend/btrixcloud/models.py
@@ -666,6 +666,9 @@ class Organization(BaseMongoModel):
usage: Dict[str, int] = {}

bytesStored: int = 0
bytesStoredCrawls: int = 0
bytesStoredUploads: int = 0
bytesStoredProfiles: int = 0

default: bool = False

@@ -744,6 +747,9 @@ class OrgOut(BaseMongoModel):
usage: Optional[Dict[str, int]]
default: bool = False
bytesStored: int
bytesStoredCrawls: int
bytesStoredUploads: int
bytesStoredProfiles: int
origin: Optional[AnyHttpUrl]

webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()
@@ -755,9 +761,10 @@ class OrgMetrics(BaseModel):
"""Organization API metrics model"""

storageUsedBytes: int
storageUsedGB: float
storageUsedCrawls: int
storageUsedUploads: int
storageUsedProfiles: int
storageQuotaBytes: int
storageQuotaGB: float
archivedItemCount: int
crawlCount: int
uploadCount: int
5 changes: 1 addition & 4 deletions backend/btrixcloud/operator.py
@@ -887,10 +887,7 @@ async def do_crawl_finished_tasks(
await self.crawl_config_ops.stats_recompute_last(cid, files_added_size, 1)

if state in SUCCESSFUL_STATES and oid:
await self.org_ops.add_crawl_files_to_org_bytes_stored(
oid, files_added_size
)

await self.org_ops.inc_org_bytes_stored(oid, files_added_size, "crawl")
await self.coll_ops.add_successful_crawl_to_collections(crawl_id, cid)

await self.event_webhook_ops.create_crawl_finished_notification(crawl_id, state)
36 changes: 18 additions & 18 deletions backend/btrixcloud/orgs.py
@@ -37,8 +37,6 @@

DEFAULT_ORG = os.environ.get("DEFAULT_ORG", "My Organization")

BYTES_IN_GB = 1_000_000_000


# ============================================================================
# pylint: disable=too-many-public-methods, too-many-instance-attributes
@@ -266,11 +264,22 @@ async def get_max_pages_per_crawl(self, oid: uuid.UUID):
return org.quotas.maxPagesPerCrawl
return 0

async def inc_org_bytes_stored(self, oid: uuid.UUID, size: int):
async def inc_org_bytes_stored(self, oid: uuid.UUID, size: int, type_="crawl"):
"""Increase org bytesStored count (pass negative value to subtract)."""
await self.orgs.find_one_and_update(
{"_id": oid}, {"$inc": {"bytesStored": size}}
)
if type_ == "crawl":
await self.orgs.find_one_and_update(
{"_id": oid}, {"$inc": {"bytesStored": size, "bytesStoredCrawls": size}}
)
elif type_ == "upload":
await self.orgs.find_one_and_update(
{"_id": oid},
{"$inc": {"bytesStored": size, "bytesStoredUploads": size}},
)
elif type_ == "profile":
await self.orgs.find_one_and_update(
{"_id": oid},
{"$inc": {"bytesStored": size, "bytesStoredProfiles": size}},
)
return await self.storage_quota_reached(oid)

# pylint: disable=invalid-name
@@ -327,20 +336,10 @@ async def get_max_concurrent_crawls(self, oid):
return org.quotas.maxConcurrentCrawls
return 0

async def add_crawl_files_to_org_bytes_stored(self, oid: uuid.UUID, size: int):
"""Add crawl's files to org bytesStored"""
await self.orgs.find_one_and_update(
{"_id": oid}, {"$inc": {"bytesStored": size}}
)

async def get_org_metrics(self, org: Organization):
"""Calculate and return org metrics"""
# pylint: disable=too-many-locals
storage_quota_gb = 0
storage_quota = await self.get_org_storage_quota(org.id)
if storage_quota:
storage_quota_gb = round(storage_quota / BYTES_IN_GB)

max_concurrent_crawls = await self.get_max_concurrent_crawls(org.id)

# Calculate these counts in loop to avoid having db iterate through
@@ -378,9 +377,10 @@ async def get_org_metrics(self, org: Organization):

return {
"storageUsedBytes": org.bytesStored,
"storageUsedGB": round((org.bytesStored / BYTES_IN_GB), 2),
"storageUsedCrawls": org.bytesStoredCrawls,
"storageUsedUploads": org.bytesStoredUploads,
"storageUsedProfiles": org.bytesStoredProfiles,
"storageQuotaBytes": storage_quota,
"storageQuotaGB": storage_quota_gb,
"archivedItemCount": archived_item_count,
"crawlCount": crawl_count,
"uploadCount": upload_count,
6 changes: 4 additions & 2 deletions backend/btrixcloud/profiles.py
@@ -193,7 +193,7 @@ async def commit_to_profile(
{"_id": profile.id}, {"$set": profile.to_dict()}, upsert=True
)

quota_reached = await self.orgs.inc_org_bytes_stored(oid, file_size)
quota_reached = await self.orgs.inc_org_bytes_stored(oid, file_size, "profile")

return {
"added": True,
@@ -310,7 +310,9 @@ async def delete_profile(self, profileid: uuid.UUID, org: Organization):
# Delete file from storage
if profile.resource:
await delete_crawl_file_object(org, profile.resource, self.crawl_manager)
await self.orgs.inc_org_bytes_stored(org.id, -profile.resource.size)
await self.orgs.inc_org_bytes_stored(
org.id, -profile.resource.size, "profile"
)

res = await self.profiles.delete_one(query)
if not res or res.deleted_count != 1:
6 changes: 4 additions & 2 deletions backend/btrixcloud/uploads.py
@@ -180,13 +180,15 @@ async def _create_upload(
self.event_webhook_ops.create_upload_finished_notification(crawl_id)
)

quota_reached = await self.orgs.inc_org_bytes_stored(org.id, file_size)
quota_reached = await self.orgs.inc_org_bytes_stored(
org.id, file_size, "upload"
)

return {"id": crawl_id, "added": True, "storageQuotaReached": quota_reached}

async def delete_uploads(self, delete_list: DeleteCrawlList, org: Organization):
"""Delete uploaded crawls"""
deleted_count, _, _, quota_reached = await self.delete_crawls(
deleted_count, _, quota_reached = await self.delete_crawls(
org, delete_list, "upload"
)
