79 changes: 68 additions & 11 deletions backend/btrixcloud/basecrawls.py
@@ -190,28 +190,37 @@ async def update_crawl(
return {"updated": True}

async def delete_crawls(
self, org: Organization, delete_list: DeleteCrawlList, type_=None
self, org: Organization, delete_list: DeleteCrawlList, type_: str
):
"""Delete a list of crawls by id for given org"""
cids_to_update = set()
cids_to_update: dict[str, dict[str, int]] = {}

size = 0

for crawl_id in delete_list.crawl_ids:
crawl = await self.get_crawl_raw(crawl_id, org)
size += await self._delete_crawl_files(crawl, org)
Member:
Can filter by crawl type here, since we have the type available. Files are deleted for all crawls, but then we can do:

deleted_size = await self._delete_crawl_files(crawl, org)
if crawl.type == "crawl":
    crawl_size += deleted_size
else:
    upload_size += deleted_size

Member:

Actually, it looks like we're not even checking the type before deleting crawl files here!
Should probably do:

if type_ and crawl.type != type_:
    continue

This is actually a bug in the current version as well, where it'll delete the files regardless of type, but not the actual archived item object!

Member (Author):

Good catch on the missing type check! That's now been added.

I think there are advantages to handling the complexity of deleting multiple types in delete_crawls_all_types rather than delete_crawls. Everywhere else in our app where we delete content other than through the /all-crawls delete endpoint, we're handling one type at a time. And having delete_crawls handle only one type at a time keeps things a bit simpler, e.g. letting inc_org_bytes_stored only need to worry about one archived item type at a time.

I pushed a change to delete_crawls_all_types to split the delete list into crawls and uploads arrays in one pass rather than two, which should help.

There are other improvements to be made to the delete endpoints that I'm planning on doing in a separate pass in this sprint (see #1208), so I'd expect some of this to change and get optimized there, but I don't want to hold up this PR for something that's creeping a bit out of scope just because I discovered some bugs 😅 If that sounds reasonable to you!
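
For reference, a condensed sketch of that flow (simplified from the delete_crawls_all_types diff shown below; the nothing_to_delete and crawl_not_found checks are omitted): the mixed delete list is split by type in a single pass, then delete_crawls runs once per type, and the per-config counts returned for crawls feed stats_recompute_last.

# Sketch only — simplified from the actual delete_crawls_all_types below
crawls_to_delete, uploads_to_delete = await self._split_delete_list_by_type(delete_list, org)

deleted_count = 0
if crawls_to_delete:
    deleted, cids_to_update, quota_reached = await self.delete_crawls(
        org, DeleteCrawlList(crawl_ids=crawls_to_delete), "crawl"
    )
    deleted_count += deleted
    # each entry tracks how many crawls were removed for a config and their total size
    for cid, counts in cids_to_update.items():
        await self.crawl_configs.stats_recompute_last(cid, -counts["size"], -counts["inc"])

if uploads_to_delete:
    deleted, _, quota_reached = await self.delete_crawls(
        org, DeleteCrawlList(crawl_ids=uploads_to_delete), "upload"
    )
    deleted_count += deleted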

if crawl.get("cid"):
cids_to_update.add(crawl.get("cid"))
if crawl.get("type") != type_:
continue

query = {"_id": {"$in": delete_list.crawl_ids}, "oid": org.id}
if type_:
query["type"] = type_
crawl_size = await self._delete_crawl_files(crawl, org)
size += crawl_size

cid = crawl.get("cid")
if cid:
if cids_to_update.get(cid):
cids_to_update[cid]["inc"] += 1
cids_to_update[cid]["size"] += crawl_size
else:
cids_to_update[cid] = {}
cids_to_update[cid]["inc"] = 1
cids_to_update[cid]["size"] = crawl_size

query = {"_id": {"$in": delete_list.crawl_ids}, "oid": org.id, "type": type_}
res = await self.crawls.delete_many(query)

quota_reached = await self.orgs.inc_org_bytes_stored(org.id, -size)
quota_reached = await self.orgs.inc_org_bytes_stored(org.id, -size, type_)

return res.deleted_count, size, cids_to_update, quota_reached
return res.deleted_count, cids_to_update, quota_reached

async def _delete_crawl_files(self, crawl, org: Organization):
"""Delete files associated with crawl from storage."""
@@ -496,13 +505,61 @@ async def delete_crawls_all_types(
self, delete_list: DeleteCrawlList, org: Organization
):
"""Delete uploaded crawls"""
deleted_count, _, _, quota_reached = await self.delete_crawls(org, delete_list)
if len(delete_list.crawl_ids) == 0:
raise HTTPException(status_code=400, detail="nothing_to_delete")

deleted_count = 0
# Value is set in delete calls, but initialize to keep linter happy.
quota_reached = False

crawls_to_delete, uploads_to_delete = await self._split_delete_list_by_type(
delete_list, org
)

if len(crawls_to_delete) > 0:
crawl_delete_list = DeleteCrawlList(crawl_ids=crawls_to_delete)
deleted, cids_to_update, quota_reached = await self.delete_crawls(
org, crawl_delete_list, "crawl"
)
deleted_count += deleted

for cid, cid_dict in cids_to_update.items():
cid_size = cid_dict["size"]
cid_inc = cid_dict["inc"]
await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc)

if len(uploads_to_delete) > 0:
upload_delete_list = DeleteCrawlList(crawl_ids=uploads_to_delete)
deleted, _, quota_reached = await self.delete_crawls(
org, upload_delete_list, "upload"
)
deleted_count += deleted

if deleted_count < 1:
raise HTTPException(status_code=404, detail="crawl_not_found")

return {"deleted": True, "storageQuotaReached": quota_reached}

async def _split_delete_list_by_type(
self, delete_list: DeleteCrawlList, org: Organization
):
"""Return separate crawl and upload arrays from mixed input"""
crawls: list[str] = []
uploads: list[str] = []

for crawl_id in delete_list.crawl_ids:
try:
crawl_raw = await self.get_crawl_raw(crawl_id, org)
crawl_type = crawl_raw.get("type")
if crawl_type == "crawl":
crawls.append(crawl_id)
elif crawl_type == "upload":
uploads.append(crawl_id)
# pylint: disable=broad-exception-caught
except Exception as err:
print(err, flush=True)
return crawls, uploads

async def get_all_crawl_search_values(
self, org: Organization, type_: Optional[str] = None
):
4 changes: 3 additions & 1 deletion backend/btrixcloud/crawlconfigs.py
@@ -503,7 +503,9 @@ async def get_running_crawl(self, crawlconfig: CrawlConfig):

return None

async def stats_recompute_last(self, cid: uuid.UUID, size: int, inc_crawls=1):
async def stats_recompute_last(
self, cid: uuid.UUID, size: int, inc_crawls: int = 1
):
"""recompute stats by incrementing size counter and number of crawls"""
update_query: dict[str, object] = {
"lastCrawlId": None,
11 changes: 5 additions & 6 deletions backend/btrixcloud/crawls.py
@@ -216,18 +216,17 @@ async def delete_crawls(
):
"""Delete a list of crawls by id for given org"""

count, size, cids_to_update, quota_reached = await super().delete_crawls(
count, cids_to_update, quota_reached = await super().delete_crawls(
org, delete_list, type_
)

if count < 1:
raise HTTPException(status_code=404, detail="crawl_not_found")

for cid in cids_to_update:
if not await self.crawl_configs.stats_recompute_last(cid, -size, -1):
raise HTTPException(
status_code=404, detail=f"crawl_config_not_found: {cid}"
)
for cid, cid_dict in cids_to_update.items():
cid_size = cid_dict["size"]
cid_inc = cid_dict["inc"]
await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc)

return {"deleted": True, "storageQuotaReached": quota_reached}

2 changes: 1 addition & 1 deletion backend/btrixcloud/db.py
@@ -15,7 +15,7 @@
from .migrations import BaseMigration


CURR_DB_VERSION = "0016"
CURR_DB_VERSION = "0017"


# ============================================================================
@@ -7,7 +7,7 @@
MIGRATION_VERSION = "0015"


# pylint: disable=too-many-locals
# pylint: disable=too-many-locals, duplicate-code
class Migration(BaseMigration):
"""Migration class."""

73 changes: 73 additions & 0 deletions backend/btrixcloud/migrations/migration_0017_storage_by_type.py
@@ -0,0 +1,73 @@
"""
Migration 0017 - Calculate and store org storage usage by type
"""
from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0017"


# pylint: disable=too-many-locals, duplicate-code
class Migration(BaseMigration):
"""Migration class."""

def __init__(self, mdb, migration_version=MIGRATION_VERSION):
super().__init__(mdb, migration_version)

async def migrate_up(self):
"""Perform migration up.

Calculate and store org storage usage
"""
mdb_orgs = self.mdb["organizations"]
mdb_crawls = self.mdb["crawls"]
mdb_profiles = self.mdb["profiles"]

orgs = [res async for res in mdb_orgs.find({})]
for org in orgs:
oid = org.get("_id")

bytes_stored_crawls = 0
bytes_stored_uploads = 0
bytes_stored_profiles = 0

crawls = [
res
async for res in mdb_crawls.find(
{"oid": oid, "type": {"$in": [None, "crawl"]}}
)
]
for crawl in crawls:
for crawl_file in crawl.get("files", []):
bytes_stored_crawls += crawl_file.get("size", 0)

uploads = [
res async for res in mdb_crawls.find({"oid": oid, "type": "upload"})
]
for upload in uploads:
for upload_file in upload.get("files", []):
bytes_stored_uploads += upload_file.get("size", 0)

profiles = [res async for res in mdb_profiles.find({"oid": oid})]
for profile in profiles:
profile_file = profile.get("resource")
if profile_file:
bytes_stored_profiles += profile_file.get("size", 0)

try:
res = await mdb_orgs.find_one_and_update(
{"_id": oid},
{
"$set": {
"bytesStoredCrawls": bytes_stored_crawls,
"bytesStoredUploads": bytes_stored_uploads,
"bytesStoredProfiles": bytes_stored_profiles,
}
},
)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Unable to set bytes stored by type for org {oid}: {err}",
flush=True,
)
11 changes: 9 additions & 2 deletions backend/btrixcloud/models.py
@@ -666,6 +666,9 @@ class Organization(BaseMongoModel):
usage: Dict[str, int] = {}

bytesStored: int = 0
bytesStoredCrawls: int = 0
bytesStoredUploads: int = 0
bytesStoredProfiles: int = 0

default: bool = False

@@ -744,6 +747,9 @@ class OrgOut(BaseMongoModel):
usage: Optional[Dict[str, int]]
default: bool = False
bytesStored: int
bytesStoredCrawls: int
bytesStoredUploads: int
bytesStoredProfiles: int
origin: Optional[AnyHttpUrl]

webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()
@@ -755,9 +761,10 @@ class OrgMetrics(BaseModel):
"""Organization API metrics model"""

storageUsedBytes: int
storageUsedGB: float
storageUsedCrawls: int
storageUsedUploads: int
storageUsedProfiles: int
storageQuotaBytes: int
storageQuotaGB: float
archivedItemCount: int
crawlCount: int
uploadCount: int
5 changes: 1 addition & 4 deletions backend/btrixcloud/operator.py
@@ -887,10 +887,7 @@ async def do_crawl_finished_tasks(
await self.crawl_config_ops.stats_recompute_last(cid, files_added_size, 1)

if state in SUCCESSFUL_STATES and oid:
await self.org_ops.add_crawl_files_to_org_bytes_stored(
oid, files_added_size
)

await self.org_ops.inc_org_bytes_stored(oid, files_added_size, "crawl")
await self.coll_ops.add_successful_crawl_to_collections(crawl_id, cid)

await self.event_webhook_ops.create_crawl_finished_notification(crawl_id, state)
36 changes: 18 additions & 18 deletions backend/btrixcloud/orgs.py
@@ -37,8 +37,6 @@

DEFAULT_ORG = os.environ.get("DEFAULT_ORG", "My Organization")

BYTES_IN_GB = 1_000_000_000


# ============================================================================
# pylint: disable=too-many-public-methods, too-many-instance-attributes
@@ -266,11 +264,22 @@ async def get_max_pages_per_crawl(self, oid: uuid.UUID):
return org.quotas.maxPagesPerCrawl
return 0

async def inc_org_bytes_stored(self, oid: uuid.UUID, size: int):
async def inc_org_bytes_stored(self, oid: uuid.UUID, size: int, type_="crawl"):
"""Increase org bytesStored count (pass negative value to subtract)."""
await self.orgs.find_one_and_update(
{"_id": oid}, {"$inc": {"bytesStored": size}}
)
if type_ == "crawl":
await self.orgs.find_one_and_update(
{"_id": oid}, {"$inc": {"bytesStored": size, "bytesStoredCrawls": size}}
)
elif type_ == "upload":
await self.orgs.find_one_and_update(
{"_id": oid},
{"$inc": {"bytesStored": size, "bytesStoredUploads": size}},
)
elif type_ == "profile":
await self.orgs.find_one_and_update(
{"_id": oid},
{"$inc": {"bytesStored": size, "bytesStoredProfiles": size}},
)
return await self.storage_quota_reached(oid)

# pylint: disable=invalid-name
@@ -327,20 +336,10 @@ async def get_max_concurrent_crawls(self, oid):
return org.quotas.maxConcurrentCrawls
return 0

async def add_crawl_files_to_org_bytes_stored(self, oid: uuid.UUID, size: int):
"""Add crawl's files to org bytesStored"""
await self.orgs.find_one_and_update(
{"_id": oid}, {"$inc": {"bytesStored": size}}
)

async def get_org_metrics(self, org: Organization):
"""Calculate and return org metrics"""
# pylint: disable=too-many-locals
storage_quota_gb = 0
storage_quota = await self.get_org_storage_quota(org.id)
if storage_quota:
storage_quota_gb = round(storage_quota / BYTES_IN_GB)

max_concurrent_crawls = await self.get_max_concurrent_crawls(org.id)

# Calculate these counts in loop to avoid having db iterate through
@@ -378,9 +377,10 @@ async def get_org_metrics(self, org: Organization):

return {
"storageUsedBytes": org.bytesStored,
"storageUsedGB": round((org.bytesStored / BYTES_IN_GB), 2),
"storageUsedCrawls": org.bytesStoredCrawls,
"storageUsedUploads": org.bytesStoredUploads,
"storageUsedProfiles": org.bytesStoredProfiles,
"storageQuotaBytes": storage_quota,
"storageQuotaGB": storage_quota_gb,
"archivedItemCount": archived_item_count,
"crawlCount": crawl_count,
"uploadCount": upload_count,
6 changes: 4 additions & 2 deletions backend/btrixcloud/profiles.py
@@ -193,7 +193,7 @@ async def commit_to_profile(
{"_id": profile.id}, {"$set": profile.to_dict()}, upsert=True
)

quota_reached = await self.orgs.inc_org_bytes_stored(oid, file_size)
quota_reached = await self.orgs.inc_org_bytes_stored(oid, file_size, "profile")

return {
"added": True,
@@ -310,7 +310,9 @@ async def delete_profile(self, profileid: uuid.UUID, org: Organization):
# Delete file from storage
if profile.resource:
await delete_crawl_file_object(org, profile.resource, self.crawl_manager)
await self.orgs.inc_org_bytes_stored(org.id, -profile.resource.size)
await self.orgs.inc_org_bytes_stored(
org.id, -profile.resource.size, "profile"
)

res = await self.profiles.delete_one(query)
if not res or res.deleted_count != 1:
6 changes: 4 additions & 2 deletions backend/btrixcloud/uploads.py
@@ -180,13 +180,15 @@ async def _create_upload(
self.event_webhook_ops.create_upload_finished_notification(crawl_id)
)

quota_reached = await self.orgs.inc_org_bytes_stored(org.id, file_size)
quota_reached = await self.orgs.inc_org_bytes_stored(
org.id, file_size, "upload"
)

return {"id": crawl_id, "added": True, "storageQuotaReached": quota_reached}

async def delete_uploads(self, delete_list: DeleteCrawlList, org: Organization):
"""Delete uploaded crawls"""
deleted_count, _, _, quota_reached = await self.delete_crawls(
deleted_count, _, quota_reached = await self.delete_crawls(
org, delete_list, "upload"
)
