49 changes: 37 additions & 12 deletions backend/btrixcloud/crawls.py
@@ -18,7 +18,7 @@
from .db import BaseMongoModel
from .users import User
from .orgs import Organization, MAX_CRAWL_SCALE
-from .storages import get_presigned_url
+from .storages import get_presigned_url, delete_crawl_file_object


# ============================================================================
@@ -352,13 +352,26 @@ async def _update_presigned(self, updates):
for update in updates:
await self.crawls.find_one_and_update(*update)

-async def delete_crawls(self, oid: uuid.UUID, delete_list: DeleteCrawlList):
+async def delete_crawls(self, org: Organization, delete_list: DeleteCrawlList):
"""Delete a list of crawls by id for given org"""
for crawl_id in delete_list.crawl_ids:
await self._delete_crawl_files(org, crawl_id)

res = await self.crawls.delete_many(
{"_id": {"$in": delete_list.crawl_ids}, "oid": oid}
{"_id": {"$in": delete_list.crawl_ids}, "oid": org.id}
)

return res.deleted_count

async def _delete_crawl_files(self, org: Organization, crawl_id: str):
"""Delete files associated with crawl from storage."""
crawl_raw = await self.get_crawl_raw(crawl_id, org)
crawl = Crawl.from_dict(crawl_raw)
for file_ in crawl.files:
status_code = await delete_crawl_file_object(org, file_, self.crawl_manager)
if status_code != 204:
raise HTTPException(status_code=400, detail="file_deletion_error")

async def add_new_crawl(self, crawl_id: str, crawlconfig):
"""initialize new crawl"""
crawl = Crawl(
@@ -651,17 +664,29 @@ async def crawl_graceful_stop(crawl_id, org: Organization = Depends(org_crawl_dep)):

@app.post("/orgs/{oid}/crawls/delete", tags=["crawls"])
async def delete_crawls(
-delete_list: DeleteCrawlList, org: Organization = Depends(org_crawl_dep)
+delete_list: DeleteCrawlList,
+user: User = Depends(user_dep),
+org: Organization = Depends(org_crawl_dep),
):
-try:
-for crawl_id in delete_list:
-await crawl_manager.stop_crawl(crawl_id, org.id, graceful=False)
-
-except Exception as exc:
-# pylint: disable=raise-missing-from
-raise HTTPException(status_code=400, detail=f"Error Stopping Crawl: {exc}")
+# Ensure user has appropriate permissions for all crawls in list:
+# - Crawler users can delete their own crawls
+# - Org owners can delete any crawls in org
+for crawl_id in delete_list.crawl_ids:
+crawl_raw = await ops.get_crawl_raw(crawl_id, org)
+crawl = Crawl.from_dict(crawl_raw)
+if (crawl.userid != user.id) and not org.is_owner(user):
+raise HTTPException(status_code=403, detail="Not Allowed")
+
+if not crawl.finished:
+try:
+await ops.shutdown_crawl(crawl_id, org, graceful=False)
+except Exception as exc:
+# pylint: disable=raise-missing-from
+raise HTTPException(
+status_code=400, detail=f"Error Stopping Crawl: {exc}"
+)

-res = await ops.delete_crawls(org.id, delete_list)
+res = await ops.delete_crawls(org, delete_list)

return {"deleted": res}

26 changes: 26 additions & 0 deletions backend/btrixcloud/storages.py
@@ -122,3 +122,29 @@ async def get_presigned_url(org, crawlfile, crawl_manager, duration=3600):
)

return presigned_url


# ============================================================================
async def delete_crawl_file_object(org, crawlfile, crawl_manager):
"""delete crawl file from storage."""
status_code = None

if crawlfile.def_storage_name:
s3storage = await crawl_manager.get_default_storage(crawlfile.def_storage_name)

elif org.storage.type == "s3":
s3storage = org.storage

else:
raise TypeError("No Default Storage Found, Invalid Storage Type")

async with get_s3_client(s3storage, s3storage.use_access_for_presign) as (
client,
bucket,
key,
):
key += crawlfile.filename
response = await client.delete_object(Bucket=bucket, Key=key)
status_code = response["ResponseMetadata"]["HTTPStatusCode"]

return status_code
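
Side note on the 204 check: S3's DeleteObject returns HTTP 204 (No Content) on success, typically even when the key did not exist, which is why _delete_crawl_files in crawls.py treats any other status code as an error. A minimal standalone sketch of the same call pattern using aiobotocore directly (illustrative only; bucket, key, and endpoint are placeholders, and get_s3_client may wrap the client differently):

# Illustrative sketch only -- not btrixcloud code; names are placeholders.
import asyncio
from aiobotocore.session import get_session

async def delete_object(bucket: str, key: str, endpoint_url: str) -> int:
    session = get_session()
    # Credentials are taken from the environment here.
    async with session.create_client("s3", endpoint_url=endpoint_url) as client:
        response = await client.delete_object(Bucket=bucket, Key=key)
        return response["ResponseMetadata"]["HTTPStatusCode"]

# Expected to print 204 on success:
# print(asyncio.run(delete_object("my-bucket", "crawls/example.wacz", "http://localhost:9000")))
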
28 changes: 28 additions & 0 deletions backend/test/conftest.py
@@ -202,6 +202,34 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id):
time.sleep(5)


@pytest.fixture(scope="session")
def wr_specs_crawl_id(crawler_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Webrecorder Specs sample crawl",
"config": {"seeds": ["https://specs.webrecorder.net/"], "limit": 1},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=crawl_data,
)
data = r.json()

crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=crawler_auth_headers,
)
data = r.json()
if data["state"] == "complete":
return crawl_id
time.sleep(5)


@pytest.fixture(scope="session")
def crawler_config_id(crawler_crawl_id):
return _crawler_config_id
71 changes: 71 additions & 0 deletions backend/test/test_run_crawl.py
@@ -150,3 +150,74 @@ def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
data = r.json()
assert data["tags"] == []
assert not data["notes"]


def test_delete_crawls_crawler(
crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
# Test that crawler user can't delete another user's crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=crawler_auth_headers,
json={"crawl_ids": [admin_crawl_id]},
)
assert r.status_code == 403
data = r.json()
assert data["detail"] == "Not Allowed"

# Test that crawler user can delete own crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=crawler_auth_headers,
json={"crawl_ids": [crawler_crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"] == 1

r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
headers=crawler_auth_headers,
)
assert r.status_code == 404


def test_delete_crawls_org_owner(
admin_auth_headers,
crawler_auth_headers,
default_org_id,
admin_crawl_id,
crawler_crawl_id,
wr_specs_crawl_id,
):
# Test that org owner can delete own crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=admin_auth_headers,
json={"crawl_ids": [admin_crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"]

r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 404

# Test that org owner can delete another org user's crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=admin_auth_headers,
json={"crawl_ids": [wr_specs_crawl_id]},
)
assert r.status_code == 200
data = r.json()
assert data["deleted"] == 1

r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{wr_specs_crawl_id}",
headers=admin_auth_headers,
)
assert r.status_code == 404
64 changes: 64 additions & 0 deletions backend/test_nightly/conftest.py
@@ -41,3 +41,67 @@ def default_org_id(admin_auth_headers):
except:
print("Waiting for default org id")
time.sleep(5)


@pytest.fixture(scope="session")
def crawl_id_wr(admin_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Webrecorder admin test crawl",
"tags": ["wr", "nightly testing"],
"config": {
"seeds": ["https://webrecorder.net/"],
"limit": 1,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()

crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] == "complete":
return crawl_id
time.sleep(5)


@pytest.fixture(scope="session")
def crawl_id_wr_specs(admin_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Webrecorder Specs admin test crawl",
"tags": ["wr-specs", "nightly testing"],
"config": {
"seeds": ["https://specs.webrecorder.net/"],
"limit": 1,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()

crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] == "complete":
return crawl_id
time.sleep(5)
88 changes: 88 additions & 0 deletions backend/test_nightly/test_delete_crawls.py
@@ -0,0 +1,88 @@
import os
import requests
import time

from .conftest import API_PREFIX, HOST_PREFIX


def test_delete_crawls(
tmp_path, admin_auth_headers, default_org_id, crawl_id_wr, crawl_id_wr_specs
):
# Check that crawls have associated files
crawl_resource_urls = []

def _file_is_retrievable(url):
"""Attempt to retrieve file at url and return True or False."""
file_path = str(tmp_path / "test_download")
if os.path.exists(file_path):
os.remove(file_path)

r = requests.get(f"{HOST_PREFIX}{url}")
if not r.status_code == 200:
return False

with open(file_path, "wb") as fd:
fd.write(r.content)

if not (os.path.isfile(file_path) and os.path.getsize(file_path) > 0):
return False

os.remove(file_path)
return True

r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
resources = data["resources"]
assert resources
for resource in resources:
crawl_resource_urls.append(resource["path"])

r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
resources = data["resources"]
assert resources
for resource in resources:
crawl_resource_urls.append(resource["path"])

# Test retrieving resources
for url in crawl_resource_urls:
assert _file_is_retrievable(url)

# Delete crawls
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
headers=admin_auth_headers,
json={"crawl_ids": [crawl_id_wr, crawl_id_wr_specs]},
)
assert r.status_code == 200
data = r.json()

assert data["deleted"] == 2

# Verify that crawls don't exist in db
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 404

r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 404

# Give Minio time to delete the files
time.sleep(120)

# Verify that files are no longer retrievable from storage
for url in crawl_resource_urls:
assert not _file_is_retrievable(url)