4 changes: 3 additions & 1 deletion backend/btrixcloud/basecrawls.py
@@ -464,9 +464,11 @@ async def _resolve_crawl_refs(
raise HTTPException(status_code=400, detail="missing_org")

if hasattr(crawl, "profileid") and crawl.profileid:
crawl.profileName = await self.crawl_configs.profiles.get_profile_name(
profile = await self.crawl_configs.profiles.get_profile(
crawl.profileid, org
)
if profile:
crawl.profileName = profile.name

if (
files
52 changes: 30 additions & 22 deletions backend/btrixcloud/crawlconfigs.py
@@ -265,13 +265,10 @@ async def add_crawl_config(
proxy_id = config_in.proxyId

profileid = None
# ensure profile is valid, get proxy_id from profile
if isinstance(config_in.profileid, UUID):
profileid = config_in.profileid

# ensure profile is valid, get proxy_id from profile
if profileid:
profile = await self.profiles.get_profile(profileid, org)
proxy_id = profile.proxyId
proxy_id = None
else:
if config_in.config and config_in.config.failOnContentCheck:
raise HTTPException(
@@ -280,8 +277,7 @@

# ensure proxy_id is valid and available for org
if proxy_id:
if not self.can_org_use_proxy(org, proxy_id):
raise HTTPException(status_code=404, detail="proxy_not_found")
self.assert_can_org_use_proxy(org, proxy_id)

if config_in.config.exclude:
exclude = config_in.config.exclude
@@ -602,7 +598,15 @@ async def update_crawl_config(
and ((not update.profileid) != (not orig_crawl_config.profileid))
)

changed = changed or (orig_crawl_config.proxyId != update.proxyId)
# either unsetting profile or no profile set on current config
no_profile = update.profileid == "" or not orig_crawl_config.profileid

changed = changed or (
no_profile
and update.proxyId is not None
and orig_crawl_config.proxyId != update.proxyId
and ((not update.proxyId) != (not orig_crawl_config.proxyId))
)

metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name")
metadata_changed = metadata_changed or self.check_attr_changed(
@@ -633,8 +637,6 @@ async def update_crawl_config(
last_rev = ConfigRevision(**orig_dict)
last_rev = await self.config_revs.insert_one(last_rev.to_dict())

proxy_id = update.proxyId

# set update query
query = update.dict(exclude_unset=True)
query["modifiedBy"] = user.id
@@ -646,15 +648,15 @@
query["profileid"] = None
# else, ensure its a valid profile
elif update.profileid:
profile = await self.profiles.get_profile(cast(UUID, update.profileid), org)
await self.profiles.get_profile(cast(UUID, update.profileid), org)
query["profileid"] = update.profileid
proxy_id = profile.proxyId
# don't change the proxy if profile is set, as it should match the profile proxy
elif orig_crawl_config.profileid:
proxy_id = None

if proxy_id is not None:
query["proxyId"] = proxy_id
if no_profile:
if update.proxyId == "":
query["proxyId"] = None
elif update.proxyId:
self.assert_can_org_use_proxy(org, update.proxyId)
query["proxyId"] = update.proxyId

if update.config is not None:
query["config"] = update.config.dict()
@@ -1025,9 +1027,10 @@ async def get_crawl_config_out(self, cid: UUID, org: Organization):
await self._add_running_curr_crawl_stats(crawlconfig)

if crawlconfig.profileid:
crawlconfig.profileName = await self.profiles.get_profile_name(
crawlconfig.profileid, org
)
profile = await self.profiles.get_profile(crawlconfig.profileid, org)
if profile:
crawlconfig.profileName = profile.name
crawlconfig.proxyId = profile.proxyId

crawlconfig.config.seeds = None

@@ -1241,8 +1244,8 @@ async def run_now_internal(
else:
profile_filename = ""

if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
raise HTTPException(status_code=404, detail="proxy_not_found")
if crawlconfig.proxyId:
self.assert_can_org_use_proxy(org, crawlconfig.proxyId)

storage_filename = (
crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -1418,6 +1421,11 @@ def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> bool:
_proxy.shared and org.allowSharedProxies
) or _proxy.id in org.allowedProxies

def assert_can_org_use_proxy(self, org: Organization, proxy: Optional[str]):
"""assert that proxy can be used or throw error"""
if proxy and not self.can_org_use_proxy(org, proxy):
raise HTTPException(status_code=400, detail="proxy_not_found")

def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
"""Generate WARC prefix slug from org slug, name or url
if no name is provided, hostname is used from url, otherwise
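The net effect of the update_crawl_config changes above is a three-way rule for proxyId that only applies when no profile is in play: an explicit empty string clears it, a non-empty id is validated via assert_can_org_use_proxy and stored, and an unset field leaves the stored value untouched. A minimal standalone sketch of that rule (the helper name resolve_proxy_update and the org_can_use callback are illustrative, not part of the PR):

```python
# Condensed sketch of the new proxyId update rules. Assumes update.proxyId is
# "" to clear, a proxy id string to set, or None/unset to leave unchanged.
from typing import Callable, Optional

from fastapi import HTTPException


def resolve_proxy_update(
    no_profile: bool,
    proxy_update: Optional[str],
    org_can_use: Callable[[str], bool],
) -> dict:
    """Return the proxyId fields to merge into the Mongo update query."""
    query: dict = {}
    if not no_profile:
        # a profile is set and not being cleared: proxy always follows the profile
        return query
    if proxy_update == "":
        query["proxyId"] = None          # explicit clear
    elif proxy_update:
        if not org_can_use(proxy_update):
            raise HTTPException(status_code=400, detail="proxy_not_found")
        query["proxyId"] = proxy_update  # validated set
    # None / unset: proxyId stays as stored
    return query
```

Because the update query is built with `update.dict(exclude_unset=True)`, the empty-string convention is what lets the API distinguish "clear the proxy" from "field not provided" in a PATCH-style update.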
2 changes: 1 addition & 1 deletion backend/btrixcloud/db.py
@@ -35,7 +35,7 @@
) = PageOps = BackgroundJobOps = FileUploadOps = CrawlLogOps = CrawlManager = object


CURR_DB_VERSION = "0052"
CURR_DB_VERSION = "0054"


# ============================================================================
@@ -0,0 +1,37 @@
"""
Migration 0054 -- clear proxyId on workflows that have profile set
using proxyId from profile always
"""

from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0054"


class Migration(BaseMigration):
"""Migration class."""

# pylint: disable=unused-argument
def __init__(self, mdb, **kwargs):
super().__init__(mdb, migration_version=MIGRATION_VERSION)

async def migrate_up(self):
"""Perform migration up.

Unset proxyId on workflows that have a profileid set
"""
crawl_configs = self.mdb["crawl_configs"]

# Unset proxyId on workflows that also have a profileid set
try:
await crawl_configs.update_many(
{"profileid": {"$ne": None}, "proxyId": {"$ne": None}},
{"$set": {"proxyId": None}},
)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Error update crawl_configs: {err}",
flush=True,
)
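A quick way to sanity-check this migration is to count the workflows it would touch before running it and confirm the count drops to zero afterwards. A hypothetical verification snippet, not part of the PR; the connection URL and database name below are assumptions:

```python
# Hypothetical check for migration 0054: count workflows that still have both
# profileid and proxyId set, using the same filter as the migration itself.
import asyncio

from motor.motor_asyncio import AsyncIOMotorClient


async def count_affected(
    mongo_url: str = "mongodb://localhost:27017",  # placeholder connection URL
    db_name: str = "crawls",                       # assumed database name
) -> int:
    mdb = AsyncIOMotorClient(mongo_url)[db_name]
    return await mdb["crawl_configs"].count_documents(
        {"profileid": {"$ne": None}, "proxyId": {"$ne": None}}
    )


if __name__ == "__main__":
    print("workflows to migrate:", asyncio.run(count_affected()))
```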