From b00f59d6a94b712d581a6c78062170dedd2c6e93 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 20 Nov 2025 17:55:14 -0800
Subject: [PATCH 1/6] profile constraint follow-up:

- follow-up to #2988, alternative to #3008
- clear proxyId in workflow if using profile, ensure it is always empty for new workflows/updated workflows that have a profile
- add migration to clear proxyId if profileid is set for existing workflows
- avoids having to update proxyId in workflows when it changes in the profile
- fix assertion when updating proxy to return 400 if proxy is invalid
---
 backend/btrixcloud/basecrawls.py              |  4 +-
 backend/btrixcloud/crawlconfigs.py            | 52 +++++++++++--------
 backend/btrixcloud/db.py                      |  2 +-
 ..._0054_clear_proxyid_when_using_profiles.py | 37 +++++++++++++
 4 files changed, 71 insertions(+), 24 deletions(-)
 create mode 100644 backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py

diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index 2cf09a0f85..75f47a9d74 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -464,9 +464,11 @@ async def _resolve_crawl_refs(
             raise HTTPException(status_code=400, detail="missing_org")
 
         if hasattr(crawl, "profileid") and crawl.profileid:
-            crawl.profileName = await self.crawl_configs.profiles.get_profile_name(
+            profile = await self.crawl_configs.profiles.get_profile(
                 crawl.profileid, org
             )
+            if profile:
+                crawl.profileName = profile.name
 
         if (
             files
diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 552493f170..594ac381b5 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -265,13 +265,10 @@ async def add_crawl_config(
         proxy_id = config_in.proxyId
         profileid = None
 
+        # ensure profile is valid, get proxy_id from profile
         if isinstance(config_in.profileid, UUID):
             profileid = config_in.profileid
-
-        # ensure profile is valid, get proxy_id from profile
-        if profileid:
-            profile = await self.profiles.get_profile(profileid, org)
-            proxy_id = profile.proxyId
+            proxy_id = None
         else:
             if config_in.config and config_in.config.failOnContentCheck:
                 raise HTTPException(
@@ -280,8 +277,7 @@ async def add_crawl_config(
 
         # ensure proxy_id is valid and available for org
         if proxy_id:
-            if not self.can_org_use_proxy(org, proxy_id):
-                raise HTTPException(status_code=404, detail="proxy_not_found")
+            self.assert_can_org_use_proxy(org, proxy_id)
 
         if config_in.config.exclude:
             exclude = config_in.config.exclude
@@ -602,7 +598,15 @@ async def update_crawl_config(
             and ((not update.profileid) != (not orig_crawl_config.profileid))
         )
 
-        changed = changed or (orig_crawl_config.proxyId != update.proxyId)
+        # either unsetting profile or no profile set on current config
+        no_profile = update.profileid == "" or not orig_crawl_config.profileid
+
+        changed = changed or (
+            no_profile
+            and update.proxyId is not None
+            and orig_crawl_config.proxyId != update.proxyId
+            and ((not update.proxyId) != (not orig_crawl_config.proxyId))
+        )
 
         metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name")
         metadata_changed = metadata_changed or self.check_attr_changed(
@@ -633,8 +637,6 @@ async def update_crawl_config(
             last_rev = ConfigRevision(**orig_dict)
             last_rev = await self.config_revs.insert_one(last_rev.to_dict())
 
-        proxy_id = update.proxyId
-
         # set update query
         query = update.dict(exclude_unset=True)
         query["modifiedBy"] = user.id
@@ -646,15 +648,15 @@
             query["profileid"] = None
         # else, ensure its a valid profile
         elif update.profileid:
-            profile = await self.profiles.get_profile(cast(UUID, update.profileid), org)
+            await self.profiles.get_profile(cast(UUID, update.profileid), org)
             query["profileid"] = update.profileid
-            proxy_id = profile.proxyId
-        # don't change the proxy if profile is set, as it should match the profile proxy
-        elif orig_crawl_config.profileid:
-            proxy_id = None
 
-        if proxy_id is not None:
-            query["proxyId"] = proxy_id
+        if no_profile:
+            if update.proxyId == "":
+                query["proxyId"] = None
+            elif update.proxyId:
+                self.assert_can_org_use_proxy(org, update.proxyId)
+                query["proxyId"] = update.proxyId
 
         if update.config is not None:
             query["config"] = update.config.dict()
@@ -1025,9 +1027,10 @@ async def get_crawl_config_out(self, cid: UUID, org: Organization):
         await self._add_running_curr_crawl_stats(crawlconfig)
 
         if crawlconfig.profileid:
-            crawlconfig.profileName = await self.profiles.get_profile_name(
-                crawlconfig.profileid, org
-            )
+            profile = await self.profiles.get_profile(crawlconfig.profileid, org)
+            if profile:
+                crawlconfig.profileName = profile.name
+                crawlconfig.proxyId = profile.proxyId
 
         crawlconfig.config.seeds = None
 
@@ -1241,8 +1244,8 @@ async def run_now_internal(
         else:
             profile_filename = ""
 
-        if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
-            raise HTTPException(status_code=404, detail="proxy_not_found")
+        if crawlconfig.proxyId:
+            self.assert_can_org_use_proxy(org, crawlconfig.proxyId)
 
         storage_filename = (
             crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -1418,6 +1421,11 @@ def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> boo
             _proxy.shared and org.allowSharedProxies
         ) or _proxy.id in org.allowedProxies
 
+    def assert_can_org_use_proxy(self, org: Organization, proxy: str):
+        """assert that proxy can be used or throw error"""
+        if self.can_org_use_proxy(org, proxy):
+            raise HTTPException(status_code=400, detail="proxy_not_found")
+
     def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
         """Generate WARC prefix slug from org slug, name or url
         if no name is provided, hostname is used from url, otherwise
diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
index c9b403e696..278014deff 100644
--- a/backend/btrixcloud/db.py
+++ b/backend/btrixcloud/db.py
@@ -35,7 +35,7 @@
 ) = PageOps = BackgroundJobOps = FileUploadOps = CrawlLogOps = CrawlManager = object
 
 
-CURR_DB_VERSION = "0052"
+CURR_DB_VERSION = "0054"
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py b/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py
new file mode 100644
index 0000000000..25e8a82c7c
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py
@@ -0,0 +1,37 @@
+"""
+Migration 0054 -- clear proxyId on workflows that have profile set
+using proxyId from profile always
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0054"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Unset proxyId on workflows that have a profileid set
+        """
+        crawl_configs = self.mdb["crawl_configs"]
+
+        # Clear proxyId on workflows that have a profileid set
+        try:
+            await crawl_configs.update_many(
+                {"profileid": {"$ne": None}, "proxyId": {"$ne": None}},
+                {"$set": {"proxyId": None}},
+            )
+        # pylint: disable=broad-exception-caught
+        except Exception as err:
+            print(
+                f"Error updating crawl_configs: {err}",
+                flush=True,
+            )

From a4a2445c7f97c492998f753bfff3f076b8096f42 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 21 Nov 2025 15:24:21 -0800
Subject: [PATCH 2/6] fix typo!

---
 backend/btrixcloud/crawlconfigs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 594ac381b5..7991c46b64 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -1423,7 +1423,7 @@ def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> boo
 
     def assert_can_org_use_proxy(self, org: Organization, proxy: str):
         """assert that proxy can be used or throw error"""
-        if self.can_org_use_proxy(org, proxy):
+        if not self.can_org_use_proxy(org, proxy):
             raise HTTPException(status_code=400, detail="proxy_not_found")
 
     def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:

From ad66915d991eb2a3dbf812f1749b83b7f486ad89 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 21 Nov 2025 15:26:45 -0800
Subject: [PATCH 3/6] none check

---
 backend/btrixcloud/crawlconfigs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 7991c46b64..d014d7b315 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -1421,9 +1421,9 @@ def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> boo
             _proxy.shared and org.allowSharedProxies
         ) or _proxy.id in org.allowedProxies
 
-    def assert_can_org_use_proxy(self, org: Organization, proxy: str):
+    def assert_can_org_use_proxy(self, org: Organization, proxy: Optional[str]):
         """assert that proxy can be used or throw error"""
-        if not self.can_org_use_proxy(org, proxy):
+        if proxy and not self.can_org_use_proxy(org, proxy):
             raise HTTPException(status_code=400, detail="proxy_not_found")
 
     def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:

From 8ce01dc96830e8eefddb58a1b9ff379fa3c8b107 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 22 Nov 2025 17:09:08 -0800
Subject: [PATCH 4/6] org import: attempt to debug test failure

also clear proxyId if profileid is set
---
 backend/btrixcloud/orgs.py                | 4 ++++
 backend/test/test_y_org_import_export.py | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py
index 8acb083b73..ea7d5d4f1e 100644
--- a/backend/btrixcloud/orgs.py
+++ b/backend/btrixcloud/orgs.py
@@ -1300,6 +1300,10 @@ async def import_org(
             if not workflow.get("crawlerChannel"):
                 workflow["crawlerChannel"] = "default"
 
+            # Ensure proxyId is unset if profile is set
+            if workflow.get("profileid"):
+                workflow["proxyId"] = None
+
             crawl_config = CrawlConfig.from_dict(workflow)
             await self.crawl_configs_db.insert_one(crawl_config.to_dict())
 
diff --git a/backend/test/test_y_org_import_export.py b/backend/test/test_y_org_import_export.py
index a259dd9383..43b12db17d 100644
--- a/backend/test/test_y_org_import_export.py
+++ b/backend/test/test_y_org_import_export.py
@@ -172,7 +172,8 @@ def test_import_org(admin_auth_headers):
         f"{API_PREFIX}/orgs/{ORG_FIXTURE_UUID}/all-crawls",
         headers=admin_auth_headers,
     )
-    assert r.status_code == 200
+    print(r.text())
+    #assert r.status_code == 200
     data = r.json()
     assert data["total"] == 4
 

From c667eded815e7df9fcf16adaf8525987b0dfee93 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 22 Nov 2025 17:55:35 -0800
Subject: [PATCH 5/6] fix

---
 backend/test/test_y_org_import_export.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/test/test_y_org_import_export.py b/backend/test/test_y_org_import_export.py
index 43b12db17d..104debb60b 100644
--- a/backend/test/test_y_org_import_export.py
+++ b/backend/test/test_y_org_import_export.py
@@ -172,7 +172,7 @@ def test_import_org(admin_auth_headers):
         f"{API_PREFIX}/orgs/{ORG_FIXTURE_UUID}/all-crawls",
         headers=admin_auth_headers,
     )
-    print(r.text())
+    print(r.text)
     #assert r.status_code == 200
     data = r.json()
     assert data["total"] == 4

From b40294c28624bb9eeb8f85a72c9ecac697c663b2 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 22 Nov 2025 18:56:20 -0800
Subject: [PATCH 6/6] add 'id: UUID' to Profile to fix import

---
 backend/btrixcloud/models.py              | 2 ++
 backend/test/test_y_org_import_export.py | 3 +--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index e729730c2b..05c8502daf 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -2428,6 +2428,8 @@ class ProfileFile(BaseFile):
 class Profile(BaseMongoModel):
     """Browser profile"""
 
+    id: UUID
+
     name: str
     description: Optional[str] = ""
 
diff --git a/backend/test/test_y_org_import_export.py b/backend/test/test_y_org_import_export.py
index 104debb60b..a259dd9383 100644
--- a/backend/test/test_y_org_import_export.py
+++ b/backend/test/test_y_org_import_export.py
@@ -172,8 +172,7 @@ def test_import_org(admin_auth_headers):
         f"{API_PREFIX}/orgs/{ORG_FIXTURE_UUID}/all-crawls",
         headers=admin_auth_headers,
     )
-    print(r.text)
-    #assert r.status_code == 200
+    assert r.status_code == 200
     data = r.json()
     assert data["total"] == 4
 
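
A minimal standalone sketch of the precedence rule these patches implement (a set profile always wins, so the workflow's proxyId is cleared; a standalone proxyId must be usable by the org or the API returns 400 proxy_not_found). The names below (resolve_proxy_id, ProxyNotAllowed, allowed) are hypothetical stand-ins for illustration only, not the actual btrixcloud helpers:

from typing import Optional, Set


class ProxyNotAllowed(Exception):
    """Stand-in for HTTPException(status_code=400, detail="proxy_not_found")."""


def resolve_proxy_id(
    profileid: Optional[str], proxy_id: Optional[str], allowed: Set[str]
) -> Optional[str]:
    # A profile supplies the proxy at crawl time, so the workflow's own
    # proxyId is cleared whenever a profile is set.
    if profileid:
        return None
    # Otherwise the proxy must be available to the org, else reject.
    if proxy_id and proxy_id not in allowed:
        raise ProxyNotAllowed(proxy_id)
    return proxy_id


assert resolve_proxy_id("profile-1", "proxy-a", {"proxy-a"}) is None
assert resolve_proxy_id(None, "proxy-a", {"proxy-a"}) == "proxy-a"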