diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 2cf09a0f85..75f47a9d74 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -464,9 +464,11 @@ async def _resolve_crawl_refs( raise HTTPException(status_code=400, detail="missing_org") if hasattr(crawl, "profileid") and crawl.profileid: - crawl.profileName = await self.crawl_configs.profiles.get_profile_name( + profile = await self.crawl_configs.profiles.get_profile( crawl.profileid, org ) + if profile: + crawl.profileName = profile.name if ( files diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 552493f170..d014d7b315 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -265,13 +265,10 @@ async def add_crawl_config( proxy_id = config_in.proxyId profileid = None + # ensure profile is valid, get proxy_id from profile if isinstance(config_in.profileid, UUID): profileid = config_in.profileid - - # ensure profile is valid, get proxy_id from profile - if profileid: - profile = await self.profiles.get_profile(profileid, org) - proxy_id = profile.proxyId + proxy_id = None else: if config_in.config and config_in.config.failOnContentCheck: raise HTTPException( @@ -280,8 +277,7 @@ async def add_crawl_config( # ensure proxy_id is valid and available for org if proxy_id: - if not self.can_org_use_proxy(org, proxy_id): - raise HTTPException(status_code=404, detail="proxy_not_found") + self.assert_can_org_use_proxy(org, proxy_id) if config_in.config.exclude: exclude = config_in.config.exclude @@ -602,7 +598,15 @@ async def update_crawl_config( and ((not update.profileid) != (not orig_crawl_config.profileid)) ) - changed = changed or (orig_crawl_config.proxyId != update.proxyId) + # either unsetting profile or no profile set on current config + no_profile = update.profileid == "" or not orig_crawl_config.profileid + + changed = changed or ( + no_profile + and update.proxyId is not 
None + and orig_crawl_config.proxyId != update.proxyId + and ((not update.proxyId) != (not orig_crawl_config.proxyId)) + ) metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name") metadata_changed = metadata_changed or self.check_attr_changed( @@ -633,8 +637,6 @@ async def update_crawl_config( last_rev = ConfigRevision(**orig_dict) last_rev = await self.config_revs.insert_one(last_rev.to_dict()) - proxy_id = update.proxyId - # set update query query = update.dict(exclude_unset=True) query["modifiedBy"] = user.id @@ -646,15 +648,15 @@ async def update_crawl_config( query["profileid"] = None # else, ensure its a valid profile elif update.profileid: - profile = await self.profiles.get_profile(cast(UUID, update.profileid), org) + await self.profiles.get_profile(cast(UUID, update.profileid), org) query["profileid"] = update.profileid - proxy_id = profile.proxyId - # don't change the proxy if profile is set, as it should match the profile proxy - elif orig_crawl_config.profileid: - proxy_id = None - if proxy_id is not None: - query["proxyId"] = proxy_id + if no_profile: + if update.proxyId == "": + query["proxyId"] = None + elif update.proxyId: + self.assert_can_org_use_proxy(org, update.proxyId) + query["proxyId"] = update.proxyId if update.config is not None: query["config"] = update.config.dict() @@ -1025,9 +1027,10 @@ async def get_crawl_config_out(self, cid: UUID, org: Organization): await self._add_running_curr_crawl_stats(crawlconfig) if crawlconfig.profileid: - crawlconfig.profileName = await self.profiles.get_profile_name( - crawlconfig.profileid, org - ) + profile = await self.profiles.get_profile(crawlconfig.profileid, org) + if profile: + crawlconfig.profileName = profile.name + crawlconfig.proxyId = profile.proxyId crawlconfig.config.seeds = None @@ -1241,8 +1244,8 @@ async def run_now_internal( else: profile_filename = "" - if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId): - raise 
HTTPException(status_code=404, detail="proxy_not_found") + if crawlconfig.proxyId: + self.assert_can_org_use_proxy(org, crawlconfig.proxyId) storage_filename = ( crawlconfig.crawlFilenameTemplate or self.default_filename_template @@ -1418,6 +1421,11 @@ def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> boo _proxy.shared and org.allowSharedProxies ) or _proxy.id in org.allowedProxies + def assert_can_org_use_proxy(self, org: Organization, proxy: Optional[str]): + """assert that proxy can be used or throw error""" + if proxy and not self.can_org_use_proxy(org, proxy): + raise HTTPException(status_code=400, detail="proxy_not_found") + def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str: """Generate WARC prefix slug from org slug, name or url if no name is provided, hostname is used from url, otherwise diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index c9b403e696..278014deff 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -35,7 +35,7 @@ ) = PageOps = BackgroundJobOps = FileUploadOps = CrawlLogOps = CrawlManager = object -CURR_DB_VERSION = "0052" +CURR_DB_VERSION = "0054" # ============================================================================ diff --git a/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py b/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py new file mode 100644 index 0000000000..25e8a82c7c --- /dev/null +++ b/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py @@ -0,0 +1,37 @@ +""" +Migration 0054 -- clear proxyId on workflows that have profile set +using proxyId from profile always +""" + +from btrixcloud.migrations import BaseMigration + + +MIGRATION_VERSION = "0054" + + +class Migration(BaseMigration): + """Migration class.""" + + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, 
migration_version=MIGRATION_VERSION) + + async def migrate_up(self): + """Perform migration up. + + Unset proxyId on workflows that have a profileid set + """ + crawl_configs = self.mdb["crawl_configs"] + + # Unset proxyId on workflows that have a profileid set + try: + await crawl_configs.update_many( + {"profileid": {"$ne": None}, "proxyId": {"$ne": None}}, + {"$set": {"proxyId": None}}, + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Error updating crawl_configs: {err}", + flush=True, + ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index e729730c2b..05c8502daf 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2428,6 +2428,8 @@ class ProfileFile(BaseFile): class Profile(BaseMongoModel): """Browser profile""" + id: UUID + name: str description: Optional[str] = "" diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 8acb083b73..ea7d5d4f1e 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -1300,6 +1300,10 @@ async def import_org( if not workflow.get("crawlerChannel"): workflow["crawlerChannel"] = "default" + # Ensure proxyId is unset if profile is set + if workflow.get("profileid"): + workflow["proxyId"] = None + crawl_config = CrawlConfig.from_dict(workflow) await self.crawl_configs_db.insert_one(crawl_config.to_dict()) diff --git a/chart/test/test.yaml b/chart/test/test.yaml index b3f62e86d7..aac4cb2117 100644 --- a/chart/test/test.yaml +++ b/chart/test/test.yaml @@ -16,6 +16,12 @@ operator_resync_seconds: 3 qa_scale: 2 +# lower storage sizes +redis_storage: "100Mi" +profile_browser_workdir_size: "100Mi" +crawler_storage: "1Gi" + + # for testing only crawler_extra_cpu_per_browser: 300m