diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 22459e6924..321be79870 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -252,22 +252,25 @@ async def add_crawl_config(
         if self.is_single_page(config_in.config):
             config_in.browserWindows = 1
 
+        proxy_id = config_in.proxyId
+
         profileid = None
         if isinstance(config_in.profileid, UUID):
             profileid = config_in.profileid
 
-        # ensure profile is valid, if provided
+        # ensure profile is valid, get proxy_id from profile
        if profileid:
-            await self.profiles.get_profile(profileid, org)
+            profile = await self.profiles.get_profile(profileid, org)
+            proxy_id = profile.proxyId
         else:
             if config_in.config and config_in.config.failOnContentCheck:
                 raise HTTPException(
                     status_code=400, detail="fail_on_content_check_requires_profile"
                 )
 
-        # ensure proxyId is valid and available for org
-        if config_in.proxyId:
-            if not self.can_org_use_proxy(org, config_in.proxyId):
+        # ensure proxy_id is valid and available for org
+        if proxy_id:
+            if not self.can_org_use_proxy(org, proxy_id):
                 raise HTTPException(status_code=404, detail="proxy_not_found")
 
         if config_in.config.exclude:
@@ -336,7 +339,7 @@ async def add_crawl_config(
             profileid=profileid,
             crawlerChannel=config_in.crawlerChannel,
             crawlFilenameTemplate=config_in.crawlFilenameTemplate,
-            proxyId=config_in.proxyId,
+            proxyId=proxy_id,
             firstSeed=first_seed,
             seedCount=seed_count,
             shareable=config_in.shareable,
@@ -620,6 +623,8 @@ async def update_crawl_config(
             last_rev = ConfigRevision(**orig_dict)
             last_rev = await self.config_revs.insert_one(last_rev.to_dict())
 
+        proxy_id = update.proxyId
+
         # set update query
         query = update.dict(exclude_unset=True)
         query["modifiedBy"] = user.id
@@ -631,8 +636,15 @@ async def update_crawl_config(
             query["profileid"] = None
         # else, ensure its a valid profile
         elif update.profileid:
-            await self.profiles.get_profile(cast(UUID, update.profileid), org)
+            profile = await self.profiles.get_profile(cast(UUID, update.profileid), org)
             query["profileid"] = update.profileid
+            proxy_id = profile.proxyId
+        # don't change the proxy if profile is set, as it should match the profile proxy
+        elif orig_crawl_config.profileid:
+            proxy_id = None
+
+        if proxy_id is not None:
+            query["proxyId"] = proxy_id
 
         if update.config is not None:
             query["config"] = update.config.dict()
@@ -1200,20 +1212,21 @@ async def run_now_internal(
         if await self.get_running_crawl(crawlconfig.id):
             raise HTTPException(status_code=400, detail="crawl_already_running")
 
-        if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
-            raise HTTPException(status_code=404, detail="proxy_not_found")
-
         await self.check_if_too_many_waiting_crawls(org)
 
-        profile_filename, profile_proxy_id = (
-            await self.profiles.get_profile_filename_and_proxy(
-                crawlconfig.profileid, org
+        if crawlconfig.profileid:
+            profile_filename, crawlconfig.proxyId, _ = (
+                await self.profiles.get_profile_filename_proxy_channel(
+                    crawlconfig.profileid, org
+                )
             )
-        )
-        if crawlconfig.profileid and not profile_filename:
-            raise HTTPException(status_code=400, detail="invalid_profile_id")
+            if not profile_filename:
+                raise HTTPException(status_code=400, detail="invalid_profile_id")
+        else:
+            profile_filename = ""
 
-        save_profile_id = self.get_save_profile_id(profile_proxy_id, crawlconfig)
+        if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
+            raise HTTPException(status_code=404, detail="proxy_not_found")
 
         storage_filename = (
             crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -1244,7 +1257,7 @@ async def run_now_internal(
                 warc_prefix=self.get_warc_prefix(org, crawlconfig),
                 storage_filename=storage_filename,
                 profile_filename=profile_filename or "",
-                profileid=save_profile_id,
+                profileid=str(crawlconfig.profileid) if crawlconfig.profileid else "",
                 is_single_page=self.is_single_page(crawlconfig.config),
                 seed_file_url=seed_file_url,
             )
@@ -1256,25 +1269,6 @@ async def run_now_internal(
             print(traceback.format_exc())
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
 
-    def get_save_profile_id(
-        self, profile_proxy_id: str, crawlconfig: CrawlConfig
-    ) -> str:
-        """return profile id if profile should be auto-saved, or empty str if not"""
-        # if no profile, nothing to save
-        if not crawlconfig.profileid:
-            return ""
-
-        # if no proxies, allow saving
-        if not crawlconfig.proxyId and not profile_proxy_id:
-            return str(crawlconfig.profileid)
-
-        # if proxy ids match, allow saving
-        if crawlconfig.proxyId == profile_proxy_id:
-            return str(crawlconfig.profileid)
-
-        # otherwise, don't save
-        return ""
-
     async def check_if_too_many_waiting_crawls(self, org: Organization):
         """if max concurrent crawls are set, limit number of queued crawls to X concurrent limit
         return 429 if at limit"""
diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py
index 729d56eb12..bdf435609d 100644
--- a/backend/btrixcloud/operator/cronjobs.py
+++ b/backend/btrixcloud/operator/cronjobs.py
@@ -125,18 +125,17 @@ async def make_new_crawljob(
         )
         print("Scheduled Crawl Created: " + crawl_id)
 
-        profile_filename, profile_proxy_id = (
-            await self.crawl_config_ops.profiles.get_profile_filename_and_proxy(
-                crawlconfig.profileid, org
+        if crawlconfig.profileid:
+            profile_filename, crawlconfig.proxyId, _ = (
+                await self.crawl_config_ops.profiles.get_profile_filename_proxy_channel(
+                    crawlconfig.profileid, org
+                )
             )
-        )
-        if crawlconfig.profileid and not profile_filename:
-            print(f"error: missing profile {crawlconfig.profileid}")
-            return self.get_finished_response(metadata)
-
-        save_profile_id = self.crawl_config_ops.get_save_profile_id(
-            profile_proxy_id, crawlconfig
-        )
+            if not profile_filename:
+                print(f"error: missing profile {crawlconfig.profileid}")
+                return self.get_finished_response(metadata)
+        else:
+            profile_filename = ""
 
         crawl_id, crawljob = self.k8s.new_crawl_job_yaml(
             cid=str(cid),
@@ -153,7 +152,7 @@ async def make_new_crawljob(
             warc_prefix=warc_prefix,
             storage_filename=self.crawl_config_ops.default_filename_template,
             profile_filename=profile_filename or "",
-            profileid=save_profile_id,
+            profileid=str(crawlconfig.profileid) if crawlconfig.profileid else "",
             proxy_id=crawlconfig.proxyId or "",
             is_single_page=self.crawl_config_ops.is_single_page(crawlconfig.config),
         )
diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py
index 72c5203040..6a3f817f7e 100644
--- a/backend/btrixcloud/profiles.py
+++ b/backend/btrixcloud/profiles.py
@@ -112,9 +112,12 @@ async def create_new_browser(
         prev_profile_path = ""
         prev_profile_id = ""
         prev_proxy_id = ""
+        prev_channel = ""
         if profile_launch.profileId:
-            prev_profile_path, prev_proxy_id = (
-                await self.get_profile_filename_and_proxy(profile_launch.profileId, org)
+            prev_profile_path, prev_proxy_id, prev_channel = (
+                await self.get_profile_filename_proxy_channel(
+                    profile_launch.profileId, org
+                )
             )
 
             if not prev_profile_path:
@@ -122,14 +125,14 @@ async def create_new_browser(
 
             prev_profile_id = str(profile_launch.profileId)
 
-        crawler_image = self.crawlconfigs.get_channel_crawler_image(
-            profile_launch.crawlerChannel
-        )
+        crawler_channel = profile_launch.crawlerChannel or prev_channel
+
+        crawler_image = self.crawlconfigs.get_channel_crawler_image(crawler_channel)
         if not crawler_image:
             raise HTTPException(status_code=404, detail="crawler_not_found")
 
         image_pull_policy = self.crawlconfigs.get_channel_crawler_image_pull_policy(
-            profile_launch.crawlerChannel
+            crawler_channel
         )
 
         # use either specified proxyId or if none, use proxyId from existing profile
@@ -512,23 +515,23 @@ async def get_profile(self, profileid: UUID, org: Organization) -> Profile:
         profile.inUse = await self.crawlconfigs.is_profile_in_use(profileid, org)
         return profile
 
-    async def get_profile_filename_and_proxy(
+    async def get_profile_filename_proxy_channel(
         self, profileid: Optional[UUID], org: Organization
-    ) -> tuple[str, str]:
+    ) -> tuple[str, str, str]:
         """return profile path filename (relative path) for given profile id and org"""
         if not profileid:
-            return "", ""
+            return "", "", ""
 
         try:
             profile = await self.get_profile(profileid, org)
             storage_path = profile.resource.filename if profile.resource else ""
             storage_path = storage_path.lstrip(f"{org.id}/")
-            return storage_path, profile.proxyId or ""
+            return storage_path, profile.proxyId or "", profile.crawlerChannel or ""
         # pylint: disable=bare-except
         except:
             pass
 
-        return "", ""
+        return "", "", ""
 
     async def get_profile_name(self, profileid: UUID, org: Organization) -> str:
         """return profile for given profile id and org"""
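
Note (illustrative, not part of the patch): the behavior this diff implements is that a crawl config's effective proxy now follows its profile. When a profile is set, the profile's proxyId (and its crawlerChannel, for new browser sessions) takes precedence over anything set directly on the config, and the profile id is always passed through so the profile can be auto-saved. The sketch below restates that precedence as a standalone function; resolve_effective_proxy and its tuple argument are hypothetical stand-ins that only mirror the (filename, proxyId, crawlerChannel) contract of get_profile_filename_proxy_channel.

# Minimal sketch (hypothetical helper, not in the patch): a profile's
# proxyId wins over the config-level proxy whenever a profile is attached.
from typing import Optional


def resolve_effective_proxy(
    config_proxy_id: Optional[str],
    profile: Optional[tuple[str, str, str]],  # (filename, proxyId, crawlerChannel)
) -> str:
    """Return the proxy id a crawl should actually run with."""
    if profile is not None:
        filename, profile_proxy_id, _channel = profile
        if not filename:
            # mirrors the patch's invalid_profile_id error path
            raise ValueError("invalid_profile_id")
        # an attached profile always dictates the proxy, even an empty one
        return profile_proxy_id
    return config_proxy_id or ""


# A config-level proxy is ignored once a profile is attached:
assert resolve_effective_proxy("proxy-a", ("profiles/p1.tar.gz", "proxy-b", "")) == "proxy-b"
assert resolve_effective_proxy("proxy-a", None) == "proxy-a"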