68 changes: 31 additions & 37 deletions backend/btrixcloud/crawlconfigs.py
@@ -252,22 +252,25 @@ async def add_crawl_config(
         if self.is_single_page(config_in.config):
             config_in.browserWindows = 1
 
+        proxy_id = config_in.proxyId
+
         profileid = None
         if isinstance(config_in.profileid, UUID):
             profileid = config_in.profileid
 
-        # ensure profile is valid, if provided
+        # ensure profile is valid, get proxy_id from profile
         if profileid:
-            await self.profiles.get_profile(profileid, org)
+            profile = await self.profiles.get_profile(profileid, org)
+            proxy_id = profile.proxyId
         else:
             if config_in.config and config_in.config.failOnContentCheck:
                 raise HTTPException(
                     status_code=400, detail="fail_on_content_check_requires_profile"
                 )
 
-        # ensure proxyId is valid and available for org
-        if config_in.proxyId:
-            if not self.can_org_use_proxy(org, config_in.proxyId):
+        # ensure proxy_id is valid and available for org
+        if proxy_id:
+            if not self.can_org_use_proxy(org, proxy_id):
                 raise HTTPException(status_code=404, detail="proxy_not_found")
 
         if config_in.config.exclude:
@@ -336,7 +339,7 @@ async def add_crawl_config(
             profileid=profileid,
             crawlerChannel=config_in.crawlerChannel,
             crawlFilenameTemplate=config_in.crawlFilenameTemplate,
-            proxyId=config_in.proxyId,
+            proxyId=proxy_id,
             firstSeed=first_seed,
             seedCount=seed_count,
             shareable=config_in.shareable,
@@ -620,6 +623,8 @@ async def update_crawl_config(
         last_rev = ConfigRevision(**orig_dict)
         last_rev = await self.config_revs.insert_one(last_rev.to_dict())
 
+        proxy_id = update.proxyId
+
         # set update query
         query = update.dict(exclude_unset=True)
         query["modifiedBy"] = user.id
@@ -631,8 +636,15 @@ async def update_crawl_config(
             query["profileid"] = None
         # else, ensure its a valid profile
         elif update.profileid:
-            await self.profiles.get_profile(cast(UUID, update.profileid), org)
+            profile = await self.profiles.get_profile(cast(UUID, update.profileid), org)
             query["profileid"] = update.profileid
+            proxy_id = profile.proxyId
+        # don't change the proxy if profile is set, as it should match the profile proxy
+        elif orig_crawl_config.profileid:
+            proxy_id = None
+
+        if proxy_id is not None:
+            query["proxyId"] = proxy_id
 
         if update.config is not None:
             query["config"] = update.config.dict()
Expand Down Expand Up @@ -1200,20 +1212,21 @@ async def run_now_internal(
if await self.get_running_crawl(crawlconfig.id):
raise HTTPException(status_code=400, detail="crawl_already_running")

if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
raise HTTPException(status_code=404, detail="proxy_not_found")

await self.check_if_too_many_waiting_crawls(org)

profile_filename, profile_proxy_id = (
await self.profiles.get_profile_filename_and_proxy(
crawlconfig.profileid, org
if crawlconfig.profileid:
profile_filename, crawlconfig.proxyId, _ = (
await self.profiles.get_profile_filename_proxy_channel(
crawlconfig.profileid, org
)
)
)
if crawlconfig.profileid and not profile_filename:
raise HTTPException(status_code=400, detail="invalid_profile_id")
if not profile_filename:
raise HTTPException(status_code=400, detail="invalid_profile_id")
else:
profile_filename = ""

save_profile_id = self.get_save_profile_id(profile_proxy_id, crawlconfig)
if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
raise HTTPException(status_code=404, detail="proxy_not_found")

storage_filename = (
crawlconfig.crawlFilenameTemplate or self.default_filename_template
Expand Down Expand Up @@ -1244,7 +1257,7 @@ async def run_now_internal(
warc_prefix=self.get_warc_prefix(org, crawlconfig),
storage_filename=storage_filename,
profile_filename=profile_filename or "",
profileid=save_profile_id,
profileid=str(crawlconfig.profileid) if crawlconfig.profileid else "",
is_single_page=self.is_single_page(crawlconfig.config),
seed_file_url=seed_file_url,
)
@@ -1256,25 +1269,6 @@ async def run_now_internal(
             print(traceback.format_exc())
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
 
-    def get_save_profile_id(
-        self, profile_proxy_id: str, crawlconfig: CrawlConfig
-    ) -> str:
-        """return profile id if profile should be auto-saved, or empty str if not"""
-        # if no profile, nothing to save
-        if not crawlconfig.profileid:
-            return ""
-
-        # if no proxies, allow saving
-        if not crawlconfig.proxyId and not profile_proxy_id:
-            return str(crawlconfig.profileid)
-
-        # if proxy ids match, allow saving
-        if crawlconfig.proxyId == profile_proxy_id:
-            return str(crawlconfig.profileid)
-
-        # otherwise, don't save
-        return ""
-
     async def check_if_too_many_waiting_crawls(self, org: Organization):
         """if max concurrent crawls are set, limit number of queued crawls to X concurrent limit
         return 429 if at limit"""
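Taken together, the crawlconfigs.py changes make the attached profile's proxy authoritative: `proxy_id` starts from the request but is overwritten by the profile's `proxyId` whenever a profile is set, which is why the `get_save_profile_id` reconciliation helper could be deleted — the two values can no longer disagree. A minimal sketch of the resulting rule (illustrative only, not code from this PR; `Profile` here is a hypothetical stand-in for the real model):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Profile:
    """Hypothetical stand-in for btrixcloud's Profile model."""

    proxyId: Optional[str] = None


def resolve_proxy_id(
    requested_proxy_id: Optional[str], profile: Optional[Profile]
) -> Optional[str]:
    # When a profile is attached, its proxy wins unconditionally (even if
    # None), so a config's proxy can never diverge from its profile's.
    if profile is not None:
        return profile.proxyId
    return requested_proxy_id


assert resolve_proxy_id("org-proxy", Profile(proxyId="p-proxy")) == "p-proxy"
assert resolve_proxy_id("org-proxy", None) == "org-proxy"
```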
23 changes: 11 additions & 12 deletions backend/btrixcloud/operator/cronjobs.py
@@ -125,18 +125,17 @@ async def make_new_crawljob(
         )
         print("Scheduled Crawl Created: " + crawl_id)
 
-        profile_filename, profile_proxy_id = (
-            await self.crawl_config_ops.profiles.get_profile_filename_and_proxy(
-                crawlconfig.profileid, org
+        if crawlconfig.profileid:
+            profile_filename, crawlconfig.proxyId, _ = (
+                await self.crawl_config_ops.profiles.get_profile_filename_proxy_channel(
+                    crawlconfig.profileid, org
+                )
             )
-        )
-        if crawlconfig.profileid and not profile_filename:
-            print(f"error: missing profile {crawlconfig.profileid}")
-            return self.get_finished_response(metadata)
-
-        save_profile_id = self.crawl_config_ops.get_save_profile_id(
-            profile_proxy_id, crawlconfig
-        )
+            if not profile_filename:
+                print(f"error: missing profile {crawlconfig.profileid}")
+                return self.get_finished_response(metadata)
+        else:
+            profile_filename = ""
 
         crawl_id, crawljob = self.k8s.new_crawl_job_yaml(
             cid=str(cid),
@@ -153,7 +152,7 @@ async def make_new_crawljob(
             warc_prefix=warc_prefix,
             storage_filename=self.crawl_config_ops.default_filename_template,
             profile_filename=profile_filename or "",
-            profileid=save_profile_id,
+            profileid=str(crawlconfig.profileid) if crawlconfig.profileid else "",
             proxy_id=crawlconfig.proxyId or "",
             is_single_page=self.crawl_config_ops.is_single_page(crawlconfig.config),
         )
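A subtlety shared by this call site and `run_now_internal` above: the three-element unpack assigns the profile's proxy directly onto the config object (`profile_filename, crawlconfig.proxyId, _ = ...`), so the later `proxy_id=crawlconfig.proxyId or ""` argument automatically carries the profile's proxy. A toy demonstration of that unpack-into-attribute idiom, using hypothetical names:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Config:
    """Hypothetical stand-in for CrawlConfig."""

    proxyId: Optional[str] = None


cfg = Config(proxyId="org-proxy")
# Python permits unpacking a tuple straight into an attribute; after this
# line cfg carries the profile's proxy, replacing the original value.
filename, cfg.proxyId, _ = ("profiles/p1.tar.gz", "profile-proxy", "default")
assert cfg.proxyId == "profile-proxy"
```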
25 changes: 14 additions & 11 deletions backend/btrixcloud/profiles.py
@@ -112,24 +112,27 @@ async def create_new_browser(
         prev_profile_path = ""
         prev_profile_id = ""
         prev_proxy_id = ""
+        prev_channel = ""
         if profile_launch.profileId:
-            prev_profile_path, prev_proxy_id = (
-                await self.get_profile_filename_and_proxy(profile_launch.profileId, org)
+            prev_profile_path, prev_proxy_id, prev_channel = (
+                await self.get_profile_filename_proxy_channel(
+                    profile_launch.profileId, org
+                )
             )
 
             if not prev_profile_path:
                 raise HTTPException(status_code=400, detail="invalid_base_profile")
 
             prev_profile_id = str(profile_launch.profileId)
 
-        crawler_image = self.crawlconfigs.get_channel_crawler_image(
-            profile_launch.crawlerChannel
-        )
+        crawler_channel = profile_launch.crawlerChannel or prev_channel
+
+        crawler_image = self.crawlconfigs.get_channel_crawler_image(crawler_channel)
         if not crawler_image:
             raise HTTPException(status_code=404, detail="crawler_not_found")
 
         image_pull_policy = self.crawlconfigs.get_channel_crawler_image_pull_policy(
-            profile_launch.crawlerChannel
+            crawler_channel
         )
 
         # use either specified proxyId or if none, use proxyId from existing profile
@@ -512,23 +515,23 @@ async def get_profile(self, profileid: UUID, org: Organization) -> Profile:
         profile.inUse = await self.crawlconfigs.is_profile_in_use(profileid, org)
         return profile
 
-    async def get_profile_filename_and_proxy(
+    async def get_profile_filename_proxy_channel(
         self, profileid: Optional[UUID], org: Organization
-    ) -> tuple[str, str]:
+    ) -> tuple[str, str, str]:
         """return profile path filename (relative path) for given profile id and org"""
         if not profileid:
-            return "", ""
+            return "", "", ""
 
         try:
             profile = await self.get_profile(profileid, org)
             storage_path = profile.resource.filename if profile.resource else ""
             storage_path = storage_path.lstrip(f"{org.id}/")
-            return storage_path, profile.proxyId or ""
+            return storage_path, profile.proxyId or "", profile.crawlerChannel or ""
         # pylint: disable=bare-except
         except:
             pass
 
-        return "", ""
+        return "", "", ""
 
     async def get_profile_name(self, profileid: UUID, org: Organization) -> str:
         """return profile for given profile id and org"""
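The renamed helper now returns a `(filename, proxyId, crawlerChannel)` triple instead of a pair, which lets `create_new_browser` fall back to the base profile's channel when the launch request does not name one. A hedged sketch of that call pattern, mirroring the call sites above; `profile_ops`, `launch`, and `org` are stand-ins for the real objects:

```python
async def channel_for_launch(profile_ops, launch, org) -> str:
    """Sketch only: resolve the crawler channel for a new browser launch."""
    filename, _proxy_id, channel = (
        await profile_ops.get_profile_filename_proxy_channel(launch.profileId, org)
    )
    if not filename:
        raise ValueError("invalid_base_profile")  # real code raises HTTP 400
    # Prefer the channel named in the request; otherwise inherit the base
    # profile's channel, matching the `or prev_channel` fallback above.
    return launch.crawlerChannel or channel
```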