Skip to content

Commit

Permalink
new seed on redirect + error page check:
Browse files Browse the repository at this point in the history
- if a seed page redirects (page response != seed url), then add the final url as a new seed with same scope
- add newScopedSeed() to ScopedSeed to duplicate seed with different URL, store original includes / excludes
- also add check for 'chrome-error://' URLs for the page, and ensure page is marked as failed if page.url() starts with chrome-error://
- fixes #475
  • Loading branch information
ikreymer committed Feb 24, 2024
1 parent cdd047d commit 6dac1e5
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 15 deletions.
37 changes: 30 additions & 7 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1521,7 +1521,7 @@ self.__bx_behaviors.selectMainBehavior();
data: PageState,
selectorOptsList = DEFAULT_SELECTORS,
) {
const { url, seedId, depth } = data;
const { url, depth } = data;

const logDetails = data.logDetails;

Expand Down Expand Up @@ -1556,22 +1556,43 @@ self.__bx_behaviors.selectMainBehavior();
throw new Error("page response missing");
}

const respUrl = resp.url();
const isChromeError = page.url().startsWith("chrome-error://");

if (respUrl !== url && depth === 0 && !isChromeError) {
const seed = this.params.scopedSeeds[data.seedId];
this.params.scopedSeeds.push(seed.newScopedSeed(respUrl));
data.seedId = this.params.scopedSeeds.length - 1;
logger.info("Seed page redirected, adding redirected seed", {
origUrl: url,
newUrl: respUrl,
seedId: data.seedId,
});
}

// Handle 4xx or 5xx response as a page load error
const statusCode = resp.status();
const statusString = statusCode.toString();
if (
statusCode.toString().startsWith("4") ||
statusCode.toString().startsWith("5")
statusString.startsWith("4") ||
statusString.startsWith("5") ||
isChromeError
) {
if (failCrawlOnError) {
logger.fatal("Seed Page Load Error, failing crawl", {
statusCode,
...logDetails,
});
} else {
logger.error("Non-200 Status Code, skipping page", {
statusCode,
...logDetails,
});
logger.error(
isChromeError
? "Page Crashed on Load"
: "Non-200 Status Code, skipping page",
{
statusCode,
...logDetails,
},
);
throw new Error("logged");
}
}
Expand Down Expand Up @@ -1649,6 +1670,8 @@ self.__bx_behaviors.selectMainBehavior();
return;
}

const { seedId } = data;

const seed = this.params.scopedSeeds[seedId];

await this.checkCF(page, logDetails);
Expand Down
30 changes: 22 additions & 8 deletions src/util/seeds.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,22 @@ export class ScopedSeed {
url: string;
scopeType: ScopeType;
include: RegExp[];
exclude: RegExp[] = [];
exclude: RegExp[];
allowHash = false;
depth = -1;
sitemap?: string | null;
extraHops = 0;

maxExtraHops = 0;
maxDepth = 0;

_includeStr: string[];
_excludeStr: string[];

constructor({
url,
scopeType,
include,
exclude = [],
exclude,
allowHash = false,
depth = -1,
sitemap = false,
Expand All @@ -36,7 +38,7 @@ export class ScopedSeed {
url: string;
scopeType: ScopeType;
include: string[];
exclude?: string[];
exclude: string[];
allowHash?: boolean;
depth?: number;
sitemap?: string | boolean | null;
Expand All @@ -51,6 +53,9 @@ export class ScopedSeed {
this.exclude = this.parseRx(exclude);
this.scopeType = scopeType;

this._includeStr = include;
this._excludeStr = exclude;

if (!this.scopeType) {
this.scopeType = this.include.length ? "custom" : "prefix";
}
Expand All @@ -76,10 +81,7 @@ export class ScopedSeed {
this.maxDepth = depth < 0 ? MAX_DEPTH : depth;
}

//parseRx(value? : union[string[], string, RegExp[]]) -> RegExp[] {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
parseRx(value: any) {
parseRx(value: string[] | RegExp[] | string | null | undefined) {
if (value === null || value === undefined || value === "") {
return [];
} else if (!(value instanceof Array)) {
Expand All @@ -89,6 +91,18 @@ export class ScopedSeed {
}
}

/**
 * Builds a new seed for a different URL that reuses this seed's scope settings.
 *
 * The copy is constructed from the raw include / exclude values saved in
 * `_includeStr` / `_excludeStr`, together with this seed's `scopeType`,
 * `allowHash`, `maxDepth` (passed as `depth`) and `maxExtraHops` (passed as
 * `extraHops`). No `sitemap` value is forwarded, so the constructor's default
 * for that option applies to the new seed.
 *
 * NOTE(review): presumably exclusions added later via `addExclusion()` are not
 * reflected in `_excludeStr` and so would not carry over — confirm.
 *
 * @param url - URL for the new seed.
 * @returns A new `ScopedSeed` for `url`.
 */
newScopedSeed(url: string) {
  const { scopeType, allowHash, maxDepth, maxExtraHops } = this;
  const include = this._includeStr;
  const exclude = this._excludeStr;

  return new ScopedSeed({
    url,
    scopeType,
    include,
    exclude,
    allowHash,
    depth: maxDepth,
    extraHops: maxExtraHops,
  });
}

addExclusion(value: string | RegExp) {
if (!value) {
return;
Expand Down

0 comments on commit 6dac1e5

Please sign in to comment.