Skip to content

Commit

Permalink
Ensure links added via behaviors also get processed (#478)
Browse files Browse the repository at this point in the history
Requires webrecorder/browsertrix-behaviors#69 / browsertrix-behaviors
0.5.3, which will add support for behaviors to add links.

Simplify link handling by adding each link directly, instead of
batching to 500 links. Errors are already logged if queueing a new
URL fails.
  • Loading branch information
ikreymer committed Feb 29, 2024
1 parent c348de2 commit 184f4a2
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 25 deletions.
2 changes: 1 addition & 1 deletion package.json
Expand Up @@ -18,7 +18,7 @@
"dependencies": {
"@novnc/novnc": "^1.4.0",
"@webrecorder/wabac": "^2.16.12",
"browsertrix-behaviors": "^0.5.2",
"browsertrix-behaviors": "^0.5.3",
"crc": "^4.3.2",
"get-folder-size": "^4.0.0",
"husky": "^8.0.3",
Expand Down
21 changes: 2 additions & 19 deletions src/crawler.ts
Expand Up @@ -1722,17 +1722,8 @@ self.__bx_behaviors.selectMainBehavior();
) {
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;

let links: string[] = [];
const promiseList = [];

callbacks.addLink = (url: string) => {
links.push(url);
if (links.length == 500) {
promiseList.push(
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
);
links = [];
}
callbacks.addLink = async (url: string) => {
await this.queueInScopeUrls(seedId, [url], depth, extraHops, logDetails);
};

const loadLinks = (options: {
Expand Down Expand Up @@ -1801,14 +1792,6 @@ self.__bx_behaviors.selectMainBehavior();
} catch (e) {
logger.warn("Link Extraction failed", e, "links");
}

if (links.length) {
promiseList.push(
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
);
}

await Promise.allSettled(promiseList);
}

async queueInScopeUrls(
Expand Down
2 changes: 1 addition & 1 deletion src/util/state.ts
Expand Up @@ -36,7 +36,7 @@ export type QueueEntry = {

// ============================================================================
export type PageCallbacks = {
addLink?: (url: string) => void;
addLink?: (url: string) => Promise<void>;
};

// ============================================================================
Expand Down
8 changes: 4 additions & 4 deletions yarn.lock
Expand Up @@ -1425,10 +1425,10 @@ browserslist@^4.21.3:
node-releases "^2.0.6"
update-browserslist-db "^1.0.9"

browsertrix-behaviors@^0.5.2:
version "0.5.2"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.2.tgz#d2fe1d6ff08815ff0dd68a05fe1a3cdc4bbec8ca"
integrity sha512-8nhpnzY8OM1mxQ+mZ+m10dpGgMuhCnKUV5YUlitDpMyEfKlEybUmTz5sroVQH8e//NcJox7W6QYjaU2Y/ygxww==
browsertrix-behaviors@^0.5.3:
version "0.5.3"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.5.3.tgz#f987075790b0fd970814f57195e8525277ddd2a0"
integrity sha512-NiVdV42xvj4DvX/z0Dxqzqsa+5e57/M7hIyK3fl41BxzOJqCgSMu0MpkrWuKpbRVo+89ZnBmzh2z6D18Vmn1LA==

bser@2.1.1:
version "2.1.1"
Expand Down

0 comments on commit 184f4a2

Please sign in to comment.