Skip to content

Commit

Permalink
Include WARC prefix for screenshots and text WARCs (#473)
Browse files Browse the repository at this point in the history
Ensure the WARC prefix (from the `WARC_PREFIX` env var or the `warcPrefix` CLI option, combined as `<warc prefix>-<crawlId>-`) is also applied to
`screenshots.warc.gz` and `text.warc.gz`.
  • Loading branch information
ikreymer committed Feb 28, 2024
1 parent cdd047d commit dd48251
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 2 deletions.
10 changes: 10 additions & 0 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ export class Crawler {
maxHeapUsed = 0;
maxHeapTotal = 0;

warcPrefix: string;

driver!: (opts: {
page: Page;
data: PageState;
Expand Down Expand Up @@ -260,6 +262,12 @@ export class Crawler {
this.customBehaviors = "";

this.browser = new Browser();

this.warcPrefix = process.env.WARC_PREFIX || this.params.warcPrefix || "";

if (this.warcPrefix) {
this.warcPrefix += "-" + this.crawlId + "-";
}
}

configureUA() {
Expand Down Expand Up @@ -741,6 +749,7 @@ self.__bx_behaviors.selectMainBehavior();
logger.debug("Skipping screenshots for non-HTML page", logDetails);
}
const screenshots = new Screenshots({
warcPrefix: this.warcPrefix,
browser: this.browser,
page,
url,
Expand All @@ -761,6 +770,7 @@ self.__bx_behaviors.selectMainBehavior();

if (data.isHTMLPage) {
textextract = new TextExtractViaSnapshot(cdp, {
warcPrefix: this.warcPrefix,
url,
directory: archiveDir,
});
Expand Down
1 change: 0 additions & 1 deletion src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,6 @@ class ArgParser {
describe:
"prefix for WARC files generated, including WARCs added to WACZ",
type: "string",
default: "rec",
},
};
}
Expand Down
4 changes: 3 additions & 1 deletion src/util/warcresourcewriter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,18 @@ export class WARCResourceWriter {
url,
directory,
date,
warcPrefix,
warcName,
}: {
url: string;
directory: string;
date: Date;
warcPrefix: string;
warcName: string;
}) {
this.url = url;
this.directory = directory;
this.warcName = path.join(this.directory, warcName);
this.warcName = path.join(this.directory, warcPrefix + warcName);
this.date = date ? date : new Date();
}

Expand Down

0 comments on commit dd48251

Please sign in to comment.