Skip to content

Commit

Permalink
Add arg to write pages to Redis (#464)
Browse files Browse the repository at this point in the history
Fixes #462 

Add --writePagesToRedis arg, for use in conjunction with QA features in Browsertrix Cloud, to add
pages to the database for each crawl.
Ensure timestamp (as ISO date) is added to pages when they are serialized (both to pages.jsonl and redis)
Also include timestamp (as ISO date) in `pageinfo:` records

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
  • Loading branch information
tw4l committed Feb 10, 2024
1 parent 298deac commit bdffa79
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 6 deletions.
10 changes: 10 additions & 0 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ type PageEntry = {
seed?: boolean;
text?: string;
favIconUrl?: string;
ts?: string;
};

// ============================================================================
Expand Down Expand Up @@ -1928,13 +1929,22 @@ self.__bx_behaviors.selectMainBehavior();
loadState,
mime,
favicon,
ts,
}: PageState) {
const row: PageEntry = { id: pageid!, url, title, loadState };

if (ts) {
row.ts = ts.toISOString();
}

if (mime) {
row.mime = mime;
}

if (this.params.writePagesToRedis) {
await this.crawlState.writeToPagesQueue(JSON.stringify(row));
}

if (depth === 0) {
row.seed = true;
}
Expand Down
6 changes: 6 additions & 0 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,12 @@ class ArgParser {
default: false,
},

writePagesToRedis: {
describe: "If set, write page objects to redis",
type: "boolean",
default: false,
},

failOnFailedSeed: {
describe:
"If set, crawler will fail with exit code 1 if any seed fails",
Expand Down
12 changes: 10 additions & 2 deletions src/util/recorder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ function logNetwork(msg: string, data: any) {
/**
 * Shape of the per-page info object held by the recorder.
 */
export type PageInfoRecord = {
// Identifier of the page this record belongs to.
pageid: string;
// String-keyed map of numbers — presumably resource URL -> numeric value
// such as a status code; TODO confirm against the code that fills it.
urls: Record<string, number>;
// URL of the page.
url: string;
// Optional timestamp for the page; absent until one has been assigned.
ts?: Date;
};

// =================================================================
Expand Down Expand Up @@ -463,6 +465,10 @@ export class Recorder {
return false;
}

if (url === this.pageUrl) {
this.pageInfo.ts = reqresp.ts;
}

reqresp.fillFetchRequestPaused(params);

if (this.noResponseForStatus(responseStatusCode)) {
Expand Down Expand Up @@ -622,7 +628,7 @@ export class Recorder {
this.pendingRequests = new Map();
this.skipIds = new Set();
this.skipping = false;
this.pageInfo = { pageid, urls: {} };
this.pageInfo = { pageid, urls: {}, url };
}

addPageRecord(reqresp: RequestResponseInfo) {
Expand Down Expand Up @@ -685,6 +691,8 @@ export class Recorder {
}

await this.writePageInfoRecord();

return this.pageInfo.ts;
}

async onClosePage() {
Expand Down Expand Up @@ -1384,7 +1392,7 @@ function createResponse(
const url = reqresp.url;
const warcVersion = "WARC/1.1";
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;
const date = new Date().toISOString();
const date = new Date(reqresp.ts).toISOString();

const httpHeaders = reqresp.getResponseHeadersDict(
reqresp.payload ? reqresp.payload.length : 0,
Expand Down
4 changes: 1 addition & 3 deletions src/util/reqresp.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,10 @@ const MAX_ARG_LEN = 512;

// ===========================================================================
export class RequestResponseInfo {
_created: Date = new Date();
ts: Date = new Date();

requestId: string;

ts?: string;

method?: string;
url!: string;
protocol?: string = "HTTP/1.1";
Expand Down
8 changes: 8 additions & 0 deletions src/util/state.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ export class PageState {
pageid?: string;
title?: string;
mime?: string;
ts?: Date;

callbacks: PageCallbacks = {};

Expand Down Expand Up @@ -135,6 +136,7 @@ export class RedisCrawlState {
dkey: string;
fkey: string;
ekey: string;
pageskey: string;

constructor(redis: Redis, key: string, maxPageTime: number, uid: string) {
this.redis = redis;
Expand All @@ -152,6 +154,8 @@ export class RedisCrawlState {
this.fkey = this.key + ":f";
// crawler errors
this.ekey = this.key + ":e";
// pages
this.pageskey = this.key + ":pages";

this._initLuaCommands(this.redis);
}
Expand Down Expand Up @@ -682,4 +686,8 @@ return 0;
/**
 * Push an error message onto the Redis list stored under `this.ekey`.
 *
 * @param error - The error text to record.
 * @returns Whatever the underlying `lpush` call resolves to.
 */
async logError(error: string) {
  const { redis, ekey } = this;
  return await redis.lpush(ekey, error);
}

/**
 * Push a string value onto the Redis list stored under `this.pageskey`.
 *
 * @param value - The string to push onto the list.
 * @returns Whatever the underlying `lpush` call resolves to.
 */
async writeToPagesQueue(value: string) {
  const { redis, pageskey } = this;
  return await redis.lpush(pageskey, value);
}
}
2 changes: 1 addition & 1 deletion src/util/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ export class PageWorker {
/**
 * Crawl a single page via the crawler and, when a recorder is attached,
 * finish the recorder's page and store the value it returns on `opts.data.ts`.
 *
 * @param opts - Worker state handed through to the crawler; `opts.data.ts`
 *   is overwritten when a recorder is present.
 * @returns The result of `this.crawler.crawlPage(opts)`.
 */
async crawlPage(opts: WorkerState) {
  const crawlResult = await this.crawler.crawlPage(opts);

  // Without a recorder there is nothing to finish and no timestamp to set.
  if (!this.recorder) {
    return crawlResult;
  }

  const pageTs = await this.recorder.finishPage();
  opts.data.ts = pageTs;
  return crawlResult;
}
Expand Down

0 comments on commit bdffa79

Please sign in to comment.