Skip to content

Commit

Permalink
Add more specific types (#434)
Browse files Browse the repository at this point in the history
- add QueueEntry as the type of the JSON object stored in Redis
- add PageCallbacks as the type of the page callbacks object
- use the Crawler type for the crawler argument of runWorkers() instead of any
  • Loading branch information
ikreymer committed Nov 13, 2023
1 parent 0d51e03 commit 456155e
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 32 deletions.
4 changes: 2 additions & 2 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
QueueState,
PageState,
WorkerId,
PageCallbacks,
} from "./util/state.js";

import Sitemapper from "sitemapper";
Expand Down Expand Up @@ -554,8 +555,7 @@ export class Crawler {
page: Page;
cdp: CDPSession;
workerid: WorkerId;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
callbacks: any;
callbacks: PageCallbacks;
}) {
await this.browser.setupPage({ page, cdp });

Expand Down
48 changes: 21 additions & 27 deletions src/util/state.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,20 @@ export enum QueueState {
// ============================================================================
export type WorkerId = number;

// ============================================================================
/**
 * Shape of one crawl-queue entry: the JSON object stored in Redis for a
 * queued URL. Also accepted by the PageState constructor, recheckScope(),
 * addToQueue() and _getScore().
 */
export type QueueEntry = {
  /** URL this entry refers to. */
  url: string;

  /** Index into the seeds array (read as `seeds[data.seedId]`). */
  seedId: number;

  /** Depth value for this URL; added directly into the queue score. */
  depth: number;

  /**
   * Extra-hops count; multiplied by MAX_DEPTH in the queue score.
   * NOTE(review): readers fall back to 0 when this is missing, so entries
   * without it may exist in Redis. Confirm before relying on its presence.
   */
  extraHops: number;

  /** Timestamp string set by addToQueue() when the entry is queued. */
  added?: string;
};

// ============================================================================
/** Signature of the link callback: receives a single URL string. */
type AddLinkCallback = (url: string) => void;

/**
 * Optional callbacks carried by a page (stored on `PageState.callbacks`,
 * which defaults to an empty object).
 */
export type PageCallbacks = {
  /**
   * Called with a URL string when provided.
   * NOTE(review): presumably invoked for links discovered on the page;
   * confirm against the callers.
   */
  addLink?: AddLinkCallback;
};

// ============================================================================
export class PageState {
url: string;
Expand All @@ -38,9 +52,7 @@ export class PageState {
title?: string;
mime?: string;

// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
callbacks: any;
callbacks: PageCallbacks = {};

isHTMLPage?: boolean;
text?: string;
Expand All @@ -52,16 +64,11 @@ export class PageState {

logDetails = {};

constructor(redisData: {
url: string;
seedId: number;
depth: number;
extraHops: number;
}) {
constructor(redisData: QueueEntry) {
this.url = redisData.url;
this.seedId = redisData.seedId;
this.depth = redisData.depth;
this.extraHops = redisData.extraHops;
this.extraHops = redisData.extraHops || 0;
}
}

Expand Down Expand Up @@ -290,10 +297,7 @@ return 0;
await this.redis.srem(this.skey, url);
}

recheckScope(
data: { url: string; depth: number; extraHops: number; seedId: number },
seeds: ScopedSeed[],
) {
recheckScope(data: QueueEntry, seeds: ScopedSeed[]) {
const seed = seeds[data.seedId];

return seed.isIncluded(data.url, data.depth, data.extraHops);
Expand Down Expand Up @@ -434,21 +438,11 @@ return 0;

//async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
async addToQueue(
{
url,
seedId,
depth = 0,
extraHops = 0,
}: { url: string; seedId: number; depth?: number; extraHops?: number },
{ url, seedId, depth = 0, extraHops = 0 }: QueueEntry,
limit = 0,
) {
const added = this._timestamp();
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const data: any = { added, url, seedId, depth };
if (extraHops) {
data.extraHops = extraHops;
}
const data: QueueEntry = { added, url, seedId, depth, extraHops };

// return codes
// 0 - url queued successfully
Expand Down Expand Up @@ -500,7 +494,7 @@ return 0;
return { done, queued, pending, failed, errors };
}

_getScore(data: { depth: number; extraHops: number }) {
_getScore(data: QueueEntry) {
return (data.depth || 0) + (data.extraHops || 0) * MAX_DEPTH;
}

Expand Down
5 changes: 2 additions & 3 deletions src/util/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { Recorder } from "./recorder.js";
import { rxEscape } from "./seeds.js";
import { CDPSession, Page } from "puppeteer-core";
import { PageState, WorkerId } from "./state.js";
import { Crawler } from "../crawler.js";

const MAX_REUSE = 5;

Expand All @@ -17,9 +18,7 @@ const FINISHED_TIMEOUT = 60;

// ===========================================================================
export function runWorkers(
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
crawler: any,
crawler: Crawler,
numWorkers: number,
maxPageTime: number,
collDir: string,
Expand Down

0 comments on commit 456155e

Please sign in to comment.