-
Notifications
You must be signed in to change notification settings - Fork 0
Add opt-in browser-scraper Worker for JS-rendered monitor pages #14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| /* eslint-disable import/no-default-export */ | ||
| import { baseConfig } from "@repo/eslint/base"; | ||
| import { defineConfig } from "eslint/config"; | ||
|
|
||
| // Use the shared base ESLint configuration for this package, with no local overrides. | ||
| export default defineConfig(baseConfig); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| { | ||
| "name": "@repo/browser-scraper", | ||
| "private": true, | ||
| "type": "module", | ||
| "scripts": { | ||
| "dev": "wrangler dev", | ||
| "deploy": "wrangler deploy", | ||
| "typecheck": "tsc", | ||
| "lint": "eslint .", | ||
| "test": "vitest run --passWithNoTests" | ||
| }, | ||
| "dependencies": { | ||
| "@cloudflare/playwright": "1.2.0", | ||
| "@repo/logger": "workspace:*", | ||
| "@repo/url-validator": "workspace:*", | ||
| "zod": "4.3.6" | ||
| }, | ||
| "devDependencies": { | ||
| "@cloudflare/workers-types": "4.20260412.1", | ||
| "@repo/eslint": "workspace:*", | ||
| "@repo/prettier": "workspace:*", | ||
| "@repo/typescript": "workspace:*", | ||
| "@types/node": "25.6.0", | ||
| "eslint": "9.39.1", | ||
| "prettier": "3.8.2", | ||
| "typescript": "6.0.2", | ||
| "vitest": "4.1.4", | ||
| "wrangler": "4.81.1" | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| import type { BrowserWorker } from "@cloudflare/playwright"; | ||
| import { Logger } from "@repo/logger"; | ||
| import { validateSourceUrl } from "@repo/url-validator"; | ||
| import { z } from "zod"; | ||
|
|
||
| import { PlaywrightService } from "./services/playwright"; | ||
|
|
||
| // Environment object received by the fetch handler; `BROWSER` is handed to `PlaywrightService`. | ||
| type Env = { BROWSER: BrowserWorker }; | ||
|
|
||
| // Expected JSON request body: an object with a string `url`. The handler checks the URL further with `validateSourceUrl`. | ||
| const requestSchema = z.object({ url: z.string() }); | ||
|
|
||
| // Handles one scrape request. | ||
| // | ||
| // Accepts only POST with a JSON body matching `requestSchema`, validates the URL with | ||
| // `validateSourceUrl`, renders it with `PlaywrightService`, and returns the render result as JSON. | ||
| // | ||
| // request: incoming HTTP request. | ||
| // env: Worker environment; `env.BROWSER` is passed to the renderer. | ||
| // Returns: 405 (with an `Allow` header) for other methods, 400 for an unreadable or invalid body | ||
| // or a rejected URL, 500 when rendering throws, otherwise the render result serialized as-is. | ||
| const handle = async (request: Request, env: Env): Promise<Response> => { | ||
|   const logger = new Logger({ context: "browser-scraper" }); | ||
|
|
||
|   if (request.method !== "POST") { | ||
|     // A 405 response must advertise the methods the resource supports (RFC 9110, section 15.5.6). | ||
|     return Response.json( | ||
|       { ok: false, error: "Method not allowed" }, | ||
|       { status: 405, headers: { Allow: "POST" } } | ||
|     ); | ||
|   } | ||
|
|
||
|   let parsed: z.infer<typeof requestSchema>; | ||
|   try { | ||
|     // Both JSON syntax errors and schema violations end up in the catch below. | ||
|     const body: unknown = await request.json(); | ||
|     parsed = requestSchema.parse(body); | ||
|   } catch (error) { | ||
|     return Response.json( | ||
|       { | ||
|         ok: false, | ||
|         error: error instanceof Error ? error.message : "Invalid body", | ||
|       }, | ||
|       { status: 400 } | ||
|     ); | ||
|   } | ||
|
|
||
|   // Reject the URL before any browser is launched. | ||
|   const check = validateSourceUrl(parsed.url); | ||
|   if (!check.valid) { | ||
|     return Response.json( | ||
|       { ok: false, error: `Invalid URL: ${check.reason}` }, | ||
|       { status: 400 } | ||
|     ); | ||
|   } | ||
|
|
||
|   try { | ||
|     const result = await new PlaywrightService(env.BROWSER, logger).render( | ||
|       parsed.url | ||
|     ); | ||
|     return Response.json(result); | ||
|   } catch (error) { | ||
|     // Only exceptions reach this point; failures the service reports itself are returned above. | ||
|     logger.error("unexpected render failure", { | ||
|       url: parsed.url, | ||
|       errorMessage: error instanceof Error ? error.message : "Unknown error", | ||
|     }); | ||
|     return Response.json( | ||
|       { ok: false, error: "Internal render failure" }, | ||
|       { status: 500 } | ||
|     ); | ||
|   } | ||
| }; | ||
|
|
||
| // Default export exposing `handle` as the `fetch` handler. | ||
| // eslint-disable-next-line import/no-default-export | ||
| export default { | ||
| fetch: handle, | ||
| }; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,117 @@ | ||
| import { Logger } from "@repo/logger"; | ||
| import { beforeEach, describe, expect, it, vi } from "vitest"; | ||
|
|
||
| // Replace the Playwright package with a stub whose `launch` is a bare mock the tests can program. | ||
| vi.mock("@cloudflare/playwright", () => ({ | ||
| launch: vi.fn(), | ||
| })); | ||
|
|
||
| // Dynamic imports placed after the mock registration; presumably this guarantees `./playwright` binds to the mocked `launch` — confirm against Vitest's hoisting rules. | ||
| const { launch } = await import("@cloudflare/playwright"); | ||
| const { PlaywrightService } = await import("./playwright"); | ||
|
|
||
| // Shape of the stubbed `page.goto`: resolves to a minimal response (status and headers accessors) or to `null`. | ||
| type GotoFn = () => Promise<{ | ||
| status: () => number; | ||
| headers: () => Record<string, string>; | ||
| } | null>; | ||
|
|
||
| // Builds a fake browser whose single page reports the given status, content type, final URL and HTML. | ||
| // Pass `goto` to replace the navigation stub entirely. Also returns the `close` mock so tests can assert teardown. | ||
| const makeMockBrowser = ( | ||
|   overrides: { | ||
|     status?: number; | ||
|     contentType?: string; | ||
|     finalUrl?: string; | ||
|     html?: string; | ||
|     goto?: GotoFn; | ||
|   } = {} | ||
| ): { browser: unknown; closeMock: ReturnType<typeof vi.fn> } => { | ||
|   const { | ||
|     status = 200, | ||
|     contentType = "text/html", | ||
|     finalUrl = "https://example.com/", | ||
|     html = "<html></html>", | ||
|   } = overrides; | ||
|
|
||
|   // Default navigation: a response exposing the configured status and content type. | ||
|   const defaultGoto: GotoFn = async () => ({ | ||
|     status: () => status, | ||
|     headers: () => ({ "content-type": contentType }), | ||
|   }); | ||
|   const navigate: GotoFn = overrides.goto ?? defaultGoto; | ||
|
|
||
|   const closeMock = vi.fn().mockResolvedValue(undefined); | ||
|   const fakePage = { | ||
|     route: vi.fn().mockResolvedValue(undefined), | ||
|     goto: vi.fn(navigate), | ||
|     url: vi.fn().mockReturnValue(finalUrl), | ||
|     content: vi.fn().mockResolvedValue(html), | ||
|   }; | ||
|
|
||
|   return { | ||
|     browser: { | ||
|       newPage: vi.fn().mockResolvedValue(fakePage), | ||
|       close: closeMock, | ||
|     }, | ||
|     closeMock, | ||
|   }; | ||
| }; | ||
|
|
||
| // Unit tests for `PlaywrightService.render` against a mocked `launch`; every case also checks that the browser is closed. | ||
| describe("PlaywrightService.render", () => { | ||
|   const logger = new Logger({ context: "test" }); | ||
|
|
||
|   beforeEach(() => { | ||
|     // Drop queued `mockResolvedValueOnce` values so cases cannot leak into each other. | ||
|     vi.mocked(launch).mockReset(); | ||
|   }); | ||
|
|
||
|   it("returns rendered html on a successful navigation", async () => { | ||
|     const { browser, closeMock } = makeMockBrowser({ | ||
|       html: "<html><body>hello</body></html>", | ||
|     }); | ||
|     vi.mocked(launch).mockResolvedValueOnce(browser as never); | ||
|
|
||
|     const result = await new PlaywrightService({} as never, logger).render( | ||
|       "https://example.com/" | ||
|     ); | ||
|
|
||
|     expect(result).toEqual({ | ||
|       ok: true, | ||
|       html: "<html><body>hello</body></html>", | ||
|       finalUrl: "https://example.com/", | ||
|       status: 200, | ||
|       contentType: "text/html", | ||
|       truncated: false, | ||
|     }); | ||
|     expect(closeMock).toHaveBeenCalledOnce(); | ||
|   }); | ||
|
|
||
|   it("rejects when the post-redirect URL is unsafe", async () => { | ||
|     const { browser, closeMock } = makeMockBrowser({ | ||
|       finalUrl: "https://127.0.0.1/secrets", | ||
|     }); | ||
|     vi.mocked(launch).mockResolvedValueOnce(browser as never); | ||
|
|
||
|     const result = await new PlaywrightService({} as never, logger).render( | ||
|       "https://example.com/" | ||
|     ); | ||
|
|
||
|     expect(result.ok).toBe(false); | ||
|     if (!result.ok) { | ||
|       expect(result.error).toMatch(/Unsafe final URL/); | ||
|     } | ||
|     expect(closeMock).toHaveBeenCalledOnce(); | ||
|   }); | ||
|
|
||
|   it("returns an error when goto throws", async () => { | ||
|     const { browser, closeMock } = makeMockBrowser({ | ||
|       goto: () => Promise.reject(new Error("Navigation timeout")), | ||
|     }); | ||
|     vi.mocked(launch).mockResolvedValueOnce(browser as never); | ||
|
|
||
|     const result = await new PlaywrightService({} as never, logger).render( | ||
|       "https://example.com/" | ||
|     ); | ||
|
|
||
|     expect(result).toEqual({ ok: false, error: "Navigation timeout" }); | ||
|     expect(closeMock).toHaveBeenCalledOnce(); | ||
|   }); | ||
|
|
||
|   it("returns an error for non-2xx page status", async () => { | ||
|     const { browser, closeMock } = makeMockBrowser({ status: 404 }); | ||
|     vi.mocked(launch).mockResolvedValueOnce(browser as never); | ||
|
|
||
|     const result = await new PlaywrightService({} as never, logger).render( | ||
|       "https://example.com/missing" | ||
|     ); | ||
|
|
||
|     expect(result).toEqual({ ok: false, error: "HTTP 404", status: 404 }); | ||
|     // The early return for a bad status must still release the browser, like every other path. | ||
|     expect(closeMock).toHaveBeenCalledOnce(); | ||
|   }); | ||
| }); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,102 @@ | ||
| import { launch } from "@cloudflare/playwright"; | ||
| import type { BrowserWorker } from "@cloudflare/playwright"; | ||
| import type { Logger } from "@repo/logger"; | ||
| import { validateSourceUrl } from "@repo/url-validator"; | ||
|
|
||
| // Timeout, in milliseconds, passed to `page.goto` for the initial navigation. | ||
| const NAV_TIMEOUT_MS = 15_000; | ||
| // Rendered HTML whose string length exceeds this is cut to this length and flagged as truncated (a `String.length` count, not bytes). | ||
| const MAX_HTML_CHARS = 2 * 1024 * 1024; | ||
|
|
||
| // Successful render: the (possibly truncated) HTML plus the final URL, status and content type of the navigation response. | ||
| type RenderSuccess = { | ||
| ok: true; | ||
| html: string; | ||
| finalUrl: string; | ||
| status: number; | ||
| contentType: string; | ||
| truncated: boolean; | ||
| }; | ||
| // Failed render: a human-readable reason, plus the HTTP status when the failure was a non-2xx response. | ||
| type RenderError = { ok: false; error: string; status?: number }; | ||
| // Result of `PlaywrightService.render`; check `ok` to tell the two shapes apart. | ||
| type RenderResult = RenderSuccess | RenderError; | ||
|
|
||
| // Renders a URL in a browser launched from the `@cloudflare/playwright` binding and reports the outcome as a `RenderResult`. | ||
| class PlaywrightService { | ||
|   constructor( | ||
|     private readonly browser: BrowserWorker, | ||
|     private readonly logger: Logger | ||
|   ) {} | ||
|
|
||
|   // Loads `url` and returns the page HTML. | ||
|   // | ||
|   // Every request the page makes is checked with `validateSourceUrl` and aborted when invalid. | ||
|   // The URL reached after navigation is validated again, non-2xx statuses are reported as errors, | ||
|   // and HTML longer than `MAX_HTML_CHARS` is truncated. | ||
|   // | ||
|   // Returns a `RenderError` for navigation failures, a missing response, an unsafe final URL or a | ||
|   // non-2xx status. Errors from launching the browser, opening the page, installing the route or | ||
|   // reading the content are not caught here and propagate to the caller. | ||
|   async render(url: string): Promise<RenderResult> { | ||
|     const start = Date.now(); | ||
|     const browser = await launch(this.browser); | ||
|     try { | ||
|       const page = await browser.newPage(); | ||
|
|
||
|       // Gate every request, including the main navigation and its redirects. | ||
|       await page.route("**/*", (route) => { | ||
|         const reqUrl = route.request().url(); | ||
|         const check = validateSourceUrl(reqUrl); | ||
|         if (!check.valid) { | ||
|           this.logger.warn("blocked unsafe subresource", { | ||
|             url: reqUrl, | ||
|             reason: check.reason, | ||
|           }); | ||
|           return route.abort(); | ||
|         } | ||
|         return route.continue(); | ||
|       }); | ||
|
|
||
|       let response; | ||
|       try { | ||
|         response = await page.goto(url, { | ||
|           waitUntil: "domcontentloaded", | ||
|           timeout: NAV_TIMEOUT_MS, | ||
|         }); | ||
|       } catch (error) { | ||
|         // Navigation problems (timeouts, aborted requests) are an expected outcome, not a crash. | ||
|         return { | ||
|           ok: false, | ||
|           error: error instanceof Error ? error.message : "Navigation failed", | ||
|         }; | ||
|       } | ||
|
|
||
|       if (!response) { | ||
|         return { ok: false, error: "No navigation response" }; | ||
|       } | ||
|
|
||
|       // Re-check where the navigation actually ended up. | ||
|       const finalUrl = page.url(); | ||
|       const finalCheck = validateSourceUrl(finalUrl); | ||
|       if (!finalCheck.valid) { | ||
|         return { | ||
|           ok: false, | ||
|           error: `Unsafe final URL after redirect: ${finalCheck.reason}`, | ||
|         }; | ||
|       } | ||
|
|
||
|       const status = response.status(); | ||
|       if (status < 200 || status >= 300) { | ||
|         return { ok: false, error: `HTTP ${String(status)}`, status }; | ||
|       } | ||
|
|
||
|       const headers = response.headers(); | ||
|       const contentType = headers["content-type"] ?? ""; | ||
|
|
||
|       const html = await page.content(); | ||
|       const truncated = html.length > MAX_HTML_CHARS; | ||
|       const finalHtml = truncated ? html.slice(0, MAX_HTML_CHARS) : html; | ||
|
Comment on lines
+79
to
+81
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Intentional. |
||
|
|
||
|       return { | ||
|         ok: true, | ||
|         html: finalHtml, | ||
|         finalUrl, | ||
|         status, | ||
|         contentType, | ||
|         truncated, | ||
|       }; | ||
|     } finally { | ||
|       try { | ||
|         await browser.close(); | ||
|       } catch (error) { | ||
|         // A failed teardown must not replace the result or error produced above, nor skip the finish log. | ||
|         this.logger.warn("browser close failed", { | ||
|           url, | ||
|           errorMessage: error instanceof Error ? error.message : "Unknown error", | ||
|         }); | ||
|       } | ||
|       this.logger.info("browser render finished", { | ||
|         url, | ||
|         browserDurationMs: Date.now() - start, | ||
|       }); | ||
|     } | ||
|   } | ||
| } | ||
|
|
||
| export { MAX_HTML_CHARS, NAV_TIMEOUT_MS, PlaywrightService }; | ||
| export type { RenderError, RenderResult, RenderSuccess }; | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| { | ||
| "extends": "@repo/typescript/base.json", | ||
| "compilerOptions": { | ||
| "types": ["@cloudflare/workers-types", "vitest/globals"] | ||
| }, | ||
| "include": ["src"], | ||
| "exclude": ["node_modules"] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,17 @@ | ||
| { | ||
| "$schema": "node_modules/wrangler/config-schema.json", | ||
| "name": "switch-operator-browser-scraper", | ||
| "main": "src/index.ts", | ||
| "compatibility_date": "2026-03-29", | ||
| "compatibility_flags": ["nodejs_compat"], | ||
| "workers_dev": false, | ||
| "observability": { | ||
| "logs": { | ||
| "enabled": true, | ||
| "invocation_logs": true, | ||
| }, | ||
| }, | ||
| "browser": { | ||
| "binding": "BROWSER", | ||
| }, | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| ALTER TABLE `schedules` ADD `use_browser` integer DEFAULT false NOT NULL; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This workflow deploys the updated operator immediately but never applies migration
`0005_nervous_scorpion.sql`, even though the commit adds a new `schedules.use_browser` column that is now part of the runtime schema (`apps/operator/src/db/schema.ts`) and is read during scheduled execution. On environments where `db:migrate:remote` has not been run yet, the newly deployed worker can fail with `no such column: use_browser` as soon as it queries schedules. Add a migration step (for example `pnpm --filter @repo/operator db:migrate:remote`) before the operator deploy step. Useful? React with 👍 / 👎.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A dedicated workflow at
`.github/workflows/migrations.yml` already applies D1 migrations on push to `main` whenever `apps/operator/migrations/**` changes (`wrangler d1 migrations apply switch-operator-db --remote`). Duplicating that step inside the main deploy job would either run it twice or couple the two workflows. There is a known race window where the deploy can land before the migration workflow finishes; the current mitigation is to merge migration-only changes ahead of the feature commit when the gap matters. Happy to add an explicit guard if the race becomes a real problem.