Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .claude/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
"Bash(pnpm typecheck:*)",
"Bash(pnpm test:*)",
"Bash(pnpm format:*)",
"Bash(pnpm --filter:*)"
"Bash(pnpm --filter:*)",
"Bash(gh api *)"
]
}
}
8 changes: 7 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,13 @@ jobs:
- name: Setup
uses: ./.github/actions/setup

- name: Deploy to Cloudflare Workers
- name: Deploy browser-scraper to Cloudflare Workers
run: pnpm --filter @repo/browser-scraper run deploy
env:
CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}

- name: Deploy operator to Cloudflare Workers
run: pnpm --filter @repo/operator run deploy
Comment on lines +110 to 111
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Run DB migration before deploying the operator

This workflow deploys the updated operator immediately but never applies migration 0005_nervous_scorpion.sql, even though the commit adds a new schedules.use_browser column that is now part of the runtime schema (apps/operator/src/db/schema.ts) and read during scheduled execution. On environments where db:migrate:remote has not been run yet, the newly deployed worker can fail with no such column: use_browser as soon as it queries schedules. Add a migration step (for example pnpm --filter @repo/operator db:migrate:remote) before the operator deploy step.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A dedicated workflow at .github/workflows/migrations.yml already applies D1 migrations on push to main whenever apps/operator/migrations/** changes (wrangler d1 migrations apply switch-operator-db --remote). Duplicating that step inside the main deploy job would either run it twice or couple the two workflows. There is a known race window where the deploy can land before the migration workflow finishes; the current mitigation is to merge migration-only changes ahead of the feature commit when the gap matters. Happy to add an explicit guard if the race becomes a real problem.

env:
CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
Expand Down
5 changes: 5 additions & 0 deletions apps/browser-scraper/eslint.config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/* eslint-disable import/no-default-export */
import { baseConfig } from "@repo/eslint/base";
import { defineConfig } from "eslint/config";

export default defineConfig(baseConfig);
30 changes: 30 additions & 0 deletions apps/browser-scraper/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"name": "@repo/browser-scraper",
"private": true,
"type": "module",
"scripts": {
"dev": "wrangler dev",
"deploy": "wrangler deploy",
"typecheck": "tsc",
"lint": "eslint .",
"test": "vitest run --passWithNoTests"
},
"dependencies": {
"@cloudflare/playwright": "1.2.0",
"@repo/logger": "workspace:*",
"@repo/url-validator": "workspace:*",
"zod": "4.3.6"
},
"devDependencies": {
"@cloudflare/workers-types": "4.20260412.1",
"@repo/eslint": "workspace:*",
"@repo/prettier": "workspace:*",
"@repo/typescript": "workspace:*",
"@types/node": "25.6.0",
"eslint": "9.39.1",
"prettier": "3.8.2",
"typescript": "6.0.2",
"vitest": "4.1.4",
"wrangler": "4.81.1"
}
}
64 changes: 64 additions & 0 deletions apps/browser-scraper/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import type { BrowserWorker } from "@cloudflare/playwright";
import { Logger } from "@repo/logger";
import { validateSourceUrl } from "@repo/url-validator";
import { z } from "zod";

import { PlaywrightService } from "./services/playwright";

// Bindings available to this worker: BROWSER is the browser binding that is
// handed to PlaywrightService when a render is requested.
type Env = { BROWSER: BrowserWorker };

// Shape of the accepted request body: a JSON object with a string `url`.
// Whether the URL is acceptable is decided separately by validateSourceUrl.
const requestSchema = z.object({ url: z.string() });

/**
 * Worker fetch handler: accepts a POST whose JSON body is `{ url }`, checks the
 * URL with `validateSourceUrl`, renders it through `PlaywrightService`, and
 * returns the render result as JSON.
 *
 * Responses:
 * - 405 with `Allow: POST` for any other HTTP method.
 * - 400 when the body is not JSON, does not match `requestSchema`, or the URL
 *   is rejected by `validateSourceUrl`.
 * - 500 when `render` throws; the failure is logged and a generic message is
 *   returned to the caller.
 * - Otherwise the render result serialized with the default status. Note that
 *   this includes `ok: false` results that `render` returns without throwing.
 *
 * @param request - Incoming HTTP request.
 * @param env - Worker bindings; `env.BROWSER` is passed to `PlaywrightService`.
 * @returns A JSON response as described above.
 */
const handle = async (request: Request, env: Env): Promise<Response> => {
  if (request.method !== "POST") {
    // RFC 9110 requires a 405 response to list the supported methods.
    return Response.json(
      { ok: false, error: "Method not allowed" },
      { status: 405, headers: { Allow: "POST" } }
    );
  }

  let targetUrl: string;
  try {
    const body: unknown = await request.json();
    targetUrl = requestSchema.parse(body).url;
  } catch (error) {
    // Covers both malformed JSON and schema violations.
    return Response.json(
      {
        ok: false,
        error: error instanceof Error ? error.message : "Invalid body",
      },
      { status: 400 }
    );
  }

  const check = validateSourceUrl(targetUrl);
  if (!check.valid) {
    return Response.json(
      { ok: false, error: `Invalid URL: ${check.reason}` },
      { status: 400 }
    );
  }

  // Only requests that reach the render step need a logger.
  const logger = new Logger({ context: "browser-scraper" });
  try {
    const result = await new PlaywrightService(env.BROWSER, logger).render(
      targetUrl
    );
    return Response.json(result);
  } catch (error) {
    logger.error("unexpected render failure", {
      url: targetUrl,
      errorMessage: error instanceof Error ? error.message : "Unknown error",
    });
    return Response.json(
      { ok: false, error: "Internal render failure" },
      { status: 500 }
    );
  }
};

// Module default export: an object whose `fetch` property is the `handle`
// function defined above.
// eslint-disable-next-line import/no-default-export
export default {
fetch: handle,
};
117 changes: 117 additions & 0 deletions apps/browser-scraper/src/services/playwright.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import { Logger } from "@repo/logger";
import { beforeEach, describe, expect, it, vi } from "vitest";

vi.mock("@cloudflare/playwright", () => ({
launch: vi.fn(),
}));

const { launch } = await import("@cloudflare/playwright");
const { PlaywrightService } = await import("./playwright");

// Signature of the mocked `page.goto`: resolves to a minimal response object
// exposing `status()` and `headers()`, or to null when there is no response.
type GotoFn = () => Promise<{
status: () => number;
headers: () => Record<string, string>;
} | null>;

/**
 * Builds a fake browser object for tests, together with the spy attached to
 * its `close` method.
 *
 * The fake exposes `newPage()` resolving to a page with `route`, `goto`, `url`
 * and `content` mocks. Each override replaces one default:
 * - `status` / `contentType`: values reported by the default navigation response
 *   (200 and "text/html" when omitted).
 * - `finalUrl`: value returned by `page.url()` ("https://example.com/" when omitted).
 * - `html`: value resolved by `page.content()` ("<html></html>" when omitted).
 * - `goto`: replaces the navigation function entirely.
 */
const makeMockBrowser = (
  overrides: {
    status?: number;
    contentType?: string;
    finalUrl?: string;
    html?: string;
    goto?: GotoFn;
  } = {}
): { browser: unknown; closeMock: ReturnType<typeof vi.fn> } => {
  // Used only when the caller did not supply its own navigation function.
  const defaultGoto: GotoFn = () =>
    Promise.resolve({
      status: () => overrides.status ?? 200,
      headers: () => ({
        "content-type": overrides.contentType ?? "text/html",
      }),
    });
  const navigate = overrides.goto ?? defaultGoto;

  const reportedUrl = overrides.finalUrl ?? "https://example.com/";
  const serializedHtml = overrides.html ?? "<html></html>";

  const fakePage = {
    route: vi.fn().mockResolvedValue(undefined),
    goto: vi.fn(navigate),
    url: vi.fn().mockReturnValue(reportedUrl),
    content: vi.fn().mockResolvedValue(serializedHtml),
  };

  const closeMock = vi.fn().mockResolvedValue(undefined);
  const fakeBrowser = {
    newPage: vi.fn().mockResolvedValue(fakePage),
    close: closeMock,
  };

  return { browser: fakeBrowser, closeMock };
};

// Unit tests for PlaywrightService.render. `launch` is mocked at module level,
// so every case supplies its own fake browser and checks both the returned
// result and that the browser was closed afterwards.
describe("PlaywrightService.render", () => {
  const logger = new Logger({ context: "test" });

  beforeEach(() => {
    // Drop any queued mockResolvedValueOnce values left by a previous test.
    vi.mocked(launch).mockReset();
  });

  it("returns rendered html on a successful navigation", async () => {
    const { browser, closeMock } = makeMockBrowser({
      html: "<html><body>hello</body></html>",
    });
    vi.mocked(launch).mockResolvedValueOnce(browser as never);

    const result = await new PlaywrightService({} as never, logger).render(
      "https://example.com/"
    );

    expect(result).toEqual({
      ok: true,
      html: "<html><body>hello</body></html>",
      finalUrl: "https://example.com/",
      status: 200,
      contentType: "text/html",
      truncated: false,
    });
    expect(closeMock).toHaveBeenCalledOnce();
  });

  it("rejects when the post-redirect URL is unsafe", async () => {
    const { browser, closeMock } = makeMockBrowser({
      finalUrl: "https://127.0.0.1/secrets",
    });
    vi.mocked(launch).mockResolvedValueOnce(browser as never);

    const result = await new PlaywrightService({} as never, logger).render(
      "https://example.com/"
    );

    expect(result.ok).toBe(false);
    if (!result.ok) {
      expect(result.error).toMatch(/Unsafe final URL/);
    }
    expect(closeMock).toHaveBeenCalledOnce();
  });

  it("returns an error when goto throws", async () => {
    const { browser, closeMock } = makeMockBrowser({
      goto: () => Promise.reject(new Error("Navigation timeout")),
    });
    vi.mocked(launch).mockResolvedValueOnce(browser as never);

    const result = await new PlaywrightService({} as never, logger).render(
      "https://example.com/"
    );

    expect(result).toEqual({ ok: false, error: "Navigation timeout" });
    expect(closeMock).toHaveBeenCalledOnce();
  });

  it("returns an error when navigation yields no response", async () => {
    const { browser, closeMock } = makeMockBrowser({
      goto: () => Promise.resolve(null),
    });
    vi.mocked(launch).mockResolvedValueOnce(browser as never);

    const result = await new PlaywrightService({} as never, logger).render(
      "https://example.com/"
    );

    expect(result).toEqual({ ok: false, error: "No navigation response" });
    expect(closeMock).toHaveBeenCalledOnce();
  });

  it("returns an error for non-2xx page status", async () => {
    const { browser, closeMock } = makeMockBrowser({ status: 404 });
    vi.mocked(launch).mockResolvedValueOnce(browser as never);

    const result = await new PlaywrightService({} as never, logger).render(
      "https://example.com/missing"
    );

    expect(result).toEqual({ ok: false, error: "HTTP 404", status: 404 });
    // The early return must still release the browser, as in the other cases.
    expect(closeMock).toHaveBeenCalledOnce();
  });
});
102 changes: 102 additions & 0 deletions apps/browser-scraper/src/services/playwright.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import { launch } from "@cloudflare/playwright";
import type { BrowserWorker } from "@cloudflare/playwright";
import type { Logger } from "@repo/logger";
import { validateSourceUrl } from "@repo/url-validator";

// Timeout, in milliseconds, passed to page.goto for the top-level navigation.
const NAV_TIMEOUT_MS = 15_000;
// Maximum length of the returned HTML, measured with String.length (UTF-16
// code units, not bytes). Longer output is cut and flagged as truncated.
const MAX_HTML_CHARS = 2 * 1024 * 1024;

// Successful render: the (possibly truncated) serialized page, the URL the
// page ended up on, the navigation response status, the response's
// content-type header ("" when absent), and whether the HTML was cut.
type RenderSuccess = {
ok: true;
html: string;
finalUrl: string;
status: number;
contentType: string;
truncated: boolean;
};
// Failed render: a human-readable reason; `status` is set only when the
// failure is a non-2xx navigation response.
type RenderError = { ok: false; error: string; status?: number };
// Discriminated on `ok`.
type RenderResult = RenderSuccess | RenderError;

/**
 * Renders a page in a browser launched from the worker's browser binding and
 * returns its serialized HTML.
 */
class PlaywrightService {
  constructor(
    private readonly browser: BrowserWorker,
    private readonly logger: Logger
  ) {}

  /**
   * Cuts `html` to at most MAX_HTML_CHARS characters.
   *
   * If the cut would fall between the two halves of a UTF-16 surrogate pair,
   * the dangling high surrogate is dropped as well so the result never ends in
   * a lone surrogate.
   */
  private static clipHtml(html: string): string {
    if (html.length <= MAX_HTML_CHARS) {
      return html;
    }
    const lastKept = html.charCodeAt(MAX_HTML_CHARS - 1);
    const endsOnHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
    return html.slice(
      0,
      endsOnHighSurrogate ? MAX_HTML_CHARS - 1 : MAX_HTML_CHARS
    );
  }

  /**
   * Navigates to `url` and returns the page's HTML.
   *
   * Every request the page issues is checked with `validateSourceUrl` and
   * aborted when rejected; the URL the page ends up on is checked again after
   * navigation. Expected failures (navigation error, missing response, unsafe
   * final URL, non-2xx status) are reported as `{ ok: false, ... }` results
   * rather than thrown.
   *
   * @param url - Page to load.
   * @returns A success result with the (possibly truncated) HTML, or an error
   *   result describing why the render was rejected.
   * @throws If launching the browser, opening the page, installing the route
   *   handler, or serializing the page content fails.
   */
  async render(url: string): Promise<RenderResult> {
    const start = Date.now();
    const browser = await launch(this.browser);
    try {
      const page = await browser.newPage();

      // Gate every request made by the page, not just the top-level one.
      await page.route("**/*", (route) => {
        const reqUrl = route.request().url();
        const check = validateSourceUrl(reqUrl);
        if (!check.valid) {
          this.logger.warn("blocked unsafe subresource", {
            url: reqUrl,
            reason: check.reason,
          });
          return route.abort();
        }
        return route.continue();
      });

      let response;
      try {
        response = await page.goto(url, {
          waitUntil: "domcontentloaded",
          timeout: NAV_TIMEOUT_MS,
        });
      } catch (error) {
        return {
          ok: false,
          error: error instanceof Error ? error.message : "Navigation failed",
        };
      }

      if (!response) {
        return { ok: false, error: "No navigation response" };
      }

      // The page may have been redirected, so validate where it ended up.
      const finalUrl = page.url();
      const finalCheck = validateSourceUrl(finalUrl);
      if (!finalCheck.valid) {
        return {
          ok: false,
          error: `Unsafe final URL after redirect: ${finalCheck.reason}`,
        };
      }

      const status = response.status();
      if (status < 200 || status >= 300) {
        return { ok: false, error: `HTTP ${String(status)}`, status };
      }

      const headers = response.headers();
      const contentType = headers["content-type"] ?? "";

      // page.content() hands back the whole serialized document, so the cap
      // can only be applied afterwards: it limits the payload returned to the
      // caller, not browser memory. It counts characters, not bytes.
      const html = await page.content();
      const truncated = html.length > MAX_HTML_CHARS;
      const finalHtml = PlaywrightService.clipHtml(html);

      return {
        ok: true,
        html: finalHtml,
        finalUrl,
        status,
        contentType,
        truncated,
      };
    } finally {
      // A failing close must not replace an already-computed result with an
      // exception, nor skip the completion log below.
      try {
        await browser.close();
      } catch (error) {
        this.logger.warn("failed to close browser", {
          url,
          errorMessage: error instanceof Error ? error.message : "Unknown error",
        });
      }
      this.logger.info("browser render finished", {
        url,
        browserDurationMs: Date.now() - start,
      });
    }
  }
}

export { MAX_HTML_CHARS, NAV_TIMEOUT_MS, PlaywrightService };
export type { RenderError, RenderResult, RenderSuccess };
8 changes: 8 additions & 0 deletions apps/browser-scraper/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"extends": "@repo/typescript/base.json",
"compilerOptions": {
"types": ["@cloudflare/workers-types", "vitest/globals"]
},
"include": ["src"],
"exclude": ["node_modules"]
}
17 changes: 17 additions & 0 deletions apps/browser-scraper/wrangler.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"$schema": "node_modules/wrangler/config-schema.json",
"name": "switch-operator-browser-scraper",
"main": "src/index.ts",
"compatibility_date": "2026-03-29",
"compatibility_flags": ["nodejs_compat"],
"workers_dev": false,
"observability": {
"logs": {
"enabled": true,
"invocation_logs": true,
},
},
"browser": {
"binding": "BROWSER",
},
}
1 change: 1 addition & 0 deletions apps/operator/migrations/0005_nervous_scorpion.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ALTER TABLE `schedules` ADD `use_browser` integer DEFAULT false NOT NULL;
Loading
Loading