Skip to content

Commit

Permalink
Screencast Support for Debugging (fixes #43) (#52)
Browse files Browse the repository at this point in the history
* screencast support (fixes #43):

- add NewWindowPage concurrency mode to support opening new window, and also reusing pages

- add --screencastPort cli options to enable screencasting, uses websockets to stream frames to client

- concurrency: add separate 'window' concurrency for opening new window per-page in same session, useful for screencasting with multiple workers but within same session

* add warning if using screencasting + more than one worker + page context, recommend 'window'

* cleanup: remove debug console, bump py-wacz dependency, improve close message

* README: add screencasting info to README
  • Loading branch information
ikreymer committed Jun 8, 2021
1 parent e7d3767 commit ae4ce97
Show file tree
Hide file tree
Showing 8 changed files with 4,839 additions and 4,485 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ RUN yarn install
ADD config.yaml /app/
ADD uwsgi.ini /app/
ADD *.js /app/
ADD screencast/ /app/screencast/

RUN ln -s /app/main.js /usr/bin/crawl
RUN ln -s /app/create-login-profile.js /usr/bin/create-login-profile
Expand Down
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,18 @@ Options:
--profile Path to tar.gz file which will be
extracted and used as the browser
profile [string]
--screencastPort If set to a non-zero value, starts
an HTTP server with screencast
accessible on this port
[number] [default: 0]
```

For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options).

The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example),
while `--waitUntil networkidle0` may make sense for dynamic sites.


### Behaviors

Browsertrix Crawler also supports automatically running customized in-browser behaviors. The behaviors auto-play videos (when possible),
Expand All @@ -140,6 +145,30 @@ Behaviors to run can be specified via a comma-separated list passed to the `--be

See [Browsertrix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) for more info on all of the currently available behaviors.


### Watching the crawl -- Screencasting

With version 0.4.0, Browsertrix Crawler includes an experimental 'screencasting' option, which allows watching the crawl in real-time via screencast (connected via a websocket).

To enable it, add the `--screencastPort` command-line option and also map the port on the docker container. An example command might be:

```
docker-compose run -p 9037:9037 crawler crawl --url [URL] --screencastPort 9037
```

Then, you can open `http://localhost:9037/` and watch the crawl.

Note: If specifying multiple workers, the crawler should additionally be instructed to open each one in a new window, otherwise the screencasting can only update one page at a time.

For example,

```
docker-compose run -p 9037:9037 crawler crawl --url [URL] --screencastPort 9037 --newContext window --workers 3
```

will start a crawl with 3 workers, and show the screen of each of the workers from `http://localhost:9037/`.


## Creating and Using Browser Profiles

Browsertrix Crawler also includes a way to use existing browser profiles when running a crawl. This allows pre-configuring the browser, such as by logging in
Expand Down
35 changes: 32 additions & 3 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ const HTTPS_AGENT = require("https").Agent({

const HTTP_AGENT = require("http").Agent();

const { ScreenCaster, NewWindowPage } = require("./screencaster");


// ============================================================================
class Crawler {
Expand Down Expand Up @@ -176,7 +178,7 @@ class Crawler {
},

"newContext": {
describe: "The context for each new capture, can be a new: page, session or browser.",
describe: "The context for each new capture, can be a new: page, window, session or browser.",
default: "page",
type: "string"
},
Expand Down Expand Up @@ -318,6 +320,12 @@ class Crawler {
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
type: "string",
},

"screencastPort": {
describe: "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
type: "number",
default: 0
}
};
}

Expand Down Expand Up @@ -385,6 +393,9 @@ class Crawler {
switch (argv.newContext) {
case "page":
argv.newContext = Cluster.CONCURRENCY_PAGE;
if (argv.screencastPort && argv.workers > 1) {
console.warn("Note: Screencast with >1 workers and default page context may only show one page at a time. To fix, add '--newContext window' to open each page in a new window");
}
break;

case "session":
Expand All @@ -395,6 +406,10 @@ class Crawler {
argv.newContext = Cluster.CONCURRENCY_BROWSER;
break;

case "window":
argv.newContext = NewWindowPage;
break;

default:
throw new Error("Invalid newContext, must be one of: page, session, browser");
}
Expand Down Expand Up @@ -537,13 +552,18 @@ class Crawler {

async crawlPage({page, data}) {
try {
if (this.screencaster) {
await this.screencaster.newTarget(page.target());
}

if (this.emulateDevice) {
await page.emulate(this.emulateDevice);
}

if (this.behaviorOpts) {
if (this.behaviorOpts && !page.__bx_inited) {
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.behaviorOpts});`);
page.__bx_inited = true;
}

// run custom driver here
Expand All @@ -566,6 +586,10 @@ class Crawler {

await this.writeStats();

if (this.screencaster) {
await this.screencaster.endTarget(page.target());
}

} catch (e) {
console.warn(e);
}
Expand Down Expand Up @@ -616,7 +640,11 @@ class Crawler {
this.cluster.task((opts) => this.crawlPage(opts));

await this.initPages();


if (this.params.screencastPort) {
this.screencaster = new ScreenCaster(this.cluster, this.params.screencastPort);
}

if (this.params.urlFile) {
const urlSeedFile = await fsp.readFile(this.params.urlFile, "utf8");
const urlSeedFileList = urlSeedFile.split("\n");
Expand All @@ -627,6 +655,7 @@ class Crawler {
if (!this.params.urlFile) {
this.queueUrl(this.params.url);
}

if (this.params.useSitemap) {
await this.parseSitemap(this.params.useSitemap);
}
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
"ioredis": "^4.27.1",
"node-fetch": "^2.6.1",
"puppeteer-cluster": "^0.22.0",
"puppeteer-core": "^5.3.1",
"puppeteer-core": "^8.0.0",
"sitemapper": "^3.1.2",
"uuid": "8.3.2",
"ws": "^7.4.4",
"yargs": "^16.0.3"
},
"devDependencies": {
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#pywb>=2.5.0
git+https://github.com/webrecorder/pywb@main
uwsgi
wacz>=0.2.1
wacz>=0.3.0
76 changes: 76 additions & 0 deletions screencast/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
<!doctype html>
<html>
<head>
<style>
#content {
display: flex;
flex-direction: row;
flex-wrap: wrap;
}
#content img {
width: 640px;
height: 480px;
margin: 2rem;
}
</style>
<script>
// Free list of <img> elements whose target has ended; createImage() takes
// elements from the front of this list before building new ones.
const unusedElems = [];

// Connect to the "/ws" endpoint on the page's own origin. Replacing the first
// "http" with "ws" maps http:// -> ws:// and https:// -> wss://.
const ws = new WebSocket(`${window.location.origin.replace("http", "ws")}/ws`);

// Every incoming websocket message carries JSON text; pass it to the dispatcher.
ws.addEventListener("message", ({ data }) => handleMessage(data));

/**
 * Dispatches one message received over the screencast websocket.
 *
 * Handled message kinds (the `msg` field of the parsed JSON):
 *  - "newTarget" / "screencast": make sure an <img> exists for `id` and, when
 *    the message carries `data`, display it as the current frame.
 *  - "endTarget": release the <img> associated with `id` for later reuse.
 * Any other kind is ignored.
 *
 * @param {string} resp - Raw JSON text of the websocket message.
 */
function handleMessage(resp) {
  const message = JSON.parse(resp);

  switch (message.msg) {
    case "newTarget":
    case "screencast": {
      // Block-scoped declaration: the image must not leak out as an implicit
      // global (which would also throw in strict-mode / module code).
      const img = createImage(message.id);
      if (message.data) {
        setImageData(img, message.data);
      }
      break;
    }

    case "endTarget":
      unuseImage(message.id);
      break;
  }
}

/**
 * Shows a frame in the given image by pointing its `src` at a data URI.
 *
 * The URI is always labelled `image/png` and `data` is appended verbatim as
 * the base64 payload.
 *
 * @param {object} img - Image element (anything with a writable `src`).
 * @param {string} data - Base64-encoded frame contents.
 */
function setImageData(img, data) {
  const dataUriPrefix = "data:image/png;base64,";
  img.src = dataUriPrefix + data;
}

/**
 * Returns the <img> element that displays the target with the given id,
 * creating or recycling one when none exists yet.
 *
 * Lookup order:
 *  1. an element that already carries this id,
 *  2. the oldest element in the `unusedElems` free list (re-labelled with id),
 *  3. a brand-new <img> appended to the "content" container.
 *
 * @param {string} id - Target id, used as the element's `id` attribute.
 * @returns {object} The element to draw this target's frames into.
 */
function createImage(id) {
  const existing = document.getElementById(id);
  if (existing) {
    // An ended target keeps its id while waiting in the free list. If that
    // same id becomes active again, take the element off the list so it is
    // not handed to a different target while still in use.
    for (let i = unusedElems.length - 1; i >= 0; i--) {
      if (unusedElems[i] === existing) {
        unusedElems.splice(i, 1);
      }
    }
    return existing;
  }

  if (unusedElems.length) {
    const recycled = unusedElems.shift();
    recycled.setAttribute("id", id);
    return recycled;
  }

  const created = document.createElement("img");
  created.setAttribute("id", id);
  document.getElementById("content").appendChild(created);
  return created;
}

/**
 * Marks the <img> belonging to an ended target as available for reuse by
 * adding it to the `unusedElems` free list. The element stays in the page and
 * keeps showing its last frame.
 *
 * Unknown ids are ignored, and an element that is already in the free list is
 * not queued a second time, so it cannot be recycled for two targets.
 *
 * @param {string} id - Target id whose image should be released.
 */
function unuseImage(id) {
  const elem = document.getElementById(id);
  if (!elem || unusedElems.includes(elem)) {
    return;
  }
  unusedElems.push(elem);
}
</script>
</head>
<body>
<div id="content">
</div>
</body>
</html>

0 comments on commit ae4ce97

Please sign in to comment.