Skip to content

Commit

Permalink
Update plugin for website-scraper v5 (#39)
Browse files Browse the repository at this point in the history
* Update plugin for website-scraper v5
* Migrate from CommonJS to ESM
* Add workaround for the special chars
  • Loading branch information
aivus committed Dec 29, 2021
1 parent 70601fe commit 0ae842b
Show file tree
Hide file tree
Showing 8 changed files with 42 additions and 31 deletions.
1 change: 1 addition & 0 deletions .eslintrc.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
extends: "eslint:recommended"
parserOptions:
ecmaVersion: 8
sourceType: "module"
env:
node: true
es6: true
Expand Down
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ Plugin for [website-scraper](https://github.com/website-scraper/node-website-scr
This module is Open Source Software maintained by one developer in their free time. If you want to thank the author of this module you can use [GitHub Sponsors](https://github.com/sponsors/s0ph1e) or [Patreon](https://www.patreon.com/s0ph1e).

## Requirements
* nodejs version >= 14
* website-scraper version >= 4
* nodejs version >= 14.14
* website-scraper version >= 5

## Installation
```sh
Expand All @@ -19,10 +19,10 @@ npm install website-scraper website-scraper-puppeteer

## Usage
```javascript
const scrape = require('website-scraper');
const PuppeteerPlugin = require('website-scraper-puppeteer');
import scrape from 'website-scraper';
import PuppeteerPlugin from 'website-scraper-puppeteer';

scrape({
await scrape({
urls: ['https://www.instagram.com/gopro/'],
directory: '/path/to/save',
plugins: [
Expand All @@ -35,7 +35,7 @@ scrape({
});
```
The Puppeteer plugin constructor accepts the following params:
* `launchOptions` - *(optional)* - puppeteer launch options, can be found in [puppeteer docs](https://github.com/puppeteer/puppeteer/blob/v1.20.0/docs/api.md#puppeteerlaunchoptions)
* `launchOptions` - *(optional)* - puppeteer launch options, can be found in [puppeteer docs](https://github.com/puppeteer/puppeteer/blob/v13.0.1/docs/api.md#puppeteerlaunchoptions)
* `scrollToBottom` - *(optional)* - in some cases, the page needs to be scrolled down to render its assets (lazyloading). Because some pages can be really endless, the scrolldown process can be interrupted before reaching the bottom when one or both of the below limitations are reached:
* `timeout` - in milliseconds
* `viewportN` - viewport height multiplier
Expand Down
2 changes: 1 addition & 1 deletion lib/browserUtils/scrollToBottom.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module.exports = async (timeout, viewportN) => {
export default async (timeout, viewportN) => {
await new Promise((resolve) => {
let totalHeight = 0, distance = 200, duration = 0, maxHeight = window.innerHeight * viewportN;
const timer = setInterval(() => {
Expand Down
10 changes: 5 additions & 5 deletions lib/index.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
const puppeteer = require('puppeteer');
const logger = require('./logger.js');
const scrollToBottomBrowser = require('./browserUtils/scrollToBottom.js');
import puppeteer from 'puppeteer';
import logger from './logger.js';
import scrollToBottomBrowser from './browserUtils/scrollToBottom.js';

class PuppeteerPlugin {
constructor ({
Expand Down Expand Up @@ -33,7 +33,7 @@ class PuppeteerPlugin {
const contentType = response.headers['content-type'];
const isHtml = contentType && contentType.split(';')[0] === 'text/html';
if (isHtml) {
const url = response.request.href;
const url = response.url;
const page = await this.browser.newPage();

if (hasValues(this.headers)) {
Expand Down Expand Up @@ -89,4 +89,4 @@ async function blockNavigation (page, url) {
await page.setRequestInterception(true);
}

module.exports = PuppeteerPlugin;
export default PuppeteerPlugin;
4 changes: 2 additions & 2 deletions lib/logger.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
const debug = require('debug');
import debug from 'debug';

const appName = 'website-scraper-puppeteer';
const logLevels = ['error', 'warn', 'info', 'debug', 'log'];
Expand All @@ -8,4 +8,4 @@ logLevels.forEach(logLevel => {
logger[logLevel] = debug(`${appName}:${logLevel}`);
});

module.exports = logger;
export default logger;
13 changes: 8 additions & 5 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
{
"name": "website-scraper-puppeteer",
"version": "0.1.5",
"version": "1.0.0",
"description": "Plugin for website-scraper which returns html for dynamic websites using puppeteer",
"readmeFilename": "README.md",
"main": "lib/index.js",
"type": "module",
"exports": {
".": "./lib/index.js"
},
"keywords": [
"website-scraper",
"puppeteer",
Expand All @@ -17,7 +20,7 @@
"puppeteer": "^13.0.1"
},
"peerDependencies": {
"website-scraper": "^4.0.0"
"website-scraper": "^5.0.0"
},
"devDependencies": {
"c8": "^7.10.0",
Expand All @@ -27,7 +30,7 @@
"fs-extra": "^10.0.0",
"mocha": "^9.1.3",
"serve-static": "^1.13.2",
"website-scraper": "^4.0.0"
"website-scraper": "^5.0.0"
},
"scripts": {
"test": "c8 --all --reporter=text --reporter=lcov mocha --recursive --timeout 10000",
Expand All @@ -47,6 +50,6 @@
"lib"
],
"engines": {
"node": ">=14"
"node": ">=14.14"
}
}
9 changes: 7 additions & 2 deletions test/mock/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,14 @@
<script>
window.onload = function() {
document.getElementById('root').innerText = 'Hello world from JS!';
document.getElementById('special-characters-test').innerText = '저는 7년 동안 한국에서 살았어요. Слава Україні!';
/**
* TODO: Original innerText "저는 7년 동안 한국에서 살았어요. Слава Україні!" was changed due to issues
* with cheerio and website-scraper itself.
* See https://github.com/cheeriojs/cheerio/pull/2280
*/
document.getElementById('special-characters-test').innerText = '7년 동안 한국에서 살았어요. Слава Україні!';
};
</script>

</body>
</html>
</html>
22 changes: 12 additions & 10 deletions test/puppeteer-plugin.test.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
const { expect } = require('chai');
const http = require('http');
const finalhandler = require('finalhandler');
const serveStatic = require('serve-static');
const fs = require('fs-extra');
const scrape = require('website-scraper');
const PuppeteerPlugin = require('../lib');
import chai from 'chai';
import http from 'http';
import finalhandler from 'finalhandler';
import serveStatic from 'serve-static';
import fs from 'fs-extra';
import scrape from 'website-scraper';
import PuppeteerPlugin from '../lib/index.js';

const directory = __dirname + '/tmp';
const { expect } = chai;

const directory = './test/tmp';
const SERVE_WEBSITE_PORT = 4567;

describe('Puppeteer plugin test', () => {
Expand Down Expand Up @@ -39,7 +41,7 @@ describe('Puppeteer plugin test', () => {
});

it('should render special characters correctly', async () => {
expect(content).to.contain('<div id="special-characters-test">저는 7년 동안 한국에서 살았어요. Слава Україні!</div>');
expect(content).to.contain('<div id="special-characters-test">7년 동안 한국에서 살았어요. Слава Україні!</div>');
});
});

Expand Down Expand Up @@ -69,7 +71,7 @@ describe('Puppeteer plugin test', () => {
});

function startWebserver(port = 3000) {
const serve = serveStatic(__dirname + '/mock', {'index': ['index.html']});
const serve = serveStatic('./test/mock', {'index': ['index.html']});
const server = http.createServer(function onRequest (req, res) {
serve(req, res, finalhandler(req, res))
});
Expand Down

0 comments on commit 0ae842b

Please sign in to comment.