From 0ae842b6918123721eabf7651d9c2506977067f6 Mon Sep 17 00:00:00 2001
From: Ilya Antipenko <1021703+aivus@users.noreply.github.com>
Date: Wed, 29 Dec 2021 22:00:52 +0000
Subject: [PATCH] Update plugin for website-scraper v5 (#39)

* Update plugin for website-scraper v5
* Migrate from CommonJS to ESM
* Add a workaround for special characters
---
 .eslintrc.yml                      |  1 +
 README.md                          | 12 ++++++------
 lib/browserUtils/scrollToBottom.js |  2 +-
 lib/index.js                       | 10 +++++-----
 lib/logger.js                      |  4 ++--
 package.json                       | 13 ++++++++-----
 test/mock/index.html               |  9 +++++++--
 test/puppeteer-plugin.test.js      | 22 ++++++++++++----------
 8 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/.eslintrc.yml b/.eslintrc.yml
index ab11baa..2eabecb 100644
--- a/.eslintrc.yml
+++ b/.eslintrc.yml
@@ -1,6 +1,7 @@
 extends: "eslint:recommended"
 parserOptions:
   ecmaVersion: 8
+  sourceType: "module"
 env:
   node: true
   es6: true
diff --git a/README.md b/README.md
index 9f34e13..f8616d5 100644
--- a/README.md
+++ b/README.md
@@ -9,8 +9,8 @@ Plugin for [website-scraper](https://github.com/website-scraper/node-website-scr
 This module is open source software maintained by one developer in their free time. If you want to thank the author of this module you can use [GitHub Sponsors](https://github.com/sponsors/s0ph1e) or [Patreon](https://www.patreon.com/s0ph1e).

 ## Requirements
-* nodejs version >= 14
-* website-scraper version >= 4
+* nodejs version >= 14.14
+* website-scraper version >= 5

 ## Installation
 ```sh
 npm install website-scraper website-scraper-puppeteer
 ```

 ## Usage
 ```javascript
-const scrape = require('website-scraper');
-const PuppeteerPlugin = require('website-scraper-puppeteer');
+import scrape from 'website-scraper';
+import PuppeteerPlugin from 'website-scraper-puppeteer';

-scrape({
+await scrape({
   urls: ['https://www.instagram.com/gopro/'],
   directory: '/path/to/save',
   plugins: [
@@ -35,7 +35,7 @@ scrape({
 });
 ```
 The Puppeteer plugin constructor accepts the following params:
-* `launchOptions` - *(optional)* - puppeteer launch options, can be found in [puppeteer docs](https://github.com/puppeteer/puppeteer/blob/v1.20.0/docs/api.md#puppeteerlaunchoptions)
+* `launchOptions` - *(optional)* - puppeteer launch options, can be found in [puppeteer docs](https://github.com/puppeteer/puppeteer/blob/v13.0.1/docs/api.md#puppeteerlaunchoptions)
 * `scrollToBottom` - *(optional)* - in some cases the page needs to be scrolled down to render its assets (lazy loading). Because some pages can be effectively endless, the scroll-down process can be interrupted before reaching the bottom when one or both of the limits below are reached:
   * `timeout` - in milliseconds
   * `viewportN` - viewport height multiplier
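For illustration, a minimal sketch of how these options might be combined, assuming `scrollToBottom` takes an object with the `timeout` and `viewportN` fields described above (URL and directory are placeholders):

```javascript
import scrape from 'website-scraper';
import PuppeteerPlugin from 'website-scraper-puppeteer';

await scrape({
	urls: ['https://example.com'], // placeholder URL
	directory: '/path/to/save',    // placeholder path
	plugins: [
		new PuppeteerPlugin({
			launchOptions: { headless: true },
			// Stop scrolling after 10 s or after 10 viewport heights,
			// whichever limit is hit first.
			scrollToBottom: { timeout: 10000, viewportN: 10 }
		})
	]
});
```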
diff --git a/lib/browserUtils/scrollToBottom.js b/lib/browserUtils/scrollToBottom.js
index 8bd83f2..4d89f5c 100644
--- a/lib/browserUtils/scrollToBottom.js
+++ b/lib/browserUtils/scrollToBottom.js
@@ -1,4 +1,4 @@
-module.exports = async (timeout, viewportN) => {
+export default async (timeout, viewportN) => {
 	await new Promise((resolve) => {
 		let totalHeight = 0, distance = 200, duration = 0, maxHeight = window.innerHeight * viewportN;
 		const timer = setInterval(() => {
diff --git a/lib/index.js b/lib/index.js
index c1877b3..2bc45b8 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -1,6 +1,6 @@
-const puppeteer = require('puppeteer');
-const logger = require('./logger.js');
-const scrollToBottomBrowser = require('./browserUtils/scrollToBottom.js');
+import puppeteer from 'puppeteer';
+import logger from './logger.js';
+import scrollToBottomBrowser from './browserUtils/scrollToBottom.js';

 class PuppeteerPlugin {
 	constructor ({
@@ -33,7 +33,7 @@ class PuppeteerPlugin {
 			const contentType = response.headers['content-type'];
 			const isHtml = contentType && contentType.split(';')[0] === 'text/html';
 			if (isHtml) {
-				const url = response.request.href;
+				const url = response.url;
 				const page = await this.browser.newPage();

 				if (hasValues(this.headers)) {
@@ -89,4 +89,4 @@ async function blockNavigation (page, url) {
 	await page.setRequestInterception(true);
 }

-module.exports = PuppeteerPlugin;
+export default PuppeteerPlugin;
diff --git a/lib/logger.js b/lib/logger.js
index 59e025c..5e7bb2a 100644
--- a/lib/logger.js
+++ b/lib/logger.js
@@ -1,4 +1,4 @@
-const debug = require('debug');
+import debug from 'debug';

 const appName = 'website-scraper-puppeteer';
 const logLevels = ['error', 'warn', 'info', 'debug', 'log'];
@@ -8,4 +8,4 @@ logLevels.forEach(logLevel => {
 	logger[logLevel] = debug(`${appName}:${logLevel}`);
 });

-module.exports = logger;
+export default logger;
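Since the logger above creates one `debug` namespace per log level under `website-scraper-puppeteer:`, its output stays silent unless a namespace is enabled. A sketch of turning it on programmatically via the `debug` package's standard API (equivalent to setting `DEBUG=website-scraper-puppeteer:*` in the environment):

```javascript
import debug from 'debug';

// Enable all of the plugin's log levels: error, warn, info, debug, log.
debug.enable('website-scraper-puppeteer:*');
```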
diff --git a/package.json b/package.json
index a9e994b..ca5beb1 100644
--- a/package.json
+++ b/package.json
@@ -1,9 +1,12 @@
 {
   "name": "website-scraper-puppeteer",
-  "version": "0.1.5",
+  "version": "1.0.0",
   "description": "Plugin for website-scraper which returns html for dynamic websites using puppeteer",
   "readmeFilename": "README.md",
-  "main": "lib/index.js",
+  "type": "module",
+  "exports": {
+    ".": "./lib/index.js"
+  },
   "keywords": [
     "website-scraper",
     "puppeteer",
@@ -17,7 +20,7 @@
     "puppeteer": "^13.0.1"
   },
   "peerDependencies": {
-    "website-scraper": "^4.0.0"
+    "website-scraper": "^5.0.0"
   },
   "devDependencies": {
     "c8": "^7.10.0",
@@ -27,7 +30,7 @@
     "fs-extra": "^10.0.0",
     "mocha": "^9.1.3",
     "serve-static": "^1.13.2",
-    "website-scraper": "^4.0.0"
+    "website-scraper": "^5.0.0"
   },
   "scripts": {
     "test": "c8 --all --reporter=text --reporter=lcov mocha --recursive --timeout 10000",
@@ -47,6 +50,6 @@
     "lib"
   ],
   "engines": {
-    "node": ">=14"
+    "node": ">=14.14"
   }
 }
diff --git a/test/mock/index.html b/test/mock/index.html
index 819b65e..00109f5 100644
--- a/test/mock/index.html
+++ b/test/mock/index.html
@@ -12,9 +12,14 @@
-</html>
\ No newline at end of file
+</html>
diff --git a/test/puppeteer-plugin.test.js b/test/puppeteer-plugin.test.js
index d546a42..e6444bd 100644
--- a/test/puppeteer-plugin.test.js
+++ b/test/puppeteer-plugin.test.js
@@ -1,12 +1,14 @@
-const { expect } = require('chai');
-const http = require('http');
-const finalhandler = require('finalhandler');
-const serveStatic = require('serve-static');
-const fs = require('fs-extra');
-const scrape = require('website-scraper');
-const PuppeteerPlugin = require('../lib');
+import chai from 'chai';
+import http from 'http';
+import finalhandler from 'finalhandler';
+import serveStatic from 'serve-static';
+import fs from 'fs-extra';
+import scrape from 'website-scraper';
+import PuppeteerPlugin from '../lib/index.js';

-const directory = __dirname + '/tmp';
+const { expect } = chai;
+
+const directory = './test/tmp';
 const SERVE_WEBSITE_PORT = 4567;

 describe('Puppeteer plugin test', () => {
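The `__dirname` removals in this file follow from the ESM migration: ES modules do not define `__dirname`, so the tests switch to cwd-relative paths. Where a module-anchored path is still needed, the usual ESM replacement looks like this (a generic sketch, not part of the patch):

```javascript
import path from 'path';
import { fileURLToPath } from 'url';

// Rebuild __dirname from import.meta.url inside an ES module.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const directory = path.join(__dirname, 'tmp');
```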
@@ -39,7 +41,7 @@ describe('Puppeteer plugin test', () => {
 	});

 	it('should render special characters correctly', async () => {
-		expect(content).to.contain('저는 7년 동안 한국에서 살았어요. Слава Україні!');
+		expect(content).to.contain('7년 동안 한국에서 살았어요. Слава Україні!');
 	});
 });
@@ -69,7 +71,7 @@ describe('Puppeteer plugin test', () => {
 	});

 	function startWebserver(port = 3000) {
-		const serve = serveStatic(__dirname + '/mock', {'index': ['index.html']});
+		const serve = serveStatic('./test/mock', {'index': ['index.html']});
 		const server = http.createServer(function onRequest (req, res) {
 			serve(req, res, finalhandler(req, res))
 		});
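One consequence of the `"type": "module"` and `exports` changes for downstream users: the package can no longer be loaded with `require()`. For a consumer still on CommonJS, Node's standard workaround is a dynamic `import()` (a sketch, not part of the patch; URL and path are placeholders):

```javascript
// In a CommonJS file, require('website-scraper-puppeteer') now fails with
// ERR_REQUIRE_ESM; dynamic import() is the supported way to load an ES module.
async function run() {
	const { default: scrape } = await import('website-scraper');
	const { default: PuppeteerPlugin } = await import('website-scraper-puppeteer');

	await scrape({
		urls: ['https://example.com'],
		directory: '/path/to/save',
		plugins: [new PuppeteerPlugin()]
	});
}

run().catch(console.error);
```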