diff --git a/README.md b/README.md index 8da35183..75d92bc4 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,8 @@ scrape(options).then((result) => {}); * [urlFilter](#urlfilter) - skip some urls * [filenameGenerator](#filenamegenerator) - generate filename for downloaded resource * [requestConcurrency](#requestconcurrency) - set maximum concurrent requests +* [tempMode](#tempmode) - How to store data temporarily during processing +* [tempDir](#tempmode) - The directory to use to store temp files when `tempMode === 'filesystem'` * [plugins](#plugins) - plugins, allow to customize filenames, request options, response handling, saving to storage, etc. Default options you can find in [lib/config/defaults.js](https://github.com/website-scraper/node-website-scraper/blob/master/lib/config/defaults.js) or get them using @@ -83,15 +85,44 @@ How to download website to existing directory and why it's not supported by defa #### sources Array of objects to download, specifies selectors and attribute values to select files for downloading. By default scraper tries to download all possible resources. Scraper uses cheerio to select html elements so `selector` can be any [selector that cheerio supports](https://github.com/cheeriojs/cheerio#selectors). + +You can also specify custom `containerClass`es, which are responsible for reading from and writing to attributes. For example if you want to read JSON from an attribute... + ```javascript +class JsonContainerClass { + constructor (text) { + this.text = text || ''; + this.paths = []; + + if (this.text) { + this.paths = JSON.parse(this.text); + } + } + + getPaths () { + return this.paths; + } + + updateText (pathsToUpdate) { + this.paths = this.paths.map((oldPath) => { + const toUpdate = pathsToUpdate.find((x) => x.oldPath === oldPath); + + return toUpdate ? 
toUpdate.newPath : oldPath; + }); + + return JSON.stringify(this.paths); + } +} + // Downloading images, css files and scripts scrape({ urls: ['http://nodejs.org/'], directory: '/path/to/save', sources: [ - {selector: 'img', attr: 'src'}, - {selector: 'link[rel="stylesheet"]', attr: 'href'}, - {selector: 'script', attr: 'src'} + { selector: 'img', attr: 'src' }, + { selector: 'link[rel="stylesheet"]', attr: 'href' }, + { selector: 'script', attr: 'src' }, + { selector: 'div', attr: 'data-json', containerClass: JsonContainerClass } ] }); ``` @@ -199,6 +230,13 @@ scrape({ #### requestConcurrency Number, maximum amount of concurrent requests. Defaults to `Infinity`. +#### tempMode + +How to store temporary data when processing + +* `memory` - Data is stored in memory in its raw format (default). +* `memory-compressed` - Data is stored in memory but compressed using zlib. This is more memory efficient at the expense of CPU time spent compressing and decompressing. +* `filesystem` - Data is stored in temporary files on the filesystem. This is the most memory efficient but it is strongly recommended to only use this mode with a solid state drive. #### plugins @@ -331,7 +369,6 @@ Promise should be resolved with: * `body` (response body, string) * `encoding` (`binary` or `utf8`) used to save the file, binary used by default. * `metadata` (object) - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result. -* a binary `string`. This is advised against because of the binary assumption being made can foul up saving of `utf8` responses to the filesystem. If multiple actions `afterResponse` added - scraper will use result from last one. 
```javascript @@ -430,7 +467,7 @@ If multiple actions `saveResource` added - resource will be saved to multiple st ```javascript registerAction('saveResource', async ({resource}) => { const filename = resource.getFilename(); - const text = resource.getText(); + const text = await resource.getText(); await saveItSomewhere(filename, text); }); ``` diff --git a/lib/config/defaults.js b/lib/config/defaults.js index 14e00077..1e3a120e 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -63,7 +63,9 @@ const config = { recursive: false, maxRecursiveDepth: null, maxDepth: null, - ignoreErrors: false + ignoreErrors: false, + tempMode: 'memory', // other modes: 'memory-compressed', 'filesystem' + tempDir: undefined }; export default config; diff --git a/lib/plugins/save-resource-to-fs-plugin.js b/lib/plugins/save-resource-to-fs-plugin.js index b5cfab02..12f2ad95 100644 --- a/lib/plugins/save-resource-to-fs-plugin.js +++ b/lib/plugins/save-resource-to-fs-plugin.js @@ -1,32 +1,36 @@ import path from 'path'; -import fs from 'fs-extra'; +import { promises as fs } from 'fs'; +import { exists } from '../utils/index.js'; class SaveResourceToFileSystemPlugin { apply (registerAction) { let absoluteDirectoryPath, loadedResources = []; - registerAction('beforeStart', ({options}) => { + registerAction('beforeStart', async ({options}) => { if (!options.directory || typeof options.directory !== 'string') { throw new Error(`Incorrect directory ${options.directory}`); } absoluteDirectoryPath = path.resolve(process.cwd(), options.directory); - if (fs.existsSync(absoluteDirectoryPath)) { + if (await exists(absoluteDirectoryPath)) { throw new Error(`Directory ${absoluteDirectoryPath} exists`); } }); registerAction('saveResource', async ({resource}) => { const filename = path.join(absoluteDirectoryPath, resource.getFilename()); - const text = resource.getText(); - await fs.outputFile(filename, text, { encoding: resource.getEncoding() }); + await fs.mkdir(path.dirname(filename), { recursive: true }); 
+ + const text = await resource.getText(); + + await fs.writeFile(filename, text, { encoding: resource.getEncoding() }); loadedResources.push(resource); }); registerAction('error', async () => { if (loadedResources.length > 0) { - await fs.remove(absoluteDirectoryPath); + await fs.rm(absoluteDirectoryPath, { recursive: true, force: true }); } }); } diff --git a/lib/request.js b/lib/request.js index f66ce29c..143eb4a3 100644 --- a/lib/request.js +++ b/lib/request.js @@ -41,6 +41,10 @@ function throwTypeError (result) { } function getData (result) { + if (typeof result === 'string') { + throw new Error('afterResponse handler returned a string, expected object'); + } + let data = result; if (result && typeof result === 'object' && 'body' in result) { data = result.body; diff --git a/lib/resource-handler/css/index.js b/lib/resource-handler/css/index.js index e76b60c0..661334a8 100644 --- a/lib/resource-handler/css/index.js +++ b/lib/resource-handler/css/index.js @@ -7,12 +7,13 @@ class CssResourceHandler { this.updateMissingSources = this.options.updateMissingSources === true || Array.isArray(this.options.updateMissingSources); } - handle (resource) { - const pathContainer = new CssText(resource.getText()); - return this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources).then(function updateText (updatedText) { - resource.setText(updatedText); - return resource; - }); + async handle (resource) { + const pathContainer = new CssText(await resource.getText()); + + const updatedText = await this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources); + await resource.setText(updatedText); + + return resource; } } diff --git a/lib/resource-handler/html/html-source-element.js b/lib/resource-handler/html/html-source-element.js index 36d3280e..01681e93 100644 --- a/lib/resource-handler/html/html-source-element.js +++ b/lib/resource-handler/html/html-source-element.js @@ -51,7 +51,7 @@ class HtmlSourceElement { */ getPathContainer () { 
const selectedRule = this.findMatchedRule(pathContainersByRule); - const ContainerClass = selectedRule ? selectedRule.containerClass : CommonTag; + const ContainerClass = this.rule.containerClass || (selectedRule ? selectedRule.containerClass : CommonTag); const textWithResources = this.getData(); return textWithResources ? new ContainerClass(textWithResources) : null; } } diff --git a/lib/resource-handler/html/index.js b/lib/resource-handler/html/index.js index 4c1d6a7a..4006bcf1 100644 --- a/lib/resource-handler/html/index.js +++ b/lib/resource-handler/html/index.js @@ -23,7 +23,7 @@ class HtmlResourceHandler { } async handle (resource) { - const $ = loadTextToCheerio(resource.getText()); + const $ = loadTextToCheerio(await resource.getText()); prepareToLoad($, resource); const sourceRulesLoadPromises = this.allSources.map( @@ -31,7 +31,7 @@ class HtmlResourceHandler { ); await series(sourceRulesLoadPromises); - resource.setText($.html()); + await resource.setText($.html()); return resource; } diff --git a/lib/resource.js b/lib/resource.js index ae78886c..4da4f4c0 100644 --- a/lib/resource.js +++ b/lib/resource.js @@ -1,9 +1,20 @@ import types from './config/resource-types.js'; +import crypto from 'crypto'; +import fs from 'fs/promises'; +import path from 'path'; +import { compress, decompress } from './utils/index.js'; class Resource { - constructor (url, filename) { - this.url = url; - this.filename = filename; + constructor (url, filename, tempMode, tempDir) { + this.tempMode = tempMode || 'memory'; + this.tempDir = tempDir; + + if (this.tempMode === 'filesystem' && !this.tempDir) { + throw new Error('tempDir must be provided when tempMode=filesystem'); + } + + this.setUrl(url); + this.setFilename(filename); this.type = null; this.depth = 0; @@ -16,7 +27,7 @@ class Resource { } createChild (url, filename) { - const child = new Resource(url, filename); + const child = new Resource(url, filename, this.tempMode, this.tempDir); let currentDepth = this.getDepth(); 
child.parent = this; @@ -39,6 +50,12 @@ class Resource { } setUrl (url) { + if (this.tempDir) { + // Generate a unique filename based on the md5 hash of the url + const tmpName = `${crypto.createHash('md5').update(url).digest('hex')}.txt`; + this.tempPath = path.join(this.tempDir, tmpName); + } + this.url = url; } @@ -50,12 +67,34 @@ class Resource { this.filename = filename; } - getText () { - return this.text; + async getText () { + switch (this.tempMode) { + case 'memory': + return await this.text; + case 'memory-compressed': + return (await decompress(this.text)).toString(this.getEncoding()); + case 'filesystem': + return await fs.readFile(this.tempPath, { encoding: this.getEncoding() }); + default: + throw new Error(`Unknown tempMode: ${this.tempMode}`); + } } - setText (text) { - this.text = text; + async setText (text) { + switch (this.tempMode) { + case 'memory': + this.text = text; + break; + case 'memory-compressed': + this.text = await compress(text); + break; + case 'filesystem': + await fs.mkdir(this.tempDir, { recursive: true }); + await fs.writeFile(this.tempPath, text, { encoding: this.getEncoding() }); + break; + default: + throw new Error(`Unknown tempMode: ${this.tempMode}`); + } } getDepth () { diff --git a/lib/scraper.js b/lib/scraper.js index e14a3d4d..982cd58c 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -11,6 +11,9 @@ import { GenerateFilenameByTypePlugin, GetResourceReferencePlugin } from './plugins/index.js'; +import fs from 'fs'; +import path from 'path'; +import os from 'os'; import * as utils from './utils/index.js'; const { extend, union, urlsEqual, getTypeByMime, getTypeByFilename, series } = utils; @@ -47,7 +50,16 @@ class Scraper { requestResource: this.requestResource.bind(this), getReference: this.runActions.bind(this, 'getReference') }); - this.resources = this.options.urls.map(({url, filename}) => new Resource(url, filename)); + + logger.info('tmpMode', this.options.tempMode); + if (this.options.tempMode === 'filesystem') 
{ + if (!this.options.tempDir) { + this.options.tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'website-scraper-')); + } + logger.info('tmpDir', this.options.tempDir); + } + + this.resources = this.options.urls.map(({url, filename}) => new Resource(url, filename, this.options.tempMode, this.options.tempDir)); this.requestedResourcePromises = new NormalizedUrlMap(); // Map url -> request promise this.loadedResources = new NormalizedUrlMap(); // Map url -> resource @@ -185,7 +197,8 @@ class Scraper { resource.setMetadata(responseData.metadata); } - resource.setText(responseData.body); + await resource.setText(responseData.body); + self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad return resource; }).catch(function handleError (err) { @@ -280,6 +293,10 @@ class Scraper { throw error; } finally { await this.runActions('afterFinish'); + + if (this.options.tempDir) { + await fs.promises.rm(this.options.tempDir, { recursive: true, force: true }); + } } } } diff --git a/lib/utils/index.js b/lib/utils/index.js index 97c4b7be..a4b809fb 100644 --- a/lib/utils/index.js +++ b/lib/utils/index.js @@ -4,6 +4,9 @@ import normalize from 'normalize-url'; import _ from 'lodash'; import typeByMime from '../config/resource-type-by-mime.js'; import typeByExt from '../config/resource-type-by-ext.js'; +import fs from 'fs/promises'; +import zlib from 'zlib'; +import { promisify } from 'util'; import logger from '../logger.js'; @@ -161,6 +164,46 @@ async function series (promises) { return results; } +/** + * Checks to see if the file/directory exists + */ +async function exists (path) { + let exists = false; + try { + await fs.stat(path); + exists = true; + } catch (err) { + // lstat throws an error if the directory doesn't exist. + // We don't care about that error because we don't want that + // directory to exist. 
+ } + + return exists; +} + +const inflate = promisify(zlib.inflate); +const deflate = promisify(zlib.deflate); + +/** + * Decompresses zlib-compressed data. + * + * @param buffer - Buffer to decompress. + * @returns - Decompressed data as a Buffer. + */ +async function decompress (buffer) { + return (await inflate(buffer)); +} + +/** + * Compresses a string. + * + * @param text - String to compress. + * @returns - Compressed data as a Buffer. + */ +async function compress (text) { + return (await deflate(Buffer.from(text), { level: 6 })); +} + export { isUrl, getUrl, @@ -181,5 +224,8 @@ export { extend, union, isPlainObject, - series + series, + exists, + decompress, + compress }; diff --git a/package.json b/package.json index d95bb594..1060aaf8 100644 --- a/package.json +++ b/package.json @@ -41,13 +41,12 @@ "cheerio": "1.0.0-rc.11", "css-url-parser": "^1.0.0", "debug": "^4.3.1", - "fs-extra": "^10.0.0", "got": "^12.0.0", "lodash": "^4.17.21", "normalize-url": "^7.0.2", "p-queue": "^7.1.0", "sanitize-filename": "^1.6.3", "srcset": "^5.0.0" }, "devDependencies": { "c8": "^7.7.2", diff --git a/test/e2e/e2e-test.js b/test/e2e/e2e-test.js index b234b73f..5f30d7c4 100644 --- a/test/e2e/e2e-test.js +++ b/test/e2e/e2e-test.js @@ -1,20 +1,20 @@ import 'should'; import scrape from 'website-scraper'; -import fs from 'fs-extra'; import _ from 'lodash'; +import fs from 'fs/promises'; -import { readFile } from 'fs/promises'; -const urls = JSON.parse(await readFile(new URL('./urls.json', import.meta.url))); -const options = JSON.parse(await readFile(new URL('./options.json', import.meta.url))); +const urls = JSON.parse(await fs.readFile(new URL('./urls.json', import.meta.url))); +const options = JSON.parse(await fs.readFile(new URL('./options.json', import.meta.url))); const resultDirname = './test/e2e/results'; describe('E2E', function() { - before(function() { - fs.emptyDirSync(resultDirname); + before(async () => { + await fs.rm(resultDirname, { 
recursive: true, force: true }); + await fs.mkdir(resultDirname, { recursive: true }); }); - after(function() { + after(() => { console.log('Scraping completed. Go to ' + resultDirname + ' to check results'); }); diff --git a/test/functional/base/base.test.js b/test/functional/base/base.test.js index 8ac3dc3f..3cecca90 100644 --- a/test/functional/base/base.test.js +++ b/test/functional/base/base.test.js @@ -1,8 +1,8 @@ import should from 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; -import cheerio from 'cheerio'; +import fs from 'fs/promises'; +import * as cheerio from 'cheerio'; import scrape from 'website-scraper'; import Resource from '../../../lib/resource.js'; @@ -32,15 +32,15 @@ describe('Functional: base', function() { ignoreErrors: false }; - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); beforeEach(() => { @@ -67,147 +67,146 @@ describe('Functional: base', function() { nock('http://blog.example.com/').get('/files/fail-1.png').replyWithError('something awful happened'); }); - it('should load multiple urls to single directory with all specified sources', () => { - return scrape(options).then(function(result) { - // should return right result - result.should.be.instanceOf(Array).and.have.length(3); - - result[0].should.have.properties({ url: 'http://example.com/', filename: 'index.html' }); - result[0].should.have.properties('children'); - result[0].children.should.be.instanceOf(Array).and.have.length(4); - result[0].children[0].should.be.instanceOf(Resource); - - result[1].should.have.properties({ url: 'http://example.com/about', filename: 'about.html' }); - result[1].should.have.properties('children'); - result[1].children.should.be.instanceOf(Array).and.have.length(4); - 
result[1].children[0].should.be.instanceOf(Resource); - - result[2].should.have.properties({ url: 'http://blog.example.com/', filename: 'blog.html' }); // url after redirect - result[2].should.have.properties('children'); - result[2].children.should.be.instanceOf(Array).and.have.length(1); - result[2].children[0].should.be.instanceOf(Resource); - - // should create directory and subdirectories - fs.existsSync(testDirname).should.be.eql(true); - fs.existsSync(testDirname + '/img').should.be.eql(true); - fs.existsSync(testDirname + '/js').should.be.eql(true); - fs.existsSync(testDirname + '/css').should.be.eql(true); - - // should contain all sources found in index.html - fs.existsSync(testDirname + '/css/index.css').should.be.eql(true); - fs.existsSync(testDirname + '/img/background.png').should.be.eql(true); - fs.existsSync(testDirname + '/img/cat.jpg').should.be.eql(true); - fs.existsSync(testDirname + '/js/script.min.js').should.be.eql(true); - - // all sources in index.html should be replaced with local paths - let $ = cheerio.load(fs.readFileSync(testDirname + '/index.html').toString()); - $('link[rel="stylesheet"]').attr('href').should.be.eql('css/index.css'); - $('style').html().should.containEql('img/background.png'); - $('img').attr('src').should.be.eql('img/cat.jpg'); - $('script').attr('src').should.be.eql('js/script.min.js'); - - // should contain all sources found in index.css recursively - fs.existsSync(testDirname + '/css/index-import-1.css').should.be.eql(true); - fs.existsSync(testDirname + '/css/index-import-2.css').should.be.eql(true); - fs.existsSync(testDirname + '/css/index-import-3.css').should.be.eql(true); - fs.existsSync(testDirname + '/img/index-image-1.png').should.be.eql(true); - fs.existsSync(testDirname + '/img/index-image-2.png').should.be.eql(true); - - // all sources in index.css should be replaces with local files recursively - const indexCss = fs.readFileSync(testDirname + '/css/index.css').toString(); - 
indexCss.should.not.containEql('files/index-import-1.css'); - indexCss.should.not.containEql('files/index-import-2.css'); - indexCss.should.not.containEql('http://example.com/files/index-image-1.png'); - indexCss.should.containEql('index-import-1.css'); - indexCss.should.containEql('index-import-2.css'); - indexCss.should.containEql('../img/index-image-1.png'); - - const indexImportCss = fs.readFileSync(testDirname + '/css/index-import-2.css').toString(); - indexImportCss.should.not.containEql('http://example.com/files/index-image-2.png'); - indexImportCss.should.containEql('../img/index-image-2.png'); - - // should deal with base tag in about.html and not load new resources - // all sources in about.html should be replaced with already loaded local resources - $ = cheerio.load(fs.readFileSync(testDirname + '/about.html').toString()); - $('link[rel="stylesheet"]').attr('href').should.be.eql('css/index.css'); - $('style').html().should.containEql('img/background.png'); - $('img').attr('src').should.be.eql('img/cat.jpg'); - $('script').attr('src').should.be.eql('js/script.min.js'); - - // should not replace not loaded files - $ = cheerio.load(fs.readFileSync(testDirname + '/blog.html').toString()); - $('img').attr('src').should.be.eql('files/fail-1.png'); - }); + it('should load multiple urls to single directory with all specified sources', async() => { + const result = await scrape(options); + // should return right result + result.should.be.instanceOf(Array).and.have.length(3); + + result[0].should.have.properties({ url: 'http://example.com/', filename: 'index.html' }); + result[0].should.have.properties('children'); + result[0].children.should.be.instanceOf(Array).and.have.length(4); + result[0].children[0].should.be.instanceOf(Resource); + + result[1].should.have.properties({ url: 'http://example.com/about', filename: 'about.html' }); + result[1].should.have.properties('children'); + result[1].children.should.be.instanceOf(Array).and.have.length(4); + 
result[1].children[0].should.be.instanceOf(Resource); + + result[2].should.have.properties({ url: 'http://blog.example.com/', filename: 'blog.html' }); // url after redirect + result[2].should.have.properties('children'); + result[2].children.should.be.instanceOf(Array).and.have.length(1); + result[2].children[0].should.be.instanceOf(Resource); + + // should create directory and subdirectories + await `${testDirname}`.should.dirExists(true); + await `${testDirname}/img`.should.dirExists(true); + await `${testDirname}/js`.should.dirExists(true); + await `${testDirname}/css`.should.dirExists(true); + + // should contain all sources found in index.html + await `${testDirname}/css/index.css`.should.fileExists(true); + await `${testDirname}/img/background.png`.should.fileExists(true); + await `${testDirname}/img/cat.jpg`.should.fileExists(true); + await `${testDirname}/js/script.min.js`.should.fileExists(true); + + // all sources in index.html should be replaced with local paths + let $ = cheerio.load(await fs.readFile(testDirname + '/index.html', { encoding: 'binary' })); + $('link[rel="stylesheet"]').attr('href').should.be.eql('css/index.css'); + $('style').html().should.containEql('img/background.png'); + $('img').attr('src').should.be.eql('img/cat.jpg'); + $('script').attr('src').should.be.eql('js/script.min.js'); + + // should contain all sources found in index.css recursively + await `${testDirname}/css/index-import-1.css`.should.fileExists(true); + await `${testDirname}/css/index-import-2.css`.should.fileExists(true); + await `${testDirname}/css/index-import-3.css`.should.fileExists(true); + + await `${testDirname}/img/index-image-1.png`.should.fileExists(true); + await `${testDirname}/img/index-image-2.png`.should.fileExists(true); + + // all sources in index.css should be replaces with local files recursively + const indexCss = await fs.readFile(testDirname + '/css/index.css', { encoding: 'binary' }); + 
indexCss.should.not.containEql('files/index-import-1.css'); + indexCss.should.not.containEql('files/index-import-2.css'); + indexCss.should.not.containEql('http://example.com/files/index-image-1.png'); + indexCss.should.containEql('index-import-1.css'); + indexCss.should.containEql('index-import-2.css'); + indexCss.should.containEql('../img/index-image-1.png'); + + const indexImportCss = await fs.readFile(testDirname + '/css/index-import-2.css', { encoding: 'binary' }); + indexImportCss.should.not.containEql('http://example.com/files/index-image-2.png'); + indexImportCss.should.containEql('../img/index-image-2.png'); + + // should deal with base tag in about.html and not load new resources + // all sources in about.html should be replaced with already loaded local resources + $ = cheerio.load(await fs.readFile(testDirname + '/about.html', { encoding: 'binary' })); + $('link[rel="stylesheet"]').attr('href').should.be.eql('css/index.css'); + $('style').html().should.containEql('img/background.png'); + $('img').attr('src').should.be.eql('img/cat.jpg'); + $('script').attr('src').should.be.eql('js/script.min.js'); + + // should not replace not loaded files + $ = cheerio.load(await fs.readFile(testDirname + '/blog.html', { encoding: 'binary' })); + $('img').attr('src').should.be.eql('files/fail-1.png'); }); - it('should load multiple urls to single directory with all specified sources with bySiteStructureFilenameGenerator', () => { - return scrape({...options, filenameGenerator: 'bySiteStructure'}).then(function(result) { - result.should.be.instanceOf(Array).and.have.length(3); - - should(result[0].url).eql('http://example.com/'); - should(result[0].filename).equalFileSystemPath('example.com/index.html'); - result[0].should.have.properties('children'); - result[0].children.should.be.instanceOf(Array).and.have.length(4); - result[0].children[0].should.be.instanceOf(Resource); - - should(result[1].url).eql('http://example.com/about'); - 
should(result[1].filename).equalFileSystemPath('example.com/about/index.html'); - result[1].should.have.properties('children'); - result[1].children.should.be.instanceOf(Array).and.have.length(4); - result[1].children[0].should.be.instanceOf(Resource); - - should(result[2].url).eql('http://blog.example.com/'); // url after redirect - should(result[2].filename).equalFileSystemPath('blog.example.com/index.html'); - result[2].should.have.properties('children'); - result[2].children.should.be.instanceOf(Array).and.have.length(1); - result[2].children[0].should.be.instanceOf(Resource); - - // should create directory and subdirectories - fs.existsSync(testDirname).should.be.eql(true); - fs.existsSync(testDirname + '/example.com/about').should.be.eql(true); - fs.existsSync(testDirname + '/example.com/files').should.be.eql(true); - fs.existsSync(testDirname + '/blog.example.com').should.be.eql(true); - - // should contain all sources found in index.html - fs.existsSync(testDirname + '/example.com/index.css').should.be.eql(true); - fs.existsSync(testDirname + '/example.com/background.png').should.be.eql(true); - fs.existsSync(testDirname + '/example.com/cat.jpg').should.be.eql(true); - fs.existsSync(testDirname + '/example.com/script.min.js').should.be.eql(true); - - // all sources in index.html should be replaced with local paths - let $ = cheerio.load(fs.readFileSync(testDirname + '/example.com/index.html').toString()); - $('link[rel="stylesheet"]').attr('href').should.be.eql('index.css'); - $('style').html().should.containEql('background.png'); - $('img').attr('src').should.be.eql('cat.jpg'); - $('script').attr('src').should.be.eql('script.min.js'); - - // should contain all sources found in index.css recursively - fs.existsSync(testDirname + '/example.com/files/index-import-1.css').should.be.eql(true); - fs.existsSync(testDirname + '/example.com/files/index-import-2.css').should.be.eql(true); - fs.existsSync(testDirname + 
'/example.com/files/index-import-3.css').should.be.eql(true); - fs.existsSync(testDirname + '/example.com/files/index-image-1.png').should.be.eql(true); - fs.existsSync(testDirname + '/example.com/files/index-image-2.png').should.be.eql(true); - - // all sources in index.css should be replaces with local files recursively - const indexCss = fs.readFileSync(testDirname + '/example.com/index.css').toString(); - indexCss.should.containEql('files/index-import-1.css'); - indexCss.should.containEql('files/index-import-2.css'); - indexCss.should.containEql('files/index-image-1.png'); - - const indexImportCss = fs.readFileSync(testDirname + '/example.com/files/index-import-2.css').toString(); - indexImportCss.should.containEql('index-image-2.png'); - - // should deal with base tag in about.html and not load new resources - // all sources in about.html should be replaced with already loaded local resources - $ = cheerio.load(fs.readFileSync(testDirname + '/example.com/about/index.html').toString()); - $('link[rel="stylesheet"]').attr('href').should.be.eql('../index.css'); - $('style').html().should.containEql('../background.png'); - $('img').attr('src').should.be.eql('../cat.jpg'); - $('script').attr('src').should.be.eql('../script.min.js'); - - // should not replace not loaded files - $ = cheerio.load(fs.readFileSync(testDirname + '/blog.example.com/index.html').toString()); - $('img').attr('src').should.be.eql('files/fail-1.png'); - }); + it('should load multiple urls to single directory with all specified sources with bySiteStructureFilenameGenerator', async () => { + const result = await scrape({...options, filenameGenerator: 'bySiteStructure'}); + result.should.be.instanceOf(Array).and.have.length(3); + + should(result[0].url).eql('http://example.com/'); + should(result[0].filename).equalFileSystemPath('example.com/index.html'); + result[0].should.have.properties('children'); + result[0].children.should.be.instanceOf(Array).and.have.length(4); + 
result[0].children[0].should.be.instanceOf(Resource); + + should(result[1].url).eql('http://example.com/about'); + should(result[1].filename).equalFileSystemPath('example.com/about/index.html'); + result[1].should.have.properties('children'); + result[1].children.should.be.instanceOf(Array).and.have.length(4); + result[1].children[0].should.be.instanceOf(Resource); + + should(result[2].url).eql('http://blog.example.com/'); // url after redirect + should(result[2].filename).equalFileSystemPath('blog.example.com/index.html'); + result[2].should.have.properties('children'); + result[2].children.should.be.instanceOf(Array).and.have.length(1); + result[2].children[0].should.be.instanceOf(Resource); + + // should create directory and subdirectories + await `${testDirname}`.should.dirExists(true); + await `${testDirname}/example.com/about`.should.dirExists(true); + await `${testDirname}/example.com/files`.should.dirExists(true); + await `${testDirname}/blog.example.com`.should.dirExists(true); + + // should contain all sources found in index.html + await `${testDirname}/example.com/index.css`.should.fileExists(true); + await `${testDirname}/example.com/background.png`.should.fileExists(true); + await `${testDirname}/example.com/cat.jpg`.should.fileExists(true); + await `${testDirname}/example.com/script.min.js`.should.fileExists(true); + + // all sources in index.html should be replaced with local paths + let $ = cheerio.load(await fs.readFile(testDirname + '/example.com/index.html', { encoding: 'binary' })); + $('link[rel="stylesheet"]').attr('href').should.be.eql('index.css'); + $('style').html().should.containEql('background.png'); + $('img').attr('src').should.be.eql('cat.jpg'); + $('script').attr('src').should.be.eql('script.min.js'); + + // should contain all sources found in index.css recursively + await `${testDirname}/example.com/files/index-import-1.css`.should.fileExists(true); + await 
`${testDirname}/example.com/files/index-import-2.css`.should.fileExists(true); + await `${testDirname}/example.com/files/index-import-3.css`.should.fileExists(true); + await `${testDirname}/example.com/files/index-image-1.png`.should.fileExists(true); + await `${testDirname}/example.com/files/index-image-2.png`.should.fileExists(true); + + // all sources in index.css should be replaces with local files recursively + const indexCss = await fs.readFile(testDirname + '/example.com/index.css', { encoding: 'binary' }); + indexCss.should.containEql('files/index-import-1.css'); + indexCss.should.containEql('files/index-import-2.css'); + indexCss.should.containEql('files/index-image-1.png'); + + const indexImportCss = await fs.readFile(testDirname + '/example.com/files/index-import-2.css', { encoding: 'binary' }); + indexImportCss.should.containEql('index-image-2.png'); + + // should deal with base tag in about.html and not load new resources + // all sources in about.html should be replaced with already loaded local resources + $ = cheerio.load(await fs.readFile(testDirname + '/example.com/about/index.html', { encoding: 'binary' })); + $('link[rel="stylesheet"]').attr('href').should.be.eql('../index.css'); + $('style').html().should.containEql('../background.png'); + $('img').attr('src').should.be.eql('../cat.jpg'); + $('script').attr('src').should.be.eql('../script.min.js'); + + // should not replace not loaded files + $ = cheerio.load(await fs.readFile(testDirname + '/blog.example.com/index.html', { encoding: 'binary' })); + $('img').attr('src').should.be.eql('files/fail-1.png'); }); }); diff --git a/test/functional/base/check-it-works.js b/test/functional/base/check-it-works.js index 3bb61b5d..e8f62e43 100644 --- a/test/functional/base/check-it-works.js +++ b/test/functional/base/check-it-works.js @@ -1,22 +1,22 @@ import should from 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import 
scrape from 'website-scraper'; const testDirname = './test/functional/base/.tmp2'; -describe('Functional: check it works', function() { +describe('Functional: check it works', () => { - beforeEach(function () { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function () { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); it('should work with promise', () => { diff --git a/test/functional/binary-resources/images.test.js b/test/functional/binary-resources/images.test.js index 05d07916..db5d806c 100644 --- a/test/functional/binary-resources/images.test.js +++ b/test/functional/binary-resources/images.test.js @@ -1,8 +1,8 @@ import should from 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; -import cheerio from 'cheerio'; +import fs from 'fs/promises'; +import * as cheerio from 'cheerio'; import scrape from 'website-scraper'; const testDirname = './test/functional/binary-resources/.tmp'; @@ -26,10 +26,10 @@ describe('Functional: images', () => { nock.disableNetConnect(); }); - afterEach(() => { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); beforeEach(() => { @@ -45,23 +45,25 @@ describe('Functional: images', () => { await scrape(options); // should create directory and subdirectories - fs.existsSync(testDirname).should.be.eql(true); - fs.existsSync(testDirname + '/img').should.be.eql(true); + await `${testDirname}`.should.dirExists(true); + await `${testDirname}/img`.should.dirExists(true); // should contain all sources found in index.html - fs.existsSync(testDirname + '/img/test-image.png').should.be.eql(true); - fs.existsSync(testDirname + '/img/test-image.jpg').should.be.eql(true); + await `${testDirname}/img/test-image.png`.should.fileExists(true); + await 
`${testDirname}/img/test-image.jpg`.should.fileExists(true); // all sources in index.html should be replaced with local paths - let $ = cheerio.load(fs.readFileSync(testDirname + '/index.html').toString()); + await `${testDirname}/index.html`.should.fileExists(true); + const indexHtml = await fs.readFile(`${testDirname}/index.html`, { encoding: 'binary'}); + let $ = cheerio.load(indexHtml); $('img.png').attr('src').should.be.eql('img/test-image.png'); $('img.jpg').attr('src').should.be.eql('img/test-image.jpg'); // content of downloaded images should equal original images - const originalPng = fs.readFileSync(mockDirname + '/test-image.png'); - const originalJpg = fs.readFileSync(mockDirname + '/test-image.jpg'); - const resultPng = fs.readFileSync(testDirname + '/img/test-image.png'); - const resultJpg = fs.readFileSync(testDirname + '/img/test-image.jpg'); + const originalPng = await fs.readFile(mockDirname + '/test-image.png', { encoding: 'binary' }); + const originalJpg = await fs.readFile(mockDirname + '/test-image.jpg', { encoding: 'binary' }); + const resultPng = await fs.readFile(testDirname + '/img/test-image.png', { encoding: 'binary' }); + const resultJpg = await fs.readFile(testDirname + '/img/test-image.jpg', { encoding: 'binary' }); should(resultPng).be.eql(originalPng); should(resultJpg).be.eql(originalJpg); diff --git a/test/functional/callbacks/callbacks.test.js b/test/functional/callbacks/callbacks.test.js index 7f6b3279..c35907f1 100644 --- a/test/functional/callbacks/callbacks.test.js +++ b/test/functional/callbacks/callbacks.test.js @@ -1,7 +1,7 @@ import should from 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import sinon from 'sinon'; import scrape from 'website-scraper'; @@ -14,10 +14,10 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin', nock.disableNetConnect(); }); - afterEach(() => { + afterEach(async () => { 
nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); it('should call onResourceSaved callback and onResourceError callback if ignoreErrors = true', function() { diff --git a/test/functional/circular-dependencies/circular-dependencies.test.js b/test/functional/circular-dependencies/circular-dependencies.test.js index d5a5c229..5ae956a8 100644 --- a/test/functional/circular-dependencies/circular-dependencies.test.js +++ b/test/functional/circular-dependencies/circular-dependencies.test.js @@ -1,26 +1,26 @@ import 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/circular-dependencies/.tmp'; const mockDirname = './test/functional/circular-dependencies/mocks'; -describe('Functional circular dependencies', function() { +describe('Functional circular dependencies', () => { - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - it('should correctly load files with circular dependency', function() { + it('should correctly load files with circular dependency', async () => { const options = { urls: [ { url: 'http://example.com/index.html', filename: 'index.html'}, @@ -39,12 +39,12 @@ describe('Functional circular dependencies', function() { nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css', {'content-type': 'text/css'}); nock('http://example.com/').get('/style2.css').replyWithFile(200, mockDirname + '/style2.css', {'content-type': 'text/css'}); - return scrape(options).then(function() { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - 
fs.existsSync(testDirname + '/about.html').should.be.eql(true); - fs.existsSync(testDirname + '/style.css').should.be.eql(true); - fs.existsSync(testDirname + '/style2.css').should.be.eql(true); - }); + await scrape(options); + + await `${testDirname}/index.html`.should.fileExists(true); + await `${testDirname}/about.html`.should.fileExists(true); + await `${testDirname}/style.css`.should.fileExists(true); + await `${testDirname}/style2.css`.should.fileExists(true); }); }); diff --git a/test/functional/css-handling/css-handling.test.js b/test/functional/css-handling/css-handling.test.js index 211b2b1f..396d0e70 100644 --- a/test/functional/css-handling/css-handling.test.js +++ b/test/functional/css-handling/css-handling.test.js @@ -1,26 +1,26 @@ import should from 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/css-handling/.tmp'; const mockDirname = './test/functional/css-handling/mocks'; -describe('Functional: css handling', function() { +describe('Functional: css handling', () => { - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - it('should correctly handle css files, style tags and style attributes and ignore css-like text inside common html tags', function() { + it('should correctly handle css files, style tags and style attributes and ignore css-like text inside common html tags', async () => { nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'}); nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css', {'content-type': 'text/css'}); @@ -40,21 +40,21 @@ describe('Functional: 
css handling', function() { ] }; - return scrape(options).then(function() { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - fs.existsSync(testDirname + '/local/style.css').should.be.eql(true); - fs.existsSync(testDirname + '/local/style-import-1.css').should.be.eql(true); - fs.existsSync(testDirname + '/local/style-import-2.css').should.be.eql(true); - fs.existsSync(testDirname + '/local/style-tag.png').should.be.eql(true); - fs.existsSync(testDirname + '/local/style-attr.png').should.be.eql(true); - fs.existsSync(testDirname + '/local/css-like-text-in-html.png').should.be.eql(false); + await scrape(options); - const indexHtml = fs.readFileSync(testDirname + '/index.html').toString(); + await `${testDirname}/index.html`.should.fileExists(true); + await `${testDirname}/local/style.css`.should.fileExists(true); + await `${testDirname}/local/style-import-1.css`.should.fileExists(true); + await `${testDirname}/local/style-import-2.css`.should.fileExists(true); + await `${testDirname}/local/style-tag.png`.should.fileExists(true); + await `${testDirname}/local/style-attr.png`.should.fileExists(true); + await `${testDirname}/local/css-like-text-in-html.png`.should.fileExists(false); - should(indexHtml).containEql('local/style-tag.png'); - should(indexHtml).containEql('local/style-attr.png'); + const indexHtml = await fs.readFile(testDirname + '/index.html', { encoding: 'binary' }); - should(indexHtml).containEql('background: url(\'css-like-text-in-html.png\')'); - }); + should(indexHtml).containEql('local/style-tag.png'); + should(indexHtml).containEql('local/style-attr.png'); + + should(indexHtml).containEql('background: url(\'css-like-text-in-html.png\')'); }); }); diff --git a/test/functional/data-url/data-url.test.js b/test/functional/data-url/data-url.test.js index c494dd69..54f08c26 100644 --- a/test/functional/data-url/data-url.test.js +++ b/test/functional/data-url/data-url.test.js @@ -1,26 +1,26 @@ import should from 'should'; import 
'../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/data-url/.tmp'; const mockDirname = './test/functional/data-url/mocks'; -describe('Functional: data urls handling', function () { +describe('Functional: data urls handling', () => { - beforeEach(function () { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function () { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - it('should correctly handle html files with data urls in attributes', function () { + it('should correctly handle html files with data urls in attributes', async () => { nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); nock('http://example.com/').get('/product/abc/media/521811121-392x351.jpg').reply(200, '/product/abc/media/521811121-392x351.jpg'); const options = { @@ -31,18 +31,16 @@ describe('Functional: data urls handling', function () { } }; - return scrape(options).then(function () { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - fs.existsSync(testDirname + '/images/521811121-392x351.jpg').should.be.eql(true); - - const actualIndexHtml = fs.readFileSync(testDirname + '/index.html').toString(); - - should(actualIndexHtml).containEql(''); - should(actualIndexHtml).containEql(''); - should(actualIndexHtml).containEql(''); - should(actualIndexHtml).containEql(''); - should(actualIndexHtml).containEql(''); - should(actualIndexHtml).containEql(''); - }); + await scrape(options); + await `${testDirname}/index.html`.should.fileExists(true); + await `${testDirname}/images/521811121-392x351.jpg`.should.fileExists(true); + + const actualIndexHtml = await fs.readFile(testDirname + '/index.html', { encoding: 'binary' }); + should(actualIndexHtml).containEql(''); + 
should(actualIndexHtml).containEql(''); + should(actualIndexHtml).containEql(''); + should(actualIndexHtml).containEql(''); + should(actualIndexHtml).containEql(''); + should(actualIndexHtml).containEql(''); }); }); diff --git a/test/functional/error-handling/error-handling.test.js b/test/functional/error-handling/error-handling.test.js index 9360b668..33429104 100644 --- a/test/functional/error-handling/error-handling.test.js +++ b/test/functional/error-handling/error-handling.test.js @@ -1,7 +1,7 @@ import should from 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import sinon from 'sinon'; import scrape from 'website-scraper'; import Scraper from '../../../lib/scraper.js'; @@ -9,7 +9,7 @@ import Scraper from '../../../lib/scraper.js'; const testDirname = './test/functional/error-handling/.tmp'; const mockDirname = './test/functional/error-handling/mocks'; -describe('Functional error handling', function() { +describe('Functional error handling', () => { const options = { urls: [ 'http://example.com/' ], directory: testDirname, @@ -19,7 +19,7 @@ describe('Functional error handling', function() { sources: [] }; - beforeEach(function () { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); @@ -32,16 +32,16 @@ describe('Functional error handling', function() { nock('http://example.com/').get('/page6.html').delay(600).reply(200, 'ok'); }); - afterEach(function () { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - describe('FS Error', function () { + describe('FS Error', () => { let saveResourceStub, handleErrorStub, failingFsPlugin; - beforeEach(function() { + beforeEach(() => { saveResourceStub = sinon.stub().resolves().onCall(2).rejects(new Error('FS FAILED!')); handleErrorStub = sinon.stub().resolves(); @@ -55,7 +55,7 @@ describe('Functional error 
handling', function() { failingFsPlugin = new FailingFSPlugin(); }); - it('should remove directory and immediately reject on fs error if ignoreErrors is false', function () { + it('should remove directory and immediately reject on fs error if ignoreErrors is false', () => { const scraperOptions = { ...options, ignoreErrors: false, @@ -64,7 +64,7 @@ describe('Functional error handling', function() { ] }; - return scrape(scraperOptions).then(function() { + return scrape(scraperOptions).then(() => { should(true).be.eql(false); }).catch(function (err) { should(err.message).be.eql('FS FAILED!'); @@ -73,7 +73,7 @@ describe('Functional error handling', function() { }); }); - it('should ignore fs error if ignoreErrors is true', function () { + it('should ignore fs error if ignoreErrors is true', () => { const scraperOptions = { ...options, ignoreErrors: true, @@ -82,18 +82,18 @@ describe('Functional error handling', function() { ] }; - return scrape(scraperOptions).then(function() { + return scrape(scraperOptions).then(() => { should(saveResourceStub.callCount).be.eql(7); should(handleErrorStub.callCount).be.eql(0); }); }); }); - describe('Resource Handler Error', function () { + describe('Resource Handler Error', () => { let scraper; let handleResourceStub; - beforeEach(function() { + beforeEach(() => { scraper = new Scraper(options); const originalHandleResource = scraper.resourceHandler.handleResource; let callCount = 0; @@ -109,25 +109,25 @@ describe('Functional error handling', function() { handleResourceStub.restore(); }) - it('should remove directory and immediately reject on resource handler error if ignoreErrors is false', function () { + it('should remove directory and immediately reject on resource handler error if ignoreErrors is false', async () => { scraper.options.ignoreErrors = false; - return scraper.scrape().then(function() { + try { + await scraper.scrape(); should(true).be.eql(false); - }).catch(function (err) { - 
fs.existsSync(testDirname).should.be.eql(false); + } catch (err) { + await testDirname.should.dirExists(false); should(err.message).be.eql('RESOURCE HANDLER FAILED!'); should(handleResourceStub.callCount).be.eql(4); - }); + } }); - it('should ignore resource handler error if ignoreErrors is true', function () { + it('should ignore resource handler error if ignoreErrors is true', async () => { scraper.options.ignoreErrors = true; - return scraper.scrape().then(function() { - should(handleResourceStub.callCount).be.eql(7); - fs.existsSync(testDirname).should.be.eql(true); - }); + await scraper.scrape(); + should(handleResourceStub.callCount).be.eql(7); + await testDirname.should.dirExists(true); }); }); }); diff --git a/test/functional/html-entities/html-entities.test.js b/test/functional/html-entities/html-entities.test.js index cb682dd5..797c3eb5 100644 --- a/test/functional/html-entities/html-entities.test.js +++ b/test/functional/html-entities/html-entities.test.js @@ -1,26 +1,26 @@ import should from 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/html-entities/.tmp'; const mockDirname = './test/functional/html-entities/mocks'; -describe('Functional: html entities', function() { +describe('Functional: html entities', () => { - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - it('should decode all html-entities found in html files and not encode entities from css file', function() { + it('should decode all html-entities found in html files and not encode entities from css file', async () => { nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', 
{'content-type': 'text/html'}); nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css', {'content-type': 'text/css'}); @@ -54,39 +54,38 @@ describe('Functional: html entities', function() { ignoreErrors: false }; - return scrape(options).then(function() { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - const indexHtml = fs.readFileSync(testDirname + '/index.html').toString(); + await scrape(options); + await `${testDirname}/index.html`.should.fileExists(true); + const indexHtml = await fs.readFile(testDirname + '/index.html', { encoding: 'binary'}); - should(indexHtml).containEql('href="local/fonts.css'); - fs.existsSync(testDirname + '/local/fonts.css').should.be.eql(true); - should(fs.readFileSync(testDirname + '/local/fonts.css').toString()).be.eql('fonts.css'); + should(indexHtml).containEql('href="local/fonts.css'); + await `${testDirname}/local/fonts.css`.should.fileExists(true); + should(await fs.readFile(testDirname + '/local/fonts.css', { encoding: 'binary'})).be.eql('fonts.css'); - // single quote (') replaced with ' in attribute - should(indexHtml).containEql('background: url(\'local/style-attr.png\')'); - fs.existsSync(testDirname + '/local/style-attr.png').should.be.eql(true); - should(fs.readFileSync(testDirname + '/local/style-attr.png').toString()).be.eql('style-attr.png'); + // single quote (') replaced with ' in attribute + should(indexHtml).containEql('background: url(\'local/style-attr.png\')'); + await `${testDirname}/local/style-attr.png`.should.fileExists(true); + should(await fs.readFile(testDirname + '/local/style-attr.png', { encoding: 'binary'})).be.eql('style-attr.png'); - // double quote (") replaced with " in attribute - should(indexHtml).containEql('background: url("local/style-attr2.png")'); - fs.existsSync(testDirname + '/local/style-attr2.png').should.be.eql(true); - should(fs.readFileSync(testDirname + '/local/style-attr2.png').toString()).be.eql('style-attr2.png'); + // 
double quote (") replaced with " in attribute + should(indexHtml).containEql('background: url("local/style-attr2.png")'); + await `${testDirname}/local/style-attr2.png`.should.fileExists(true); + should(await fs.readFile(testDirname + '/local/style-attr2.png', { encoding: 'binary'})).be.eql('style-attr2.png'); - should(indexHtml).containEql('img src="local/img.png'); - fs.existsSync(testDirname + '/local/img.png').should.be.eql(true); - should(fs.readFileSync(testDirname + '/local/img.png').toString()).be.eql('img.png'); + should(indexHtml).containEql('img src="local/img.png'); + await `${testDirname}/local/img.png`.should.fileExists(true); + should(await fs.readFile(testDirname + '/local/img.png', { encoding: 'binary'})).be.eql('img.png'); - should(indexHtml).containEql('href="index_1.html"'); - fs.existsSync(testDirname + '/index_1.html').should.be.eql(true); - should(fs.readFileSync(testDirname + '/index_1.html').toString()).be.eql('index_1.html'); + should(indexHtml).containEql('href="index_1.html"'); + await `${testDirname}/index_1.html`.should.fileExists(true); + should(await fs.readFile(testDirname + '/index_1.html', { encoding: 'binary'})).be.eql('index_1.html'); - fs.existsSync(testDirname + '/local/style.css').should.be.eql(true); - const styleCss = fs.readFileSync(testDirname + '/local/style.css').toString(); + await `${testDirname}/local/style.css`.should.fileExists(true); + const styleCss = await fs.readFile(testDirname + '/local/style.css', { encoding: 'binary'}); - should(styleCss).containEql('url(\'external-style.png\')'); - fs.existsSync(testDirname + '/local/external-style.png').should.be.eql(true); - should(fs.readFileSync(testDirname + '/local/external-style.png').toString()).be.eql('external-style.png'); - }); + should(styleCss).containEql('url(\'external-style.png\')'); + await `${testDirname}/local/external-style.png`.should.fileExists(true); + should(await fs.readFile(testDirname + '/local/external-style.png', { encoding: 
'binary'})).be.eql('external-style.png'); }); it('should generate correct quotes which don\'t break html markup (see #355)', async () => { @@ -99,8 +98,8 @@ describe('Functional: html entities', function() { await scrape(options); - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - const indexHtml = fs.readFileSync(testDirname + '/index.html').toString(); + await `${testDirname}/index.html`.should.fileExists(true); + const indexHtml = await fs.readFile(testDirname + '/index.html', { encoding: 'binary'}); /*
becomes diff --git a/test/functional/html-id-href/html-id-href.test.js b/test/functional/html-id-href/html-id-href.test.js index ed93c623..e2ea08a1 100644 --- a/test/functional/html-id-href/html-id-href.test.js +++ b/test/functional/html-id-href/html-id-href.test.js @@ -1,26 +1,26 @@ import should from 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/html-id-href/.tmp'; const mockDirname = './test/functional/html-id-href/mocks'; -describe('Functional html id href', function() { +describe('Functional html id href', () => { - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - it('should ignore same-file paths and update other-file paths', function() { + it('should ignore same-file paths and update other-file paths', async () => { nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); nock('http://example.com/').get('/sprite.svg').reply(200, 'sprite.svg'); nock('https://mdn.mozillademos.org/').get('/files/6457/mdn_logo_only_color.png').reply(200, 'mdn_logo_only_color.png'); @@ -41,24 +41,23 @@ describe('Functional html id href', function() { ] }; - return scrape(options).then(function() { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - fs.existsSync(testDirname + '/other.html').should.be.eql(true); - fs.existsSync(testDirname + '/local/sprite.svg').should.be.eql(true); - fs.existsSync(testDirname + '/local/mdn_logo_only_color.png').should.be.eql(true); + await scrape(options); + await `${testDirname}/index.html`.should.fileExists(true); + await `${testDirname}/other.html`.should.fileExists(true); + await 
`${testDirname}/local/sprite.svg`.should.fileExists(true); + await `${testDirname}/local/mdn_logo_only_color.png`.should.fileExists(true); - const indexHtml = fs.readFileSync(testDirname + '/index.html').toString(); + const indexHtml = await fs.readFile(testDirname + '/index.html', { encoding: 'binary' }); - // should update path to external svgs - should(indexHtml).containEql('xlink:href="local/sprite.svg#icon-undo"'); - should(indexHtml).containEql('href="local/sprite.svg#icon-redo"'); - // should keep links to local svgs - should(indexHtml).containEql('xlink:href="#codrops" class="codrops-1"'); - should(indexHtml).containEql('xlink:href="#codrops" class="codrops-2"'); - should(indexHtml).containEql('xlink:href="#codrops" class="codrops-3"'); + // should update path to external svgs + should(indexHtml).containEql('xlink:href="local/sprite.svg#icon-undo"'); + should(indexHtml).containEql('href="local/sprite.svg#icon-redo"'); + // should keep links to local svgs + should(indexHtml).containEql('xlink:href="#codrops" class="codrops-1"'); + should(indexHtml).containEql('xlink:href="#codrops" class="codrops-2"'); + should(indexHtml).containEql('xlink:href="#codrops" class="codrops-3"'); - should(indexHtml).containEql('Go to top (this page)'); - should(indexHtml).containEql('Go to top (other page)'); - }); + should(indexHtml).containEql('Go to top (this page)'); + should(indexHtml).containEql('Go to top (other page)'); }); }); diff --git a/test/functional/max-depth/max-depth.test.js b/test/functional/max-depth/max-depth.test.js index 00e788da..357ef59d 100644 --- a/test/functional/max-depth/max-depth.test.js +++ b/test/functional/max-depth/max-depth.test.js @@ -1,7 +1,7 @@ import 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/max-depth/.tmp'; @@ -14,13 +14,13 @@ describe('Functional: maxDepth and maxRecursiveDepth ', 
() => { nock.disableNetConnect(); }); - afterEach(() => { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - it('should filter out all resources by depth > maxDepth', () => { + it('should filter out all resources by depth > maxDepth', async () => { const options = { urls: [ 'http://example.com/' ], directory: testDirname, @@ -47,25 +47,25 @@ describe('Functional: maxDepth and maxRecursiveDepth ', () => { nock('http://example.com/').get('/img-depth3.jpg').reply(200, 'img-depth3.jpg'); nock('http://example.com/').get('/script-depth3.js').reply(200, 'script-depth3.js'); - return scrape(options).then(() => { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); + await scrape(options); - fs.existsSync(testDirname + '/depth1.html').should.be.eql(true); - fs.existsSync(testDirname + '/img-depth1.jpg').should.be.eql(true); - fs.existsSync(testDirname + '/script-depth1.js').should.be.eql(true); + await `${testDirname}/index.html`.should.fileExists(true); - fs.existsSync(testDirname + '/depth2.html').should.be.eql(true); - fs.existsSync(testDirname + '/img-depth2.jpg').should.be.eql(true); - fs.existsSync(testDirname + '/script-depth2.js').should.be.eql(true); + await `${testDirname}/depth1.html`.should.fileExists(true); + await `${testDirname}/img-depth1.jpg`.should.fileExists(true); + await `${testDirname}/script-depth1.js`.should.fileExists(true); - fs.existsSync(testDirname + '/depth3.html').should.be.eql(false); - fs.existsSync(testDirname + '/img-depth3.jpg').should.be.eql(false); - fs.existsSync(testDirname + '/script-depth3.js').should.be.eql(false); - }); + await `${testDirname}/depth2.html`.should.fileExists(true); + await `${testDirname}/img-depth2.jpg`.should.fileExists(true); + await `${testDirname}/script-depth2.js`.should.fileExists(true); + + await `${testDirname}/depth3.html`.should.fileExists(false); + await 
`${testDirname}/img-depth3.jpg`.should.fileExists(false); + await `${testDirname}/script-depth3.js`.should.fileExists(false); }); - it('should filter out only anchors by depth > maxRecursiveDepth', () => { + it('should filter out only anchors by depth > maxRecursiveDepth', async () => { const options = { urls: [ 'http://example.com/' ], directory: testDirname, @@ -92,26 +92,26 @@ describe('Functional: maxDepth and maxRecursiveDepth ', () => { nock('http://example.com/').get('/img-depth3.jpg').reply(200, 'img-depth3.jpg'); nock('http://example.com/').get('/script-depth3.js').reply(200, 'script-depth3.js'); - return scrape(options).then(() => { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); + await scrape(options); + + await `${testDirname}/index.html`.should.fileExists(true); - fs.existsSync(testDirname + '/depth1.html').should.be.eql(true); - fs.existsSync(testDirname + '/img-depth1.jpg').should.be.eql(true); - fs.existsSync(testDirname + '/script-depth1.js').should.be.eql(true); + await `${testDirname}/depth1.html`.should.fileExists(true); + await `${testDirname}/img-depth1.jpg`.should.fileExists(true); + await `${testDirname}/script-depth1.js`.should.fileExists(true); - fs.existsSync(testDirname + '/depth2.html').should.be.eql(true); - fs.existsSync(testDirname + '/img-depth2.jpg').should.be.eql(true); - fs.existsSync(testDirname + '/script-depth2.js').should.be.eql(true); + await `${testDirname}/depth2.html`.should.fileExists(true); + await `${testDirname}/img-depth2.jpg`.should.fileExists(true); + await `${testDirname}/script-depth2.js`.should.fileExists(true); - fs.existsSync(testDirname + '/depth3.html').should.be.eql(false); - // img-depth3.jpg and script-depth3.js - dependencies of depth2.html - // they should be loaded because maxRecursiveDepth applies only to - fs.existsSync(testDirname + '/img-depth3.jpg').should.be.eql(true); - fs.existsSync(testDirname + '/script-depth3.js').should.be.eql(true); - }); + await 
`${testDirname}/depth3.html`.should.fileExists(false); + // img-depth3.jpg and script-depth3.js - dependencies of depth2.html + // they should be loaded because maxRecursiveDepth applies only to + await `${testDirname}/img-depth3.jpg`.should.fileExists(true); + await `${testDirname}/script-depth3.js`.should.fileExists(true); }); - it('should correctly save same resource with different depth and maxRecursiveDepth', () => { + it('should correctly save same resource with different depth and maxRecursiveDepth', async () => { /* pageA -> pageB pageA -> pageC @@ -144,19 +144,19 @@ describe('Functional: maxDepth and maxRecursiveDepth ', () => { nock('http://example.com/').get('/pageB.html').reply(200, pageB, {'Content-Type': 'text/html'}); nock('http://example.com/').get('/pageC.html').reply(200, 'pageC', {'Content-Type': 'text/html'}); - return scrape(options).then(() => { - fs.existsSync(testDirname + '/pageA.html').should.be.eql(true); - fs.existsSync(testDirname + '/pageB.html').should.be.eql(true); - fs.existsSync(testDirname + '/pageC.html').should.be.eql(true); + await scrape(options); + + await `${testDirname}/pageA.html`.should.fileExists(true); + await `${testDirname}/pageB.html`.should.fileExists(true); + await `${testDirname}/pageC.html`.should.fileExists(true); - const pageASaved = fs.readFileSync(testDirname + '/pageA.html').toString(); - pageASaved.should.containEql(' maxRecursiveDepth - }); + const pageBSaved = await fs.readFile(testDirname + '/pageB.html', { encoding: 'binary' }); + // todo: should we change reference here because pageC was already downloaded? 
+ pageBSaved.should.containEql(' maxRecursiveDepth }); }); diff --git a/test/functional/recursive/recursive.test.js b/test/functional/recursive/recursive.test.js index 44cff191..4089096b 100644 --- a/test/functional/recursive/recursive.test.js +++ b/test/functional/recursive/recursive.test.js @@ -1,26 +1,26 @@ import 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/recursive/.tmp'; const mockDirname = './test/functional/recursive/mocks'; -describe('Functional recursive downloading', function() { +describe('Functional recursive downloading', () => { - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - it('should follow anchors if recursive flag is set', function () { + it('should follow anchors if recursive flag is set', async () => { const options = { urls: [ 'http://example.com/' ], directory: testDirname, @@ -37,20 +37,20 @@ describe('Functional recursive downloading', function() { nock('http://example.com/').get('/link2.html').reply(200, 'content 2'); nock('http://example.com/').get('/link3.html').reply(200, 'content 3'); - return scrape(options).then(function() { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); + await scrape(options); - // index.html anchors loaded - fs.existsSync(testDirname + '/about.html').should.be.eql(true); + await `${testDirname}/index.html`.should.fileExists(true); - // about.html anchors loaded - fs.existsSync(testDirname + '/link1.html').should.be.eql(true); - fs.existsSync(testDirname + '/link2.html').should.be.eql(true); - fs.existsSync(testDirname + '/link3.html').should.be.eql(true); - }); + // index.html anchors loaded (depth 1) 
+ await `${testDirname}/about.html`.should.fileExists(true); + + // about.html anchors loaded (depth 2) + await `${testDirname}/link1.html`.should.fileExists(true); + await `${testDirname}/link2.html`.should.fileExists(true); + await `${testDirname}/link3.html`.should.fileExists(true); }); - it('should follow anchors with depth <= maxDepth if recursive flag and maxDepth are set', function () { + it('should follow anchors with depth <= maxDepth if recursive flag and maxDepth are set', async () => { const options = { urls: [ 'http://example.com/' ], directory: testDirname, @@ -74,24 +74,24 @@ describe('Functional recursive downloading', function() { nock('http://example.com/').get('/link1-1.html').reply(200, 'content 1-1'); nock('http://example.com/').get('/link1-2.html').reply(200, 'content 1-2'); - return scrape(options).then(function() { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); + await scrape(options); + + await `${testDirname}/index.html`.should.fileExists(true); - // index.html anchors loaded (depth 1) - fs.existsSync(testDirname + '/about.html').should.be.eql(true); + // index.html anchors loaded (depth 1) + await `${testDirname}/about.html`.should.fileExists(true); - // about.html anchors loaded (depth 2) - fs.existsSync(testDirname + '/link1.html').should.be.eql(true); - fs.existsSync(testDirname + '/link2.html').should.be.eql(true); - fs.existsSync(testDirname + '/link3.html').should.be.eql(true); + // about.html anchors loaded (depth 2) + await `${testDirname}/link1.html`.should.fileExists(true); + await `${testDirname}/link2.html`.should.fileExists(true); + await `${testDirname}/link3.html`.should.fileExists(true); - // link1.html anchors NOT loaded (depth 3) - fs.existsSync(testDirname + '/link1-1.html').should.be.eql(false); - fs.existsSync(testDirname + '/link1-2.html').should.be.eql(false); - }); + // link1.html anchors NOT loaded (depth 3) + await `${testDirname}/link1-1.html`.should.fileExists(false); + await 
`${testDirname}/link1-2.html`.should.fileExists(false); }); - it('should not follow anchors if recursive flag is not set', function () { + it('should not follow anchors if recursive flag is not set', async () => { const options = { urls: [ 'http://example.com/' ], directory: testDirname, @@ -107,16 +107,15 @@ describe('Functional recursive downloading', function() { nock('http://example.com/').get('/link2.html').reply(200, 'content 2'); nock('http://example.com/').get('/link3.html').reply(200, 'content 3'); - return scrape(options).then(function() { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); + await scrape(options); + await `${testDirname}/index.html`.should.fileExists(true); - // index.html anchors loaded - fs.existsSync(testDirname + '/about.html').should.be.eql(false); + // index.html anchors NOT loaded (depth 1) + await `${testDirname}/about.html`.should.fileExists(false); - // about.html anchors loaded - fs.existsSync(testDirname + '/link1.html').should.be.eql(false); - fs.existsSync(testDirname + '/link2.html').should.be.eql(false); - fs.existsSync(testDirname + '/link3.html').should.be.eql(false); - }); + // about.html anchors NOT loaded (depth 2) + await `${testDirname}/link1.html`.should.fileExists(false); + await `${testDirname}/link2.html`.should.fileExists(false); + await `${testDirname}/link3.html`.should.fileExists(false); }); }); diff --git a/test/functional/redirect/redirect.test.js b/test/functional/redirect/redirect.test.js index a7e4f16f..fa58193a 100644 --- a/test/functional/redirect/redirect.test.js +++ b/test/functional/redirect/redirect.test.js @@ -1,7 +1,7 @@ import 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import sinon from 'sinon'; import scrape from 'website-scraper'; import Scraper from '../../../lib/scraper.js'; @@ -9,20 +9,20 @@ import Scraper from '../../../lib/scraper.js'; const testDirname = 
'./test/functional/redirect/.tmp'; const mockDirname = './test/functional/redirect/mocks'; -describe('Functional redirects', function() { +describe('Functional redirects', () => { - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - it('should follow redirects and save resource once if it has different urls', function() { + it('should follow redirects and save resource once if it has different urls', async () => { nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'}); // true page - ok nock('http://example.com/').get('/true-page.html').reply(200, 'true page 1', {'content-type': 'text/html'}); @@ -45,25 +45,27 @@ describe('Functional redirects', function() { const scraper = new Scraper(options); const saveSpy = sinon.spy(scraper.actions.saveResource, [0]); - return scraper.scrape().then(function() { - saveSpy.callCount.should.be.eql(2); - saveSpy.args[0][0].resource.filename.should.be.eql('index.html'); - saveSpy.args[1][0].resource.filename.should.be.eql('true-page.html'); + await scraper.scrape(); - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - fs.existsSync(testDirname + '/true-page.html').should.be.eql(true); + saveSpy.callCount.should.be.eql(2); + saveSpy.args[0][0].resource.filename.should.be.eql('index.html'); + saveSpy.args[1][0].resource.filename.should.be.eql('true-page.html'); - // should update all urls to true-page.html - fs.readFileSync(testDirname + '/index.html').toString().should.containEql('1'); - fs.readFileSync(testDirname + '/index.html').toString().should.containEql('2'); - fs.readFileSync(testDirname + '/index.html').toString().should.containEql('3'); + await `${testDirname}/index.html`.should.fileExists(true); + await 
`${testDirname}/true-page.html`.should.fileExists(true); - // true-page.html should have body from 1st response - fs.readFileSync(testDirname + '/true-page.html').toString().should.be.eql('true page 1'); - }); + // should update all urls to true-page.html + const index = await fs.readFile(testDirname + '/index.html', { encoding: 'binary' }); + index.should.containEql('1'); + index.should.containEql('2'); + index.should.containEql('3'); + + // true-page.html should have body from 1st response + const truePage = await fs.readFile(testDirname + '/true-page.html', { encoding: 'binary' }); + truePage.should.be.eql('true page 1'); }); - it('should correctly handle relative source in redirected page', () => { + it('should correctly handle relative source in redirected page', async () => { const options = { urls: [ { url: 'http://example.com', filename: 'index.html'} @@ -85,24 +87,24 @@ describe('Functional redirects', function() { nock('http://example.com/').get('/style.css').reply(200, 'style.css', {'content-type': 'text/css'}); nock('http://example.com/').get('/about/style.css').reply(200, 'about/style.css', {'content-type': 'text/css'}); - return scrape(options).then(function() { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - fs.existsSync(testDirname + '/about.html').should.be.eql(true); - fs.existsSync(testDirname + '/css/style.css').should.be.eql(true); - fs.existsSync(testDirname + '/css/style_1.css').should.be.eql(true); + await scrape(options); + + await `${testDirname}/index.html`.should.fileExists(true); + await `${testDirname}/about.html`.should.fileExists(true); + await `${testDirname}/css/style.css`.should.fileExists(true); + await `${testDirname}/css/style_1.css`.should.fileExists(true); - const style = fs.readFileSync(testDirname + '/css/style.css').toString(); - style.should.be.eql('style.css'); + const style = await fs.readFile(testDirname + '/css/style.css', { encoding: 'binary' }); + style.should.be.eql('style.css'); - const style1 
= fs.readFileSync(testDirname + '/css/style_1.css').toString(); - style1.should.be.eql('about/style.css'); + const style1 = await fs.readFile(testDirname + '/css/style_1.css', { encoding: 'binary' }); + style1.should.be.eql('about/style.css'); - const index = fs.readFileSync(testDirname + '/index.html').toString(); - index.should.containEql(''); + const index = await fs.readFile(testDirname + '/index.html', { encoding: 'binary' }); + index.should.containEql(''); - const about = fs.readFileSync(testDirname + '/about.html').toString(); - about.should.containEql(''); - about.should.containEql(''); - }); + const about = await fs.readFile(testDirname + '/about.html', { encoding: 'binary' }); + about.should.containEql(''); + about.should.containEql(''); }); }); diff --git a/test/functional/request-concurrency/request-concurrency.test.js b/test/functional/request-concurrency/request-concurrency.test.js index e258d977..85faca20 100644 --- a/test/functional/request-concurrency/request-concurrency.test.js +++ b/test/functional/request-concurrency/request-concurrency.test.js @@ -1,16 +1,16 @@ import 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/request-concurrency/.tmp'; const mockDirname = './test/functional/request-concurrency/mocks'; -describe('Functional concurrent requests', function() { +describe('Functional concurrent requests', () => { let maxConcurrentRequests, currentConcurrentRequests; - beforeEach(function () { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); @@ -54,10 +54,10 @@ describe('Functional concurrent requests', function() { return scrape(options); }); - afterEach(function () { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); it('should have maximum concurrent requests 
== requestConcurrency option', () => { diff --git a/test/functional/request-response-customizations/after-response-action.test.js b/test/functional/request-response-customizations/after-response-action.test.js index bde7ca38..b6907ea3 100644 --- a/test/functional/request-response-customizations/after-response-action.test.js +++ b/test/functional/request-response-customizations/after-response-action.test.js @@ -1,26 +1,26 @@ import should from 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/req-res-customizations-after-response/.tmp'; const mockDirname = './test/functional/req-res-customizations-after-response/mocks'; -describe('Functional: afterResponse action in plugin', function() { +describe('Functional: afterResponse action in plugin', () => { - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - it('should skip downloading resource if afterResponse returns null', function() { + it('should skip downloading resource if afterResponse returns null', async () => { nock('http://example.com/').get('/1.html').reply(200, 'content of 1.html'); nock('http://example.com/').get('/2.html').reply(404); @@ -36,7 +36,7 @@ describe('Functional: afterResponse action in plugin', function() { headers: response.headers, someOtherData: [ 1, 2, 3 ] } - } + }; } }); } @@ -53,15 +53,15 @@ describe('Functional: afterResponse action in plugin', function() { ] }; - return scrape(options).then(function(result) { - should(result[0]).have.properties({ url: 'http://example.com/1.html', filename: '1.html', saved: true }); - should(result[1]).have.properties({ url: 'http://example.com/2.html', filename: 
'2.html', saved: false }); + const result = await scrape(options); + + should(result[0]).have.properties({ url: 'http://example.com/1.html', filename: '1.html', saved: true }); + should(result[1]).have.properties({ url: 'http://example.com/2.html', filename: '2.html', saved: false }); - fs.existsSync(testDirname + '/1.html').should.be.eql(true); - const indexHtml = fs.readFileSync(testDirname + '/1.html').toString(); - should(indexHtml).containEql('content of 1.html'); + await `${testDirname}/1.html`.should.fileExists(true); + const indexHtml = await fs.readFile(testDirname + '/1.html', { encoding: 'binary' }); + should(indexHtml).containEql('content of 1.html'); - fs.existsSync(testDirname + '/2.html').should.be.eql(false); - }); + await `${testDirname}/2.html`.should.fileExists(false); }); }); diff --git a/test/functional/request-response-customizations/request.test.js b/test/functional/request-response-customizations/request.test.js index f65926b7..d5be92da 100644 --- a/test/functional/request-response-customizations/request.test.js +++ b/test/functional/request-response-customizations/request.test.js @@ -1,26 +1,26 @@ import should from 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/req-res-customizations-request/.tmp'; const mockDirname = './test/functional/req-res-customizations-request/mocks'; -describe('Functional: customize request options with plugin', function() { +describe('Functional: customize request options with plugin', () => { - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - it('should use options from request property if no beforeRequest actions', function() 
{ + it('should use options from request property if no beforeRequest actions', async () => { nock('http://example.com/').get('/').query({myParam: 122}).reply(200, 'response for url with query'); const options = { @@ -31,18 +31,18 @@ describe('Functional: customize request options with plugin', function() { } }; - return scrape(options).then(function() { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - const indexHtml = fs.readFileSync(testDirname + '/index.html').toString(); - should(indexHtml).containEql('response for url with query'); - }); + await scrape(options); + + (await fs.stat(testDirname + '/index.html')).isFile().should.be.eql(true); + const indexHtml = await fs.readFile(testDirname + '/index.html', { encoding: 'binary'}); + should(indexHtml).containEql('response for url with query'); }); - it('should use options returned by beforeRequest action', function() { + it('should use options returned by beforeRequest action', async () => { nock('http://example.com/').get('/').query({myParam: 122}).reply(200, 'response for url with query'); class CustomRequestOptions { - apply(add) { + apply (add) { add('beforeRequest', ()=> { return { requestOptions:{ @@ -61,10 +61,10 @@ describe('Functional: customize request options with plugin', function() { ] }; - return scrape(options).then(function() { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - const indexHtml = fs.readFileSync(testDirname + '/index.html').toString(); - should(indexHtml).containEql('response for url with query'); - }); + await scrape(options); + + (await fs.stat(testDirname + '/index.html')).isFile().should.be.eql(true); + const indexHtml = await fs.readFile(testDirname + '/index.html', { encoding: 'binary' }); + indexHtml.should.containEql('response for url with query'); }); }); diff --git a/test/functional/resource-saver/resource-saver.test.js b/test/functional/resource-saver/resource-saver.test.js index 9ddea99a..aa951e6d 100644 --- 
a/test/functional/resource-saver/resource-saver.test.js +++ b/test/functional/resource-saver/resource-saver.test.js @@ -1,7 +1,7 @@ import should from 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import sinon from 'sinon'; import scrape from 'website-scraper'; @@ -15,10 +15,10 @@ describe('Functional: plugin for saving resources', () => { nock.disableNetConnect(); }); - afterEach(() => { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); let saveResourceStub, handleErrorStub, saveResourcePlugin; @@ -28,16 +28,16 @@ describe('Functional: plugin for saving resources', () => { handleErrorStub = sinon.stub().resolves(); class SaveResourcePlugin { - apply(registerAction) { + apply (registerAction) { registerAction('saveResource', saveResourceStub); - registerAction('error', handleErrorStub) + registerAction('error', handleErrorStub); } } saveResourcePlugin = new SaveResourcePlugin(); }); - it('should use passed resourceSaver when saving resource', function() { + it('should use passed resourceSaver when saving resource', () => { nock('http://example.com/').get('/').reply(200, 'OK'); const options = { @@ -46,13 +46,13 @@ describe('Functional: plugin for saving resources', () => { plugins: [ saveResourcePlugin ] }; - return scrape(options).catch(function() { + return scrape(options).catch(() => { should(saveResourceStub.calledOnce).be.eql(true); should(saveResourceStub.args[0][0].resource.url).be.eql('http://example.com/'); }); }); - it('should use passed resourceSaver on error', function() { + it('should use passed resourceSaver on error', () => { nock('http://example.com/').get('/').replyWithError('SCRAPER AWFUL ERROR'); const options = { @@ -62,7 +62,7 @@ describe('Functional: plugin for saving resources', () => { ignoreErrors: false }; - return 
scrape(options).catch(function() { + return scrape(options).catch(() => { should(handleErrorStub.calledOnce).be.eql(true); should(handleErrorStub.args[0][0].error.message).be.eql('SCRAPER AWFUL ERROR'); }); diff --git a/test/functional/resource-without-ext/resource-without-ext.test.js b/test/functional/resource-without-ext/resource-without-ext.test.js index be818082..dfd8821d 100644 --- a/test/functional/resource-without-ext/resource-without-ext.test.js +++ b/test/functional/resource-without-ext/resource-without-ext.test.js @@ -1,26 +1,26 @@ import 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/resource-without-ext/.tmp'; const mockDirname = './test/functional/resource-without-ext/mocks'; -describe('Functional resources without extensions', function() { +describe('Functional resources without extensions', () => { - beforeEach(function() { + beforeEach(() => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(function() { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - it('should load resources without extensions with correct type and wrap with extensions', function () { + it('should load resources without extensions with correct type and wrap with extensions', async () => { const options = { urls: [ 'http://example.com/' ], directory: testDirname, @@ -52,19 +52,19 @@ describe('Functional resources without extensions', function() { nock('http://google.com').get('/').replyWithFile(200, mockDirname + '/google.html'); nock('http://google.com').get('/google.png').reply(200, 'OK'); - return scrape(options).then(function() { - // should load css file and fonts from css file - fs.existsSync(testDirname + '/css.css').should.be.eql(true); // http://fonts.googleapis.com/css?family=Lato - 
fs.existsSync(testDirname + '/UyBMtLsHKBKXelqf4x7VRQ.woff2').should.be.eql(true); - fs.existsSync(testDirname + '/1YwB1sO8YE1Lyjf12WNiUA.woff2').should.be.eql(true); + await scrape(options); - // should load html file and its sources from anchor - fs.existsSync(testDirname + '/index_1.html').should.be.eql(true); - fs.existsSync(testDirname + '/google.png').should.be.eql(true); + // should load css file and fonts from css file + (await fs.stat(testDirname + '/css.css')).isFile().should.be.eql(true); // http://fonts.googleapis.com/css?family=Lato + (await fs.stat(testDirname + '/UyBMtLsHKBKXelqf4x7VRQ.woff2')).isFile().should.be.eql(true); + (await fs.stat(testDirname + '/1YwB1sO8YE1Lyjf12WNiUA.woff2')).isFile().should.be.eql(true); - // should load html file and its sources from iframe - fs.existsSync(testDirname + '/iframe.html').should.be.eql(true); - fs.existsSync(testDirname + '/cat.png').should.be.eql(true); - }); + // should load html file and its sources from anchor + (await fs.stat(testDirname + '/index_1.html')).isFile().should.be.eql(true); + (await fs.stat(testDirname + '/google.png')).isFile().should.be.eql(true); + + // should load html file and its sources from iframe + (await fs.stat(testDirname + '/iframe.html')).isFile().should.be.eql(true); + (await fs.stat(testDirname + '/cat.png')).isFile().should.be.eql(true); }); }); diff --git a/test/functional/update-missing-sources/update-missing-sources.test.js b/test/functional/update-missing-sources/update-missing-sources.test.js index 1cacdcd2..39376010 100644 --- a/test/functional/update-missing-sources/update-missing-sources.test.js +++ b/test/functional/update-missing-sources/update-missing-sources.test.js @@ -1,7 +1,7 @@ import 'should'; import '../../utils/assertions.js'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import scrape from 'website-scraper'; const testDirname = './test/functional/update-missing-sources/.tmp'; @@ -27,18 +27,18 @@ class 
UpdateMissingResourceReferencePlugin { describe('Functional: update missing sources', () => { - beforeEach(() => { + beforeEach(async () => { nock.cleanAll(); nock.disableNetConnect(); }); - afterEach(() => { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await fs.rm(testDirname, { recursive: true, force: true }); }); - it('should not update missing sources by default', () => { + it('should not update missing sources by default', async () => { const options = { urls: [ 'http://example.com/' ], directory: testDirname, @@ -49,17 +49,16 @@ describe('Functional: update missing sources', () => { nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); nock('http://example.com/').get('/missing-img.png').replyWithError('COULDN\'T DOWNLOAD IMAGE'); - return scrape(options).then(() => { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false); + await scrape(options); + await `${testDirname}/index.html`.should.fileExists(true); + await `${testDirname}/missing-img.png`.should.fileExists(false); - const indexBody = fs.readFileSync(testDirname + '/index.html').toString(); - indexBody.should.containEql(' { + it('should update missing sources if missing resource plugin added', async () => { const options = { urls: [ 'http://example.com/' ], directory: testDirname, @@ -72,24 +71,23 @@ describe('Functional: update missing sources', () => { nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); nock('http://example.com/').get('/missing-img.png').replyWithError('COULDN\'T DOWNLOAD IMAGE'); - return scrape(options).then(() => { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false); + await scrape(options); + await `${testDirname}/index.html`.should.fileExists(true); + await 
`${testDirname}/missing-img.png`.should.fileExists(false); - const indexBody = fs.readFileSync(testDirname + '/index.html').toString(); - indexBody.should.containEql(' { + it('should update missing sources when source was rejected by urlFilter', async () => { const options = { urls: [ 'http://example.com/' ], directory: testDirname, subdirectories: null, sources: [{ selector: 'img', attr: 'src' }], plugins: [ new UpdateMissingResourceReferencePlugin() ], - urlFilter: function (url) { + urlFilter: (url) => { return url.indexOf('/missing-img.png') === -1; } }; @@ -97,17 +95,16 @@ describe('Functional: update missing sources', () => { nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); nock('http://example.com/').get('/missing-img.png').reply(200, 'ok'); - return scrape(options).then(() => { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false); + await scrape(options); + await `${testDirname}/index.html`.should.fileExists(true); + await `${testDirname}/missing-img.png`.should.fileExists(false); - const indexBody = fs.readFileSync(testDirname + '/index.html').toString(); - indexBody.should.containEql(' { + it('should update missing sources when source was rejected by maxRecursiveDepth', async () => { const options = { urls: [ 'http://example.com/' ], directory: testDirname, @@ -123,18 +120,17 @@ describe('Functional: update missing sources', () => { nock('http://example.com/').get('/link1.html').replyWithFile(200, mockDirname + '/link1.html'); nock('http://example.com/').get('/missing-link.html').reply(200, 'ok'); - return scrape(options).then(() => { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - fs.existsSync(testDirname + '/link1.html').should.be.eql(true); - fs.existsSync(testDirname + '/missing-link.html').should.be.eql(false); + await scrape(options); + await `${testDirname}/index.html`.should.fileExists(true); + await 
`${testDirname}/link1.html`.should.fileExists(true); + await `${testDirname}/missing-link.html`.should.fileExists(false); - const link = fs.readFileSync(testDirname + '/link1.html').toString(); - link.should.containEql(' { + it('should update missing sources if one of pathContainers path was failed', async () => { const options = { urls: [ 'http://example.com/' ], directory: testDirname, @@ -149,18 +145,17 @@ describe('Functional: update missing sources', () => { nock('http://example.com/').get('/b.png').replyWithError('Failed!'); nock('http://example.com/').get('/c.png').reply(200, 'ok'); - return scrape(options).then(() => { - fs.existsSync(testDirname + '/index.html').should.be.eql(true); - fs.existsSync(testDirname + '/a.png').should.be.eql(true); - fs.existsSync(testDirname + '/b.png').should.be.eql(false); - fs.existsSync(testDirname + '/c.png').should.be.eql(true); + await scrape(options); + await `${testDirname}/index.html`.should.fileExists(true); + await `${testDirname}/a.png`.should.fileExists(true); + await `${testDirname}/b.png`.should.fileExists(false); + await `${testDirname}/c.png`.should.fileExists(true); - const index = fs.readFileSync(testDirname + '/index.html').toString(); - index.should.containEql(`.a { background: url('a.png') }`); - index.should.containEql(`.b { background: url('http://example.com/b.png') }`); - index.should.containEql(`.c { background: url('c.png') }`); - }); + const index = (await fs.readFile(testDirname + '/index.html', { encoding: 'binary' })); + index.should.containEql(`.a { background: url('a.png') }`); + index.should.containEql(`.b { background: url('http://example.com/b.png') }`); + index.should.containEql(`.c { background: url('c.png') }`); }); }); diff --git a/test/unit/request-test.js b/test/unit/request-test.js index 80db6cc0..746352c9 100644 --- a/test/unit/request-test.js +++ b/test/unit/request-test.js @@ -49,7 +49,7 @@ describe('request', () => { it('should call afterResponse with correct params', () => { 
const url = 'http://example.com'; const scope = nock(url).get('/').reply(200, 'TEST BODY'); - let handlerStub = sinon.stub().resolves(''); + let handlerStub = sinon.stub().resolves({ body: '' }); return request.get({url, afterResponse: handlerStub}).then(() => { scope.isDone().should.be.eql(true); @@ -94,7 +94,7 @@ describe('request', () => { it('should transform string result', () => { const url = 'http://example.com'; nock(url).get('/').reply(200, 'TEST BODY'); - const handlerStub = sinon.stub().resolves('test body'); + const handlerStub = sinon.stub().resolves({ body: 'test body' }); return request.get({url, afterResponse: handlerStub}).then((data) => { should(data.body).be.eql('test body'); @@ -268,11 +268,15 @@ describe('transformResult', () => { }); it('should handle raw string input', () => { - const result = request.transformResult('SOME BODY'); + try { + request.transformResult('SOME BODY'); - should(result).have.property('body', 'SOME BODY'); - should(result).have.property('encoding', 'binary'); - should(result).have.property('metadata', null); + // We shouldn't get here. 
+ should(true).eql(false); + } catch (e) { + should(e).be.instanceOf(Error); + should(e.message).eql('afterResponse handler returned a string, expected object'); + } }); it('should handle null input', () => { diff --git a/test/unit/resource-handler/css.test.js b/test/unit/resource-handler/css.test.js index 13b1eec3..49cd631d 100644 --- a/test/unit/resource-handler/css.test.js +++ b/test/unit/resource-handler/css.test.js @@ -4,15 +4,15 @@ import Resource from '../../../lib/resource.js'; import CssResourceHandler from '../../../lib/resource-handler/css/index.js'; describe('ResourceHandler: Css', () => { - it('should call downloadChildrenResources and set returned text to resource', () => { + it('should call downloadChildrenResources and set returned text to resource', async () => { const downloadChildrenPaths = sinon.stub().resolves('updated text'); const originalResource = new Resource('http://example.com'); const cssHandler = new CssResourceHandler({}, {downloadChildrenPaths}); - return cssHandler.handle(originalResource).then((updatedResource) => { - should(updatedResource).be.equal(originalResource); - should(updatedResource.getText()).be.eql('updated text'); - }); + const updatedResource = await cssHandler.handle(originalResource); + + should(updatedResource).be.equal(originalResource); + should(await updatedResource.getText()).be.eql('updated text'); }); }); diff --git a/test/unit/resource-handler/html.test.js b/test/unit/resource-handler/html.test.js index eefe6a35..8d25c3dd 100644 --- a/test/unit/resource-handler/html.test.js +++ b/test/unit/resource-handler/html.test.js @@ -70,7 +70,7 @@ describe('ResourceHandler: Html', () => { htmlHandler = new HtmlHandler({ sources: [] }, {downloadChildrenPaths}); }); - it('should remove base tag from text and update resource url for absolute href', () => { + it('should remove base tag from text and update resource url for absolute href', async () => { const html = ` @@ -80,15 +80,14 @@ describe('ResourceHandler: Html', 
() => { `; const resource = new Resource('http://example.com', 'index.html'); - resource.setText(html); + await resource.setText(html); - return htmlHandler.handle(resource).then(() =>{ - resource.getUrl().should.be.eql('http://some-other-domain.com/src'); - resource.getText().should.not.containEql(' { + it('should remove base tag from text and update resource url for relative href', async () => { const html = ` @@ -98,15 +97,14 @@ describe('ResourceHandler: Html', () => { `; const resource = new Resource('http://example.com', 'index.html'); - resource.setText(html); + await resource.setText(html); - return htmlHandler.handle(resource).then(() => { - resource.getUrl().should.be.eql('http://example.com/src'); - resource.getText().should.not.containEql(' { + it('should not remove base tag if it doesn\'t have href attribute', async () => { const html = ` @@ -116,16 +114,15 @@ describe('ResourceHandler: Html', () => { `; const resource = new Resource('http://example.com', 'index.html'); - resource.setText(html); + await resource.setText(html); - return htmlHandler.handle(resource).then(() => { - resource.getUrl().should.be.eql('http://example.com'); - resource.getText().should.containEql(''); - }); + const handledResource = await htmlHandler.handle(resource); + handledResource.getUrl().should.be.eql('http://example.com'); + (await handledResource.getText()).should.containEql(''); }); }); - it('should not encode text to html entities', () => { + it('should not encode text to html entities', async () => { htmlHandler = new HtmlHandler({ sources: [] }, {downloadChildrenPaths}); const html = ` @@ -136,14 +133,13 @@ describe('ResourceHandler: Html', () => { `; const resource = new Resource('http://example.com', 'index.html'); - resource.setText(html); + await resource.setText(html); - return htmlHandler.handle(resource).then(() => { - resource.getText().should.containEql('Этот текст не должен быть преобразован в html entities'); - }); + const handledResource = await 
htmlHandler.handle(resource); + (await handledResource.getText()).should.containEql('Этот текст не должен быть преобразован в html entities'); }); - it('should not update attribute names to lowercase', () => { + it('should not update attribute names to lowercase', async () => { htmlHandler = new HtmlHandler({ sources: [] }, {downloadChildrenPaths}); const html = ` @@ -156,14 +152,13 @@ describe('ResourceHandler: Html', () => { `; const resource = new Resource('http://example.com', 'index.html'); - resource.setText(html); + await resource.setText(html); - return htmlHandler.handle(resource).then(() => { - resource.getText().should.containEql('viewBox="0 0 100 100"'); - }); + const handledResource = await htmlHandler.handle(resource); + (await handledResource.getText()).should.containEql('viewBox="0 0 100 100"'); }); - it('should call downloadChildrenResources for each source', () => { + it('should call downloadChildrenResources for each source', async () => { const sources = [{ selector: 'img', attr: 'src' }]; htmlHandler = new HtmlHandler({sources}, {downloadChildrenPaths}); @@ -179,14 +174,13 @@ describe('ResourceHandler: Html', () => { `; const resource = new Resource('http://example.com', 'index.html'); - resource.setText(html); + await resource.setText(html); - return htmlHandler.handle(resource).then(() =>{ - htmlHandler.downloadChildrenPaths.calledThrice.should.be.eql(true); - }); + await htmlHandler.handle(resource); + htmlHandler.downloadChildrenPaths.calledThrice.should.be.eql(true); }); - it('should not call downloadChildrenResources if source attr is empty', () =>{ + it('should not call downloadChildrenResources if source attr is empty', async () =>{ const sources = [{ selector: 'img', attr: 'src' }]; htmlHandler = new HtmlHandler({sources}, {downloadChildrenPaths}); @@ -198,14 +192,59 @@ describe('ResourceHandler: Html', () => { `; const resource = new Resource('http://example.com', 'index.html'); - resource.setText(html); + await 
resource.setText(html); - return htmlHandler.handle(resource).then(() =>{ - htmlHandler.downloadChildrenPaths.called.should.be.eql(false); - }); + await htmlHandler.handle(resource); + htmlHandler.downloadChildrenPaths.called.should.be.eql(false); + }); + + it('should allow custom container classes', async () => { + class TestJsonClass { + constructor (text) { + this.text = text || ''; + this.paths = []; + + if (this.text) { + this.paths = JSON.parse(this.text); + } + } + + getPaths () { + return this.paths; + } + + updateText (pathsToUpdate) { + this.paths = this.paths.map((oldPath) => { + const toUpdate = pathsToUpdate.find((x) => x.oldPath === oldPath); + + return toUpdate ? toUpdate.newPath : oldPath; + }); + + return JSON.stringify(this.paths); + } + } + + const sources = [{ selector: 'div', attr: 'data-json', containerClass: TestJsonClass }]; + htmlHandler = new HtmlHandler({sources}, {downloadChildrenPaths}); + + const html = ` + + +
+ + `; + + const resource = new Resource('http://example.com', 'index.html'); + await resource.setText(html); + + await htmlHandler.handle(resource); + + htmlHandler.downloadChildrenPaths.called.should.be.eql(true); + htmlHandler.downloadChildrenPaths.args[0][0].should.be.instanceOf(TestJsonClass); + htmlHandler.downloadChildrenPaths.args[0][0].paths.should.eql(['foo/bar.jpg', 'foo/baz.jpg']); }); - it('should use correct path containers based on tag', () => { + it('should use correct path containers based on tag', async () => { const sources = [ { selector: 'img', attr: 'src' }, { selector: 'img', attr: 'srcset' }, @@ -225,17 +264,16 @@ describe('ResourceHandler: Html', () => { `; const resource = new Resource('http://example.com', 'index.html'); - resource.setText(html); + await resource.setText(html); - return htmlHandler.handle(resource).then(() =>{ - htmlHandler.downloadChildrenPaths.calledThrice.should.be.eql(true); - htmlHandler.downloadChildrenPaths.args[0][0].should.be.instanceOf(HtmlCommonTag); - htmlHandler.downloadChildrenPaths.args[1][0].should.be.instanceOf(HtmlImgSrcsetTag); - htmlHandler.downloadChildrenPaths.args[2][0].should.be.instanceOf(CssText); - }); + await htmlHandler.handle(resource); + htmlHandler.downloadChildrenPaths.calledThrice.should.be.eql(true); + htmlHandler.downloadChildrenPaths.args[0][0].should.be.instanceOf(HtmlCommonTag); + htmlHandler.downloadChildrenPaths.args[1][0].should.be.instanceOf(HtmlImgSrcsetTag); + htmlHandler.downloadChildrenPaths.args[2][0].should.be.instanceOf(CssText); }); - it('should remove SRI check for loaded resources', () => { + it('should remove SRI check for loaded resources', async () => { const sources = [ { selector: 'script', attr: 'src'} ]; @@ -253,18 +291,19 @@ describe('ResourceHandler: Html', () => { `; const resource = new Resource('http://example.com', 'index.html'); - resource.setText(html); + await resource.setText(html); // before handle should contain both integrity checks - 
resource.getText().should.containEql('integrity="sha256-gaWb8m2IHSkoZnT23u/necREOC//MiCFtQukVUYMyuU="'); - resource.getText().should.containEql('integrity="sha256-X+Q/xqnlEgxCczSjjpp2AUGGgqM5gcBzhRQ0p+EAUEk="'); - - return htmlHandler.handle(resource).then(() => { - // after handle should contain integrity check for styles - // but not contain integrity check for script because it was loaded - resource.getText().should.containEql('integrity="sha256-gaWb8m2IHSkoZnT23u/necREOC//MiCFtQukVUYMyuU="'); - resource.getText().should.not.containEql('integrity="sha256-X+Q/xqnlEgxCczSjjpp2AUGGgqM5gcBzhRQ0p+EAUEk="'); - }); + (await resource.getText()).should.containEql('integrity="sha256-gaWb8m2IHSkoZnT23u/necREOC//MiCFtQukVUYMyuU="'); + (await resource.getText()).should.containEql('integrity="sha256-X+Q/xqnlEgxCczSjjpp2AUGGgqM5gcBzhRQ0p+EAUEk="'); + + + + const handledResource = await htmlHandler.handle(resource); + // after handle should contain integrity check for styles + // but not contain integrity check for script because it was loaded + (await handledResource.getText()).should.containEql('integrity="sha256-gaWb8m2IHSkoZnT23u/necREOC//MiCFtQukVUYMyuU="'); + (await handledResource.getText()).should.not.containEql('integrity="sha256-X+Q/xqnlEgxCczSjjpp2AUGGgqM5gcBzhRQ0p+EAUEk="'); }); it('should use html entities for updated attributes', async () => { @@ -283,10 +322,10 @@ describe('ResourceHandler: Html', () => { `; const resource = new Resource('http://example.com', 'index.html'); - resource.setText(html); + await resource.setText(html); - await htmlHandler.handle(resource); - const text = resource.getText(); + const handledResource = await htmlHandler.handle(resource); + const text = await handledResource.getText(); should(text).containEql('style="width: 300px; height: 300px; background-image:url("./images/cat.jpg")"'); }); diff --git a/test/unit/resource-test.js b/test/unit/resource-test.js index 8f0d14e3..303acb42 100644 --- a/test/unit/resource-test.js +++ 
b/test/unit/resource-test.js @@ -1,28 +1,33 @@ import 'should'; import Resource from '../../lib/resource.js'; +import fs from 'fs/promises'; +import '../utils/assertions.js'; +import path from 'path'; +import os from 'os'; +import should from 'should'; -describe('Resource', function() { - describe('#createChild', function () { - it('should return Resource', function() { +describe('Resource', () => { + describe('#createChild', () => { + it('should return Resource', () => { const parent = new Resource('http://example.com'); const child = parent.createChild('http://google.com'); child.should.be.instanceOf(Resource); }); - it('should set correct url and filename', function() { + it('should set correct url and filename', () => { const parent = new Resource('http://example.com'); const child = parent.createChild('http://google.com', 'google.html'); child.getUrl().should.be.eql('http://google.com'); child.getFilename().should.equalFileSystemPath('google.html'); }); - it('should set parent', function() { + it('should set parent', () => { const parent = new Resource('http://example.com'); const child = parent.createChild('http://google.com'); child.parent.should.be.equal(parent); }); - it('should set depth', function() { + it('should set depth', () => { const parent = new Resource('http://example.com'); const child = parent.createChild('http://google.com'); child.depth.should.be.eql(1); @@ -31,4 +36,50 @@ describe('Resource', function() { childOfChild.depth.should.be.eql(2); }); }); + + describe('set/get text', () => { + const testString1 = '동안 한국에서'; + const testString2 = '加入网站'; + + it('memory mode', async () => { + const resource = new Resource('http://example.com', 'index.html', 'memory'); + resource.setEncoding('utf8'); + + await resource.setText(testString1); + (await resource.getText()).should.eql(testString1); + + await resource.setText(testString2); + (await resource.getText()).should.eql(testString2); + }); + + it('memory-compressed mode', async () => { + const 
resource = new Resource('http://example.com', 'index.html', 'memory-compressed'); + resource.setEncoding('utf8'); + + await resource.setText(testString1); + (await resource.getText()).should.eql(testString1); + (resource.text).should.not.eql(testString1); + should(resource.text).instanceof(Buffer); + + await resource.setText(testString2); + (await resource.getText()).should.eql(testString2); + }); + + it('filesystem mode', async () => { + const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'website-scraper-')); + + try { + const resource = new Resource('http://example.com', 'index.html', 'filesystem', tmpDir); + resource.setEncoding('utf8'); + + await resource.setText(testString1); + (await resource.getText()).should.eql(testString1); + + await resource.setText(testString2); + (await resource.getText()).should.eql(testString2); + } finally { + await fs.rm(tmpDir, { recursive: true, force: true }); + } + }); + }); }); diff --git a/test/unit/scraper-init-test.js b/test/unit/scraper-init-test.js index 080c5976..da9a38e9 100644 --- a/test/unit/scraper-init-test.js +++ b/test/unit/scraper-init-test.js @@ -8,9 +8,9 @@ import defaultOptions from 'website-scraper/defaultOptions'; const testDirname = './test/unit/.scraper-init-test'; const urls = [ 'http://example.com' ]; -describe('Scraper initialization', function () { - describe('defaultFilename', function() { - it('should use default defaultFilename if no defaultFilename were passed', function () { +describe('Scraper initialization', () => { + describe('defaultFilename', () => { + it('should use default defaultFilename if no defaultFilename were passed', () => { const s = new Scraper({ urls: urls, directory: testDirname @@ -19,7 +19,7 @@ describe('Scraper initialization', function () { s.options.defaultFilename.should.equalFileSystemPath(defaultOptions.defaultFilename); }); - it('should use defaultFilename sources if defaultFilename were passed', function () { + it('should use defaultFilename sources if 
defaultFilename were passed', () => { const s = new Scraper({ urls: urls, directory: testDirname, @@ -30,8 +30,8 @@ describe('Scraper initialization', function () { }); }); - describe('sources', function() { - it('should use default sources if no sources were passed', function () { + describe('sources', () => { + it('should use default sources if no sources were passed', () => { const s = new Scraper({ urls: urls, directory: testDirname @@ -41,7 +41,7 @@ describe('Scraper initialization', function () { s.options.sources.length.should.be.greaterThan(0); }); - it('should use passed sources if sources were passed', function () { + it('should use passed sources if sources were passed', () => { const s = new Scraper({ urls: urls, directory: testDirname, @@ -51,7 +51,7 @@ describe('Scraper initialization', function () { s.options.sources.should.eql([ { selector: 'img', attr: 'src' } ]); }); - it('should extend sources if recursive flag is set', function() { + it('should extend sources if recursive flag is set', () => { const s = new Scraper({ urls: { url: 'http://first-url.com' }, directory: testDirname, @@ -67,8 +67,8 @@ describe('Scraper initialization', function () { }); }); - describe('subdirectories', function () { - it('should use default subdirectories if no subdirectories were passed', function () { + describe('subdirectories', () => { + it('should use default subdirectories if no subdirectories were passed', () => { const s = new Scraper({ urls: urls, directory: testDirname @@ -78,7 +78,7 @@ describe('Scraper initialization', function () { s.options.subdirectories.length.should.be.greaterThan(0); }); - it('should convert extensions to lower case', function () { + it('should convert extensions to lower case', () => { const s = new Scraper({ urls: urls, @@ -91,7 +91,7 @@ describe('Scraper initialization', function () { s.options.subdirectories[0].extensions.should.eql(['.txt']); }); - it('should use passed subdirectories if subdirectories were passed', function () 
{ + it('should use passed subdirectories if subdirectories were passed', () => { const s = new Scraper({ urls: urls, directory: testDirname, @@ -101,7 +101,7 @@ describe('Scraper initialization', function () { s.options.subdirectories.should.eql([ { directory: 'js', extensions: ['.js'] } ]); }); - it('should use null if null was passed', function () { + it('should use null if null was passed', () => { const s = new Scraper({ urls: urls, directory: testDirname, @@ -112,8 +112,8 @@ describe('Scraper initialization', function () { }); }); - describe('request', function () { - it('should use default request if no request were passed', function () { + describe('request', () => { + it('should use default request if no request were passed', () => { const s = new Scraper({ urls: urls, directory: testDirname @@ -129,7 +129,7 @@ describe('Scraper initialization', function () { }); }); - it('should merge default and passed objects if request were passed', function () { + it('should merge default and passed objects if request were passed', () => { const s = new Scraper({ urls: urls, directory: testDirname, @@ -154,7 +154,7 @@ describe('Scraper initialization', function () { }); }); - it('should override existing properties if request were passed', function () { + it('should override existing properties if request were passed', () => { const s = new Scraper({ urls: urls, directory: testDirname, @@ -173,8 +173,8 @@ describe('Scraper initialization', function () { }); }); - describe('resourceHandler', function () { - it('should create resourceHandler with correct params', function() { + describe('resourceHandler', () => { + it('should create resourceHandler with correct params', () => { const options = { urls: { url: 'http://first-url.com' }, directory: testDirname, @@ -187,8 +187,8 @@ describe('Scraper initialization', function () { }); }); - describe('urls', function () { - it('should create an Array of urls if string was passed', function() { + describe('urls', () => { + 
it('should create an Array of urls if string was passed', () => { const s = new Scraper({ urls: 'http://not-array-url.com', directory: testDirname @@ -199,8 +199,8 @@ describe('Scraper initialization', function () { }); }); - describe('resources', function () { - it('should create Resource object for each url', function() { + describe('resources', () => { + it('should create Resource object for each url', () => { const s = new Scraper({ urls: [ 'http://first-url.com', @@ -219,7 +219,7 @@ describe('Scraper initialization', function () { s.resources[2].url.should.be.eql('http://third-url.com'); }); - it('should use urls filename', function() { + it('should use urls filename', () => { const s = new Scraper({ urls: { url: 'http://first-url.com', filename: 'first.html' }, directory: testDirname @@ -227,7 +227,7 @@ describe('Scraper initialization', function () { s.resources[0].getFilename().should.equalFileSystemPath('first.html'); }); - it('should use default filename if no url filename was provided', function() { + it('should use default filename if no url filename was provided', () => { const s = new Scraper({ urls: { url: 'http://first-url.com' }, defaultFilename: 'default.html', diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js index 0cfa780c..2f33ba6b 100644 --- a/test/unit/scraper-test.js +++ b/test/unit/scraper-test.js @@ -1,10 +1,11 @@ import should from 'should'; import sinon from 'sinon'; import nock from 'nock'; -import fs from 'fs-extra'; +import fs from 'fs/promises'; import path from 'path'; import Scraper from '../../lib/scraper.js'; import Resource from '../../lib/resource.js'; +import '../utils/assertions.js'; import defaultOptions from 'website-scraper/defaultOptions'; import * as plugins from 'website-scraper/plugins'; @@ -18,10 +19,10 @@ describe('Scraper', () => { nock.disableNetConnect(); }); - afterEach(() => { + afterEach(async () => { nock.cleanAll(); nock.enableNetConnect(); - fs.removeSync(testDirname); + await 
fs.rm(testDirname, { recursive: true, force: true }); }); describe('#loadResource', () => { @@ -56,7 +57,7 @@ describe('Scraper', () => { }); describe('#saveResource', () => { - it('should call handleError on error', () => { + it('should call handleError on error', async () => { const s = new Scraper({ urls: 'http://example.com', directory: testDirname @@ -67,7 +68,7 @@ describe('Scraper', () => { sinon.stub(s, 'handleError').resolves(); const r = new Resource('http://example.com/a.png', 'a.png'); - r.setText('some text'); + await r.setText('some text'); return s.saveResource(r).then(() => should(true).eql(false)).catch(() => { s.handleError.calledOnce.should.be.eql(true); @@ -103,7 +104,7 @@ describe('Scraper', () => { rr.should.be.eql(r); rr.getUrl().should.be.eql('http://example.com/a.png'); rr.getFilename().should.be.not.empty(); - rr.getText().should.be.not.empty(); + (await rr.getText()).should.be.not.empty(); }); it('should return null if the urlFilter returns false', async () =>{ @@ -138,7 +139,7 @@ describe('Scraper', () => { rr.should.be.eql(r); rr.getUrl().should.be.eql('http://example.com'); rr.getFilename().should.be.not.empty(); - rr.getText().should.be.not.empty(); + (await rr.getText()).should.be.not.empty(); }); }); @@ -160,7 +161,7 @@ describe('Scraper', () => { rr.should.be.eql(r); rr.getUrl().should.be.eql('http://example.com/a.png'); rr.getFilename().should.be.not.empty(); - rr.getText().should.be.not.empty(); + (await rr.getText()).should.be.not.empty(); }); it('should request the resource if maxDepth is set and resource depth is less than maxDept', async () =>{ @@ -181,7 +182,7 @@ describe('Scraper', () => { rr.should.be.eql(r); rr.getUrl().should.be.eql('http://example.com/a.png'); rr.getFilename().should.be.not.empty(); - rr.getText().should.be.not.empty(); + (await rr.getText()).should.be.not.empty(); }); it('should request the resource if maxDepth is set and resource depth is equal to maxDept', async () =>{ @@ -201,7 +202,7 @@ 
describe('Scraper', () => { rr.should.be.eql(r); rr.getUrl().should.be.eql('http://example.com/a.png'); rr.getFilename().should.be.not.empty(); - rr.getText().should.be.not.empty(); + (await rr.getText()).should.be.not.empty(); }); it('should return null if maxDepth is set and resource depth is greater than maxDepth', async () =>{ @@ -268,7 +269,7 @@ describe('Scraper', () => { const r = new Resource('http://example.com'); await s.requestResource(r); - should(r.getText()).be.eql('test body'); + should((await r.getText())).be.eql('test body'); should(r.getUrl()).be.eql('http://example.com'); should(r.getType()).be.eql('html'); should(r.getFilename()).be.eql('generated-filename'); @@ -457,7 +458,7 @@ describe('Scraper', () => { await s.scrape(); - should(fs.existsSync(testDirname)).be.eql(true); + await testDirname.should.dirExists(true); }); it('should save resource to FS', async () => { @@ -470,8 +471,8 @@ describe('Scraper', () => { await s.scrape(); const filename = path.join(testDirname, 'index.html'); - should(fs.existsSync(filename)).be.eql(true); - should(fs.readFileSync(filename).toString()).be.eql('some text'); + await filename.should.fileExists(true); + should(await fs.readFile(filename, { encoding: 'binary' })).be.eql('some text'); }); it('should remove directory on error', async () => { @@ -491,7 +492,8 @@ describe('Scraper', () => { } catch (err) { should(err).be.instanceOf(Error); should(err.message).be.eql('Response code 400 (Bad Request)'); - should(fs.existsSync(testDirname)).be.eql(false); + + await testDirname.should.dirExists(false); } }); @@ -538,7 +540,8 @@ describe('Scraper', () => { it('should return error if existing directory passed', async () => { try { - fs.mkdirpSync(testDirname); + await fs.mkdir(testDirname, { recursive: true }); + const s = new Scraper({ urls: 'http://example.com', directory: testDirname @@ -569,8 +572,10 @@ describe('Scraper', () => { should(s.options.plugins[0]).be.instanceOf(plugins.GenerateFilenameByTypePlugin); 
const filename = path.join(testDirname, 'index.html'); - should(fs.existsSync(filename)).be.eql(true); - should(fs.readFileSync(filename).toString()).be.eql('some text'); + await filename.should.fileExists(true); + + const index = await fs.readFile(filename, { encoding: 'binary'}); + index.should.be.eql('some text'); }); it('should use bySiteStructure plugin if filenameGenerator option is set', async () => { @@ -586,8 +591,10 @@ describe('Scraper', () => { should(s.options.plugins[0]).be.instanceOf(plugins.GenerateFilenameBySiteStructurePlugin); const filename = path.join(testDirname, 'example.com/index.html'); - should(fs.existsSync(filename)).be.eql(true); - should(fs.readFileSync(filename).toString()).be.eql('some text'); + await filename.should.fileExists(true); + + const index = await fs.readFile(filename, { encoding: 'binary'}); + index.should.be.eql('some text'); }); it('should ignore filenameGenerator option if function passed', async () => { diff --git a/test/unit/utils/utils-test.js b/test/unit/utils/utils-test.js index 80aab4a0..80e0ea4e 100644 --- a/test/unit/utils/utils-test.js +++ b/test/unit/utils/utils-test.js @@ -5,19 +5,21 @@ import { getFilepathFromUrl, getHashFromUrl, getRelativePath, shortenFilename, prettifyFilename, isUriSchemaSupported, urlsEqual, - normalizeUrl + normalizeUrl, + exists } from '../../../lib/utils/index.js'; +import { fileURLToPath } from 'url'; -describe('Utils', function () { - describe('#isUrl(url)', function () { - it('should return true if url starts with "http[s]://"', function () { +describe('Utils', () => { + describe('#isUrl(url)', () => { + it('should return true if url starts with "http[s]://"', () => { isUrl('http://google.com').should.be.true(); isUrl('https://github.com').should.be.true(); }); - it('should return true if url starts with "//"', function () { + it('should return true if url starts with "//"', () => { isUrl('//www.youtube.com').should.be.true(); }); - it('should return false if url starts neither 
with "http[s]://" nor "//"', function () { + it('should return false if url starts neither with "http[s]://" nor "//"', () => { isUrl('http//www.youtube.com').should.be.false(); isUrl('http:/www.youtube.com').should.be.false(); isUrl('htt://www.youtube.com').should.be.false(); @@ -26,33 +28,33 @@ describe('Utils', function () { }); }); - describe('#getUrl(url, path)', function () { - it('should return url + path if path is not url', function () { + describe('#getUrl(url, path)', () => { + it('should return url + path if path is not url', () => { getUrl('http://google.com', '/path').should.be.equal('http://google.com/path'); getUrl('http://google.com/qwe/qwe/qwe', '/path').should.be.equal('http://google.com/path'); getUrl('http://google.com?kjrdrgek=dmskl', '/path').should.be.equal('http://google.com/path'); }); - it('should return path if it is url', function () { + it('should return path if it is url', () => { getUrl('http://google.com', 'http://my.site.com/').should.be.equal('http://my.site.com/'); getUrl('http://google.com/qwe/qwe/qwe', '//my.site.com').should.be.equal('http://my.site.com/'); }); - it('should use the protocol from the url, if the path is a protocol-less url', function () { + it('should use the protocol from the url, if the path is a protocol-less url', () => { getUrl('http://my.site.com', '//cdn.com/library.js').should.be.equal('http://cdn.com/library.js'); getUrl('https://my.site.com', '//cdn.com/library.js').should.be.equal('https://cdn.com/library.js'); }); }); - describe('#getUnixPath(path)', function () { - it('should convert to unix format for windows', function () { + describe('#getUnixPath(path)', () => { + it('should convert to unix format for windows', () => { getUnixPath('D:\\Projects\\node-website-scraper').should.be.equal('D:/Projects/node-website-scraper'); }); - it('should return unconverted path for unix', function () { + it('should return unconverted path for unix', () => { 
getUnixPath('/home/sophia/projects/node-website-scraper').should.be.equal('/home/sophia/projects/node-website-scraper'); }); }); - describe('#getFilenameFromUrl(url)', function () { - it('should return last path item as filename & trim all after first ? or #', function () { + describe('#getFilenameFromUrl(url)', () => { + it('should return last path item as filename & trim all after first ? or #', () => { getFilenameFromUrl('http://example.com/index.html').should.equal('index.html'); getFilenameFromUrl('http://example.com/p/a/t/h/index.html').should.equal('index.html'); getFilenameFromUrl('http://example.com/index.html?12').should.equal('index.html'); @@ -62,16 +64,16 @@ describe('Utils', function () { getFilenameFromUrl('http://example.com/#index.html').should.equal(''); getFilenameFromUrl('http://example.com/').should.equal(''); }); - it('should return unconverted filename if there are no ?,#', function () { + it('should return unconverted filename if there are no ?,#', () => { getFilenameFromUrl('index.html').should.equal('index.html'); }); - it('should decode escaped chars', function () { + it('should decode escaped chars', () => { getFilenameFromUrl('https://example.co/logo-mobile%20(1).svg?q=650').should.equal('logo-mobile (1).svg'); }); }); - describe('#getFilepathFromUrl', function () { - it('should return empty sting if url has no pathname', function() { + describe('#getFilepathFromUrl', () => { + it('should return empty sting if url has no pathname', () => { getFilepathFromUrl('http://example.com').should.equal(''); getFilepathFromUrl('http://example.com/').should.equal(''); getFilepathFromUrl('http://example.com?').should.equal(''); @@ -79,21 +81,21 @@ describe('Utils', function () { getFilepathFromUrl('http://example.com#').should.equal(''); getFilepathFromUrl('http://example.com#test').should.equal(''); }); - it('should return path if url has pathname', function() { + it('should return path if url has pathname', () => { 
getFilepathFromUrl('http://example.com/some/path').should.equal('some/path'); }); - it('should return path including filename if url has pathname', function() { + it('should return path including filename if url has pathname', () => { getFilepathFromUrl('http://example.com/some/path/file.js').should.equal('some/path/file.js'); }); - it('should not contain trailing slash', function() { + it('should not contain trailing slash', () => { getFilepathFromUrl('http://example.com/some/path/').should.equal('some/path'); getFilepathFromUrl('http://example.com/some/path/file.css/').should.equal('some/path/file.css'); }); - it('should normalize slashes', function() { + it('should normalize slashes', () => { getFilepathFromUrl('http://example.com///some//path').should.equal('some/path'); getFilepathFromUrl('http://example.com//////////file.css/').should.equal('file.css'); }); - it('should decode escaped chars', function () { + it('should decode escaped chars', () => { getFilepathFromUrl('https://example.co/logo/logo-mobile%20(1).svg?q=650').should.equal('logo/logo-mobile (1).svg'); }); it('should return path as is if url is malformed', () => { @@ -101,38 +103,38 @@ describe('Utils', function () { }); }); - describe('#getHashFromUrl', function () { - it('should return hash from url', function () { + describe('#getHashFromUrl', () => { + it('should return hash from url', () => { getHashFromUrl('#').should.be.equal('#'); getHashFromUrl('#hash').should.be.equal('#hash'); getHashFromUrl('page.html#hash').should.be.equal('#hash'); getHashFromUrl('http://example.com/page.html#hash').should.be.equal('#hash'); }); - it('should return empty string if url doesn\'t contain hash', function () { + it('should return empty string if url doesn\'t contain hash', () => { getHashFromUrl('').should.be.equal(''); getHashFromUrl('page.html?a=b').should.be.equal(''); getHashFromUrl('http://example.com/page.html?a=b').should.be.equal(''); }); }); - describe('#getRelativePath', function () { - 
it('should return relative path', function () { + describe('#getRelativePath', () => { + it('should return relative path', () => { getRelativePath('css/1.css', 'img/1.png').should.be.equal('../img/1.png'); getRelativePath('index.html', 'img/1.png').should.be.equal('img/1.png'); getRelativePath('css/1.css', 'css/2.css').should.be.equal('2.css'); }); - it('should escape path components with encodeURIComponent', function () { + it('should escape path components with encodeURIComponent', () => { getRelativePath('index.html', 'a/css?family=Open+Sans:300,400,600,700&lang=en').should.be.equal('a/css%3Ffamily%3DOpen%2BSans%3A300%2C400%2C600%2C700%26lang%3Den'); }); - it('should also escape [\'()]', function () { + it('should also escape [\'()]', () => { getRelativePath('index.html', '\'single quote for html attrs\'').should.be.equal('%27single%20quote%20for%20html%20attrs%27'); getRelativePath('index.html', '(parenthesizes for css url)').should.be.equal('%28parenthesizes%20for%20css%20url%29'); }); }); - describe('#shortenFilename', function() { - it('should leave file with length < 255 as is', function() { + describe('#shortenFilename', () => { + it('should leave file with length < 255 as is', () => { var f1 = _.repeat('a', 25); should(f1.length).be.eql(25); should(shortenFilename(f1)).be.eql(f1); @@ -142,33 +144,33 @@ describe('Utils', function () { should(shortenFilename(f2)).be.eql(f2); }); - it('should shorten file with length = 255', function() { + it('should shorten file with length = 255', () => { var f1 = _.repeat('a', 255); should(f1.length).be.eql(255); should(shortenFilename(f1).length).be.lessThan(255); }); - it('should shorten file with length > 255', function() { + it('should shorten file with length > 255', () => { var f1 = _.repeat('a', 1255); should(f1.length).be.eql(1255); should(shortenFilename(f1).length).be.lessThan(255); }); - it('should shorten file with length = 255 and keep extension', function() { + it('should shorten file with length = 255 and 
keep extension', () => { var f1 = _.repeat('a', 251) + '.txt'; should(f1.length).be.eql(255); should(shortenFilename(f1).length).be.lessThan(255); should(shortenFilename(f1).split('.')[1]).be.eql('txt'); }); - it('should shorten file with length > 255 and keep extension', function() { + it('should shorten file with length > 255 and keep extension', () => { var f1 = _.repeat('a', 1251) + '.txt'; should(f1.length).be.eql(1255); should(shortenFilename(f1).length).be.lessThan(255); should(shortenFilename(f1).split('.')[1]).be.eql('txt'); }); - it('should shorten file with length > 255 to have basename length 20 chars', function() { + it('should shorten file with length > 255 to have basename length 20 chars', () => { var f1 = _.repeat('a', 500); should(f1.length).be.eql(500); should(shortenFilename(f1).split('.')[0].length).be.eql(20); @@ -193,34 +195,34 @@ describe('Utils', function () { }); }); - describe('#isUriSchemaSupported', function() { - it('should return false for mailto:', function() { + describe('#isUriSchemaSupported', () => { + it('should return false for mailto:', () => { should(isUriSchemaSupported('mailto:test@test.com')).be.eql(false); }); - it('should return false for javascript:', function() { + it('should return false for javascript:', () => { should(isUriSchemaSupported('javascript:alert("Hi!")')).be.eql(false); }); - it('should return false for skype:', function() { + it('should return false for skype:', () => { should(isUriSchemaSupported('skype:skype_name?action')).be.eql(false); }); - it('should return true for http:', function() { + it('should return true for http:', () => { should(isUriSchemaSupported('http://example.com')).be.eql(true); }); - it('should return true for https:', function() { + it('should return true for https:', () => { should(isUriSchemaSupported('https://example.com')).be.eql(true); }); - it('should return true for relative paths', function() { + it('should return true for relative paths', () => { 
should(isUriSchemaSupported('index.html')).be.eql(true); }); }); describe('#urlsEqual', () => { - it('should return false for /path and /path/', function() { + it('should return false for /path and /path/', () => { should(urlsEqual('http://example.com/path', 'http://example.com/path/')).be.eql(false); }); }); @@ -231,4 +233,14 @@ describe('Utils', function () { should(normalizeUrl(malformedUrl)).be.eql(malformedUrl); }); }); + + describe('#exists', () => { + it('current test file should exist', async () => { + should(await exists(fileURLToPath(import.meta.url))).be.true(); + }); + + it('random gibberish shouldn\'t exist', async () => { + should(await exists('/wfneiwfueifnw.djf')).be.false(); + }); + }); }); diff --git a/test/utils/assertions.js b/test/utils/assertions.js index 37925a0f..193e3578 100644 --- a/test/utils/assertions.js +++ b/test/utils/assertions.js @@ -1,11 +1,45 @@ import _ from 'lodash'; import path from 'path'; import should from 'should'; -should.Assertion.add('equalFileSystemPath', function (value, description) { +import fs from 'fs/promises'; + +should.Assertion.add('equalFileSystemPath', function(value, description) { value = path.normalize(value); - if (process.platform == 'win32' && _.startsWith(value, path.sep)) { + if (process.platform === 'win32' && _.startsWith(value, path.sep)) { value = __dirname.split(path.sep)[0] + value; } this.params = { operator: 'to be', expected: value, message: description}; this.obj.should.equal(value, description); }); + +should.Assertion.add('fileExists', async function(value, description) { + let exists = false; + + try { + exists = (await fs.stat(this.obj)).isFile(); + } catch (err) { + // We don't care about this error. 
+ } + + if (value === undefined) { + value = true; + } + + exists.should.eql(value, description); +}); + +should.Assertion.add('dirExists', async function(value, description) { + let exists = false; + + try { + exists = (await fs.stat(this.obj)).isDirectory(); + } catch (err) { + // We don't care about this error. + } + + if (value === undefined) { + value = true; + } + + exists.should.eql(value, description); +});