-
-
Notifications
You must be signed in to change notification settings - Fork 292
Closed
Labels
Description
Configuration
version: website-scraper@4.0.1
options:
website-scraper:info init with options {
defaultFilename: 'index.html',
prettifyUrls: false,
sources: [
{ selector: 'style' },
{ selector: '[style]', attr: 'style' },
{ selector: 'img', attr: 'src' },
{ selector: 'img', attr: 'srcset' },
{ selector: 'input', attr: 'src' },
{ selector: 'object', attr: 'data' },
{ selector: 'embed', attr: 'src' },
{ selector: 'param[name="movie"]', attr: 'value' },
{ selector: 'script', attr: 'src' },
{ selector: 'link[rel="stylesheet"]', attr: 'href' },
{ selector: 'link[rel*="icon"]', attr: 'href' },
{ selector: 'svg *[xlink\\:href]', attr: 'xlink:href' },
{ selector: 'svg *[href]', attr: 'href' },
{ selector: 'picture source', attr: 'srcset' },
{ selector: 'meta[property="og\\:image"]', attr: 'content' },
{ selector: 'meta[property="og\\:image\\:url"]', attr: 'content' },
{
selector: 'meta[property="og\\:image\\:secure_url"]',
attr: 'content'
},
{ selector: 'meta[property="og\\:audio"]', attr: 'content' },
{ selector: 'meta[property="og\\:audio\\:url"]', attr: 'content' },
{
selector: 'meta[property="og\\:audio\\:secure_url"]',
attr: 'content'
},
{ selector: 'meta[property="og\\:video"]', attr: 'content' },
{ selector: 'meta[property="og\\:video\\:url"]', attr: 'content' },
{
selector: 'meta[property="og\\:video\\:secure_url"]',
attr: 'content'
},
{ selector: 'video', attr: 'src' },
{ selector: 'video source', attr: 'src' },
{ selector: 'video track', attr: 'src' },
{ selector: 'audio', attr: 'src' },
{ selector: 'audio source', attr: 'src' },
{ selector: 'audio track', attr: 'src' },
{ selector: 'frame', attr: 'src' },
{ selector: 'iframe', attr: 'src' }
],
subdirectories: [
{ directory: 'images', extensions: [Array] },
{ directory: 'js', extensions: [Array] },
{ directory: 'css', extensions: [Array] },
{ directory: 'media', extensions: [Array] },
{ directory: 'fonts', extensions: [Array] }
],
request: { encoding: 'binary', strictSSL: false, jar: true, gzip: true },
requestConcurrency: Infinity,
urlFilter: null,
recursive: false,
maxRecursiveDepth: null,
maxDepth: null,
ignoreErrors: false,
urls: [
{
url: 'http://some-url.example.com?printable=yes',
filename: 'index.html'
}
],
directory: '/tmp/scraping/Review_of_AfL_and_lesson_pacing',
plugins: [ MyBeforeRequestPlugin {} ],
recursiveSources: [ { selector: 'a', attr: 'href' } ]
}Description
I'm using the beforeRequest hook. I defined a customer beforeRequest hook. With the hook defined, the scraper stops scraping resources altogether.
My plugin class definition:
class MyBeforeRequestPlugin {
apply(registerAction) {
registerAction('beforeRequest', ({resource, requestOptions}) => {
if (resource.filename.indexOf('.html') != -1) {
console.log('found html resource, encoding with utf-8', resource.filename);
requestOptions.encoding = 'utf8';
} else {
requestOptions.encoding = 'binary';
}
return { requestOptions, resource };
});
}
}The log reveals that the filename is null - after digging into the code, that might be a red herring as the filename gets set after the url is fetched:
website-scraper:warn failed to request resource { url: "http://oer.educ.cam.ac.uk/w/images/thumb/4/44/Eness_IMG_0785_square.jpg/80px-Eness_IMG_0785_square.jpg", filename: "undefined", depth: 1 } +12ms
Expected behavior: I expect the scraper to still scrape all children resources.
Actual behavior: The scraper does not scrape children resources.