follow sitemaps
yujiosaka committed Jan 14, 2018
1 parent 2f2032a commit ff0b2d1
Showing 4 changed files with 138 additions and 21 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -18,6 +18,7 @@ Powered by Headless Chrome, the crawler provides [simple APIs](#api-reference) t
* Emulate devices and user agents
* Priority queue for crawling efficiency
* Obey [robots.txt](https://developers.google.com/search/reference/robots_txt)
* Follow [sitemap.xml](https://www.sitemaps.org/)
* [Promise] support

## Getting Started
@@ -109,6 +110,8 @@ NODE_PATH=../ node examples/priority-queue.js
* [event: 'requestfinished'](#event-requestfinished)
* [event: 'requestretried'](#event-requestretried)
* [event: 'requestfailed'](#event-requestfailed)
* [event: 'robotstxtrequestfailed'](#event-robotstxtrequestfailed)
* [event: 'sitemapxmlrequestfailed'](#event-sitemapxmlrequestfailed)
* [event: 'maxdepthreached'](#event-maxdepthreached)
* [event: 'maxrequestreached'](#event-maxrequestreached)
* [event: 'disconnected'](#event-disconnected)
@@ -237,6 +240,7 @@ url, allowedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, d
* `priority` <[number]> Basic priority of queues, defaults to `1`. Queues with a larger priority number are preferred.
* `skipDuplicates` <[boolean]> Whether to skip duplicate requests, defaults to `null`. A request is considered a duplicate if its `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
* `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), defaults to `true`.
* `followSitemapXml` <[boolean]> Whether to use [sitemap.xml](https://www.sitemaps.org/) to discover URLs to crawl, defaults to `false` (see the example below this list).
* `allowedDomains` <[Array]<[string]>> List of domains the crawler is allowed to request. `www.example.com` will be allowed if `example.com` is listed.
* `delay` <[number]> Number of milliseconds to wait after each request, defaults to `0`. When `delay` is set, the `maxConcurrency` option must be `1`.
* `retryCount` <[number]> Maximum number of retries when a request fails, defaults to `3`.
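
For illustration, a minimal sketch of enabling the new option (the target URL and the `onSuccess` callback are illustrative assumptions, not part of this change):

```js
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({
  followSitemapXml: true, // also queue URLs listed in sitemap.xml (discovered via the site's robots.txt)
  onSuccess: (result => {
    console.log(`Crawled ${result.options.url}.`);
  }),
})
  .then(crawler => {
    crawler.queue('https://example.com/');
    crawler.onIdle()
      .then(() => crawler.close());
  });
```
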
@@ -348,6 +352,18 @@ Emitted when a request is retried.

Emitted when a request fails.

#### event: 'robotstxtrequestfailed'

* `error` <[Error]>

Emitted when a request to [robots.txt](https://developers.google.com/search/reference/robots_txt) fails.

#### event: 'sitemapxmlrequestfailed'

* `error` <[Error]>

Emitted when a request to [sitemap.xml](https://www.sitemaps.org/) fails.
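
Both failure events are emitted on the crawler instance, so they can be logged instead of failing silently. A minimal sketch (the launch options and target URL are illustrative):

```js
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({ followSitemapXml: true })
  .then(crawler => {
    // robots.txt could not be fetched; the crawler falls back to an empty robots.txt
    crawler.on('robotstxtrequestfailed', error => console.error('robots.txt request failed:', error));
    // sitemap.xml could not be fetched; its URLs are simply not queued
    crawler.on('sitemapxmlrequestfailed', error => console.error('sitemap.xml request failed:', error));
    crawler.queue('https://example.com/');
    return crawler.onIdle().then(() => crawler.close());
  });
```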

#### event: 'maxdepthreached'

* `options` <[Object]>
68 changes: 47 additions & 21 deletions lib/hccrawler.js
@@ -4,6 +4,7 @@ const {
pick,
omit,
extend,
map,
each,
reduce,
includes,
@@ -20,6 +21,7 @@ const {
delay,
generateKey,
getRobotsUrl,
getSitemapUrls,
tracePublicAPI,
} = require('./helper');
const PriorityQueue = require('./priority-queue');
@@ -120,6 +122,7 @@ class HCCrawler extends EventEmitter {
persistCache: false,
skipDuplicates: true,
obeyRobotsTxt: true,
followSitemapXml: false,
screenshot: null,
}, options);
this._cache = options.cache || new SessionCache();
@@ -287,7 +290,8 @@ class HCCrawler extends EventEmitter {
this.emit(HCCrawler.Events.RequestSkipped, options);
return Promise.resolve();
}
- return this._request(options, depth);
+ return this._followSitemap(options, depth)
+ .then(() => this._request(options, depth));
});
}

@@ -315,8 +319,8 @@ class HCCrawler extends EventEmitter {
return this._success(res)
.then(() => {
this._exportLine(res);
this._followLinks(res.links, options, depth);
this._checkRequestCount();
this._followLinks(res.links, options, depth);
})
.then(() => crawler.close())
.then(() => delay(options.delay));
@@ -346,37 +350,58 @@
*/
_checkAllowedRobots(options) {
if (!options.obeyRobotsTxt) return Promise.resolve(true);
- const robotsUrl = getRobotsUrl(options.url);
return Promise.all([
- this._getRobotsTxt(robotsUrl),
+ this._getRobot(options),
this._getUserAgent(options),
])
- .then(([robotsTxt, userAgent]) => {
- const robot = robotsParser(robotsUrl, robotsTxt);
- return robot.isAllowed(options.url, userAgent);
- });
+ .then(([robot, userAgent]) => robot.isAllowed(options.url, userAgent));
}

/**
* @param {!Object} options
* @param {!number} depth
* @return {!Promise}
* @private
*/
_followSitemap(options, depth) {
if (!options.followSitemapXml) return Promise.resolve();
return this._getRobot(options)
.then(robot => {
const sitemapUrls = robot.getSitemaps();
return Promise.resolve(map(sitemapUrls, sitemapUrl => (
request(sitemapUrl)
.then(xml => {
const urls = getSitemapUrls(xml);
each(urls, url => {
const _options = extend({}, options, { url });
this._queue.push(_options, depth, _options.priority);
});
})
.catch(error => void this.emit(HCCrawler.Events.SitemapXmlRequestFailed, error))
)));
});
}

/**
- * @param {!string} url
- * @return {!Promise<?string>}
+ * @param {!Object} options
+ * @return {!Promise<!Robots>}
* @private
*/
- _getRobotsTxt(url) {
- return this._cache.get(url)
+ _getRobot(options) {
+ const robotsUrl = getRobotsUrl(options.url);
+ return this._cache.get(robotsUrl)
.then(cachedTxt => {
if (isString(cachedTxt)) return cachedTxt;
- return request(url)
+ return request(robotsUrl)
.then(txt => (
- this._cache.set(url, txt)
- .then(() => txt)
+ this._cache.set(robotsUrl, txt).then(() => txt)
))
.catch(error => {
this.emit(HCCrawler.Events.RobotsTxtRequestFailed, error);
- return this._cache.set(url, EMPTY_TXT)
- .then(() => EMPTY_TXT);
+ return this._cache.set(robotsUrl, EMPTY_TXT).then(() => EMPTY_TXT);
});
- });
+ })
+ .then(robotsTxt => robotsParser(robotsUrl, robotsTxt));
}

/**
@@ -468,18 +493,18 @@
}

/**
- * @param {!Array<!string>} links
+ * @param {!Array<!string>} urls
* @param {!Object} options
* @param {!number} depth
* @private
*/
- _followLinks(links, options, depth) {
+ _followLinks(urls, options, depth) {
if (depth >= options.maxDepth) {
this.emit(HCCrawler.Events.MaxDepthReached);
return;
}
- each(links, link => {
- const _options = extend({}, options, { url: link });
+ each(urls, url => {
+ const _options = extend({}, options, { url });
this._queue.push(_options, depth + 1, _options.priority);
});
}
@@ -552,6 +577,7 @@ HCCrawler.Events = {
RequestRetried: 'requestretried',
RequestFailed: 'requestfailed',
RobotsTxtRequestFailed: 'robotstxtrequestfailed',
SitemapXmlRequestFailed: 'sitemapxmlrequestfailed',
MaxDepthReached: 'maxdepthreached',
MaxRequestReached: 'maxrequestreached',
Disconnected: 'disconnected',
26 changes: 26 additions & 0 deletions lib/helper.js
@@ -121,6 +121,32 @@ class Helper {
return first;
}

/**
* @param {!string} sitemapXml
* @return {!Array<!string>}
*/
static getSitemapUrls(sitemapXml) {
const urls = [];
sitemapXml.replace(/<loc>([^<]+)<\/loc>/g, (_, url) => {
const unescapedUrl = Helper.unescape(url);
urls.push(unescapedUrl);
});
return urls;
}

/**
* @param {!string} src
* @return {!string}
*/
static unescape(src) {
return src
.replace(/&amp;/g, '&')
.replace(/&apos;/g, "'")
.replace(/&quot;/g, '"')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>');
}

/**
* @param {!Object} classType
*/
49 changes: 49 additions & 0 deletions test/helper.test.js
@@ -9,6 +9,8 @@ const {
escapeQuotes,
getRobotsUrl,
lowerBound,
getSitemapUrls,
unescape,
stringifyArgument,
debugConsole,
debugDialog,
@@ -215,6 +217,53 @@
});
});

describe('Helper.getSitemapUrls', () => {
it('returns empty array for empty xml', () => {
const actual = getSitemapUrls('');
const expected = [];
assert.deepEqual(actual, expected);
});

it('returns empty array for no urls', () => {
const actual = getSitemapUrls(`
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>
`);
const expected = [];
assert.deepEqual(actual, expected);
});

it('returns a url', () => {
const actual = getSitemapUrls(`
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://github.com/yujiosaka/headless-chrome-crawler/issues</loc></url>
</urlset>
`);
const expected = ['https://github.com/yujiosaka/headless-chrome-crawler/issues'];
assert.deepEqual(actual, expected);
});
});

describe('Helper.unescape', () => {
it('returns empty string for empty argument', () => {
const actual = unescape('');
const expected = '';
assert.equal(actual, expected);
});

it('returns the same string for non-escaped argument', () => {
const actual = unescape('https://github.com/yujiosaka/headless-chrome-crawler/issues');
const expected = 'https://github.com/yujiosaka/headless-chrome-crawler/issues';
assert.equal(actual, expected);
});

it('returns the unescaped argument', () => {
const actual = unescape('&lt;loc&gt;https://github.com/yujiosaka/headless-chrome-crawler/issues?a=1&amp;b=2&lt;/loc&gt;');
const expected = '<loc>https://github.com/yujiosaka/headless-chrome-crawler/issues?a=1&b=2</loc>';
assert.equal(actual, expected);
});
});

describe('Helper.stringifyArgument', () => {
it('stringifies undefined', () => {
const actual = stringifyArgument(undefined);
