follow sitemaps
yujiosaka committed Jan 14, 2018
1 parent 2f2032a commit ff0b2d1
Showing 4 changed files with 138 additions and 21 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -18,6 +18,7 @@ Powered by Headless Chrome, the crawler provides [simple APIs](#api-reference) t
* Emulate devices and user agents
* Priority queue for crawling efficiency
* Obey [robots.txt](https://developers.google.com/search/reference/robots_txt)
* Follow [sitemap.xml](https://www.sitemaps.org/)
* [Promise] support

## Getting Started
@@ -109,6 +110,8 @@ NODE_PATH=../ node examples/priority-queue.js
* [event: 'requestfinished'](#event-requestfinished)
* [event: 'requestretried'](#event-requestretried)
* [event: 'requestfailed'](#event-requestfailed)
* [event: 'robotstxtrequestfailed'](#event-robotstxtrequestfailed)
* [event: 'sitemapxmlrequestfailed'](#event-sitemapxmlrequestfailed)
* [event: 'maxdepthreached'](#event-maxdepthreached)
* [event: 'maxrequestreached'](#event-maxrequestreached)
* [event: 'disconnected'](#event-disconnected)
@@ -237,6 +240,7 @@ url, allowedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, d
* `priority` <[number]> Basic priority of queues, defaults to `1`. Queues with a larger priority number are preferred.
* `skipDuplicates` <[boolean]> Whether to skip duplicate requests, defaults to `null`. A request is considered a duplicate if its `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
* `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), defaults to `true`.
* `followSitemapXml` <[boolean]> Whether to use [sitemap.xml](https://www.sitemaps.org/) to discover URLs to crawl, defaults to `false` (see the example below this list).
* `allowedDomains` <[Array]<[string]>> List of domains the crawler is allowed to request. `www.example.com` will be allowed if `example.com` is listed.
* `delay` <[number]> Number of milliseconds to wait after each request, defaults to `0`. When `delay` is set, the `maxConcurrency` option must be `1`.
* `retryCount` <[number]> Maximum number of retries when a request fails, defaults to `3`.
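
For illustration, a minimal sketch of enabling the new option (the target URL and the `onSuccess` callback are illustrative assumptions, not part of this change):

```js
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({
  followSitemapXml: true, // also queue URLs listed in sitemap.xml (discovered via the site's robots.txt)
  onSuccess: (result => {
    console.log(`Crawled ${result.options.url}.`);
  }),
})
  .then(crawler => {
    crawler.queue('https://example.com/');
    crawler.onIdle()
      .then(() => crawler.close());
  });
```
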
@@ -348,6 +352,18 @@ Emitted when a request is retried.

Emitted when a request fails.

#### event: 'robotstxtrequestfailed'

* `error` <[Error]>

Emitted when a request to [robots.txt](https://developers.google.com/search/reference/robots_txt) fails.

#### event: 'sitemapxmlrequestfailed'

* `error` <[Error]>

Emitted when a request to [sitemap.xml](https://www.sitemaps.org/) fails.
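
Both failure events are emitted on the crawler instance, so they can be logged instead of failing silently. A minimal sketch (the launch options and target URL are illustrative):

```js
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({ followSitemapXml: true })
  .then(crawler => {
    // robots.txt could not be fetched; the crawler falls back to an empty robots.txt
    crawler.on('robotstxtrequestfailed', error => console.error('robots.txt request failed:', error));
    // sitemap.xml could not be fetched; its URLs are simply not queued
    crawler.on('sitemapxmlrequestfailed', error => console.error('sitemap.xml request failed:', error));
    crawler.queue('https://example.com/');
    return crawler.onIdle().then(() => crawler.close());
  });
```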

#### event: 'maxdepthreached'

* `options` <[Object]>
68 changes: 47 additions & 21 deletions lib/hccrawler.js
@@ -4,6 +4,7 @@ const {
pick,
omit,
extend,
map,
each,
reduce,
includes,
@@ -20,6 +21,7 @@ const {
delay,
generateKey,
getRobotsUrl,
getSitemapUrls,
tracePublicAPI,
} = require('./helper');
const PriorityQueue = require('./priority-queue');
@@ -120,6 +122,7 @@ class HCCrawler extends EventEmitter {
persistCache: false,
skipDuplicates: true,
obeyRobotsTxt: true,
followSitemapXml: false,
screenshot: null,
}, options);
this._cache = options.cache || new SessionCache();
@@ -287,7 +290,8 @@ class HCCrawler extends EventEmitter {
this.emit(HCCrawler.Events.RequestSkipped, options);
return Promise.resolve();
}
- return this._request(options, depth);
+ return this._followSitemap(options, depth)
+ .then(() => this._request(options, depth));
});
}

@@ -315,8 +319,8 @@ class HCCrawler extends EventEmitter {
return this._success(res)
.then(() => {
this._exportLine(res);
this._followLinks(res.links, options, depth);
this._checkRequestCount();
this._followLinks(res.links, options, depth);
})
.then(() => crawler.close())
.then(() => delay(options.delay));
@@ -346,37 +350,58 @@
*/
_checkAllowedRobots(options) {
if (!options.obeyRobotsTxt) return Promise.resolve(true);
- const robotsUrl = getRobotsUrl(options.url);
return Promise.all([
- this._getRobotsTxt(robotsUrl),
+ this._getRobot(options),
this._getUserAgent(options),
])
- .then(([robotsTxt, userAgent]) => {
- const robot = robotsParser(robotsUrl, robotsTxt);
- return robot.isAllowed(options.url, userAgent);
- });
+ .then(([robot, userAgent]) => robot.isAllowed(options.url, userAgent));
}

/**
* @param {!Object} options
* @param {!number} depth
* @return {!Promise}
* @private
*/
_followSitemap(options, depth) {
if (!options.followSitemapXml) return Promise.resolve();
return this._getRobot(options)
.then(robot => {
const sitemapUrls = robot.getSitemaps();
return Promise.resolve(map(sitemapUrls, sitemapUrl => (
request(sitemapUrl)
.then(xml => {
const urls = getSitemapUrls(xml);
each(urls, url => {
const _options = extend({}, options, { url });
this._queue.push(_options, depth, _options.priority);
});
})
.catch(error => void this.emit(HCCrawler.Events.SitemapXmlRequestFailed, error))
)));
});
}

/**
- * @param {!string} url
- * @return {!Promise<?string>}
+ * @param {!Object} options
+ * @return {!Promise<!Robots>}
* @private
*/
- _getRobotsTxt(url) {
- return this._cache.get(url)
+ _getRobot(options) {
+ const robotsUrl = getRobotsUrl(options.url);
+ return this._cache.get(robotsUrl)
.then(cachedTxt => {
if (isString(cachedTxt)) return cachedTxt;
- return request(url)
+ return request(robotsUrl)
.then(txt => (
- this._cache.set(url, txt)
- .then(() => txt)
+ this._cache.set(robotsUrl, txt).then(() => txt)
))
.catch(error => {
this.emit(HCCrawler.Events.RobotsTxtRequestFailed, error);
- return this._cache.set(url, EMPTY_TXT)
- .then(() => EMPTY_TXT);
+ return this._cache.set(robotsUrl, EMPTY_TXT).then(() => EMPTY_TXT);
});
- });
+ })
+ .then(robotsTxt => robotsParser(robotsUrl, robotsTxt));
}

/**
@@ -468,18 +493,18 @@
}

/**
- * @param {!Array<!string>} links
+ * @param {!Array<!string>} urls
* @param {!Object} options
* @param {!number} depth
* @private
*/
- _followLinks(links, options, depth) {
+ _followLinks(urls, options, depth) {
if (depth >= options.maxDepth) {
this.emit(HCCrawler.Events.MaxDepthReached);
return;
}
- each(links, link => {
- const _options = extend({}, options, { url: link });
+ each(urls, url => {
+ const _options = extend({}, options, { url });
this._queue.push(_options, depth + 1, _options.priority);
});
}
@@ -552,6 +577,7 @@ HCCrawler.Events = {
RequestRetried: 'requestretried',
RequestFailed: 'requestfailed',
RobotsTxtRequestFailed: 'robotstxtrequestfailed',
SitemapXmlRequestFailed: 'sitemapxmlrequestfailed',
MaxDepthReached: 'maxdepthreached',
MaxRequestReached: 'maxrequestreached',
Disconnected: 'disconnected',
26 changes: 26 additions & 0 deletions lib/helper.js
@@ -121,6 +121,32 @@ class Helper {
return first;
}

/**
* @param {!string} sitemapXml
* @return {!Array<!string>}
*/
static getSitemapUrls(sitemapXml) {
const urls = [];
sitemapXml.replace(/<loc>([^<]+)<\/loc>/g, (_, url) => {
const unescapedUrl = Helper.unescape(url);
urls.push(unescapedUrl);
});
return urls;
}

/**
* @param {!string} src
* @return {!string}
*/
static unescape(src) {
return src
.replace(/&amp;/g, '&')
.replace(/&apos;/g, "'")
.replace(/&quot;/g, '"')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>');
}

/**
* @param {!Object} classType
*/
49 changes: 49 additions & 0 deletions test/helper.test.js
@@ -9,6 +9,8 @@ const {
escapeQuotes,
getRobotsUrl,
lowerBound,
getSitemapUrls,
unescape,
stringifyArgument,
debugConsole,
debugDialog,
@@ -215,6 +217,53 @@
});
});

describe('Helper.getSitemapUrls', () => {
it('returns empty array for empty xml', () => {
const actual = getSitemapUrls('');
const expected = [];
assert.deepEqual(actual, expected);
});

it('returns empty array for no urls', () => {
const actual = getSitemapUrls(`
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>
`);
const expected = [];
assert.deepEqual(actual, expected);
});

it('returns a url', () => {
const actual = getSitemapUrls(`
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://github.com/yujiosaka/headless-chrome-crawler/issues</loc></url>
</urlset>
`);
const expected = ['https://github.com/yujiosaka/headless-chrome-crawler/issues'];
assert.deepEqual(actual, expected);
});
});

describe('Helper.unescape', () => {
it('returns empty string for empty argument', () => {
const actual = unescape('');
const expected = '';
assert.equal(actual, expected);
});

it('returns the same string for non-escaped argument', () => {
const actual = unescape('https://github.com/yujiosaka/headless-chrome-crawler/issues');
const expected = 'https://github.com/yujiosaka/headless-chrome-crawler/issues';
assert.equal(actual, expected);
});

it('returns the unescaped argument', () => {
const actual = unescape('&lt;loc&gt;https://github.com/yujiosaka/headless-chrome-crawler/issues?a=1&amp;b=2&lt;/loc&gt;');
const expected = '<loc>https://github.com/yujiosaka/headless-chrome-crawler/issues?a=1&b=2</loc>';
assert.equal(actual, expected);
});
});

describe('Helper.stringifyArgument', () => {
it('stringifies undefined', () => {
const actual = stringifyArgument(undefined);
