Scrapyard makes scraping websites easy.
JavaScript
Latest commit 9f7703b Dec 15, 2015 @yetzt package overhaul
Failed to load latest commit information.
package.json
readme.md
scrapyard.js

readme.md

Scrapyard

Scrapyard makes scraping websites easy. It's a wrapper for most of the things you need, comes with optional caching and retries, and opens as many connections as you like.

Installation

npm install scrapyard

Usage

var scrapyard = require("scrapyard");
var scraper = new scrapyard({
    debug: true,
    retries: 5,
    connections: 10,
    cache: './storage', 
    bestbefore: "5min"
});
  • retries number of times the scraper attempts to fetch the url before giving up. default: 5
  • connections number of concurrent connections a scraper will make. Setting this too high could be considered a DDoS attack, so be polite and keep it reasonable.
  • cache is a folder, where scraped contents are cached. by default caching is off.
  • bestbefore is how long a cached entry stays valid: either an integer number of milliseconds or a duration string (e.g. "5min"). When set to 0, cached entries are valid forever.

Call

scraper(options, callback);

or simply

scraper(url, callback);

The first argument can be either a url string or an options object. url is the only required option.

  • url is a string containing the HTTP URL
  • type is either 'html', 'xml', 'json' or 'raw' (default: 'html')
  • method is the HTTP method (default: 'GET')
  • form is an object containing your formdata
  • encoding is passed to http.setEncoding() (default: 'binary')
  • callback(err, data) is the callback method

Although scrapyard has only been tested with these 6 options, you can try setting any other option supported by the request module.

Examples

var scrapyard = require("scrapyard");
var scraper = new scrapyard({
    cache: './storage', 
    debug: true,
    timeout: 300000,
    retries: 5,
    connections: 10
});

// html, passes you a jquery-like `cheerio` object
scraper('http://example.org/test.html', function(err, $) {
    if (err) return console.error(err);
    console.log($('h1').text());
});

// post something
scraper({
    url: 'http://example.org/test.html',
    type: 'html',
    encoding: 'binary',
    method: 'POST',
    form: {key1: 'value1', key2: 'value2'}
}, function(err, $) {
    if (err) return console.error(err);
    console.log($('h1').text());
});

// xml, converts xml to a javascript object with `xml2js`
scraper({
    url: 'http://example.org/test.xml',
    type: 'xml',
    encoding: 'utf8'
}, function(err, xml) {
    if (err) return console.error(err);
    console.log(xml);
});

// json, as delivered by `json.stringify`
scraper({
    url: 'http://example.org/test.json',
    type: 'json',
}, function(err, json){
    if (err) return console.error(err);
    console.log(json);
});

// raw, just pass on whatever the webserver spits out
scraper({
    url: 'http://example.org/test.bin',
    type: 'raw',
}, function(err, data){
    if (err) return console.error(err);
    console.log(data);
});

Tor

It's possible to use scrapyard with Tor using the socks5-http-client module:

var scrapyard = require("scrapyard");
var scraper = scrapyard();
var Agent = require('socks5-http-client/lib/Agent');

scraper({
    url: "http://freepress3xxs3hk.onion/about",
    headers: {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0"
    },
    agentClass: Agent,
    agentOptions: {
        socksHost: 'localhost',
        socksPort: 9050
    },
    method: "GET",
    type: "html",
    encoding: "utf-8"
}, function(err, $){
    if (err) return console.log(err);
    $(".content p").each(function(){
        console.log($(this).text());
    });
});