
improved this example to demonstrate a simple web crawler with envjs

thatcher committed Mar 4, 2011
1 parent 1a94a42 commit ac0812880433e0f4580400704bd1b848e022cfb2
Showing with 79 additions and 25 deletions.
  1. +79 −25 examples/scrape.js
@@ -1,29 +1,83 @@
-//#!bin/envjs node examples/simple.js
-var testurl = 'http://localhost:8080/';
 /**
  * @author thatcher
  */
 require('plugins/jquery');
 
-var check_urls = function(event){
-    console.log('Sample: document %s loaded in %s', document.title, Date.now()-start);
-    document.removeEventListener('load', check_urls);
-    jQuery('a').each(function(){
-        console.log('a -> %s', $(this).attr('href'));
-    });
-    jQuery.ajax({
-        type:'get',
-        url: '/?fo=json',
-        dataType: 'json',
-        success: function(data){
-            console.log('loaded json via ajax %s', data);
-        },
-        error: function(xhr, status, e){
-            console.log('failed to load json via ajax %s %s \n%s',
-                xhr.status, xhr.url, e);
-        }
-    });
-    console.log('loading json via ajax');
-};
+var site = 'http://www.envjs.com/';
+
+console.log('SCRAPING SITE %s', site);
+
+document.location = site;
+
+function scrape(url, links){
+
+    // scrape text from the current document, which we will
+    // assign weights to in our search index
+    var data = {
+        $id: encodeURIComponent(url),
+        url: url,
+        title: document.title,
+        headings: $('h1, h2, h3, h4, h5, h6').map(function(){
+            return $(this).text();
+        }),
+        keywords: $('meta[name=keywords]').attr('content'),
+        links: [],
+        description: $('meta[name=description]').attr('content'),
+        full_text: $(document.body).text()
+    };
+
+    data.keywords = data.keywords ? data.keywords.split(',') : [];
+    $(data.keywords).each(function(i){ data.keywords[i] = $.trim(this); });
+    data.headings = data.headings ? $.makeArray(data.headings) : [];
+    $(data.headings).each(function(i){ data.headings[i] = $.trim(this); });
+    data.full_text = data.full_text ? data.full_text.replace(/\s+/g, ' ') : '';
+
+    // find all the relevant links, but don't include any we
+    // already have in our link array
+    $('a[href]').each(function(){
+        var href = $.trim($(this).attr('href'));
+        data.links.push(Envjs.uri(href));
+        if($.inArray(href, links) == -1 &&
+           $.inArray(Envjs.uri(href), links) == -1 &&
+           !href.match(/^\s*http|#/) &&
+           !href.match(/(\.jar|\.zip|\.tgz|\.gz|\.tar|\.js)\s*$/)){
+            // we only want to crawl local links
+            console.log('ADDING LINK TO LIST: %s', Envjs.uri(href));
+            links.push(href);
+        }
+    });
+
+    // write the record to the console (probably would want to post it to a
+    // restful db like couchdb, mongodb or elasticsearch; see the sketch
+    // after this diff)
+    console.log("SCRAPED DATA: %s", JSON.stringify(data, null, ' '));
+}
 
-document.addEventListener('load', check_urls);
+// create an array which we'll use
+// to store relevant links to crawl
+var links = [site],
+    current_link = 1,
+    page;
+
+var next = function(){
+    // index this document
+    scrape(document.location.toString(), links);
+
+    // now crawl our links
+    if(current_link < links.length){
+        try{
+            // replaces this document with the document
+            // from the link
+            page = Envjs.uri(links[current_link++]);
+            console.log('LOADING PAGE %s', page);
+            document.location = page;
+        }catch(e){
+            console.log('FAILED TO LOAD PAGE %s \n %s', page, e);
+        }
+    }else{
+        console.log('SITE SCRAPE COMPLETE: %s of %s', current_link, links.length);
+    }
+};
 
-var start = Date.now();
-console.log('Sample: loading %s', testurl);
-document.location = testurl;
+// this is basically our initial page load event handler
+$(document).bind('load', next);
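
The comment inside scrape() suggests posting each record to a restful db rather than just logging it. Below is a minimal sketch of that using the jQuery this example already loads; the CouchDB address (localhost:5984) and the database name (scrape_index) are assumptions for illustration, not part of this commit:

function store(data){
    // CouchDB creates a document when JSON is PUT to /<db>/<docid>;
    // the record's $id (the encoded url) makes a natural document id
    jQuery.ajax({
        type: 'PUT',
        url: 'http://localhost:5984/scrape_index/' + data.$id,
        contentType: 'application/json',
        data: JSON.stringify(data),
        success: function(){
            console.log('STORED RECORD FOR %s', data.url);
        },
        error: function(xhr, status, e){
            console.log('FAILED TO STORE %s: %s', data.url, xhr.status);
        }
    });
}

Calling store(data) in place of the final console.log in scrape() would persist each page as it is crawled.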

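The crawler also leans on Envjs.uri() to absolutize each href before queueing it. Its definition is not part of this diff, so the behavior sketched below, resolving a relative link against the current document.location, is an assumption inferred from how scrape() and next() use it:

// assumed behavior of Envjs.uri, inferred from its use above: with
// http://www.envjs.com/ loaded as the current document, a relative
// href should come back as an absolute url (hypothetical link/output)
var absolute = Envjs.uri('docs/guides.html');
console.log('resolved: %s', absolute); // e.g. http://www.envjs.com/docs/guides.html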