forked from jeresig/env-js
-
Notifications
You must be signed in to change notification settings - Fork 75
/
env.robot.js
95 lines (82 loc) · 2.51 KB
/
env.robot.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/**
* @author thatcher
*/
load('dist/env.rhino.js');
load('plugins/jquery.js');
/**
 * Scrape the currently-loaded document into a search-index record,
 * POST it to the local REST index service, and collect any new local
 * links into the shared crawl queue.
 *
 * @param {String} url   address of the document currently loaded
 * @param {Array}  links accumulator of hrefs still to crawl (mutated in place)
 */
function scrape(url, links){
    // scrape text from current document which we will
    // assign weights to in our search index
    // NOTE: jQuery .attr() returns undefined when the element/attribute
    // is absent, so guard before calling String methods on the results.
    var keywords = $('meta[name=keywords]').attr('content');
    var data = {
        $id: encodeURIComponent(url),
        url: url,
        full_text: $(document.body).text(),
        title: document.title,
        headings: $('h1, h2, h3, h4, h5, h6').text(),
        description: $('meta[name=description]').attr('content') || '',
        // previously keywords.split(',') threw a TypeError on any page
        // without a keywords meta tag
        keywords: keywords ? keywords.split(',') : []
    };
    // find all the relevant links, but don't include any we
    // already have in our link array.
    // The original pattern /^(\s)*http|#/ had an alternation-precedence
    // bug: it also excluded local hrefs merely *containing* '#'
    // (e.g. 'page.html#sec'); we only want to skip absolute http urls
    // and pure fragment anchors.
    $('a[href]').each(function(){
        var href = $(this).attr('href');
        if($.inArray(href, links) == -1 && !href.match(/^\s*(http|#)/)){
            //we only want to crawl local links
            links.push(href);
        }
    });
    // save the record to our index (synchronous so the crawl loop in the
    // ready handler sees a fully-indexed page before moving on)
    $.ajax({
        url:'http://localhost:8080/rest/index/'+data.$id,
        contentType:'application/json',
        dataType:'json',
        type: 'post',
        async: false,
        data: JSON.stringify(data),
        processData: false,
        success: function(){
            console.log('indexed document %s', url);
        }
    });
}
/**
 * Entry point: once the DOM of the start page is ready, rebuild the
 * search index and crawl the site beginning with the current document.
 */
$(function(){
    // Helper: issue a synchronous JSON request against the index
    // service root and log a message on success. The delete and put
    // calls differ only in HTTP verb and log text.
    var indexRequest = function(verb, message){
        $.ajax({
            url:'http://localhost:8080/rest/index/',
            contentType:'application/json',
            dataType:'json',
            type: verb,
            async: false,
            success: function(){
                console.log(message);
            }
        });
    };
    // delete the index to start fresh
    indexRequest('delete', 'deleted search index');
    // create the search index we will populate with
    // our simple crawl
    indexRequest('put', 'created search index');
    // accumulator of relevant local links discovered while scraping
    var links = [];
    // index this document first
    scrape(document.location.toString(), links);
    // now crawl our links; scrape() appends newly-found hrefs to
    // `links` while we iterate, so the loop also covers pages
    // discovered mid-crawl
    var i = 0;
    while(i < links.length){
        var link = links[i];
        try{
            // replaces this document with the document
            // from the link
            document.location = Envjs.uri(link);
            scrape(link, links);
        }catch(e){
            console.log('failed to load %s \n %s', link, e);
        }
        i++;
    }
});
// Kick off the crawl: navigating to the root page loads it into the
// env.js DOM, which fires the document-ready handler registered above.
window.location = 'http://localhost:8080/';