Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

update

  • Loading branch information...
commit 95c82d892f0a19abd42bbe0269d73f931f796a2f 1 parent 15cae31
@tokuhirom authored
View
1  .gitignore
@@ -1,3 +1,4 @@
developer.mozilla.org/
converted/
node_modules/
+docs.db
View
6 README.mkdn
@@ -1 +1,7 @@
This is a documentation viewer for MDN.
+
+Concept
+-------
+
+ * No server process required.
+
View
1  TODO
@@ -0,0 +1 @@
+- no nquery
View
37 convert.js
@@ -1,30 +1,35 @@
var fs = require('fs'),
- glob = require('glob'),
- nquery = require('nquery'),
util = require('util'),
path = require('path'),
- mkdirp = require('mkdirp')
- libxml = require('libxmljs'),
+ libxml = require('libxmlext'),
+ DB = require('./lib/DB').DB,
+ Url = require('url'),
+ mkdirp = require('mkdirp'),
+ querystring = require('querystring'),
+ crypto = require('crypto'),
undefined
;
-glob.globSync('developer.mozilla.org/**/*', glob.GLOB_STAR).filter(function (path) {
- return !fs.statSync(path).isDirectory();
-}).forEach(function (fname) {
- var src = fs.readFileSync(fname, 'utf-8');
+var srcdb = new DB('docs.db');
+
+mkdirp.sync('converted', 0744);
+
+srcdb.listUrls().forEach(function (url) {
+ var src = srcdb.fetch(url);
var doc = libxml.parseHtmlString(src);
- ['//script', '//head', '//noscript', '//header', '//*[@id="nav-toolbar"]', '//*[contains(concat(" ",normalize-space(@class)," "), " page-watch ")]', '//footer', '//*[@id="sessionMsg"]', '//*[@id="pageToc"]', '//*[@id="article-nav"]', '//*[@id="page-buttons"]'].forEach(function (xpath) {
- doc.find(xpath).forEach(function (e) {
- e.remove();
- });
+ // q.find('script, header, #nav-toolbar, .page-watch, footer, #sessionMsg, head, #pageToc, #article-nav, #page-buttons').remove();
+ doc.search('script, header, #nav-toolbar, .page-watch, footer, #sessionMsg, head, #pageToc, #article-nav, #page-buttons').forEach(function (e) {
+ e.remove();
});
- // q.find('script, header, #nav-toolbar, .page-watch, footer, #sessionMsg, head, #pageToc, #article-nav, #page-buttons').remove();
+ var fname = Url.parse(url).pathname;
- var title = fname.replace(/^developer.mozilla.org\/en\/JavaScript\/Reference\//, '').replace(/^[^/]+\//, '').replace(/\//g, '.').replace(/_/g, / /);
+ var title = fname.replace(/^\/en\/JavaScript\/Reference\//, '').replace(/^[^/]+\//, '').replace(/\//g, '.').replace(/_/g, ' ');
doc.find('//*[@id="title"]').forEach(function (e) { e.text(title) });
- var ofname = fname.replace(/^developer.mozilla.org/, 'converted');
- mkdirp.sync(path.dirname(ofname), 0775);
+ var md5 = crypto.createHash('md5');
+ md5.update(fname);
+ var ofname = 'converted/' + md5.digest('hex');
+ console.log('writing ' + ofname);
fs.writeFileSync(ofname, doc.toString());
});
View
93 crawler.js
@@ -0,0 +1,93 @@
+"use strict";
+
+var libxmlext = require('libxmlext'),
+ fs = require('fs'),
+ util = require('util'),
+ request = require('request'),
+ DB = require('./lib/DB').DB,
+ url_ = require('url'),
+ undefined;
+
+
+function Crawler() { // Crawler state: page store, start URL, queue of pending URLs, and a map of processed URLs.
+ this.db = new DB('docs.db'); // DB is loaded from ./lib/DB; run() uses it to store and re-read page bodies keyed by URL.
+ // this.endpoint = "https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/JSON";
+ this.endpoint = "https://developer.mozilla.org/en/JavaScript/Reference";
+ this.queue = new Array(); // URLs still waiting to be visited.
+ this.queue.push(this.endpoint); // Seed the queue with the start page.
+ this.seen = {}; // url -> true, set by processBody once that URL has been handled.
+ var self = this;
+ setTimeout(function () { // Fires a single time, 1000 ms after construction (setTimeout, not setInterval).
+ console.log(self.queue.length); // Logs the queue length at that moment.
+ }, 1000);
+}
+Crawler.prototype = {
+ run: function () { // Takes one URL off the front of the queue and processes it: from the DB when stored, otherwise via an HTTP request.
+ if (this.queue.length === 0) {
+ return; // Nothing left to visit.
+ }
+
+ var url = this.queue.shift();
+ if (this.seen[url]) {
+ console.log('[seen] ' + url);
+ return; // NOTE(review): this call ends here and does not itself take the next URL off the queue.
+ }
+
+ console.log('[url] ' + url);
+
+ var self = this; // Captured for the request callback below.
+ if (this.db.exists(url)) { // Body already stored: reuse it instead of requesting the page again.
+ var src = this.db.fetch(url);
+ this.processBody(url, src);
+ } else {
+ request(url, function (error, response, body) {
+ if (error) { // Request failed: log it and move on to the next queued URL.
+ console.log(url + " : " + error);
+ self.run();
+ return;
+ }
+ if (response.statusCode !== 200) { // Any status other than 200: log status and body, nothing is stored, move on.
+ console.log(url + " : " + response.statusCode + " : " + body);
+ self.run();
+ return;
+ }
+ self.db.insert(url, body); // Store the body so a later run() for this URL takes the exists() branch.
+ self.processBody(url, body);
+ });
+ }
+ },
+ processBody: function (url, body) { // Parses body as HTML, queues the matching links it contains, marks url as seen, then calls run() once per queued link.
+ var self = this;
+ var doc = libxmlext.parseHtmlString(body);
+ var inserted = 0; // Number of links this call pushed onto the queue.
+ doc.find('//a').forEach(function (a) { // XPath: every a element in the document.
+ // console.log('a: ' + a.toString());
+ var href = a.attr('href');
+ if (!href) { return; } // Anchor without an href attribute: skip it.
+ var nextlink = '' + url_.resolve(url, href.value().replace(/#.*/, '')).toString(); // Drop the fragment part, then resolve against the current page URL.
+ if (self.seen[nextlink]) { // Only processed URLs are filtered here; NOTE(review): a link that is queued but not yet processed can be queued again.
+ console.log('[seen2] ' + nextlink);
+ return;
+ }
+ if (nextlink.match(/\/en\/JavaScript\/Reference\//)) {
+ console.log('[push] ' + nextlink);
+ self.queue.unshift(nextlink); // unshift adds at the front, so this link is taken by the next shift() before older entries.
+ inserted++;
+ } else {
+ // console.log('skip: ' + nextlink);
+ }
+ });
+ this.seen[url] = true; // Marked only after all links of this page were examined.
+ for (var i=0; i<inserted; i++) { // NOTE(review): when inserted is 0 no run() is issued from here, even if the queue still holds URLs; confirm this is intended.
+ this.run();
+ }
+ }
+};
+
+var crawler = new Crawler(); // The constructor queues the start URL.
+crawler.run(); // Begin with the first queued URL.
+
+process.on('exit', function () { // When the process exits, print how many URLs were still in the queue.
+ console.log(crawler.queue.length);
+});
+
View
3,495 index.json
3,494 additions, 1 deletion not shown
View
17 js/jsapi.js
@@ -55,8 +55,11 @@ $(function () {
a.prepend(title);
a.data('path', path);
+ a.data('url', line.url);
+ var url = line.url;
+ console.log(line);
li.click(function () {
- view.loadContent(path);
+ view.loadContent(url);
return false;
});
li.append(a);
@@ -71,18 +74,18 @@ $(function () {
return !!keyword.test(x.title);
});
},
- loadContent: function (path) {
- console.log('load ' + path);
+ loadContent: function (url) {
+ console.log('load ' + url);
var view = this;
// view.iframe.hide(path);
view.mainLoading.show();
$.ajax({
- url: path.replace('developer.mozilla.org/', 'converted/'),
+ url: url,
cache: false,
dataType: 'html'
}).done(function (dat) {
dat = dat.replace(/<html[^>]+><body[^>]+>/, '').replace('</body></html>', '');
- view.currentPath = path;
+ view.currentPath = url;
setTimeout(function () {
view.mainLoading.hide();
view.iframe.html(dat);
@@ -98,7 +101,7 @@ $(function () {
if (e.keyCode === 13) { // enter key
var elem = JSAPI.titleContainerElem.find('ul li:first a');
if (elem) {
- view.loadContent(elem.data('path'));
+ view.loadContent(elem.data('url'));
}
return false;
} else {
@@ -109,7 +112,7 @@ $(function () {
}
});
- $.getJSON('index.json').success(function (dat) {
+ $.ajax({url: 'index.json', cache: false}).success(function (dat) {
view.sideLoading.remove();
var ul = JSAPI.titleContainerElem;
View
0  lib/Converted.js
No changes.
View
29 lib/DB.js
@@ -0,0 +1,29 @@
+var gdbm = require('gdbm');
+
+function DB(path) { // Thin wrapper around a gdbm database file located at path; throws when the file cannot be opened.
+ this.gdbm = new gdbm.GDBM();
+ if (!this.gdbm.open(path, 0, gdbm.GDBM_WRCREAT)) { // presumably 0 is the block size and GDBM_WRCREAT means read/write with create-if-missing, as in the C library; TODO confirm against the gdbm binding.
+ throw path + " : " + this.gdbm.strerror(); // NOTE(review): throws a plain string, not an Error object, so there is no stack trace.
+ }
+}
+DB.prototype = {
+ exists: function (url) { // Returns whatever gdbm.exists reports for the key url.
+ return this.gdbm.exists(url);
+ },
+ insert: function (url, content) { // Stores content under the key url and returns the result of gdbm.store.
+ return this.gdbm.store(url, content);
+ },
+ fetch: function (url) { // Returns the value gdbm.fetch yields for the key url.
+ return this.gdbm.fetch(url);
+ },
+ listUrls: function () { // Returns an array of all keys, collected by walking firstkey() and nextkey().
+ var ret = new Array();
+ var key = this.gdbm.firstkey();
+ while (key) { // The walk stops at the first falsy key.
+ ret.push(key);
+ key = this.gdbm.nextkey(key);
+ }
+ return ret;
+ }
+};
+exports.DB = DB; // Callers load it as require('./lib/DB').DB.
View
9 md5.js
@@ -0,0 +1,9 @@
+var crypto = require('crypto');
+
+console.log(md5_hex('All your base are belongs to us.')); // Prints the digest of a sample string; the call works before the definition because function declarations are hoisted.
+
+function md5_hex(src) { // Returns the MD5 digest of src as a hexadecimal string.
+ var md5 = crypto.createHash('md5');
+ md5.update(src);
+ return md5.digest('hex'); // digest with the hex argument yields the hash as hex text.
+}
View
42 mkindex.js
@@ -1,21 +1,28 @@
"use strict";
var fs = require('fs'),
- glob = require('glob'),
util = require('util'),
nquery = require('nquery'),
- assert = require('assert');
+ DB = require('./lib/DB').DB,
+ assert = require('assert'),
+ Url = require('url'),
+ querystring = require('querystring'),
+ crypto = require('crypto'),
+ undefined;
+
+var srcdb = new DB('docs.db');
/**
* Source file object
**/
-function SourcePath(path) {
+function SourcePath(path, content) {
assert(path);
this.path = path;
+ this.content = content;
}
SourcePath.prototype = {
getCategory: function () {
- var path = this.path.replace('developer.mozilla.org/en/JavaScript/Reference/', '');
+ var path = this.path.replace('/en/JavaScript/Reference/', '');
var x = path.match(/^([^/]+)\//);
if (x) {
return x[1];
@@ -26,8 +33,12 @@ SourcePath.prototype = {
getTitle: function () {
assert(this.path);
+ if (this.path === '/en/JavaScript/Reference') {
+ return 'Top';
+ }
+
var p = this.path;
- p = p.replace('developer.mozilla.org/en/JavaScript/Reference/', '');
+ p = p.replace('/en/JavaScript/Reference/', '');
p = p.replace(/^([^\/]+)\//, '');
p = p.replace(/_/g, ' ');
p = p.replace(/\//g, '.');
@@ -61,10 +72,12 @@ SourcePath.prototype = {
}
return version;
},
+ getUrl: function () {
+ var md5 = crypto.createHash('md5');
+ md5.update(this.path);
+ return 'converted/' + md5.digest('hex');
+ },
getContent: function () {
- if (!this.content) {
- this.content = fs.readFileSync(this.path, 'utf-8');
- }
return this.content;
},
isDeprecated: function () {
@@ -84,6 +97,7 @@ SourcePath.prototype = {
esversion: this.getESVersion(),
nonstandard: this.isNonStandard(),
deprecated: this.isDeprecated(),
+ url: this.getUrl(),
};
}
};
@@ -91,13 +105,11 @@ SourcePath.prototype = {
/**
* main routine
**/
-var matches = glob.globSync('developer.mozilla.org/**/*', glob.GLOB_STAR);
-
-var ret = matches.filter(function (path) {
- return !fs.statSync(path).isDirectory();
-}).map(function (fname) {
- var spath = new SourcePath(fname);
+var ret = srcdb.listUrls().sort().map(function (url) {
+ var content = srcdb.fetch(url);
+ var path = Url.parse(url).pathname;
+ var spath = new SourcePath(path, content);
return spath.toMap();
});
-util.puts(JSON.stringify(ret));
+util.puts(JSON.stringify(ret, null, 4));
View
6 package.json
@@ -11,7 +11,11 @@
"dependencies": {
"libxmlext": ">=1.0.0",
"libxmljs": ">=0.4.3",
- "request": ">=2.2.9"
+ "generic-pool": ">=0.0.1",
+ "gdbm": ">=0.0.1",
+ "express": ">=0.0.1",
+ "request": ">=2.2.9",
+ "mkdirp": ">=0.0.1"
},
"devDependencies": {},
"scripts": {
Please sign in to comment.
Something went wrong with that request. Please try again.