Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement and test the transform API as specced #173

Merged
merged 16 commits into from
Feb 14, 2015
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions lib/rbUtil.js
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,7 @@ function read(req) {
});

req.on('end', function() {
req.body = Buffer.concat(chunks);
resolve();
return Buffer.concat(chunks);
});
});
}
Expand All @@ -128,6 +127,7 @@ rbUtil.parsePOST = function parsePOST(req) {
} else if (req.method !== 'POST') {
return Promise.resolve();
} else {
// Parse the POST
var headers = req.headers;
if (!headers['content-type']) {
headers = {
Expand All @@ -142,12 +142,12 @@ rbUtil.parsePOST = function parsePOST(req) {
// Increase the form field size limit from the 1M default.
limits: { fieldSize: 15 * 1024 * 1024 }
});
req.body = req.body || {};
var body = {};
bboy.on('field', function (field, val) {
req.body[field] = val;
body[field] = val;
});
bboy.on('finish', function () {
resolve();
resolve(body);
});
req.pipe(bboy);
});
Expand Down
3 changes: 1 addition & 2 deletions lib/server.js
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,9 @@ function handleRequest (opts, req, resp) {
return rbUtil.parsePOST(req)

// Then process the request
.then(function() {
.then(function(body) {
// Create a new, clean request object
var urlData = rbUtil.parseURL(req.url);
var body = req.body;

if (/^application\/json/i.test(req.headers['content-type'])) {
try {
Expand Down
147 changes: 111 additions & 36 deletions mods/parsoid.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ PSP.getRevisionInfo = function(restbase, req) {
uri: new URI([rp.domain,'sys','page_revisions','page',rp.title,rp.revision])
})
.then(function(res) {
// FIXME: use tid range!
var revInfo = res.body.items[0];
return revInfo;
});
Expand Down Expand Up @@ -144,64 +143,140 @@ PSP.transformRevision = function (restbase, req, from, to) {
var self = this;
var rp = req.params;

var fromStorage = {
revid: rp.revision
};

function get(format) {
return self.getRevisionInfo(restbase, req)
.then(function(revInfo) {
return restbase.get({ uri: self.getBucketURI(rp, format, revInfo.tid) });
return restbase.get({
uri: new URI([rp.domain,'sys','parsoid',format,rp.title,rp.revision])
})
.then(function (res) {
if (res.body &&
res.body.headers && res.body.headers['content-type'] &&
res.body.body) {
fromStorage[format] = {
headers: {
'content-type': res.body.headers['content-type']
},
body: res.body.body
};
if (res.body && res.body.constructor === Buffer) {
res.body = res.body.toString();
}
return {
headers: {
'content-type': res.headers['content-type']
},
body: res.body
};
});
}

return Promise.all([ get('html'), get('wikitext'), get('data-parsoid') ])
.then(function () {
// Get the revision info just to make sure we have access
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These things are resolved in #172 ;)

return self.getRevisionInfo(restbase, req)
.then(function(revInfo) {
return Promise.props({
html: get('html'),
// wikitext: get('wikitext'),
'data-parsoid': get('data-parsoid')
});
})
.then(function (original) {
original.revid = rp.revision;
var body2 = {
original: fromStorage
original: original
};
body2[from] = req.body;
return restbase.post({
uri: new URI([rp.domain,'sys','parsoid','transform',from,'to',to]),
body2[from] = req.body[from];
var path = [rp.domain,'sys','parsoid','transform',from,'to',to];
if (rp.title) {
path.push(rp.title);
if (rp.revision) {
path.push(rp.revision);
}
}
var newReq = {
uri: new URI(path),
params: req.params,
headers: { 'content-type': 'application/json' },
body: body2
});
};
return self.callParsoidTransform(restbase, newReq, from, to);
});

};

/**
 * Forward a transform request to the Parsoid v2 HTTP API.
 *
 * The target URL has the form
 *   <parsoidHost>/v2/<domain>/<format>/<title>[/<revision>]
 * and the incoming request body is posted to it unchanged as JSON.
 *
 * @param {Object} restbase - dispatcher; only its `post` method is used here
 * @param {Object} req - request; `req.params` supplies `domain`, and
 *   optionally `title` and `revision`; `req.body` is forwarded as-is
 * @param {string} from - source format (not used in this function)
 * @param {string} to - target format; 'wikitext' and 'html' are re-mapped to
 *   Parsoid's names, any other value is passed through unchanged
 * @returns the value returned by `restbase.post` for the Parsoid request
 */
PSP.callParsoidTransform = function callParsoidTransform (restbase, req, from, to) {
    var rp = req.params;

    // Parsoid currently spells 'wikitext' as 'wt'
    var parsoidTo = to;
    if (to === 'wikitext') {
        parsoidTo = 'wt';
    } else if (to === 'html') {
        // Retrieve pagebundle whenever we want HTML
        parsoidTo = 'pagebundle';
    }

    // Path segments after the format. A title is always sent: use a fake one
    // when none was given to avoid the Parsoid error
    // <400/No title or wikitext was provided>.
    var parsoidExtras = [ rp.title || 'Main_Page' ];
    if (rp.revision) {
        parsoidExtras.push(rp.revision);
    }
    // At least one segment is always present, so the leading slash is
    // unconditional.
    var parsoidExtraPath = '/' + parsoidExtras.map(encodeURIComponent).join('/');

    var domain = rp.domain;
    // Re-map test domain
    if (domain === 'en.wikipedia.test.local') { domain = 'en.wikipedia.org'; }

    // Note: the request is intentionally not logged here; its body can hold
    // the complete page HTML / wikitext.
    return restbase.post({
        uri: this.parsoidHost + '/v2/' + domain + '/'
            + parsoidTo + parsoidExtraPath,
        headers: { 'content-type': 'application/json' },
        body: req.body
    });
};

/**
* Cheap body.innerHTML extraction.
*
* This is safe as we know that the HTML we are receiving from Parsoid is
* serialized as XML.
*/
function cheapBodyInnerHTML(html) {
var match = /<body[^>]*>(.*)<\/body>/.exec(html);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe a body can span multiple lines, and `.` does not match newlines. If so, the correct way would probably be:

var match = /<body[^>]*>([\s\S]*)<\/body>/.exec(html);

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch!

if (!match) {
throw new Error('No HTML body found!');
} else {
return match[1];
}
}

PSP.makeTransform = function (from, to) {
var self = this;

return function (restbase, req) {
var rp = req.params;
if (false && !req.body[from]) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`false && ...`? The condition is always false, so this check never runs.

throw new rbUtil.HTTPError({
status: 400,
body: {
type: 'invalid_request',
description: 'Missing request parameter: ' + from
}
});
}
var transform;
if (rp.revision) {
return self.transformRevision(restbase, req, from, to);
transform = self.transformRevision(restbase, req, from, to);
} else {
// Parsoid currently spells 'wikitext' as 'wt'
var parsoidTo = (to === 'wikitext') ? 'wt' : to;

// fake title to avoid Parsoid error: <400/No title or wikitext was provided>
var parsoidExtra = (from === 'html') ? '/_' : '';

return restbase.post({
uri: self.parsoidHost + '/v2/' + rp.domain + '/' + parsoidTo + parsoidExtra,
headers: { 'content-type': 'application/json' },
body: req.body
});
transform = self.callParsoidTransform(restbase, req, from, to);
}
return transform
.then(function(res) {
// Unwrap to the flat response format
var innerRes = res.body[to];
innerRes.status = 200;
// Handle bodyOnly flag
if (to === 'html' && req.body.bodyOnly) {
innerRes.body = cheapBodyInnerHTML(innerRes.body);
}
return innerRes;
});
};
};

Expand Down
153 changes: 153 additions & 0 deletions test/features/parsoid/transform.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
'use strict';

// mocha defines to avoid JSHint breakage
/* global describe, it, before, beforeEach, after, afterEach */

var assert = require('../../utils/assert.js');
var server = require('../../utils/server.js');
var preq = require('preq');

var testPage = {
title: 'User:GWicke%2F_restbase_test',
revision: '646859921',
html: "<!DOCTYPE html>\n<html prefix=\"dc: http://purl.org/dc/terms/ mw: http://mediawiki.org/rdf/\" about=\"http://en.wikipedia.org/wiki/Special:Redirect/revision/646859921\"><head prefix=\"mwr: http://en.wikipedia.org/wiki/Special:Redirect/\"><meta property=\"mw:articleNamespace\" content=\"2\"/><link rel=\"dc:replaces\" resource=\"mwr:revision/0\"/><meta property=\"dc:modified\" content=\"2015-02-12T22:30:30.000Z\"/><meta about=\"mwr:user/11429869\" property=\"dc:title\" content=\"GWicke\"/><link rel=\"dc:contributor\" resource=\"mwr:user/11429869\"/><meta property=\"mw:revisionSHA1\" content=\"6417e5e59b2975e65eebb5104ea572913a61db7e\"/><meta property=\"dc:description\" content=\"selser test page\"/><meta property=\"mw:parsoidVersion\" content=\"0\"/><link rel=\"dc:isVersionOf\" href=\"//en.wikipedia.org/wiki/User%3AGWicke/_restbase_test\"/><title>User:GWicke/_restbase_test</title><base href=\"//en.wikipedia.org/wiki/\"/><link rel=\"stylesheet\" href=\"//en.wikipedia.org/w/load.php?modules=mediawiki.legacy.commonPrint,shared|mediawiki.skinning.elements|mediawiki.skinning.content|mediawiki.skinning.interface|skins.vector.styles|site|mediawiki.skinning.content.parsoid&amp;only=styles&amp;skin=vector\"/></head><body id=\"mwAA\" lang=\"en\" class=\"mw-content-ltr sitedir-ltr ltr mw-body mw-body-content mediawiki\" dir=\"ltr\"><div id=\"bar\">Selser test</div></body></html>",
wikitext: '<div id=bar>Selser test'
};

describe('transform api', function() {
this.timeout(20000);

before(function () { return server.start(); });

it('html2html', function () {
return preq.post({
uri: server.config.baseURL
+ '/transform/html/to/html/' + testPage.title
+ '/' + testPage.revision,
body: {
html: testPage.html
}
})
.then(function (res) {
assert.deepEqual(res.status, 200);
var pattern = /<div id="bar">Selser test<\/div>/;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This block seems to have wrong indenting

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Strike that. It seems that the whole file is tab-indented. Are you crossing over to the dark side of tabs? 🙅

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nah, just happened to be tab indented & I didn't change it. Shall fix.

if (!pattern.test(res.body)) {
throw new Error('Expected pattern in response: ' + pattern
+ '\nSaw: ' + JSON.stringify(res, null, 2));
}
assert.deepEqual(res.headers['content-type'],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can use assert.contentType(res, 'your-expected-content-type-here') to test for the resulting content-type header.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

K, done.

'text/html;profile=mediawiki.org/specs/html/1.0.0');
});
});

// wikitext -> HTML for a title without a revision: expects status 200, the
// HTML content type, and the heading rendered as an <h2> element.
it('wt2html', function () {
    var request = {
        uri: server.config.baseURL
            + '/transform/wikitext/to/html/User:GWicke%2F_restbase_test',
        body: { wikitext: '== Heading ==' }
    };
    var expected = /<h2.*> Heading <\/h2>/;

    function verify(res) {
        assert.deepEqual(res.status, 200);
        assert.deepEqual(res.headers['content-type'],
            'text/html;profile=mediawiki.org/specs/html/1.0.0');
        if (expected.test(res.body)) {
            return;
        }
        throw new Error('Expected pattern in response: ' + expected
            + '\nSaw: ' + res.body);
    }

    return preq.post(request).then(verify);
});

// wikitext -> HTML with the bodyOnly flag set. The pattern is anchored at
// both ends, so the response body must begin with the <h2 tag and end with
// its closing </h2>.
it('wt2html with bodyOnly', function () {
    var request = {
        uri: server.config.baseURL
            + '/transform/wikitext/to/html/User:GWicke%2F_restbase_test',
        body: {
            wikitext: '== Heading ==',
            bodyOnly: true
        }
    };
    var expected = /^<h2.*> Heading <\/h2>$/;

    function verify(res) {
        assert.deepEqual(res.status, 200);
        assert.deepEqual(res.headers['content-type'],
            'text/html;profile=mediawiki.org/specs/html/1.0.0');
        if (expected.test(res.body)) {
            return;
        }
        throw new Error('Expected pattern in response: ' + expected
            + '\nSaw: ' + res.body);
    }

    return preq.post(request).then(verify);
});


// HTML -> wikitext for a title without a revision: expects status 200, the
// plain text of the posted body as wikitext, and the wikitext content type.
it('html2wt, no-selser', function () {
    var request = {
        uri: server.config.baseURL
            + '/transform/html/to/wikitext/User:GWicke%2F_restbase_test',
        body: { html: '<body>The modified HTML</body>' }
    };

    function verify(res) {
        assert.deepEqual(res.status, 200);
        assert.deepEqual(res.body, 'The modified HTML');
        assert.deepEqual(res.headers['content-type'],
            'text/plain;profile=mediawiki.org/specs/wikitext/1.0.0');
    }

    return preq.post(request).then(verify);
});

// it('html2wt, selser', function () {
// return preq.post({
// uri: server.config.baseURL
// + '/transform/html/to/wikitext/' + testPage.title
// + '/' + testPage.revision,
// body: {
// html: testPage.html
// }
// })
// .then(function (res) {
// assert.deepEqual(res.status, 200);
// assert.deepEqual(res.body, testPage.wikitext);
// assert.deepEqual(res.headers['content-type'],
// 'text/plain;profile=mediawiki.org/specs/wikitext/1.0.0');
// });
// });

});



/* TODO: actually implement wikitext fetching
describe('storage-backed transform api', function() {
this.timeout(20000);

before(function () { return server.start(); });

it('should load a specific title/revision from storage to send as the "original"', function () {
return preq.post({
uri: server.config.baseURL + '/transform/html/to/wikitext/Main_Page/1',
headers: { 'content-type': 'application/json' },
body: {
headers: {
'content-type': 'text/html;profile=mediawiki.org/specs/html/1.0.0'
},
body: '<html>The modified HTML</html>'
}
})
.then(function (res) {
assert.deepEqual(res.status, 200);
assert.deepEqual(res.body, {
wikitext: {
headers: {
'content-type': 'text/plain;profile=mediawiki.org/specs/wikitext/1.0.0'
},
body: 'The modified HTML'
}
});
});
});

});
*/
Loading