From 93c8d9af78a93c6eb4588648022f9e4edf06c8e7 Mon Sep 17 00:00:00 2001 From: Petr Pchelko Date: Mon, 18 Jun 2018 18:48:41 +0300 Subject: [PATCH 1/5] WIP: Language variants support Change-Id: Ia7fbafe8abc73e96b391e77176d3873be962471b --- lib/language_variants_filter.js | 60 +++++++++++++++++++++++++++++++++ lib/mwUtil.js | 27 ++++++++++++++- sys/action.js | 21 +++++++++++- sys/parsoid.js | 8 +++-- v1/content.yaml | 2 ++ 5 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 lib/language_variants_filter.js diff --git a/lib/language_variants_filter.js b/lib/language_variants_filter.js new file mode 100644 index 000000000..f873018de --- /dev/null +++ b/lib/language_variants_filter.js @@ -0,0 +1,60 @@ +"use strict"; + +const HyperSwitch = require('hyperswitch'); +const URI = HyperSwitch.URI; +const mwUtil = require('./mwUtil'); + +module.exports = (hyper, req, next, options, specInfo) => { + const rp = req.params; + const acceptLanguage = req.headers && req.headers['accept-language']; + if (!acceptLanguage || mwUtil.isNoCacheRequest(req)) { + delete req.headers['accept-language']; + return next(hyper, req); + } + + const revTableURI = [rp.domain, 'sys', 'page_revisions', 'page', rp.title]; + if (rp.revision) { + revTableURI.push(`${rp.revision}`); + } + return hyper.get({ uri: new URI(revTableURI) }) + .then((res) => { + const revision = res.body.items[0]; + return mwUtil.shouldConvertLangVariant(hyper, req, revision.page_language, acceptLanguage) + .then((shouldConvert) => { + if (!shouldConvert) { + delete req.headers['accept-language']; + return next(hyper, req); + } + + if (/\/page\/html\//.test(specInfo.path)) { + // It's HTML, hit Parsoid for conversion + return next(hyper, req) + .then(html => hyper.post({ + uri: new URI([rp.domain, 'sys', 'parsoid', 'transform', 'html', 'to', 'html']), + headers: { + 'content-type': 'application/json', + 'content-language': revision.page_language + }, + body: { + original: { + html: { + headers: html.headers, + body: html.body.toString() + } + }, + updates: { + variant: { + source: null, + target: acceptLanguage + } + } + } + })); + // We can skip setting Vary: accept-language as Parsoid sets it. + } else { + // TODO: It's something else, so just forward to MCS + return next(hyper, req); + } + }); + }); +}; diff --git a/lib/mwUtil.js b/lib/mwUtil.js index 625074cff..4cfafe595 100644 --- a/lib/mwUtil.js +++ b/lib/mwUtil.js @@ -66,6 +66,31 @@ mwUtil.parseETag = (etag) => { } }; +/** + * Checks whether a specialized requested language variant exist for + * a page in a requested language on a particular wiki. + * @param {HyperSwitch} hyper + * @param {Object} req + * @param {string} pageLanguage the language of the requested page. + * @param {string} acceptLanguage the language variant accepted by the client. + * @return {Promise} + */ +mwUtil.shouldConvertLangVariant = (hyper, req, pageLanguage, acceptLanguage) => { + if (!acceptLanguage) { + return P.resolve(false); + } + return mwUtil.getSiteInfo(hyper, req) + .then((siteInfo) => { + // First, whether specialized variants exist for the page lang on a wiki + const pageLangVariants = siteInfo.languagevariants[pageLanguage]; + if (!pageLangVariants) { + return false; + } + // Second, if the requested language variant exist for page language + return pageLangVariants.indexOf(acceptLanguage) !== -1; + }); +}; + /** * Extract the date from an `etag` header of the form * "//" @@ -257,7 +282,7 @@ mwUtil.decodeBody = (contentResponse) => { * * @param {!HyperSwitch} hyper * @param {!Object} req - * @param {?String} domain Wiki domain to get siteinfo from (defaults to the request domain). + * @param {string} [domain] Wiki domain to get siteinfo from (defaults to the request domain). */ mwUtil.getSiteInfo = (hyper, req, domain) => hyper.get({ uri: new URI([domain || req.params.domain, 'sys', 'action', 'siteinfo']) diff --git a/sys/action.js b/sys/action.js index 966260ea5..7461e90fc 100644 --- a/sys/action.js +++ b/sys/action.js @@ -292,13 +292,31 @@ class ActionService { body: { action: 'query', meta: 'siteinfo|filerepoinfo', - siprop: 'general|namespaces|namespacealiases|specialpagealiases', + siprop: 'general' + + '|namespaces' + + '|namespacealiases' + + '|specialpagealiases' + + '|languagevariants', format: 'json' } }, {}, (apiReq, res) => { if (!res || !res.body || !res.body.query || !res.body.query.general) { throw new Error(`SiteInfo is unavailable for ${rp.domain}`); } + // Transform from original response format to + // { + // 'lang' => ['variant1', 'variant2' ... ] + // } + // for ease of use. + const origVariants = res.body.query.languagevariants; + const variants = {}; + if (origVariants) { + Object.keys(origVariants).forEach((lang) => { + variants[lang] = Object.keys(origVariants[lang]) + // Filter out non-specific variants like `en`, `zh` etc. + .filter(variant => /-/.test(variant)); + }); + } return { status: 200, body: { @@ -311,6 +329,7 @@ class ActionService { namespaces: res.body.query.namespaces, namespacealiases: res.body.query.namespacealiases, specialpagealiases: res.body.query.specialpagealiases, + languagevariants: variants, sharedRepoRootURI: findSharedRepoDomain(res), baseUri: this._getBaseUri(req) } diff --git a/sys/parsoid.js b/sys/parsoid.js index 4f52f67fb..6a9574cd3 100644 --- a/sys/parsoid.js +++ b/sys/parsoid.js @@ -713,7 +713,7 @@ class ParsoidService { parsoidTo = 'pagebundle'; } let parsoidFrom = from; - if (from === 'html' && req.body.original && req.body.original['data-parsoid']) { + if (from === 'html' && req.body.original) { parsoidFrom = 'pagebundle'; } const parsoidExtras = []; @@ -735,6 +735,7 @@ class ParsoidService { headers: { 'content-type': 'application/json', 'user-agent': req['user-agent'], + 'content-language': req.headers['content-language'] }, body: req.body }; @@ -761,7 +762,10 @@ class ParsoidService { return (hyper, req) => { const rp = req.params; if ((!req.body && req.body !== '') - || (!req.body[from] && req.body[from] !== '')) { + // The html/to/html endpoint is a bit different so the `html` + // might not be provided. + || (!(from === 'html' && to === 'html') + && !req.body[from] && req.body[from] !== '')) { throw new HTTPError({ status: 400, body: { diff --git a/v1/content.yaml b/v1/content.yaml index 018a1abf3..85e32a625 100644 --- a/v1/content.yaml +++ b/v1/content.yaml @@ -127,6 +127,7 @@ paths: get: x-route-filters: - path: lib/ensure_content_type.js + - path: ./lib/language_variants_filter.js tags: - Page content summary: Get latest HTML for a title. @@ -399,6 +400,7 @@ paths: get: x-route-filters: - path: lib/ensure_content_type.js + - path: ./lib/language_variants_filter.js tags: - Page content summary: Get HTML for a specific title/revision & optionally timeuuid. From 2b46326449edd373dbb9b1b15ea8969485cbe60b Mon Sep 17 00:00:00 2001 From: Petr Pchelko Date: Tue, 19 Jun 2018 18:58:55 +0300 Subject: [PATCH 2/5] Added tests for HTML language variants Change-Id: Icf42fca39e9691ecf92d39017f63482e0d0716f1 --- config.test.yaml | 2 + lib/language_variants_filter.js | 31 ++++++- lib/mwUtil.js | 17 +++- sys/multi_content_bucket.js | 2 +- .../features/pagecontent/language_variants.js | 87 +++++++++++++++++++ test/utils/server.js | 3 + 6 files changed, 136 insertions(+), 6 deletions(-) create mode 100644 test/features/pagecontent/language_variants.js diff --git a/config.test.yaml b/config.test.yaml index 5c96bcb5b..158898c0a 100644 --- a/config.test.yaml +++ b/config.test.yaml @@ -128,6 +128,8 @@ spec_root: &spec_root # labs, used for most tests /{domain:en.wikipedia.beta.wmflabs.org}: *labs_project + # Serbian wiki in beta for language variants tests + /{domain:sr.wikipedia.beta.wmflabs.org}: *default_project # For security testing we rely on mocks, so it's OK to use French wiki. /{domain:fr.wikipedia.org}: diff --git a/lib/language_variants_filter.js b/lib/language_variants_filter.js index f873018de..2296fc251 100644 --- a/lib/language_variants_filter.js +++ b/lib/language_variants_filter.js @@ -9,7 +9,18 @@ module.exports = (hyper, req, next, options, specInfo) => { const acceptLanguage = req.headers && req.headers['accept-language']; if (!acceptLanguage || mwUtil.isNoCacheRequest(req)) { delete req.headers['accept-language']; - return next(hyper, req); + return next(hyper, req) + // TODO: Store the Vary header so that if the page is possible + // to be converted we return a proper Vary even when accept-language + // is not specified. + // For now we hacky check by domain. + .then((res) => { + const langCode = req.params.domain.substring(0, req.params.domain.indexOf('.')); + if (mwUtil.canConvertLangVariant(hyper, req, langCode)) { + res.headers.vary = 'accept-language'; + } + return res; + }); } const revTableURI = [rp.domain, 'sys', 'page_revisions', 'page', rp.title]; @@ -23,7 +34,14 @@ module.exports = (hyper, req, next, options, specInfo) => { .then((shouldConvert) => { if (!shouldConvert) { delete req.headers['accept-language']; - return next(hyper, req); + return next(hyper, req) + // TODO: Eventually we want to store the Vary header we get from Parsoid! + .then((res) => { + if (mwUtil.canConvertLangVariant(hyper, req, revision.page_language)) { + res.headers.vary = 'accept-language'; + } + return res; + }) } if (/\/page\/html\//.test(specInfo.path)) { @@ -49,8 +67,13 @@ module.exports = (hyper, req, next, options, specInfo) => { } } } - })); - // We can skip setting Vary: accept-language as Parsoid sets it. + })) + // TODO: Now we fix up the Vary header cause Parsoid + // sets content-type in there which is wrong. + .then((res) => { + res.headers.vary = 'accept-language'; + return res; + }) } else { // TODO: It's something else, so just forward to MCS return next(hyper, req); diff --git a/lib/mwUtil.js b/lib/mwUtil.js index 4cfafe595..cbbd468fe 100644 --- a/lib/mwUtil.js +++ b/lib/mwUtil.js @@ -66,6 +66,21 @@ mwUtil.parseETag = (etag) => { } }; + +/** + * Checks whether a language variant could exist for + * a page in a requested language on a particular wiki. + * @param {HyperSwitch} hyper + * @param {Object} req + * @param {string} pageLanguage the language of the requested page. + * @return {Promise} + */ +mwUtil.canConvertLangVariant = (hyper, req, pageLanguage) => { + return mwUtil.getSiteInfo(hyper, req) + .then(siteInfo => !!siteInfo.languagevariants[pageLanguage]); +}; + + /** * Checks whether a specialized requested language variant exist for * a page in a requested language on a particular wiki. @@ -87,7 +102,7 @@ mwUtil.shouldConvertLangVariant = (hyper, req, pageLanguage, acceptLanguage) => return false; } // Second, if the requested language variant exist for page language - return pageLangVariants.indexOf(acceptLanguage) !== -1; + return pageLangVariants.indexOf(acceptLanguage.toLowerCase()) !== -1; }); }; diff --git a/sys/multi_content_bucket.js b/sys/multi_content_bucket.js index d57a96522..1fd9e5401 100644 --- a/sys/multi_content_bucket.js +++ b/sys/multi_content_bucket.js @@ -217,7 +217,7 @@ class MultiContentBucket { .thenReturn({ status: 201 }); } - makeSchema(opts) { + makeSchema(opts) { const schemaVersionMajor = 3; return { diff --git a/test/features/pagecontent/language_variants.js b/test/features/pagecontent/language_variants.js new file mode 100644 index 000000000..82f8923fc --- /dev/null +++ b/test/features/pagecontent/language_variants.js @@ -0,0 +1,87 @@ +'use strict'; + +// mocha defines to avoid JSHint breakage +/* global describe, it, before, beforeEach, after, afterEach */ + +const assert = require('../../utils/assert.js'); +const preq = require('preq'); +const server = require('../../utils/server.js'); +const variantsPageTitle = 'RESTBase_Testing_Page'; + + +describe('Language variants', function() { + + this.timeout(20000); + + before(() => server.start()); + + it('should request html with no variants', () => { + return preq.get({ uri: `${server.config.variantsWikiBucketURL}/html/${variantsPageTitle}`}) + .then((res) => { + assert.deepEqual(res.status, 200); + assert.deepEqual(res.headers.vary.toLowerCase(), 'accept-language'); + assert.deepEqual(/1\. Ово је тестна страница/.test(res.body), true); + assert.deepEqual(/2\. Ovo je testna stranica/.test(res.body), true); + }); + }); + + it('should request html with default variant', () => { + return preq.get({ + uri: `${server.config.variantsWikiBucketURL}/html/${variantsPageTitle}`, + headers: { + 'accept-language': 'sr' + } + }) + .then((res) => { + assert.deepEqual(res.status, 200); + assert.deepEqual(res.headers.vary.toLowerCase(), 'accept-language'); + assert.deepEqual(/1\. Ово је тестна страница/.test(res.body), true); + assert.deepEqual(/2\. Ovo je testna stranica/.test(res.body), true); + }); + }); + + it('should request html with wrong variant', () => { + return preq.get({ + uri: `${server.config.variantsWikiBucketURL}/html/${variantsPageTitle}`, + headers: { + 'accept-language': 'sr-blablabla' + } + }) + .then((res) => { + assert.deepEqual(res.status, 200); + assert.deepEqual(res.headers.vary.toLowerCase(), 'accept-language'); + assert.deepEqual(/1\. Ово је тестна страница/.test(res.body), true); + assert.deepEqual(/2\. Ovo je testna stranica/.test(res.body), true); + }); + }); + + it('should request html with cyrillic variant', () => { + return preq.get({ + uri: `${server.config.variantsWikiBucketURL}/html/${variantsPageTitle}`, + headers: { + 'accept-language': 'sr-ec' + } + }) + .then((res) => { + assert.deepEqual(res.status, 200); + assert.deepEqual(res.headers.vary.toLowerCase(), 'accept-language'); + assert.deepEqual(/1\. Ово је тестна страница/.test(res.body), true); + assert.deepEqual(/2\. Ово је тестна страница/.test(res.body), true); + }); + }); + + it('should request html with latin variant', () => { + return preq.get({ + uri: `${server.config.variantsWikiBucketURL}/html/${variantsPageTitle}`, + headers: { + 'accept-language': 'sr-el' + } + }) + .then((res) => { + assert.deepEqual(res.status, 200); + assert.deepEqual(res.headers.vary.toLowerCase(), 'accept-language'); + assert.deepEqual(/1\. Ovo je testna stranica/.test(res.body), true); + assert.deepEqual(/2\. Ovo je testna stranica/.test(res.body), true); + }); + }); +}); \ No newline at end of file diff --git a/test/utils/server.js b/test/utils/server.js index 97dbe3152..63985eb97 100644 --- a/test/utils/server.js +++ b/test/utils/server.js @@ -18,6 +18,8 @@ var secureURL = hostPort + '/fr.wikipedia.org/v1'; var secureBucketURL = secureURL + '/page'; var labsURL = hostPort + '/en.wikipedia.beta.wmflabs.org/v1'; var labsBucketURL = labsURL + '/page'; +const variantsWikiURL = hostPort + '/sr.wikipedia.beta.wmflabs.org/v1'; +const variantsWikiBucketURL = variantsWikiURL + '/page'; function loadConfig(path, forceSqlite) { var confString = fs.readFileSync(path).toString(); @@ -50,6 +52,7 @@ var config = { secureApiURL: 'https://fr.wikipedia.org/w/api.php', labsURL: labsURL, labsBucketURL: labsBucketURL, + variantsWikiBucketURL: variantsWikiBucketURL, labsApiURL: 'https://en.wikipedia.beta.wmflabs.org/w/api.php', logStream: logStream(), conf: loadConfig(process.env.RB_TEST_CONFIG ? process.env.RB_TEST_CONFIG : __dirname + '/../../config.test.yaml') From 8c1b2f5946da9228fd54303454fc1849ac195703 Mon Sep 17 00:00:00 2001 From: Petr Pchelko Date: Tue, 19 Jun 2018 19:13:21 +0300 Subject: [PATCH 3/5] Fixed linting errors Change-Id: I3c22de412c8735021daf97db465cd6455c57e33f --- lib/language_variants_filter.js | 4 ++-- sys/multi_content_bucket.js | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/language_variants_filter.js b/lib/language_variants_filter.js index 2296fc251..d2440013b 100644 --- a/lib/language_variants_filter.js +++ b/lib/language_variants_filter.js @@ -41,7 +41,7 @@ module.exports = (hyper, req, next, options, specInfo) => { res.headers.vary = 'accept-language'; } return res; - }) + }); } if (/\/page\/html\//.test(specInfo.path)) { @@ -73,7 +73,7 @@ module.exports = (hyper, req, next, options, specInfo) => { .then((res) => { res.headers.vary = 'accept-language'; return res; - }) + }); } else { // TODO: It's something else, so just forward to MCS return next(hyper, req); diff --git a/sys/multi_content_bucket.js b/sys/multi_content_bucket.js index 1fd9e5401..d57a96522 100644 --- a/sys/multi_content_bucket.js +++ b/sys/multi_content_bucket.js @@ -217,7 +217,7 @@ class MultiContentBucket { .thenReturn({ status: 201 }); } - makeSchema(opts) { + makeSchema(opts) { const schemaVersionMajor = 3; return { From a96b93862433fe4cc16f646c88d88de334e5b8f6 Mon Sep 17 00:00:00 2001 From: Petr Pchelko Date: Tue, 19 Jun 2018 19:32:35 +0300 Subject: [PATCH 4/5] Lowercase language variants everywhere Change-Id: Id0d6d2868c9cef5f6526d9129ff0f12b165a1f7e --- lib/mwUtil.js | 4 ++-- sys/action.js | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/mwUtil.js b/lib/mwUtil.js index cbbd468fe..068c72bb7 100644 --- a/lib/mwUtil.js +++ b/lib/mwUtil.js @@ -77,7 +77,7 @@ mwUtil.parseETag = (etag) => { */ mwUtil.canConvertLangVariant = (hyper, req, pageLanguage) => { return mwUtil.getSiteInfo(hyper, req) - .then(siteInfo => !!siteInfo.languagevariants[pageLanguage]); + .then(siteInfo => !!siteInfo.languagevariants[pageLanguage.toLowerCase()]); }; @@ -97,7 +97,7 @@ mwUtil.shouldConvertLangVariant = (hyper, req, pageLanguage, acceptLanguage) => return mwUtil.getSiteInfo(hyper, req) .then((siteInfo) => { // First, whether specialized variants exist for the page lang on a wiki - const pageLangVariants = siteInfo.languagevariants[pageLanguage]; + const pageLangVariants = siteInfo.languagevariants[pageLanguage.toLowerCase()]; if (!pageLangVariants) { return false; } diff --git a/sys/action.js b/sys/action.js index 7461e90fc..46c18504f 100644 --- a/sys/action.js +++ b/sys/action.js @@ -312,7 +312,8 @@ class ActionService { const variants = {}; if (origVariants) { Object.keys(origVariants).forEach((lang) => { - variants[lang] = Object.keys(origVariants[lang]) + variants[lang.toLowerCase()] = + Object.keys(origVariants[lang].toLowerCase()) // Filter out non-specific variants like `en`, `zh` etc. .filter(variant => /-/.test(variant)); }); From c30fa6f1af818d14226138508dbe8e8712131db4 Mon Sep 17 00:00:00 2001 From: Petr Pchelko Date: Tue, 19 Jun 2018 19:59:18 +0300 Subject: [PATCH 5/5] Properly lowecase the variants array Change-Id: Icc9818b299cb0f5233529cdb15e98510bd550abc --- sys/action.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sys/action.js b/sys/action.js index 46c18504f..8f70f64bc 100644 --- a/sys/action.js +++ b/sys/action.js @@ -313,9 +313,10 @@ class ActionService { if (origVariants) { Object.keys(origVariants).forEach((lang) => { variants[lang.toLowerCase()] = - Object.keys(origVariants[lang].toLowerCase()) + Object.keys(origVariants[lang]) // Filter out non-specific variants like `en`, `zh` etc. - .filter(variant => /-/.test(variant)); + .filter(variant => /-/.test(variant)) + .map(variant => variant.toLowerCase()); }); } return {