-
Notifications
You must be signed in to change notification settings - Fork 12
/
extractLeadIntroduction.js
108 lines (98 loc) · 2.89 KB
/
extractLeadIntroduction.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
'use strict';
/**
* @module transformation/pageextracts/extractLeadIntroduction
*/
const NodeType = require('../../nodeType');
const _ = require('underscore');
const { IGNORE_NODES } = require('../constants');
/**
 * Tell whether a node's text is blank.
 *
 * @param {!Node} node any node exposing a string `textContent`
 * @return {!boolean} true when nothing remains of the node's text content
 *   once leading and trailing whitespace has been trimmed away.
 */
function isEmptyText(node) {
    const { textContent } = node;
    return textContent.trim() === '';
}
/**
 * Decide whether an element is effectively empty, i.e. the tree walk below
 * finds no text node carrying non-whitespace text.
 *
 * T295255 - Some templates insert a redundant <style> tag inside otherwise
 * empty <p> elements, so the contents of <style> are not counted. Nodes
 * matching IGNORE_NODES are rejected by the tree-walker filter.
 *
 * @param {!Element} node
 * @return {!boolean} false as soon as a non-blank text node is encountered,
 *   true when the walk finishes without finding one.
 */
function isEmptyChild(node) {
    // Numeric constants of the DOM NodeFilter interface, see
    // https://www.w3.org/TR/DOM-Level-2-Traversal-Range/traversal.html#Traversal-NodeFilter
    const SHOW_ALL = 0xFFFFFFFF;
    const FILTER_ACCEPT = 1;
    const FILTER_REJECT = 2;

    // Reject whatever matches IGNORE_NODES. Nodes that have no matches()
    // method are always accepted.
    const skipIgnored = (candidate) => {
        const ignored = candidate.matches && candidate.matches(IGNORE_NODES);
        return ignored ? FILTER_REJECT : FILTER_ACCEPT;
    };

    const walker = node.ownerDocument.createTreeWalker(node, SHOW_ALL, skipIgnored);

    for (let current = walker.nextNode(); current; current = walker.nextNode()) {
        if (current.tagName === 'STYLE') {
            // Move the walker onto the last child of the <style> element so
            // the following nextNode() call continues after its contents.
            walker.lastChild();
            continue;
        }
        if (current.nodeType === NodeType.TEXT_NODE && !isEmptyText(current)) {
            // One non-blank text node is enough to call the element non-empty.
            return false;
        }
    }
    return true;
}
/**
 * Extracts the first non-empty paragraph from an article and any
 * nodes that follow it that are not themselves paragraphs.
 *
 * The lead paragraph is the first `body > p` that is not empty (see
 * isEmptyChild) and that either has no `about` attribute or contains a
 * <b> element. Following siblings are appended until a P, TABLE, CENTER,
 * FIGURE or DIV element (or the end of the sibling list) is reached.
 * Non-blank text siblings are appended HTML-escaped.
 *
 * @param {!Document} doc representing article
 * @param {boolean} removeNodes when set the lead introduction will
 *  be removed from the input DOM tree.
 * @return {string} representing article introduction; the empty string
 *  when no suitable paragraph exists.
 */
const extractLeadIntroduction = (doc, removeNodes) => {
    // Tags of sibling elements that terminate the introduction.
    const disallowed = [ 'P', 'TABLE', 'CENTER', 'FIGURE', 'DIV' ];
    const nodes = doc.querySelectorAll('body > p');

    // find() stops at the first match instead of visiting every paragraph.
    const lead = Array.prototype.find.call(nodes, (node) => {
        return !isEmptyChild(node) &&
            (!node.hasAttribute('about') || node.querySelector('b'));
    });
    if (!lead) {
        return '';
    }

    let html = lead.outerHTML;
    const remove = [ lead ];
    let sibling = lead.nextSibling;
    // Keep going while the sibling is a text node or is not one of the disallowed elements.
    while (sibling && (sibling.nodeType === NodeType.TEXT_NODE ||
        !disallowed.includes(sibling.tagName)
    )) {
        if (sibling.nodeType === NodeType.TEXT_NODE) {
            // Whitespace-only text contributes nothing to the extract.
            if (!isEmptyText(sibling)) {
                html += _.escape(sibling.textContent);
            }
        } else if (typeof sibling.outerHTML === 'string') {
            // Nodes such as comments may not expose outerHTML; appending it
            // unconditionally would add the literal text "undefined".
            html += sibling.outerHTML;
        }
        remove.push(sibling);
        sibling = sibling.nextSibling;
    }

    // cleanup all the nodes.
    if (removeNodes) {
        remove.forEach((node) => {
            node.parentNode.removeChild(node);
        });
    }
    return html;
};
// Make the internal helper reachable from outside the module through a
// `test` property (presumably for unit tests).
extractLeadIntroduction.test = { isEmptyChild: isEmptyChild };

module.exports = extractLeadIntroduction;