Skip to content

Commit

Permalink
Fix header post-processing. Fixes dkpro#36.
Browse files Browse the repository at this point in the history
Also align more closely with the original algorithm by:
- un-inverting conditionals so they can be checked against the algorithm
more easily
- adding <style> tag to list of tags cleaned in pre-processing per algo
- marking <select> tag as block level per original algorithm
- using ints for character counts instead of doubles
- adding documentation from original algorithm description
  • Loading branch information
tfmorris committed Apr 13, 2016
1 parent ec30138 commit 3fdbb6a
Showing 1 changed file with 108 additions and 80 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,16 @@
import java.util.Set;

/**
* Re-implementing the jusText python boilerplate removal algorithm (Pomikalek,
* Re-implementing the jusText Python boilerplate removal algorithm (Pomikalek,
* 2011)
* <br>
* References:
* <br>
* Pomikalek, J. (2011). Removing boilerplate and duplicate content from web corpora.
* Ph.D. thesis, Masaryk University, Faculty of Informatics, Brno, Czech Republic.
* <br>
* http://corpus.tools/wiki/Justext/Algorithm
* https://github.com/miso-belica/jusText/blob/dev/doc/algorithm.rst
*
* @author Omnia Zayed
*/
Expand All @@ -58,8 +61,8 @@ public class JusTextBoilerplateRemoval
{

static final double MAX_LINK_DENSITY_DEFAULT = 0.20;
static final double LENGTH_LOW_DEFAULT = 70;
static final double LENGTH_HIGH_DEFAULT = 200;
static final int LENGTH_LOW_DEFAULT = 70;
static final int LENGTH_HIGH_DEFAULT = 200;
static final double STOPWORDS_LOW_DEFAULT = 0.30;
static final double STOPWORDS_HIGH_DEFAULT = 0.32;

Expand All @@ -85,7 +88,7 @@ private Document convertHtmlToDoc(String html)
try {
document = Jsoup.parse(html);
document = new Cleaner(
Whitelist.relaxed().removeTags("img", "head", "script", ".hidden", "embedded", "#comment"))
Whitelist.relaxed().removeTags("img", "head", "script", "style", ".hidden", "embedded", "#comment"))
.clean(document);
document.outputSettings().charset("UTF-8");
document.outputSettings().escapeMode(EscapeMode.xhtml);
Expand Down Expand Up @@ -125,7 +128,7 @@ private LinkedList<Paragraph> makeParagraphs(Node node)
* <li>near-good – somewhere in-between short and good
*/
private void classifyContextFree(List<Paragraph> paragraphs, Set<String> stoplist,
double lengthLow, double lengthHigh, double stopwordsLow,
int lengthLow, int lengthHigh, double stopwordsLow,
double stopwordsHigh, double maxLinkDensity)
{
// TODO: Move stop list initialization out of band
Expand All @@ -138,10 +141,13 @@ private void classifyContextFree(List<Paragraph> paragraphs, Set<String> stoplis
float stopWordDensity = paragraph.stopwords_density(stopListLower);
double link_density = paragraph.calcLinksDensity();

if (link_density > maxLinkDensity) {
if ("select".equalsIgnoreCase(paragraph.getTagName())) {
paragraph.setContextFreeClass(PARAGRAPH_TYPE.BAD);
}
else if (paragraph.getRawText().contains("\u00a9")) { // copyright symbol
paragraph.setContextFreeClass(PARAGRAPH_TYPE.BAD);
}
else if (paragraph.getRawText().contains("\u00a9")) { // copyright symbol
else if (link_density > maxLinkDensity) {
paragraph.setContextFreeClass(PARAGRAPH_TYPE.BAD);
}
else if (length < lengthLow) {
Expand Down Expand Up @@ -252,35 +258,57 @@ private PARAGRAPH_TYPE getNextNeighbourOptimized(int i, List<Paragraph> paragrap
return getNextNeighbourOptimized(i, paragraphs, ignoreNeargood, 1, paragraphs.size());
}

/**
* Context-sensitive paragraph classification. Assumes that context free
* classification of paragraphs has already been called. The purpose is to
* re-classify neargood and short paragraphs according to the classes of the
* surrounding blocks.
/*
* Context-sensitive paragraph classification. Assumes that context free classification of
* paragraphs has already been called. The purpose is to re-classify neargood and short
* paragraphs according to the classes of the surrounding blocks.
* <p>
* The algorithm adds two stages of processing for the header blocks. The first stage
* (preprocessing) is executed after context-free classification and before context-sensitive
* classification. The second stage (postprocessing) is performed after the context-sensitive
* classification:
* <ol>
* <li>context-free classification [done before this method is called]
* <li>preprocessing of header blocks
* <li>context-sensitive classification
* <li>postprocessing of header blocks
* </ol>
*
* FIXME: This can behave pathologically in the presence of large lists of "paragraphs"
* with no textual content. In this case the maxHeadingDistance parameter isn't adequate
* to short-circuit large amounts of processing. We may need to cap the number of elements
* to search (10? 20?).
*/
private void reclassifyContextSensitive(List<Paragraph> paragraphs, int maxHeadingDistance)
{

// copy classes
// Default classification is the same as the context-free classification
for (Paragraph p : paragraphs) {
p.setClassType(p.getContextFreeClass());
}

// re-classify good headings
/*
* re-classify good headings - from the description of the original Python implementation:
* <p>
* "2. The preprocessing looks
* for short header blocks which precede good blocks and at the same time there is no more
* than MAX_HEADING_DISTANCE characters between the header block and the good block. The
* context-free class of such header blocks is changed from short to near-good. The purpose
* of this is to preserve short blocks between the heading and the good text which might
* otherwise be removed (classified as bad) by the context-sensitive classification."
*/
for (int i = 0; i < paragraphs.size(); i++) {
Paragraph paragraph = paragraphs.get(i);
if (!(paragraph.isHeading() && paragraph.getClassType() == PARAGRAPH_TYPE.SHORT)) {
continue;
}
int j = i + 1;
int distance = 0;
while (j < paragraphs.size() && distance <= maxHeadingDistance) {
if (paragraphs.get(j).getClassType() == PARAGRAPH_TYPE.GOOD) {
paragraph.setClassType(PARAGRAPH_TYPE.NEAR_GOOD);
break;
if (paragraph.isHeading() && paragraph.getClassType() == PARAGRAPH_TYPE.SHORT) {
int j = i + 1;
int distance = 0;
while (j < paragraphs.size() && distance <= maxHeadingDistance) {
if (paragraphs.get(j).getClassType() == PARAGRAPH_TYPE.GOOD) {
paragraph.setClassType(PARAGRAPH_TYPE.NEAR_GOOD);
break;
}
distance += paragraphs.get(j).getRawText().length();
j += 1;
}
distance += paragraphs.get(j).getRawText().length();
j += 1;
}
}

Expand All @@ -290,33 +318,27 @@ private void reclassifyContextSensitive(List<Paragraph> paragraphs, int maxHeadi
Map<Integer, PARAGRAPH_TYPE> newClasses = new LinkedHashMap<>();

for (int i = 0; i < paragraphs.size(); i++) {
if (paragraphs.get(i).getClassType() != Paragraph.PARAGRAPH_TYPE.SHORT) {
continue;
}

PARAGRAPH_TYPE prevNeighbour = getPrevNeighbourOptimized(i, paragraphs, true); //ignore_neargood
PARAGRAPH_TYPE nextNeighbour = getNextNeighbourOptimized(i, paragraphs, true); //ignore_neargood

Set<PARAGRAPH_TYPE> neighbours = new LinkedHashSet<>();
neighbours.add(prevNeighbour);
neighbours.add(nextNeighbour);

if (neighbours.size() == 1 && neighbours.contains(PARAGRAPH_TYPE.GOOD)) {
newClasses.put(i, PARAGRAPH_TYPE.GOOD);
}
else if (neighbours.size() == 1 && neighbours.contains(PARAGRAPH_TYPE.BAD)) {
newClasses.put(i, PARAGRAPH_TYPE.BAD);
} // it must be set(['good', 'bad'])
else if ((prevNeighbour == PARAGRAPH_TYPE.BAD && getPrevNeighbourOptimized(i,
paragraphs,
false) == PARAGRAPH_TYPE.NEAR_GOOD)
|| (nextNeighbour == PARAGRAPH_TYPE.BAD && getNextNeighbourOptimized(i,
paragraphs,
false) == PARAGRAPH_TYPE.NEAR_GOOD)) {
newClasses.put(i, PARAGRAPH_TYPE.GOOD);
}
else {
newClasses.put(i, PARAGRAPH_TYPE.BAD);
if (paragraphs.get(i).getClassType() == Paragraph.PARAGRAPH_TYPE.SHORT) {
PARAGRAPH_TYPE prevNeighbour = getPrevNeighbourOptimized(i, paragraphs, true); // ignore_neargood
PARAGRAPH_TYPE nextNeighbour = getNextNeighbourOptimized(i, paragraphs, true); // ignore_neargood

Set<PARAGRAPH_TYPE> neighbours = new LinkedHashSet<>();
neighbours.add(prevNeighbour);
neighbours.add(nextNeighbour);

if (neighbours.size() == 1 && neighbours.contains(PARAGRAPH_TYPE.GOOD)) {
newClasses.put(i, PARAGRAPH_TYPE.GOOD);
} else if (neighbours.size() == 1 && neighbours.contains(PARAGRAPH_TYPE.BAD)) {
newClasses.put(i, PARAGRAPH_TYPE.BAD);
} // it must be set(['good', 'bad'])
else if ((prevNeighbour == PARAGRAPH_TYPE.BAD
&& getPrevNeighbourOptimized(i, paragraphs, false) == PARAGRAPH_TYPE.NEAR_GOOD)
|| (nextNeighbour == PARAGRAPH_TYPE.BAD
&& getNextNeighbourOptimized(i, paragraphs, false) == PARAGRAPH_TYPE.NEAR_GOOD)) {
newClasses.put(i, PARAGRAPH_TYPE.GOOD);
} else {
newClasses.put(i, PARAGRAPH_TYPE.BAD);
}
}
}

Expand All @@ -325,50 +347,56 @@ else if ((prevNeighbour == PARAGRAPH_TYPE.BAD && getPrevNeighbourOptimized(i,
paragraphs.get(i).setClassType(newClasses.get(i));
}

// revise neargood
// revise neargood
for (int i = 0; i < paragraphs.size(); i++) {
Paragraph paragraph = paragraphs.get(i);
if (paragraph.getClassType() != PARAGRAPH_TYPE.NEAR_GOOD) {
continue;
}
PARAGRAPH_TYPE prevNeighbour = getPrevNeighbourOptimized(i, paragraphs, true);
PARAGRAPH_TYPE nextNeighbour = getNextNeighbourOptimized(i, paragraphs, true);
if (prevNeighbour == PARAGRAPH_TYPE.BAD && nextNeighbour == PARAGRAPH_TYPE.BAD) {
paragraph.setClassType(PARAGRAPH_TYPE.BAD);
}
else {
paragraph.setClassType(PARAGRAPH_TYPE.GOOD);
if (paragraph.getClassType() == PARAGRAPH_TYPE.NEAR_GOOD) {
PARAGRAPH_TYPE prevNeighbour = getPrevNeighbourOptimized(i, paragraphs, true);
PARAGRAPH_TYPE nextNeighbour = getNextNeighbourOptimized(i, paragraphs, true);
if (prevNeighbour == PARAGRAPH_TYPE.BAD && nextNeighbour == PARAGRAPH_TYPE.BAD) {
paragraph.setClassType(PARAGRAPH_TYPE.BAD);
} else {
paragraph.setClassType(PARAGRAPH_TYPE.GOOD);
}
}
}

// re-classify more good headings
/*
* re-classify more good headings - post-processing
* <p>
* "4. The postprocessing again looks for header blocks
* which precede good blocks and are no further than MAX_HEADING_DISTANCE away. This time,
* the matched headers are classified as good if their context-free class was other than
* bad. In other words, the bad headings remain bad, but some short and near-good headings
* can be classified as good if they precede good blocks, even though they would normally be
* classified as bad by the context-sensitive classification (e.g. they are surrounded by
* bad blocks). This stage preserves the 'non-bad' headings of good blocks."
*
*/
for (int i = 0; i < paragraphs.size(); i++) {
Paragraph paragraph = paragraphs.get(i);
if (!(paragraph.isHeading() && paragraph.getClassType() == PARAGRAPH_TYPE.BAD)
if (paragraph.isHeading() && paragraph.getClassType() == PARAGRAPH_TYPE.BAD
&& paragraph.getContextFreeClass() != PARAGRAPH_TYPE.BAD) {
continue;
}
int j = i + 1;
int distance = 0;
while (j < paragraphs.size() && distance <= maxHeadingDistance) {
if (paragraphs.get(j).getClassType() == PARAGRAPH_TYPE.GOOD) {
paragraph.setClassType(PARAGRAPH_TYPE.GOOD);
break;
int j = i + 1;
int distance = 0;
while (j < paragraphs.size() && distance <= maxHeadingDistance) {
if (paragraphs.get(j).getClassType() == PARAGRAPH_TYPE.GOOD) {
paragraph.setClassType(PARAGRAPH_TYPE.GOOD);
break;
}
distance += paragraphs.get(j).getRawText().length();
j += 1;
}
distance += paragraphs.get(j).getRawText().length();
j += 1;
}

}
}

/**
* Converts an HTML page into a list of classified paragraphs. Each
* paragraph is represented as instance of class "Paragraph"
*/
private List<Paragraph> classify(String htmlText, Set<String> stopwordsSet, double lengthLow,
double lengthHigh, double stopwordsLow,
double stopwordsHigh, double maxLinkDensity,
private List<Paragraph> classify(String htmlText, Set<String> stopwordsSet, int lengthLow,
int lengthHigh, double stopwordsLow, double stopwordsHigh, double maxLinkDensity,
int maxHeadingDistance)
{

Expand Down

0 comments on commit 3fdbb6a

Please sign in to comment.