+ * The algorithm adds two stages of processing for the header blocks. The first stage
+ * (preprocessing) is executed after context-free classification and before context-sensitive
+ * classification. The second stage (postprocessing) is performed after the context-sensitive
+ * classification:
+ *
+ * "2. The preprocessing looks
+ * for short header blocks which precede good blocks and at the same time there is no more
+ * than MAX_HEADING_DISTANCE characters between the header block and the good block. The
+ * context-free class of such header blocks is changed from short to near-good. The purpose
+ * of this is to preserve short blocks between the heading and the good text which might
+ * otherwise be removed (classified as bad) by the context-sensitive classification."
+ */
for (int i = 0; i < paragraphs.size(); i++) {
Paragraph paragraph = paragraphs.get(i);
- if (!(paragraph.isHeading() && paragraph.getClassType() == PARAGRAPH_TYPE.SHORT)) {
- continue;
- }
- int j = i + 1;
- int distance = 0;
- while (j < paragraphs.size() && distance <= maxHeadingDistance) {
- if (paragraphs.get(j).getClassType() == PARAGRAPH_TYPE.GOOD) {
- paragraph.setClassType(PARAGRAPH_TYPE.NEAR_GOOD);
- break;
+ if (paragraph.isHeading() && paragraph.getClassType() == PARAGRAPH_TYPE.SHORT) {
+ int j = i + 1;
+ int distance = 0;
+ while (j < paragraphs.size() && distance <= maxHeadingDistance) {
+ if (paragraphs.get(j).getClassType() == PARAGRAPH_TYPE.GOOD) {
+ paragraph.setClassType(PARAGRAPH_TYPE.NEAR_GOOD);
+ break;
+ }
+ distance += paragraphs.get(j).getRawText().length();
+ j += 1;
}
- distance += paragraphs.get(j).getRawText().length();
- j += 1;
}
}
@@ -290,33 +318,27 @@ private void reclassifyContextSensitive(List
+ * "4. The postprocessing again looks for header blocks
+ * which precede good blocks and are no further than MAX_HEADING_DISTANCE away. This time,
+ * the matched headers are classified as good if their context-free class was other than
+ * bad. In other words, the bad headings remain bad, but some short and near-good headings
+ * can be classified as good if they precede good blocks, even though they would normally be
+ * classified as bad by the context-sensitive classification (e.g. they are surrounded by
+ * bad blocks). This stage preserves the 'non-bad' headings of good blocks."
+ *
+ */
for (int i = 0; i < paragraphs.size(); i++) {
Paragraph paragraph = paragraphs.get(i);
- if (!(paragraph.isHeading() && paragraph.getClassType() == PARAGRAPH_TYPE.BAD)
+ if (paragraph.isHeading() && paragraph.getClassType() == PARAGRAPH_TYPE.BAD
&& paragraph.getContextFreeClass() != PARAGRAPH_TYPE.BAD) {
- continue;
- }
- int j = i + 1;
- int distance = 0;
- while (j < paragraphs.size() && distance <= maxHeadingDistance) {
- if (paragraphs.get(j).getClassType() == PARAGRAPH_TYPE.GOOD) {
- paragraph.setClassType(PARAGRAPH_TYPE.GOOD);
- break;
+ int j = i + 1;
+ int distance = 0;
+ while (j < paragraphs.size() && distance <= maxHeadingDistance) {
+ if (paragraphs.get(j).getClassType() == PARAGRAPH_TYPE.GOOD) {
+ paragraph.setClassType(PARAGRAPH_TYPE.GOOD);
+ break;
+ }
+ distance += paragraphs.get(j).getRawText().length();
+ j += 1;
}
- distance += paragraphs.get(j).getRawText().length();
- j += 1;
}
-
}
}
@@ -366,9 +395,8 @@ else if ((prevNeighbour == PARAGRAPH_TYPE.BAD && getPrevNeighbourOptimized(i,
* Converts an HTML page into a list of classified paragraphs. Each
* paragraph is represented as an instance of class "Paragraph"
*/
- private List