Permalink
Browse files

added nav filter

  • Loading branch information...
Orbiter committed Mar 10, 2018
1 parent bcbd0ae commit 187075b878f9d6d56bf1fdba3bf70bf0ebe22bbf
@@ -24,12 +24,10 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.AbstractMap;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@@ -39,9 +37,7 @@
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.peers.Seed;
@@ -370,10 +370,10 @@ <h2>Expert Crawl Start</h2>
<legend>Content Filter</legend>
<p>These are limitations on parts of a document. The filter will be applied after a web page was loaded.</p>
<dl>
<dt>Filter div class names</dt>
<dt>Filter div or nav class names</dt>
<dd>
<table style="border-width: 0px">
<tr><td style="width:110px">set of CSS class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of &lt;div&gt; element class names which should be filtered out</td></tr>
<tr><td style="width:110px">set of CSS class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of &lt;div&gt; or &lt;nav&gt; element class names which should be filtered out</td></tr>
</table>
</dd>
</dl>
@@ -152,6 +152,7 @@ public void run() {
if (access != Access.unknown) Scanner.this.services.put(this, access);
}
} catch (final OutOfMemoryError e) {
e.printStackTrace();
}
}
public long age() {
@@ -129,6 +129,7 @@
script(TagType.pair),
span(TagType.pair),
div(TagType.pair),
nav(TagType.pair),
article(TagType.pair), // html5
time(TagType.pair), // html5 <time datetime>
// tags used to capture tag content
@@ -1020,7 +1021,10 @@ public boolean shouldIgnoreTag(final Tag tag, final Tag parentTag) {
}
/* Parent is not marked as ignored : let's check the current tag */
if (!ignore && this.ignoreDivClassNames != null && tag != null && TagName.div.name().equals(tag.name)) {
if (!ignore &&
this.ignoreDivClassNames != null &&
tag != null &&
(TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name))) {
final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
ignore = !Collections.disjoint(this.ignoreDivClassNames, classes);

1 comment on commit 187075b

@stbc

This comment has been minimized.

stbc commented on 187075b Mar 13, 2018

Nice one, thanks. Martin just told me that this was implemented. Would it be possible to include not only class names but also ids?

Please sign in to comment.