
Commit

Fixed issue #158 : completed div CSS class ignore in crawl
luccioman committed Feb 10, 2018
1 parent fa65fb1 commit eb20589
Showing 6 changed files with 208 additions and 44 deletions.
2 changes: 1 addition & 1 deletion htroot/CrawlStartExpert.html
@@ -373,7 +373,7 @@ <h2>Expert Crawl Start</h2>
     <dt>Filter div class names</dt>
     <dd>
       <table border="0">
-        <tr><td width="110">set of class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of div class names which should be filtered out</td></tr>
+        <tr><td width="110">set of CSS class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of &lt;div&gt; element class names which should be filtered out</td></tr>
       </table>
     </dd>
 </dl>
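The field takes a plain comma-separated list of class names; a crawl profile might, for example, use a value such as sidebar,cookie-banner,related-posts to keep navigation and advertising blocks out of the index (these names are purely illustrative; the useful values depend on the markup of the sites being crawled).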
11 changes: 0 additions & 11 deletions source/net/yacy/document/parser/html/AbstractScraper.java
@@ -65,17 +65,6 @@ public boolean isTag1(final String tag) {
         return (this.tags1 != null) && (this.tags1.contains(tag.toLowerCase()));
     }

-    //the 'missing' method that shall be implemented:
-    @Override
-    public abstract void scrapeText(char[] text, String insideTag);
-
-    // the other methods must take into account to construct the return value correctly
-    @Override
-    public abstract void scrapeTag0(ContentScraper.Tag tag);
-
-    @Override
-    public abstract void scrapeTag1(ContentScraper.Tag tag);
-
     public static String stripAllTags(final char[] s) {
         if (s.length > 80 && !MemoryControl.request(s.length * 2, false)) return "";
         final StringBuilder r = new StringBuilder(s.length);
88 changes: 66 additions & 22 deletions source/net/yacy/document/parser/html/ContentScraper.java
@@ -145,6 +145,10 @@ public static class Tag {
         public String name;
         public Properties opts;
         public CharBuffer content;
+
+        /** Set to true when this tag should be ignored from scraping */
+        private boolean ignore = false;
+
         public Tag(final String name) {
             this.name = name;
             this.opts = new Properties();
@@ -174,6 +178,18 @@ public void finalize() {
         public String toString() {
             return "<" + name + " " + opts + ">" + content + "</" + name + ">";
         }
+
+        /** @return true when this tag should be ignored from scraping */
+        public boolean isIgnore() {
+            return this.ignore;
+        }
+
+        /**
+         * @param ignore true when this tag should be ignored from scraping
+         */
+        public void setIgnore(final boolean ignore) {
+            this.ignore = ignore;
+        }
     }

     // all these tags must be given in lowercase, because the tags from the files are compared in lowercase
@@ -216,7 +232,10 @@ public String toString() {
     private final int maxAnchors;

     private final VocabularyScraper vocabularyScraper;
-    private final Set<String> ignore_class_name;
+
+    /** Set of CSS class names whose matching div elements content should be ignored */
+    private final Set<String> ignoreDivClassNames;
+
     private final int timezoneOffset;
     private int breadcrumbs;

@@ -245,18 +264,19 @@ public String toString() {
      * @param root the document root url
      * @param maxAnchors the maximum number of URLs to process and store in the anchors property.
      * @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
+     * @param ignoreDivClassNames an eventual set of CSS class names whose matching div elements content should be ignored
      * @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
      * @param timezoneOffset local time zone offset
      */
     @SuppressWarnings("unchecked")
-    public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+    public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignoreDivClassNames, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
         // the root value here will not be used to load the resource.
         // it is only the reference for relative links
         super(linkTags0, linkTags1);
         assert root != null;
         this.root = root;
         this.vocabularyScraper = vocabularyScraper;
-        this.ignore_class_name = ignore_class_name;
+        this.ignoreDivClassNames = ignoreDivClassNames;
         this.timezoneOffset = timezoneOffset;
         this.evaluationScores = new Evaluation();
         this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
@@ -314,9 +334,15 @@ public void finish() {
     }

     @Override
-    public void scrapeText(final char[] newtext0, final String insideTag) {
-        // System.out.println("SCRAPE: " + UTF8.String(newtext));
-        if (insideTag != null && (TagName.script.name().equals(insideTag) || TagName.style.name().equals(insideTag))) return;
+    public void scrapeText(final char[] newtext0, final Tag insideTag) {
+        if (insideTag != null) {
+            if(insideTag.ignore) {
+                return;
+            }
+            if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) {
+                return;
+            }
+        }
         int p, pl, q, s = 0;
         char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();

@@ -377,7 +403,7 @@ public void scrapeText(final char[] newtext0, final String insideTag) {
         }
         // find tags inside text
         String b = cleanLine(stripAllTags(newtext));
-        if ((insideTag != null) && (!(insideTag.equals("a")))) {
+        if ((insideTag != null) && (!(insideTag.name.equals(TagName.a.name())))) {
             // texts inside tags sometimes have no punctuation at the line end
             // this is bad for the text semantics, because it is not possible for the
             // condenser to distinguish headlines from text beginnings.
@@ -697,6 +723,9 @@ public Set<String> retainIconRelations(Collection<String> relTokens) {
      */
     @Override
     public void scrapeTag0(final Tag tag) {
+        if(tag.ignore) {
+            return;
+        }
         checkOpts(tag);
         if (tag.name.equalsIgnoreCase("img")) {
             final String src = tag.opts.getProperty("src", EMPTY_STRING);
@@ -861,6 +890,9 @@ public void scrapeTag0(final Tag tag) {
      */
     @Override
     public void scrapeTag1(final Tag tag) {
+        if(tag.ignore) {
+            return;
+        }
         checkOpts(tag);
         // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
         if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
@@ -882,18 +914,12 @@ public void scrapeTag1(final Tag tag) {
         }
         final String h;
         if (tag.name.equalsIgnoreCase("div")) {
-            final String classn = tag.opts.getProperty("class", EMPTY_STRING);
-            if (classn.length() > 0 && this.ignore_class_name.contains(classn)) {
-                // we remove everything inside that tag, so it can be ignored
-                tag.content.clear();
-            } else {
-                final String id = tag.opts.getProperty("id", EMPTY_STRING);
-                this.evaluationScores.match(Element.divid, id);
-                final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
-                if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
-                    breadcrumbs++;
-                }
-            }
+            final String id = tag.opts.getProperty("id", EMPTY_STRING);
+            this.evaluationScores.match(Element.divid, id);
+            final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
+            if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
+                breadcrumbs++;
+            }
         } else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
             h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.headlines[0].add(h);
@@ -974,14 +1000,32 @@ public void scrapeTag1(final Tag tag) {
      * {@link ContentScraper#linkTags0} and {@link ContentScraper#linkTags1}.
      */
     @Override
-    public void scrapeAnyTagOpening(final String tagName, final Properties tagAttributes) {
-        if (tagAttributes != null) {
+    public void scrapeAnyTagOpening(final Tag tag) {
+        if (tag != null && !tag.ignore && tag.opts != null) {
             /*
              * HTML microdata can be annotated on any kind of tag, so we don't restrict this
              * scraping to the limited sets in linkTags0 and linkTags1
              */
-            this.linkedDataTypes.addAll(parseMicrodataItemType(tagAttributes));
+            this.linkedDataTypes.addAll(parseMicrodataItemType(tag.opts));
         }
     }

+    @Override
+    public boolean shouldIgnoreTag(final Tag tag, final Tag parentTag) {
+        boolean ignore = false;
+
+        /* First, inherit ignore property from eventual parent */
+        if(parentTag != null) {
+            ignore = parentTag.ignore;
+        }
+
+        /* Parent is not marked as ignored : let's check the current tag */
+        if (!ignore && this.ignoreDivClassNames != null && tag != null && TagName.div.name().equals(tag.name)) {
+            final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
+            final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
+            ignore = !Collections.disjoint(this.ignoreDivClassNames, classes);
+        }
+        return ignore;
+    }

     /**
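The new shouldIgnoreTag() above is where the class matching happens: a div is ignored when at least one token of its class attribute is contained in the configured set, and the flag is then inherited by everything nested inside. A minimal standalone sketch of that matching step follows (the class name, main method and the local parseSpaceSeparatedTokens are illustrative stand-ins, not YaCy API; only the space-separated tokenisation and the Collections.disjoint test mirror the committed code):

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

/**
 * Standalone sketch of the class-name matching used by shouldIgnoreTag():
 * a div is ignored when at least one token of its class attribute occurs in
 * the configured ignore set. Names here are illustrative, not YaCy API.
 */
public class DivIgnoreSketch {

    /** Simplified stand-in for ContentScraper.parseSpaceSeparatedTokens(). */
    static Set<String> parseSpaceSeparatedTokens(final String attr) {
        final Set<String> tokens = new HashSet<>();
        for (final String token : attr.trim().split("\\s+")) {
            if (!token.isEmpty()) {
                tokens.add(token);
            }
        }
        return tokens;
    }

    /** True when the div's class tokens intersect the set of ignored class names. */
    static boolean ignoreDiv(final String classAttr, final Set<String> ignoredClassNames) {
        final Set<String> classes = parseSpaceSeparatedTokens(classAttr);
        return !Collections.disjoint(ignoredClassNames, classes);
    }

    public static void main(final String[] args) {
        final Set<String> ignored = new HashSet<>(Arrays.asList("sidebar", "cookie-banner"));
        System.out.println(ignoreDiv("content main", ignored));   // false
        System.out.println(ignoreDiv("sidebar left", ignored));   // true
    }
}

Matching on individual class tokens, rather than on the full attribute value as the removed scrapeTag1() branch did, means a configured name such as sidebar also catches elements declared as <div class="sidebar left">.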
23 changes: 17 additions & 6 deletions source/net/yacy/document/parser/html/Scraper.java
@@ -24,8 +24,6 @@

 package net.yacy.document.parser.html;

-import java.util.Properties;
-
 public interface Scraper {

     /**
@@ -50,7 +48,12 @@ public interface Scraper {
      */
     public boolean isTag1(String tag);

-    public void scrapeText(char[] text, String insideTag);
+    /**
+     * Process plain text
+     * @param plain text to process
+     * @param insideTag the eventual direct parent tag. May be null.
+     */
+    public void scrapeText(char[] text, ContentScraper.Tag insideTag);

     /**
      * Process a tag belonging to the first category of tags according to the Scraper implementation
@@ -66,10 +69,18 @@ public interface Scraper {

     /**
      * Processing applied to any kind of tag opening.
-     * @param tagName the tag name
-     * @param tagAttributes the atttributes of the tag
+     * @param tag a parsed tag
      */
-    public void scrapeAnyTagOpening(String tagName, Properties tagAttributes);
+    public void scrapeAnyTagOpening(ContentScraper.Tag tag);
+
+    /**
+     * @param tag
+     *            a parsed tag
+     * @param parentTag the eventual parent tag
+     * @return true when the tag should be ignored according to the scraper
+     *         implementation rules
+     */
+    public boolean shouldIgnoreTag(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag);

     public void scrapeComment(final char[] comment);

26 changes: 22 additions & 4 deletions source/net/yacy/document/parser/html/TransformerWriter.java
@@ -232,15 +232,19 @@ private char[] filterTag(final char[] content) {
         if (this.tagStack.size() == 0) {
             // we are not collection tag text -> case (1) - (3)
             // case (1): this is not a tag opener/closer
-            if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
-            if (this.transformer != null) return this.transformer.transformText(content);
+            if (this.scraper != null && content.length > 0) {
+                this.scraper.scrapeText(content, null);
+            }
+            if (this.transformer != null) {
+                return this.transformer.transformText(content);
+            }
             return content;
         }

         // we are collection tag text for the tag 'filterTag' -> case (4) - (7)
         // case (4): getting no tag, go on collecting content
         if (this.scraper != null) {
-            this.scraper.scrapeText(content, this.tagStack.lastElement().name);
+            this.scraper.scrapeText(content, this.tagStack.lastElement());
         }
         if (this.transformer != null) {
             this.tagStack.lastElement().content.append(this.transformer.transformText(content));
@@ -293,8 +297,22 @@ private char[] filterTagOpening(final String tagname, final char[] content, fina
             ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser());
             charBuffer.close();

+            final ContentScraper.Tag parentTag;
+            if(this.tagStack.size() > 0) {
+                parentTag = this.tagStack.lastElement();
+            } else {
+                parentTag = null;
+            }
+
+            /* Check scraper ignoring rules */
+            if (this.scraper != null && this.scraper.shouldIgnoreTag(tag, parentTag)) {
+                tag.setIgnore(true);
+            }
+
             /* Apply processing relevant for any kind of tag opening */
-            this.scraper.scrapeAnyTagOpening(tag.name, tag.opts);
+            if(this.scraper != null) {
+                this.scraper.scrapeAnyTagOpening(tag);
+            }

             if (this.scraper != null && this.scraper.isTag0(tagname)) {
                 // this single tag is collected at once here
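TransformerWriter is the piece that wires the flag into the parse: when a tag opens, the current top of the tag stack is passed as parentTag, so the ignore decision taken for an outer div cascades to every nested element and to the text scraped inside it. A compact sketch of that propagation under the same assumptions (the SimpleTag type and the openTag/closeTag helpers are illustrative, not the actual TransformerWriter types):

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Set;

/** Illustrative sketch: how an ignore decision on an outer div cascades to nested tags. */
public class IgnorePropagationSketch {

    static final class SimpleTag {
        final String name;
        final String classAttr;
        boolean ignore;

        SimpleTag(final String name, final String classAttr) {
            this.name = name;
            this.classAttr = classAttr;
        }
    }

    private final Deque<SimpleTag> tagStack = new ArrayDeque<>();
    private final Set<String> ignoredClassNames;

    IgnorePropagationSketch(final Set<String> ignoredClassNames) {
        this.ignoredClassNames = ignoredClassNames;
    }

    /** Mirrors filterTagOpening(): decide before pushing the tag onto the stack. */
    void openTag(final SimpleTag tag) {
        final SimpleTag parent = this.tagStack.peek();
        boolean ignore = parent != null && parent.ignore;   // inherit from the enclosing element
        if (!ignore && "div".equals(tag.name) && tag.classAttr != null) {
            for (final String cls : tag.classAttr.split("\\s+")) {
                if (this.ignoredClassNames.contains(cls)) {  // a class token matches the filter
                    ignore = true;
                    break;
                }
            }
        }
        tag.ignore = ignore;
        this.tagStack.push(tag);
    }

    /** Mirrors the closing side: pop when the tag ends. */
    void closeTag() {
        this.tagStack.pop();
    }

    /** Text is only scraped when the enclosing tag is not ignored. */
    boolean shouldScrapeText() {
        final SimpleTag current = this.tagStack.peek();
        return current == null || !current.ignore;
    }
}

Because the decision is made once at tag opening and stored on the Tag object, scrapeText(), scrapeTag0() and scrapeTag1() can each bail out with a single boolean check, as the ContentScraper diff above shows.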