Permalink
Browse files

Fixed scraper NullPointerException cases on malformed URLs.

  • Loading branch information...
luccioman committed May 30, 2017
1 parent aa55d71 commit 306a82dd718e03301499e69c48c90e90ba3584ad
Showing 1 changed file with 12 additions and 8 deletions.
  1. +12 −8 source/net/yacy/document/parser/html/ContentScraper.java
@@ -630,17 +630,21 @@ public void scrapeTag0(Tag tag) {
final String name = tag.opts.getProperty("name", EMPTY_STRING);
if (name.equalsIgnoreCase("movie")) {
AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING));
tag.opts.put("value", url.toNormalform(true));
url.setAll(tag.opts);
this.addAnchor(url);
if(url != null) {
tag.opts.put("value", url.toNormalform(true));
url.setAll(tag.opts);
this.addAnchor(url);
}
}
} else if (tag.name.equalsIgnoreCase("iframe")) {
final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
//this.addAnchor(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
if(src != null) {
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
// this.addAnchor(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
}
} else if (tag.name.equalsIgnoreCase("html")) {
final String lang = tag.opts.getProperty("lang", EMPTY_STRING);
if (!lang.isEmpty()) // fake a language meta to preserve detection from <html lang="xx" />

0 comments on commit 306a82d

Please sign in to comment.