Skip to content

Commit

Permalink
Reduce log verbosity of RSS loader on feed items with no link
Browse files: browse the repository at this point in the history
  • Loading branch information
luccioman committed Mar 20, 2018
1 parent cf62b57 commit aaefd52
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 56 deletions.
6 changes: 3 additions & 3 deletions htroot/Load_RSS_p.html
Expand Up @@ -154,10 +154,10 @@ <h2>Loading of RSS Feeds</h2>
</tr> </tr>
#{item}# #{item}#
<tr class="TableCellLight"> <tr class="TableCellLight">
<td align="left">#(state)#<input type="checkbox" name="item_#[count]#" value="mark_#[guid]#" />::&nbsp;::&nbsp;#(/state)#</td> <td align="left">#(indexable)#&nbsp;::<input type="checkbox" name="item_#[count]#" value="mark_#[guid]#" />#(/indexable)#</td>
#(state)#<td>new</td>::<td class="info">enqueued</td>::<td class="commit">indexed</td>#(/state)# #(state)#<td>new</td>::<td class="info">enqueued</td>::<td class="commit">indexed</td>#(/state)#
<td><a href="#[link]#">#[title]#</a></td> <td>#(hasLink)#::<a href="#[link]#">#(/hasLink)##[title]##(hasLink)#::</a>#(/hasLink)#</td>
<td><a href="#[link]#">#[link]#</a></td> <td>#(hasLink)#::<a href="#[link]#">#[link]#</a>#(/hasLink)#</td>
<td>#[author]#</td> <td>#[author]#</td>
<td>#[language]#</td> <td>#[language]#</td>
<td>#[date]#</td> <td>#[date]#</td>
Expand Down
108 changes: 65 additions & 43 deletions htroot/Load_RSS_p.java
Expand Up @@ -29,6 +29,8 @@
import java.util.Map; import java.util.Map;
import java.util.regex.Pattern; import java.util.regex.Pattern;


import org.apache.commons.lang.StringUtils;

import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.Hit; import net.yacy.cora.document.feed.Hit;
Expand Down Expand Up @@ -57,7 +59,7 @@
import net.yacy.server.serverSwitch; import net.yacy.server.serverSwitch;


public class Load_RSS_p { public class Load_RSS_p {

public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {


final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
Expand Down Expand Up @@ -285,31 +287,44 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
// index all selected items: description only // index all selected items: description only
if (rss != null && post.containsKey("indexSelectedItemContent")) { if (rss != null && post.containsKey("indexSelectedItemContent")) {
final RSSFeed feed = rss.getFeed(); final RSSFeed feed = rss.getFeed();
List<DigestURL> list = new ArrayList<DigestURL>(); final Map<String, DigestURL> hash2UrlMap = new HashMap<String, DigestURL>();
Map<String, RSSMessage> messages = new HashMap<String, RSSMessage>();
loop: for (final Map.Entry<String, String> entry: post.entrySet()) { loop: for (final Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) try { if (entry.getValue().startsWith("mark_")) {
final RSSMessage message = feed.getMessage(entry.getValue().substring(5)); final RSSMessage message = feed.getMessage(entry.getValue().substring(5));
final DigestURL messageurl = new DigestURL(message.getLink()); if(message == null || StringUtils.isBlank(message.getLink())) {
if (RSSLoader.indexTriggered.containsKey(messageurl.hash())) continue loop; /* Link element is optional in RSS 2.0 and Atom */
messages.put(ASCII.String(messageurl.hash()), message); continue loop;
} catch (final IOException e) { }
ConcurrentLog.logException(e); DigestURL messageUrl;
try {
messageUrl = new DigestURL(message.getLink());
} catch (MalformedURLException e) {
ConcurrentLog.warn("Load_RSS", "Malformed feed item link URL : " + message.getLink());
continue loop;
}
if (RSSLoader.indexTriggered.containsKey(messageUrl.hash())) {
continue loop;
}
hash2UrlMap.put(ASCII.String(messageUrl.hash()), messageUrl);
} }
} }
loop: for (final Map.Entry<String, RSSMessage> entry: messages.entrySet()) {
final List<DigestURL> urlsToIndex = new ArrayList<DigestURL>();
loop: for (final Map.Entry<String, DigestURL> entry: hash2UrlMap.entrySet()) {
try { try {
final RSSMessage message = entry.getValue(); final DigestURL messageUrl = entry.getValue();
final DigestURL messageurl = new DigestURL(message.getLink()); HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageUrl.hash()));
HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageurl.hash())); if (harvestProcess != null) {
if (harvestProcess != null) continue loop; continue loop;
list.add(messageurl); }
RSSLoader.indexTriggered.insertIfAbsent(messageurl.hash(), new Date()); urlsToIndex.add(messageUrl);
RSSLoader.indexTriggered.insertIfAbsent(messageUrl.hash(), new Date());
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
} }
sb.addToIndex(list, null, null, collections, true);
sb.addToIndex(urlsToIndex, null, null, collections, true);
} }


if (rss != null && post.containsKey("indexAllItemContent")) { if (rss != null && post.containsKey("indexAllItemContent")) {
Expand Down Expand Up @@ -339,33 +354,44 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
prop.putHTML("showitems_ttl", channel == null ? "" : channel.getTTL()); prop.putHTML("showitems_ttl", channel == null ? "" : channel.getTTL());
prop.put("showitems_docs", feed.size()); // number of documents prop.put("showitems_docs", feed.size()); // number of documents


Map<String, DigestURL> urls = new HashMap<String, DigestURL>();
for (final Hit item: feed) {
try {
final DigestURL messageurl = new DigestURL(item.getLink());
urls.put(ASCII.String(messageurl.hash()), messageurl);
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
continue;
}
}

int i = 0; int i = 0;
for (final Hit item: feed) { for (final Hit item: feed) {
try { DigestURL link = null;
final DigestURL messageurl = new DigestURL(item.getLink()); final String linkStr = item.getLink();
author = item.getAuthor(); if(StringUtils.isNotBlank(linkStr)) {
if (author == null) author = item.getCopyright(); /* Link element is optional in RSS 2.0 and Atom */
pubDate = item.getPubDate(); try {
HarvestProcess harvestProcess; link = new DigestURL(linkStr);
} catch (final MalformedURLException e) {
ConcurrentLog.warn("Load_RSS", "Malformed feed item link URL : " + linkStr);
}
}

author = item.getAuthor();
if (author == null) {
author = item.getCopyright();
}
pubDate = item.getPubDate();

HarvestProcess harvestProcess;
try { try {
harvestProcess = sb.urlExists(ASCII.String(messageurl.hash())); if(link != null && StringUtils.isNotEmpty(item.getGuid())) {
prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0); harvestProcess = sb.urlExists(ASCII.String(link.hash()));
prop.put("showitems_item_" + i + "_state_count", i);
prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid()); prop.put("showitems_item_" + i + "_hasLink", true);
prop.putHTML("showitems_item_" + i + "_hasLink_link", link.toNormalform(true));
final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(link.hash()) ? 1 : 0;
prop.put("showitems_item_" + i + "_state", state);
prop.put("showitems_item_" + i + "_indexable", state == 0);
prop.put("showitems_item_" + i + "_indexable_count", i);
prop.putHTML("showitems_item_" + i + "_indexable_guid", item.getGuid());
} else {
prop.put("showitems_item_" + i + "_state", 0);
prop.put("showitems_item_" + i + "_indexable", false);
prop.put("showitems_item_" + i + "_hasLink", false);
}
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author); prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
prop.putHTML("showitems_item_" + i + "_title", item.getTitle()); prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true));
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString()); prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage()); prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate)); prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
Expand All @@ -374,10 +400,6 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
continue; continue;
} }
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
continue;
}
} }
prop.put("showitems_item", i); prop.put("showitems_item", i);
prop.put("showitems_num", i); prop.put("showitems_num", i);
Expand Down
31 changes: 21 additions & 10 deletions source/net/yacy/crawler/retrieval/RSSLoader.java
Expand Up @@ -33,6 +33,8 @@
import java.util.Map; import java.util.Map;
import java.util.regex.Pattern; import java.util.regex.Pattern;


import org.apache.commons.lang.StringUtils;

import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.feed.RSSFeed;
Expand Down Expand Up @@ -97,22 +99,31 @@ public void run() {


public static void indexAllRssFeed(final Switchboard sb, final DigestURL url, final RSSFeed feed, Map<String, Pattern> collections) { public static void indexAllRssFeed(final Switchboard sb, final DigestURL url, final RSSFeed feed, Map<String, Pattern> collections) {
int loadCount = 0; int loadCount = 0;
List<DigestURL> list = new ArrayList<DigestURL>(); final Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
for (final RSSMessage message: feed) { for (final RSSMessage message: feed) {
try { final String linkStr = message.getLink();
final DigestURL messageurl = new DigestURL(message.getLink()); if(StringUtils.isNotBlank(linkStr)) { // Link element is optional in RSS 2.0 and Atom
if (indexTriggered.containsKey(messageurl.hash())) continue; DigestURL messageurl;
urlmap.put(ASCII.String(messageurl.hash()), messageurl); try {
} catch (final IOException e) { messageurl = new DigestURL(linkStr);
ConcurrentLog.logException(e); if (indexTriggered.containsKey(messageurl.hash())) {
} continue;
}
urlmap.put(ASCII.String(messageurl.hash()), messageurl);
} catch (MalformedURLException e1) {
ConcurrentLog.warn("Load_RSS", "Malformed feed item link URL : " + linkStr);
}
}
} }

final List<DigestURL> list = new ArrayList<DigestURL>();
for (final Map.Entry<String, DigestURL> e: urlmap.entrySet()) { for (final Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
HarvestProcess harvestProcess; HarvestProcess harvestProcess;
try { try {
harvestProcess = sb.urlExists(e.getKey()); harvestProcess = sb.urlExists(e.getKey());
if (harvestProcess != null) continue; if (harvestProcess != null) {
continue;
}
list.add(e.getValue()); list.add(e.getValue());
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date()); indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
loadCount++; loadCount++;
Expand Down

0 comments on commit aaefd52

Please sign in to comment.