Skip to content
Permalink
Browse files

Reduce log verbosity of RSS loader on feed items with no link

  • Loading branch information...
luccioman committed Mar 20, 2018
1 parent cf62b57 commit aaefd5219cd417092338b325c0c6f92360cc07e9
Showing with 89 additions and 56 deletions.
  1. +3 −3 htroot/Load_RSS_p.html
  2. +65 −43 htroot/Load_RSS_p.java
  3. +21 −10 source/net/yacy/crawler/retrieval/RSSLoader.java
@@ -154,10 +154,10 @@ <h2>Loading of RSS Feeds</h2>
</tr>
#{item}#
<tr class="TableCellLight">
<td align="left">#(state)#<input type="checkbox" name="item_#[count]#" value="mark_#[guid]#" />::&nbsp;::&nbsp;#(/state)#</td>
<td align="left">#(indexable)#&nbsp;::<input type="checkbox" name="item_#[count]#" value="mark_#[guid]#" />#(/indexable)#</td>
#(state)#<td>new</td>::<td class="info">enqueued</td>::<td class="commit">indexed</td>#(/state)#
<td><a href="#[link]#">#[title]#</a></td>
<td><a href="#[link]#">#[link]#</a></td>
<td>#(hasLink)#::<a href="#[link]#">#(/hasLink)##[title]##(hasLink)#::</a>#(/hasLink)#</td>
<td>#(hasLink)#::<a href="#[link]#">#[link]#</a>#(/hasLink)#</td>
<td>#[author]#</td>
<td>#[language]#</td>
<td>#[date]#</td>
@@ -29,6 +29,8 @@
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.Hit;
@@ -57,7 +59,7 @@
import net.yacy.server.serverSwitch;

public class Load_RSS_p {

public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {

final serverObjects prop = new serverObjects();
@@ -285,31 +287,44 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
// index all selected items: description only
if (rss != null && post.containsKey("indexSelectedItemContent")) {
final RSSFeed feed = rss.getFeed();
List<DigestURL> list = new ArrayList<DigestURL>();
Map<String, RSSMessage> messages = new HashMap<String, RSSMessage>();
final Map<String, DigestURL> hash2UrlMap = new HashMap<String, DigestURL>();
loop: for (final Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) try {
if (entry.getValue().startsWith("mark_")) {
final RSSMessage message = feed.getMessage(entry.getValue().substring(5));
final DigestURL messageurl = new DigestURL(message.getLink());
if (RSSLoader.indexTriggered.containsKey(messageurl.hash())) continue loop;
messages.put(ASCII.String(messageurl.hash()), message);
} catch (final IOException e) {
ConcurrentLog.logException(e);
if(message == null || StringUtils.isBlank(message.getLink())) {
/* The link element is optional in RSS 2.0 and Atom: skip items without one, without logging anything */
continue loop;
}
DigestURL messageUrl;
try {
messageUrl = new DigestURL(message.getLink());
} catch (MalformedURLException e) {
ConcurrentLog.warn("Load_RSS", "Malformed feed item link URL : " + message.getLink());
continue loop;
}
if (RSSLoader.indexTriggered.containsKey(messageUrl.hash())) {
continue loop;
}
hash2UrlMap.put(ASCII.String(messageUrl.hash()), messageUrl);
}
}
loop: for (final Map.Entry<String, RSSMessage> entry: messages.entrySet()) {

final List<DigestURL> urlsToIndex = new ArrayList<DigestURL>();
loop: for (final Map.Entry<String, DigestURL> entry: hash2UrlMap.entrySet()) {
try {
final RSSMessage message = entry.getValue();
final DigestURL messageurl = new DigestURL(message.getLink());
HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageurl.hash()));
if (harvestProcess != null) continue loop;
list.add(messageurl);
RSSLoader.indexTriggered.insertIfAbsent(messageurl.hash(), new Date());
final DigestURL messageUrl = entry.getValue();
HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageUrl.hash()));
if (harvestProcess != null) {
continue loop;
}
urlsToIndex.add(messageUrl);
RSSLoader.indexTriggered.insertIfAbsent(messageUrl.hash(), new Date());
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
}
sb.addToIndex(list, null, null, collections, true);

sb.addToIndex(urlsToIndex, null, null, collections, true);
}

if (rss != null && post.containsKey("indexAllItemContent")) {
@@ -339,33 +354,44 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
prop.putHTML("showitems_ttl", channel == null ? "" : channel.getTTL());
prop.put("showitems_docs", feed.size()); // number of documents

Map<String, DigestURL> urls = new HashMap<String, DigestURL>();
for (final Hit item: feed) {
try {
final DigestURL messageurl = new DigestURL(item.getLink());
urls.put(ASCII.String(messageurl.hash()), messageurl);
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
continue;
}
}

int i = 0;
for (final Hit item: feed) {
try {
final DigestURL messageurl = new DigestURL(item.getLink());
author = item.getAuthor();
if (author == null) author = item.getCopyright();
pubDate = item.getPubDate();
HarvestProcess harvestProcess;
DigestURL link = null;
final String linkStr = item.getLink();
if(StringUtils.isNotBlank(linkStr)) {
/* The link element is optional in RSS 2.0 and Atom: only try to parse it when one is present */
try {
link = new DigestURL(linkStr);
} catch (final MalformedURLException e) {
ConcurrentLog.warn("Load_RSS", "Malformed feed item link URL : " + linkStr);
}
}

author = item.getAuthor();
if (author == null) {
author = item.getCopyright();
}
pubDate = item.getPubDate();

HarvestProcess harvestProcess;
try {
harvestProcess = sb.urlExists(ASCII.String(messageurl.hash()));
prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
prop.put("showitems_item_" + i + "_state_count", i);
prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid());
if(link != null && StringUtils.isNotEmpty(item.getGuid())) {
harvestProcess = sb.urlExists(ASCII.String(link.hash()));

prop.put("showitems_item_" + i + "_hasLink", true);
prop.putHTML("showitems_item_" + i + "_hasLink_link", link.toNormalform(true));
final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(link.hash()) ? 1 : 0;
prop.put("showitems_item_" + i + "_state", state);
prop.put("showitems_item_" + i + "_indexable", state == 0);
prop.put("showitems_item_" + i + "_indexable_count", i);
prop.putHTML("showitems_item_" + i + "_indexable_guid", item.getGuid());
} else {
prop.put("showitems_item_" + i + "_state", 0);
prop.put("showitems_item_" + i + "_indexable", false);
prop.put("showitems_item_" + i + "_hasLink", false);
}
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true));
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
@@ -374,10 +400,6 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
ConcurrentLog.logException(e);
continue;
}
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
continue;
}
}
prop.put("showitems_item", i);
prop.put("showitems_num", i);
@@ -33,6 +33,8 @@
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.RSSFeed;
@@ -97,22 +99,31 @@ public void run() {

public static void indexAllRssFeed(final Switchboard sb, final DigestURL url, final RSSFeed feed, Map<String, Pattern> collections) {
int loadCount = 0;
List<DigestURL> list = new ArrayList<DigestURL>();
Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
final Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
for (final RSSMessage message: feed) {
try {
final DigestURL messageurl = new DigestURL(message.getLink());
if (indexTriggered.containsKey(messageurl.hash())) continue;
urlmap.put(ASCII.String(messageurl.hash()), messageurl);
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
final String linkStr = message.getLink();
if(StringUtils.isNotBlank(linkStr)) { // Link element is optional in RSS 2.0 and Atom
DigestURL messageurl;
try {
messageurl = new DigestURL(linkStr);
if (indexTriggered.containsKey(messageurl.hash())) {
continue;
}
urlmap.put(ASCII.String(messageurl.hash()), messageurl);
} catch (MalformedURLException e1) {
ConcurrentLog.warn("Load_RSS", "Malformed feed item link URL : " + linkStr);
}
}
}

final List<DigestURL> list = new ArrayList<DigestURL>();
for (final Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
HarvestProcess harvestProcess;
try {
harvestProcess = sb.urlExists(e.getKey());
if (harvestProcess != null) continue;
if (harvestProcess != null) {
continue;
}
list.add(e.getValue());
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
loadCount++;

0 comments on commit aaefd52

Please sign in to comment.
You can’t perform that action at this time.