Skip to content
Permalink
Browse files

Added support for enclosures (media links) to the RSS loader

  • Loading branch information...
luccioman committed Mar 21, 2018
1 parent 29166e7 commit e45afedee4b9c809a4e3ac3da002dd93283543be
Showing with 123 additions and 37 deletions.
  1. +2 −2 htroot/Load_RSS_p.html
  2. +95 −31 htroot/Load_RSS_p.java
  3. +26 −4 source/net/yacy/crawler/retrieval/RSSLoader.java
@@ -154,14 +154,14 @@ <h2>Loading of RSS Feeds</h2>
</tr>
#{item}#
<tr class="TableCellLight">
<td align="left">#(indexable)#&nbsp;::<input type="checkbox" name="item_#[count]#" value="mark_#[guid]#" />#(/indexable)#</td>
<td align="left">#(indexable)#&nbsp;::<input type="checkbox" name="item_#[count]#" value="#[inputValue]#" />#(/indexable)#</td>
#(state)#<td>new</td>::<td class="info">enqueued</td>::<td class="commit">indexed</td>#(/state)#
<td>#(hasLink)#::<a href="#[link]#">#(/hasLink)##[title]##(hasLink)#::</a>#(/hasLink)#</td>
<td>#(hasLink)#::<a href="#[link]#">#[link]#</a>#(/hasLink)#</td>
<td>#[author]#</td>
<td>#[language]#</td>
<td>#[date]#</td>
<td>#[description]#</td>
<td>#[description]##(defaultMediaDesc)#::Attached media#(/defaultMediaDesc)#</td>
</tr>
#{/item}#
</table>
@@ -60,6 +60,12 @@

public class Load_RSS_p {

/** Value prefix of checkbox inputs used to select items */
private static final String CHECKBOX_ITEM_PREFIX = "mark_";

/** Value prefix of checkbox inputs used to select media items */
private static final String CHECKBOX_MEDIA_ITEM_PREFIX = "media_";

public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {

final serverObjects prop = new serverObjects();
@@ -79,8 +85,8 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea

if (post != null && post.containsKey("removeSelectedFeedsNewList")) {
for (final Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) try {
sb.tables.delete("rss", entry.getValue().substring(5).getBytes());
if (entry.getValue().startsWith(CHECKBOX_ITEM_PREFIX)) try {
sb.tables.delete("rss", entry.getValue().substring(CHECKBOX_ITEM_PREFIX.length()).getBytes());
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
@@ -114,8 +120,8 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea

if (post != null && post.containsKey("removeSelectedFeedsScheduler")) {
for (final Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) try {
final byte[] pk = entry.getValue().substring(5).getBytes();
if (entry.getValue().startsWith(CHECKBOX_ITEM_PREFIX)) try {
final byte[] pk = entry.getValue().substring(CHECKBOX_ITEM_PREFIX.length()).getBytes();
final Row rssRow = sb.tables.select("rss", pk);
final byte[] schedulerPK = rssRow.get("api_pk", (byte[]) null);
if (schedulerPK != null) sb.tables.delete("api", schedulerPK);
@@ -161,10 +167,10 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
if (post != null && post.containsKey("addSelectedFeedScheduler")) {
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
for (final Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) {
if (entry.getValue().startsWith(CHECKBOX_ITEM_PREFIX)) {
Row row;
try {
final byte [] pk = entry.getValue().substring(5).getBytes();
final byte [] pk = entry.getValue().substring(CHECKBOX_ITEM_PREFIX.length()).getBytes();
row = sb.tables.select("rss", pk);
} catch (final IOException e) {
ConcurrentLog.logException(e);
@@ -289,8 +295,9 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
final RSSFeed feed = rss.getFeed();
final Map<String, DigestURL> hash2UrlMap = new HashMap<String, DigestURL>();
loop: for (final Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) {
final RSSMessage message = feed.getMessage(entry.getValue().substring(5));
if (entry.getValue().startsWith(CHECKBOX_ITEM_PREFIX)) {
/* Process selected item links */
final RSSMessage message = feed.getMessage(entry.getValue().substring(CHECKBOX_ITEM_PREFIX.length()));
if(message == null || StringUtils.isBlank(message.getLink())) {
/* Link element is optional in RSS 2.0 and Atom */
continue loop;
@@ -306,6 +313,24 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
continue loop;
}
hash2UrlMap.put(ASCII.String(messageUrl.hash()), messageUrl);
} else if(entry.getValue().startsWith(CHECKBOX_MEDIA_ITEM_PREFIX)) {
/* Process selected item enclosure (media) links */
final RSSMessage message = feed.getMessage(entry.getValue().substring(CHECKBOX_MEDIA_ITEM_PREFIX.length()));
if(message == null || StringUtils.isBlank(message.getEnclosure())) {
/* Enclosure element is optional */
continue loop;
}
DigestURL mediaUrl;
try {
mediaUrl = new DigestURL(message.getEnclosure());
} catch (MalformedURLException e) {
ConcurrentLog.warn("Load_RSS", "Malformed feed item enclosure URL : " + message.getEnclosure());
continue loop;
}
if (RSSLoader.indexTriggered.containsKey(mediaUrl.hash())) {
continue loop;
}
hash2UrlMap.put(ASCII.String(mediaUrl.hash()), mediaUrl);
}
}

@@ -366,6 +391,21 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
ConcurrentLog.warn("Load_RSS", "Malformed feed item link URL : " + linkStr);
}
}

DigestURL enclosure = null;
final String enclosureStr = item.getEnclosure();
if(StringUtils.isNotBlank(enclosureStr)) {
try {
enclosure = new DigestURL(enclosureStr);
} catch (final MalformedURLException e) {
ConcurrentLog.warn("Load_RSS", "Malformed feed item enclosure URL : " + enclosureStr);
}
}

if(link == null) {
/* No link in this feed item : we use the enclosure media URL as the main link */
link = enclosure;
}

author = item.getAuthor();
if (author == null) {
@@ -374,32 +414,56 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
pubDate = item.getPubDate();

HarvestProcess harvestProcess;
try {
if(link != null && StringUtils.isNotEmpty(item.getGuid())) {
harvestProcess = sb.urlExists(ASCII.String(link.hash()));
try {
if(link != null && StringUtils.isNotEmpty(item.getGuid())) {
harvestProcess = sb.urlExists(ASCII.String(link.hash()));

prop.put("showitems_item_" + i + "_hasLink", true);
prop.putHTML("showitems_item_" + i + "_hasLink_link", link.toNormalform(true));
final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(link.hash()) ? 1 : 0;
prop.put("showitems_item_" + i + "_state", state);
prop.put("showitems_item_" + i + "_indexable", state == 0);
prop.put("showitems_item_" + i + "_indexable_count", i);
prop.putHTML("showitems_item_" + i + "_indexable_inputValue", (link == enclosure ? CHECKBOX_MEDIA_ITEM_PREFIX : CHECKBOX_ITEM_PREFIX) + item.getGuid());
} else {
prop.put("showitems_item_" + i + "_state", 0);
prop.put("showitems_item_" + i + "_indexable", false);
prop.put("showitems_item_" + i + "_hasLink", false);
}
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
prop.put("showitems_item_" + i + "_defaultMediaDesc", false);
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
i++;
} catch (IOException e) {
ConcurrentLog.logException(e);
}

try {
if(enclosure != null && enclosure != link && StringUtils.isNotEmpty(item.getGuid())) {
harvestProcess = sb.urlExists(ASCII.String(enclosure.hash()));

prop.put("showitems_item_" + i + "_hasLink", true);
prop.putHTML("showitems_item_" + i + "_hasLink_link", link.toNormalform(true));
final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(link.hash()) ? 1 : 0;
prop.put("showitems_item_" + i + "_state", state);
prop.put("showitems_item_" + i + "_indexable", state == 0);
prop.put("showitems_item_" + i + "_indexable_count", i);
prop.putHTML("showitems_item_" + i + "_indexable_guid", item.getGuid());
} else {
prop.put("showitems_item_" + i + "_state", 0);
prop.put("showitems_item_" + i + "_indexable", false);
prop.put("showitems_item_" + i + "_hasLink", false);
}
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
prop.put("showitems_item_" + i + "_hasLink", true);
prop.putHTML("showitems_item_" + i + "_hasLink_link", enclosure.toNormalform(true));
final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(enclosure.hash()) ? 1 : 0;
prop.put("showitems_item_" + i + "_state", state);
prop.put("showitems_item_" + i + "_indexable", state == 0);
prop.put("showitems_item_" + i + "_indexable_count", i);
prop.putHTML("showitems_item_" + i + "_indexable_inputValue", "media_" + item.getGuid());
prop.putHTML("showitems_item_" + i + "_author", "");
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
prop.putHTML("showitems_item_" + i + "_description", "");
/* The description is already shown with the main item link, so use a default one here */
prop.put("showitems_item_" + i + "_defaultMediaDesc", true);
prop.putHTML("showitems_item_" + i + "_language", "");
prop.putHTML("showitems_item_" + i + "_date", "");
i++;
} catch (IOException e) {
ConcurrentLog.logException(e);
continue;
}
}
} catch (IOException e) {
ConcurrentLog.logException(e);
}
}
prop.put("showitems_item", i);
prop.put("showitems_num", i);
@@ -97,7 +97,14 @@ public void run() {
recordAPI(this.sb, null, this.urlf, feed, 7, "seldays");
}

public static void indexAllRssFeed(final Switchboard sb, final DigestURL url, final RSSFeed feed, Map<String, Pattern> collections) {
/**
* Iterate over the given feed and add all item links and enclosure URLs to a new switchboard indexing task.
* @param sb the main environment switchboard instance. Must not be null.
* @param feedUrl the feed url. Must not be null.
* @param feed the parsed feed. Must not be null.
* @param collections
*/
public static void indexAllRssFeed(final Switchboard sb, final DigestURL feedUrl, final RSSFeed feed, final Map<String, Pattern> collections) {
int loadCount = 0;
final Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
for (final RSSMessage message: feed) {
@@ -114,6 +121,21 @@ public static void indexAllRssFeed(final Switchboard sb, final DigestURL url, fi
ConcurrentLog.warn("Load_RSS", "Malformed feed item link URL : " + linkStr);
}
}

/* An enclosure (media) URL may also be defined for that item */
final String enclosureStr = message.getEnclosure();
if(StringUtils.isNotBlank(enclosureStr)) { // Enclosure element is optional
DigestURL enclosureUrl;
try {
enclosureUrl = new DigestURL(enclosureStr);
if (indexTriggered.containsKey(enclosureUrl.hash())) {
continue;
}
urlmap.put(ASCII.String(enclosureUrl.hash()), enclosureUrl);
} catch (MalformedURLException e1) {
ConcurrentLog.warn("Load_RSS", "Malformed feed item enclosure URL : " + enclosureStr);
}
}
}

final List<DigestURL> list = new ArrayList<DigestURL>();
@@ -135,21 +157,21 @@ public static void indexAllRssFeed(final Switchboard sb, final DigestURL url, fi
// update info for loading

try {
Tables.Data rssRow = sb.tables.select("rss", url.hash());
Tables.Data rssRow = sb.tables.select("rss", feedUrl.hash());
if (rssRow == null) rssRow = new Tables.Data();
final Date lastLoadDate = rssRow.get("last_load_date", new Date(0));
final long deltaTime = Math.min(System.currentTimeMillis() - lastLoadDate.getTime(), 1000 * 60 * 60 * 24);
final int allLoadCount = rssRow.get("all_load_count", 0);
final int lastAvg = rssRow.get("avg_upd_per_day", 0);
final long thisAvg = 1000 * 60 * 60 * 24 / deltaTime * loadCount;
final long nextAvg = lastAvg == 0 ? thisAvg : (thisAvg + lastAvg * 2) / 3;
rssRow.put("url", UTF8.getBytes(url.toNormalform(true)));
rssRow.put("url", UTF8.getBytes(feedUrl.toNormalform(true)));
rssRow.put("title", feed.getChannel().getTitle());
rssRow.put("last_load_date", new Date());
rssRow.put("last_load_count", loadCount);
rssRow.put("all_load_count", allLoadCount + loadCount);
rssRow.put("avg_upd_per_day", nextAvg);
sb.tables.update("rss", url.hash(), rssRow);
sb.tables.update("rss", feedUrl.hash(), rssRow);
} catch (final IOException e) {
ConcurrentLog.logException(e);
} catch (final SpaceExceededException e) {

0 comments on commit e45afed

Please sign in to comment.
You can’t perform that action at this time.