Skip to content
Permalink
Browse files

Added RSS reader support for `enclosure` feed item sub element.

The enclosure element (see
http://www.rssboard.org/rss-specification#ltenclosuregtSubelementOfLtitemgt
) can be seen, for example, in podcast feeds.
  • Loading branch information...
luccioman committed Mar 20, 2018
1 parent e5f5de0 commit cf62b571bd262f66984f40facdb5c0592a76ddb2
@@ -57,6 +57,11 @@

public void setSize(long size);

/**
* Sets the enclosure of this feed item.
*
* @param enclosure a URL describing a media object that is attached to a feed item
*/
public void setEnclosure(String enclosure);

public String getAuthor();

public String getCopyright();
@@ -82,6 +87,11 @@
public String[] getSubject();

public long getSize();

/**
* Gets the enclosure of this feed item.
*
* @return a URL describing a media object that is attached to a feed item
*/
public String getEnclosure();

public double getLon();

@@ -86,6 +86,9 @@
/** A string that uniquely identifies an item (RSS 2.0) */
guid(new String[]{"guid"}),

/** URL describing a media object that is attached to a feed item */
enclosure(new String[]{"enclosure"}),

/** Time To Live : number of minutes that indicates how long a channel (RSS 2.0) can be cached before refreshing from the source. */
ttl(new String[]{"ttl"}),

@@ -163,15 +166,19 @@ public RSSMessage(final String title, final String description, final MultiProto
if (description.length() > 0) this.map.put(Token.description.name(), description);
this.map.put(Token.link.name(), link.toNormalform(true));
this.map.put(Token.pubDate.name(), HeaderFramework.FORMAT_RFC1123.format(new Date()));
if (guid.length() > 0) this.map.put(Token.guid.name(), guid);
if (guid.length() > 0) {
this.map.put(Token.guid.name(), guid);
}
}

/**
* Creates a message whose values are held in a new, initially empty map.
*/
public RSSMessage() {
    this.map = new HashMap<>();
}

/**
* Stores a value for the given token in this message.
* Null or empty values are ignored, so nothing is written to the map for them.
*
* @param token the token whose name is used as the map key
* @param value the value to store; ignored when null or empty
*/
public void setValue(final Token token, final String value) {
    // A single guarded put: the check and the put were previously duplicated,
    // and a null value caused a NullPointerException instead of being ignored.
    if (value != null && value.length() > 0) {
        this.map.put(token.name(), value);
    }
}

@Override
@@ -277,7 +284,12 @@ public String getGuid() {
}
return guid;
}


/**
* @return the value looked up in this message's map for the enclosure token,
*         with an empty string passed as the fallback value
*/
@Override
public String getEnclosure() {
    final String enclosureUrl = Token.enclosure.valueFrom(this.map, "");
    return enclosureUrl;
}

/**
* @return the value looked up in this message's map for the ttl token,
*         with an empty string passed as the fallback value
*/
public String getTTL() {
    final String timeToLive = Token.ttl.valueFrom(this.map, "");
    return timeToLive;
}
@@ -371,7 +383,12 @@ public void setDocs(final String docs) {
public void setGuid(final String guid) {
    // Delegates to setValue, which stores the value under the guid token.
    this.setValue(Token.guid, guid);
}


/**
* Records the enclosure of this message by delegating to setValue with the enclosure token.
*
* @param enclosure the enclosure value to store
*/
@Override
public void setEnclosure(final String enclosure) {
    this.setValue(Token.enclosure, enclosure);
}

@Override
public void setLanguage(final String language) {
setValue(Token.language, language);
@@ -30,6 +30,7 @@
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.commons.lang.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
@@ -168,10 +169,33 @@ public void startElement(final String uri, final String name, final String tag,
}
this.item = new RSSMessage();
this.parsingItem = true;
} else if (this.parsingItem && this.type == Type.atom && "link".equals(tag) && (atts.getValue("rel") == null || atts.getValue("rel").equals("alternate"))) {
// atom link handling (rss link is handled in endElement)
final String url = atts.getValue("href");
if (url != null && url.length() > 0) this.item.setValue(Token.link, url);
} else if (this.parsingItem) {
if(this.type == Type.atom) {
if ("link".equals(tag)) {
final String linkRelation = atts.getValue("rel");
if (linkRelation == null || linkRelation.equals("alternate")) {
// atom link handling (rss link is handled in endElement)
final String url = atts.getValue("href");
if (StringUtils.isNotBlank(url)) {
this.item.setValue(Token.link, url);
}
} else if("enclosure".equals(linkRelation)) {
/* Atom rel="enclosure" link type */
final String url = atts.getValue("href");
if(StringUtils.isNotBlank(url)) {
this.item.setEnclosure(url);
}
}
}
} else if(this.type == Type.rss) {
/* RSS 0.92 and 2.0 <enclosure> element */
if ("enclosure".equals(tag)) {
final String url = atts.getValue("url");
if(StringUtils.isNotBlank(url)) {
this.item.setEnclosure(url);
}
}
}
} else if ("rss".equals(tag)) {
this.type = Type.rss;
}
@@ -189,7 +213,9 @@ public void endElement(final String uri, final String name, final String tag) th
} else if (this.parsingItem) {
final String value = this.buffer.toString().trim();
this.buffer.setLength(0);
if (RSSMessage.tags.contains(tag) && value.length() > 0) this.item.setValue(RSSMessage.valueOfNick(tag), value);
if (RSSMessage.tags.contains(tag) && value.length() > 0) {
this.item.setValue(RSSMessage.valueOfNick(tag), value);
}
} else if (this.parsingChannel) {
final String value = this.buffer.toString().trim();
this.buffer.setLength(0);
@@ -34,10 +34,14 @@
import java.util.List;
import java.util.Set;

import org.apache.commons.lang.StringUtils;

import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@@ -46,6 +50,8 @@
import net.yacy.document.parser.html.ImageEntry;

public class rssParser extends AbstractParser implements Parser {

private final static ConcurrentLog LOG = new ConcurrentLog(rssParser.class.getSimpleName());

public rssParser() {
super("RSS Parser");
@@ -67,7 +73,7 @@ public rssParser() {
final int timezoneOffset,
final InputStream source)
throws Failure, InterruptedException {
RSSReader rssReader;
final RSSReader rssReader;
try {
rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source);
} catch (final IOException e) {
@@ -89,34 +95,59 @@ public rssParser() {
DigestURL itemuri;
Set<String> languages;
Document doc;
for (final Hit item: feed) {
try {
itemuri = new DigestURL(item.getLink());
languages = new HashSet<String>();
languages.add(item.getLanguage());
for (final Hit item : feed) {
final String linkUrlString = item.getLink();
itemuri = null;
if(StringUtils.isNotBlank(linkUrlString)) {
/* Link element is optional in RSS 2.0 and Atom */
try {
itemuri = new DigestURL(item.getLink());
} catch(final MalformedURLException e) {
LOG.warn("Malformed feed item link url : " + linkUrlString);
}
}
languages = new HashSet<String>();
languages.add(item.getLanguage());

Set<AnchorURL> anchors = null;
final String enclosureUrlString = item.getEnclosure();
if(StringUtils.isNotBlank(enclosureUrlString)) {
try {
final AnchorURL enclosureUrl = new AnchorURL(enclosureUrlString);
if(itemuri == null) {
/* No <link> element in this item : the enclosure URL is used as the sub document main location URL */
itemuri = enclosureUrl;
} else {
anchors = new HashSet<>();
anchors.add(enclosureUrl);
}
} catch(final MalformedURLException e) {
LOG.warn("Malformed feed item enclosure url : " + enclosureUrlString);
}
}

if(itemuri != null) {
doc = new Document(
itemuri,
TextParser.mimeOf(itemuri),
charset,
this,
languages,
item.getSubject(),
singleList(item.getTitle()),
item.getAuthor(),
item.getCopyright(),
null,
item.getDescriptions(),
item.getLon(),
item.getLat(),
null,
null,
null,
new LinkedHashMap<DigestURL, ImageEntry>(),
false,
item.getPubDate());
itemuri,
TextParser.mimeOf(itemuri),
charset,
this,
languages,
item.getSubject(),
singleList(item.getTitle()),
item.getAuthor(),
item.getCopyright(),
null,
item.getDescriptions(),
item.getLon(),
item.getLat(),
null,
anchors,
null,
new LinkedHashMap<DigestURL, ImageEntry>(),
false,
item.getPubDate());
docs.add(doc);
} catch (final MalformedURLException e) {
continue;
}
}

0 comments on commit cf62b57

Please sign in to comment.
You can’t perform that action at this time.