diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java
index f6e2941b8c..9763adb7ae 100644
--- a/htroot/api/getpageinfo.java
+++ b/htroot/api/getpageinfo.java
@@ -1,4 +1,4 @@
-// getpageinfo_p
+// getpageinfo
 // (C) 2011 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 // first published 11.11.2011 on http://yacy.net
 //
@@ -24,229 +24,46 @@
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.util.ArrayList;
-import java.util.Collection;
 import java.util.List;
-import java.util.Set;
+import java.util.Map.Entry;
 
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-
-import net.yacy.cora.document.id.AnchorURL;
-import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.federate.yacy.CacheStrategy;
-import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.robots.RobotsTxtEntry;
-import net.yacy.repository.Blacklist.BlacklistType;
-import net.yacy.search.Switchboard;
 import net.yacy.server.serverObjects;
 import net.yacy.server.serverSwitch;
 
-import org.w3c.dom.Document;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
-
-
+/**
+ * @deprecated use now {@link getpageinfo_p}
+ */
+@Deprecated
 public class getpageinfo {
 
-    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
-        final Switchboard sb = (Switchboard) env;
+    @SuppressWarnings("unused")
+    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
         final serverObjects prop = new serverObjects();
-
-        // avoid UNRESOLVED PATTERN
-        prop.put("title", "");
-        prop.put("desc", "");
-        prop.put("lang", "");
-        prop.put("robots-allowed", "3"); //unknown
-        prop.put("robotsInfo", ""); //unknown
-        prop.put("icons","0");
-        prop.put("sitelist", "");
-        prop.put("filter", ".*");
-        prop.put("oai", 0);
-
-        // default actions
-        String actions = "title,robots";
-
-        if (post != null && post.containsKey("url")) {
-            if (post.containsKey("actions"))
-                actions=post.get("actions");
-            String url = post.get("url");
-            String agentName = post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName);
-            ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
-            if (url.toLowerCase().startsWith("ftp://")) {
-                prop.put("robots-allowed", "1"); // ok to crawl
-                prop.put("robotsInfo", "ftp does not follow robots.txt");
-                prop.putXML("title", "FTP: " + url);
-                return prop;
-            } else if (!url.startsWith("http://") &&
-                       !url.startsWith("https://") &&
-                       !url.startsWith("ftp://") &&
-                       !url.startsWith("smb://") &&
-                       !url.startsWith("file://")) {
-                url = "http://" + url;
-            }
-            if (actions.indexOf("title",0) >= 0) {
-                DigestURL u = null;
-                try {
-                    u = new DigestURL(url);
-                } catch (final MalformedURLException e) {
-                    ConcurrentLog.logException(e);
-                }
-                net.yacy.document.Document scraper = null;
-                if (u != null) try {
-                    scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
-                } catch (final IOException e) {
-                    ConcurrentLog.logException(e);
-                    // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
-                    // that should not affect the robots.txt validity
-                }
-                if (scraper != null) {
-                    // put the document title
-                    prop.putXML("title", removelinebreaks(scraper.dc_title()));
-
-                    Set<DigestURL> iconURLs = scraper.getIcons().keySet();
-                    int i = 0;
-                    for (DigestURL iconURL : iconURLs) {
-                        prop.putXML("icons_" + i + "_icon", iconURL.toNormalform(false));
-                        prop.put("icons_" + i + "_eol", 1);
-                        i++;
-                    }
-                    prop.put("icons_" + (i - 1) + "_eol", 0);
-                    prop.put("icons", iconURLs.size());
-
-                    // put keywords
-                    final Set<String> list = scraper.dc_subject();
-                    int count = 0;
-                    for (final String element: list) {
-                        if (!element.equals("")) {
-                            prop.putXML("tags_"+count+"_tag", element);
-                            count++;
-                        }
-                    }
-                    prop.put("tags", count);
-                    // put description
-                    prop.putXML("desc", removelinebreaks(scraper.dc_description().length > 0 ? scraper.dc_description()[0] : ""));
-                    // put language
-                    final Set<String> languages = scraper.getContentLanguages();
-                    prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
-
-                    // get links and put them into a semicolon-separated list
-                    final Collection<AnchorURL> uris = scraper.getAnchors();
-                    final StringBuilder links = new StringBuilder(uris.size() * 80);
-                    final StringBuilder filter = new StringBuilder(uris.size() * 40);
-                    count = 0;
-                    for (final DigestURL uri: uris) {
-                        if (uri == null) continue;
-                        links.append(';').append(uri.toNormalform(true));
-                        filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
-                        prop.putXML("links_" + count + "_link", uri.toNormalform(true));
-                        count++;
-                    }
-                    prop.put("links", count);
-                    prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
-                    prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
-                }
-            }
-            if (actions.indexOf("robots",0) >= 0) {
-                try {
-                    final DigestURL theURL = new DigestURL(url);
-
-                    // determine if crawling of the current URL is allowed
-                    RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
-                    prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
-                    prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
-
-                    // get the sitemap URL of the domain
-                    final List<String> sitemaps = robotsEntry == null ? new ArrayList<String>(0) : robotsEntry.getSitemaps();
-                    for (int i = 0; i < sitemaps.size(); i++) {
-                        prop.putXML("sitemaps_" + i + "_sitemap", sitemaps.get(i));
-                    }
-                    prop.put("sitemaps", sitemaps.size());
-                } catch (final MalformedURLException e) {
-                    ConcurrentLog.logException(e);
-                }
-            }
-            if (actions.indexOf("oai",0) >= 0) {
-                try {
-                    final DigestURL theURL = new DigestURL(url + "?verb=Identify");
-                    final String oairesult = checkOAI(theURL.toNormalform(false));
-
-                    prop.put("oai", oairesult == "" ? 0 : 1);
-
-                    if (oairesult != "") {
-                        prop.putXML("title", oairesult);
-                    }
-
-                } catch (final MalformedURLException e) {
-                }
-            }
-
+
+        /* Redirect to getpageinfo_p */
+        StringBuilder redirectedLocation;
+        if(header != null && header.getPathInfo() != null && header.getPathInfo().endsWith(".json")) {
+            redirectedLocation = new StringBuilder("getpageinfo_p.json");
+        } else {
+            redirectedLocation = new StringBuilder("getpageinfo_p.xml");
         }
-        // return rewrite properties
-        return prop;
-    }
-
-    private static String removelinebreaks(String dc_title) {
-        String newtitle = dc_title.replace ("\r", "");
-        newtitle = newtitle.replace ("\n", "");
-        newtitle = newtitle.replace ("\r\n", "");
-        return newtitle;
-    }
-
-    private static String checkOAI(final String url) {
-        final DocumentBuilderFactory factory = DocumentBuilderFactory
-                .newInstance();
-        try {
-            final DocumentBuilder builder = factory.newDocumentBuilder();
-            return parseXML(builder.parse(url));
-        } catch (final ParserConfigurationException ex) {
-            ConcurrentLog.logException(ex);
-        } catch (final SAXException ex) {
-            ConcurrentLog.logException(ex);
-        } catch (final IOException ex) {
-            ConcurrentLog.logException(ex);
-        }
-
-        return "";
-    }
-
-    private static String parseXML(final Document doc) {
-
-        String repositoryName = null;
-
-        final NodeList items = doc.getDocumentElement().getElementsByTagName(
-                "Identify");
-        if (items.getLength() == 0) {
-            return "";
-        }
-
-        for (int i = 0, n = items.getLength(); i < n; ++i) {
-
-            if (!"Identify".equals(items.item(i).getNodeName()))
-                continue;
-
-            final NodeList currentNodeChildren = items.item(i).getChildNodes();
-
-            for (int j = 0, m = currentNodeChildren.getLength(); j < m; ++j) {
-                final Node currentNode = currentNodeChildren.item(j);
-                if ("repositoryName".equals(currentNode.getNodeName())) {
-                    repositoryName = currentNode.getFirstChild().getNodeValue();
+
+        /* Append eventual request parameters to the redirected location */
+        if (post != null) {
+            List<Entry<String, String>> parameters = post.entrySet();
+            if (parameters != null && !parameters.isEmpty()) {
+                redirectedLocation.append("?");
+                for (Entry<String, String> entry : parameters) {
+                    redirectedLocation.append(entry.getKey()).append("=").append(entry.getValue()).append("&");
                 }
+                /* Remove trailing "&" */
+                redirectedLocation.setLength(redirectedLocation.length() - 1);
             }
-
-            if (repositoryName == null) {
-                return "";
-            }
-
-        }
-        return repositoryName;
-    }
-
+        }
+
+        prop.put(serverObjects.ACTION_LOCATION, redirectedLocation.toString());
+        return prop;
+    }
 }
diff --git a/htroot/api/getpageinfo.xml b/htroot/api/getpageinfo.xml
deleted file mode 100644
index 9ccd8e3d9c..0000000000
--- a/htroot/api/getpageinfo.xml
+++ /dev/null
@@ -1,29 +0,0 @@
-<?xml version="1.0"?>
-<pageinfo>
-  <title>#[title]#</title>
-  <desc>#[desc]#</desc>
-  <lang>#[lang]#</lang>
-  <robots-allowed>#(robots-allowed)#0::1::#(/robots-allowed)#</robots-allowed>
-  <robotsInfo>#[robotsInfo]#</robotsInfo>
-  <sitemaps>#{sitemaps}#
-    <sitemap>#[sitemap]#</sitemap>
-  #{/sitemaps}#
-  </sitemaps>
-  <icons>#{icons}#
-    <icon>#[icon]#</icon>
-  #{/icons}#
-  </icons>
-  <sitelist>#[sitelist]#</sitelist>
-  <filter>#[filter]#</filter>
-  <tags>
-  #{tags}#
-    <tag name="#[tag]#"/>
-  #{/tags}#
-  </tags>
-  <links>
-  #{links}#
-    <link href="#[link]#"/>
-  #{/links}#
-  </links>
-  <oai>#[oai]#</oai>
-</pageinfo>
diff --git a/htroot/api/getpageinfo.json b/htroot/api/getpageinfo_p.json
similarity index 100%
rename from htroot/api/getpageinfo.json
rename to htroot/api/getpageinfo_p.json
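
For reference, a minimal client-side sketch of what this change means for callers of the deprecated endpoint. It assumes a YaCy peer at the default localhost:8090 address and that the LOCATION property set by the servlet above is translated by the YaCy HTTP server into a standard HTTP redirect; the target URL, port and query values are illustrative only, and existing clients can equally just call /api/getpageinfo_p.xml (or .json) directly.

    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.net.URLEncoder;
    import java.nio.charset.StandardCharsets;

    public class GetpageinfoRedirectCheck {
        public static void main(String[] args) throws Exception {
            // Hypothetical local peer and sample parameters; adjust to your own setup.
            String query = "url=" + URLEncoder.encode("http://yacy.net", StandardCharsets.UTF_8.name())
                    + "&actions=title,robots";
            URL deprecated = new URL("http://localhost:8090/api/getpageinfo.xml?" + query);

            HttpURLConnection connection = (HttpURLConnection) deprecated.openConnection();
            // Do not follow the redirect automatically so it can be inspected.
            connection.setInstanceFollowRedirects(false);
            int status = connection.getResponseCode();
            String location = connection.getHeaderField("Location");
            connection.disconnect();

            // Expected (assumption): a 3xx status whose Location points at
            // getpageinfo_p.xml with the original query parameters preserved.
            System.out.println("status=" + status + ", location=" + location);
        }
    }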