Deprecated duplicated and internally unused getpageinfo servlet.
Redirections are set up to ease the transition for any possible external uses:
 - /api/getpageinfo.xml to /api/getpageinfo_p.xml
 - /api/getpageinfo.json to /api/getpageinfo_p.json
luccioman committed May 30, 2017
1 parent 306a82d commit bd88fd3
Showing 3 changed files with 29 additions and 241 deletions.
241 changes: 29 additions & 212 deletions htroot/api/getpageinfo.java
@@ -1,4 +1,4 @@
-// getpageinfo_p
+// getpageinfo
 // (C) 2011 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 // first published 11.11.2011 on http://yacy.net
 //
@@ -24,229 +24,46 @@
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.util.ArrayList;
-import java.util.Collection;
 import java.util.List;
-import java.util.Set;
+import java.util.Map.Entry;
 
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-
-import net.yacy.cora.document.id.AnchorURL;
-import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.federate.yacy.CacheStrategy;
-import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.robots.RobotsTxtEntry;
-import net.yacy.repository.Blacklist.BlacklistType;
-import net.yacy.search.Switchboard;
 import net.yacy.server.serverObjects;
 import net.yacy.server.serverSwitch;
 
-import org.w3c.dom.Document;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
-
 
+/**
+ * @deprecated use now {@link getpageinfo_p}
+ */
+@Deprecated
 public class getpageinfo {
 
-    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
-        final Switchboard sb = (Switchboard) env;
+    @SuppressWarnings("unused")
+    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
         final serverObjects prop = new serverObjects();
 
-        // avoid UNRESOLVED PATTERN
-        prop.put("title", "");
-        prop.put("desc", "");
-        prop.put("lang", "");
-        prop.put("robots-allowed", "3"); //unknown
-        prop.put("robotsInfo", ""); //unknown
-        prop.put("icons","0");
-        prop.put("sitelist", "");
-        prop.put("filter", ".*");
-        prop.put("oai", 0);
-
-        // default actions
-        String actions = "title,robots";
-
-        if (post != null && post.containsKey("url")) {
-            if (post.containsKey("actions"))
-                actions = post.get("actions");
-            String url = post.get("url");
-            String agentName = post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName);
-            ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
-            if (url.toLowerCase().startsWith("ftp://")) {
-                prop.put("robots-allowed", "1"); // ok to crawl
-                prop.put("robotsInfo", "ftp does not follow robots.txt");
-                prop.putXML("title", "FTP: " + url);
-                return prop;
-            } else if (!url.startsWith("http://") &&
-                    !url.startsWith("https://") &&
-                    !url.startsWith("ftp://") &&
-                    !url.startsWith("smb://") &&
-                    !url.startsWith("file://")) {
-                url = "http://" + url;
-            }
-            if (actions.indexOf("title",0) >= 0) {
-                DigestURL u = null;
-                try {
-                    u = new DigestURL(url);
-                } catch (final MalformedURLException e) {
-                    ConcurrentLog.logException(e);
-                }
-                net.yacy.document.Document scraper = null;
-                if (u != null) try {
-                    scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
-                } catch (final IOException e) {
-                    ConcurrentLog.logException(e);
-                    // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
-                    // that should not affect the robots.txt validity
-                }
-                if (scraper != null) {
-                    // put the document title
-                    prop.putXML("title", removelinebreaks(scraper.dc_title()));
-
-                    Set<DigestURL> iconURLs = scraper.getIcons().keySet();
-                    int i = 0;
-                    for (DigestURL iconURL : iconURLs) {
-                        prop.putXML("icons_" + i + "_icon", iconURL.toNormalform(false));
-                        prop.put("icons_" + i + "_eol", 1);
-                        i++;
-                    }
-                    prop.put("icons_" + (i - 1) + "_eol", 0);
-                    prop.put("icons", iconURLs.size());
-
-                    // put keywords
-                    final Set<String> list = scraper.dc_subject();
-                    int count = 0;
-                    for (final String element: list) {
-                        if (!element.equals("")) {
-                            prop.putXML("tags_" + count + "_tag", element);
-                            count++;
-                        }
-                    }
-                    prop.put("tags", count);
-                    // put description
-                    prop.putXML("desc", removelinebreaks(scraper.dc_description().length > 0 ? scraper.dc_description()[0] : ""));
-                    // put language
-                    final Set<String> languages = scraper.getContentLanguages();
-                    prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
-
-                    // get links and put them into a semicolon-separated list
-                    final Collection<AnchorURL> uris = scraper.getAnchors();
-                    final StringBuilder links = new StringBuilder(uris.size() * 80);
-                    final StringBuilder filter = new StringBuilder(uris.size() * 40);
-                    count = 0;
-                    for (final DigestURL uri: uris) {
-                        if (uri == null) continue;
-                        links.append(';').append(uri.toNormalform(true));
-                        filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
-                        prop.putXML("links_" + count + "_link", uri.toNormalform(true));
-                        count++;
-                    }
-                    prop.put("links", count);
-                    prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
-                    prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
-                }
-            }
-            if (actions.indexOf("robots",0) >= 0) {
-                try {
-                    final DigestURL theURL = new DigestURL(url);
-
-                    // determine if crawling of the current URL is allowed
-                    RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
-                    prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
-                    prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
-
-                    // get the sitemap URL of the domain
-                    final List<String> sitemaps = robotsEntry == null ? new ArrayList<String>(0) : robotsEntry.getSitemaps();
-                    for (int i = 0; i < sitemaps.size(); i++) {
-                        prop.putXML("sitemaps_" + i + "_sitemap", sitemaps.get(i));
-                    }
-                    prop.put("sitemaps", sitemaps.size());
-                } catch (final MalformedURLException e) {
-                    ConcurrentLog.logException(e);
-                }
-            }
-            if (actions.indexOf("oai",0) >= 0) {
-                try {
-                    final DigestURL theURL = new DigestURL(url + "?verb=Identify");
-                    final String oairesult = checkOAI(theURL.toNormalform(false));
-
-                    prop.put("oai", oairesult == "" ? 0 : 1);
-
-                    if (oairesult != "") {
-                        prop.putXML("title", oairesult);
-                    }
-
-                } catch (final MalformedURLException e) {
-                }
-            }
-        }
-
-        // return rewrite properties
-        return prop;
-    }
-
-    private static String removelinebreaks(String dc_title) {
-        String newtitle = dc_title.replace ("\r", "");
-        newtitle = newtitle.replace ("\n", "");
-        newtitle = newtitle.replace ("\r\n", "");
-        return newtitle;
-    }
-
-    private static String checkOAI(final String url) {
-        final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-        try {
-            final DocumentBuilder builder = factory.newDocumentBuilder();
-            return parseXML(builder.parse(url));
-        } catch (final ParserConfigurationException ex) {
-            ConcurrentLog.logException(ex);
-        } catch (final SAXException ex) {
-            ConcurrentLog.logException(ex);
-        } catch (final IOException ex) {
-            ConcurrentLog.logException(ex);
-        }
-
-        return "";
-    }
-
-    private static String parseXML(final Document doc) {
-
-        String repositoryName = null;
-
-        final NodeList items = doc.getDocumentElement().getElementsByTagName("Identify");
-        if (items.getLength() == 0) {
-            return "";
-        }
-
-        for (int i = 0, n = items.getLength(); i < n; ++i) {
-
-            if (!"Identify".equals(items.item(i).getNodeName()))
-                continue;
-
-            final NodeList currentNodeChildren = items.item(i).getChildNodes();
-
-            for (int j = 0, m = currentNodeChildren.getLength(); j < m; ++j) {
-                final Node currentNode = currentNodeChildren.item(j);
-                if ("repositoryName".equals(currentNode.getNodeName())) {
-                    repositoryName = currentNode.getFirstChild().getNodeValue();
-                }
-            }
-
-            if (repositoryName == null) {
-                return "";
-            }
-
-        }
-        return repositoryName;
-    }
+        /* Redirect to getpageinfo_p */
+        StringBuilder redirectedLocation;
+        if(header != null && header.getPathInfo() != null && header.getPathInfo().endsWith(".json")) {
+            redirectedLocation = new StringBuilder("getpageinfo_p.json");
+        } else {
+            redirectedLocation = new StringBuilder("getpageinfo_p.xml");
+        }
+
+        /* Append eventual request parameters to the redirected location */
+        if (post != null) {
+            List<Entry<String, String>> parameters = post.entrySet();
+            if (parameters != null && !parameters.isEmpty()) {
+                redirectedLocation.append("?");
+                for (Entry<String, String> entry : parameters) {
+                    redirectedLocation.append(entry.getKey()).append("=").append(entry.getValue()).append("&");
+                }
+                /* Remove trailing "&" */
+                redirectedLocation.setLength(redirectedLocation.length() - 1);
+            }
+        }
+
+        prop.put(serverObjects.ACTION_LOCATION, redirectedLocation.toString());
+        return prop;
+    }
 
 }
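The heart of the new servlet is the Location value it builds: the servlet name is swapped to getpageinfo_p while keeping the extension of the original request (.json or .xml), and the original request parameters are appended again so that nothing is lost across the redirect. Below is a standalone sketch of that string-building logic; it is not part of the commit, substitutes a plain Map for YaCy's serverObjects, and uses illustrative class and method names:

import java.util.LinkedHashMap;
import java.util.Map;

public class RedirectLocationSketch {

    /* Rebuilds the redirect target the same way the deprecated servlet does. */
    static String redirectLocation(final String pathInfo, final Map<String, String> params) {
        final StringBuilder location = new StringBuilder(
                pathInfo != null && pathInfo.endsWith(".json") ? "getpageinfo_p.json" : "getpageinfo_p.xml");
        if (params != null && !params.isEmpty()) {
            location.append('?');
            for (final Map.Entry<String, String> entry : params.entrySet()) {
                location.append(entry.getKey()).append('=').append(entry.getValue()).append('&');
            }
            location.setLength(location.length() - 1); // drop the trailing '&'
        }
        return location.toString();
    }

    public static void main(final String[] args) {
        final Map<String, String> params = new LinkedHashMap<>();
        params.put("url", "http://example.com");
        params.put("actions", "title,robots");
        System.out.println(redirectLocation("/api/getpageinfo.json", params));
        // prints: getpageinfo_p.json?url=http://example.com&actions=title,robots
    }
}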
29 changes: 0 additions & 29 deletions htroot/api/getpageinfo.xml

This file was deleted.

File renamed without changes.
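For external callers the transition should be transparent as long as their HTTP client follows redirects. A minimal sketch of how the redirect can be observed from outside, assuming a YaCy instance listening on localhost:8090; host, port, and query values here are illustrative, and the exact 3xx status code depends on how the YaCy server framework translates ACTION_LOCATION:

import java.net.HttpURLConnection;
import java.net.URL;

public class GetpageinfoRedirectCheck {

    public static void main(final String[] args) throws Exception {
        /* Call the deprecated endpoint without following redirects,
           so the Location header set by the servlet can be inspected. */
        final URL deprecated = new URL(
                "http://localhost:8090/api/getpageinfo.xml?url=http://example.com&actions=title");
        final HttpURLConnection conn = (HttpURLConnection) deprecated.openConnection();
        conn.setInstanceFollowRedirects(false);
        System.out.println("Status:   " + conn.getResponseCode());
        System.out.println("Location: " + conn.getHeaderField("Location"));
        conn.disconnect();
    }
}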
