Skip to content


Deprecated duplicated and internally unused getpageinfo servlet.
Browse files Browse the repository at this point in the history
Redirects are in place to ease the transition for any possible external users:
 - /api/getpageinfo.xml to /api/getpageinfo_p.xml
 - /api/getpageinfo.json to /api/getpageinfo_p.json
  • Loading branch information
luccioman committed May 30, 2017
1 parent 306a82d commit bd88fd3
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 241 deletions.
241 changes: 29 additions & 212 deletions htroot/api/
@@ -1,4 +1,4 @@
// getpageinfo_p
// getpageinfo
// (C) 2011 by Michael Peter Christen;, Frankfurt a. M., Germany
// first published 11.11.2011 on
Expand All @@ -24,229 +24,46 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.Map.Entry;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

* @deprecated use {@link getpageinfo_p} instead
public class getpageinfo {

public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env;
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final serverObjects prop = new serverObjects();

prop.put("title", "");
prop.put("desc", "");
prop.put("lang", "");
prop.put("robots-allowed", "3"); //unknown
prop.put("robotsInfo", ""); //unknown
prop.put("sitelist", "");
prop.put("filter", ".*");
prop.put("oai", 0);

// default actions
String actions = "title,robots";

if (post != null && post.containsKey("url")) {
if (post.containsKey("actions"))
String url = post.get("url");
String agentName = post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
if (url.toLowerCase().startsWith("ftp://")) {
prop.put("robots-allowed", "1"); // ok to crawl
prop.put("robotsInfo", "ftp does not follow robots.txt");
prop.putXML("title", "FTP: " + url);
return prop;
} else if (!url.startsWith("http://") &&
!url.startsWith("https://") &&
!url.startsWith("ftp://") &&
!url.startsWith("smb://") &&
!url.startsWith("file://")) {
url = "http://" + url;
if (actions.indexOf("title",0) >= 0) {
DigestURL u = null;
try {
u = new DigestURL(url);
} catch (final MalformedURLException e) {
net.yacy.document.Document scraper = null;
if (u != null) try {
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
} catch (final IOException e) {
// bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
// that should not affect the robots.txt validity
if (scraper != null) {
// put the document title
prop.putXML("title", removelinebreaks(scraper.dc_title()));

Set<DigestURL> iconURLs = scraper.getIcons().keySet();
int i = 0;
for (DigestURL iconURL : iconURLs) {
prop.putXML("icons_" + i + "_icon", iconURL.toNormalform(false));
prop.put("icons_" + i + "_eol", 1);
prop.put("icons_" + (i - 1) + "_eol", 0);
prop.put("icons", iconURLs.size());

// put keywords
final Set<String> list = scraper.dc_subject();
int count = 0;
for (final String element: list) {
if (!element.equals("")) {
prop.putXML("tags_"+count+"_tag", element);
prop.put("tags", count);
// put description
prop.putXML("desc", removelinebreaks(scraper.dc_description().length > 0 ? scraper.dc_description()[0] : ""));
// put language
final Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());

// get links and put them into a semicolon-separated list
final Collection<AnchorURL> uris = scraper.getAnchors();
final StringBuilder links = new StringBuilder(uris.size() * 80);
final StringBuilder filter = new StringBuilder(uris.size() * 40);
count = 0;
for (final DigestURL uri: uris) {
if (uri == null) continue;
prop.putXML("links_" + count + "_link", uri.toNormalform(true));
prop.put("links", count);
prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
if (actions.indexOf("robots",0) >= 0) {
try {
final DigestURL theURL = new DigestURL(url);

// determine if crawling of the current URL is allowed
RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());

// get the sitemap URL of the domain
final List<String> sitemaps = robotsEntry == null ? new ArrayList<String>(0) : robotsEntry.getSitemaps();
for (int i = 0; i < sitemaps.size(); i++) {
prop.putXML("sitemaps_" + i + "_sitemap", sitemaps.get(i));
prop.put("sitemaps", sitemaps.size());
} catch (final MalformedURLException e) {
if (actions.indexOf("oai",0) >= 0) {
try {
final DigestURL theURL = new DigestURL(url + "?verb=Identify");
final String oairesult = checkOAI(theURL.toNormalform(false));

prop.put("oai", oairesult == "" ? 0 : 1);

if (oairesult != "") {
prop.putXML("title", oairesult);

} catch (final MalformedURLException e) {

/* Redirect to getpageinfo_p */
StringBuilder redirectedLocation;
if(header != null && header.getPathInfo() != null && header.getPathInfo().endsWith(".json")) {
redirectedLocation = new StringBuilder("getpageinfo_p.json");
} else {
redirectedLocation = new StringBuilder("getpageinfo_p.xml");
// return rewrite properties
return prop;

private static String removelinebreaks(String dc_title) {
String newtitle = dc_title.replace ("\r", "");
newtitle = newtitle.replace ("\n", "");
newtitle = newtitle.replace ("\r\n", "");
return newtitle;

private static String checkOAI(final String url) {
final DocumentBuilderFactory factory = DocumentBuilderFactory
try {
final DocumentBuilder builder = factory.newDocumentBuilder();
return parseXML(builder.parse(url));
} catch (final ParserConfigurationException ex) {
} catch (final SAXException ex) {
} catch (final IOException ex) {

return "";

private static String parseXML(final Document doc) {

String repositoryName = null;

final NodeList items = doc.getDocumentElement().getElementsByTagName(
if (items.getLength() == 0) {
return "";

for (int i = 0, n = items.getLength(); i < n; ++i) {

if (!"Identify".equals(items.item(i).getNodeName()))

final NodeList currentNodeChildren = items.item(i).getChildNodes();

for (int j = 0, m = currentNodeChildren.getLength(); j < m; ++j) {
final Node currentNode = currentNodeChildren.item(j);
if ("repositoryName".equals(currentNode.getNodeName())) {
repositoryName = currentNode.getFirstChild().getNodeValue();

/* Append eventual request parameters to the redirected location */
if (post != null) {
List<Entry<String, String>> parameters = post.entrySet();
if (parameters != null && !parameters.isEmpty()) {
for (Entry<String, String> entry : parameters) {
/* Remove trailing "&" */
redirectedLocation.setLength(redirectedLocation.length() - 1);

if (repositoryName == null) {
return "";

return repositoryName;

prop.put(serverObjects.ACTION_LOCATION, redirectedLocation.toString());
return prop;

29 changes: 0 additions & 29 deletions htroot/api/getpageinfo.xml

This file was deleted.

File renamed without changes.

0 comments on commit bd88fd3

Please sign in to comment.