Ensure proper synchronous robots entry retrieval on first check.

Previously, when checking the robots.txt policy for the first time on an
unknown host (not yet cached in the robots table), the result was always
empty in the /getpageinfo_p.xml API and on the /CrawlCheck_p.html page.
Subsequent calls, however, returned the correct information.
luccioman committed Aug 16, 2017
1 parent 9da75ac commit 3f0446f14b89e6a502363dc516f82f7dbab330df
Showing with 11 additions and 5 deletions.
  1. +0 −1 htroot/api/getpageinfo_p.java
  2. +11 −4 source/net/yacy/crawler/robots/RobotsTxt.java
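
The shape of the bug is an asynchronous fetch racing an immediate table
lookup. The following minimal, self-contained Java sketch reproduces that
failure mode and the synchronous fix; it is not YaCy code, and every name
in it is invented for illustration.

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;

    /** Sketch of the failure mode: an asynchronous "ensure it exists" call
     *  races an immediate cache lookup, so the first lookup comes back empty. */
    public class AsyncCacheRace {
        private final Map<String, String> cache = new ConcurrentHashMap<>();
        private final ExecutorService pool = Executors.newSingleThreadExecutor();

        /** Buggy pattern: schedule the fetch on another thread, return immediately. */
        void ensureExist(String host) {
            pool.submit(() -> cache.put(host, "robots.txt for " + host));
        }

        /** Lookup only; returns null until the scheduled fetch has completed. */
        String getEntry(String host) {
            return cache.get(host);
        }

        /** Fixed pattern: fetch synchronously and return the new entry. */
        String getEntrySynchronous(String host) {
            return cache.computeIfAbsent(host, h -> "robots.txt for " + h);
        }

        public static void main(String[] args) {
            AsyncCacheRace c = new AsyncCacheRace();
            c.ensureExist("example.org");
            System.out.println(c.getEntry("example.org"));            // usually null
            System.out.println(c.getEntrySynchronous("example.net")); // always set
            c.pool.shutdown();
        }
    }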
htroot/api/getpageinfo_p.java
@@ -221,7 +221,6 @@ public static serverObjects respond(final RequestHeader header, final serverObje
 
     // determine if crawling of the current URL is allowed
     ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-    sb.robots.ensureExist(theURL, agent, true);
     RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
     prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
     prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
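
Condensing the hunk above, the caller's first-check path changes as in the
two snippets below (theURL, agent and sb are the variables already in scope
in the handler):

    // Before (buggy): schedule a robots.txt fetch, then look up immediately --
    // on the first check of an unknown host the robots table is still empty.
    sb.robots.ensureExist(theURL, agent, true);
    RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent); // null at first

    // After (fixed): getEntry alone now fetches and parses robots.txt
    // synchronously for an unknown host, so the first check already
    // yields a usable entry.
    RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);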
source/net/yacy/crawler/robots/RobotsTxt.java
@@ -198,7 +198,7 @@ public RobotsTxtEntry getEntry(final String urlHostPort, final ClientIdentificat
                 if (response == null) {
                     processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
                 } else {
-                    processNewEntry(robotsURL, response, agent.robotIDs);
+                    robotsTxt4Host = processNewEntry(response, agent.robotIDs);
                 }
             }
         }
@@ -266,7 +266,7 @@ public void run(){
                 if (response == null) {
                     processOldEntry(null, robotsURL, robotsTable);
                 } else {
-                    processNewEntry(robotsURL, response, agent.robotIDs);
+                    processNewEntry(response, agent.robotIDs);
                 }
             }
         }
@@ -314,7 +314,13 @@ private void processOldEntry(RobotsTxtEntry robotsTxt4Host, DigestURL robotsURL,
         }
     }
 
-    private void processNewEntry(DigestURL robotsURL, Response response, final String[] thisAgents) {
+    /**
+     * Process a response to a robots.txt request, create a new robots entry, add it to the robots table then return it.
+     * @param response the response to the requested robots.txt URL. Must not be null.
+     * @param thisAgents the agent identifier(s) used to request the robots.txt URL
+     * @return the new robots entry
+     */
+    private RobotsTxtEntry processNewEntry(final Response response, final String[] thisAgents) {
         final byte[] robotsTxt = response.getContent();
         //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
         RobotsTxtParser parserResult;
@@ -334,7 +340,7 @@ private void processNewEntry(DigestURL robotsURL, Response response, final Strin
         boolean isBrowserAgent = thisAgents.length == 1 && thisAgents[0].equals("Mozilla");
         if (isBrowserAgent) denyPath.clear();
         final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(
-                robotsURL,
+                response.getRequest().url(),
                 parserResult.allowList(),
                 denyPath,
                 new Date(),
@@ -344,6 +350,7 @@ private void processNewEntry(DigestURL robotsURL, Response response, final Strin
                 parserResult.crawlDelayMillis(),
                 parserResult.agentName());
         addEntry(robotsTxt4Host);
+        return robotsTxt4Host;
     }
 
     private String addEntry(final RobotsTxtEntry entry) {
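
Taken together with the updated call site in getEntry, the new return value
makes the first-time path synchronous end to end. A hypothetical condensation
of that flow, assuming invented helpers lookupTable, fetchRobotsTxt and
robotsURLOf; only the control-flow shape is taken from the diff above:

    public RobotsTxtEntry getEntry(final DigestURL theURL, final ClientIdentification.Agent agent) {
        RobotsTxtEntry robotsTxt4Host = lookupTable(theURL);      // invented helper
        if (robotsTxt4Host != null) return robotsTxt4Host;        // host already cached
        final Response response = fetchRobotsTxt(theURL, agent);  // invented helper
        if (response == null) {
            processOldEntry(robotsTxt4Host, robotsURLOf(theURL), robotsTable);
            return robotsTxt4Host; // may remain null when the fetch failed
        }
        // The key change: processNewEntry now returns the freshly parsed entry,
        // so the very first check of an unknown host gets a non-null result.
        return processNewEntry(response, agent.robotIDs);
    }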
