Commit af465cd

fix for wrong robots.txt loading for https protocol

Orbiter committed Jan 16, 2013
1 parent edbc86d commit af465cd
Showing 1 changed file with 100 additions and 166 deletions.

source/net/yacy/crawler/robots/RobotsTxt.java (266 changes: 100 additions & 166 deletions)
@@ -132,9 +132,7 @@ record = null;
 
         // we can now synchronize for each host separately
         synchronized (syncObj) {
-
             // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
-
             // check the robots table again for all threads that come here because they waited for another one
             // to complete a download
             try {
@@ -156,7 +154,7 @@ record = null;
                 // generating the proper url to download the robots txt
                 DigestURI robotsURL = null;
                 try {
-                    robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt");
+                    robotsURL = new DigestURI((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
                 } catch (final MalformedURLException e) {
                     log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
                     robotsURL = null;
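
The hunk above is the core of the fix: the robots.txt URL was previously built with a hard-coded "http://" prefix, so for https hosts the file was requested over the wrong protocol. The new code infers the scheme from the port suffix of the stored host:port key. A minimal sketch of that heuristic, assuming (as getHostPort() at the end of this diff guarantees) that the key always carries an explicit port; the class and method names are illustrative, not YaCy API:

    // Sketch only: derive the robots.txt URL scheme from the host:port key.
    final class RobotsUrlSketch {
        // ":443" is the sole https marker, so an https server on a non-default
        // port would still be probed over plain http with this heuristic.
        static String robotsUrlFor(final String urlHostPort) {
            final String scheme = urlHostPort.endsWith(":443") ? "https://" : "http://";
            return scheme + urlHostPort + "/robots.txt";
        }
    }
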
@@ -175,58 +173,9 @@ record = null;
                 }
 
                 if (response == null) {
-                    // no robots.txt available, make an entry to prevent that the robots loading is done twice
-                    if (robotsTxt4Host == null) {
-                        // generate artificial entry
-                        robotsTxt4Host = new RobotsTxtEntry(
-                                robotsURL,
-                                new ArrayList<String>(),
-                                new ArrayList<String>(),
-                                new Date(),
-                                new Date(),
-                                null,
-                                null,
-                                Integer.valueOf(0),
-                                null);
-                    } else {
-                        robotsTxt4Host.setLoadedDate(new Date());
-                    }
-
-                    // store the data into the robots DB
-                    final int sz = robotsTable.size();
-                    addEntry(robotsTxt4Host);
-                    if (robotsTable.size() <= sz) {
-                        log.fatal("new entry in robots.txt table failed, resetting database");
-                        try {clear();} catch (IOException e) {}
-                        addEntry(robotsTxt4Host);
-                    }
+                    processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
                 } else {
-                    final byte[] robotsTxt = response.getContent();
-                    //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
-                    RobotsTxtParser parserResult;
-                    ArrayList<String> denyPath;
-                    if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
-                        parserResult = new RobotsTxtParser(thisAgents);
-                        // create virtual deny path
-                        denyPath = new ArrayList<String>();
-                        denyPath.add("/");
-                    } else {
-                        parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
-                        denyPath = parserResult.denyList();
-                    }
-
-                    // store the data into the robots DB
-                    String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
-                    robotsTxt4Host = addEntry(
-                            robotsURL,
-                            parserResult.allowList(),
-                            denyPath,
-                            new Date(),
-                            response.getResponseHeader().lastModified(),
-                            etag,
-                            parserResult.sitemap(),
-                            parserResult.crawlDelayMillis(),
-                            parserResult.agentName());
+                    processNewEntry(robotsURL, response, thisAgents);
                 }
             }
         }
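
Besides the protocol fix, this hunk deduplicates the two response branches: the inline entry-building code, which previously also lived in a second copy inside ensureExist(), is replaced by calls to the new processOldEntry() and processNewEntry() helpers defined further down. One behavioral rule those helpers preserve is that a robots.txt answered with 401 or 403 counts as a blanket disallow. A condensed sketch of that decision, with class and method names that are illustrative rather than YaCy API:

    import java.util.ArrayList;

    // Condensed from processNewEntry() below: an access-restricted
    // robots.txt (401 Unauthorized / 403 Forbidden) blocks the whole host.
    final class DenyListSketch {
        static ArrayList<String> denyListFor(final int statusCode, final ArrayList<String> parsedDenyList) {
            if (statusCode == 401 || statusCode == 403) {
                final ArrayList<String> denyAll = new ArrayList<String>();
                denyAll.add("/"); // virtual deny path covering every path on the host
                return denyAll;
            }
            return parsedDenyList;
        }
    }
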
@@ -246,118 +195,107 @@ public void ensureExist(final MultiProtocolURI theURL, final Set<String> thisAgents
             return;
         }
         if (robotsTable == null || robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
-        Thread t = new Thread() {
-            public void run(){
-                // make or get a synchronization object
-                DomSync syncObj = RobotsTxt.this.syncObjects.get(urlHostPort);
-                if (syncObj == null) {
-                    syncObj = new DomSync();
-                    RobotsTxt.this.syncObjects.put(urlHostPort, syncObj);
-                }
-                // we can now synchronize for each host separately
-                synchronized (syncObj) {
-                    if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
-
-                    // generating the proper url to download the robots txt
-                    DigestURI robotsURL = null;
-                    try {
-                        robotsURL = new DigestURI("http://" + urlHostPort + "/robots.txt");
-                    } catch (final MalformedURLException e) {
-                        log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
-                        robotsURL = null;
-                    }
-
-                    Response response = null;
-                    if (robotsURL != null) {
-                        if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
-                        Request request = new Request(robotsURL, null);
-                        try {
-                            response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
-                        } catch (IOException e) {
-                            response = null;
-                        }
-                    }
-
-                    RobotsTxtEntry robotsTxt4Host = null;
-                    if (response == null) {
-                        // no robots.txt available, make an entry to prevent that the robots loading is done twice
-                        // generate artificial entry
-                        robotsTxt4Host = new RobotsTxtEntry(
-                                robotsURL,
-                                new ArrayList<String>(),
-                                new ArrayList<String>(),
-                                new Date(),
-                                new Date(),
-                                null,
-                                null,
-                                Integer.valueOf(0),
-                                null);
-
-                        // store the data into the robots DB
-                        final int sz = robotsTable.size();
-                        addEntry(robotsTxt4Host);
-                        if (robotsTable.size() <= sz) {
-                            log.fatal("new entry in robots.txt table failed, resetting database");
-                            try {clear();} catch (IOException e) {}
-                            addEntry(robotsTxt4Host);
-                        }
-                    } else {
-                        final byte[] robotsTxt = response.getContent();
-                        //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
-                        RobotsTxtParser parserResult;
-                        ArrayList<String> denyPath;
-                        if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
-                            parserResult = new RobotsTxtParser(thisAgents);
-                            // create virtual deny path
-                            denyPath = new ArrayList<String>();
-                            denyPath.add("/");
-                        } else {
-                            parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
-                            denyPath = parserResult.denyList();
-                        }
-
-                        // store the data into the robots DB
-                        String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
-                        robotsTxt4Host = addEntry(
-                                robotsURL,
-                                parserResult.allowList(),
-                                denyPath,
-                                new Date(),
-                                response.getResponseHeader().lastModified(),
-                                etag,
-                                parserResult.sitemap(),
-                                parserResult.crawlDelayMillis(),
-                                parserResult.agentName());
-                    }
-                }
-            }
-        };
-        if (concurrent) t.start(); else t.run();
-    }
-
-    private RobotsTxtEntry addEntry(
-            final MultiProtocolURI theURL,
-            final ArrayList<String> allowPathList,
-            final ArrayList<String> denyPathList,
-            final Date loadedDate,
-            final Date modDate,
-            final String eTag,
-            final String sitemap,
-            final long crawlDelayMillis,
-            final String agentName
-            ) {
-        final RobotsTxtEntry entry = new RobotsTxtEntry(
-                theURL, allowPathList, denyPathList,
-                loadedDate, modDate,
-                eTag, sitemap, crawlDelayMillis, agentName);
-        addEntry(entry);
-        return entry;
-    }
+
+        if (concurrent)
+            new Thread() {public void run(){ensureExist(urlHostPort, robotsTable, thisAgents);}}.start();
+        else
+            ensureExist(urlHostPort, robotsTable, thisAgents);
+    }
+
+    private void ensureExist(final String urlHostPort, BEncodedHeap robotsTable, final Set<String> thisAgents) {
+
+        // make or get a synchronization object
+        DomSync syncObj = RobotsTxt.this.syncObjects.get(urlHostPort);
+        if (syncObj == null) {
+            syncObj = new DomSync();
+            RobotsTxt.this.syncObjects.put(urlHostPort, syncObj);
+        }
+        // we can now synchronize for each host separately
+        synchronized (syncObj) {
+            if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
+            // generating the proper url to download the robots txt
+            DigestURI robotsURL = null;
+            try {
+                robotsURL = new DigestURI((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
+            } catch (final MalformedURLException e) {
+                log.fatal("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
+                robotsURL = null;
+            }
+
+            Response response = null;
+            if (robotsURL != null) {
+                if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
+                Request request = new Request(robotsURL, null);
+                try {
+                    response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
+                } catch (IOException e) {
+                    response = null;
+                }
+            }
+
+            if (response == null) {
+                processOldEntry(null, robotsURL, robotsTable);
+            } else {
+                processNewEntry(robotsURL, response, thisAgents);
+            }
+        }
+    }
+
+    private void processOldEntry(RobotsTxtEntry robotsTxt4Host, DigestURI robotsURL, BEncodedHeap robotsTable) {
+        // no robots.txt available, make an entry to prevent that the robots loading is done twice
+        if (robotsTxt4Host == null) {
+            // generate artificial entry
+            robotsTxt4Host = new RobotsTxtEntry(
+                    robotsURL,
+                    new ArrayList<String>(),
+                    new ArrayList<String>(),
+                    new Date(),
+                    new Date(),
+                    null,
+                    null,
+                    Integer.valueOf(0),
+                    null);
+        } else {
+            robotsTxt4Host.setLoadedDate(new Date());
+        }
+
+        // store the data into the robots DB
+        final int sz = robotsTable.size();
+        addEntry(robotsTxt4Host);
+        if (robotsTable.size() <= sz) {
+            log.fatal("new entry in robots.txt table failed, resetting database");
+            try {clear();} catch (IOException e) {}
+            addEntry(robotsTxt4Host);
+        }
+    }
+
+    private void processNewEntry(DigestURI robotsURL, Response response, final Set<String> thisAgents) {
+        final byte[] robotsTxt = response.getContent();
+        //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
+        RobotsTxtParser parserResult;
+        ArrayList<String> denyPath;
+        if (response.getResponseHeader().getStatusCode() == 401 || response.getResponseHeader().getStatusCode() == 403) {
+            parserResult = new RobotsTxtParser(thisAgents);
+            // create virtual deny path
+            denyPath = new ArrayList<String>();
+            denyPath.add("/");
+        } else {
+            parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
+            denyPath = parserResult.denyList();
+        }
+
+        // store the data into the robots DB
+        String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
+        final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(
+                robotsURL,
+                parserResult.allowList(),
+                denyPath,
+                new Date(),
+                response.getResponseHeader().lastModified(),
+                etag,
+                parserResult.sitemap(),
+                parserResult.crawlDelayMillis(),
+                parserResult.agentName());
+        addEntry(robotsTxt4Host);
+    }
 
     private String addEntry(final RobotsTxtEntry entry) {
         // writes a new page and returns key
         try {
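
The refactored ensureExist() keeps YaCy's per-host download lock: one DomSync object per host:port key, so threads never fetch the same robots.txt concurrently while unrelated hosts proceed in parallel, and a thread that waited on the lock re-checks the robots table before loading anything. A generic sketch of that idiom, under the assumption that a plain ConcurrentHashMap stands in for YaCy's syncObjects map:

    import java.util.concurrent.ConcurrentHashMap;

    // Generic per-key lock sketch; DomSync/syncObjects play this role in YaCy.
    final class PerHostLock {
        private final ConcurrentHashMap<String, Object> locks = new ConcurrentHashMap<String, Object>();

        void loadOncePerHost(final String hostPort) {
            locks.putIfAbsent(hostPort, new Object());
            synchronized (locks.get(hostPort)) {
                // Re-check the robots table here: a thread that waited on the
                // lock must not repeat a download another thread just finished.
            }
        }
    }
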
@@ -371,25 +309,21 @@ private String addEntry(final RobotsTxtEntry entry) {
     }
 
     static final String getHostPort(final MultiProtocolURI theURL) {
+        final int port = getPort(theURL);
+        String host = theURL.getHost();
+        if (host == null) return null;
+        StringBuilder sb = new StringBuilder(host.length() + 6);
+        sb.append(host).append(':').append(Integer.toString(port));
+        return sb.toString();
+    }
+
+    private static final int getPort(final MultiProtocolURI theURL) {
         int port = theURL.getPort();
         if (port == -1) {
             if (theURL.getProtocol().equalsIgnoreCase("http")) {
                 port = 80;
             } else if (theURL.getProtocol().equalsIgnoreCase("https")) {
                 port = 443;
             } else {
                 port = 80;
             }
-
         }
-        String host = theURL.getHost();
-        if (host == null) return null;
-        StringBuilder sb = new StringBuilder(host.length() + 6);
-        sb.append(host).append(':').append(Integer.toString(port));
-        return sb.toString();
+        return port;
     }
 
 }
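
The getHostPort()/getPort() split in the last hunk ensures every key in the robots table carries an explicit port, defaulting to 80 for http (and unknown protocols) and 443 for https; this is the invariant the ":443" test in the robots URL construction relies on. A small self-contained illustration of the resulting keys (a hypothetical demo class, not part of the commit):

    // Hypothetical demo of the key normalization and scheme round trip.
    public final class RobotsKeyDemo {
        static int defaultPort(final String protocol) {
            // mirrors getPort(): 443 for https, 80 for http and anything else
            return "https".equalsIgnoreCase(protocol) ? 443 : 80;
        }
        public static void main(final String[] args) {
            final String httpKey = "example.org:" + defaultPort("http");   // "example.org:80"
            final String httpsKey = "example.org:" + defaultPort("https"); // "example.org:443"
            System.out.println((httpKey.endsWith(":443") ? "https://" : "http://") + httpKey + "/robots.txt");
            System.out.println((httpsKey.endsWith(":443") ? "https://" : "http://") + httpsKey + "/robots.txt");
        }
    }
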
