- refactoring of robots parser (removed opaque Object[] result vector)
- added Allow-component to robots result object

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5016 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter committed Jul 24, 2008
1 parent 7913bdb commit 50ef5c4
Showing 4 changed files with 125 additions and 75 deletions.
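
What this commit changes at the parser's public surface: the old static robotsParser.parse(byte[]) returned an untyped Object[] in which index 0 held the deny list, index 1 the sitemap URL and index 2 the crawl delay; after this commit the caller constructs a robotsParser instance and reads named accessors, with an Allow list carried alongside the Deny list. A minimal usage sketch (not part of the commit; robotsBytes is a placeholder for the downloaded robots.txt content):

    // before: opaque result vector
    // Object[] parserResult = robotsParser.parse(robotsBytes);
    // ArrayList<String> denyPath = (ArrayList<String>) parserResult[0];

    // after: typed accessors on a parser instance
    robotsParser parser = new robotsParser(robotsBytes);
    ArrayList<String> allowPaths = parser.allowList(); // the new Allow component
    ArrayList<String> denyPaths  = parser.denyList();
    String sitemap = parser.sitemap();                 // sitemap URL, or null/"" when absent
    int crawlDelay = parser.crawlDelay();              // 0 if no Crawl-Delay: line was found
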
82 changes: 53 additions & 29 deletions source/de/anomic/crawler/RobotsTxt.java
@@ -180,6 +180,7 @@ private Entry getEntry(String urlHostPort, boolean fetchOnlineIfNotAvailableOrNo
robotsTxt4Host = new Entry(
urlHostPort,
new ArrayList<String>(),
new ArrayList<String>(),
new Date(),
new Date(),
null,
@@ -192,8 +193,8 @@ private Entry getEntry(String urlHostPort, boolean fetchOnlineIfNotAvailableOrNo
// store the data into the robots DB
addEntry(robotsTxt4Host);
} else {
Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
ArrayList<String> denyPath = (ArrayList<String>) parserResult[0];
robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
ArrayList<String> denyPath = parserResult.denyList();
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
denyPath = new ArrayList<String>();
denyPath.add("/");
@@ -202,12 +203,13 @@ private Entry getEntry(String urlHostPort, boolean fetchOnlineIfNotAvailableOrNo
// store the data into the robots DB
robotsTxt4Host = addEntry(
urlHostPort,
parserResult.allowList(),
denyPath,
new Date(),
(Date) result[DOWNLOAD_MODDATE],
(String) result[DOWNLOAD_ETAG],
(String) parserResult[1],
(Integer) parserResult[2]);
parserResult.sitemap(),
parserResult.crawlDelay());
}
}
}
@@ -223,15 +225,16 @@ public int crawlDelay(yacyURL theURL) {

private Entry addEntry(
String hostName,
ArrayList<String> disallowPathList,
Date loadedDate,
ArrayList<String> allowPathList,
ArrayList<String> denyPathList,
Date loadedDate,
Date modDate,
String eTag,
String sitemap,
Integer crawlDelay
int crawlDelay
) {
Entry entry = new Entry(
hostName, disallowPathList, loadedDate, modDate,
hostName, allowPathList, denyPathList, loadedDate, modDate,
eTag, sitemap, crawlDelay);
addEntry(entry);
return entry;
@@ -248,59 +251,85 @@ private String addEntry(Entry entry) {
}

public class Entry {
public static final String ALLOW_PATH_LIST = "allow";
public static final String DISALLOW_PATH_LIST = "disallow";
public static final String LOADED_DATE = "date";
public static final String MOD_DATE = "modDate";
public static final String ETAG = "etag";
public static final String SITEMAP = "sitemap";
public static final String CRAWL_DELAY = "crawlDelay";
public static final String LOADED_DATE = "date";
public static final String MOD_DATE = "modDate";
public static final String ETAG = "etag";
public static final String SITEMAP = "sitemap";
public static final String CRAWL_DELAY = "crawlDelay";

// this is a simple record structure that hold all properties of a single crawl start
// this is a simple record structure that holds all properties of a single crawl start
HashMap<String, String> mem;
private LinkedList<String> disallowPathList;
private LinkedList<String> allowPathList, denyPathList;
String hostName;

public Entry(String hostName, HashMap<String, String> mem) {
this.hostName = hostName.toLowerCase();
this.mem = mem;

if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
this.disallowPathList = new LinkedList<String>();
this.denyPathList = new LinkedList<String>();
String csPl = this.mem.get(DISALLOW_PATH_LIST);
if (csPl.length() > 0){
String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
this.disallowPathList.addAll(Arrays.asList(pathArray));
this.denyPathList.addAll(Arrays.asList(pathArray));
}
}
} else {
this.denyPathList = new LinkedList<String>();
}
if (this.mem.containsKey(ALLOW_PATH_LIST)) {
this.allowPathList = new LinkedList<String>();
String csPl = this.mem.get(ALLOW_PATH_LIST);
if (csPl.length() > 0){
String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
this.allowPathList.addAll(Arrays.asList(pathArray));
}
}
} else {
this.disallowPathList = new LinkedList<String>();
this.allowPathList = new LinkedList<String>();
}
}

public Entry(
String hostName,
ArrayList<String> allowPathList,
ArrayList<String> disallowPathList,
Date loadedDate,
Date modDate,
String eTag,
String sitemap,
Integer crawlDelay
int crawlDelay
) {
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");

this.hostName = hostName.trim().toLowerCase();
this.disallowPathList = new LinkedList<String>();
this.allowPathList = new LinkedList<String>();
this.denyPathList = new LinkedList<String>();

this.mem = new HashMap<String, String>(5);
if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime()));
if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
if (eTag != null) this.mem.put(ETAG,eTag);
if (sitemap != null) this.mem.put(SITEMAP,sitemap);
if (crawlDelay != null) this.mem.put(CRAWL_DELAY,crawlDelay.toString());
if (crawlDelay != 0) this.mem.put(CRAWL_DELAY, Integer.toString(crawlDelay));

if ((allowPathList != null)&&(allowPathList.size()>0)) {
this.allowPathList.addAll(allowPathList);

StringBuffer pathListStr = new StringBuffer();
for (int i=0; i<allowPathList.size();i++) {
pathListStr.append(allowPathList.get(i))
.append(ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(ALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1));
}

if ((disallowPathList != null)&&(disallowPathList.size()>0)) {
this.disallowPathList.addAll(disallowPathList);
this.denyPathList.addAll(disallowPathList);

StringBuffer pathListStr = new StringBuffer();
for (int i=0; i<disallowPathList.size();i++) {
@@ -364,21 +393,16 @@ public int getCrawlDelay() {
}

public boolean isDisallowed(String path) {
if ((this.mem == null) || (this.disallowPathList.size() == 0)) return false;
if ((this.mem == null) || (this.denyPathList.size() == 0)) return false;

// if the path is null or empty we set it to /
if ((path == null) || (path.length() == 0)) path = "/";
// escaping all occurences of ; because this char is used as special char in the Robots DB
else path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR,"%3B");


Iterator<String> pathIter = this.disallowPathList.iterator();
Iterator<String> pathIter = this.denyPathList.iterator();
while (pathIter.hasNext()) {
String nextPath = pathIter.next();
// allow rule
if (nextPath.startsWith("!") && nextPath.length() > 1 && path.startsWith(nextPath.substring(1))) {
return false;
}

// disallow rule
if (path.startsWith(nextPath)) {
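
Inside RobotsTxt.Entry the Allow component is stored in parallel to the existing Disallow data: the constructor now takes both an allowPathList and a denyPathList, both are joined with ROBOTS_DB_PATH_SEPARATOR and persisted under the new ALLOW_PATH_LIST ("allow") key next to DISALLOW_PATH_LIST ("disallow"), and isDisallowed() matches only the deny list instead of interpreting "!"-prefixed deny entries as allow rules. A hedged sketch of building such an entry from inside RobotsTxt (all argument values are illustrative):

    ArrayList<String> allow = new ArrayList<String>();
    allow.add("/public/");
    ArrayList<String> deny = new ArrayList<String>();
    deny.add("/");
    Entry entry = new Entry(
        "example.net:80",  // hostName (host:port, illustrative)
        allow,             // allow paths, new in this commit
        deny,              // deny paths
        new Date(),        // loadedDate
        new Date(),        // modDate
        null,              // eTag
        null,              // sitemap
        0);                // crawlDelay (0 = none given)
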
111 changes: 69 additions & 42 deletions source/de/anomic/crawler/robotsParser.java
@@ -22,13 +22,13 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

// extended to return structured objects instead of an Object[] and
// extended to return an Allow-List by Michael Christen, 21.07.2008

package de.anomic.crawler;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URLDecoder;
@@ -52,49 +52,57 @@
* - Robot Exclusion Standard Revisited
* See: http://www.kollar.com/robots.html
*/
public final class robotsParser{

public final class robotsParser {

public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
public static final String ROBOTS_COMMENT = "#";
public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
public static final String ROBOTS_CRAWL_DELAY = "Crawl-Delay:".toUpperCase();

/*public robotsParser(URL robotsUrl){
}*/
/*
* this parses the robots.txt.
* at the Moment it only creates a list of Deny Paths
*/
private ArrayList<String> allowList;
private ArrayList<String> denyList;
private String sitemap;
private int crawlDelay;

public static Object[] parse(File robotsFile) {
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(robotsFile));
if (reader != null) try{reader.close();}catch(Exception e){/* ignore this */}
return parse(reader);
} catch (FileNotFoundException e1) {
public robotsParser(byte[] robotsTxt) {
if ((robotsTxt == null)||(robotsTxt.length == 0)) {
allowList = new ArrayList<String>(0);
denyList = new ArrayList<String>(0);
sitemap = "";
crawlDelay = 0;
} else {
ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
parse(reader);
}
return new Object[]{new ArrayList<String>(), "", new Integer(0)};
}

@SuppressWarnings("unchecked")
public static Object[] parse(byte[] robotsTxt) {
if ((robotsTxt == null)||(robotsTxt.length == 0)) return new Object[]{new ArrayList(0),null,null};
ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
return parse(reader);
public robotsParser(BufferedReader reader) {
if (reader == null) {
allowList = new ArrayList<String>(0);
denyList = new ArrayList<String>(0);
sitemap = "";
crawlDelay = 0;
} else {
parse(reader);
}
}

public static Object[] parse(BufferedReader reader) {
private void parse(BufferedReader reader) {
ArrayList<String> deny4AllAgents = new ArrayList<String>();
ArrayList<String> deny4YaCyAgent = new ArrayList<String>();
ArrayList<String> allow4AllAgents = new ArrayList<String>();
ArrayList<String> allow4YaCyAgent = new ArrayList<String>();

int pos;
String line = null, lineUpper = null, sitemap = null;
Integer crawlDelay = null;
boolean isRuleBlock4AllAgents = false,
isRuleBlock4YaCyAgent = false,
String line = null, lineUpper = null;
sitemap = null;
crawlDelay = 0;
boolean isRule4AllAgents = false,
isRule4YaCyAgent = false,
rule4YaCyFound = false,
inBlock = false;

@@ -120,9 +128,9 @@ public static Object[] parse(BufferedReader reader) {
if (inBlock) {
// we have detected the start of a new block
inBlock = false;
isRuleBlock4AllAgents = false;
isRuleBlock4YaCyAgent = false;
crawlDelay = null; // each block has a separate delay
isRule4AllAgents = false;
isRule4YaCyAgent = false;
crawlDelay = 0; // each block has a separate delay
}

// cutting off comments at the line end
@@ -136,15 +144,15 @@ public static Object[] parse(BufferedReader reader) {
pos = line.indexOf(" ");
if (pos != -1) {
String userAgent = line.substring(pos).trim();
isRuleBlock4AllAgents |= userAgent.equals("*");
isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
isRule4AllAgents |= userAgent.equals("*");
isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
if (isRule4YaCyAgent) rule4YaCyFound = true;
}
} else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
pos = line.indexOf(" ");
if (pos != -1) {
try {
crawlDelay = Integer.valueOf(line.substring(pos).trim());
crawlDelay = Integer.parseInt(line.substring(pos).trim());
} catch (NumberFormatException e) {
// invalid crawling delay
}
Expand All @@ -154,7 +162,7 @@ public static Object[] parse(BufferedReader reader) {
inBlock = true;
boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);

if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
if (isRule4YaCyAgent || isRule4AllAgents) {
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
@@ -185,17 +193,36 @@ public static Object[] parse(BufferedReader reader) {
path = path.replaceAll(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B");

// adding it to the pathlist
if (!isDisallowRule) path = "!" + path;
if (isRuleBlock4AllAgents) deny4AllAgents.add(path);
if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path);
if (isDisallowRule) {
if (isRule4AllAgents) deny4AllAgents.add(path);
if (isRule4YaCyAgent) deny4YaCyAgent.add(path);
} else {
if (isRule4AllAgents) allow4AllAgents.add(path);
if (isRule4YaCyAgent) allow4YaCyAgent.add(path);
}
}
}
}
}
} catch (IOException e) {}

ArrayList<String> denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents;
return new Object[]{denyList, sitemap, crawlDelay};
allowList = (rule4YaCyFound) ? allow4YaCyAgent : allow4AllAgents;
denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents;
}

public int crawlDelay() {
return this.crawlDelay;
}

public String sitemap() {
return this.sitemap;
}

public ArrayList<String> allowList() {
return this.allowList;
}

public ArrayList<String> denyList() {
return this.denyList;
}
}
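
How the rewritten parse(BufferedReader) behaves end to end: it keeps separate allow/deny lists for the "*" block and for a yacy-specific User-agent block, and once the input is consumed it publishes the yacy-specific pair when some rule block addressed yacy, otherwise the "*" pair; Crawl-Delay and Sitemap lines are picked up along the way. A small self-contained demonstration under those assumptions (the sample robots.txt text and the class name are invented for this example; the expected values are inferred from the diff, not verified output):

    import java.io.BufferedReader;
    import java.io.StringReader;
    import java.util.ArrayList;
    import de.anomic.crawler.robotsParser;

    public class RobotsParserDemo {
        public static void main(String[] args) {
            String robotsTxt =
                "User-agent: *\n" +
                "Disallow: /private/\n" +
                "Allow: /private/public/\n" +
                "Crawl-delay: 5\n";
            robotsParser parser = new robotsParser(
                new BufferedReader(new StringReader(robotsTxt)));
            ArrayList<String> deny = parser.denyList();   // expected: ["/private/"]
            ArrayList<String> allow = parser.allowList(); // expected: ["/private/public/"]
            System.out.println(deny + " " + allow);
            System.out.println(parser.crawlDelay());      // expected: 5
            System.out.println(parser.sitemap());         // expected: null (no Sitemap: line)
        }
    }
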
1 change: 0 additions & 1 deletion source/de/anomic/plasma/plasmaSwitchboard.java
@@ -86,7 +86,6 @@ this class is also the core of the http crawling.
package de.anomic.plasma;


import java.awt.GraphicsEnvironment;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
6 changes: 3 additions & 3 deletions source/de/anomic/tools/consoleInterface.java
@@ -38,15 +38,15 @@ public class consoleInterface extends Thread
/**
* FIXME just for debugging
*/
private final String name;
//private final String name;
private serverLog log;


public consoleInterface (final InputStream stream, String name, serverLog log)
public consoleInterface(final InputStream stream, String name, serverLog log)
{
this.log = log;
this.stream = stream;
this.name = name;
//this.name = name;
// block reading {@see getOutput()}
try {
dataIsRead.acquire();
