Skip to content

Commit

Permalink
- added support for multiple paths per domain to default-blacklist
Browse files Browse the repository at this point in the history
Warning: an interface change was necessary:
- remove(String, String) has been renamed to removeAll(String, String), because it removes all path-entries for the specified host
- remove(String, String, String) has been added to delete only a path-entry
- geBlacklistMap(String) has been renamed to getBlacklistMap(String)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3391 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
karlchenofhell committed Feb 24, 2007
1 parent 3d6ab19 commit 26f5757
Show file tree
Hide file tree
Showing 7 changed files with 108 additions and 37 deletions.
6 changes: 4 additions & 2 deletions htroot/BlacklistCleaner_p.java
Expand Up @@ -249,7 +249,8 @@ private static int removeEntries(String blacklistToUse, String[] supportedBlackl
for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists", blacklistToUse)) {
plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],
(s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/")));
(s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/")),
(s.indexOf("/") == -1) ? ".*" : s.substring(s.indexOf("/") + 1));
}
}
}
Expand Down Expand Up @@ -284,7 +285,8 @@ private static int alterEntries(String blacklistToUse, String[] supportedBlackli
for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists", blacklistToUse)) {
plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],
(s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/")));
(s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/")),
(s.indexOf("/") == -1) ? ".*" : s.substring(s.indexOf("/") + 1));
plasmaSwitchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes], host, path);
}
}
Expand Down
6 changes: 3 additions & 3 deletions htroot/Blacklist_p.java
Expand Up @@ -225,7 +225,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
}
for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) {
plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos));
plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos), oldEntry.substring(pos + 1));
}
}

Expand Down Expand Up @@ -307,7 +307,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
if (nextEntry.length() == 0) continue;
if (nextEntry.startsWith("#")) continue;

prop.put(DISABLED + "Itemlist_" + entryCount + "_item", de.anomic.data.wikiCode.replaceXMLEntities(nextEntry));
prop.put(DISABLED + "Itemlist_" + entryCount + "_item", nextEntry);
entryCount++;
}
prop.put(DISABLED + "Itemlist", entryCount);
Expand Down Expand Up @@ -341,7 +341,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
int blacklistCount = 0;
if (dirlist != null) {
for (int i = 0; i <= dirlist.length - 1; i++) {
prop.put(DISABLED + BLACKLIST + blacklistCount + "_name", de.anomic.data.wikiCode.replaceXMLEntities(dirlist[i]));
prop.put(DISABLED + BLACKLIST + blacklistCount + "_name", dirlist[i]);
prop.put(DISABLED + BLACKLIST + blacklistCount + "_selected", 0);

if (dirlist[i].equals(blacklistToUse)) { //current List
Expand Down
25 changes: 25 additions & 0 deletions source/de/anomic/kelondro/kelondroMSetTools.java
Expand Up @@ -45,6 +45,7 @@
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
Expand Down Expand Up @@ -378,6 +379,30 @@ public static TreeMap loadMap(String filename, String sep) {
return map;
}

/**
 * Reads a text file of "key&lt;sep&gt;value" lines in which a key may occur on
 * several lines, and gathers all values of one key into a list.
 *
 * Lines are trimmed before evaluation. Empty lines, lines starting with '#'
 * and lines in which the separator is missing or is the very first character
 * are ignored. Keys are trimmed and lower-cased, values are trimmed. Values
 * keep the order in which they appear in the file.
 *
 * I/O errors are not reported: whatever has been read up to the error (or an
 * empty map if the file could not be opened) is returned.
 *
 * @param filename path of the file to read
 * @param sep      separator between key and value; only its first occurrence
 *                 in a line is used for splitting
 * @return map from key (String) to its values (ArrayList of String), never null
 */
public static TreeMap /* <String,ArrayList<String>> */ loadMapMultiValsPerKey(String filename, String sep) {
    final TreeMap result = new TreeMap();
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
        for (String raw = reader.readLine(); raw != null; raw = reader.readLine()) {
            final String entry = raw.trim();

            // skip blank lines and comment lines
            if (entry.length() == 0 || entry.startsWith("#")) continue;

            // skip lines without a separator or with an empty key part
            final int sepAt = entry.indexOf(sep);
            if (sepAt <= 0) continue;

            final String key = entry.substring(0, sepAt).trim().toLowerCase();
            final String value = entry.substring(sepAt + sep.length()).trim();

            // fetch the value list of this key, creating it on first sight
            ArrayList values = (ArrayList) result.get(key);
            if (values == null) {
                values = new ArrayList();
                result.put(key, values);
            }
            values.add(value);
        }
    } catch (IOException e) {
        // deliberately ignored: return what could be read so far
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception e) {
                // nothing sensible to do if closing fails
            }
        }
    }
    return result;
}

public static TreeSet loadList(File file, Comparator c) {
TreeSet list = new TreeSet(c);
if (!(file.exists())) return list;
Expand Down
55 changes: 34 additions & 21 deletions source/de/anomic/plasma/urlPattern/abstractURLPattern.java
Expand Up @@ -45,13 +45,19 @@

package de.anomic.plasma.urlPattern;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeMap;

import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.net.URL;
Expand All @@ -67,8 +73,7 @@ public abstract class abstractURLPattern implements plasmaURLPattern {

protected File blacklistRootPath = null;
protected HashMap cachedUrlHashs = null;
protected HashMap hostpaths = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here

protected HashMap /* <blacklistType,HashMap<host,ArrayList<path>>> */ hostpaths = null; // outer key=blacklist type; inner key=host, value=list of path patterns for that host; mapped url is http://host/path; path does not start with '/' here

public abstractURLPattern(File rootPath) {
this.setRootPath(rootPath);
Expand Down Expand Up @@ -98,7 +103,7 @@ public void setRootPath(File rootPath) {
this.blacklistRootPath = rootPath;
}

protected HashMap geBlacklistMap(String blacklistType) {
protected HashMap getBlacklistMap(String blacklistType) {
if (blacklistType == null) throw new IllegalArgumentException();
if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type.");

Expand All @@ -124,47 +129,55 @@ public int size() {
int size = 0;
Iterator iter = this.hostpaths.keySet().iterator();
while (iter.hasNext()) {
HashMap blacklistMap = (HashMap) this.hostpaths.get(iter.next());
size += blacklistMap.size();
Iterator blIter = ((HashMap)this.hostpaths.get(iter.next())).values().iterator();
while (blIter.hasNext())
size += ((ArrayList)blIter.next()).size();
}
return size;
}

public void loadList(String[][] filenames, String sep) {
for (int j = 0; j < filenames.length; j++) {
String[] nextFile = filenames[j];
String blacklistType = nextFile[0];
String fileName = nextFile[1];
this.loadList(blacklistType, fileName, sep);
/**
 * Loads a batch of blacklist files, in array order, by delegating each
 * element to {@link #loadList(String, String, String)} with the element's
 * type and file name.
 *
 * @param blFiles the blacklist files to load; must not be null
 * @param sep     separator passed through unchanged to the per-file loader
 */
public void loadList(blacklistFile[] blFiles, String sep) {
    int idx = 0;
    while (idx < blFiles.length) {
        loadList(blFiles[idx].getType(), blFiles[idx].getFileName(), sep);
        idx++;
    }
}

// Loads one or more blacklist files into the map of the given blacklist type.
// 'filenames' is a comma-separated list of file names; each name is resolved
// against blacklistRootPath and parsed with the given key/value separator 'sep'.
// NOTE(review): this span looks like diff-view residue - several statements appear
// twice, presumably once in their pre-change and once in their post-change form.
public void loadList(String blacklistType, String filenames, String sep) {

// presumably the pre-change line (old accessor name geBlacklistMap)
HashMap blacklistMap = geBlacklistMap(blacklistType);
// presumably the post-change line (accessor renamed to getBlacklistMap)
HashMap blacklistMap = getBlacklistMap(blacklistType);
String[] filenamesarray = filenames.split(",");

if( filenamesarray.length > 0) {
if (filenamesarray.length > 0) {
for (int i = 0; i < filenamesarray.length; i++) {
// single-value loader: one path per host
blacklistMap.putAll(kelondroMSetTools.loadMap(new File(this.blacklistRootPath, filenamesarray[i]).toString(), sep));
// multi-value loader: each host maps to a list of paths.
// NOTE(review): putAll replaces the value already stored for a key, so a host listed
// in more than one file (or already present in the map) keeps only the path list of
// the file loaded last - TODO confirm whether the lists should be merged instead.
blacklistMap.putAll(kelondroMSetTools.loadMapMultiValsPerKey(new File(this.blacklistRootPath, filenamesarray[i]).toString(), sep));
}
}
}
}

public void remove(String blacklistType, String host) {

HashMap blacklistMap = geBlacklistMap(blacklistType);
/**
 * Removes a host from the blacklist of the given type, dropping every path
 * entry stored for that host. Does nothing if the host is not listed.
 *
 * @param blacklistType the blacklist type whose map is modified
 * @param host          the host key to remove, used exactly as given
 */
public void removeAll(String blacklistType, String host) {
    getBlacklistMap(blacklistType).remove(host);
}

/**
 * Removes a single path entry of a host from the blacklist of the given type.
 * When the removed entry was the last one of the host, the host itself is
 * removed from the map as well.
 *
 * A host that is not present in the map is silently ignored; without this
 * guard the lookup result would be null and the call would fail with a
 * NullPointerException.
 *
 * @param blacklistType the blacklist type whose map is modified
 * @param host          the host key, used exactly as given
 * @param path          the path entry to delete from the host's list
 */
public void remove(String blacklistType, String host, String path) {
    HashMap blacklistMap = getBlacklistMap(blacklistType);
    ArrayList hostList = (ArrayList) blacklistMap.get(host);

    // unknown host: there is nothing to remove
    if (hostList == null) return;

    hostList.remove(path);

    // do not keep hosts around that have no path entries left
    if (hostList.isEmpty()) {
        blacklistMap.remove(host);
    }
}

// Adds a path entry for a host to the blacklist of the given type.
// Throws NullPointerException if host or path is null. A leading '/' is stripped
// from the path and the host is lower-cased before it is used as map key.
// NOTE(review): this span looks like diff-view residue - it seems to contain both the
// pre-change statements (one path per host) and the post-change statements (list of
// paths per host).
public void add(String blacklistType, String host, String path) {
if (host == null) throw new NullPointerException();
if (path == null) throw new NullPointerException();

// paths are stored without a leading slash
if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);

// presumably the pre-change variant: put() overwrites any path stored for the host before
HashMap blacklistMap = geBlacklistMap(blacklistType);
blacklistMap.put(host.toLowerCase(), path);
// presumably the post-change variant: append the path to the host's list,
// creating the list when the host is seen for the first time
HashMap blacklistMap = getBlacklistMap(blacklistType);
ArrayList hostList = (ArrayList)blacklistMap.get(host.toLowerCase());
if (hostList == null)
blacklistMap.put(host.toLowerCase(), (hostList = new ArrayList()));
hostList.add(path);
}

public int blacklistCacheSize() {
Expand Down
31 changes: 24 additions & 7 deletions source/de/anomic/plasma/urlPattern/defaultURLPattern.java
Expand Up @@ -42,6 +42,7 @@
package de.anomic.plasma.urlPattern;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;


Expand All @@ -60,28 +61,44 @@ public boolean isListed(String blacklistType, String hostlow, String path) {
if (path == null) throw new NullPointerException();

// getting the proper blacklist
HashMap blacklistMap = super.geBlacklistMap(blacklistType);
HashMap blacklistMap = super.getBlacklistMap(blacklistType);

if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
ArrayList app;
boolean matched = false;
String pp = ""; // path-pattern

// first try to match the domain with wildcard '*'
// [TL] While "." are found within the string
int index = 0;
while ((index = hostlow.indexOf('.', index + 1)) != -1) {
if ((pp = (String) blacklistMap.get(hostlow.substring(0, index + 1) + "*")) != null) {
return ((pp.equals("*")) || (path.matches(pp)));
if ((app = (ArrayList) blacklistMap.get(hostlow.substring(0, index + 1) + "*")) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = (String)app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
return matched;
}
}
index = hostlow.length();
while ((index = hostlow.lastIndexOf('.', index - 1)) != -1) {
if ((pp = (String) blacklistMap.get("*" + hostlow.substring(index, hostlow.length()))) != null) {
return ((pp.equals("*")) || (path.matches(pp)));
if ((app = (ArrayList) blacklistMap.get("*" + hostlow.substring(index, hostlow.length()))) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = (String)app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
return matched;
}
}

// try to match without wildcard in domain
return (((pp = (String) blacklistMap.get(hostlow)) != null) &&
((pp.equals("*")) || (path.matches(pp))));
if ((app = (ArrayList)blacklistMap.get(hostlow)) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = (String)app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
return matched;
}
return false;
}
}
20 changes: 17 additions & 3 deletions source/de/anomic/plasma/urlPattern/plasmaURLPattern.java
Expand Up @@ -10,7 +10,20 @@ public interface plasmaURLPattern {
public static final String BLACKLIST_CRAWLER = "crawler";
public static final String BLACKLIST_PROXY = "proxy";
public static final String BLACKLIST_SEARCH = "search";


/**
 * Immutable pair of a blacklist file name and the blacklist type it is
 * loaded for. Note the constructor argument order: file name first, type
 * second.
 */
public static final class blacklistFile {

    private final String type;
    private final String filename;

    /**
     * @param filename the file name, returned unchanged by {@link #getFileName()}
     * @param type     the blacklist type, returned unchanged by {@link #getType()}
     */
    public blacklistFile(String filename, String type) {
        this.type = type;
        this.filename = filename;
    }

    /** @return the blacklist type given at construction */
    public String getType() {
        return type;
    }

    /** @return the file name given at construction */
    public String getFileName() {
        return filename;
    }
}

public String getEngineInfo();

Expand All @@ -21,12 +34,13 @@ public interface plasmaURLPattern {
public int size();

public void clear();
public void remove(String blacklistType, String host);
public void removeAll(String blacklistType, String host);
public void remove(String blacklistType, String host, String path);
public void add(String blacklistType, String host, String path);


public void loadList(String blacklistType, String filenames, String sep);
public void loadList(String[][] filenames, String sep);
public void loadList(blacklistFile[] blFiles, String sep);


public boolean hashInBlacklistedCache(String blacklistType, String urlHash);
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/soap/services/BlacklistService.java
Expand Up @@ -466,7 +466,7 @@ private void removeBlacklistItemFromBlacklist(String blacklistItem, String black

// if the current blacklist is activated for the type, remove the item from the list
if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + BLACKLISTS,blacklistName)) {
plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],itemParts[0]);
plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],itemParts[0], itemParts[1]);
}
}
}
Expand Down

0 comments on commit 26f5757

Please sign in to comment.