Skip to content

Commit

Permalink
added a 'greedy learning' mechanism which ensures that a 'fresh'
Browse files Browse the repository at this point in the history
yacy will load linked web pages from search results until the total
number of web pages reaches 15000. This gives fresh peers a 'boost'
so that they build a personalized search index faster.
  • Loading branch information
Orbiter committed Jun 11, 2013
1 parent a5e328d commit 6115bef
Show file tree
Hide file tree
Showing 11 changed files with 117 additions and 48 deletions.
11 changes: 11 additions & 0 deletions defaults/yacy.init
Original file line number Diff line number Diff line change
Expand Up @@ -1156,3 +1156,14 @@ interaction.autocrawler.categoryfilter = .*
# host browser settings
browser.autoload = false
browser.load4everyone = false


# greedy learning: fast information acquisition heuristic for new peers
# to make greedy learning work, it must be enabled in the network definition
# the user may switch it off at any time. Once the automatic learning limit is reached,
# the cleanup process sets the active flag to false automatically, and it will do so
# again each time the user switches it back on.
# While the switch is on, each user-submitted search will be done along
# with some heuristics like: loading linked documents and adding a twitter search.
# When the learning mode is finished, the user may switch on individual heuristics by himself.
greedylearning.active = true
16 changes: 11 additions & 5 deletions defaults/yacy.network.freeworld.unit
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,13 @@ network.unit.remotecrawl.speed = 300
# addresses of seed-list bootstrap locations
network.unit.bootstrap.seedlist0 = http://www.yacy.net/seed.txt
network.unit.bootstrap.seedlist1 = http://home.arcor.de/hermens/yacy/seed.txt
network.unit.bootstrap.seedlist2 = http://low.audioattack.de/yacy/seed.txt
network.unit.bootstrap.seedlist3 = http://www.lulabad.de/seed.txt
network.unit.bootstrap.seedlist4 = http://sixcooler.de/yacy/seed.txt
network.unit.bootstrap.seedlist5 = http://headrift.dyndns.org/yacy/seed.txt
network.unit.bootstrap.seedlist6 = http://dk5ras.dyndns.org/seed.txt
network.unit.bootstrap.seedlist2 = http://www.lulabad.de/seed.txt
network.unit.bootstrap.seedlist3 = http://sixcooler.de/yacy/seed.txt
network.unit.bootstrap.seedlist4 = http://img.homepage.bluewin.ch/352348/seed.txt
network.unit.bootstrap.seedlist5 = https://esbek.iv.net.pl/yacy/seed.txt
network.unit.bootstrap.seedlist6 = http://yacy.seed.mylookr.com/seed.txt
network.unit.bootstrap.seedlist7 = http://mary.dyndns.biz/yacy/seed.txt


# each network may use different yacy distributions.
# the auto-updater can access network-specific update locations
Expand All @@ -94,3 +96,7 @@ network.unit.protocol.control = uncontrolled
# white/blacklists
network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,localhost
network.unit.access.blacklist =

# greedy learning: fast information acquisition heuristic for new peers
greedylearning.enabled = true
greedylearning.limit.doccount = 15000
6 changes: 5 additions & 1 deletion defaults/yacy.network.intranet.unit
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,8 @@ network.unit.protocol.control = uncontrolled

# white/blacklists
network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,localhost
network.unit.access.blacklist =
network.unit.access.blacklist =

# greedy learning: fast information acquisition heuristic for new peers
greedylearning.enabled = false
greedylearning.limit.doccount = 15000
6 changes: 5 additions & 1 deletion defaults/yacy.network.metager.unit
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,8 @@ network.unit.protocol.control = uncontrolled

# white/blacklists
network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,213.183.195.83,130.75.2.35,85.31.186.137,localhost
network.unit.access.blacklist =
network.unit.access.blacklist =

# greedy learning: fast information acquisition heuristic for new peers
greedylearning.enabled = false
greedylearning.limit.doccount = 15000
6 changes: 5 additions & 1 deletion defaults/yacy.network.webportal.unit
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,8 @@ network.unit.protocol.control = uncontrolled

# white/blacklists
network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,localhost
network.unit.access.blacklist =
network.unit.access.blacklist =

# greedy learning: fast information acquisition heuristic for new peers
greedylearning.enabled = false
greedylearning.limit.doccount = 15000
38 changes: 19 additions & 19 deletions htroot/ConfigHeuristics_p.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,25 +55,25 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
// store this call as api call
sb.tables.recordAPICall(post, "ConfigHeuristics.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "heuristic settings");

if (post.containsKey("site_on")) sb.setConfig("heuristic.site", true);
if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false);
if (post.containsKey("searchresult_on")) sb.setConfig("heuristic.searchresults", true);
if (post.containsKey("searchresult_off")) sb.setConfig("heuristic.searchresults", false);
if (post.containsKey("searchresultglobal_on")) sb.setConfig("heuristic.searchresults.crawlglobal", true);
if (post.containsKey("searchresultglobal_off")) sb.setConfig("heuristic.searchresults.crawlglobal", false);
if (post.containsKey("blekko_on")) sb.setConfig("heuristic.blekko", true);
if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false);
if (post.containsKey("twitter_on")) sb.setConfig("heuristic.twitter", true);
if (post.containsKey("twitter_off")) sb.setConfig("heuristic.twitter", false);
if (post.containsKey("site_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_SITE, true);
if (post.containsKey("site_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SITE, false);
if (post.containsKey("searchresult_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, true);
if (post.containsKey("searchresult_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false);
if (post.containsKey("searchresultglobal_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, true);
if (post.containsKey("searchresultglobal_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false);
if (post.containsKey("blekko_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, true);
if (post.containsKey("blekko_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, false);
if (post.containsKey("twitter_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_TWITTER, true);
if (post.containsKey("twitter_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_TWITTER, false);
if (post.containsKey("opensearch_on")) {
sb.setConfig("heuristic.opensearch", true);
sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, true);
// re-read config (and create work table)
OpenSearchConnector os = new OpenSearchConnector(sb, true);
if (os.getSize() == 0) {
osderrmsg = "no active search targets are configured";
}
}
if (post.containsKey("opensearch_off")) sb.setConfig("heuristic.opensearch", false);
if (post.containsKey("opensearch_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, false);
if (post.containsKey("discoverosd")) {
final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
&& (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()));
Expand Down Expand Up @@ -155,12 +155,12 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
&& (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()))
&& sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);
if (!showmetafieldbutton) prop.put("osdsolrfieldswitch",1);
prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0);
prop.put("searchresult.checked", sb.getConfigBool("heuristic.searchresults", false) ? 1 : 0);
prop.put("searchresultglobal.checked", sb.getConfigBool("heuristic.searchresults.crawlglobal", false) ? 1 : 0);
prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0);
prop.put("twitter.checked", sb.getConfigBool("heuristic.twitter", false) ? 1 : 0);
prop.put("opensearch.checked", sb.getConfigBool("heuristic.opensearch", false) ? 1 : 0);
prop.put("site.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_SITE, false) ? 1 : 0);
prop.put("searchresult.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false) ? 1 : 0);
prop.put("searchresultglobal.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false) ? 1 : 0);
prop.put("blekko.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_BLEKKO, false) ? 1 : 0);
prop.put("twitter.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_TWITTER, false) ? 1 : 0);
prop.put("opensearch.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) ? 1 : 0);

// display config file content
final File f = new File (sb.getDataPath(),"DATA/SETTINGS/heuristicopensearch.conf");
Expand Down Expand Up @@ -238,7 +238,7 @@ private static void writeopensearchcfg(final Switchboard sb, final serverObjects
}

// re-read config (and create/update work table)
if (sb.getConfigBool("heuristic.opensearch", true)) {
if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, true)) {
OpenSearchConnector os = new OpenSearchConnector(sb, true);
}
}
Expand Down
6 changes: 3 additions & 3 deletions htroot/ConfigNetwork_p.java
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,9 @@ public static serverObjects respond(
boolean indexReceive = "on".equals(post.get("indexReceive", ""));
if ( !indexReceive ) {
// remove heuristics
sb.setConfig("heuristic.site", false);
sb.setConfig("heuristic.blekko", false);
sb.setConfig("heuristic.twitter", false);
sb.setConfig(SwitchboardConstants.HEURISTIC_SITE, false);
sb.setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, false);
sb.setConfig(SwitchboardConstants.HEURISTIC_TWITTER, false);
}
final boolean robinsonmode = "robinson".equals(post.get("network", ""));
if ( robinsonmode ) {
Expand Down
17 changes: 10 additions & 7 deletions htroot/yacysearch.java
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,13 @@ public static serverObjects respond(
sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, true)
|| sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true)
|| clustersearch;
boolean global = post == null || (post.get("resource", "local").equals("global") && sb.peers.sizeConnected() > 0 && indexReceiveGranted);
prop.put("topmenu_resource-select", (sb.peers == null || sb.peers.sizeConnected() == 0 || !indexReceiveGranted) ? 0 : global ? 1 : 2);
boolean p2pmode = sb.peers != null && sb.peers.sizeConnected() > 0 && indexReceiveGranted;
boolean global = post == null || (post.get("resource", "local").equals("global") && p2pmode);
boolean stealthmode = p2pmode && !global;
prop.put("topmenu_resource-select", stealthmode ? 2 : global ? 1 : 0);

if ( post == null || indexSegment == null || env == null || !searchAllowed ) {
if (indexSegment == null) Log.logInfo("yacysearch", "indexSegment == null");
// we create empty entries for template strings
prop.put("searchagain", "0");
prop.put("former", "");
Expand Down Expand Up @@ -483,7 +486,7 @@ public static serverObjects respond(
}

final int heuristicTwitter = querystring.indexOf("/heuristic/twitter", 0);
if ( heuristicBlekko >= 0 ) {
if ( heuristicTwitter >= 0 ) {
querystring = querystring.replace("/heuristic/twitter", "");
modifier.add("/heuristic/twitter");
}
Expand Down Expand Up @@ -723,16 +726,16 @@ public static serverObjects respond(
(int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0));

if ( startRecord == 0 ) {
if ( modifier.sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated ) {
if ( modifier.sitehost != null && sb.getConfigBool(SwitchboardConstants.HEURISTIC_SITE, false) && authenticated && !stealthmode) {
sb.heuristicSite(theSearch, modifier.sitehost);
}
if ( (heuristicBlekko >= 0 || sb.getConfigBool("heuristic.blekko", false)) && authenticated ) {
if ( (heuristicBlekko >= 0 || sb.getConfigBool(SwitchboardConstants.HEURISTIC_BLEKKO, false)) && authenticated && !stealthmode ) {
sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko");
}
if ( (heuristicTwitter >= 0 || sb.getConfigBool("heuristic.twitter", false)) && authenticated ) {
if ( (heuristicTwitter >= 0 || sb.getConfigBool(SwitchboardConstants.HEURISTIC_TWITTER, false)) && authenticated && !stealthmode ) {
sb.heuristicRSS("http://search.twitter.com/search.rss?rpp=50&q=$", theSearch, "twitter");
}
if (sb.getConfigBool("heuristic.opensearch", false) && authenticated) {
if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && authenticated && !stealthmode) {
OpenSearchConnector.query(sb, theSearch);
}
}
Expand Down
12 changes: 11 additions & 1 deletion htroot/yacysearchitem.java
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,17 @@ public static serverObjects respond(final RequestHeader header, final serverObje
prop.put("content_loc_lat", result.lat());
prop.put("content_loc_lon", result.lon());
}
if (sb.getConfigBool("heuristic.searchresults",false)) sb.heuristicSearchResults(resultUrlstring);
final boolean clustersearch = sb.isRobinsonMode() && sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "").equals(SwitchboardConstants.CLUSTER_MODE_PUBLIC_CLUSTER);
final boolean indexReceiveGranted =
sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, true)
|| sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true)
|| clustersearch;
boolean p2pmode = sb.peers != null && sb.peers.sizeConnected() > 0 && indexReceiveGranted;
boolean global = post == null || (post.get("resource", "local").equals("global") && p2pmode);
boolean stealthmode = p2pmode && !global;
if ((sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false) ||
(sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false) && sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ENABLED, false))) &&
!stealthmode) sb.heuristicSearchResults(resultUrlstring);
theSearch.query.transmitcount = item + 1;
return prop;
}
Expand Down
22 changes: 16 additions & 6 deletions source/net/yacy/search/Switchboard.java
Original file line number Diff line number Diff line change
Expand Up @@ -1303,9 +1303,9 @@ public void switchNetwork(final String networkDefinition) throws FileNotFoundExc
ResultURLs.clearStacks();

// remove heuristics
setConfig("heuristic.site", false);
setConfig("heuristic.blekko", false);
setConfig("heuristic.twitter", false);
setConfig(SwitchboardConstants.HEURISTIC_SITE, false);
setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, false);
setConfig(SwitchboardConstants.HEURISTIC_TWITTER, false);

// relocate
this.peers.relocate(
Expand Down Expand Up @@ -2041,6 +2041,15 @@ && getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, "").isEmpty() ) {
setConfig("adminAccount", "");
}

// stop greedylearning if limit is reached
if (getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false)) {
long cs = this.index.fulltext().collectionSize();
if (cs > getConfigInt(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, 0)) {
setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false);
log.logInfo("finishing greedy learning phase, size=" +cs);
}
}

// refresh recrawl dates
try {
CrawlProfile selentry;
Expand Down Expand Up @@ -2265,6 +2274,7 @@ && getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, "").isEmpty() ) {
// if no crawl is running and processing is activated:
// execute the (post-) processing steps for all entries that have a process tag assigned
if (this.crawlQueues.coreCrawlJobSize() == 0) {
if (this.crawlQueues.noticeURL.isEmpty()) this.crawlQueues.noticeURL.clear(); // flushes more caches
index.fulltext().getDefaultConfiguration().postprocessing(index);
index.fulltext().getWebgraphConfiguration().postprocessing(index);
}
Expand Down Expand Up @@ -3371,7 +3381,7 @@ public void run() {
}.start();
}

public final void heuristicSearchResults(final String host) {
public final void heuristicSearchResults(final String url) {
new Thread() {

@Override
Expand All @@ -3380,7 +3390,7 @@ public void run() {
// get the links for a specific site
final DigestURI startUrl;
try {
startUrl = new DigestURI(host);
startUrl = new DigestURI(url);
} catch (final MalformedURLException e) {
Log.logException(e);
return;
Expand All @@ -3393,7 +3403,7 @@ public void run() {
if (links != null) {
if (links.size() < 1000) { // limit to 1000 to skip large index pages
final Iterator<DigestURI> i = links.keySet().iterator();
final boolean globalcrawljob = Switchboard.this.getConfigBool("heuristic.searchresults.crawlglobal",false);
final boolean globalcrawljob = Switchboard.this.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL,false);
Collection<DigestURI> urls = new ArrayList<DigestURI>();
while (i.hasNext()) {
url = i.next();
Expand Down
25 changes: 21 additions & 4 deletions source/net/yacy/search/SwitchboardConstants.java
Original file line number Diff line number Diff line change
Expand Up @@ -498,8 +498,25 @@ public final class SwitchboardConstants {
/**
* system tray
*/
public static final String TRAY_ICON_ENABLED = "tray.icon.enabled";
public static final String TRAY_ICON_FORCED = "tray.icon.force";
public static final String TRAY_ICON_LABEL = "tray.icon.label";
public static final String TRAY_MENU_ENABLED = "tray.menu.enabled";
public static final String TRAY_ICON_ENABLED = "tray.icon.enabled";
public static final String TRAY_ICON_FORCED = "tray.icon.force";
public static final String TRAY_ICON_LABEL = "tray.icon.label";
public static final String TRAY_MENU_ENABLED = "tray.menu.enabled";

/*
* search heuristics
*/
public static final String HEURISTIC_SITE = "heuristic.site";
public static final String HEURISTIC_SEARCHRESULTS = "heuristic.searchresults";
public static final String HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL = "heuristic.searchresults.crawlglobal";
public static final String HEURISTIC_BLEKKO = "heuristic.blekko";
public static final String HEURISTIC_TWITTER = "heuristic.twitter";
public static final String HEURISTIC_OPENSEARCH = "heuristic.opensearch";

/*
* automatic learning heuristic
*/
public static final String GREEDYLEARNING_ENABLED = "greedylearning.enabled";
public static final String GREEDYLEARNING_LIMIT_DOCCOUNT = "greedylearning.limit.doccount";
public static final String GREEDYLEARNING_ACTIVE = "greedylearning.active";
}

0 comments on commit 6115bef

Please sign in to comment.