Skip to content

Commit

Permalink
*) Modifications for dbImport functionality
Browse files Browse the repository at this point in the history
   - dbImporter threads are now shut down by the switchboard on server shutdown
   - adding possibility to pause an importer thread via GUI
   - Bugfix for abort function
     See: http://www.yacy-forum.de/viewtopic.php?p=13363#13363

*) Modification of content parser configuration
   - it is now possible to configure separately which parsers should be enabled
     for the proxy, crawler, icap, etc.
   - 

*) htmlFilterContentScraper.java
   - adding regular expression to normalize URLs containing /../ and /./ parts

*) httpc.java
   - adding functionality to unzip gzipped content
   - requested by roland: should be used later to allow gzipped seed lists

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1170 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
theli committed Dec 6, 2005
1 parent 28ddba8 commit 44fa94a
Show file tree
Hide file tree
Showing 20 changed files with 442 additions and 274 deletions.
25 changes: 17 additions & 8 deletions htroot/IndexImport_p.html
Expand Up @@ -30,7 +30,6 @@ <h3>Starting new Job</h3>
</form>

<hr>
<form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
<h3>Currently running jobs</h3>
<p>
<table border="0" cellpadding="2" cellspacing="1">
Expand All @@ -44,12 +43,15 @@ <h3>Currently running jobs</h3>
<td class="small" ># URLs</td>
<td class="small" ># Word<br>Entities</td>
<td class="small" ># Word<br>Entries</td>
<td class="small" >Stop Import</td>
<td class="small" >Abort Import</td>
<td class="small" >Pause Import</td>
</tr>
#{running.jobs}#
<form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
<input type="hidden" name="jobNr" value="#[job_nr]#">
<tr class="TableCellLight">
<td class="small">#[path]#</td>
<td class="small"><font color="#(stopped)#red::green#(/stopped)#">#(stopped)#Finished::Running#(/stopped)#</font></td>
<td class="small"><font color="#(status)#red::green::blue#(/status)#">#(status)#Finished::Running::Paused#(/status)#</font></td>
<td class="small" align="right">#[percent]#</td>
<td class="small" align="right">#[elapsed]#</td>
<td class="small" align="right">#[estimated]#</td>
Expand All @@ -59,14 +61,21 @@ <h3>Currently running jobs</h3>
<td class="small" align="rigth">#[word_entry_num]#</td>
<td class="small">
#(stopped)#::
<input type="submit" name="stopIndexDbImport" value="Stop Index Import">
<input type="hidden" name="jobNr" value="#[job_nr]#">
<input type="submit" name="stopIndexDbImport" value="Abort Import">
#(/stopped)#
</td>
</td>
<td class="small">
#(paused)#
<input type="submit" name="pauseIndexDbImport" value="Pause Import">
::
<input type="submit" name="continueIndexDbImport" value="Continue Import">
#(/paused)#
</td>
</tr>
</form>
#{/running.jobs}#
</table>
</form>


<hr>
<form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
Expand All @@ -86,7 +95,7 @@ <h3>Finished jobs</h3>
#{finished.jobs}#
<tr class="TableCellLight">
<td class="small">#[path]#</td>
<td class="small"><font color="#(stopped)#red::green::red#(/stopped)#">#(stopped)#Finished::<b>Error:</b> #[errorMsg]##(/stopped)#</font></td>
<td class="small"><font color="#(status)#red::green::red#(/status)#">#(status)#Finished::<b>Error:</b> #[errorMsg]#::Paused#(/status)#</font></td>
<td class="small" align="right">#[percent]#</td>
<td class="small" align="right">#[elapsed]#</td>
<td class="small" align="right"><tt>#[wordHash]#</tt></td>
Expand Down
69 changes: 37 additions & 32 deletions htroot/IndexImport_p.java
Expand Up @@ -102,8 +102,12 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
plasmaDbImporter.finishedJobs.clear();
prop.put("LOCATION","");
return prop;
} else if (post.containsKey("stopIndexDbImport")) {
// getting the job nr of the thread that should be stopped
} else if (
(post.containsKey("stopIndexDbImport")) ||
(post.containsKey("pauseIndexDbImport")) ||
(post.containsKey("continueIndexDbImport"))
) {
// getting the job nr of the thread
String jobNr = (String) post.get("jobNr");

Thread[] importThreads = new Thread[plasmaDbImporter.runningJobs.activeCount()*2];
Expand All @@ -112,12 +116,13 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
for (int i=0; i < activeCount; i++) {
plasmaDbImporter currThread = (plasmaDbImporter) importThreads[i];
if (currThread.getJobNr() == Integer.valueOf(jobNr).intValue()) {
currThread.stoppIt();
try {
currThread.join();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
if (post.containsKey("stopIndexDbImport")) {
currThread.stoppIt();
try { currThread.join(); } catch (InterruptedException e) {e.printStackTrace();}
} else if (post.containsKey("pauseIndexDbImport")) {
currThread.pauseIt();
} else if (post.containsKey("continueIndexDbImport")) {
currThread.continueIt();
}
break;
}
Expand All @@ -138,30 +143,30 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve

for (int i=0; i < activeCount; i++) {
plasmaDbImporter currThread = (plasmaDbImporter) importThreads[i];

// root path of the source db
prop.put("running.jobs_" + i + "_path", currThread.getImportRoot().toString());

// specifies if the importer is still running
prop.put("running.jobs_" + i + "_stopped", currThread.isAlive() ? 1:0);

// specifies if the importer was paused
prop.put("running.jobs_" + i + "_paused", currThread.isPaused() ? 1:0);

File importPath = currThread.getImportRoot();
String currWordHash = currThread.getCurrentWordhash();
long currWordEntryCount = currThread.getWordEntryCounter();
long currWordEntityCounter = currThread.getWordEntityCounter();
long currUrlCounter = currThread.getUrlCounter();
//long currImportDbSize = currThread.getImportWordDbSize();
long estimatedTime = currThread.getEstimatedTime();
long elapsedTime = currThread.getElapsedTime();
int jobNr = currThread.getJobNr();
int percent = currThread.getProcessingStatus();
// setting the status
prop.put("running.jobs_" + i + "_status", currThread.isPaused() ? 2 : currThread.isAlive() ? 1 : 0);

boolean isRunning = currThread.isAlive();
// other information
prop.put("running.jobs_" + i + "_percent", Integer.toString(currThread.getProcessingStatus()));
prop.put("running.jobs_" + i + "_elapsed", serverDate.intervalToString(currThread.getElapsedTime()));
prop.put("running.jobs_" + i + "_estimated", serverDate.intervalToString(currThread.getEstimatedTime()));
prop.put("running.jobs_" + i + "_wordHash", currThread.getCurrentWordhash());
prop.put("running.jobs_" + i + "_url_num", Long.toString(currThread.getUrlCounter()));
prop.put("running.jobs_" + i + "_word_entity_num", Long.toString(currThread.getWordEntityCounter()));
prop.put("running.jobs_" + i + "_word_entry_num", Long.toString(currThread.getWordEntryCounter()));

prop.put("running.jobs_" + i + "_path", importPath.toString());
prop.put("running.jobs_" + i + "_stopped", isRunning ? 1:0);
prop.put("running.jobs_" + i + "_percent", Integer.toString(percent));
prop.put("running.jobs_" + i + "_elapsed", serverDate.intervalToString(elapsedTime));
prop.put("running.jobs_" + i + "_estimated", serverDate.intervalToString(estimatedTime));
prop.put("running.jobs_" + i + "_wordHash", currWordHash);
prop.put("running.jobs_" + i + "_url_num", Long.toString(currUrlCounter));
prop.put("running.jobs_" + i + "_word_entity_num", Long.toString(currWordEntityCounter));
prop.put("running.jobs_" + i + "_word_entry_num", Long.toString(currWordEntryCount));
prop.put("running.jobs_" + i + "_stopped_job_nr", Integer.toString(jobNr));
// job number of the importer thread
prop.put("running.jobs_" + i + "_job_nr", Integer.toString(currThread.getJobNr()));
}
prop.put("running.jobs",activeCount);

Expand All @@ -174,10 +179,10 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
String error = currThread.getError();
prop.put("finished.jobs_" + i + "_path", currThread.getImportRoot().toString());
if (error != null) {
prop.put("finished.jobs_" + i + "_stopped", 2);
prop.put("finished.jobs_" + i + "_stopped_errorMsg", error);
prop.put("finished.jobs_" + i + "_status", 2);
prop.put("finished.jobs_" + i + "_status_errorMsg", error);
} else {
prop.put("finished.jobs_" + i + "_stopped", 0);
prop.put("finished.jobs_" + i + "_status", 0);
}
prop.put("finished.jobs_" + i + "_percent", Integer.toString(currThread.getProcessingStatus()));
prop.put("finished.jobs_" + i + "_elapsed", serverDate.intervalToString(currThread.getElapsedTime()));
Expand Down
3 changes: 2 additions & 1 deletion htroot/QuickCrawlLink_p.html
Expand Up @@ -35,7 +35,8 @@
</td>
</tr>
<tr>
<td><a href="javascript:w = window.open('http://#[host]#:#[port]#/QuickCrawlLink_p.html?localIndexing=on&amp;crawlingQ=on&amp;xdstopw=on&amp;title='+escape(document.title)+'&amp;url='+location.href,'_blank','height=150,width=500,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();"><img src="/env/grafics/addlink.gif" border="0">&nbsp;Crawl with YaCy</a></td></tr>
<td><a href="javascript:w = window.open('http://#[host]#:#[port]#/QuickCrawlLink_p.html?localIndexing=on&amp;crawlingQ=on&amp;xdstopw=on&amp;title='+escape(document.title)+'&amp;url='+location.href,'_blank','height=150,width=500,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();"><img src="/env/grafics/addlink.gif" border="0">&nbsp;Crawl with YaCy</a></td>
</tr>
</table>

::<!-- 1 -->
Expand Down
10 changes: 6 additions & 4 deletions htroot/SettingsAck_p.html
Expand Up @@ -79,13 +79,15 @@ <h2>Settings Receipt:</h2>
Peer names must not contain characters other than (a-z, A-Z, 0-9, '-', '_') and must not be longer than 80 characters.
Your Peer Language is: <font color="#556699">#[peerLang]#</font><br>
::<!-- 18 -->
<p>
The new parser settings where changed successfully.<br>
Parsing of the following mime-types was enabled:<br>
<ul>
Parsing of the following mime-types was enabled:
</p>
<table>
#{parser}#
<li><font color="#556699">#[enabledMime]#</font></li>
<tr><td><font color="#556699">#[parserMode]#</font></td><td><font color="#556699">#[enabledMime]#</font></td></tr>
#{/parser}#
</ul>
</table>
::<!-- 19 -->
Seed Upload method was changed successfully.
#(success)#::<br>You are now a principal peer.#(/success)#
Expand Down
94 changes: 74 additions & 20 deletions htroot/SettingsAck_p.java
Expand Up @@ -49,14 +49,18 @@
import java.net.InetSocketAddress;
import java.net.SocketException;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpd;
import de.anomic.http.httpdProxyHandler;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserConfig;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
Expand Down Expand Up @@ -537,32 +541,82 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
/*
* Parser configuration
*/
if (post.containsKey("parserSettings")) {
plasmaSwitchboard sb = (plasmaSwitchboard)env;
post.remove("parserSettings");
if (post.containsKey("parserSettings")) {
post.remove("parserSettings");

String[] enabledMimes = null;
if (post.containsKey("allParserEnabled")) {
// enable all available parsers
enabledMimes = plasmaParser.setEnabledParserList(sb.parser.getAvailableParserList().keySet());
} else {
// activate all received parsers
enabledMimes = plasmaParser.setEnabledParserList(post.keySet());
}
Arrays.sort(enabledMimes);
HashMap newConfigList = new HashMap();
Set parserModes = plasmaParser.getParserConfigList().keySet();

StringBuffer enabledMimesTxt = new StringBuffer();
for (int i=0; i < enabledMimes.length; i++) {
enabledMimesTxt.append(enabledMimes[i]).append(",");
prop.put("info_parser_" + i + "_enabledMime",enabledMimes[i]);
// looping through all received settings
int pos;
Enumeration keyEnum = post.keys();
while (keyEnum.hasMoreElements()) {
String key = (String) keyEnum.nextElement();
if ((pos = key.indexOf(".")) != -1) {
String currParserMode = key.substring(0,pos).trim().toUpperCase();
String currMimeType = key.substring(pos+1).replaceAll("\n", "");
if (parserModes.contains(currParserMode)) {
HashSet currEnabledMimeTypes;
if (newConfigList.containsKey(currParserMode)) {
currEnabledMimeTypes = (HashSet) newConfigList.get(currParserMode);
} else {
currEnabledMimeTypes = new HashSet();
newConfigList.put(currParserMode, currEnabledMimeTypes);
}
currEnabledMimeTypes.add(currMimeType);
}
}
}
prop.put("info_parser",enabledMimes.length);
if (enabledMimesTxt.length() > 0) enabledMimesTxt.deleteCharAt(enabledMimesTxt.length()-1);

env.setConfig("parseableMimeTypes",enabledMimesTxt.toString());

int enabledMimesCount = 0;
StringBuffer currEnabledMimesTxt = new StringBuffer();
Iterator parserModeIter = newConfigList.keySet().iterator();
while (parserModeIter.hasNext()) {
String currParserMode = (String)parserModeIter.next();
String[] enabledMimes = plasmaParser.setEnabledParserList(currParserMode, (Set)newConfigList.get(currParserMode));
Arrays.sort(enabledMimes);

currEnabledMimesTxt.setLength(0);
for (int i=0; i < enabledMimes.length; i++) {
currEnabledMimesTxt.append(enabledMimes[i]).append(",");
prop.put("info_parser_" + enabledMimesCount + "_parserMode",currParserMode);
prop.put("info_parser_" + enabledMimesCount + "_enabledMime",enabledMimes[i]);
enabledMimesCount++;
}
if (currEnabledMimesTxt.length() > 0) currEnabledMimesTxt.deleteCharAt(currEnabledMimesTxt.length()-1);
env.setConfig("parseableMimeTypes." + currParserMode,currEnabledMimesTxt.toString());
}
prop.put("info_parser",enabledMimesCount);
prop.put("info", 18);
return prop;

// plasmaSwitchboard sb = (plasmaSwitchboard)env;
//
// HashMap configList = plasmaParser.getParserConfigList();
// Iterator parserModeIter = configList.keySet().iterator();
//
// String[] enabledMimes = null;
// if (post.containsKey("allParserEnabled")) {
// // enable all available parsers
// enabledMimes = plasmaParser.setEnabledParserList(sb.parser.getAvailableParserList().keySet());
// } else {
// // activate all received parsers
// enabledMimes = plasmaParser.setEnabledParserList(post.keySet());
// }
// Arrays.sort(enabledMimes);
//
// StringBuffer enabledMimesTxt = new StringBuffer();
// for (int i=0; i < enabledMimes.length; i++) {
// enabledMimesTxt.append(enabledMimes[i]).append(",");
// prop.put("info_parser_" + i + "_enabledMime",enabledMimes[i]);
// }
// prop.put("info_parser",enabledMimes.length);
// if (enabledMimesTxt.length() > 0) enabledMimesTxt.deleteCharAt(enabledMimesTxt.length()-1);
//
// env.setConfig("parseableMimeTypes",enabledMimesTxt.toString());
//
// prop.put("info", 18);
// return prop;
}


Expand Down
18 changes: 13 additions & 5 deletions htroot/Settings_Parser.inc
Expand Up @@ -5,33 +5,41 @@ For a detailed description of the various MIME-types take a look at <a href="htt
<p>
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td class="small" >Activate</td>
#{parserMode}#
<td class="small" >#[name]#</td>
#{/parserMode}#
<td class="small" >Mime-Type</td>
<td class="small" >Parser&nbsp;Usage</td>
<td class="small" ></td>
</tr>
#{parser}#
<tr class="TableCellDark">
<td colspan="2">#[name]# V#[version]#</td>
<td colspan="#[colspan]#"><nobr>#[name]# V#[version]#<nobr></td>
<td>#[usage]#</td>
<td>&nbsp;</td>
</tr>
#{mime}#
<tr class="TableCellLight">
<td class="small" align="center"><input type="checkbox" name="#[mimetype]#" align="top" #(status)#::checked#(/status)#></td>
#{parserMode}#
<td class="small" align="center"><input type="checkbox" name="#[optionName]#" align="top" #(status)#::checked#(/status)#></td>
#{/parserMode}#
<td class="small">#[mimetype]#</td>
<td class="small">&nbsp;</td>
<td class="small" width="100%"></td>
</tr>
#{/mime}#
#{/parser}#
<!--
<tr class="TableCellDark">
<td class="small" align="center"><input type="checkbox" name="allParserEnabled" align="top" #(allParserEnabled)#::checked#(/allParserEnabled)#>
#{parserMode}#
<td class="small" align="center"><input type="checkbox" name="#[name]#.allParserEnabled" align="top" #(allParserEnabled)#::checked#(/allParserEnabled)#>
#{/parserMode}#
<td colspan="2" class="small" >Enable all parsers</td>
<td class="small">&nbsp;</td>
</tr>
-->
<tr class="TableCellDark">
<td colspan="4" class="small" ><input type="submit" name="parserSettings" value="submit">&nbsp;Changes take effect immediately</td>
<td colspan="#[parser.colspan]#" class="small" ><input type="submit" name="parserSettings" value="submit">&nbsp;Changes take effect immediately</td>
</tr>
</table>
</fieldset>
Expand Down

0 comments on commit 44fa94a

Please sign in to comment.