
Commit

- specified exceptions thrown by ResourceInfoFactory and plasmaHTCache.loadResourceInfo()

- caught a possible NPE in CacheAdmin_p and added more error cases
- sped up deletion of entries in the local crawl queue by crawl profile (it has often been noted that this deletion is slow)
- added a bit of javadoc

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3868 6c8d7289-2bf4-0310-a012-ef5d649a1542
karlchenofhell committed Jun 11, 2007
1 parent dfd5e82 commit 22ee85c
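The plasmaHTCache and ResourceInfoFactory hunks are not among the diffs shown on this page, but the new catch blocks in CacheAdmin_p.java below imply that loadResourceInfo() now declares its failure modes explicitly instead of a broad "throws Exception". A sketch of the presumed contract (a hypothetical interface, not the actual YaCy source; java.net.URL stands in for YaCy's own URL type):

import java.io.IOException;
import java.net.URL;

import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.UnsupportedProtocolException;

// Presumed shape of the narrowed method after this commit (sketch only):
interface CacheManagerContract {
    // IOException:                  cache entry missing or unreadable
    // UnsupportedProtocolException: no cache support for the URL's protocol
    // IllegalAccessException:       a SecurityManager blocked reflective class loading
    IResourceInfo loadResourceInfo(URL url)
            throws IOException, UnsupportedProtocolException, IllegalAccessException;
}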
Showing 8 changed files with 211 additions and 123 deletions.
htroot/CacheAdmin_p.html (7 changes: 6 additions & 1 deletion)
@@ -96,7 +96,12 @@ <h2>Local Cache</h2>
<span style="display: block;">#[line]#</span>#{/lines}#
</span>
::<span class="error">- This file is not cached -</span>
::<img src="CacheResource_p.html?path=#[src]#" alt="Cached image from #[src]#" />#(/type)#
::<img src="CacheResource_p.html?path=#[src]#" alt="Cached image from #[src]#" />
::<span class="error">- The protocol #[protoc]# is not supported by YaCy</span>
::<span class="error">
- IllegalAccessException - Security Manager is blocking dynamic class loading
but should not be active. Please report this incident!
</span>#(/type)#
<!-- TO-DO: CSS/XHTMLize end -->
</div>::
<div class="CacheAdminTree">
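For readers unfamiliar with YaCy's template engine: the "::"-separated blocks above are alternatives selected by the numeric value bound to the enclosing #(type)# switch, so the two new error blocks correspond to the ProtocolError and SecurityError constants added in CacheAdmin_p.java below. A fragment-level sketch of the selection, reusing the servlet's own constants:

// info_type picks the "::"-separated template alternative by index:
// 0 = HtmlFile, 1 = NotCached, 2 = Image, 3 = ProtocolError, 4 = SecurityError
prop.put("info_type", SecurityError); // renders the new IllegalAccessException block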
htroot/CacheAdmin_p.java (120 changes: 66 additions & 54 deletions)
@@ -52,6 +52,7 @@

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.Map;
@@ -66,6 +67,7 @@
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.UnsupportedProtocolException;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -80,6 +82,8 @@ public class CacheAdmin_p {
private static final int HtmlFile = 0;
private static final int NotCached = 1;
private static final int Image = 2;
private static final int ProtocolError = 3;
private static final int SecurityError = 4;

public static final class Filter implements FilenameFilter {
private static final String EXCLUDE_NAME = plasmaHTCache.DB_NAME;
@@ -105,7 +109,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env)
pathString = "/";
file = new File(switchboard.htCachePath, pathString);
}
} catch (Exception e) {
} catch (IOException e) {
pathString = "/";
file = new File(switchboard.htCachePath, pathString);
}
@@ -129,62 +133,69 @@ public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env)
info.ensureCapacity(10000);
try {
final IResourceInfo resInfo = switchboard.cacheManager.loadResourceInfo(url);
formatHeader(prop, resInfo.getMap());

final String ff = file.toString();
final int dotpos = ff.lastIndexOf('.');
final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : "";
if (ext.equals("gif") || ext.equals("jpg") ||
ext.equals("png") || ext.equals("jpeg")) {
prop.put("info_type", Image);
prop.put("info_type_src", pathString);
if (resInfo == null) {
prop.put("info_type", NotCached);
} else {
prop.put("info_type", HtmlFile);
// fill the htmlFilterContentScraper object with the contents of the cached file
// to retrieve all needed information
final htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
//final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
Writer writer = new htmlFilterWriter(null,null,scraper,null,false);
String sourceCharset = resInfo.getCharacterEncoding();
if (sourceCharset == null) sourceCharset = "UTF-8";
String mimeType = resInfo.getMimeType();
serverFileUtils.copy(file, sourceCharset, writer);
writer.close();

final plasmaParserDocument document = switchboard.parser.transformScraper(url, mimeType, sourceCharset, scraper);

prop.put("info_type_title", scraper.getTitle());

int i;
String[] t = document.getSectionTitles();
prop.put("info_type_headlines", t.length);
for (i = 0; i < t.length; i++)
prop.put("info_type_headlines_" + i + "_headline",
t[i].replaceAll("\n", "").trim());

formatAnchor(prop, document.getHyperlinks(), "links");
formatImageAnchor(prop, document.getImages());
formatAnchor(prop, document.getAudiolinks(), "audio");
formatAnchor(prop, document.getVideolinks(), "video");
formatAnchor(prop, document.getApplinks(), "apps");
formatAnchor(prop, document.getEmaillinks(), "email");
formatHeader(prop, resInfo.getMap());

prop.put("info_type_text",
de.anomic.data.htmlTools.replaceXMLEntities(new String(scraper.getText())));

i = 0;
final Iterator sentences = document.getSentences(false);
if (sentences != null)
while (sentences.hasNext()) {
prop.put("info_type_lines_" + i + "_line",
new String((StringBuffer) sentences.next()).replaceAll("\n", "").trim());
i++;
}
prop.put("info_type_lines", i);
if (document != null) document.close();
final String ff = file.toString();
final int dotpos = ff.lastIndexOf('.');
final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : "";
if (ext.equals("gif") || ext.equals("jpg") ||
ext.equals("png") || ext.equals("jpeg")) {
prop.put("info_type", Image);
prop.put("info_type_src", pathString);
} else {
prop.put("info_type", HtmlFile);
// fill the htmlFilterContentScraper object with the contents of the cached file
// to retrieve all needed information
final htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
//final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
Writer writer = new htmlFilterWriter(null,null,scraper,null,false);
String sourceCharset = resInfo.getCharacterEncoding();
if (sourceCharset == null) sourceCharset = "UTF-8";
String mimeType = resInfo.getMimeType();
serverFileUtils.copy(file, sourceCharset, writer);
writer.close();

final plasmaParserDocument document = switchboard.parser.transformScraper(url, mimeType, sourceCharset, scraper);

prop.put("info_type_title", scraper.getTitle());

int i;
String[] t = document.getSectionTitles();
prop.put("info_type_headlines", t.length);
for (i = 0; i < t.length; i++)
prop.put("info_type_headlines_" + i + "_headline",
t[i].replaceAll("\n", "").trim());

formatAnchor(prop, document.getHyperlinks(), "links");
formatImageAnchor(prop, document.getImages());
formatAnchor(prop, document.getAudiolinks(), "audio");
formatAnchor(prop, document.getVideolinks(), "video");
formatAnchor(prop, document.getApplinks(), "apps");
formatAnchor(prop, document.getEmaillinks(), "email");

prop.put("info_type_text", new String(scraper.getText()));

i = 0;
final Iterator sentences = document.getSentences(false);
if (sentences != null)
while (sentences.hasNext()) {
prop.put("info_type_lines_" + i + "_line",
new String((StringBuffer) sentences.next()).replaceAll("\n", "").trim());
i++;
}
prop.put("info_type_lines", i);
if (document != null) document.close();
}
}
} catch (Exception e) {
} catch (IOException e) {
prop.put("info_type", NotCached);
} catch (UnsupportedProtocolException e) {
prop.put("info_type", ProtocolError);
} catch (IllegalAccessException e) {
prop.put("info_type", SecurityError);
}
} else {
prop.put("info", TypeDIR);
@@ -234,11 +245,12 @@ public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env)
}
}
}

prop.put("cachesize", Long.toString(switchboard.cacheManager.curCacheSize/1024));
prop.put("cachemax", Long.toString(switchboard.cacheManager.maxCacheSize/1024));
prop.put("path", path.toString());
prop.put("info_info", info.toString());

/* prop.put("info_tree", tree.toString()); */
// return rewrite properties
return prop;
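The new SecurityError case surfaces an IllegalAccessException, which suggests ResourceInfoFactory instantiates IResourceInfo implementations reflectively. A minimal sketch of that pattern, assuming the factory resolves the implementation class by protocol name (the class-name scheme and the exception's String constructor are assumptions, not the actual factory code):

import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.UnsupportedProtocolException;

// Hypothetical factory sketch, not the actual ResourceInfoFactory source:
public final class ResourceInfoFactorySketch {
    public IResourceInfo getResourceInfo(final String protocol)
            throws UnsupportedProtocolException, IllegalAccessException {
        try {
            // e.g. "http" -> de.anomic.plasma.cache.http.ResourceInfo (illustrative name)
            final Class implClass = Class.forName(
                    "de.anomic.plasma.cache." + protocol + ".ResourceInfo");
            // newInstance() throws IllegalAccessException when a SecurityManager
            // denies access -- the condition the new template message reports
            return (IResourceInfo) implClass.newInstance();
        } catch (ClassNotFoundException e) {
            throw new UnsupportedProtocolException(protocol);
        } catch (InstantiationException e) {
            throw new UnsupportedProtocolException(protocol);
        }
    }
}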
htroot/IndexCreateWWWLocalQueue_p.html (12 changes: 6 additions & 6 deletions)
@@ -21,12 +21,12 @@ <h2>Index Creation: WWW Local Crawl Queue</h2>
Delete Entries:
<input type="text" name="pattern" value=".*" size="20" maxlength="200" />
<select name="option" size="1">
<option value="Initiator">Initiator</option>
<option value="Profile">Profile</option>
<option value="Depth">Depth</option>
<option value="ModifiedDate">Modified Date</option>
<option value="AnchorName">Anchor Name</option>
<option value="URL" selected="selected">URL</option>
<option value="5">Initiator</option>
<option value="3">Profile</option>
<option value="4">Depth</option>
<option value="6">Modified Date</option>
<option value="2">Anchor Name</option>
<option value="1" selected="selected">URL</option>
</select>
<input type="submit" name="deleteEntries" value="Delete" /><em>This may take a quite long time.</em>
</fieldset>
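The option values above change from symbolic names to numeric codes matching the constants introduced in IndexCreateWWWLocalQueue_p.java below (URL=1, ANCHOR=2, PROFILE=3, DEPTH=4, INITIATOR=5, MODIFIED=6), so the servlet can read the selection with a single getInt() call. In sketch form, reusing the servlet's own names:

// INVALID (0) doubles as the fallback for a missing or malformed value
final int option = post.getInt("option", INVALID);
switch (option) {
    case PROFILE:
        // fast path: delete matching crawl profiles directly
        break;
    case URL: case ANCHOR: case DEPTH: case INITIATOR: case MODIFIED:
        // slow path: scan the queue and match each entry's field
        break;
    default:
        // INVALID: ignore the request
        break;
}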
htroot/IndexCreateWWWLocalQueue_p.java (88 changes: 50 additions & 38 deletions)
@@ -70,6 +70,14 @@ private static String daydate(Date date) {
return dayFormatter.format(date);
}

private static final int INVALID = 0;
private static final int URL = 1;
private static final int ANCHOR = 2;
private static final int PROFILE = 3;
private static final int DEPTH = 4;
private static final int INITIATOR = 5;
private static final int MODIFIED = 6;

public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
// return variable that accumulates replacements
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
@@ -87,55 +95,59 @@ public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env)
int c = 0;

String pattern = post.get("pattern", ".*").trim();
String option = post.get("option", ".*").trim();
final int option = post.getInt("option", INVALID);
if (pattern.equals(".*")) {
c = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
switchboard.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
try { switchboard.cleanProfiles(); } catch (InterruptedException e) {/* ignore this */}
} else{
} else if (option > INVALID) {
Pattern compiledPattern = null;
try {
// compiling the regular expression
compiledPattern = Pattern.compile(pattern);

// iterating through the list of URLs
Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (iter.hasNext()) {
if ((entry = (plasmaCrawlEntry) iter.next()) == null) continue;
String value = null;
String nextHash = entry.urlhash();
if ((option.equals("URL")&&(entry.url() != null))) {
value = entry.url().toString();
} else if ((option.equals("AnchorName"))) {
value = entry.name();
} else if ((option.equals("Profile"))) {
String profileHandle = entry.profileHandle();
if (profileHandle == null) {
value = "unknown";
} else {
plasmaCrawlProfile.entry profile = switchboard.profiles.getEntry(profileHandle);
if (profile == null) {
value = "unknown";
} else {
value = profile.name();
}
}
} else if ((option.equals("Depth"))) {
value = Integer.toString(entry.depth());
} else if ((option.equals("Initiator"))) {
value = (entry.initiator()==null)?"proxy":htmlTools.replaceHTML(entry.initiator());
} else if ((option.equals("ModifiedDate"))) {
value = daydate(entry.loaddate());
if (option == PROFILE) {
// search and delete the crawl profile (_much_ faster, independent of queue size)
// XXX: what to do about the annoying LOST PROFILE messages in the log?
Iterator it = switchboard.profiles.profiles(true);
plasmaCrawlProfile.entry entry;
while (it.hasNext()) {
entry = (plasmaCrawlProfile.entry)it.next();
final String name = entry.name();
if (name.equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) ||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) ||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA))
continue;
if (compiledPattern.matcher(name).find())
switchboard.profiles.removeEntry(entry.handle());
}

if (value != null) {
Matcher matcher = compiledPattern.matcher(value);
if (matcher.find()) {
switchboard.noticeURL.remove(nextHash);
}
} else {
// iterating through the list of URLs
Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (iter.hasNext()) {
if ((entry = (plasmaCrawlEntry) iter.next()) == null) continue;
String value = null;

switch (option) {
case URL: value = (entry.url() == null) ? null : entry.url().toString(); break;
case ANCHOR: value = entry.name(); break;
case DEPTH: value = Integer.toString(entry.depth()); break;
case INITIATOR:
value = (entry.initiator() == null) ? "proxy" : htmlTools.replaceHTML(entry.initiator());
break;
case MODIFIED: value = daydate(entry.loaddate()); break;
default: value = null;
}

if (value != null) {
Matcher matcher = compiledPattern.matcher(value);
if (matcher.find()) {
switchboard.noticeURL.remove(entry.urlhash());
}
}
}

}
} catch (PatternSyntaxException e) {
e.printStackTrace();
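The speedup noted in the commit message comes from the PROFILE branch above: instead of walking every queued URL and resolving its profile per entry, the servlet now walks the much smaller profile table and removes matching profiles directly, so the cost is O(number of profiles) rather than O(queue size). A self-contained toy model of the new path (standard collections only, not YaCy code):

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;

public final class ProfileDeletionSketch {
    public static void main(final String[] args) {
        final Map profiles = new HashMap(); // profile handle -> profile name
        profiles.put("h1", "myCrawl");
        profiles.put("h2", "otherCrawl");

        final Pattern pattern = Pattern.compile("my.*");
        final Iterator it = profiles.entrySet().iterator();
        while (it.hasNext()) {
            final Map.Entry entry = (Map.Entry) it.next();
            // removing the profile is enough; queue entries whose profile is
            // gone surface later as the "LOST PROFILE" log messages that the
            // XXX comment above mentions
            if (pattern.matcher((String) entry.getValue()).find()) it.remove();
        }
        System.out.println(profiles); // prints {h2=otherCrawl}
    }
}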
