
Commit

- specified exceptions thrown by ResourceInfoFactory and plasmaHTCache.loadResourceInfo()

- caught a possible NPE in CacheAdmin_p and added more error cases
- sped up deletion of entries in the local crawl queue by crawl profile (it has often been noted that this deletion is slow)
- added a bit of javadoc

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3868 6c8d7289-2bf4-0310-a012-ef5d649a1542
karlchenofhell committed Jun 11, 2007
1 parent dfd5e82 commit 22ee85c
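The plasmaHTCache and ResourceInfoFactory hunks are not among the diffs shown on this page, but the new catch blocks in CacheAdmin_p.java below imply that loadResourceInfo() now declares its failure modes explicitly instead of a broad "throws Exception". A sketch of the presumed contract (a hypothetical interface, not the actual YaCy source; java.net.URL stands in for YaCy's own URL type):

import java.io.IOException;
import java.net.URL;

import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.UnsupportedProtocolException;

// Presumed shape of the narrowed method after this commit (sketch only):
interface CacheManagerContract {
    // IOException:                  cache entry missing or unreadable
    // UnsupportedProtocolException: no cache support for the URL's protocol
    // IllegalAccessException:       a SecurityManager blocked reflective class loading
    IResourceInfo loadResourceInfo(URL url)
            throws IOException, UnsupportedProtocolException, IllegalAccessException;
}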
Showing 8 changed files with 211 additions and 123 deletions.
htroot/CacheAdmin_p.html (7 changes: 6 additions & 1 deletion)
@@ -96,7 +96,12 @@ <h2>Local Cache</h2>
<span style="display: block;">#[line]#</span>#{/lines}#
</span>
::<span class="error">- This file is not cached -</span>
::<img src="CacheResource_p.html?path=#[src]#" alt="Cached image from #[src]#" />#(/type)#
::<img src="CacheResource_p.html?path=#[src]#" alt="Cached image from #[src]#" />
::<span class="error">- The protocol #[protoc]# is not supported by YaCy</span>
::<span class="error">
- IllegalAccessException - Security Manager is blocking dynamic class loading
but should not be active. Please report this incident!
</span>#(/type)#
<!-- TO-DO: CSS/XHTMLize end -->
</div>::
<div class="CacheAdminTree">
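For readers unfamiliar with YaCy's template engine: the "::"-separated blocks above are alternatives selected by the numeric value bound to the enclosing #(type)# switch, so the two new error blocks correspond to the ProtocolError and SecurityError constants added in CacheAdmin_p.java below. A fragment-level sketch of the selection, reusing the servlet's own constants:

// info_type picks the "::"-separated template alternative by index:
// 0 = HtmlFile, 1 = NotCached, 2 = Image, 3 = ProtocolError, 4 = SecurityError
prop.put("info_type", SecurityError); // renders the new IllegalAccessException block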
htroot/CacheAdmin_p.java (120 changes: 66 additions & 54 deletions)
@@ -52,6 +52,7 @@

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.Map;
@@ -66,6 +67,7 @@
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.UnsupportedProtocolException;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -80,6 +82,8 @@ public class CacheAdmin_p {
private static final int HtmlFile = 0;
private static final int NotCached = 1;
private static final int Image = 2;
private static final int ProtocolError = 3;
private static final int SecurityError = 4;

public static final class Filter implements FilenameFilter {
private static final String EXCLUDE_NAME = plasmaHTCache.DB_NAME;
@@ -105,7 +109,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env)
pathString = "/";
file = new File(switchboard.htCachePath, pathString);
}
} catch (Exception e) {
} catch (IOException e) {
pathString = "/";
file = new File(switchboard.htCachePath, pathString);
}
@@ -129,62 +133,69 @@ public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env)
info.ensureCapacity(10000);
try {
final IResourceInfo resInfo = switchboard.cacheManager.loadResourceInfo(url);
formatHeader(prop, resInfo.getMap());

final String ff = file.toString();
final int dotpos = ff.lastIndexOf('.');
final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : "";
if (ext.equals("gif") || ext.equals("jpg") ||
ext.equals("png") || ext.equals("jpeg")) {
prop.put("info_type", Image);
prop.put("info_type_src", pathString);
if (resInfo == null) {
prop.put("info_type", NotCached);
} else {
prop.put("info_type", HtmlFile);
// fill the htmlFilterContentScraper object with the contents of the cached file
// to retrieve all needed information
final htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
//final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
Writer writer = new htmlFilterWriter(null,null,scraper,null,false);
String sourceCharset = resInfo.getCharacterEncoding();
if (sourceCharset == null) sourceCharset = "UTF-8";
String mimeType = resInfo.getMimeType();
serverFileUtils.copy(file, sourceCharset, writer);
writer.close();

final plasmaParserDocument document = switchboard.parser.transformScraper(url, mimeType, sourceCharset, scraper);

prop.put("info_type_title", scraper.getTitle());

int i;
String[] t = document.getSectionTitles();
prop.put("info_type_headlines", t.length);
for (i = 0; i < t.length; i++)
prop.put("info_type_headlines_" + i + "_headline",
t[i].replaceAll("\n", "").trim());

formatAnchor(prop, document.getHyperlinks(), "links");
formatImageAnchor(prop, document.getImages());
formatAnchor(prop, document.getAudiolinks(), "audio");
formatAnchor(prop, document.getVideolinks(), "video");
formatAnchor(prop, document.getApplinks(), "apps");
formatAnchor(prop, document.getEmaillinks(), "email");
formatHeader(prop, resInfo.getMap());

prop.put("info_type_text",
de.anomic.data.htmlTools.replaceXMLEntities(new String(scraper.getText())));

i = 0;
final Iterator sentences = document.getSentences(false);
if (sentences != null)
while (sentences.hasNext()) {
prop.put("info_type_lines_" + i + "_line",
new String((StringBuffer) sentences.next()).replaceAll("\n", "").trim());
i++;
}
prop.put("info_type_lines", i);
if (document != null) document.close();
final String ff = file.toString();
final int dotpos = ff.lastIndexOf('.');
final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : "";
if (ext.equals("gif") || ext.equals("jpg") ||
ext.equals("png") || ext.equals("jpeg")) {
prop.put("info_type", Image);
prop.put("info_type_src", pathString);
} else {
prop.put("info_type", HtmlFile);
// fill the htmlFilterContentScraper object with the contents of the cached file
// to retrieve all needed information
final htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
//final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
Writer writer = new htmlFilterWriter(null,null,scraper,null,false);
String sourceCharset = resInfo.getCharacterEncoding();
if (sourceCharset == null) sourceCharset = "UTF-8";
String mimeType = resInfo.getMimeType();
serverFileUtils.copy(file, sourceCharset, writer);
writer.close();

final plasmaParserDocument document = switchboard.parser.transformScraper(url, mimeType, sourceCharset, scraper);

prop.put("info_type_title", scraper.getTitle());

int i;
String[] t = document.getSectionTitles();
prop.put("info_type_headlines", t.length);
for (i = 0; i < t.length; i++)
prop.put("info_type_headlines_" + i + "_headline",
t[i].replaceAll("\n", "").trim());

formatAnchor(prop, document.getHyperlinks(), "links");
formatImageAnchor(prop, document.getImages());
formatAnchor(prop, document.getAudiolinks(), "audio");
formatAnchor(prop, document.getVideolinks(), "video");
formatAnchor(prop, document.getApplinks(), "apps");
formatAnchor(prop, document.getEmaillinks(), "email");

prop.put("info_type_text", new String(scraper.getText()));

i = 0;
final Iterator sentences = document.getSentences(false);
if (sentences != null)
while (sentences.hasNext()) {
prop.put("info_type_lines_" + i + "_line",
new String((StringBuffer) sentences.next()).replaceAll("\n", "").trim());
i++;
}
prop.put("info_type_lines", i);
if (document != null) document.close();
}
}
} catch (Exception e) {
} catch (IOException e) {
prop.put("info_type", NotCached);
} catch (UnsupportedProtocolException e) {
prop.put("info_type", ProtocolError);
} catch (IllegalAccessException e) {
prop.put("info_type", SecurityError);
}
} else {
prop.put("info", TypeDIR);
@@ -234,11 +245,12 @@ public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env)
}
}
}

prop.put("cachesize", Long.toString(switchboard.cacheManager.curCacheSize/1024));
prop.put("cachemax", Long.toString(switchboard.cacheManager.maxCacheSize/1024));
prop.put("path", path.toString());
prop.put("info_info", info.toString());

/* prop.put("info_tree", tree.toString()); */
// return rewrite properties
return prop;
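The new SecurityError case surfaces an IllegalAccessException, which suggests ResourceInfoFactory instantiates IResourceInfo implementations reflectively. A minimal sketch of that pattern, assuming the factory resolves the implementation class by protocol name (the class-name scheme and the exception's String constructor are assumptions, not the actual factory code):

import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.UnsupportedProtocolException;

// Hypothetical factory sketch, not the actual ResourceInfoFactory source:
public final class ResourceInfoFactorySketch {
    public IResourceInfo getResourceInfo(final String protocol)
            throws UnsupportedProtocolException, IllegalAccessException {
        try {
            // e.g. "http" -> de.anomic.plasma.cache.http.ResourceInfo (illustrative name)
            final Class implClass = Class.forName(
                    "de.anomic.plasma.cache." + protocol + ".ResourceInfo");
            // newInstance() throws IllegalAccessException when a SecurityManager
            // denies access -- the condition the new template message reports
            return (IResourceInfo) implClass.newInstance();
        } catch (ClassNotFoundException e) {
            throw new UnsupportedProtocolException(protocol);
        } catch (InstantiationException e) {
            throw new UnsupportedProtocolException(protocol);
        }
    }
}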
htroot/IndexCreateWWWLocalQueue_p.html (12 changes: 6 additions & 6 deletions)
@@ -21,12 +21,12 @@ <h2>Index Creation: WWW Local Crawl Queue</h2>
Delete Entries:
<input type="text" name="pattern" value=".*" size="20" maxlength="200" />
<select name="option" size="1">
<option value="Initiator">Initiator</option>
<option value="Profile">Profile</option>
<option value="Depth">Depth</option>
<option value="ModifiedDate">Modified Date</option>
<option value="AnchorName">Anchor Name</option>
<option value="URL" selected="selected">URL</option>
<option value="5">Initiator</option>
<option value="3">Profile</option>
<option value="4">Depth</option>
<option value="6">Modified Date</option>
<option value="2">Anchor Name</option>
<option value="1" selected="selected">URL</option>
</select>
<input type="submit" name="deleteEntries" value="Delete" /><em>This may take a quite long time.</em>
</fieldset>
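The option values above change from symbolic names to numeric codes matching the constants introduced in IndexCreateWWWLocalQueue_p.java below (URL=1, ANCHOR=2, PROFILE=3, DEPTH=4, INITIATOR=5, MODIFIED=6), so the servlet can read the selection with a single getInt() call. In sketch form, reusing the servlet's own names:

// INVALID (0) doubles as the fallback for a missing or malformed value
final int option = post.getInt("option", INVALID);
switch (option) {
    case PROFILE:
        // fast path: delete matching crawl profiles directly
        break;
    case URL: case ANCHOR: case DEPTH: case INITIATOR: case MODIFIED:
        // slow path: scan the queue and match each entry's field
        break;
    default:
        // INVALID: ignore the request
        break;
}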
htroot/IndexCreateWWWLocalQueue_p.java (88 changes: 50 additions & 38 deletions)
@@ -70,6 +70,14 @@ private static String daydate(Date date) {
return dayFormatter.format(date);
}

private static final int INVALID = 0;
private static final int URL = 1;
private static final int ANCHOR = 2;
private static final int PROFILE = 3;
private static final int DEPTH = 4;
private static final int INITIATOR = 5;
private static final int MODIFIED = 6;

public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
// return variable that accumulates replacements
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
@@ -87,55 +95,59 @@ public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env)
int c = 0;

String pattern = post.get("pattern", ".*").trim();
String option = post.get("option", ".*").trim();
final int option = post.getInt("option", INVALID);
if (pattern.equals(".*")) {
c = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
switchboard.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
try { switchboard.cleanProfiles(); } catch (InterruptedException e) {/* ignore this */}
} else{
} else if (option > INVALID) {
Pattern compiledPattern = null;
try {
// compiling the regular expression
compiledPattern = Pattern.compile(pattern);

// iterating through the list of URLs
Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (iter.hasNext()) {
if ((entry = (plasmaCrawlEntry) iter.next()) == null) continue;
String value = null;
String nextHash = entry.urlhash();
if ((option.equals("URL")&&(entry.url() != null))) {
value = entry.url().toString();
} else if ((option.equals("AnchorName"))) {
value = entry.name();
} else if ((option.equals("Profile"))) {
String profileHandle = entry.profileHandle();
if (profileHandle == null) {
value = "unknown";
} else {
plasmaCrawlProfile.entry profile = switchboard.profiles.getEntry(profileHandle);
if (profile == null) {
value = "unknown";
} else {
value = profile.name();
}
}
} else if ((option.equals("Depth"))) {
value = Integer.toString(entry.depth());
} else if ((option.equals("Initiator"))) {
value = (entry.initiator()==null)?"proxy":htmlTools.replaceHTML(entry.initiator());
} else if ((option.equals("ModifiedDate"))) {
value = daydate(entry.loaddate());
if (option == PROFILE) {
// search and delete the crawl profile (_much_ faster, independent of queue size)
// XXX: what to do about the annoying LOST PROFILE messages in the log?
Iterator it = switchboard.profiles.profiles(true);
plasmaCrawlProfile.entry entry;
while (it.hasNext()) {
entry = (plasmaCrawlProfile.entry)it.next();
final String name = entry.name();
if (name.equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) ||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) ||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA))
continue;
if (compiledPattern.matcher(name).find())
switchboard.profiles.removeEntry(entry.handle());
}

if (value != null) {
Matcher matcher = compiledPattern.matcher(value);
if (matcher.find()) {
switchboard.noticeURL.remove(nextHash);
}
} else {
// iterating through the list of URLs
Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (iter.hasNext()) {
if ((entry = (plasmaCrawlEntry) iter.next()) == null) continue;
String value = null;

switch (option) {
case URL: value = (entry.url() == null) ? null : entry.url().toString(); break;
case ANCHOR: value = entry.name(); break;
case DEPTH: value = Integer.toString(entry.depth()); break;
case INITIATOR:
value = (entry.initiator() == null) ? "proxy" : htmlTools.replaceHTML(entry.initiator());
break;
case MODIFIED: value = daydate(entry.loaddate()); break;
default: value = null;
}

if (value != null) {
Matcher matcher = compiledPattern.matcher(value);
if (matcher.find()) {
switchboard.noticeURL.remove(entry.urlhash());
}
}
}

}
} catch (PatternSyntaxException e) {
e.printStackTrace();
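The speedup noted in the commit message comes from the PROFILE branch above: instead of walking every queued URL and resolving its profile per entry, the servlet now walks the much smaller profile table and removes matching profiles directly, so the cost is O(number of profiles) rather than O(queue size). A self-contained toy model of the new path (standard collections only, not YaCy code):

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;

public final class ProfileDeletionSketch {
    public static void main(final String[] args) {
        final Map profiles = new HashMap(); // profile handle -> profile name
        profiles.put("h1", "myCrawl");
        profiles.put("h2", "otherCrawl");

        final Pattern pattern = Pattern.compile("my.*");
        final Iterator it = profiles.entrySet().iterator();
        while (it.hasNext()) {
            final Map.Entry entry = (Map.Entry) it.next();
            // removing the profile is enough; queue entries whose profile is
            // gone surface later as the "LOST PROFILE" log messages that the
            // XXX comment above mentions
            if (pattern.matcher((String) entry.getValue()).find()) it.remove();
        }
        System.out.println(profiles); // prints {h2=otherCrawl}
    }
}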
