Skip to content

Commit

Permalink
fixed possible memory leak in htmlScraper: be aware that now links can get lost; further work necessary
Browse files Browse the repository at this point in the history

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@288 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Jun 16, 2005
1 parent 3874785 commit a25b5b4
Show file tree
Hide file tree
Showing 18 changed files with 158 additions and 75 deletions.
26 changes: 25 additions & 1 deletion build.xml
Expand Up @@ -360,6 +360,29 @@

<!-- run YaCy (needed for NetBeans4) -->
<target name="run" description="run YaCy">
<!-- debug options:
java -Xrunhprof:help
Hprof usage: -Xrunhprof[:help]|[:<option>=<value>, ...]
Option Name and Value Description Default
_____________________ ___________ _______
heap=dump|sites|all heap profiling all
cpu=samples|times|old CPU usage off
monitor=y|n monitor contention n
format=a|b ascii or binary output a
file=<file> write data to file java.hprof(.txt for ascii)
net=<host>:<port> send data over a socket write to file
depth=<size> stack trace depth 4
cutoff=<value> output cutoff point 0.0001
lineno=y|n line number in traces? y
thread=y|n thread in traces? n
doe=y|n dump on exit? y
gc_okay=y|n GC okay during sampling y
Example: java -Xrunhprof:cpu=samples,file=log.txt,depth=3 FooClass
Note: format=b cannot be used with cpu=old|times
-->
<java classname="yacy" fork="yes">
<classpath>
<pathelement location="${build}"/>
Expand All @@ -369,8 +392,9 @@
<pathelement location="${libx}" />
<fileset dir="${libx}" includes="**/*.jar" />
</classpath>
<!--<arg value="-Xrunhprof"/>-->
<arg line="-start"/>
<!-- <arg line="-migratewords"/> -->
<!-- <arg line="-migratewords"/>-->
<!-- <arg line="-start ${user.dir}"/>-->
</java>
</target>
Expand Down
6 changes: 3 additions & 3 deletions htroot/IndexCreate_p.java
Expand Up @@ -104,11 +104,11 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
boolean crawlOrder = ((String) post.get("crawlOrder", "")).equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
boolean xsstopw = ((String) post.get("xsstopw", "")).equals("on");
env.setConfig("xsstopw", (crawlOrder) ? "true" : "false");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
boolean xdstopw = ((String) post.get("xdstopw", "")).equals("on");
env.setConfig("xdstopw", (crawlOrder) ? "true" : "false");
env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
boolean xpstopw = ((String) post.get("xpstopw", "")).equals("on");
env.setConfig("xpstopw", (crawlOrder) ? "true" : "false");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");

String crawlingStart = (String) post.get("crawlingURL");
if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;
Expand Down
14 changes: 12 additions & 2 deletions source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
Expand Up @@ -46,11 +46,11 @@ public htmlFilterAbstractScraper(HashSet tags0, HashSet tags1) {
}

// Checks whether the given tag name is a known "tag0" (singleton) tag.
// NOTE(review): this hunk shows both the pre- and post-commit line; the diff
// markers were lost in scraping, so the two return statements below are the
// removed and added versions of the same line, not sequential code.
public boolean isTag0(String tag) {
// [removed in this commit] would throw NullPointerException once close() has set tags0 to null
return tags0.contains(tag);
// [added in this commit] null-guard: close() nulls tags0 to free memory, so a late call must not crash
return (tags0 != null) && (tags0.contains(tag));
}

// Checks whether the given tag name is a known "tag1" (paired/body-containing) tag.
public boolean isTag1(String tag) {
// [removed in this commit] same NullPointerException hazard for tags1 after close()
return tags1.contains(tag);
// [added in this commit] null-guard mirroring isTag0
return (tags1 != null) && (tags1.contains(tag));
}

//the 'missing' method that shall be implemented:
Expand Down Expand Up @@ -405,4 +405,14 @@ public static serverByteBuffer stripAll(serverByteBuffer bb) {
return convertUmlaute(transscriptAll(stripAllTags(bb)));
}

// Releases the tag-name sets so the scraper's memory can be reclaimed promptly;
// part of this commit's memory-leak fix. After close(), isTag0/isTag1 rely on
// their null-guards (added in this same commit) to stay safe.
public void close() {
// free resources
tags0 = null;
tags1 = null;
}

// Finalizer as a safety net so resources are freed even if close() was never
// called. NOTE(review): this override is public and does not call
// super.finalize(); finalizers are also unreliable/discouraged in Java —
// acceptable for 2005-era code, but worth revisiting.
public void finalize() {
close();
}

}
10 changes: 10 additions & 0 deletions source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java
Expand Up @@ -78,4 +78,14 @@ public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byt
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
}

// Releases the transformer's tag-name sets; mirrors the close() added to
// htmlFilterAbstractScraper in this same commit.
public void close() {
// free resources
tags0 = null;
tags1 = null;
}

// Finalizer safety net in case close() is never called explicitly.
// NOTE(review): does not call super.finalize(); finalizers are discouraged.
public void finalize() {
close();
}

}
17 changes: 13 additions & 4 deletions source/de/anomic/htmlFilter/htmlFilterContentScraper.java
Expand Up @@ -122,10 +122,12 @@ public void scrapeTag0(String tagname, Properties tagopts) {

// Scrapes a paired tag: collects anchors ("a"), the headline ("h1") and the
// page title ("title"). This hunk is the heart of the commit's memory-leak
// fix: unbounded tag text is no longer stored. NOTE(review): the scrape lost
// the diff markers, so the removed (old) and added (new) lines appear
// back-to-back below.
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
//System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
// [removed in this commit] stored every anchor regardless of text size — the suspected leak
if (tagname.equals("a")) anchors.put(absolutePath(tagopts.getProperty("href", "")),
new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString());
// [removed] unbounded headline/title capture
if (tagname.equals("h1")) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if (tagname.equals("title")) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
// [added] anchors are only kept when the tag text is under 2048 bytes — this
// is why the commit message warns that "links can get lost; further work necessary"
if ((tagname.equals("a")) && (text.length < 2048)) {
byte[] a = super.stripAll(new serverByteBuffer(text)).getBytes();
anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(a).trim().toString());
}
// [added] headline and title are likewise capped at 512 bytes of tag text
if ((tagname.equals("h1")) && (text.length < 512)) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if ((tagname.equals("title")) && (text.length < 512)) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
}


Expand Down Expand Up @@ -161,6 +163,13 @@ public Map getImages() {
return images;
}

// Frees this scraper's link-tag sets and chains to the superclass close()
// (which nulls tags0/tags1); added in this commit as part of the leak fix.
public void close() {
// free resources
super.close();
linkTags0 = null;
linkTags1 = null;
}

public void print() {
System.out.println("TITLE :" + title);
System.out.println("HEADLINE:" + headline);
Expand Down
7 changes: 7 additions & 0 deletions source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
Expand Up @@ -128,4 +128,11 @@ public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byt
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
}

// Frees this transformer's link-tag sets and chains to the superclass
// close(); mirrors htmlFilterContentScraper.close() added in this commit.
public void close() {
// free resources
super.close();
linkTags0 = null;
linkTags1 = null;
}

}
23 changes: 13 additions & 10 deletions source/de/anomic/htmlFilter/htmlFilterOutputStream.java
Expand Up @@ -434,32 +434,35 @@ public void flush() throws IOException {
if (out != null) out.flush();
// if you want to flush all, call close() at end of writing;
}

// NOTE(review): this hunk interleaves the removed and added versions of
// finalize()/close() (diff markers lost in scraping). The commit inverts the
// old relationship: previously close() called finalize() and finalize() did
// the flushing work; now finalize() merely calls close(), and close() does
// all flushing, finalization and resource release. Annotations below are a
// best-effort reconstruction — confirm against the repository history.
// [removed in this commit] field that cached the filterFinalize() output between finalize() and close()
private byte[] finalized = null;


public void finalize() throws IOException {
// if we are forced to close, we of course flush the buffer first,
// then close the connection
// [removed] old finalize() computed the quote char and did the flush itself
byte quotechar = (inSingleQuote) ? singlequote : doublequote;
// [added] new finalize() is only a safety net delegating to close()
close();
}

// [added] close() now owns the full shutdown sequence
public void close() throws IOException {
byte quotechar = (inSingleQuote) ? singlequote : doublequote;
if (buffer != null) {
if (buffer.length() > 0) {
// flush any remaining buffered bytes through the sentence filter
byte[] filtered = filterSentence(buffer.getBytes(), quotechar);
if (out != null) out.write(filtered);
}
buffer = null;
}
// [removed] old code stored the finalize output in the (now removed) field
finalized = filterFinalize(quotechar);
}

// [removed] old close() delegated to finalize() first
public void close() throws IOException {
finalize();
// [added] finalize output is now a local, written straight to the stream
byte[] finalized = filterFinalize(quotechar);
if (out != null) {
if (finalized != null) out.write(finalized);
out.flush();
out.close();
}
// [added] release filter state to help the GC — part of the leak fix
filterTag = null;
filterOpts = null;
filterCont = null;
// [added, deliberately disabled] closing scraper/transformer here would free
// them too early for callers that still read the scraper afterwards —
// the "further work necessary" from the commit message
//if (scraper != null) {scraper.close(); scraper = null;}
//if (transformer != null) {transformer.close(); transformer = null;}
}

private static boolean binaryHint(byte b) {
if (b < 0) return false;
if (b > 31) return false;
Expand Down
2 changes: 2 additions & 0 deletions source/de/anomic/htmlFilter/htmlFilterScraper.java
Expand Up @@ -53,5 +53,7 @@ public interface htmlFilterScraper {
public void scrapeTag0(String tagname, Properties tagopts);

public void scrapeTag1(String tagname, Properties tagopts, byte[] text);

public void close();

}
1 change: 1 addition & 0 deletions source/de/anomic/htmlFilter/htmlFilterTransformer.java
Expand Up @@ -73,4 +73,5 @@ public interface htmlFilterTransformer {
// method that is called when a body-containing text occurs
public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byte quotechar);

public void close();
}
41 changes: 23 additions & 18 deletions source/de/anomic/plasma/plasmaParser.java
Expand Up @@ -284,35 +284,35 @@ public static String[] setEnabledParserList(Set mimeTypeSet) {

if (mimeTypeSet != null) {
Iterator mimeTypes = mimeTypeSet.iterator();
while (mimeTypes.hasNext()) {
String mimeType = (String) mimeTypes.next();
if (availableParserList.containsKey(mimeType)) {
while (mimeTypes.hasNext()) {
String mimeType = (String) mimeTypes.next();
if (availableParserList.containsKey(mimeType)) {
Parser theParser = null;
try {
// getting the parser
theParser = (Parser) plasmaParser.theParserPool.borrowObject(availableParserList.get(mimeType));

// getting a list of mimeTypes that the parser supports
Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
if (parserSupportsMimeTypes != null) {
Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
if ((supportedExtensions != null) &&
(supportedExtensions instanceof String) &&
(((String)supportedExtensions).length() > 0)) {
String[] extArray = ((String)supportedExtensions).split(",");
Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
if ((supportedExtensions != null) &&
(supportedExtensions instanceof String) &&
(((String)supportedExtensions).length() > 0)) {
String[] extArray = ((String)supportedExtensions).split(",");
newSupportedFileExt.addAll(Arrays.asList(extArray));
}
}
newEnabledParsers.put(mimeType,availableParserList.get(mimeType));
newEnabledParsers.put(mimeType,availableParserList.get(mimeType));

} catch (Exception e) {
} catch (Exception e) {
e.printStackTrace();
} finally {
if (theParser != null)
if (theParser != null)
try { plasmaParser.theParserPool.returnObject(mimeType,theParser); } catch (Exception e) {}
}
}
}
}
}
}

synchronized (enabledParserList) {
Expand Down Expand Up @@ -392,7 +392,7 @@ private static void loadAvailableParserList() {
String fullClassName = plasmaParserPkgName + "." + currentDir.getName() + "." + className;
try {
// trying to load the parser class by its name
Class parserClass = Class.forName(fullClassName);
Class parserClass = Class.forName(fullClassName);
Object theParser = parserClass.newInstance();
if (!(theParser instanceof Parser)) continue;

Expand Down Expand Up @@ -458,11 +458,13 @@ public plasmaParserDocument parseSource(URL location, String mimeType, byte[] so
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);

hfos.write(source);
hfos.close();
return transformScraper(location, mimeType, scraper);
} else {
return null;
}
} catch (Exception e) {
//e.printStackTrace();
return null;
} finally {
if ((theParser != null) && (supportedMimeTypesContains(mimeType))) {
Expand All @@ -487,14 +489,14 @@ public plasmaParserDocument parseSource(URL location, String mimeType, File sour
// ...otherwise we make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);

serverFileUtils.copy(sourceFile, hfos);
hfos.close();
return transformScraper(location, mimeType, scraper);
} else {
return null;
}
} catch (Exception e) {
// e.printStackTrace();
//e.printStackTrace();
return null;
} finally {
if ((theParser != null) && (supportedMimeTypesContains(mimeType))) {
Expand All @@ -505,11 +507,14 @@ public plasmaParserDocument parseSource(URL location, String mimeType, File sour

// Builds a plasmaParserDocument from a finished scraper run. Returns null if
// the normalized URL is malformed. NOTE(review): the removed and added
// versions of the return statement appear back-to-back (diff markers lost).
public plasmaParserDocument transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
try {
// [removed in this commit] returned the new document directly
return new plasmaParserDocument(new URL(urlNormalform(location)),
// [added] document is built into a local so the scraper could be closed before returning
plasmaParserDocument ppd = new plasmaParserDocument(new URL(urlNormalform(location)),
mimeType, null, null, scraper.getHeadline(),
null, null,
scraper.getText(), scraper.getAnchors(), scraper.getImages());
// [added, disabled] closing the scraper here is left for "further work" —
// presumably it would null data the returned document still references; verify
//scraper.close();
return ppd;
} catch (MalformedURLException e) {
//e.printStackTrace();
return null;
}
}
Expand Down
8 changes: 5 additions & 3 deletions source/de/anomic/plasma/plasmaSwitchboard.java
Expand Up @@ -445,7 +445,7 @@ public void close() {
wordIndex.close(waitingBoundSeconds);
log.logSystem("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager");
try {
cacheLoader.close();
cacheLoader.close();
wikiDB.close();
messageDB.close();
facilityDB.close();
Expand All @@ -468,10 +468,12 @@ public int queueSize() {
//return processStack.size() + cacheLoader.size() + noticeURL.stackSize();
}

/*
public int lUrlSize() {
return urlPool.loadedURL.size();
}

*/

// Reports the current word-index size. NOTE(review): the name suggests a
// minimum cache size, but the body simply returns wordIndex.size() — confirm
// the intended semantics against callers.
public int cacheSizeMin() {
return wordIndex.size();
}
Expand Down Expand Up @@ -765,7 +767,7 @@ private void processResourceStack(plasmaHTCache.Entry entry) {
log.logDebug("processResourceStack processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", source=" + ((entry.cacheArray == null) ? "scraper" : "byte[]") + ", url=" + entry.nomalizedURLString); // DEBUG

// parse content
plasmaParserDocument document;
plasmaParserDocument document = null;

if (plasmaParser.supportedMimeTypesContains(entry.responseHeader.mime())) {
if (entry.scraper != null) {
Expand Down

0 comments on commit a25b5b4

Please sign in to comment.