Skip to content

Commit

Permalink
fixed possible memory leak in htmlScraper: be aware that now links can get lost; further work necessary
Browse files Browse the repository at this point in the history

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@288 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Jun 16, 2005
1 parent 3874785 commit a25b5b4
Show file tree
Hide file tree
Showing 18 changed files with 158 additions and 75 deletions.
26 changes: 25 additions & 1 deletion build.xml
Expand Up @@ -360,6 +360,29 @@

<!-- run YaCy (needed for NetBeans4) -->
<target name="run" description="run YaCy">
<!-- debug options:
java -Xrunhprof:help
Hprof usage: -Xrunhprof[:help]|[:<option>=<value>, ...]
Option Name and Value Description Default
_____________________ ___________ _______
heap=dump|sites|all heap profiling all
cpu=samples|times|old CPU usage off
monitor=y|n monitor contention n
format=a|b ascii or binary output a
file=<file> write data to file java.hprof(.txt for ascii)
net=<host>:<port> send data over a socket write to file
depth=<size> stack trace depth 4
cutoff=<value> output cutoff point 0.0001
lineno=y|n line number in traces? y
thread=y|n thread in traces? n
doe=y|n dump on exit? y
gc_okay=y|n GC okay during sampling y
Example: java -Xrunhprof:cpu=samples,file=log.txt,depth=3 FooClass
Note: format=b cannot be used with cpu=old|times
-->
<java classname="yacy" fork="yes">
<classpath>
<pathelement location="${build}"/>
Expand All @@ -369,8 +392,9 @@
<pathelement location="${libx}" />
<fileset dir="${libx}" includes="**/*.jar" />
</classpath>
<!--<arg value="-Xrunhprof"/>-->
<arg line="-start"/>
<!-- <arg line="-migratewords"/> -->
<!-- <arg line="-migratewords"/>-->
<!-- <arg line="-start ${user.dir}"/>-->
</java>
</target>
Expand Down
6 changes: 3 additions & 3 deletions htroot/IndexCreate_p.java
Expand Up @@ -104,11 +104,11 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
boolean crawlOrder = ((String) post.get("crawlOrder", "")).equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
boolean xsstopw = ((String) post.get("xsstopw", "")).equals("on");
env.setConfig("xsstopw", (crawlOrder) ? "true" : "false");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
boolean xdstopw = ((String) post.get("xdstopw", "")).equals("on");
env.setConfig("xdstopw", (crawlOrder) ? "true" : "false");
env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
boolean xpstopw = ((String) post.get("xpstopw", "")).equals("on");
env.setConfig("xpstopw", (crawlOrder) ? "true" : "false");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");

String crawlingStart = (String) post.get("crawlingURL");
if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;
Expand Down
14 changes: 12 additions & 2 deletions source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
Expand Up @@ -46,11 +46,11 @@ public htmlFilterAbstractScraper(HashSet tags0, HashSet tags1) {
}

// Checks whether the given tag name is a known "tag0" (singleton) tag.
// NOTE(review): this hunk shows both the pre- and post-commit line; the diff
// markers were lost in scraping, so the two return statements below are the
// removed and added versions of the same line, not sequential code.
public boolean isTag0(String tag) {
// [removed in this commit] would throw NullPointerException once close() has set tags0 to null
return tags0.contains(tag);
// [added in this commit] null-guard: close() nulls tags0 to free memory, so a late call must not crash
return (tags0 != null) && (tags0.contains(tag));
}

// Checks whether the given tag name is a known "tag1" (paired/body-containing) tag.
public boolean isTag1(String tag) {
// [removed in this commit] same NullPointerException hazard for tags1 after close()
return tags1.contains(tag);
// [added in this commit] null-guard mirroring isTag0
return (tags1 != null) && (tags1.contains(tag));
}

//the 'missing' method that shall be implemented:
Expand Down Expand Up @@ -405,4 +405,14 @@ public static serverByteBuffer stripAll(serverByteBuffer bb) {
return convertUmlaute(transscriptAll(stripAllTags(bb)));
}

// Releases the tag-name sets so the scraper's memory can be reclaimed promptly;
// part of this commit's memory-leak fix. After close(), isTag0/isTag1 rely on
// their null-guards (added in this same commit) to stay safe.
public void close() {
// free resources
tags0 = null;
tags1 = null;
}

// Finalizer as a safety net so resources are freed even if close() was never
// called. NOTE(review): this override is public and does not call
// super.finalize(); finalizers are also unreliable/discouraged in Java —
// acceptable for 2005-era code, but worth revisiting.
public void finalize() {
close();
}

}
10 changes: 10 additions & 0 deletions source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java
Expand Up @@ -78,4 +78,14 @@ public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byt
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
}

// Releases the transformer's tag-name sets; mirrors the close() added to
// htmlFilterAbstractScraper in this same commit.
public void close() {
// free resources
tags0 = null;
tags1 = null;
}

// Finalizer safety net in case close() is never called explicitly.
// NOTE(review): does not call super.finalize(); finalizers are discouraged.
public void finalize() {
close();
}

}
17 changes: 13 additions & 4 deletions source/de/anomic/htmlFilter/htmlFilterContentScraper.java
Expand Up @@ -122,10 +122,12 @@ public void scrapeTag0(String tagname, Properties tagopts) {

// Scrapes a paired tag: collects anchors ("a"), the headline ("h1") and the
// page title ("title"). This hunk is the heart of the commit's memory-leak
// fix: unbounded tag text is no longer stored. NOTE(review): the scrape lost
// the diff markers, so the removed (old) and added (new) lines appear
// back-to-back below.
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
//System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
// [removed in this commit] stored every anchor regardless of text size — the suspected leak
if (tagname.equals("a")) anchors.put(absolutePath(tagopts.getProperty("href", "")),
new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString());
// [removed] unbounded headline/title capture
if (tagname.equals("h1")) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if (tagname.equals("title")) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
// [added] anchors are only kept when the tag text is under 2048 bytes — this
// is why the commit message warns that "links can get lost; further work necessary"
if ((tagname.equals("a")) && (text.length < 2048)) {
byte[] a = super.stripAll(new serverByteBuffer(text)).getBytes();
anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(a).trim().toString());
}
// [added] headline and title are likewise capped at 512 bytes of tag text
if ((tagname.equals("h1")) && (text.length < 512)) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if ((tagname.equals("title")) && (text.length < 512)) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
}


Expand Down Expand Up @@ -161,6 +163,13 @@ public Map getImages() {
return images;
}

// Frees this scraper's link-tag sets and chains to the superclass close()
// (which nulls tags0/tags1); added in this commit as part of the leak fix.
public void close() {
// free resources
super.close();
linkTags0 = null;
linkTags1 = null;
}

public void print() {
System.out.println("TITLE :" + title);
System.out.println("HEADLINE:" + headline);
Expand Down
7 changes: 7 additions & 0 deletions source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
Expand Up @@ -128,4 +128,11 @@ public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byt
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
}

// Frees this transformer's link-tag sets and chains to the superclass
// close(); mirrors htmlFilterContentScraper.close() added in this commit.
public void close() {
// free resources
super.close();
linkTags0 = null;
linkTags1 = null;
}

}
23 changes: 13 additions & 10 deletions source/de/anomic/htmlFilter/htmlFilterOutputStream.java
Expand Up @@ -434,32 +434,35 @@ public void flush() throws IOException {
if (out != null) out.flush();
// if you want to flush all, call close() at end of writing;
}

// NOTE(review): this hunk interleaves the removed and added versions of
// finalize()/close() (diff markers lost in scraping). The commit inverts the
// old relationship: previously close() called finalize() and finalize() did
// the flushing work; now finalize() merely calls close(), and close() does
// all flushing, finalization and resource release. Annotations below are a
// best-effort reconstruction — confirm against the repository history.
// [removed in this commit] field that cached the filterFinalize() output between finalize() and close()
private byte[] finalized = null;


public void finalize() throws IOException {
// if we are forced to close, we of course flush the buffer first,
// then close the connection
// [removed] old finalize() computed the quote char and did the flush itself
byte quotechar = (inSingleQuote) ? singlequote : doublequote;
// [added] new finalize() is only a safety net delegating to close()
close();
}

// [added] close() now owns the full shutdown sequence
public void close() throws IOException {
byte quotechar = (inSingleQuote) ? singlequote : doublequote;
if (buffer != null) {
if (buffer.length() > 0) {
// flush any remaining buffered bytes through the sentence filter
byte[] filtered = filterSentence(buffer.getBytes(), quotechar);
if (out != null) out.write(filtered);
}
buffer = null;
}
// [removed] old code stored the finalize output in the (now removed) field
finalized = filterFinalize(quotechar);
}

// [removed] old close() delegated to finalize() first
public void close() throws IOException {
finalize();
// [added] finalize output is now a local, written straight to the stream
byte[] finalized = filterFinalize(quotechar);
if (out != null) {
if (finalized != null) out.write(finalized);
out.flush();
out.close();
}
// [added] release filter state to help the GC — part of the leak fix
filterTag = null;
filterOpts = null;
filterCont = null;
// [added, deliberately disabled] closing scraper/transformer here would free
// them too early for callers that still read the scraper afterwards —
// the "further work necessary" from the commit message
//if (scraper != null) {scraper.close(); scraper = null;}
//if (transformer != null) {transformer.close(); transformer = null;}
}

private static boolean binaryHint(byte b) {
if (b < 0) return false;
if (b > 31) return false;
Expand Down
2 changes: 2 additions & 0 deletions source/de/anomic/htmlFilter/htmlFilterScraper.java
Expand Up @@ -53,5 +53,7 @@ public interface htmlFilterScraper {
public void scrapeTag0(String tagname, Properties tagopts);

public void scrapeTag1(String tagname, Properties tagopts, byte[] text);

public void close();

}
1 change: 1 addition & 0 deletions source/de/anomic/htmlFilter/htmlFilterTransformer.java
Expand Up @@ -73,4 +73,5 @@ public interface htmlFilterTransformer {
// method that is called when a body-containing text occurs
public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byte quotechar);

public void close();
}
41 changes: 23 additions & 18 deletions source/de/anomic/plasma/plasmaParser.java
Expand Up @@ -284,35 +284,35 @@ public static String[] setEnabledParserList(Set mimeTypeSet) {

if (mimeTypeSet != null) {
Iterator mimeTypes = mimeTypeSet.iterator();
while (mimeTypes.hasNext()) {
String mimeType = (String) mimeTypes.next();
if (availableParserList.containsKey(mimeType)) {
while (mimeTypes.hasNext()) {
String mimeType = (String) mimeTypes.next();
if (availableParserList.containsKey(mimeType)) {
Parser theParser = null;
try {
// getting the parser
theParser = (Parser) plasmaParser.theParserPool.borrowObject(availableParserList.get(mimeType));

// getting a list of mimeTypes that the parser supports
Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
if (parserSupportsMimeTypes != null) {
Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
if ((supportedExtensions != null) &&
(supportedExtensions instanceof String) &&
(((String)supportedExtensions).length() > 0)) {
String[] extArray = ((String)supportedExtensions).split(",");
Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
if ((supportedExtensions != null) &&
(supportedExtensions instanceof String) &&
(((String)supportedExtensions).length() > 0)) {
String[] extArray = ((String)supportedExtensions).split(",");
newSupportedFileExt.addAll(Arrays.asList(extArray));
}
}
newEnabledParsers.put(mimeType,availableParserList.get(mimeType));
newEnabledParsers.put(mimeType,availableParserList.get(mimeType));

} catch (Exception e) {
} catch (Exception e) {
e.printStackTrace();
} finally {
if (theParser != null)
if (theParser != null)
try { plasmaParser.theParserPool.returnObject(mimeType,theParser); } catch (Exception e) {}
}
}
}
}
}
}

synchronized (enabledParserList) {
Expand Down Expand Up @@ -392,7 +392,7 @@ private static void loadAvailableParserList() {
String fullClassName = plasmaParserPkgName + "." + currentDir.getName() + "." + className;
try {
// trying to load the parser class by its name
Class parserClass = Class.forName(fullClassName);
Class parserClass = Class.forName(fullClassName);
Object theParser = parserClass.newInstance();
if (!(theParser instanceof Parser)) continue;

Expand Down Expand Up @@ -458,11 +458,13 @@ public plasmaParserDocument parseSource(URL location, String mimeType, byte[] so
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);

hfos.write(source);
hfos.close();
return transformScraper(location, mimeType, scraper);
} else {
return null;
}
} catch (Exception e) {
//e.printStackTrace();
return null;
} finally {
if ((theParser != null) && (supportedMimeTypesContains(mimeType))) {
Expand All @@ -487,14 +489,14 @@ public plasmaParserDocument parseSource(URL location, String mimeType, File sour
// ...otherwise we make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);

serverFileUtils.copy(sourceFile, hfos);
hfos.close();
return transformScraper(location, mimeType, scraper);
} else {
return null;
}
} catch (Exception e) {
// e.printStackTrace();
//e.printStackTrace();
return null;
} finally {
if ((theParser != null) && (supportedMimeTypesContains(mimeType))) {
Expand All @@ -505,11 +507,14 @@ public plasmaParserDocument parseSource(URL location, String mimeType, File sour

// Builds a plasmaParserDocument from a finished scraper run. Returns null if
// the normalized URL is malformed. NOTE(review): the removed and added
// versions of the return statement appear back-to-back (diff markers lost).
public plasmaParserDocument transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
try {
// [removed in this commit] returned the new document directly
return new plasmaParserDocument(new URL(urlNormalform(location)),
// [added] document is built into a local so the scraper could be closed before returning
plasmaParserDocument ppd = new plasmaParserDocument(new URL(urlNormalform(location)),
mimeType, null, null, scraper.getHeadline(),
null, null,
scraper.getText(), scraper.getAnchors(), scraper.getImages());
// [added, disabled] closing the scraper here is left for "further work" —
// presumably it would null data the returned document still references; verify
//scraper.close();
return ppd;
} catch (MalformedURLException e) {
//e.printStackTrace();
return null;
}
}
Expand Down
8 changes: 5 additions & 3 deletions source/de/anomic/plasma/plasmaSwitchboard.java
Expand Up @@ -445,7 +445,7 @@ public void close() {
wordIndex.close(waitingBoundSeconds);
log.logSystem("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager");
try {
cacheLoader.close();
cacheLoader.close();
wikiDB.close();
messageDB.close();
facilityDB.close();
Expand All @@ -468,10 +468,12 @@ public int queueSize() {
//return processStack.size() + cacheLoader.size() + noticeURL.stackSize();
}

/*
public int lUrlSize() {
return urlPool.loadedURL.size();
}

*/

// Reports the current word-index size. NOTE(review): the name suggests a
// minimum cache size, but the body simply returns wordIndex.size() — confirm
// the intended semantics against callers.
public int cacheSizeMin() {
return wordIndex.size();
}
Expand Down Expand Up @@ -765,7 +767,7 @@ private void processResourceStack(plasmaHTCache.Entry entry) {
log.logDebug("processResourceStack processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", source=" + ((entry.cacheArray == null) ? "scraper" : "byte[]") + ", url=" + entry.nomalizedURLString); // DEBUG

// parse content
plasmaParserDocument document;
plasmaParserDocument document = null;

if (plasmaParser.supportedMimeTypesContains(entry.responseHeader.mime())) {
if (entry.scraper != null) {
Expand Down

0 comments on commit a25b5b4

Please sign in to comment.