Skip to content

Commit

Permalink
fixed a worst case situation of the condenser which may cause a temporary full CPU load because of a bad data structure usage
Browse files Browse the repository at this point in the history

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6372 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Oct 5, 2009
1 parent f1bde59 commit ea427df
Showing 1 changed file with 34 additions and 100 deletions.
134 changes: 34 additions & 100 deletions source/de/anomic/document/Condenser.java
Expand Up @@ -35,6 +35,7 @@
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
Expand Down Expand Up @@ -453,13 +454,13 @@ private void createCondensement(final InputStream is) throws UnsupportedEncoding
public final static boolean invisible(final char c) {
final int type = Character.getType(c);
if (
(type == Character.LOWERCASE_LETTER)
|| (type == Character.DECIMAL_DIGIT_NUMBER)
|| (type == Character.UPPERCASE_LETTER)
|| (type == Character.MODIFIER_LETTER)
|| (type == Character.OTHER_LETTER)
|| (type == Character.TITLECASE_LETTER)
|| (ContentScraper.punctuation(c))) {
type == Character.LOWERCASE_LETTER
|| type == Character.DECIMAL_DIGIT_NUMBER
|| type == Character.UPPERCASE_LETTER
|| type == Character.MODIFIER_LETTER
|| type == Character.OTHER_LETTER
|| type == Character.TITLECASE_LETTER
|| ContentScraper.punctuation(c)) {
return false;
}
return true;
Expand Down Expand Up @@ -528,84 +529,18 @@ public StringBuilder nextElement() {
}

}

/*
private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
// returns an enumeration of StringBuilder Objects
StringBuilder buffer = null;
sentencesFromInputStreamEnum e;
StringBuilder s;
int off;
public unsievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
e = new sentencesFromInputStreamEnum(is);
s = new StringBuilder(0);
off = 0;
buffer = nextElement0();
}
public void pre(final boolean x) {
e.pre(x);
}
private StringBuilder nextElement0() {
StringBuilder r;
StringBuilder sb;
char c;
while (s.length() - off <= 0) {
if (e.hasNext()) {
r = e.next();
if (r == null) return null;
r = trim(r);
sb = new StringBuilder(r.length() * 2);
for (int i = 0; i < r.length(); i++) {
c = r.charAt(i);
if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8
else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' ');
else sb = sb.append(c);
}
s = trim(sb);
off = 0;
//System.out.println("PARSING-LINE '" + r + "'->'" + s + "'");
} else {
return null;
}
}
final int p = s.indexOf(" ", off);
if (p < 0) {
r = new StringBuilder(s.substring(off));
s = new StringBuilder(0);
off = 0;
return r;
}
r = trim(new StringBuilder(s.substring(off, p)));
off = p + 1;
while (off < s.length() && s.charAt(off) <= ' ') off++;
return r;
}
public boolean hasMoreElements() {
return buffer != null;
}
public StringBuilder nextElement() {
final StringBuilder r = buffer;
buffer = nextElement0();
return r;
}
}
*/

private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
// returns an enumeration of StringBuilder Objects
StringBuilder buffer = null;
sentencesFromInputStreamEnum e;
StringBuilder s;
ArrayList<StringBuilder> s;
int sIndex;

public unsievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
e = new sentencesFromInputStreamEnum(is);
s = new StringBuilder(20);
s = new ArrayList<StringBuilder>();
sIndex = 0;
buffer = nextElement0();
}

Expand All @@ -617,32 +552,31 @@ private StringBuilder nextElement0() {
StringBuilder r;
StringBuilder sb;
char c;
while (s.length() == 0) {
if (e.hasNext()) {
r = e.next();
if (r == null) return null;
r = trim(r);
sb = new StringBuilder(r.length() * 2);
for (int i = 0; i < r.length(); i++) {
c = r.charAt(i);
if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8
else if (ContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' ');
else sb = sb.append(c);
if (sIndex >= s.size()) {
sIndex = 0;
s.clear();
}
while (s.size() == 0) {
if (!e.hasNext()) return null;
r = e.next();
if (r == null) return null;
r = trim(r);
sb = new StringBuilder(20);
for (int i = 0; i < r.length(); i++) {
c = r.charAt(i);
if (invisible(c)) {
if (sb.length() > 0) {s.add(sb); sb = new StringBuilder(20);}
} else if (ContentScraper.punctuation(c)) {
if (sb.length() > 0) {s.add(sb); sb = new StringBuilder(1);}
sb.append(c);
s.add(sb);
sb = new StringBuilder(20);
} else {
sb = sb.append(c);
}
s = trim(sb);
//System.out.println("PARSING-LINE '" + r + "'->'" + s + "'");
} else {
return null;
}
}
final int p = s.indexOf(" ");
if (p < 0) {
r = s;
s = new StringBuilder();
return r;
}
r = trim(new StringBuilder(s.substring(0, p)));
s = trim(s.delete(0, p + 1));
r = s.get(sIndex++);
return r;
}

Expand Down

0 comments on commit ea427df

Please sign in to comment.