Skip to content

Commit

Permalink
lines inside tags without punctuation are extended by a single dot.
Browse files Browse the repository at this point in the history
This enables the condenser to distinguish the lines in a better way.
The result is a better preparation of snippets.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2715 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Oct 8, 2006
1 parent e251728 commit fd61209
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 15 deletions.
2 changes: 1 addition & 1 deletion source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
Expand Up @@ -304,7 +304,7 @@ public boolean isTag1(String tag) {
}

//the 'missing' method that shall be implemented:
public abstract void scrapeText(char[] text);
public abstract void scrapeText(char[] text, boolean insideTag);

// the other methods must take into account to construct the return value correctly
public abstract void scrapeTag0(String tagname, Properties tagopts);
Expand Down
19 changes: 16 additions & 3 deletions source/de/anomic/htmlFilter/htmlFilterContentScraper.java
Expand Up @@ -115,10 +115,23 @@ public htmlFilterContentScraper(URL root) {
this.content = new serverCharBuffer(1024);
}

public void scrapeText(char[] newtext) {
/**
 * Tells whether a character is one of the sentence-ending marks
 * recognised by this class.
 *
 * @param c the character to examine
 * @return <code>true</code> if <code>c</code> is '.', '!' or '?';
 *         <code>false</code> for every other character
 */
public final static boolean punctuation(char c) {
    switch (c) {
        case '.':
        case '!':
        case '?':
            return true;
        default:
            return false;
    }
}

/**
 * Adds a piece of scraped text to the <code>content</code> buffer.
 *
 * The text is passed through <code>super.stripAll(...)</code> and trimmed
 * (presumably to remove markup and surrounding whitespace - verify against
 * the super class). When <code>insideTag</code> is set and the resulting
 * text is non-empty and does not end with '.', '!' or '?', a single dot is
 * appended to it. A non-empty result is then appended to
 * <code>content</code>, followed by a blank (character code 32).
 *
 * @param newtext   the raw characters to scrape
 * @param insideTag <code>true</code> if the text was found inside a tag;
 *                  enables the trailing-dot completion described above
 */
public void scrapeText(char[] newtext, boolean insideTag) {
// System.out.println("SCRAPE: " + new String(newtext));
// NOTE(review): the next two statements already append the stripped text to
// 'content', and the code further down appends it again. They look like the
// pre-change implementation that the diff view kept next to the new lines -
// confirm against the real file before relying on this listing.
if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32);
content.append(super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim()).append(32);
// stripped and trimmed copy of the new text; the buffer is created with one
// extra position beyond newtext.length (presumably room for the dot - TODO confirm)
serverCharBuffer b = super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim();
if (insideTag) {
// texts inside tags sometimes have no punctuation at the line end
// this is bad for the text semantics, because it is not possible for the
// condenser to distinguish headlines from text beginnings.
// to make it easier for the condenser, a dot ('.') is appended in case that
// the newtext line does not end with a punctuation character
if ((b.length() != 0) && (!(punctuation(b.charAt(b.length() - 1))))) b.append('.');
//System.out.println("*** Appended dot: " + b.toString());
}
// empty results are dropped; everything else is stored followed by a blank
if (b.length() != 0) content.append(b).append((char) 32);
}

public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/htmlFilter/htmlFilterScraper.java
Expand Up @@ -48,7 +48,7 @@ public interface htmlFilterScraper {

public boolean isTag1(String tag);

public void scrapeText(char[] text);
public void scrapeText(char[] text, boolean insideTag);

public void scrapeTag0(String tagname, Properties tagopts);

Expand Down
4 changes: 2 additions & 2 deletions source/de/anomic/htmlFilter/htmlFilterWriter.java
Expand Up @@ -186,7 +186,7 @@ private char[] filterTag(String tag, boolean opening, char[] content, char quote
// we are not collecting tag text
if (tag == null) {
// and this is not a tag opener/closer
if (scraper != null) scraper.scrapeText(content);
if (scraper != null) scraper.scrapeText(content, false);
if (transformer != null) return transformer.transformText(content);
return content;
}
Expand Down Expand Up @@ -221,7 +221,7 @@ private char[] filterTag(String tag, boolean opening, char[] content, char quote
// we are collecting tag text for the tag 'filterTag'
if (tag == null) {
// go on collecting content
if (scraper != null) scraper.scrapeText(content);
if (scraper != null) scraper.scrapeText(content, true);
if (transformer != null) {
filterCont.append(transformer.transformText(content));
} else {
Expand Down
13 changes: 5 additions & 8 deletions source/de/anomic/plasma/plasmaCondenser.java
Expand Up @@ -61,6 +61,7 @@
import java.util.TreeMap;
import java.util.TreeSet;

import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroMSetTools;

public final class plasmaCondenser {
Expand Down Expand Up @@ -192,7 +193,7 @@ private void createCondensement(InputStream is) {
word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
// System.out.println("PARSED-WORD " + word);
wordlen = word.length();
if ((wordlen == 1) && (punctuation(word.charAt(0)))) {
if ((wordlen == 1) && (htmlFilterContentScraper.punctuation(word.charAt(0)))) {
// store sentence
if (sentence.length() > 0) {
// we store the punctuation symbol as first element of the sentence vector
Expand Down Expand Up @@ -470,10 +471,6 @@ public void writeMapToFile(File out) throws IOException {
writer.close();
}

/**
 * Checks whether the given character marks the end of a sentence.
 *
 * @param c the character to test
 * @return <code>true</code> for '.', '!' and '?', otherwise <code>false</code>
 */
protected final static boolean punctuation(char c) {
    if (c == '.') {
        return true;
    }
    if (c == '!') {
        return true;
    }
    return c == '?';
}

public final static boolean invisible(char c) {
// TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
if ((c < ' ') || (c > 'z')) return true;
Expand Down Expand Up @@ -507,7 +504,7 @@ private Object nextElement0() {
char c;
loop: while (e.hasMoreElements()) {
s = (String) e.nextElement();
if ((s.length() == 1) && (punctuation(s.charAt(0)))) return s;
if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
if (s.length() < ml) continue loop;
for (int i = 0; i < s.length(); i++) {
c = s.charAt(i);
Expand Down Expand Up @@ -562,7 +559,7 @@ private Object nextElement0() {
for (int i = 0; i < r.length(); i++) {
c = r.charAt(i);
if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8
else if (punctuation(c)) sb = sb.append(' ').append(c).append(' ');
else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' ');
else sb = sb.append(c);
}
s = sb.toString().trim();
Expand Down Expand Up @@ -721,7 +718,7 @@ static String readSentence(Reader reader) throws IOException {
if (nextChar < 0) return null;
c = (char) nextChar;
s.append(c);
if (punctuation(c)) break;
if (htmlFilterContentScraper.punctuation(c)) break;
}

// replace line endings and tabs by blanks
Expand Down

0 comments on commit fd61209

Please sign in to comment.