Skip to content

Commit

Permalink
Integration of Michael's string extraction.
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2337 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
allo committed Jul 26, 2006
1 parent 8b77afd commit 4e9f02c
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 2 deletions.
30 changes: 28 additions & 2 deletions source/de/anomic/data/gettext.java
Expand Up @@ -27,6 +27,7 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
Expand All @@ -36,6 +37,11 @@
import java.util.Iterator;
import java.util.Map;

import de.anomic.htmlFilter.htmlFilterAbstractTransformer;
import de.anomic.htmlFilter.htmlFilterContentTransformer;
import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.logging.serverLog;

public class gettext{
public static ArrayList createGettextRecursive(File sourceDir, String extensions, String notdir, File oldgettextfile) throws FileNotFoundException{
if(oldgettextfile==null)
Expand Down Expand Up @@ -121,7 +127,10 @@ public static ArrayList createGettext(ArrayList filenames, Map oldgettext){
while(it.hasNext()){
try {
filename=(String)it.next();
tmp=getGettextSource(new File(filename), oldgettext);
//TODO: better possibility to switch the behaviour
//tmp=getGettextSource(new File(filename), oldgettext);
tmp=getGettextSourceFromHTML(new File(filename), oldgettext);
serverLog.logFinest("Gettext", "Extracting Strings from: "+filename);
} catch (FileNotFoundException e) {
System.out.println("File \""+filename+"\" not found.");
}
Expand All @@ -139,10 +148,27 @@ public static ArrayList getGettextSource(File inputfile) throws FileNotFoundExce
return getGettextSource(inputfile, new HashMap());
}
/**
 * Builds the gettext source for a file, using the items returned by
 * getGettextItems(inputfile) as the list of strings.
 *
 * @param inputfile the file to process
 * @param oldgettextmap passed through unchanged to getGettextSource(File, Map, ArrayList)
 * @return the result of getGettextSource(File, Map, ArrayList)
 * @throws FileNotFoundException propagated from the helpers this method delegates to
 */
public static ArrayList getGettextSource(File inputfile, Map oldgettextmap) throws FileNotFoundException{
    // Delegate to the three-argument variant with the items extracted from the file itself.
    return getGettextSource(inputfile, oldgettextmap, getGettextItems(inputfile));
}
/**
 * Extracts the strings of a file via the HTML content transformer and builds
 * the gettext source for them.
 *
 * The file is read line by line (every line is re-terminated with "\n"), the
 * collected text is handed as bytes to htmlFilterContentTransformer.getStrings(byte[]),
 * and the resulting list is passed on to getGettextSource(File, Map, ArrayList).
 *
 * Reading is best effort: if an IOException occurs while reading, the error is
 * reported on System.out and processing continues with the content read so far.
 *
 * @param inputfile the file to read
 * @param oldgettextmap passed through unchanged to getGettextSource(File, Map, ArrayList)
 * @return the result of getGettextSource(File, Map, ArrayList)
 * @throws FileNotFoundException if inputfile cannot be opened for reading, or
 *         if it is thrown by getGettextSource(File, Map, ArrayList)
 */
public static ArrayList getGettextSourceFromHTML(File inputfile, Map oldgettextmap) throws FileNotFoundException{
    htmlFilterContentTransformer transformer=new htmlFilterContentTransformer();
    // NOTE(review): FileReader and getBytes() below both use the platform default
    // charset; kept as-is so the bytes stay consistent with what getStrings() decodes.
    BufferedReader br=new BufferedReader(new FileReader(inputfile));
    StringBuffer content=new StringBuffer();
    try {
        String line;
        while((line=br.readLine())!=null){
            content.append(line).append("\n");
        }
    } catch (IOException e) {
        // Keep the best-effort behaviour (continue with the partial content),
        // but do not hide the failure from the operator.
        System.out.println("Error while reading file \""+inputfile+"\": "+e.getMessage());
    } finally {
        // Always release the file handle; previously the reader was never closed.
        try {
            br.close();
        } catch (IOException ignored) {
            // Nothing sensible can be done if closing fails.
        }
    }
    ArrayList strings = transformer.getStrings(content.toString().getBytes());
    return getGettextSource(inputfile, oldgettextmap, strings);
}
public static ArrayList getGettextSource(File inputfile, Map oldgettextmap, ArrayList strings) throws FileNotFoundException{
if(oldgettextmap==null)
oldgettextmap=new HashMap();

ArrayList strings=getGettextItems(inputfile);

ArrayList list=new ArrayList();
Iterator it=strings.iterator();
if(strings.isEmpty())
Expand Down
22 changes: 22 additions & 0 deletions source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
Expand Up @@ -136,6 +136,28 @@ private boolean bluelistHit(byte[] text) {
return false;
}

/**
 * Collects the text fragments of the given content as strings.
 *
 * The content is split with httpTemplate.splitQuotations(). Fragments that
 * are whitespace (isWhitespace(true)) are skipped, as are fragments whose first
 * byte equals httpTemplate.hash or that start with httpTemplate.dpdpa (these
 * are templates or parts of templates). For every remaining fragment the bytes
 * between whitespaceStart(true) and whitespaceEnd(true) are converted to a
 * String using the platform default charset and added to the result.
 *
 * @param text the raw content to scan
 * @return list of String objects, one per text fragment, in order of appearance
 */
public ArrayList getStrings(byte[] text){
    final ArrayList strings = new ArrayList();
    final serverByteBuffer[] fragments = httpTemplate.splitQuotations(new serverByteBuffer(text));

    for (int idx = 0; idx < fragments.length; idx++) {
        final serverByteBuffer fragment = fragments[idx];

        // pure whitespace carries no translatable text
        if (fragment.isWhitespace(true)) {
            continue;
        }

        // this is a template or a part of a template
        if (fragment.byteAt(0) == httpTemplate.hash || fragment.startsWith(httpTemplate.dpdpa)) {
            continue;
        }

        // this is a text fragment: keep only the part between the whitespace borders
        final int from = fragment.whitespaceStart(true);
        final int to = fragment.whitespaceEnd(true);
        strings.add(new String(fragment.getBytes(from, to)));
    }
    return strings;
}
public byte[] transformText(byte[] text) {
if (gettext) {
serverByteBuffer sbb = new serverByteBuffer(text);
Expand Down

0 comments on commit 4e9f02c

Please sign in to comment.