diff --git a/src/main/java/org/thunlp/thulac/Thulac.java b/src/main/java/org/thunlp/thulac/Thulac.java index 167704e..bde24e9 100644 --- a/src/main/java/org/thunlp/thulac/Thulac.java +++ b/src/main/java/org/thunlp/thulac/Thulac.java @@ -3,7 +3,10 @@ import org.thunlp.thulac.cb.CBTaggingDecoder; import org.thunlp.thulac.data.POCGraph; import org.thunlp.thulac.data.TaggedWord; -import org.thunlp.thulac.passes.*; +import org.thunlp.thulac.postprocess.*; +import org.thunlp.thulac.preprocess.IPreprocessPass; +import org.thunlp.thulac.preprocess.PreprocessPass; +import org.thunlp.thulac.preprocess.ConvertT2SPass; import java.io.IOException; import java.util.ArrayList; @@ -65,45 +68,44 @@ public static void split( output.onProgramStart(); // segmentation - POCGraph pocGraph = new POCGraph(); - CBTaggingDecoder cwsTaggingDecoder = new CBTaggingDecoder(); - cwsTaggingDecoder.threshold = segOnly ? 0 : 10000; - cwsTaggingDecoder.separator = separator; + POCGraph graph = new POCGraph(); + CBTaggingDecoder taggingDecoder = new CBTaggingDecoder(); + taggingDecoder.threshold = segOnly ? 0 : 10000; + taggingDecoder.separator = separator; String prefix = modelDir + (segOnly ? "cws_" : "model_c_"); - cwsTaggingDecoder.init(prefix + "model.bin", prefix + "dat.bin", + taggingDecoder.init(prefix + "model.bin", prefix + "dat.bin", prefix + "label.txt"); - cwsTaggingDecoder.setLabelTrans(); + taggingDecoder.setLabelTrans(); - // preprocessor - Preprocessor preprocessor = new Preprocessor(); - if (useT2S) preprocessor.loadT2SMap(modelDir + "t2s.dat"); + // preprocess passes + List pre = new ArrayList<>(); + pre.add(new PreprocessPass()); + if (useT2S) pre.add(new ConvertT2SPass(modelDir + "t2s.dat")); - // adjustment passes - List passes = new ArrayList<>(); - passes.add(new PostprocessPass(modelDir + "ns.dat", "ns", false)); - passes.add(new PostprocessPass(modelDir + "idiom.dat", "i", false)); - passes.add(new PunctuationPass(modelDir + "singlepun.dat")); - passes.add(new TimeWordPass()); - passes.add(new NegWordPass(modelDir + "neg.dat")); - if (userDict != null) passes.add(new PostprocessPass(userDict, "uw", true)); - if (useFilter) // filter - passes.add(new FilterPass(modelDir + "xu.dat", modelDir + "time.dat")); + // postprocess passes + List post = new ArrayList<>(); + post.add(new PostprocessPass(modelDir + "ns.dat", "ns", false)); + post.add(new PostprocessPass(modelDir + "idiom.dat", "i", false)); + post.add(new PunctuationPass(modelDir + "singlepun.dat")); + post.add(new TimeWordPass()); + post.add(new NegWordPass(modelDir + "neg.dat")); + if (userDict != null) post.add(new PostprocessPass(userDict, "uw", true)); + if (useFilter) + post.add(new FilterPass(modelDir + "xu.dat", modelDir + "time.dat")); // main loop - for (List vec = input.provideInput(); vec != null; vec = input.provideInput()) { + for ( + List lineSegments = input.provideInput(); + lineSegments != null; + lineSegments = input.provideInput()) { output.handleLineStart(); - for (String raw : vec) { - // preprocess - raw = preprocessor.cleanup(raw, pocGraph); - if (useT2S) raw = preprocessor.convertT2S(raw); - if (raw.isEmpty()) continue; + for (String raw : lineSegments) { + for (IPreprocessPass pass : pre) raw = pass.process(raw, graph); - // segmentation List tagged = new Vector<>(); - cwsTaggingDecoder.segment(raw, pocGraph, tagged); + taggingDecoder.segment(raw, graph, tagged); - // adjustment passes - for (IAdjustPass pass : passes) pass.adjust(tagged); + for (IPostprocessPass pass : post) pass.process(tagged); output.handleLineSegment(tagged, segOnly); } diff --git a/src/main/java/org/thunlp/thulac/passes/IAdjustPass.java b/src/main/java/org/thunlp/thulac/passes/IAdjustPass.java deleted file mode 100644 index 4c9bce4..0000000 --- a/src/main/java/org/thunlp/thulac/passes/IAdjustPass.java +++ /dev/null @@ -1,12 +0,0 @@ -package org.thunlp.thulac.passes; - -import org.thunlp.thulac.data.TaggedWord; - -import java.util.List; - -/** - * - */ -public interface IAdjustPass { - void adjust(List sentence); -} diff --git a/src/main/java/org/thunlp/thulac/passes/FilterPass.java b/src/main/java/org/thunlp/thulac/postprocess/FilterPass.java similarity index 93% rename from src/main/java/org/thunlp/thulac/passes/FilterPass.java rename to src/main/java/org/thunlp/thulac/postprocess/FilterPass.java index 4c16d76..7585f6c 100644 --- a/src/main/java/org/thunlp/thulac/passes/FilterPass.java +++ b/src/main/java/org/thunlp/thulac/postprocess/FilterPass.java @@ -1,4 +1,4 @@ -package org.thunlp.thulac.passes; +package org.thunlp.thulac.postprocess; import org.thunlp.thulac.data.Dat; import org.thunlp.thulac.data.TaggedWord; @@ -10,7 +10,7 @@ import java.util.List; import java.util.Set; -public class FilterPass implements IAdjustPass { +public class FilterPass implements IPostprocessPass { private static final Set ALLOWED_TAGS = new HashSet<>(Arrays.asList( "n", "np", "ns", "ni", "nz", "v", "a", "id", "t", "uw")); private static final String ARABIC_NUMBER_CODE_POINTS = @@ -29,7 +29,7 @@ public FilterPass(String xuWordFile, String timeWordFile) throws IOException { } @Override - public void adjust(List sentence) { + public void process(List sentence) { if (this.xu_dat == null || this.time_dat == null) return; for (int i = sentence.size() - 1; i >= 0; --i) { diff --git a/src/main/java/org/thunlp/thulac/postprocess/IPostprocessPass.java b/src/main/java/org/thunlp/thulac/postprocess/IPostprocessPass.java new file mode 100644 index 0000000..2636e2e --- /dev/null +++ b/src/main/java/org/thunlp/thulac/postprocess/IPostprocessPass.java @@ -0,0 +1,12 @@ +package org.thunlp.thulac.postprocess; + +import org.thunlp.thulac.data.TaggedWord; + +import java.util.List; + +/** + * + */ +public interface IPostprocessPass { + void process(List sentence); +} diff --git a/src/main/java/org/thunlp/thulac/passes/NegWordPass.java b/src/main/java/org/thunlp/thulac/postprocess/NegWordPass.java similarity index 84% rename from src/main/java/org/thunlp/thulac/passes/NegWordPass.java rename to src/main/java/org/thunlp/thulac/postprocess/NegWordPass.java index a9ec3b9..fd6a2e7 100644 --- a/src/main/java/org/thunlp/thulac/passes/NegWordPass.java +++ b/src/main/java/org/thunlp/thulac/postprocess/NegWordPass.java @@ -1,4 +1,4 @@ -package org.thunlp.thulac.passes; +package org.thunlp.thulac.postprocess; import org.thunlp.thulac.data.Dat; import org.thunlp.thulac.data.TaggedWord; @@ -7,7 +7,7 @@ import java.io.IOException; import java.util.List; -public class NegWordPass implements IAdjustPass { +public class NegWordPass implements IPostprocessPass { private Dat neg_dat; public NegWordPass(String filename) throws IOException { @@ -15,7 +15,7 @@ public NegWordPass(String filename) throws IOException { } @Override - public void adjust(List sentence) { + public void process(List sentence) { if (this.neg_dat == null) return; for (int i = sentence.size() - 1; i >= 0; --i) { diff --git a/src/main/java/org/thunlp/thulac/passes/PostprocessPass.java b/src/main/java/org/thunlp/thulac/postprocess/PostprocessPass.java similarity index 89% rename from src/main/java/org/thunlp/thulac/passes/PostprocessPass.java rename to src/main/java/org/thunlp/thulac/postprocess/PostprocessPass.java index f0f4c36..b38d6cf 100644 --- a/src/main/java/org/thunlp/thulac/passes/PostprocessPass.java +++ b/src/main/java/org/thunlp/thulac/postprocess/PostprocessPass.java @@ -1,4 +1,4 @@ -package org.thunlp.thulac.passes; +package org.thunlp.thulac.postprocess; import org.thunlp.thulac.data.Dat; import org.thunlp.thulac.data.DatMaker; @@ -8,7 +8,7 @@ import java.util.ArrayList; import java.util.List; -public class PostprocessPass implements IAdjustPass { +public class PostprocessPass implements IPostprocessPass { private Dat p_dat; private String tag; @@ -20,7 +20,7 @@ public PostprocessPass(String filename, String tag, boolean isTxt) throws } @Override - public void adjust(List sentence) { + public void process(List sentence) { if (this.p_dat == null) return; List tmp = new ArrayList<>(); diff --git a/src/main/java/org/thunlp/thulac/passes/PunctuationPass.java b/src/main/java/org/thunlp/thulac/postprocess/PunctuationPass.java similarity index 88% rename from src/main/java/org/thunlp/thulac/passes/PunctuationPass.java rename to src/main/java/org/thunlp/thulac/postprocess/PunctuationPass.java index 52bd669..0e5a40e 100644 --- a/src/main/java/org/thunlp/thulac/passes/PunctuationPass.java +++ b/src/main/java/org/thunlp/thulac/postprocess/PunctuationPass.java @@ -1,4 +1,4 @@ -package org.thunlp.thulac.passes; +package org.thunlp.thulac.postprocess; import org.thunlp.thulac.data.Dat; import org.thunlp.thulac.data.TaggedWord; @@ -7,7 +7,7 @@ import java.util.List; import java.util.Vector; -public class PunctuationPass implements IAdjustPass { +public class PunctuationPass implements IPostprocessPass { private Dat p_dat; public PunctuationPass(String filename) throws IOException { @@ -15,7 +15,7 @@ public PunctuationPass(String filename) throws IOException { } @Override - public void adjust(List sentence) { + public void process(List sentence) { if (this.p_dat == null) return; Vector tmp = new Vector<>(); diff --git a/src/main/java/org/thunlp/thulac/passes/TimeWordPass.java b/src/main/java/org/thunlp/thulac/postprocess/TimeWordPass.java similarity index 95% rename from src/main/java/org/thunlp/thulac/passes/TimeWordPass.java rename to src/main/java/org/thunlp/thulac/postprocess/TimeWordPass.java index 94e2bfc..3a143d7 100644 --- a/src/main/java/org/thunlp/thulac/passes/TimeWordPass.java +++ b/src/main/java/org/thunlp/thulac/postprocess/TimeWordPass.java @@ -1,11 +1,11 @@ -package org.thunlp.thulac.passes; +package org.thunlp.thulac.postprocess; import org.thunlp.thulac.data.TaggedWord; import org.thunlp.thulac.util.StringUtil; import java.util.List; -public class TimeWordPass implements IAdjustPass { +public class TimeWordPass implements IPostprocessPass { private static final String ARABIC_NUMBER_CODE_POINTS = StringUtil.toString(48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 65296, 65297, 65298, 65299, 65300, 65301, 65302, 65303, 65304, 65305); @@ -48,7 +48,7 @@ private boolean isHttpWord(String word) { } @Override - public void adjust(List sentence) { + public void process(List sentence) { this.processTimeWords(sentence); this.processDoubleWords(sentence); this.processHttpWords(sentence); diff --git a/src/main/java/org/thunlp/thulac/passes/VerbPass.java b/src/main/java/org/thunlp/thulac/postprocess/VerbPass.java similarity index 84% rename from src/main/java/org/thunlp/thulac/passes/VerbPass.java rename to src/main/java/org/thunlp/thulac/postprocess/VerbPass.java index ee98869..d0cb7e4 100644 --- a/src/main/java/org/thunlp/thulac/passes/VerbPass.java +++ b/src/main/java/org/thunlp/thulac/postprocess/VerbPass.java @@ -1,4 +1,4 @@ -package org.thunlp.thulac.passes; +package org.thunlp.thulac.postprocess; import org.thunlp.thulac.data.Dat; import org.thunlp.thulac.data.TaggedWord; @@ -6,7 +6,7 @@ import java.io.IOException; import java.util.List; -public class VerbPass implements IAdjustPass { +public class VerbPass implements IPostprocessPass { private Dat vM_dat; private Dat vD_dat; private String tag; @@ -18,7 +18,7 @@ public VerbPass(String filename, String filename2) throws IOException { } @Override - public void adjust(List sentence) { + public void process(List sentence) { if ((this.vM_dat == null) || (this.vD_dat == null)) return; TaggedWord tagged = sentence.get(0), next; diff --git a/src/main/java/org/thunlp/thulac/preprocess/ConvertT2SPass.java b/src/main/java/org/thunlp/thulac/preprocess/ConvertT2SPass.java new file mode 100644 index 0000000..968d627 --- /dev/null +++ b/src/main/java/org/thunlp/thulac/preprocess/ConvertT2SPass.java @@ -0,0 +1,53 @@ +package org.thunlp.thulac.preprocess; + +import org.thunlp.thulac.data.POCGraph; +import org.thunlp.thulac.util.StringUtil; + +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.HashMap; + +/** + * + */ +public class ConvertT2SPass implements IPreprocessPass { + private HashMap t2sMap; + + public ConvertT2SPass(String fileName) throws IOException { + this.t2sMap = new HashMap<>(); + this.loadT2SMap(fileName); + } + + private void loadT2SMap(String filename) throws IOException { + File mapFile = new File(filename); + int recordCount = (int) (mapFile.length() >> 3); + DataInputStream input = new DataInputStream(new FileInputStream(mapFile)); + int[] traditional = new int[recordCount]; + for (int i = 0; i < recordCount; ++i) traditional[i] = input.readInt(); + for (int i = 0; i < recordCount; ++i) { + int simplified = input.readInt(); + this.t2sMap.put(traditional[i], simplified); + } + input.close(); + } + + private int getSimplifiedCodePoint(int c) { + if (this.t2sMap.containsKey(c)) return this.t2sMap.get(c); + return c; + } + + private String convertT2S(String sentence) { + int[] codePoints = StringUtil.toCodePoints(sentence); + StringBuilder sb = new StringBuilder(); + for (int codePoint : codePoints) + sb.appendCodePoint(this.getSimplifiedCodePoint(codePoint)); + return sb.toString(); + } + + @Override + public String process(String raw, POCGraph ignored) { + return this.convertT2S(raw); + } +} diff --git a/src/main/java/org/thunlp/thulac/preprocess/IPreprocessPass.java b/src/main/java/org/thunlp/thulac/preprocess/IPreprocessPass.java new file mode 100644 index 0000000..30afb1d --- /dev/null +++ b/src/main/java/org/thunlp/thulac/preprocess/IPreprocessPass.java @@ -0,0 +1,10 @@ +package org.thunlp.thulac.preprocess; + +import org.thunlp.thulac.data.POCGraph; + +/** + * + */ +public interface IPreprocessPass { + String process(String raw, POCGraph graph); +} diff --git a/src/main/java/org/thunlp/thulac/Preprocessor.java b/src/main/java/org/thunlp/thulac/preprocess/PreprocessPass.java similarity index 73% rename from src/main/java/org/thunlp/thulac/Preprocessor.java rename to src/main/java/org/thunlp/thulac/preprocess/PreprocessPass.java index 53139eb..f0cf71b 100644 --- a/src/main/java/org/thunlp/thulac/Preprocessor.java +++ b/src/main/java/org/thunlp/thulac/preprocess/PreprocessPass.java @@ -1,15 +1,9 @@ -package org.thunlp.thulac; +package org.thunlp.thulac.preprocess; import org.thunlp.thulac.data.POCGraph; import org.thunlp.thulac.util.StringUtil; -import java.io.DataInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.HashMap; - -public class Preprocessor { +public class PreprocessPass implements IPreprocessPass { private static final String OTHER_CODE_POINTS = StringUtil.toString(65292, 12290, 65311, 65281, 65306, 65307, 8216, 8217, 8220, 8221, 12304, 12305, 12289, 12298, 12299, 126, 183, 64, 124, 35, 65509, 37, 8230, 38, 42, 65288, @@ -25,17 +19,11 @@ public class Preprocessor { 63, 9700, 9734, 9733, 8230, 39, 33, 42, 43, 62, 40, 41, 59, 61); private static final String WHITESPACE_CODE_POINTS = StringUtil.toString(32, 12288); - private HashMap t2sMap; - - public Preprocessor() { - this.t2sMap = new HashMap<>(); - } - - public boolean isSinglePunctuation(int c) { + private boolean isSinglePunctuation(int c) { return SINGLE_PUNCTUATION_CODE_POINTS.indexOf(c) != -1; } - public String cleanup(String sentence, POCGraph graph) { + private String cleanup(String sentence, POCGraph graph) { StringBuilder cleaned = new StringBuilder(); graph.clear(); boolean spaceFlag = false, otherFlag = false, @@ -111,29 +99,8 @@ else if (otherFlag) { return cleaned.toString(); } - public void loadT2SMap(String filename) throws IOException { - File mapFile = new File(filename); - int recordCount = (int) (mapFile.length() >> 3); - DataInputStream input = new DataInputStream(new FileInputStream(mapFile)); - int[] traditional = new int[recordCount]; - for (int i = 0; i < recordCount; ++i) traditional[i] = input.readInt(); - for (int i = 0; i < recordCount; ++i) { - int simplified = input.readInt(); - this.t2sMap.put(traditional[i], simplified); - } - input.close(); - } - - private int getSimplifiedCodePoint(int c) { - if (this.t2sMap.containsKey(c)) return this.t2sMap.get(c); - return c; - } - - public String convertT2S(String sentence) { - int[] codePoints = StringUtil.toCodePoints(sentence); - StringBuilder sb = new StringBuilder(); - for (int codePoint : codePoints) - sb.appendCodePoint(this.getSimplifiedCodePoint(codePoint)); - return sb.toString(); + @Override + public String process(String raw, POCGraph graph) { + return this.cleanup(raw, graph); } }