Skip to content

Commit

Permalink
Split Preprocessor into two classes
Browse files Browse the repository at this point in the history
And standardize them by using an interface. org.thunlp.thulac.passes is renamed to org.thunlp.thulac.postprocess to distinguish it from the new package org.thunlp.thulac.preprocess.
  • Loading branch information
std4453 committed Feb 24, 2017
1 parent 785e24c commit 03008c7
Show file tree
Hide file tree
Showing 12 changed files with 132 additions and 100 deletions.
62 changes: 32 additions & 30 deletions src/main/java/org/thunlp/thulac/Thulac.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
import org.thunlp.thulac.cb.CBTaggingDecoder;
import org.thunlp.thulac.data.POCGraph;
import org.thunlp.thulac.data.TaggedWord;
import org.thunlp.thulac.passes.*;
import org.thunlp.thulac.postprocess.*;
import org.thunlp.thulac.preprocess.IPreprocessPass;
import org.thunlp.thulac.preprocess.PreprocessPass;
import org.thunlp.thulac.preprocess.ConvertT2SPass;

import java.io.IOException;
import java.util.ArrayList;
Expand Down Expand Up @@ -65,45 +68,44 @@ public static void split(
output.onProgramStart();

// segmentation
POCGraph pocGraph = new POCGraph();
CBTaggingDecoder cwsTaggingDecoder = new CBTaggingDecoder();
cwsTaggingDecoder.threshold = segOnly ? 0 : 10000;
cwsTaggingDecoder.separator = separator;
POCGraph graph = new POCGraph();
CBTaggingDecoder taggingDecoder = new CBTaggingDecoder();
taggingDecoder.threshold = segOnly ? 0 : 10000;
taggingDecoder.separator = separator;
String prefix = modelDir + (segOnly ? "cws_" : "model_c_");
cwsTaggingDecoder.init(prefix + "model.bin", prefix + "dat.bin",
taggingDecoder.init(prefix + "model.bin", prefix + "dat.bin",
prefix + "label.txt");
cwsTaggingDecoder.setLabelTrans();
taggingDecoder.setLabelTrans();

// preprocessor
Preprocessor preprocessor = new Preprocessor();
if (useT2S) preprocessor.loadT2SMap(modelDir + "t2s.dat");
// preprocess passes
List<IPreprocessPass> pre = new ArrayList<>();
pre.add(new PreprocessPass());
if (useT2S) pre.add(new ConvertT2SPass(modelDir + "t2s.dat"));

// adjustment passes
List<IAdjustPass> passes = new ArrayList<>();
passes.add(new PostprocessPass(modelDir + "ns.dat", "ns", false));
passes.add(new PostprocessPass(modelDir + "idiom.dat", "i", false));
passes.add(new PunctuationPass(modelDir + "singlepun.dat"));
passes.add(new TimeWordPass());
passes.add(new NegWordPass(modelDir + "neg.dat"));
if (userDict != null) passes.add(new PostprocessPass(userDict, "uw", true));
if (useFilter) // filter
passes.add(new FilterPass(modelDir + "xu.dat", modelDir + "time.dat"));
// postprocess passes
List<IPostprocessPass> post = new ArrayList<>();
post.add(new PostprocessPass(modelDir + "ns.dat", "ns", false));
post.add(new PostprocessPass(modelDir + "idiom.dat", "i", false));
post.add(new PunctuationPass(modelDir + "singlepun.dat"));
post.add(new TimeWordPass());
post.add(new NegWordPass(modelDir + "neg.dat"));
if (userDict != null) post.add(new PostprocessPass(userDict, "uw", true));
if (useFilter)
post.add(new FilterPass(modelDir + "xu.dat", modelDir + "time.dat"));

// main loop
for (List<String> vec = input.provideInput(); vec != null; vec = input.provideInput()) {
for (
List<String> lineSegments = input.provideInput();
lineSegments != null;
lineSegments = input.provideInput()) {
output.handleLineStart();
for (String raw : vec) {
// preprocess
raw = preprocessor.cleanup(raw, pocGraph);
if (useT2S) raw = preprocessor.convertT2S(raw);
if (raw.isEmpty()) continue;
for (String raw : lineSegments) {
for (IPreprocessPass pass : pre) raw = pass.process(raw, graph);

// segmentation
List<TaggedWord> tagged = new Vector<>();
cwsTaggingDecoder.segment(raw, pocGraph, tagged);
taggingDecoder.segment(raw, graph, tagged);

// adjustment passes
for (IAdjustPass pass : passes) pass.adjust(tagged);
for (IPostprocessPass pass : post) pass.process(tagged);

output.handleLineSegment(tagged, segOnly);
}
Expand Down
12 changes: 0 additions & 12 deletions src/main/java/org/thunlp/thulac/passes/IAdjustPass.java

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.thunlp.thulac.passes;
package org.thunlp.thulac.postprocess;

import org.thunlp.thulac.data.Dat;
import org.thunlp.thulac.data.TaggedWord;
Expand All @@ -10,7 +10,7 @@
import java.util.List;
import java.util.Set;

public class FilterPass implements IAdjustPass {
public class FilterPass implements IPostprocessPass {
private static final Set<String> ALLOWED_TAGS = new HashSet<>(Arrays.asList(
"n", "np", "ns", "ni", "nz", "v", "a", "id", "t", "uw"));
private static final String ARABIC_NUMBER_CODE_POINTS =
Expand All @@ -29,7 +29,7 @@ public FilterPass(String xuWordFile, String timeWordFile) throws IOException {
}

@Override
public void adjust(List<TaggedWord> sentence) {
public void process(List<TaggedWord> sentence) {
if (this.xu_dat == null || this.time_dat == null) return;

for (int i = sentence.size() - 1; i >= 0; --i) {
Expand Down
12 changes: 12 additions & 0 deletions src/main/java/org/thunlp/thulac/postprocess/IPostprocessPass.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package org.thunlp.thulac.postprocess;

import org.thunlp.thulac.data.TaggedWord;

import java.util.List;

/**
 * A pass that is run over the list of tagged words produced by segmentation,
 * before the result is handed to the output handler.
 */
public interface IPostprocessPass {
/**
 * Processes one segmented sentence.
 *
 * The method returns nothing, so any adjustment an implementation makes has
 * to be applied to the given list itself.
 *
 * @param sentence the tagged words of the sentence to process
 */
void process(List<TaggedWord> sentence);
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.thunlp.thulac.passes;
package org.thunlp.thulac.postprocess;

import org.thunlp.thulac.data.Dat;
import org.thunlp.thulac.data.TaggedWord;
Expand All @@ -7,15 +7,15 @@
import java.io.IOException;
import java.util.List;

public class NegWordPass implements IAdjustPass {
public class NegWordPass implements IPostprocessPass {
private Dat neg_dat;

public NegWordPass(String filename) throws IOException {
this.neg_dat = new Dat(filename);
}

@Override
public void adjust(List<TaggedWord> sentence) {
public void process(List<TaggedWord> sentence) {
if (this.neg_dat == null) return;

for (int i = sentence.size() - 1; i >= 0; --i) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.thunlp.thulac.passes;
package org.thunlp.thulac.postprocess;

import org.thunlp.thulac.data.Dat;
import org.thunlp.thulac.data.DatMaker;
Expand All @@ -8,7 +8,7 @@
import java.util.ArrayList;
import java.util.List;

public class PostprocessPass implements IAdjustPass {
public class PostprocessPass implements IPostprocessPass {
private Dat p_dat;
private String tag;

Expand All @@ -20,7 +20,7 @@ public PostprocessPass(String filename, String tag, boolean isTxt) throws
}

@Override
public void adjust(List<TaggedWord> sentence) {
public void process(List<TaggedWord> sentence) {
if (this.p_dat == null) return;

List<String> tmp = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.thunlp.thulac.passes;
package org.thunlp.thulac.postprocess;

import org.thunlp.thulac.data.Dat;
import org.thunlp.thulac.data.TaggedWord;
Expand All @@ -7,15 +7,15 @@
import java.util.List;
import java.util.Vector;

public class PunctuationPass implements IAdjustPass {
public class PunctuationPass implements IPostprocessPass {
private Dat p_dat;

public PunctuationPass(String filename) throws IOException {
this.p_dat = new Dat(filename);
}

@Override
public void adjust(List<TaggedWord> sentence) {
public void process(List<TaggedWord> sentence) {
if (this.p_dat == null) return;

Vector<String> tmp = new Vector<>();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
package org.thunlp.thulac.passes;
package org.thunlp.thulac.postprocess;

import org.thunlp.thulac.data.TaggedWord;
import org.thunlp.thulac.util.StringUtil;

import java.util.List;

public class TimeWordPass implements IAdjustPass {
public class TimeWordPass implements IPostprocessPass {
private static final String ARABIC_NUMBER_CODE_POINTS =
StringUtil.toString(48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
65296, 65297, 65298, 65299, 65300, 65301, 65302, 65303, 65304, 65305);
Expand Down Expand Up @@ -48,7 +48,7 @@ private boolean isHttpWord(String word) {
}

@Override
public void adjust(List<TaggedWord> sentence) {
public void process(List<TaggedWord> sentence) {
this.processTimeWords(sentence);
this.processDoubleWords(sentence);
this.processHttpWords(sentence);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
package org.thunlp.thulac.passes;
package org.thunlp.thulac.postprocess;

import org.thunlp.thulac.data.Dat;
import org.thunlp.thulac.data.TaggedWord;

import java.io.IOException;
import java.util.List;

public class VerbPass implements IAdjustPass {
public class VerbPass implements IPostprocessPass {
private Dat vM_dat;
private Dat vD_dat;
private String tag;
Expand All @@ -18,7 +18,7 @@ public VerbPass(String filename, String filename2) throws IOException {
}

@Override
public void adjust(List<TaggedWord> sentence) {
public void process(List<TaggedWord> sentence) {
if ((this.vM_dat == null) || (this.vD_dat == null)) return;

TaggedWord tagged = sentence.get(0), next;
Expand Down
53 changes: 53 additions & 0 deletions src/main/java/org/thunlp/thulac/preprocess/ConvertT2SPass.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package org.thunlp.thulac.preprocess;

import org.thunlp.thulac.data.POCGraph;
import org.thunlp.thulac.util.StringUtil;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;

/**
 * A preprocess pass which replaces code points of the raw input with the code
 * points they are mapped to by a table loaded from a binary file. The table is
 * named "t2s" (presumably traditional-to-simplified Chinese); code points that
 * have no entry in the table are kept unchanged.
 */
public class ConvertT2SPass implements IPreprocessPass {
    /** Maps a source code point to the code point that replaces it. */
    private final HashMap<Integer, Integer> t2sMap;

    /**
     * Creates the pass and loads its mapping table.
     *
     * @param fileName path of the binary mapping file
     * @throws IOException if the file cannot be opened or read completely
     */
    public ConvertT2SPass(String fileName) throws IOException {
        this.t2sMap = new HashMap<>();
        this.loadT2SMap(fileName);
    }

    /**
     * Loads the mapping table from {@code filename}.
     *
     * The file is read as two consecutive arrays of 4-byte integers (as read
     * by {@link DataInputStream#readInt()}): first all keys, then all values,
     * so the number of records is the file length divided by 8.
     *
     * @param filename path of the binary mapping file
     * @throws IOException if the file cannot be opened or read completely
     */
    private void loadT2SMap(String filename) throws IOException {
        File mapFile = new File(filename);
        int recordCount = (int) (mapFile.length() >> 3);
        int[] traditional = new int[recordCount];
        // try-with-resources closes the stream even when readInt() throws,
        // so a truncated or unreadable file no longer leaks the file handle.
        try (DataInputStream input =
                     new DataInputStream(new FileInputStream(mapFile))) {
            for (int i = 0; i < recordCount; ++i) {
                traditional[i] = input.readInt();
            }
            for (int i = 0; i < recordCount; ++i) {
                int simplified = input.readInt();
                this.t2sMap.put(traditional[i], simplified);
            }
        }
    }

    /**
     * Returns the code point mapped to {@code c}, or {@code c} itself when the
     * table has no entry for it.
     */
    private int getSimplifiedCodePoint(int c) {
        // Single lookup; the map never stores null values, so a null result
        // means "no mapping".
        Integer mapped = this.t2sMap.get(c);
        return mapped != null ? mapped : c;
    }

    /**
     * Converts every code point of {@code sentence} through the mapping table.
     */
    private String convertT2S(String sentence) {
        int[] codePoints = StringUtil.toCodePoints(sentence);
        StringBuilder sb = new StringBuilder();
        for (int codePoint : codePoints) {
            sb.appendCodePoint(this.getSimplifiedCodePoint(codePoint));
        }
        return sb.toString();
    }

    /**
     * Converts {@code raw} through the mapping table.
     *
     * @param raw     the raw input string
     * @param ignored not used by this pass
     * @return the converted string
     */
    @Override
    public String process(String raw, POCGraph ignored) {
        return this.convertT2S(raw);
    }
}
10 changes: 10 additions & 0 deletions src/main/java/org/thunlp/thulac/preprocess/IPreprocessPass.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package org.thunlp.thulac.preprocess;

import org.thunlp.thulac.data.POCGraph;

/**
 * A pass that is run over a raw input string before it is passed to the
 * segmentation step.
 */
public interface IPreprocessPass {
/**
 * Processes one raw input string.
 *
 * @param raw   the raw string to process
 * @param graph the graph shared with the segmentation step; a pass may
 *              update it or ignore it
 * @return the processed string, which replaces {@code raw} for the
 *         following passes and for segmentation
 */
String process(String raw, POCGraph graph);
}
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
package org.thunlp.thulac;
package org.thunlp.thulac.preprocess;

import org.thunlp.thulac.data.POCGraph;
import org.thunlp.thulac.util.StringUtil;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;

public class Preprocessor {
public class PreprocessPass implements IPreprocessPass {
private static final String OTHER_CODE_POINTS = StringUtil.toString(65292, 12290,
65311, 65281, 65306, 65307, 8216, 8217, 8220, 8221, 12304, 12305,
12289, 12298, 12299, 126, 183, 64, 124, 35, 65509, 37, 8230, 38, 42, 65288,
Expand All @@ -25,17 +19,11 @@ public class Preprocessor {
63, 9700, 9734, 9733, 8230, 39, 33, 42, 43, 62, 40, 41, 59, 61);
private static final String WHITESPACE_CODE_POINTS = StringUtil.toString(32, 12288);

private HashMap<Integer, Integer> t2sMap;

public Preprocessor() {
this.t2sMap = new HashMap<>();
}

public boolean isSinglePunctuation(int c) {
private boolean isSinglePunctuation(int c) {
return SINGLE_PUNCTUATION_CODE_POINTS.indexOf(c) != -1;
}

public String cleanup(String sentence, POCGraph graph) {
private String cleanup(String sentence, POCGraph graph) {
StringBuilder cleaned = new StringBuilder();
graph.clear();
boolean spaceFlag = false, otherFlag = false,
Expand Down Expand Up @@ -111,29 +99,8 @@ else if (otherFlag) {
return cleaned.toString();
}

public void loadT2SMap(String filename) throws IOException {
File mapFile = new File(filename);
int recordCount = (int) (mapFile.length() >> 3);
DataInputStream input = new DataInputStream(new FileInputStream(mapFile));
int[] traditional = new int[recordCount];
for (int i = 0; i < recordCount; ++i) traditional[i] = input.readInt();
for (int i = 0; i < recordCount; ++i) {
int simplified = input.readInt();
this.t2sMap.put(traditional[i], simplified);
}
input.close();
}

private int getSimplifiedCodePoint(int c) {
if (this.t2sMap.containsKey(c)) return this.t2sMap.get(c);
return c;
}

public String convertT2S(String sentence) {
int[] codePoints = StringUtil.toCodePoints(sentence);
StringBuilder sb = new StringBuilder();
for (int codePoint : codePoints)
sb.appendCodePoint(this.getSimplifiedCodePoint(codePoint));
return sb.toString();
@Override
public String process(String raw, POCGraph graph) {
return this.cleanup(raw, graph);
}
}

0 comments on commit 03008c7

Please sign in to comment.