-
Notifications
You must be signed in to change notification settings - Fork 115
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
And standardize them by using an interface. org.thunlp.thulac.passes is renamed to org.thunlp.thulac.postprocess to distinguish it from the new package org.thunlp.thulac.preprocess.
- Loading branch information
Showing
12 changed files
with
132 additions
and
100 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
12 changes: 12 additions & 0 deletions
12
src/main/java/org/thunlp/thulac/postprocess/IPostprocessPass.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
package org.thunlp.thulac.postprocess; | ||
|
||
import org.thunlp.thulac.data.TaggedWord; | ||
|
||
import java.util.List; | ||
|
||
/** | ||
* A single postprocessing step, standardized as an interface so that every | ||
* postprocess pass exposes the same entry point. | ||
* | ||
* The only method, process, receives one sentence as a list of TaggedWord and | ||
* returns nothing. | ||
* | ||
* NOTE(review): because process returns void, implementations presumably report | ||
* their result by modifying the given list in place - confirm against the | ||
* implementing classes, which are not visible here. | ||
*/ | ||
public interface IPostprocessPass { | ||
void process(List<TaggedWord> sentence); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
53 changes: 53 additions & 0 deletions
53
src/main/java/org/thunlp/thulac/preprocess/ConvertT2SPass.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
package org.thunlp.thulac.preprocess; | ||
|
||
import org.thunlp.thulac.data.POCGraph; | ||
import org.thunlp.thulac.util.StringUtil; | ||
|
||
import java.io.DataInputStream; | ||
import java.io.File; | ||
import java.io.FileInputStream; | ||
import java.io.IOException; | ||
import java.util.HashMap; | ||
|
||
/** | ||
* | ||
*/ | ||
public class ConvertT2SPass implements IPreprocessPass { | ||
private HashMap<Integer, Integer> t2sMap; | ||
|
||
public ConvertT2SPass(String fileName) throws IOException { | ||
this.t2sMap = new HashMap<>(); | ||
this.loadT2SMap(fileName); | ||
} | ||
|
||
private void loadT2SMap(String filename) throws IOException { | ||
File mapFile = new File(filename); | ||
int recordCount = (int) (mapFile.length() >> 3); | ||
DataInputStream input = new DataInputStream(new FileInputStream(mapFile)); | ||
int[] traditional = new int[recordCount]; | ||
for (int i = 0; i < recordCount; ++i) traditional[i] = input.readInt(); | ||
for (int i = 0; i < recordCount; ++i) { | ||
int simplified = input.readInt(); | ||
this.t2sMap.put(traditional[i], simplified); | ||
} | ||
input.close(); | ||
} | ||
|
||
private int getSimplifiedCodePoint(int c) { | ||
if (this.t2sMap.containsKey(c)) return this.t2sMap.get(c); | ||
return c; | ||
} | ||
|
||
private String convertT2S(String sentence) { | ||
int[] codePoints = StringUtil.toCodePoints(sentence); | ||
StringBuilder sb = new StringBuilder(); | ||
for (int codePoint : codePoints) | ||
sb.appendCodePoint(this.getSimplifiedCodePoint(codePoint)); | ||
return sb.toString(); | ||
} | ||
|
||
@Override | ||
public String process(String raw, POCGraph ignored) { | ||
return this.convertT2S(raw); | ||
} | ||
} |
10 changes: 10 additions & 0 deletions
10
src/main/java/org/thunlp/thulac/preprocess/IPreprocessPass.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
package org.thunlp.thulac.preprocess; | ||
|
||
import org.thunlp.thulac.data.POCGraph; | ||
|
||
/** | ||
* A single preprocessing step, standardized as an interface so that every | ||
* preprocess pass exposes the same entry point. | ||
* | ||
* The only method, process, takes the raw text together with a POCGraph and | ||
* returns a String. | ||
* | ||
* NOTE(review): the returned String is presumably the text to hand to the next | ||
* step, and the graph presumably lets a pass read or record per-character | ||
* information - confirm against the implementing classes and the callers. | ||
*/ | ||
public interface IPreprocessPass { | ||
String process(String raw, POCGraph graph); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters