Skip to content

Commit

Permalink
Use NIO for CBModel to increase performance
Browse files Browse the repository at this point in the history
  • Loading branch information
std4453 committed Feb 25, 2017
1 parent 03008c7 commit 9ce41e0
Show file tree
Hide file tree
Showing 13 changed files with 126 additions and 158 deletions.
4 changes: 2 additions & 2 deletions src/main/java/org/thunlp/thulac/ReaderInputProvider.java
@@ -1,6 +1,6 @@
package org.thunlp.thulac;

import org.thunlp.thulac.util.InputProviderUtil;
import org.thunlp.thulac.util.InputProviderUtils;

import java.io.BufferedReader;
import java.io.IOException;
Expand All @@ -21,7 +21,7 @@ public ReaderInputProvider(BufferedReader reader) {
public List<String> provideInput() throws IOException {
String line = this.reader.readLine();
if (line == null) return null;
return InputProviderUtil.getLineSegments(line);
return InputProviderUtils.getLineSegments(line);
}

@Override
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/thunlp/thulac/Thulac.java
Expand Up @@ -4,9 +4,9 @@
import org.thunlp.thulac.data.POCGraph;
import org.thunlp.thulac.data.TaggedWord;
import org.thunlp.thulac.postprocess.*;
import org.thunlp.thulac.preprocess.ConvertT2SPass;
import org.thunlp.thulac.preprocess.IPreprocessPass;
import org.thunlp.thulac.preprocess.PreprocessPass;
import org.thunlp.thulac.preprocess.ConvertT2SPass;

import java.io.IOException;
import java.util.ArrayList;
Expand Down
158 changes: 27 additions & 131 deletions src/main/java/org/thunlp/thulac/cb/CBModel.java
@@ -1,144 +1,40 @@
/**
* Created: May 9, 2013 12:22:21 PM
* Project: ThulacJava
*
* @author cxx
* @since JDK 1.6.0_13
* filename: CBModel.java
* description:
*/
package org.thunlp.thulac.cb;

import java.io.File;
import org.thunlp.thulac.util.BufferUtils;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.IntBuffer;
import java.nio.channels.FileChannel;

public class CBModel {

private static int DEC = 1000;

public int l_size; //size of the labels
public int f_size; //size of the features
public int l_size; // size of the labels
public int f_size; // size of the features

public int[] ll_weights; // weights of (label, label)
public int[] fl_weights; // weights of (feature, label)

public double[] ave_ll_weights;
public double[] ave_fl_weights;

public CBModel(int l, int f) {
this.l_size = l;
this.f_size = f;
this.ll_weights = new int[l * l];
this.fl_weights = new int[f * l];
this.ave_ll_weights = new double[l * l];
this.ave_fl_weights = new double[f * l];
}

public void reset_ave_weights() {
this.ave_ll_weights = new double[this.l_size * this.l_size];
this.ave_fl_weights = new double[this.f_size * this.l_size];
}

public void update_ll_weights(int i, int j, int delta, long steps) {
int ind = i * this.l_size + j;
this.ll_weights[ind] += delta;
this.ave_ll_weights[ind] += steps * delta;
}

public void update_fl_weights(int i, int j, int delta, long steps) {
int ind = i * this.l_size + j;
this.fl_weights[ind] += delta;
this.ave_fl_weights[ind] += steps * delta;
}

public void average(int step) {
for (int i = 0; i < this.l_size * this.f_size; i++) {
this.fl_weights[i] = (int) (((double) this.fl_weights[i] - this.ave_fl_weights[i] / (double) step) * DEC + 0.5);
}
for (int i = 0; i < this.l_size * this.l_size; i++) {
this.ll_weights[i] = (int) ((((double) this.ll_weights[i]) - this.ave_ll_weights[i] / (double) step) * DEC + 0.5);
}
}

public static int bytesToInt(byte[] bb, int index) {
return (int) ((((bb[index + 3] & 0xff) << 24)
| ((bb[index + 2] & 0xff) << 16)
| ((bb[index + 1] & 0xff) << 8) | ((bb[index + 0] & 0xff) << 0)));
}

public static byte[] intToBytes(int n) {
byte[] b = new byte[4];
for (int i = 0; i < 4; i++) {
b[i] = (byte) (n >> (8 * i));
}
return b;
}

public CBModel(String filename) throws IOException {
File file = new File(filename);
FileInputStream in = new FileInputStream(file);

byte[] tempbytes = new byte[4];
in.read(tempbytes);
this.l_size = bytesToInt(tempbytes, 0);
in.read(tempbytes);
this.f_size = bytesToInt(tempbytes, 0);

this.ll_weights = new int[this.l_size * this.l_size];
tempbytes = new byte[4 * this.ll_weights.length];
in.read(tempbytes);
for (int i = 0; i < this.ll_weights.length; i++) {
this.ll_weights[i] = bytesToInt(tempbytes, 4 * i);
}

this.fl_weights = new int[this.f_size * this.l_size];
tempbytes = new byte[4 * this.fl_weights.length];
in.read(tempbytes);
for (int i = 0; i < this.fl_weights.length; i++) {
this.fl_weights[i] = bytesToInt(tempbytes, 4 * i);
}
in.close();
}

public void save(String filename) throws IOException {
FileOutputStream out = new FileOutputStream(filename);
out.write(intToBytes(this.l_size));
out.write(intToBytes(this.f_size));
out.flush();
for (int i = 0; i < this.ll_weights.length; i++) {
out.write(intToBytes(this.ll_weights[i]));
}
out.flush();
for (int i = 0; i < this.fl_weights.length; i++) {
out.write(intToBytes(this.fl_weights[i]));
}
out.flush();
out.close();
FileInputStream in = new FileInputStream(filename);
FileChannel channel = in.getChannel();

ByteBuffer header = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN);
header.clear();
channel.read(header);
header.flip();
IntBuffer intHeader = header.asIntBuffer();
this.l_size = intHeader.get();
this.f_size = intHeader.get();

int llSize = this.l_size * this.l_size, flSize = this.l_size * this.f_size;
this.ll_weights = new int[llSize];
this.fl_weights = new int[flSize];
ByteBuffer buf = ByteBuffer.allocate(64 * 1024).order(ByteOrder.LITTLE_ENDIAN);
buf.clear();
BufferUtils.readInts(channel, buf, this.ll_weights, this.fl_weights);

channel.close();
}

/**
* <p>Title:main</p>
* <p>Description:<p>
*
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
/*
int[] weights = new int[3];
weights[0] = 1;
weights[1] = 3;
weights[2] = 5;
for(int i = 0;i < weights.length ; i ++){
System.out.println(weights[i]);
}
weights = new int[4];
for(int i = 0;i < weights.length ; i ++){
System.out.println(weights[i]);
}
*/
}

}
4 changes: 2 additions & 2 deletions src/main/java/org/thunlp/thulac/data/Dat.java
@@ -1,6 +1,6 @@
package org.thunlp.thulac.data;

import org.thunlp.thulac.util.StringUtil;
import org.thunlp.thulac.util.StringUtils;

import java.io.FileInputStream;
import java.io.IOException;
Expand Down Expand Up @@ -48,7 +48,7 @@ public Dat(String filename) throws IOException {
public int match(String word) {
int ind = 0;
int base = 0;
int[] codePoints = StringUtil.toCodePoints(word);
int[] codePoints = StringUtils.toCodePoints(word);
for (int c : codePoints) {
ind = this.dat[ind << 1] + c;
if (ind >= this.datSize || this.dat[(ind << 1) + 1] != base) return -1;
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/org/thunlp/thulac/postprocess/FilterPass.java
Expand Up @@ -2,7 +2,7 @@

import org.thunlp.thulac.data.Dat;
import org.thunlp.thulac.data.TaggedWord;
import org.thunlp.thulac.util.StringUtil;
import org.thunlp.thulac.util.StringUtils;

import java.io.IOException;
import java.util.Arrays;
Expand All @@ -14,10 +14,10 @@ public class FilterPass implements IPostprocessPass {
private static final Set<String> ALLOWED_TAGS = new HashSet<>(Arrays.asList(
"n", "np", "ns", "ni", "nz", "v", "a", "id", "t", "uw"));
private static final String ARABIC_NUMBER_CODE_POINTS =
StringUtil.toString(48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
StringUtils.toString(48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
65296, 65297, 65298, 65299, 65300, 65301, 65302, 65303, 65304, 65305);
private static final String CHINESE_NUMBER_CODE_POINTS =
StringUtil.toString(12295, 19968, 20108, 19977, 22235,
StringUtils.toString(12295, 19968, 20108, 19977, 22235,
20116, 20845, 19971, 20843, 20061);

private Dat xu_dat;
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/org/thunlp/thulac/postprocess/NegWordPass.java
Expand Up @@ -2,7 +2,7 @@

import org.thunlp.thulac.data.Dat;
import org.thunlp.thulac.data.TaggedWord;
import org.thunlp.thulac.util.StringUtil;
import org.thunlp.thulac.util.StringUtils;

import java.io.IOException;
import java.util.List;
Expand All @@ -22,11 +22,11 @@ public void process(List<TaggedWord> sentence) {
TaggedWord tagged = sentence.get(i);
if (this.neg_dat.match(tagged.word) != -1) {
TaggedWord newWord = new TaggedWord(tagged.separator);
newWord.word = StringUtil.toString(
newWord.word = StringUtils.toString(
tagged.word.codePointAt(tagged.word.offsetByCodePoints(0, 1)));
newWord.tag = "v";
sentence.add(i + 1, newWord);
tagged.word = StringUtil.toString(tagged.word.codePointAt(0));
tagged.word = StringUtils.toString(tagged.word.codePointAt(0));
tagged.tag = "d";
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/main/java/org/thunlp/thulac/postprocess/TimeWordPass.java
@@ -1,18 +1,18 @@
package org.thunlp.thulac.postprocess;

import org.thunlp.thulac.data.TaggedWord;
import org.thunlp.thulac.util.StringUtil;
import org.thunlp.thulac.util.StringUtils;

import java.util.List;

public class TimeWordPass implements IPostprocessPass {
private static final String ARABIC_NUMBER_CODE_POINTS =
StringUtil.toString(48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
StringUtils.toString(48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
65296, 65297, 65298, 65299, 65300, 65301, 65302, 65303, 65304, 65305);
private static final String TIME_WORD_CODE_POINTS =
StringUtil.toString(24180, 26376, 26085, 21495, 26102, 28857, 20998, 31186);
StringUtils.toString(24180, 26376, 26085, 21495, 26102, 28857, 20998, 31186);
private static final String OTHER_CODE_POINTS =
StringUtil.toString(65292, 12290, 65311, 65281, 65306, 65307, 8216, 8217,
StringUtils.toString(65292, 12290, 65311, 65281, 65306, 65307, 8216, 8217,
8220, 8221, 12304, 12305, 12289, 12298, 12299, 126, 183, 64, 124, 35,
65509, 37, 8230, 38, 42, 65288, 65289, 8212, 45, 43, 61, 44, 46, 60,
62, 63, 47, 33, 59, 58, 39, 34, 123, 125, 91, 93, 92, 124, 35, 36, 37,
Expand Down
@@ -1,7 +1,7 @@
package org.thunlp.thulac.preprocess;

import org.thunlp.thulac.data.POCGraph;
import org.thunlp.thulac.util.StringUtil;
import org.thunlp.thulac.util.StringUtils;

import java.io.DataInputStream;
import java.io.File;
Expand Down Expand Up @@ -39,7 +39,7 @@ private int getSimplifiedCodePoint(int c) {
}

private String convertT2S(String sentence) {
int[] codePoints = StringUtil.toCodePoints(sentence);
int[] codePoints = StringUtils.toCodePoints(sentence);
StringBuilder sb = new StringBuilder();
for (int codePoint : codePoints)
sb.appendCodePoint(this.getSimplifiedCodePoint(codePoint));
Expand Down
10 changes: 5 additions & 5 deletions src/main/java/org/thunlp/thulac/preprocess/PreprocessPass.java
@@ -1,10 +1,10 @@
package org.thunlp.thulac.preprocess;

import org.thunlp.thulac.data.POCGraph;
import org.thunlp.thulac.util.StringUtil;
import org.thunlp.thulac.util.StringUtils;

public class PreprocessPass implements IPreprocessPass {
private static final String OTHER_CODE_POINTS = StringUtil.toString(65292, 12290,
private static final String OTHER_CODE_POINTS = StringUtils.toString(65292, 12290,
65311, 65281, 65306, 65307, 8216, 8217, 8220, 8221, 12304, 12305,
12289, 12298, 12299, 126, 183, 64, 124, 35, 65509, 37, 8230, 38, 42, 65288,
65289, 8212, 45, 43, 61, 44, 46, 60, 62, 63, 47, 33, 59, 58, 39, 34, 123, 125,
Expand All @@ -13,11 +13,11 @@ public class PreprocessPass implements IPreprocessPass {
83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106,
107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
122, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57);
private static final String SINGLE_PUNCTUATION_CODE_POINTS = StringUtil.toString(
private static final String SINGLE_PUNCTUATION_CODE_POINTS = StringUtils.toString(
65292, 12290, 65311, 65281, 65306, 65307, 8216, 8217, 8220, 8221, 1230, 12304,
12305, 12289, 12298, 12299, 64, 35, 65288, 65289, 34, 91, 93, 126, 47, 44, 58,
63, 9700, 9734, 9733, 8230, 39, 33, 42, 43, 62, 40, 41, 59, 61);
private static final String WHITESPACE_CODE_POINTS = StringUtil.toString(32, 12288);
private static final String WHITESPACE_CODE_POINTS = StringUtils.toString(32, 12288);

private boolean isSinglePunctuation(int c) {
return SINGLE_PUNCTUATION_CODE_POINTS.indexOf(c) != -1;
Expand All @@ -30,7 +30,7 @@ private String cleanup(String sentence, POCGraph graph) {
singlePunctuationFlag = false, titleFlag = false;

int titleStart = 0;
int[] codePoints = StringUtil.toCodePoints(sentence);
int[] codePoints = StringUtils.toCodePoints(sentence);
for (int c : codePoints) {
if (WHITESPACE_CODE_POINTS.indexOf(c) != -1) {
otherFlag = false;
Expand Down

0 comments on commit 9ce41e0

Please sign in to comment.