Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#42 internally encode all string as utf-8 byte[] instead of default (… #43

Merged
merged 1 commit into from
Jul 21, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions src/main/java/com/github/terma/fastselect/FastSelect.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@
* return data. That's why this storage not good for selection big portion of data compare to original result set.
* <p>
* We use {@link MethodHandle#invoke(Object...)} here to read field values from an object.
* You could be surprised but it has same performance as normal reflect. Other alternative
* which you can think will be faster {@link MethodHandle#invokeExact(Object...)} however it's not.
* You could be surprised but it has same performance as normal access.
* <p>
* More information about that:
* <a href="https://gist.github.com/raphw/881e1745996f9d314ab0#file-result-field-txt">
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/com/github/terma/fastselect/StringRequest.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package com.github.terma.fastselect;

import com.github.terma.fastselect.data.StringData;
import com.github.terma.fastselect.utils.Utf8Utils;

import java.util.Arrays;

Expand All @@ -30,7 +31,7 @@ public class StringRequest extends ColumnRequest {

public StringRequest(String name, String value) {
super(name);
bytes = value.getBytes();
bytes = Utf8Utils.stringToBytes(value);
}

@Override
Expand All @@ -42,7 +43,7 @@ public boolean checkValue(int position) {

@Override
public String toString() {
return name + " = '" + new String(bytes) + "'";
return name + " = '" + Utf8Utils.bytesToString(bytes) + "'";
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
* <p>
* To use that type of data field should have type {@link String} and additionally
* marked by {@link StringCompressedByte}
* <p>
* Saves and loads strings as their UTF-8 encoded <code>byte[]</code> representation
*
* @see StringCompressedShortData
* @see StringData
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
* <p>
* To use that type of data field should have type {@link String} and additionally
* marked by {@link StringCompressedInt}
* <p>
* Saves and loads strings as their UTF-8 encoded <code>byte[]</code> representation
*
* @see StringCompressedByteData
* @see StringCompressedShortData
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
* <p>
* To use that type of data field should have type {@link String} and additionally
* marked by {@link StringCompressedShort}
* <p>
* Saves and loads strings as their UTF-8 encoded <code>byte[]</code> representation
*
* @see StringCompressedByteData
* @see StringData
Expand Down
12 changes: 10 additions & 2 deletions src/main/java/com/github/terma/fastselect/data/StringData.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,17 @@

package com.github.terma.fastselect.data;

import com.github.terma.fastselect.utils.Utf8Utils;

import java.io.IOException;
import java.nio.ByteBuffer;

/**
* Simple string storage
* <p>
* Internally stores string data as UTF-8 encoded <code>byte[]</code>.
* Save and load use the same <code>byte[]</code> representation.
*/
public class StringData implements Data {

private static final byte[] ZERO = new byte[0];
Expand All @@ -34,7 +42,7 @@ public StringData(StringData data, byte[] needToCopy) {
}

public void add(String v) {
final byte[] bytes = v == null ? ZERO : v.getBytes();
final byte[] bytes = v == null ? ZERO : Utf8Utils.stringToBytes(v);
data.add(bytes);
}

Expand All @@ -56,7 +64,7 @@ public void load(String dataClass, ByteBuffer buffer, int size) throws IOExcepti
@Override
public Object get(int position) {
final byte[] bytes = getRaw(position);
return new String(bytes);
return Utf8Utils.bytesToString(bytes);
}

@Override
Expand Down
10 changes: 5 additions & 5 deletions src/main/java/com/github/terma/fastselect/utils/IOUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public static long readLong(final FileChannel fileChannel) throws IOException {

public static void writeString(final FileChannel fileChannel, final String value) throws IOException {
if (value != null) {
byte[] b = value.getBytes();
byte[] b = Utf8Utils.stringToBytes(value);
writeInt(fileChannel, b.length);
fileChannel.write(ByteBuffer.wrap(b));
} else {
Expand All @@ -65,7 +65,7 @@ public static void writeString(final FileChannel fileChannel, final String value

public static void writeString(final ByteBuffer buffer, final String value) throws IOException {
if (value != null) {
byte[] b = value.getBytes();
byte[] b = Utf8Utils.stringToBytes(value);
buffer.putInt(b.length);
buffer.put(b);
} else {
Expand All @@ -75,7 +75,7 @@ public static void writeString(final ByteBuffer buffer, final String value) thro

public static int getStringBytesSize(final String value) {
if (value != null) {
byte[] b = value.getBytes();
byte[] b = Utf8Utils.stringToBytes(value);
return Data.INT_BYTES + b.length;
} else {
return Data.INT_BYTES;
Expand All @@ -89,7 +89,7 @@ public static String readString(FileChannel fileChannel) throws IOException {
} else {
final byte[] b = new byte[size];
fileChannel.read(ByteBuffer.wrap(b));
return new String(b);
return Utf8Utils.bytesToString(b);
}
}

Expand All @@ -100,7 +100,7 @@ public static String readString(ByteBuffer buffer) throws IOException {
} else {
final byte[] b = new byte[size];
buffer.get(b);
return new String(b);
return Utf8Utils.bytesToString(b);
}
}

Expand Down
21 changes: 21 additions & 0 deletions src/main/java/com/github/terma/fastselect/utils/Utf8Utils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package com.github.terma.fastselect.utils;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/**
 * Static helpers that convert between {@link String} and <code>byte[]</code>
 * using UTF-8 only, so the result never depends on the platform default charset.
 * <p>
 * Not instantiable.
 */
public final class Utf8Utils {

    /**
     * The single charset used for every conversion in this class. Taken from
     * {@link StandardCharsets} so no lookup by name is needed at class load.
     */
    private static final Charset CHARSET = StandardCharsets.UTF_8;

    private Utf8Utils() {
        throw new UnsupportedOperationException("Just util class no instances!");
    }

    /**
     * Encodes the given string as UTF-8 bytes.
     *
     * @param string string to encode, must not be <code>null</code>
     * @return new array with the UTF-8 encoding of the string, empty for an empty string
     * @throws NullPointerException if <code>string</code> is <code>null</code>
     */
    public static byte[] stringToBytes(String string) {
        return string.getBytes(CHARSET);
    }

    /**
     * Decodes the given UTF-8 bytes into a string.
     *
     * @param bytes UTF-8 encoded bytes, must not be <code>null</code>
     * @return decoded string, empty for an empty array
     * @throws NullPointerException if <code>bytes</code> is <code>null</code>
     */
    public static String bytesToString(byte[] bytes) {
        return new String(bytes, CHARSET);
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -16,31 +16,36 @@

package com.github.terma.fastselect.benchmark;

import com.github.terma.fastselect.utils.Utf8Utils;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

import java.nio.charset.Charset;
import java.util.Random;
import java.util.concurrent.TimeUnit;

/**
* Mac Air
* <pre>
* Benchmark Mode Cnt Score Error Units
* StringGetBytesBenchmark.createString avgt 833.393 ms/op
* StringGetBytesBenchmark.createStringGetBytes avgt 1500.364 ms/op
* Benchmark Mode Cnt Score Error Units
* StringGetBytesBenchmark.createString avgt 833.600 ms/op
* StringGetBytesBenchmark.createStringGetBytes avgt 1364.009 ms/op <<< was before
* StringGetBytesBenchmark.createStringGetBytesAscII avgt 1250.423 ms/op
* StringGetBytesBenchmark.createStringGetBytesUtf8 avgt 1500.474 ms/op
* </pre>
*/
@Fork(value = 1, jvmArgs = "-Xmx6g")
@BenchmarkMode({Mode.AverageTime})
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(timeUnit = TimeUnit.SECONDS, time = 15, iterations = 1)
@Measurement(timeUnit = TimeUnit.SECONDS, time = 15, iterations = 1, batchSize = 6000000)
@Warmup(time = 15, iterations = 1)
@Measurement(time = 15, iterations = 1, batchSize = 6000000)
public class StringGetBytesBenchmark {

private Charset ascIICharset = Charset.forName("ascII");
private Random random = new Random();

public static void main(String[] args) throws RunnerException {
Expand All @@ -65,4 +70,14 @@ public Object createStringGetBytes() throws Exception {
return ("SOME STRING " + random.nextInt()).getBytes();
}

/**
 * Measures building a random-suffixed string and encoding it with the
 * explicitly supplied {@code ascIICharset}.
 */
@Benchmark
public Object createStringGetBytesAscII() throws Exception {
    final String generated = "SOME STRING " + random.nextInt();
    return generated.getBytes(ascIICharset);
}

/**
 * Measures building a random-suffixed string and encoding it through
 * {@code Utf8Utils.stringToBytes}.
 */
@Benchmark
public Object createStringGetBytesUtf8() throws Exception {
    final String generated = "SOME STRING " + random.nextInt();
    return Utf8Utils.stringToBytes(generated);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,20 @@ public void saveLoad() throws IOException {
Assert.assertEquals(null, data1.get(3));
}

/**
 * Saves a single non-ASCII (Cyrillic) value into a buffer, loads it into a
 * fresh instance and checks that the value comes back unchanged.
 */
@Test
public void saveLoadNonAscIICharacters() throws IOException {
    ByteBuffer buffer = ByteBuffer.allocate(1000);

    StringCompressedByteData data = new StringCompressedByteData(100);
    data.add("юг");
    data.save(buffer);
    // switch the buffer from writing to reading before loading it back
    buffer.flip();

    StringCompressedByteData data1 = new StringCompressedByteData(100);
    data1.load("", buffer, 1);

    // JUnit contract is assertEquals(expected, actual)
    Assert.assertEquals(1, data1.size());
    Assert.assertEquals("юг", data1.get(0));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,20 @@ public void saveLoad() throws IOException {
Assert.assertEquals(null, data1.get(3));
}

/**
 * Saves a single non-ASCII (CJK) value into a buffer, loads it into a
 * fresh instance and checks that the value comes back unchanged.
 */
@Test
public void saveLoadNonAscIICharacters() throws IOException {
    ByteBuffer buffer = ByteBuffer.allocate(1024 * 1024);

    StringCompressedIntData data = new StringCompressedIntData(100);
    data.add("侍");
    data.save(buffer);
    // switch the buffer from writing to reading before loading it back
    buffer.flip();

    StringCompressedIntData data1 = new StringCompressedIntData(100);
    data1.load("", buffer, 1);

    // JUnit contract is assertEquals(expected, actual)
    Assert.assertEquals(1, data1.size());
    Assert.assertEquals("侍", data1.get(0));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,20 @@ public void saveLoad() throws IOException {
Assert.assertEquals(null, data1.get(3));
}

/**
 * Saves a single non-ASCII (CJK) value into a buffer, loads it into a
 * fresh instance and checks that the value comes back unchanged.
 */
@Test
public void saveLoadNonAscIICharacters() throws IOException {
    ByteBuffer buffer = ByteBuffer.allocate(1024 * 1024);

    StringCompressedShortData data = new StringCompressedShortData(100);
    data.add("侍");
    data.save(buffer);
    // switch the buffer from writing to reading before loading it back
    buffer.flip();

    StringCompressedShortData data1 = new StringCompressedShortData(100);
    data1.load("", buffer, 1);

    // JUnit contract is assertEquals(expected, actual)
    Assert.assertEquals(1, data1.size());
    Assert.assertEquals("侍", data1.get(0));
}

}
41 changes: 41 additions & 0 deletions src/test/java/com/github/terma/fastselect/data/StringDataTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
import junit.framework.Assert;
import org.junit.Test;

import java.io.IOException;
import java.nio.ByteBuffer;

public class StringDataTest {

@Test
Expand Down Expand Up @@ -80,4 +83,42 @@ public void provideInc() {
Assert.assertEquals(33, new StringData(33).inc());
}

/**
 * Saves a single non-ASCII (CJK) value into a buffer, loads it into a
 * fresh instance and checks that the value comes back unchanged.
 */
@Test
public void saveLoadNonAscIICharacters() throws IOException {
    ByteBuffer buffer = ByteBuffer.allocate(1024 * 1024);

    StringData data = new StringData(100);
    data.add("侍");
    data.save(buffer);
    // switch the buffer from writing to reading before loading it back
    buffer.flip();

    StringData data1 = new StringData(100);
    data1.load("", buffer, 1);

    // JUnit contract is assertEquals(expected, actual)
    Assert.assertEquals(1, data1.size());
    Assert.assertEquals("侍", data1.get(0));
}

/**
 * Saves four values (two regular strings, <code>null</code> and an empty string)
 * into a buffer, loads them into a fresh instance and checks each position.
 * A <code>null</code> value is expected to come back as an empty string.
 */
@Test
public void saveLoad() throws IOException {
    ByteBuffer buffer = ByteBuffer.allocate(1024 * 1024);

    StringData data = new StringData(100);
    data.add("arg");
    data.add("Z");
    data.add(null);
    data.add("");
    data.save(buffer);
    // switch the buffer from writing to reading before loading it back
    buffer.flip();

    StringData data1 = new StringData(100);
    data1.load("", buffer, 4);

    // JUnit contract is assertEquals(expected, actual)
    Assert.assertEquals(4, data1.size());
    Assert.assertEquals("arg", data1.get(0));
    Assert.assertEquals("Z", data1.get(1));
    Assert.assertEquals("", data1.get(2));
    Assert.assertEquals("", data1.get(3));
}

}
42 changes: 42 additions & 0 deletions src/test/java/com/github/terma/fastselect/utils/Utf8UtilsTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package com.github.terma.fastselect.utils;

import org.junit.Assert;
import org.junit.Test;

/**
 * Checks {@link Utf8Utils} conversions in both directions against
 * hand-written UTF-8 byte sequences, plus the <code>null</code> handling.
 */
public class Utf8UtilsTest {

    /** Empty string and empty array map to each other. */
    @Test
    public void stringToBytesEmptyAndBack() {
        Assert.assertArrayEquals(new byte[]{}, Utf8Utils.stringToBytes(""));
        Assert.assertEquals("", Utf8Utils.bytesToString(new byte[0]));
    }

    /** ASCII characters take one byte each. */
    @Test
    public void ascIIStringToBytesAndBack() {
        Assert.assertArrayEquals(new byte[]{97, 98, 122}, Utf8Utils.stringToBytes("abz"));
        Assert.assertEquals("abz", Utf8Utils.bytesToString(new byte[]{97, 98, 122}));
    }

    /** Cyrillic characters take two bytes each. */
    @Test
    public void nonAscIIStringToBytes() {
        Assert.assertArrayEquals(new byte[]{-47, -123, -48, -66, -48, -71}, Utf8Utils.stringToBytes("хой"));
        Assert.assertEquals("хой", Utf8Utils.bytesToString(new byte[]{-47, -123, -48, -66, -48, -71}));
    }

    /** A two-byte character followed by a one-byte ASCII character. */
    @Test
    public void mixedStringToBytes() {
        Assert.assertArrayEquals(new byte[]{-48, -71, 90}, Utf8Utils.stringToBytes("йZ"));
        Assert.assertEquals("йZ", Utf8Utils.bytesToString(new byte[]{-48, -71, 90}));
    }

    /** Encoding <code>null</code> is rejected with a NullPointerException. */
    @Test(expected = NullPointerException.class)
    public void nullStringToBytes() {
        Utf8Utils.stringToBytes(null);
    }

    /** Decoding <code>null</code> is rejected with a NullPointerException. */
    @Test(expected = NullPointerException.class)
    public void nullBytesToString() {
        Utf8Utils.bytesToString(null);
    }

}