
Use compress-lzf from Ning as it includes many performance improvements.

1 parent 44ca674 commit 789c2f9729a63b885dedbe84a8c975e14008f99f @ijuma committed Dec 5, 2011
Binary file not shown.
@@ -2,8 +2,8 @@
import java.io.IOException;
-import voldemort.store.compress.lzf.LZFDecoder;
-import voldemort.store.compress.lzf.LZFEncoder;
+import com.ning.compress.lzf.LZFDecoder;
+import com.ning.compress.lzf.LZFEncoder;
/**
* Implementation of CompressionStrategy for the LZF format. LZF is optimized
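The switch is effectively an import change: the CompressionStrategy keeps calling LZFEncoder/LZFDecoder, now resolved from Ning's compress-lzf instead of the bundled copy. A minimal round-trip sketch against the new package (class name and payload are illustrative; the static encode/decode entry points are what the strategy relies on):

    import java.io.IOException;

    import com.ning.compress.lzf.LZFDecoder;
    import com.ning.compress.lzf.LZFEncoder;

    // Illustrative round trip through the new dependency; not part of this commit.
    public class LzfRoundTrip {
        public static void main(String[] args) throws IOException {
            byte[] original = "payload payload payload payload".getBytes("UTF-8");
            byte[] compressed = LZFEncoder.encode(original); // framed "ZV" chunks
            byte[] restored = LZFDecoder.decode(compressed); // must equal original
            System.out.println(original.length + " -> " + compressed.length
                    + " -> " + restored.length + " bytes");
        }
    }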
@@ -1,188 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package voldemort.store.compress.lzf;
-
-/**
- * Class that handles actual encoding of individual chunks. Resulting chunks can
- * be compressed or non-compressed; compression is only used if it actually
- * reduces chunk size (including overhead of additional header bytes)
- *<p>
- * Code adapted from the H2 project (http://www.h2database.com) Java LZF
- * implementation by Thomas, which was itself inspired by the original C
- * code by Marc A Lehmann.
- *
- */
-public class ChunkEncoder {
-
- // Beyond certain point we won't be able to compress:
- private static final int MIN_BLOCK_TO_COMPRESS = 16;
-
- private static final int MIN_HASH_SIZE = 256;
- // Not much point in bigger tables, with 8k window
- private static final int MAX_HASH_SIZE = 16384;
-
- private static final int MAX_OFF = 1 << 13; // 8k
- private static final int MAX_REF = (1 << 8) + (1 << 3); // 264
-
- // Encoding tables
-
- /**
- * Buffer in which encoded content is stored during processing
- */
- private final byte[] _encodeBuffer;
-
- private final int[] _hashTable;
-
- private final int _hashModulo;
-
- /**
- * @param totalLength Total encoded length; used for calculating size of
- * hash table to use
- */
- public ChunkEncoder(int totalLength) {
- int largestChunkLen = Math.max(totalLength, LZFChunk.MAX_CHUNK_LEN);
-
- int hashLen = calcHashLen(largestChunkLen);
- _hashTable = new int[hashLen];
- _hashModulo = hashLen - 1;
- // Ok, then, what's the worst case output buffer length?
- // length indicator for each 32 literals, so:
- int bufferLen = largestChunkLen + ((largestChunkLen + 31) >> 5);
- _encodeBuffer = new byte[bufferLen];
- }
-
- /**
- * Method for compressing (or not) individual chunks
- */
- public LZFChunk encodeChunk(byte[] data, int offset, int len) {
- if(len >= MIN_BLOCK_TO_COMPRESS) {
- /*
- * If we have non-trivial block, and can compress it by at least 2
- * bytes (since header is 2 bytes longer), let's compress:
- */
- int compLen = tryCompress(data, offset, offset + len, _encodeBuffer, 0);
- if(compLen < (len - 2)) { // compressed form saves at least 2 bytes; use it
- return LZFChunk.createCompressed(len, _encodeBuffer, 0, compLen);
- }
- }
- // Otherwise leave uncompressed:
- return LZFChunk.createNonCompressed(data, offset, len);
- }
-
- private static int calcHashLen(int chunkSize) {
- // in general try get hash table size of 2x input size
- chunkSize += chunkSize;
- // but no larger than max size:
- if(chunkSize >= MAX_HASH_SIZE) {
- return MAX_HASH_SIZE;
- }
- // otherwise just need to round up to nearest 2x
- int hashLen = MIN_HASH_SIZE;
- while(hashLen < chunkSize) {
- hashLen += hashLen;
- }
- return hashLen;
- }
-
- private int first(byte[] in, int inPos) {
- return (in[inPos] << 8) + (in[inPos + 1] & 255);
- }
-
- private static int next(int v, byte[] in, int inPos) {
- return (v << 8) + (in[inPos + 2] & 255);
- }
-
- private int hash(int h) {
- // or 184117; but this seems to give better hashing?
- return ((h * 57321) >> 9) & _hashModulo;
- // original lzf-c.c used this:
- // return (((h ^ (h << 5)) >> (24 - HLOG)) - h * 5) & _hashModulo;
- // but that didn't seem to provide better matches
- }
-
- private int tryCompress(byte[] in, int inPos, int inEnd, byte[] out, int outPos) {
- int literals = 0;
- outPos++;
- int hash = first(in, inPos); // seed hash from current position, not index 0
- inEnd -= 4;
- final int firstPos = inPos; // so that we won't have back references
- // across block boundary
- while(inPos < inEnd) {
- byte p2 = in[inPos + 2];
- // next
- hash = (hash << 8) + (p2 & 255);
- int off = hash(hash);
- int ref = _hashTable[off];
- _hashTable[off] = inPos;
- if(ref < inPos && ref >= firstPos && (off = inPos - ref - 1) < MAX_OFF
- && in[ref + 2] == p2 && in[ref + 1] == (byte) (hash >> 8)
- && in[ref] == (byte) (hash >> 16)) {
- // match
- int maxLen = inEnd - inPos + 2;
- if(maxLen > MAX_REF) {
- maxLen = MAX_REF;
- }
- if(literals == 0) {
- outPos--;
- } else {
- out[outPos - literals - 1] = (byte) (literals - 1);
- literals = 0;
- }
- int len = 3;
- while(len < maxLen && in[ref + len] == in[inPos + len]) {
- len++;
- }
- len -= 2;
- if(len < 7) {
- out[outPos++] = (byte) ((off >> 8) + (len << 5));
- } else {
- out[outPos++] = (byte) ((off >> 8) + (7 << 5));
- out[outPos++] = (byte) (len - 7);
- }
- out[outPos++] = (byte) off;
- outPos++;
- inPos += len;
- hash = first(in, inPos);
- hash = next(hash, in, inPos);
- _hashTable[hash(hash)] = inPos++;
- hash = next(hash, in, inPos);
- _hashTable[hash(hash)] = inPos++;
- } else {
- out[outPos++] = in[inPos++];
- literals++;
- if(literals == LZFChunk.MAX_LITERAL) {
- out[outPos - literals - 1] = (byte) (literals - 1);
- literals = 0;
- outPos++;
- }
- }
- }
- inEnd += 4;
- while(inPos < inEnd) {
- out[outPos++] = in[inPos++];
- literals++;
- if(literals == LZFChunk.MAX_LITERAL) {
- out[outPos - literals - 1] = (byte) (literals - 1);
- literals = 0;
- outPos++;
- }
- }
- out[outPos - literals - 1] = (byte) (literals - 1);
- if(literals == 0) {
- outPos--;
- }
- return outPos;
- }
-
-}
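A quick sanity check of the buffer sizing in the ChunkEncoder constructor above: tryCompress emits at most one run-length byte per 32 literals (MAX_LITERAL), so even an incompressible chunk fits in len + ceil(len / 32) bytes. A throwaway sketch (class name hypothetical):

    // Hypothetical sketch verifying the worst-case output bound used by
    // ChunkEncoder: len plus one extra run-length byte per 32 literals.
    public class WorstCaseBound {
        public static void main(String[] args) {
            int largestChunkLen = 0xFFFF; // LZFChunk.MAX_CHUNK_LEN
            int bufferLen = largestChunkLen + ((largestChunkLen + 31) >> 5);
            System.out.println(bufferLen); // 65535 + 2048 = 67583
        }
    }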
@@ -1,88 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package voldemort.store.compress.lzf;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-
-/**
- * Simple command-line utility that can be used for testing LZF compression.
- *
- */
-public class LZF {
-
- final static String SUFFIX = ".lzf";
-
- void process(String[] args) throws IOException {
- if(args.length == 2) {
- String oper = args[0];
- boolean compress = "-c".equals(oper);
- if(compress || "-d".equals(oper)) {
- String filename = args[1];
- File src = new File(filename);
- if(!src.exists()) {
- System.err.println("File '" + filename + "' does not exist.");
- System.exit(1);
- }
- if(!compress && !filename.endsWith(SUFFIX)) {
- System.err.println("File '" + filename + "' does end with expected suffix ('"
- + SUFFIX + "', won't decompress.");
- System.exit(1);
- }
- byte[] data = readData(src);
- System.out.println("Read " + data.length + " bytes.");
- byte[] result = compress ? LZFEncoder.encode(data) : LZFDecoder.decode(data);
- System.out.println("Processed into " + result.length + " bytes.");
- File resultFile = compress ? new File(filename + SUFFIX)
- : new File(filename.substring(0, filename.length()
- - SUFFIX.length()));
- FileOutputStream out = new FileOutputStream(resultFile);
- out.write(result);
- out.close();
- System.out.println("Wrote in file '" + resultFile.getAbsolutePath() + "'.");
- return;
- }
- }
- System.err.println("Usage: java " + getClass().getName() + " -c/-d file");
- System.exit(1);
- }
-
- private byte[] readData(File in) throws IOException {
- int len = (int) in.length();
- byte[] result = new byte[len];
- int offset = 0;
- FileInputStream fis = new FileInputStream(in);
-
- while(len > 0) {
- int count = fis.read(result, offset, len);
- if(count < 0)
- break;
- len -= count;
- offset += count;
- }
- fis.close();
- if(len > 0) { // should never occur...
- throw new IOException("Could not read the whole file -- received EOF when there were "
- + len + " bytes left to read");
- }
- return result;
- }
-
- public static void main(String[] args) throws IOException {
- new LZF().process(args);
- }
-}
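Per its own usage message, the removed test utility was invoked like this (file names illustrative):

    java voldemort.store.compress.lzf.LZF -c data.bin      # writes data.bin.lzf
    java voldemort.store.compress.lzf.LZF -d data.bin.lzf  # restores data.bin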
@@ -1,96 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package voldemort.store.compress.lzf;
-
-/**
- * Helper class used to store LZF encoded segments (compressed and
- * non-compressed) that can be sequenced to produce LZF files/streams.
- *
- */
-public class LZFChunk {
-
- /**
- * Maximum length of literal run for LZF encoding.
- */
- public static final int MAX_LITERAL = 1 << 5; // 32
-
- // Chunk length is limited by 2-byte length indicator, to 64k
- public static final int MAX_CHUNK_LEN = 0xFFFF;
-
- public final static byte BYTE_Z = 'Z';
- public final static byte BYTE_V = 'V';
-
- public final static int BLOCK_TYPE_NON_COMPRESSED = 0;
- public final static int BLOCK_TYPE_COMPRESSED = 1;
-
- final byte[] _data;
- LZFChunk _next;
-
- private LZFChunk(byte[] data) {
- _data = data;
- }
-
- /**
- * Factory method for constructing compressed chunk
- */
- public static LZFChunk createCompressed(int origLen, byte[] encData, int encPtr, int encLen) {
- byte[] result = new byte[encLen + 7];
- result[0] = BYTE_Z;
- result[1] = BYTE_V;
- result[2] = BLOCK_TYPE_COMPRESSED;
- result[3] = (byte) (encLen >> 8);
- result[4] = (byte) encLen;
- result[5] = (byte) (origLen >> 8);
- result[6] = (byte) origLen;
- System.arraycopy(encData, encPtr, result, 7, encLen);
- return new LZFChunk(result);
- }
-
- /**
- * Factory method for constructing non-compressed chunk
- */
- public static LZFChunk createNonCompressed(byte[] plainData, int ptr, int len) {
- byte[] result = new byte[len + 5];
- result[0] = BYTE_Z;
- result[1] = BYTE_V;
- result[2] = BLOCK_TYPE_NON_COMPRESSED;
- result[3] = (byte) (len >> 8);
- result[4] = (byte) len;
- System.arraycopy(plainData, ptr, result, 5, len);
- return new LZFChunk(result);
- }
-
- public void setNext(LZFChunk next) {
- _next = next;
- }
-
- public LZFChunk next() {
- return _next;
- }
-
- public int length() {
- return _data.length;
- }
-
- public byte[] getData() {
- return _data;
- }
-
- public int copyTo(byte[] dst, int ptr) {
- int len = _data.length;
- System.arraycopy(_data, 0, dst, ptr, len);
- return ptr + len;
- }
-}
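For reference, the framing built by the factory methods above is easy to inspect by hand. A hypothetical snippet (assumes the removed LZFChunk class is still on the classpath):

    // Hypothetical: dumps the 5-byte non-compressed header ('Z', 'V', block
    // type, then the big-endian 2-byte length) followed by the payload.
    public class ChunkHeaderDemo {
        public static void main(String[] args) {
            byte[] chunk = LZFChunk.createNonCompressed(new byte[] { 1, 2, 3 }, 0, 3).getData();
            for (byte b : chunk) System.out.print(b + " ");
            // prints: 90 86 0 0 3 1 2 3   ('Z' == 90, 'V' == 86)
        }
    }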
