+ * The Bloom filter is a data structure that was introduced in 1970 and that has been adopted by + * the networking research community in the past decade thanks to the bandwidth efficiencies that it + * offers for the transmission of set membership information between networked hosts. A sender encodes + * the information into a bit vector, the Bloom filter, that is more compact than a conventional + * representation. Computation and space costs for construction are linear in the number of elements. + * The receiver uses the filter to test whether various elements are members of the set. Though the + * filter will occasionally return a false positive, it will never return a false negative. When creating + * the filter, the sender can choose its desired point in a trade-off between the false positive rate and the size. + * + *
+ * Originally created by + * European Commission One-Lab Project 034819. + * + * @see Filter The general behavior of a filter + * + * @see Space/Time Trade-Offs in Hash Coding with Allowable Errors + */ +public class BloomFilter extends Filter { + private static final byte[] bitvalues = new byte[] { + (byte)0x01, + (byte)0x02, + (byte)0x04, + (byte)0x08, + (byte)0x10, + (byte)0x20, + (byte)0x40, + (byte)0x80 + }; + + /** The bit vector. */ + BitSet bits; + + /** Default constructor - use with readFields */ + public BloomFilter() { + super(); + } + + /** + * Constructor + * @param vectorSize The vector size of this filter. + * @param nbHash The number of hash function to consider. + * @param hashType type of the hashing function (see + * {@link org.apache.hadoop.util.hash.Hash}). + */ + public BloomFilter(int vectorSize, int nbHash, int hashType) { + super(vectorSize, nbHash, hashType); + + bits = new BitSet(this.vectorSize); + } + + @Override + public void add(Key key) { + if(key == null) { + throw new NullPointerException("key cannot be null"); + } + + int[] h = hash.hash(key); + hash.clear(); + + for(int i = 0; i < nbHash; i++) { + bits.set(h[i]); + } + } + + @Override + public void and(Filter filter) { + if(filter == null + || !(filter instanceof BloomFilter) + || filter.vectorSize != this.vectorSize + || filter.nbHash != this.nbHash) { + throw new IllegalArgumentException("filters cannot be and-ed"); + } + + this.bits.and(((BloomFilter) filter).bits); + } + + @Override + public boolean membershipTest(Key key) { + if(key == null) { + throw new NullPointerException("key cannot be null"); + } + + int[] h = hash.hash(key); + hash.clear(); + for(int i = 0; i < nbHash; i++) { + if(!bits.get(h[i])) { + return false; + } + } + return true; + } + + @Override + public void not() { + bits.flip(0, vectorSize - 1); + } + + @Override + public void or(Filter filter) { + if(filter == null + || !(filter instanceof BloomFilter) + || filter.vectorSize != this.vectorSize + || 
filter.nbHash != this.nbHash) { + throw new IllegalArgumentException("filters cannot be or-ed"); + } + bits.or(((BloomFilter) filter).bits); + } + + @Override + public void xor(Filter filter) { + if(filter == null + || !(filter instanceof BloomFilter) + || filter.vectorSize != this.vectorSize + || filter.nbHash != this.nbHash) { + throw new IllegalArgumentException("filters cannot be xor-ed"); + } + bits.xor(((BloomFilter) filter).bits); + } + + @Override + public String toString() { + return bits.toString(); + } + + /** + * @return size of the the bloomfilter + */ + public int getVectorSize() { + return this.vectorSize; + } + + // Writable + + @Override + public void write(DataOutput out) throws IOException { + super.write(out); + byte[] bytes = new byte[getNBytes()]; + for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) { + if (bitIndex == 8) { + bitIndex = 0; + byteIndex++; + } + if (bitIndex == 0) { + bytes[byteIndex] = 0; + } + if (bits.get(i)) { + bytes[byteIndex] |= bitvalues[bitIndex]; + } + } + out.write(bytes); + } + + @Override + public void readFields(DataInput in) throws IOException { + super.readFields(in); + bits = new BitSet(this.vectorSize); + byte[] bytes = new byte[getNBytes()]; + in.readFully(bytes); + for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) { + if (bitIndex == 8) { + bitIndex = 0; + byteIndex++; + } + if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) { + bits.set(i); + } + } + } + + /* @return number of bytes needed to hold bit vector */ + private int getNBytes() { + return (vectorSize + 7) / 8; + } +}//end class diff --git a/src/org/thrudb/util/bloom/CountingBloomFilter.java b/src/org/thrudb/util/bloom/CountingBloomFilter.java new file mode 100644 index 0000000..3d68269 --- /dev/null +++ b/src/org/thrudb/util/bloom/CountingBloomFilter.java @@ -0,0 +1,305 @@ +/** + * + * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org) + * All 
rights reserved. + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * - Neither the name of the University Catholique de Louvain - UCL + * nor the names of its contributors may be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.thrudb.util.bloom; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +/** + * Implements a counting Bloom filter, as defined by Fan et al. in a ToN + * 2000 paper. + *
+ * A counting Bloom filter is an improvement to a standard Bloom filter as it
+ * allows dynamic additions and deletions of set membership information. This
+ * is achieved through the use of a counting vector instead of a bit vector.
+ *
+ * Originally created by + * European Commission One-Lab Project 034819. + * + * @see Filter The general behavior of a filter + * + * @see Summary cache: a scalable wide-area web cache sharing protocol + */ +public final class CountingBloomFilter extends Filter { + /** Storage for the counting buckets */ + private long[] buckets; + + /** We are using 4bit buckets, so each bucket can count to 15 */ + private final static long BUCKET_MAX_VALUE = 15; + + /** Default constructor - use with readFields */ + public CountingBloomFilter() {} + + /** + * Constructor + * @param vectorSize The vector size of this filter. + * @param nbHash The number of hash function to consider. + * @param hashType type of the hashing function (see + * {@link org.apache.hadoop.util.hash.Hash}). + */ + public CountingBloomFilter(int vectorSize, int nbHash, int hashType) { + super(vectorSize, nbHash, hashType); + buckets = new long[buckets2words(vectorSize)]; + } + + /** returns the number of 64 bit words it would take to hold vectorSize buckets */ + private static int buckets2words(int vectorSize) { + return ((vectorSize - 1) >>> 4) + 1; + } + + + @Override + public void add(Key key) { + if(key == null) { + throw new NullPointerException("key can not be null"); + } + + int[] h = hash.hash(key); + hash.clear(); + + for(int i = 0; i < nbHash; i++) { + // find the bucket + int wordNum = h[i] >> 4; // div 16 + int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4 + + long bucketMask = 15L << bucketShift; + long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift; + + // only increment if the count in the bucket is less than BUCKET_MAX_VALUE + if(bucketValue < BUCKET_MAX_VALUE) { + // increment by 1 + buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue + 1) << bucketShift); + } + } + } + + /** + * Removes a specified key from this counting Bloom filter. + *
+ * Invariant: an IllegalArgumentException is thrown if the specified key does not belong to this counting Bloom filter.
+ * @param key The key to remove.
+ */
+ public void delete(Key key) {
+   if (key == null) {
+     throw new NullPointerException("Key may not be null");
+   }
+   // Refuse to touch any counter for a key the filter does not report as present.
+   boolean present = membershipTest(key);
+   if (!present) {
+     throw new IllegalArgumentException("Key is not a member");
+   }
+
+   int[] positions = hash.hash(key);
+   hash.clear();
+
+   for (int n = 0; n < nbHash; n++) {
+     int position = positions[n];
+     int word = position >> 4;            // 16 buckets per 64-bit word
+     int shift = (position & 0x0f) << 2;  // 4 bits per bucket
+
+     long mask = 15L << shift;
+     long count = (buckets[word] & mask) >>> shift;
+
+     // Empty buckets and buckets sitting at BUCKET_MAX_VALUE are left untouched.
+     if (count < 1 || count >= BUCKET_MAX_VALUE) {
+       continue;
+     }
+     buckets[word] = (buckets[word] & ~mask) | ((count - 1) << shift);
+   }
+ }
+
+ /**
+  * Bitwise-ANDs the packed bucket words of the given filter into this one.
+  * @param filter The filter to AND with.
+  * @throws IllegalArgumentException if {@code filter} is null, is not a
+  * {@code CountingBloomFilter}, or differs in vector size or number of hashes.
+  */
+ @Override
+ public void and(Filter filter) {
+   boolean compatible = filter != null
+       && filter instanceof CountingBloomFilter
+       && filter.vectorSize == this.vectorSize
+       && filter.nbHash == this.nbHash;
+   if (!compatible) {
+     throw new IllegalArgumentException("filters cannot be and-ed");
+   }
+   CountingBloomFilter other = (CountingBloomFilter) filter;
+
+   final int words = buckets2words(vectorSize);
+   for (int w = 0; w < words; w++) {
+     this.buckets[w] = this.buckets[w] & other.buckets[w];
+   }
+ }
+
+ /**
+  * Reports whether every bucket hashed from the key holds a non-zero count.
+  * @param key The key to test.
+  * @return false as soon as one hashed bucket is zero, true otherwise.
+  * @throws NullPointerException if {@code key} is null.
+  */
+ @Override
+ public boolean membershipTest(Key key) {
+   if (key == null) {
+     throw new NullPointerException("Key may not be null");
+   }
+
+   int[] positions = hash.hash(key);
+   hash.clear();
+
+   for (int n = 0; n < nbHash; n++) {
+     int word = positions[n] >> 4;            // 16 buckets per 64-bit word
+     int shift = (positions[n] & 0x0f) << 2;  // 4 bits per bucket
+
+     boolean emptyBucket = (buckets[word] & (15L << shift)) == 0;
+     if (emptyBucket) {
+       return false;
+     }
+   }
+
+   return true;
+ }
+
+ /**
+  * This method calculates an approximate count of the key, i.e. how many
+  * times the key was added to the filter. This allows the filter to be
+  * used as an approximate {@code key -> count} map.
+  * <p>
+  * NOTE: each bucket is 4 bits wide and stops incrementing once it reaches 15,
+  * so inserting the same key more than 15 times saturates all filter positions
+  * associated with this key and increases the error for this and other keys.
+  * For this reason the filter should only be used to store small count values.
+  * @param key key to be tested
+  * @return 0 if at least one of the key's buckets is empty. Otherwise the
+  * smallest counter among the key's buckets is returned; because buckets may
+  * be shared with other keys, this value can be larger than the true count.
+  * Additionally, after {@link #delete(Key)} operations or bucket saturation
+  * the returned value may be lower than the true count.
+  * @throws NullPointerException if {@code key} is null.
+  */
+ public int approximateCount(Key key) {
+   // Fail with the same explicit message the sibling methods use instead of
+   // relying on an NPE raised inside the hash function.
+   if (key == null) {
+     throw new NullPointerException("Key may not be null");
+   }
+
+   int[] h = hash.hash(key);
+   hash.clear();
+
+   int res = Integer.MAX_VALUE;
+   for (int i = 0; i < nbHash; i++) {
+     // find the bucket
+     int wordNum = h[i] >> 4;               // div 16
+     int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4
+
+     long bucketMask = 15L << bucketShift;
+     long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
+     if (bucketValue < res) {
+       res = (int) bucketValue;
+     }
+   }
+
+   // res is still MAX_VALUE only when the loop body never ran (nbHash <= 0).
+   return (res != Integer.MAX_VALUE) ? res : 0;
+ }
+
+ @Override
+ public void not() {
+ throw new UnsupportedOperationException("not() is undefined for "
+ + this.getClass().getName());
+ }
+
+ /**
+  * Bitwise-ORs the packed bucket words of the given filter into this one.
+  * @param filter The filter to OR with.
+  * @throws IllegalArgumentException if {@code filter} is null, is not a
+  * {@code CountingBloomFilter}, or differs in vector size or number of hashes.
+  */
+ @Override
+ public void or(Filter filter) {
+   boolean compatible = filter != null
+       && filter instanceof CountingBloomFilter
+       && filter.vectorSize == this.vectorSize
+       && filter.nbHash == this.nbHash;
+   if (!compatible) {
+     throw new IllegalArgumentException("filters cannot be or-ed");
+   }
+   CountingBloomFilter other = (CountingBloomFilter) filter;
+
+   final int words = buckets2words(vectorSize);
+   for (int w = 0; w < words; w++) {
+     this.buckets[w] = this.buckets[w] | other.buckets[w];
+   }
+ }
+
+ @Override
+ public void xor(Filter filter) {
+ throw new UnsupportedOperationException("xor() is undefined for "
+ + this.getClass().getName());
+ }
+
+ /**
+  * Renders the value of every bucket, in index order, separated by single spaces.
+  * @return the space-separated bucket counts.
+  */
+ @Override
+ public String toString() {
+   StringBuilder text = new StringBuilder();
+
+   for (int bucket = 0; bucket < vectorSize; bucket++) {
+     if (bucket != 0) {
+       text.append(" ");
+     }
+
+     long word = buckets[bucket >> 4];     // 16 buckets per 64-bit word
+     int shift = (bucket & 0x0f) << 2;     // 4 bits per bucket
+
+     // shift the bucket down to the low nibble, then mask it out
+     long count = (word >>> shift) & 15L;
+     text.append(count);
+   }
+
+   return text.toString();
+ }
+
+ // Writable
+
+ /**
+  * Writes the superclass state followed by each packed bucket word as a long.
+  * @param out The sink to write to.
+  * @throws IOException if writing fails.
+  */
+ @Override
+ public void write(DataOutput out) throws IOException {
+   super.write(out);
+   final int words = buckets2words(vectorSize);
+   int w = 0;
+   while (w < words) {
+     out.writeLong(buckets[w]);
+     w++;
+   }
+ }
+
+ /**
+  * Reads the superclass state, then replaces the bucket array with the
+  * packed words read from the stream.
+  * @param in The source to read from.
+  * @throws IOException if reading fails.
+  */
+ @Override
+ public void readFields(DataInput in) throws IOException {
+   super.readFields(in);
+   final int words = buckets2words(vectorSize);
+   buckets = new long[words];
+   int w = 0;
+   while (w < words) {
+     buckets[w] = in.readLong();
+     w++;
+   }
+ }
+}
\ No newline at end of file
diff --git a/src/org/thrudb/util/bloom/DynamicBloomFilter.java b/src/org/thrudb/util/bloom/DynamicBloomFilter.java
new file mode 100644
index 0000000..30c86b6
--- /dev/null
+++ b/src/org/thrudb/util/bloom/DynamicBloomFilter.java
@@ -0,0 +1,293 @@
+/**
+ *
+ * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ * - Neither the name of the University Catholique de Louvain - UCL
+ * nor the names of its contributors may be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.thrudb.util.bloom;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Implements a dynamic Bloom filter, as defined in the INFOCOM 2006 paper.
+ *
+ * A dynamic Bloom filter (DBF) makes use of a {@code s * m} bit matrix but
+ * each of the {@code s} rows is a standard Bloom filter. The creation
+ * process of a DBF is iterative. At the start, the DBF is a {@code 1 * m}
+ * bit matrix, i.e., it is composed of a single standard Bloom filter.
+ * It assumes that {@code nr} elements are recorded in the
+ * initial bit vector, where {@code nr <= n} ({@code n} is
+ * the cardinality of the set {@code A} to record in the filter).
+ *
+ * As the size of {@code A} grows during the execution of the application,
+ * several keys must be inserted in the DBF. When inserting a key into the DBF,
+ * one must first get an active Bloom filter in the matrix. A Bloom filter is
+ * active when the number of recorded keys, {@code nr}, is
+ * strictly less than the current cardinality of {@code A}, {@code n}.
+ * If an active Bloom filter is found, the key is inserted and
+ * {@code nr} is incremented by one. On the other hand, if there
+ * is no active Bloom filter, a new one is created (i.e., a new row is added to
+ * the matrix) according to the current size of {@code A} and the element
+ * is added in this new Bloom filter and the {@code nr} value of
+ * this new Bloom filter is set to one. A given key is said to belong to the
+ * DBF if the {@code k} positions are set to one in one of the matrix rows.
+ *
+ * Originally created by + * European Commission One-Lab Project 034819. + * + * @see Filter The general behavior of a filter + * @see BloomFilter A Bloom filter + * + * @see Theory and Network Applications of Dynamic Bloom Filters + */ +public class DynamicBloomFilter extends Filter { + /** + * Threshold for the maximum number of key to record in a dynamic Bloom filter row. + */ + private int nr; + + /** + * The number of keys recorded in the current standard active Bloom filter. + */ + private int currentNbRecord; + + /** + * The matrix of Bloom filter. + */ + private BloomFilter[] matrix; + + /** + * Zero-args constructor for the serialization. + */ + public DynamicBloomFilter() { } + + /** + * Constructor. + *
+ * Builds an empty Dynamic Bloom filter.
+ * @param vectorSize The number of bits in the vector.
+ * @param nbHash The number of hash function to consider.
+ * @param hashType type of the hashing function (see
+ * {@link org.apache.hadoop.util.hash.Hash}).
+ * @param nr The threshold for the maximum number of keys to record in a
+ * dynamic Bloom filter row.
+ */
+ public DynamicBloomFilter(int vectorSize, int nbHash, int hashType, int nr) {
+   super(vectorSize, nbHash, hashType);
+
+   // The matrix starts out with exactly one standard Bloom filter row.
+   this.matrix = new BloomFilter[] {
+     new BloomFilter(this.vectorSize, this.nbHash, this.hashType)
+   };
+   this.currentNbRecord = 0;
+   this.nr = nr;
+ }
+
+ /**
+  * Adds the key to the active row, appending a new row first when
+  * {@code getActiveStandardBF()} reports none.
+  * @param key The key to add.
+  * @throws NullPointerException if {@code key} is null.
+  */
+ @Override
+ public void add(Key key) {
+   if (key == null) {
+     throw new NullPointerException("Key can not be null");
+   }
+
+   BloomFilter target = getActiveStandardBF();
+   if (target == null) {
+     // No active row: append a fresh one and restart the per-row record counter.
+     addRow();
+     currentNbRecord = 0;
+     target = matrix[matrix.length - 1];
+   }
+
+   target.add(key);
+   currentNbRecord++;
+ }
+
+ @Override
+ public void and(Filter filter) {
+ if (filter == null
+ || !(filter instanceof DynamicBloomFilter)
+ || filter.vectorSize != this.vectorSize
+ || filter.nbHash != this.nbHash) {
+ throw new IllegalArgumentException("filters cannot be and-ed");
+ }
+
+ DynamicBloomFilter dbf = (DynamicBloomFilter)filter;
+
+ if (dbf.matrix.length != this.matrix.length || dbf.nr != this.nr) {
+ throw new IllegalArgumentException("filters cannot be and-ed");
+ }
+
+ for (int i = 0; i < matrix.length; i++) {
+ matrix[i].and(dbf.matrix[i]);
+ }
+ }
+
+ /**
+  * Reports whether any row of the matrix contains the key.
+  * @param key The key to test.
+  * @return true if {@code key} is null or some row reports membership,
+  * false otherwise.
+  */
+ @Override
+ public boolean membershipTest(Key key) {
+   // NOTE(review): a null key is reported as a member here, whereas the other
+   // filters shown in this file throw NullPointerException - confirm this is intended.
+   if (key == null) {
+     return true;
+   }
+
+   for (BloomFilter row : matrix) {
+     if (row.membershipTest(key)) {
+       return true;
+     }
+   }
+
+   return false;
+ }
+
+ /** Applies {@code not()} to every row of the matrix, in index order. */
+ @Override
+ public void not() {
+   for (BloomFilter row : matrix) {
+     row.not();
+   }
+ }
+
+ @Override
+ public void or(Filter filter) {
+ if (filter == null
+ || !(filter instanceof DynamicBloomFilter)
+ || filter.vectorSize != this.vectorSize
+ || filter.nbHash != this.nbHash) {
+ throw new IllegalArgumentException("filters cannot be or-ed");
+ }
+
+ DynamicBloomFilter dbf = (DynamicBloomFilter)filter;
+
+ if (dbf.matrix.length != this.matrix.length || dbf.nr != this.nr) {
+ throw new IllegalArgumentException("filters cannot be or-ed");
+ }
+ for (int i = 0; i < matrix.length; i++) {
+ matrix[i].or(dbf.matrix[i]);
+ }
+ }
+
+ @Override
+ public void xor(Filter filter) {
+ if (filter == null
+ || !(filter instanceof DynamicBloomFilter)
+ || filter.vectorSize != this.vectorSize
+ || filter.nbHash != this.nbHash) {
+ throw new IllegalArgumentException("filters cannot be xor-ed");
+ }
+ DynamicBloomFilter dbf = (DynamicBloomFilter)filter;
+
+ if (dbf.matrix.length != this.matrix.length || dbf.nr != this.nr) {
+ throw new IllegalArgumentException("filters cannot be xor-ed");
+ }
+
+ for(int i = 0; i
+ * A filter is a data structure which aims at offering a lossy summary of a set.
+ * Typically, a filter will be implemented as a Bloom filter (or a Bloom filter extension).
+ *
+ * It must be extended in order to define the real behavior.
+ *
+ * @see Key The general behavior of a key
+ * @see HashFunction A hash function
+ */
+public abstract class Filter {
+ private static final int VERSION = -1; // negative to accommodate for old format
+ /** The vector size of this filter. */
+ protected int vectorSize;
+
+ /** The hash function used to map a key to several positions in the vector. */
+ protected HashFunction hash;
+
+ /** The number of hash functions to consider. */
+ protected int nbHash;
+
+ /** Type of hashing function to use. */
+ protected int hashType;
+
+ /** No-argument constructor; leaves every field at its Java default value. */
+ protected Filter() {}
+
+ /**
+  * Records the filter dimensions and builds the hash function that maps keys
+  * to positions in the vector.
+  * @param vectorSize The vector size of this filter.
+  * @param nbHash The number of hash functions to consider.
+  * @param hashType type of the hashing function (see {@link Hash}).
+  */
+ protected Filter(int vectorSize, int nbHash, int hashType) {
+   this.hashType = hashType;
+   this.nbHash = nbHash;
+   this.vectorSize = vectorSize;
+   // the hash function is sized from the same three values just stored
+   this.hash = new HashFunction(vectorSize, nbHash, hashType);
+ }
+
+ /**
+ * Adds a key to this filter.
+ * The BloomFilter, CountingBloomFilter and DynamicBloomFilter implementations
+ * throw NullPointerException when the key is null.
+ * @param key The key to add.
+ */
+ public abstract void add(Key key);
+
+ /**
+ * Determines whether a specified key belongs to this filter.
+ * @param key The key to test.
+ * @return boolean True if the specified key belongs to this filter.
+ * False otherwise.
+ */
+ public abstract boolean membershipTest(Key key);
+
+ /**
+ * Performs a logical AND between this filter and a specified filter.
+ *
+ * Invariant: The result is assigned to this filter.
+ * @param filter The filter to AND with.
+ */
+ public abstract void and(Filter filter);
+
+ /**
+ * Performs a logical OR between this filter and a specified filter.
+ *
+ * Invariant: The result is assigned to this filter.
+ * @param filter The filter to OR with.
+ */
+ public abstract void or(Filter filter);
+
+ /**
+ * Performs a logical XOR between this filter and a specified filter.
+ *
+ * Invariant: The result is assigned to this filter.
+ * Not every implementation supports this operation: CountingBloomFilter
+ * throws UnsupportedOperationException.
+ * @param filter The filter to XOR with.
+ */
+ public abstract void xor(Filter filter);
+
+ /**
+ * Performs a logical NOT on this filter.
+ *
+ * The result is assigned to this filter.
+ * Not every implementation supports this operation: CountingBloomFilter
+ * throws UnsupportedOperationException.
+ */
+ public abstract void not();
+
+ /**
+ * Adds a list of keys to this filter.
+ * @param keys The list of keys.
+ */
+ public void add(List
+ * Builds a hash function that returns a fixed number of values, each of them bounded by a given maximum.
+ * @param maxValue The exclusive upper bound of the returned values (results lie in [0, maxValue - 1]); must be > 0.
+ * @param nbHash The number of resulting hashed values.
+ * @param hashType type of the hashing function (see {@link Hash}).
+ */
+ public HashFunction(int maxValue, int nbHash, int hashType) {
+   // Reject non-positive dimensions before touching any state.
+   if (!(maxValue > 0)) {
+     throw new IllegalArgumentException("maxValue must be > 0");
+   }
+   if (!(nbHash > 0)) {
+     throw new IllegalArgumentException("nbHash must be > 0");
+   }
+
+   this.nbHash = nbHash;
+   this.maxValue = maxValue;
+
+   // An unknown hash type is signalled by a null instance from the factory.
+   this.hashFunction = Hash.getInstance(hashType);
+   if (this.hashFunction == null) {
+     throw new IllegalArgumentException("hashType must be known");
+   }
+ }
+
+ /** Clears this hash function. A NOOP */
+ public void clear() {
+ // Intentionally empty: the method body performs no work.
+ }
+
+ /**
+  * Derives {@code nbHash} positions from the bytes of a key. Each hash result
+  * seeds the next hash call, and every result is reduced to the range
+  * {@code [0, maxValue - 1]}.
+  * @param k The specified key.
+  * @return The array of hashed values.
+  * @throws NullPointerException if the key's byte buffer is null.
+  * @throws IllegalArgumentException if the key's byte buffer is empty.
+  */
+ public int[] hash(Key k){
+   byte[] data = k.getBytes();
+   if (data == null) {
+     throw new NullPointerException("buffer reference is null");
+   }
+   if (data.length == 0) {
+     throw new IllegalArgumentException("key length must be > 0");
+   }
+
+   int[] positions = new int[nbHash];
+   int seed = 0;
+   for (int n = 0; n < positions.length; n++) {
+     seed = hashFunction.hash(data, seed);
+     // the remainder may be negative, so take its absolute value
+     positions[n] = Math.abs(seed % maxValue);
+   }
+   return positions;
+ }
+}
\ No newline at end of file
diff --git a/src/org/thrudb/util/bloom/Key.java b/src/org/thrudb/util/bloom/Key.java
new file mode 100644
index 0000000..0c94346
--- /dev/null
+++ b/src/org/thrudb/util/bloom/Key.java
@@ -0,0 +1,176 @@
+/**
+ *
+ * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ * - Neither the name of the University Catholique de Louvain - UCL
+ * nor the names of its contributors may be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.thrudb.util.bloom;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * The general behavior of a key that must be stored in a filter.
+ *
+ * @see Filter The general behavior of a filter
+ */
+public class Key implements Comparable
+ * Invariant: if it is not specified, each instance of
+ *
+ * Builds a key with a default weight.
+ * @param value The byte value of this key.
+ */
+ public Key(byte[] value) {
+ // Delegates to the two-argument constructor with a weight of 1.0.
+ this(value, 1.0);
+ }
+
+ /**
+ * Constructor.
+ *
+ * Builds a key with a specified weight.
+ * @param value The value of this key.
+ * @param weight The weight associated to this key.
+ * @throws IllegalArgumentException if value is null (raised by set).
+ */
+ public Key(byte[] value, double weight) {
+ // NOTE(review): set() is public and overridable, and is invoked from the constructor.
+ set(value, weight);
+ }
+
+ /**
+  * Replaces the value and the weight of this key. The array is stored as is,
+  * without being copied.
+  * @param value The byte value of this key; must not be null.
+  * @param weight The weight associated to this key.
+  * @throws IllegalArgumentException if {@code value} is null.
+  */
+ public void set(byte[] value, double weight) {
+   if (value != null) {
+     this.bytes = value;
+     this.weight = weight;
+     return;
+   }
+   throw new IllegalArgumentException("value can not be null");
+ }
+
+ /**
+  * Gives access to the byte value of this key.
+  * @return the internal byte array itself, not a copy.
+  */
+ public byte[] getBytes() {
+   return bytes;
+ }
+
+ /**
+  * Gives the weight associated to this key.
+  * @return the current weight.
+  */
+ public double getWeight() {
+   return this.weight;
+ }
+
+ /**
+  * Adds the given amount to the weight of this key.
+  * @param weight The increment.
+  */
+ public void incrementWeight(double weight) {
+   this.weight = this.weight + weight;
+ }
+
+ /** Adds one to the weight of this key. */
+ public void incrementWeight() {
+   this.weight += 1.0;
+ }
+
+ /**
+  * Two keys are equal when {@link #compareTo(Key)} reports zero.
+  * @param o The object to compare with.
+  * @return false for null or for objects that are not a {@code Key}.
+  */
+ @Override
+ public boolean equals(Object o) {
+   return (o instanceof Key) && this.compareTo((Key) o) == 0;
+ }
+
+ /**
+  * Combines, by XOR, the int value of every byte with the hash of the weight.
+  * @return the hash code of this key.
+  */
+ @Override
+ public int hashCode() {
+   int acc = 0;
+   for (byte b : bytes) {
+     // a boxed Byte hashes to its own int value, so XOR the byte directly
+     acc ^= b;
+   }
+   // same value a boxed Double produces from hashCode()
+   long weightBits = Double.doubleToLongBits(weight);
+   acc ^= (int) (weightBits ^ (weightBits >>> 32));
+   return acc;
+ }
+
+ // Writable
+
+ /**
+  * Writes the byte count, the bytes themselves and then the weight.
+  * @param out The sink to write to.
+  * @throws IOException if writing fails.
+  */
+ public void write(DataOutput out) throws IOException {
+   final byte[] payload = this.bytes;
+   out.writeInt(payload.length);
+   out.write(payload);
+   out.writeDouble(this.weight);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ this.bytes = new byte[in.readInt()];
+ in.readFully(this.bytes);
+ weight = in.readDouble();
+ }
+
+ // Comparable
+
+ /**
+  * Orders keys by byte-array length, then byte by byte, then by weight.
+  * @param other The key to compare with.
+  * @return a negative value, zero or a positive value when this key is
+  * respectively smaller than, equal to or greater than {@code other}.
+  */
+ public int compareTo(Key other) {
+   int result = this.bytes.length - other.getBytes().length;
+   for (int i = 0; result == 0 && i < bytes.length; i++) {
+     result = this.bytes[i] - other.bytes[i];
+   }
+
+   if (result == 0) {
+     // Truncating (this.weight - other.weight) to int reported weights that
+     // differ by less than 1.0 (or involve NaN) as equal, which made equals()
+     // disagree with hashCode(). Double.compare orders the weights exactly.
+     result = Double.compare(this.weight, other.weight);
+   }
+   return result;
+ }
+}
\ No newline at end of file
diff --git a/src/org/thrudb/util/bloom/RemoveScheme.java b/src/org/thrudb/util/bloom/RemoveScheme.java
new file mode 100644
index 0000000..30bcdd0
--- /dev/null
+++ b/src/org/thrudb/util/bloom/RemoveScheme.java
@@ -0,0 +1,91 @@
+/**
+ *
+ * Copyright (c) 2005, European Commission project OneLab under contract 034819
+ * (http://www.one-lab.org)
+ *
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ * - Neither the name of the University Catholique de Louvain - UCL
+ * nor the names of its contributors may be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.thrudb.util.bloom;
+
+/**
+ * Lists the selective clearing schemes that a retouched Bloom filter can apply
+ * when a bit has to be reset.
+ *
+ * Originally created by
+ * European Commission One-Lab Project 034819.
+ */
+public interface RemoveScheme {
+  // Fields declared in an interface are implicitly public, static and final.
+
+  /**
+   * Random selection: the bit to reset is picked at random.
+   */
+  short RANDOM = 0;
+
+  /**
+   * MinimumFN selection: the bit to reset is the one that will generate the
+   * minimum number of false negatives.
+   */
+  short MINIMUM_FN = 1;
+
+  /**
+   * MaximumFP selection: the bit to reset is the one that will remove the
+   * maximum number of false positives.
+   */
+  short MAXIMUM_FP = 2;
+
+  /**
+   * Ratio selection: the bit to reset is the one that will, at the same time,
+   * remove the maximum number of false positives while minimizing the number
+   * of false negatives generated.
+   */
+  short RATIO = 3;
+}
diff --git a/src/org/thrudb/util/bloom/RetouchedBloomFilter.java b/src/org/thrudb/util/bloom/RetouchedBloomFilter.java
new file mode 100644
index 0000000..c7cb3c9
--- /dev/null
+++ b/src/org/thrudb/util/bloom/RetouchedBloomFilter.java
@@ -0,0 +1,450 @@
+/**
+ *
+ * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ * - Neither the name of the University Catholique de Louvain - UCL
+ * nor the names of its contributors may be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.thrudb.util.bloom;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Implements a retouched Bloom filter, as defined in the CoNEXT 2006 paper.
+ *
+ * It allows the removal of selected false positives at the cost of introducing
+ * random false negatives, and with the benefit of eliminating some random false
+ * positives at the same time.
+ *
+ *
+ * Originally created by
+ * European Commission One-Lab Project 034819.
+ *
+ * @see Filter The general behavior of a filter
+ * @see BloomFilter A Bloom filter
+ * @see RemoveScheme The different selective clearing algorithms
+ *
+ * @see Retouched Bloom Filters: Allowing Networked Applications to Trade Off Selected False Positives Against False Negatives
+ */
+public final class RetouchedBloomFilter extends BloomFilter
+implements RemoveScheme {
+ /**
+ * KeyList vector (or ElementList Vector, as defined in the paper) of false positives.
+ */
+ List
+ * Invariant: if the false positive is The best hash table sizes are powers of 2. There is no need to do mod
+ * a prime (mod is sooo slow!). If you need less than 32 bits, use a bitmask.
+ * For example, if you need only 10 bits, do
+ * If you are hashing n strings byte[][] k, do it like this:
+ * for (int i = 0, h = 0; i < n; ++i) h = hash( k[i], h);
+ *
+ * By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this
+ * code any way you wish, private, educational, or commercial. It's free.
+ *
+ * Use for hash table lookup, or anything where one collision in 2^^32 is
+ * acceptable. Do NOT use for cryptographic purposes.
+ */
+ @SuppressWarnings("fallthrough")
+ public int hash(byte[] key, int nbytes, int initval) {
+ // 'length' counts the bytes still to be consumed; 'offset' (declared below) is the read position in key.
+ int length = nbytes;
+ long a, b, c; // We use longs because we don't have unsigned ints
+ // All three accumulators start from the same seed: 0xdeadbeef + length + initval, truncated to 32 bits.
+ a = b = c = (0x00000000deadbeefL + length + initval) & INT_MASK;
+ int offset = 0;
+ // Main loop: while more than 12 bytes remain, add 12 bytes (4 per accumulator, least
+ // significant byte first) into a, b and c, then mix. Every sum is masked back to 32 bits.
+ for (; length > 12; offset += 12, length -= 12) {
+ a = (a + (key[offset + 0] & BYTE_MASK)) & INT_MASK;
+ a = (a + (((key[offset + 1] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ a = (a + (((key[offset + 2] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ a = (a + (((key[offset + 3] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ b = (b + (key[offset + 4] & BYTE_MASK)) & INT_MASK;
+ b = (b + (((key[offset + 5] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ b = (b + (((key[offset + 6] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ b = (b + (((key[offset + 7] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ c = (c + (key[offset + 8] & BYTE_MASK)) & INT_MASK;
+ c = (c + (((key[offset + 9] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ c = (c + (((key[offset + 10] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ c = (c + (((key[offset + 11] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+
+ /*
+ * mix -- mix 3 32-bit values reversibly.
+ * This is reversible, so any information in (a,b,c) before mix() is
+ * still in (a,b,c) after mix().
+ *
+ * If four pairs of (a,b,c) inputs are run through mix(), or through
+ * mix() in reverse, there are at least 32 bits of the output that
+ * are sometimes the same for one pair and different for another pair.
+ *
+ * This was tested for:
+ * - pairs that differed by one bit, by two bits, in any combination
+ * of top bits of (a,b,c), or in any combination of bottom bits of
+ * (a,b,c).
+ * - "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
+ * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+ * is commonly produced by subtraction) look like a single 1-bit
+ * difference.
+ * - the base values were pseudorandom, all zero but one bit set, or
+ * all zero plus a counter that starts at zero.
+ *
+ * Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+ * satisfy this are
+ * 4 6 8 16 19 4
+ * 9 15 3 18 27 15
+ * 14 9 3 7 17 3
+ * Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing for
+ * "differ" defined as + with a one-bit base and a two-bit delta. I
+ * used http://burtleburtle.net/bob/hash/avalanche.html to choose
+ * the operations, constants, and arrangements of the variables.
+ *
+ * This does not achieve avalanche. There are input bits of (a,b,c)
+ * that fail to affect some output bits of (a,b,c), especially of a.
+ * The most thoroughly mixed value is c, but it doesn't really even
+ * achieve avalanche in c.
+ *
+ * This allows some parallelism. Read-after-writes are good at doubling
+ * the number of bits affected, so the goal of mixing pulls in the
+ * opposite direction as the goal of parallelism. I did what I could.
+ * Rotates seem to cost as much as shifts on every machine I could lay
+ * my hands on, and rotates are much kinder to the top and bottom bits,
+ * so I used rotates.
+ *
+ * #define mix(a,b,c) \
+ * { \
+ * a -= c; a ^= rot(c, 4); c += b; \
+ * b -= a; b ^= rot(a, 6); a += c; \
+ * c -= b; c ^= rot(b, 8); b += a; \
+ * a -= c; a ^= rot(c,16); c += b; \
+ * b -= a; b ^= rot(a,19); a += c; \
+ * c -= b; c ^= rot(b, 4); b += a; \
+ * }
+ *
+ * mix(a,b,c);
+ */
+ // Inlined form of the mix() macro quoted above; each subtraction/addition is masked to 32 bits.
+ a = (a - c) & INT_MASK; a ^= rot(c, 4); c = (c + b) & INT_MASK;
+ b = (b - a) & INT_MASK; b ^= rot(a, 6); a = (a + c) & INT_MASK;
+ c = (c - b) & INT_MASK; c ^= rot(b, 8); b = (b + a) & INT_MASK;
+ a = (a - c) & INT_MASK; a ^= rot(c,16); c = (c + b) & INT_MASK;
+ b = (b - a) & INT_MASK; b ^= rot(a,19); a = (a + c) & INT_MASK;
+ c = (c - b) & INT_MASK; c ^= rot(b, 4); b = (b + a) & INT_MASK;
+ }
+
+ //-------------------------------- last block: affect all 32 bits of (c)
+ // At most 12 bytes are left at this point; entering at case 'length' adds the
+ // highest remaining byte first and falls through down to the byte at 'offset'.
+ switch (length) { // all the case statements fall through
+ case 12:
+ c = (c + (((key[offset + 11] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ case 11:
+ c = (c + (((key[offset + 10] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ case 10:
+ c = (c + (((key[offset + 9] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ case 9:
+ c = (c + (key[offset + 8] & BYTE_MASK)) & INT_MASK;
+ case 8:
+ b = (b + (((key[offset + 7] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ case 7:
+ b = (b + (((key[offset + 6] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ case 6:
+ b = (b + (((key[offset + 5] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ case 5:
+ b = (b + (key[offset + 4] & BYTE_MASK)) & INT_MASK;
+ case 4:
+ a = (a + (((key[offset + 3] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ case 3:
+ a = (a + (((key[offset + 2] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ case 2:
+ a = (a + (((key[offset + 1] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ case 1:
+ a = (a + (key[offset + 0] & BYTE_MASK)) & INT_MASK;
+ break;
+ // Nothing left to add (nbytes was 0): return the seeded c directly, skipping the final mix.
+ case 0:
+ return (int)(c & INT_MASK);
+ }
+ /*
+ * final -- final mixing of 3 32-bit values (a,b,c) into c
+ *
+ * Pairs of (a,b,c) values differing in only a few bits will usually
+ * produce values of c that look totally different. This was tested for
+ * - pairs that differed by one bit, by two bits, in any combination
+ * of top bits of (a,b,c), or in any combination of bottom bits of
+ * (a,b,c).
+ *
+ * - "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
+ * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+ * is commonly produced by subtraction) look like a single 1-bit
+ * difference.
+ *
+ * - the base values were pseudorandom, all zero but one bit set, or
+ * all zero plus a counter that starts at zero.
+ *
+ * These constants passed:
+ * 14 11 25 16 4 14 24
+ * 12 14 25 16 4 14 24
+ * and these came close:
+ * 4 8 15 26 3 22 24
+ * 10 8 15 26 3 22 24
+ * 11 8 15 26 3 22 24
+ *
+ * #define final(a,b,c) \
+ * {
+ * c ^= b; c -= rot(b,14); \
+ * a ^= c; a -= rot(c,11); \
+ * b ^= a; b -= rot(a,25); \
+ * c ^= b; c -= rot(b,16); \
+ * a ^= c; a -= rot(c,4); \
+ * b ^= a; b -= rot(a,14); \
+ * c ^= b; c -= rot(b,24); \
+ * }
+ *
+ */
+ // Inlined form of the final() macro quoted above, with each subtraction masked to 32 bits.
+ c ^= b; c = (c - rot(b,14)) & INT_MASK;
+ a ^= c; a = (a - rot(c,11)) & INT_MASK;
+ b ^= a; b = (b - rot(a,25)) & INT_MASK;
+ c ^= b; c = (c - rot(b,16)) & INT_MASK;
+ a ^= c; a = (a - rot(c,4)) & INT_MASK;
+ b ^= a; b = (b - rot(a,14)) & INT_MASK;
+ c ^= b; c = (c - rot(b,24)) & INT_MASK;
+
+ // The hash is the low 32 bits of c, narrowed to a (possibly negative) int.
+ return (int)(c & INT_MASK);
+ }
+
+ /**
+  * Compute the hash of the specified file and print its absolute value on
+  * standard output.
+  * <p>
+  * The file is read in chunks of at most 512 bytes; the hash of each chunk is
+  * used as the seed for the next one. The stream is always closed, even when
+  * reading fails.
+  * @param args name of file to compute hash of.
+  * @throws IOException if the file cannot be opened or read.
+  */
+ public static void main(String[] args) throws IOException {
+   if (args.length != 1) {
+     System.err.println("Usage: JenkinsHash filename");
+     System.exit(-1);
+   }
+   FileInputStream in = new FileInputStream(args[0]);
+   try {
+     byte[] bytes = new byte[512];
+     int value = 0;
+     JenkinsHash hash = new JenkinsHash();
+     // chain the chunks: the running hash seeds the hash of the next chunk
+     for (int length = in.read(bytes); length > 0; length = in.read(bytes)) {
+       value = hash.hash(bytes, length, value);
+     }
+     System.out.println(Math.abs(value));
+   } finally {
+     // release the file handle whether or not reading succeeded
+     in.close();
+   }
+ }
+}
diff --git a/src/org/thrudb/util/hash/MurmurHash.java b/src/org/thrudb/util/hash/MurmurHash.java
new file mode 100644
index 0000000..2e5d410
--- /dev/null
+++ b/src/org/thrudb/util/hash/MurmurHash.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.thrudb.util.hash;
+
+/**
+ * This is a very fast, non-cryptographic hash suitable for general hash-based
+ * lookup. See http://murmurhash.googlepages.com/ for more details.
+ *
+ * The C version of MurmurHash 2.0 found at that site was ported
+ * to Java by Andrzej Bialecki (ab at getopt org).Null
otherwise.
+ */
+ private BloomFilter getActiveStandardBF() {
+   // Once currentNbRecord has reached nr there is no active filter: answer null.
+   // Otherwise the active filter is the last entry of the matrix.
+   return (currentNbRecord >= nr) ? null : matrix[matrix.length - 1];
+ }
+}
diff --git a/src/org/thrudb/util/bloom/Filter.java b/src/org/thrudb/util/bloom/Filter.java
new file mode 100644
index 0000000..45a266e
--- /dev/null
+++ b/src/org/thrudb/util/bloom/Filter.java
@@ -0,0 +1,212 @@
+/**
+ *
+ * Copyright (c) 2005, European Commission project OneLab under contract 034819
+ * (http://www.one-lab.org)
+ *
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ * - Neither the name of the University Catholique de Louvain - UCL
+ * nor the names of its contributors may be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.thrudb.util.bloom;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+
+import org.thrudb.util.hash.Hash;
+
+/**
+ * Defines the general behavior of a filter.
+ * A
. The
+ * key idea is to map entries of A
(also called keys) into several positions
+ * in a vector through the use of several hash functions.
+ * Key
will have a default weight of 1.0
+ */
+ double weight;
+
+ /**
+  * No-argument constructor; performs no initialisation of its own and is
+  * meant to be followed by a call to readFields.
+  */
+ public Key() {
+   // intentionally empty
+ }
+
+ /**
+ * Adds a false positive information to this retouched Bloom filter.
+ * The key must not be null: a NullPointerException is thrown otherwise.
+ * @param key The false positive key to add.
+ */
+ public void addFalsePositive(Key key) {
+   if (key == null) {
+     throw new NullPointerException("key can not be null");
+   }
+
+   // positions computed for the key by the hash function
+   final int[] positions = hash.hash(key);
+   hash.clear();
+
+   // record the key in the false-positive list of each of its first nbHash positions
+   int idx = 0;
+   while (idx < nbHash) {
+     fpVector[positions[idx]].add(key);
+     idx++;
+   }
+ }
+
+ /**
+ * Adds a collection of false positive information to this retouched Bloom filter.
+ * @param coll The collection of false positive.
+ */
+ public void addFalsePositive(Collectionlength
, and
+ * the provided seed value
+ * @param bytes input bytes
+ * @param length length of the valid bytes to consider
+ * @param initval seed value
+ * @return hash value
+ */
+ public abstract int hash(byte[] bytes, int length, int initval);
+}
diff --git a/src/org/thrudb/util/hash/JenkinsHash.java b/src/org/thrudb/util/hash/JenkinsHash.java
new file mode 100644
index 0000000..db1e129
--- /dev/null
+++ b/src/org/thrudb/util/hash/JenkinsHash.java
@@ -0,0 +1,258 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.thrudb.util.hash;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+
+/**
+ * Produces 32-bit hash for hash table lookup.
+ *
+ * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ *
+ * You can use this free for any purpose. It's in the public domain.
+ * It has no warranty.
+ *
+ *
+ * @see lookup3.c
+ * @see Hash Functions (and how this
+ * function compares to others such as CRC, MD?, etc
+ * @see Has update on the
+ * Dr. Dobbs Article
+ */
+public class JenkinsHash extends Hash {
+ private static long INT_MASK = 0x00000000ffffffffL;
+ private static long BYTE_MASK = 0x00000000000000ffL;
+
+ private static JenkinsHash _instance = new JenkinsHash();
+
+ /**
+  * Gives access to the shared instance of this hash function.
+  * @return The <code>JenkinsHash</code> held in the static <code>_instance</code> field.
+  */
+ public static Hash getInstance() {
+   return JenkinsHash._instance;
+ }
+
+ private static long rot(long val, int pos) {
+ return ((Integer.rotateLeft(
+ (int)(val & INT_MASK), pos)) & INT_MASK);
+ }
+
+ /**
+ * taken from hashlittle() -- hash a variable-length key into a 32-bit value
+ *
+ * @param key the key (the unaligned variable-length array of bytes)
+ * @param nbytes number of bytes to include in hash
+ * @param initval can be any integer value
+ * @return a 32-bit value. Every bit of the key affects every bit of the
+ * return value. Two keys differing by one or two bits will have totally
+ * different hash values.
+ *
+ * h = (h & hashmask(10));
+ * In which case, the hash table should have hashsize(10) elements.
+ *
+ *