Skip to content
This repository

adds algebird-hash module, core depends on it. #138

Open
wants to merge 3 commits into from

2 participants

P. Oscar Boykin Sam Ritchie
P. Oscar Boykin
Collaborator

This is API-breaking, so it is not a patch version bump when we publish next.

Count-min is still using its own hash approach. I think we should revisit that, and we should also consider adding something Avi suggested: a HashFamily[T,R] object, which I guess has a Seq[Hashable[T,R]]. The apply on T can return Seq[R]. Comments?

Sam Ritchie
Collaborator

@johnynek, do you want to slate this for 0.3.0?

P. Oscar Boykin johnynek commented on the diff
...c/main/scala/com/twitter/algebird/hash/Hashable.scala
((8 lines not shown))
  8 +http://www.apache.org/licenses/LICENSE-2.0
  9 +
  10 +Unless required by applicable law or agreed to in writing, software
  11 +distributed under the License is distributed on an "AS IS" BASIS,
  12 +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 +See the License for the specific language governing permissions and
  14 +limitations under the License.
  15 +*/
  16 +package com.twitter.algebird.hash
  17 +
  18 +import java.nio.{ByteBuffer, ByteOrder}
  19 +import java.security.MessageDigest
  20 +
  21 +/** Type-class for generic hashing
  22 + */
  23 +trait Hashable[-T,+R] extends (T => R) { self =>
3
P. Oscar Boykin Collaborator
johnynek added a note

don't extend function, creates implicit conversions.

P. Oscar Boykin Collaborator
johnynek added a note

self has caused issues with serialization. Be explicit with val self = this where needed.

P. Oscar Boykin Collaborator
johnynek added a note

I think Hashable32, Hashable64, Hashable128 might be better approaches. There is a false generality here that blurs with Function in a non useful way. This trait is indistinguishable from Function, and that's silly.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Showing 3 unique commits by 1 author.

Mar 08, 2013
P. Oscar Boykin johnynek Checkpoint, REBASE f9ba8fa
Mar 09, 2013
P. Oscar Boykin johnynek Merge branch 'develop' into feature/hash f268210
Mar 11, 2013
P. Oscar Boykin johnynek Update to algebird-hash ffdf525
This page is out of date. Refresh to see the latest.
6 algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala
@@ -19,8 +19,12 @@ package com.twitter.algebird
19 19 import scala.collection.immutable.BitSet
20 20 import scala.collection.JavaConverters._
21 21
  22 +import com.twitter.algebird.hash.Murmur
  23 +
22 24 import com.googlecode.javaewah.{EWAHCompressedBitmap => CBitSet}
23 25
  26 +import java.nio.ByteBuffer
  27 +
24 28 object RichCBitSet {
25 29 def apply(x : Int*) = {
26 30 CBitSet.bitmapOf(x.sorted : _*)
@@ -326,7 +330,7 @@ case class BFHash(numHashes: Int, width: Int, seed: Long = 0L) extends Function1
326 330 Stream.empty
327 331 else{
328 332 val d = if(digested.isEmpty){
329   - val (a, b) = MurmurHash128(k)(bytes)
  333 + val (a, b) = Murmur.hash3(k)(ByteBuffer.wrap(bytes))
330 334 val (x1, x2) = splitLong(a)
331 335 val (x3, x4) = splitLong(b)
332 336 Seq(x1, x2, x3, x4)
6 algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala
@@ -16,10 +16,12 @@ limitations under the License.
16 16
17 17 package com.twitter.algebird
18 18
19   -import scala.collection.BitSet
  19 +import com.twitter.algebird.hash.Murmur
20 20
21 21 import java.nio.ByteBuffer
22 22
  23 +import scala.collection.BitSet
  24 +
23 25 /** Implementation of the HyperLogLog approximate counting as a Monoid
24 26 * @link http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
25 27 *
@@ -29,7 +31,7 @@ import java.nio.ByteBuffer
29 31 object HyperLogLog {
30 32 def hash(input : Array[Byte]) : Array[Byte] = {
31 33 val seed = 12345678
32   - val (l0, l1) = MurmurHash128(seed)(input)
  34 + val (l0, l1) = Murmur.hash3(seed)(ByteBuffer.wrap(input))
33 35 val buf = new Array[Byte](16)
34 36 ByteBuffer
35 37 .wrap(buf)
74 algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala
... ... @@ -1,6 +1,8 @@
1 1 package com.twitter.algebird
2 2
3   -import java.nio._
  3 +import java.nio.ByteBuffer
  4 +
  5 +import com.twitter.algebird.hash.{Hashable, Murmur}
4 6
5 7 /**
6 8 * Instances of MinHasher can create, combine, and compare fixed-sized signatures of
@@ -14,7 +16,7 @@ import java.nio._
14 16 * You can also use a combination of the above to estimate the size of the intersection of
15 17 * two sets from their signatures.
16 18 * The more bytes in the signature, the more accurate all of the above will be.
17   - *
  19 + *
18 20 * You can also use these signatures to quickly find similar sets without doing
19 21 * n^2 comparisons. Each signature is assigned to several buckets; sets whose signatures
20 22 * end up in the same bucket are likely to be similar. The targetThreshold controls
@@ -27,6 +29,8 @@ import java.nio._
27 29 *
28 30 * This implementation is modeled after Chapter 3 of Ullman and Rajaraman's Mining of Massive Datasets:
29 31 * http://infolab.stanford.edu/~ullman/mmds/ch3a.pdf
  32 + *
  33 + * TODO: make a wrapper class, MinHashSignature, so this Monoid is not on a raw type (Array[Byte]).
30 34 **/
31 35 abstract class MinHasher[H](targetThreshold : Double, maxBytes : Int)(implicit n : Numeric[H]) extends Monoid[Array[Byte]] {
32 36 /** the number of bytes used for each hash in the signature */
@@ -44,10 +48,10 @@ abstract class MinHasher[H](targetThreshold : Double, maxBytes : Int)(implicit n
44 48 /** We always use a 128 bit hash function, so the number of hash functions is different
45 49 * (and usually smaller) than the number of hashes in the signature.
46 50 **/
47   - val hashFunctions = {
  51 + val hashFunctions: Seq[Hashable[ByteBuffer, (Long,Long)]] = {
48 52 val r = new scala.util.Random(seed)
49 53 val numHashFunctions = math.ceil(numBytes / 16.0).toInt
50   - (1 to numHashFunctions).map{i => MurmurHash128(r.nextLong)}
  54 + (1 to numHashFunctions).map{i => Murmur.hash3(r.nextLong)}
51 55 }
52 56
53 57 /** Signature for empty set, needed to be a proper Monoid */
@@ -57,34 +61,54 @@ abstract class MinHasher[H](targetThreshold : Double, maxBytes : Int)(implicit n
57 61 def plus(left : Array[Byte], right : Array[Byte]) = {
58 62 buildArray(left, right){(l,r) => n.min(l, r)}
59 63 }
60   -
  64 +
61 65 /** Estimate Jaccard similarity (size of intersection / size of union) */
62 66 def similarity(left : Array[Byte], right : Array[Byte]) = {
63 67 val matching = buildArray(left,right){(l,r) => if(l == r) n.one else n.zero}
64 68 matching.map{_.toDouble}.sum / numHashes
65   - }
  69 + }
66 70
67   - /** Bucket keys to use for quickly finding other similar items via locality sensitive hashing */
68   - def buckets(sig : Array[Byte]) = {
  71 + /** Bucket keys to use for quickly finding other similar items via locality sensitive hashing
  72 + *
  73 + * Check other signatures who collide in at least one bucket for high similarity
  74 + * Put another way: sigs: Iterable[MinHashSignature] =>
  75 + * sigs.flatMap(s => buckets(s).map { (_, s)} )
  76 + * .groupBy { _._1 }
  77 + * .mapValues { candidates: Iterable[(Long, MinHashSignature)] =>
  78 + * val sigs = candidates.view.map { _._2 }
  79 + * for(c1 <- sigs;
  80 + * c2 <- sigs;
  81 + * if Ordering[MinHashSignature].lt(c1, c2)
  82 + * if jaccardSimilarity(c1, c2) >= query)
  83 + * yield (c1, c2)
  84 + * }
  85 + * But you probably want to do the above calculation in map-reduce
  86 + */
  87 + def buckets(sig : Array[Byte]): Seq[Long] = {
69 88 sig.grouped(numRows*hashSize).toList.map{band =>
70   - val (long1, long2) = hashFunctions.head(band)
  89 + val (long1, long2) = hashFunctions.head(ByteBuffer.wrap(band))
71 90 long1
72 91 }
73 92 }
74 93
75 94 /** Create a signature for a single Long value */
76   - def init(value : Long) : Array[Byte] = init{_(value)}
  95 + def init(value : Long) : Array[Byte] = init {
  96 + val buf = ByteBuffer.allocate(8)
  97 + buf.putLong(value)
  98 + buf.rewind
  99 + buf
  100 + }
77 101
78 102 /** Create a signature for a single String value */
79   - def init(value : String) : Array[Byte]= init{_(value)}
  103 + def init(value : String) : Array[Byte]= init(ByteBuffer.wrap(value.getBytes))
80 104
81 105 /** Create a signature for an arbitrary value */
82   - def init(fn : MurmurHash128 => (Long,Long)) : Array[Byte] = {
  106 + def init(serialized: ByteBuffer) : Array[Byte] = {
83 107 val bytes = new Array[Byte](numBytes)
84 108 val buffer = ByteBuffer.allocate(hashFunctions.size * 16)
85 109 val longBuffer = buffer.asLongBuffer
86 110 hashFunctions.foreach{h =>
87   - val (long1, long2) = fn(h)
  111 + val (long1, long2) = h(serialized)
88 112 longBuffer.put(long1)
89 113 longBuffer.put(long2)
90 114 }
@@ -93,10 +117,20 @@ abstract class MinHasher[H](targetThreshold : Double, maxBytes : Int)(implicit n
93 117 bytes
94 118 }
95 119
96   - /** useful for understanding the effects of numBands and numRows */
  120 + /** useful for understanding the effects of numBands and numRows
  121 + *
  122 + * For the LSH, the probability of becoming a candidate is a sigmoid with max
  123 + * slope at this value of jaccard similarity,
  124 + * put another way, this is where d(probabilityOfInclusion(s))/ds is at its max
  125 + */
97 126 val estimatedThreshold = math.pow(1.0/numBands, 1.0/numRows)
98 127
99   - /** useful for understanding the effects of numBands and numRows */
  128 + /** useful for understanding the effects of numBands and numRows
  129 + *
  130 + * This is the probability that two MinHashSignature objects will end up in at least
  131 + * one of the same buckets for consideration GIVEN that their true similarity
  132 + * is sim
  133 + */
100 134 def probabilityOfInclusion(sim : Double) = 1.0 - math.pow(1.0 - math.pow(sim, numRows), numBands)
101 135
102 136 /** numerically solve the inverse of estimatedThreshold, given numBands*numRows */
@@ -119,10 +153,10 @@ abstract class MinHasher[H](targetThreshold : Double, maxBytes : Int)(implicit n
119 153 }
120 154
121 155 class MinHasher32(t : Double, n : Int) extends MinHasher[Int](t,n) {
122   - def hashSize = 4
  156 + def hashSize = 4
123 157 def maxHash = Int.MaxValue
124 158 def buildArray(fn: => Int) : Array[Byte] = {
125   - val byteBuffer = ByteBuffer.allocate(numBytes)
  159 + val byteBuffer = ByteBuffer.allocate(numBytes)
126 160 val writeBuffer = byteBuffer.asIntBuffer
127 161 1.to(numHashes).foreach{i => writeBuffer.put(fn)}
128 162 byteBuffer.array
@@ -143,10 +177,10 @@ class MinHasher32(t : Double, n : Int) extends MinHasher[Int](t,n) {
143 177 }
144 178
145 179 class MinHasher16(t : Double, n : Int) extends MinHasher[Char](t,n) {
146   - def hashSize = 2
  180 + def hashSize = 2
147 181 def maxHash = Char.MaxValue
148 182 def buildArray(fn: => Char) : Array[Byte] = {
149   - val byteBuffer = ByteBuffer.allocate(numBytes)
  183 + val byteBuffer = ByteBuffer.allocate(numBytes)
150 184 val writeBuffer = byteBuffer.asCharBuffer
151 185 1.to(numHashes).foreach{i => writeBuffer.put(fn)}
152 186 byteBuffer.array
@@ -157,4 +191,4 @@ class MinHasher16(t : Double, n : Int) extends MinHasher[Char](t,n) {
157 191 val rightBuffer = ByteBuffer.wrap(right).asCharBuffer
158 192 buildArray{fn(leftBuffer.get, rightBuffer.get)}
159 193 }
160   -}
  194 +}
35 algebird-core/src/main/scala/com/twitter/algebird/MurmurHash.scala
... ... @@ -1,35 +0,0 @@
1   -package com.twitter.algebird
2   -
3   -import java.nio._
4   -
5   -case class MurmurHash128(seed : Long) {
6   - def apply(buffer : ByteBuffer, offset : Int, length : Int) : (Long,Long) = {
7   - val longs = CassandraMurmurHash.hash3_x64_128(buffer, offset, length, seed)
8   - (longs(0), longs(1))
9   - }
10   -
11   - def apply(bytes : Array[Byte]) : (Long, Long) = apply(ByteBuffer.wrap(bytes), 0, bytes.length)
12   - def apply(maxBytes : Int, fn : ByteBuffer => Unit) : (Long, Long) = {
13   - val buffer = ByteBuffer.allocate(maxBytes)
14   - fn(buffer)
15   - apply(buffer, 0, maxBytes)
16   - }
17   - def apply(array : Array[Char]) : (Long, Long) = apply(array.size * 2, {_.asCharBuffer.put(array)})
18   - def apply(array : Array[Short]) : (Long, Long) = apply(array.size * 2, {_.asShortBuffer.put(array)})
19   - def apply(array : Array[Int]) : (Long, Long) = apply(array.size * 4, {_.asIntBuffer.put(array)})
20   - def apply(array : Array[Float]) : (Long, Long) = apply(array.size * 4, {_.asFloatBuffer.put(array)})
21   - def apply(array : Array[Long]) : (Long, Long) = apply(array.size * 8, {_.asLongBuffer.put(array)})
22   - def apply(array : Array[Double]) : (Long, Long) = apply(array.size * 8, {_.asDoubleBuffer.put(array)})
23   -
24   - def apply(value : Char) : (Long, Long)= apply(2, {_.asCharBuffer.put(value)})
25   - def apply(value : Short) : (Long, Long) = apply(2, {_.asShortBuffer.put(value)})
26   - def apply(value : Int) : (Long, Long) = apply(4, {_.asIntBuffer.put(value)})
27   - def apply(value : Float) : (Long, Long) = apply(4, {_.asFloatBuffer.put(value)})
28   - def apply(value : Long) : (Long, Long) = apply(8, {_.asLongBuffer.put(value)})
29   - def apply(value : Double) : (Long, Long) = apply(8, {_.asDoubleBuffer.put(value)})
30   -
31   - def apply(string : CharSequence) : (Long, Long) = apply(string.length * 2, {buffer =>
32   - val charBuffer = buffer.asCharBuffer
33   - 0.to(string.length - 1).foreach{i => charBuffer.put(string.charAt(i))}
34   - })
35   -}
2  ...ava/com/twitter/algebird/CassandraMurmurHash.java → ...ava/com/twitter/algebird/CassandraMurmurHash.java
@@ -15,7 +15,7 @@
15 15 * See the License for the specific language governing permissions and
16 16 * limitations under the License.
17 17 */
18   -package com.twitter.algebird;
  18 +package com.twitter.algebird.hash;
19 19
20 20 import java.nio.ByteBuffer;
21 21
338 algebird-hash/src/main/scala/com/twitter/algebird/hash/Hashable.scala
... ... @@ -0,0 +1,338 @@
  1 +/*
  2 +Copyright 2012 Twitter, Inc.
  3 +
  4 +Licensed under the Apache License, Version 2.0 (the "License");
  5 +you may not use this file except in compliance with the License.
  6 +You may obtain a copy of the License at
  7 +
  8 +http://www.apache.org/licenses/LICENSE-2.0
  9 +
  10 +Unless required by applicable law or agreed to in writing, software
  11 +distributed under the License is distributed on an "AS IS" BASIS,
  12 +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 +See the License for the specific language governing permissions and
  14 +limitations under the License.
  15 +*/
  16 +package com.twitter.algebird.hash
  17 +
  18 +import java.nio.{ByteBuffer, ByteOrder}
  19 +import java.security.MessageDigest
  20 +
  21 +/** Type-class for generic hashing
  22 + */
  23 +trait Hashable[-T,+R] extends (T => R) { self =>
  24 + override def andThen[A](fn : (R) => A): Hashable[T,A] = new Hashable[T,A] {
  25 + override def apply(t: T) = fn(self.apply(t))
  26 + }
  27 + override def compose[A](fn : (A) => T): Hashable[A,R] = new Hashable[A,R] {
  28 + override def apply(a: A) = self.apply(fn(a))
  29 + }
  30 +}
  31 +
  32 +trait LowPriorityHashable {
  33 + // XOR the high 32 bits into the low to get a int:
  34 + implicit def toInt[T](implicit h: Hashable[T,Long]): Hashable[T,Int] =
  35 + h.andThen { long => (long>>32).toInt ^ long.toInt }
  36 +
  37 + implicit def toIntFromLongLong[T](implicit h: Hashable[T,(Long,Long)]): Hashable[T,Int] =
  38 + toInt(toLong(h))
  39 +
  40 + implicit def toLong[T](implicit h: Hashable[T,(Long,Long)]): Hashable[T,Long] =
  41 + h.andThen { tup => tup._1 ^ tup._2 }
  42 +
  43 + // Get the UTF-8 bytes of a string to hash it
  44 + implicit def fromString[T](implicit h: Hashable[Array[Byte],T]): Hashable[String,T] =
  45 + h.compose { s: String => s.getBytes }
  46 +}
  47 +
  48 +object Hashable extends LowPriorityHashable {
  49 + /** Pull the implicit Hashable instance in scope to compute hash for this type.
  50 + *
  51 + * If in your scope, you set:
  52 + * implicit def hasher[T]: Hashable[T,Int] = Hashable.hashCode // Bad choice, just an example
  53 + * you can just call:
  54 + * hash("hey") : Int
  55 + * to get a hashvalue
  56 + */
  57 + def hash[T,R](t: T)(implicit hasher: Hashable[T,R]): R = hasher(t)
  58 +
  59 + // Some standard hashing:
  60 + def hashCode[T]: Hashable[T,Int] = new Hashable[T,Int] { def apply(t: T) = t.hashCode }
  61 +
  62 + private[this] val MaxUnsignedInt: Long = 0xFFFFFFFFL
  63 + /**
  64 + * FNV fast hashing algorithm in 32 bits.
  65 + * @see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
  66 + */
  67 + val FNV1_32 = new Hashable[Array[Byte],Int] {
  68 + def apply(key: Array[Byte]) = {
  69 + val PRIME: Int = 16777619
  70 + var i = 0
  71 + val len = key.length
  72 + var rv: Long = 0x811c9dc5L
  73 + while (i < len) {
  74 + rv = (rv * PRIME) ^ (key(i) & 0xff)
  75 + i += 1
  76 + }
  77 + (rv & MaxUnsignedInt).toInt
  78 + }
  79 +
  80 + override def toString = "FNV1_32"
  81 + }
  82 +
  83 + /**
  84 + * FNV fast hashing algorithm in 32 bits, variant with operations reversed.
  85 + * @see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
  86 + */
  87 + val FNV1A_32 = new Hashable[Array[Byte],Int] {
  88 + def apply(key: Array[Byte]): Int = {
  89 + val PRIME: Int = 16777619
  90 + var i = 0
  91 + val len = key.length
  92 + var rv: Long = 0x811c9dc5L
  93 + while (i < len) {
  94 + rv = (rv ^ (key(i) & 0xff)) * PRIME
  95 + i += 1
  96 + }
  97 + (rv & MaxUnsignedInt).toInt
  98 + }
  99 +
  100 + override def toString = "FNV1A_32"
  101 + }
  102 +
  103 + /**
  104 + * FNV fast hashing algorithm in 64 bits.
  105 + * @see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
  106 + */
  107 + val FNV1_64 = new Hashable[Array[Byte],Long] {
  108 + def apply(key: Array[Byte]): Long = {
  109 + val PRIME: Long = 1099511628211L
  110 + var i = 0
  111 + val len = key.length
  112 + var rv: Long = 0xcbf29ce484222325L
  113 + while (i < len) {
  114 + rv = (rv * PRIME) ^ (key(i) & 0xff)
  115 + i += 1
  116 + }
  117 + rv
  118 + }
  119 +
  120 + override def toString = "FNV1_64"
  121 + }
  122 +
  123 + /**
  124 + * FNV fast hashing algorithm in 64 bits, variant with operations reversed.
  125 + * @see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
  126 + */
  127 + val FNV1A_64 = new Hashable[Array[Byte],Long] {
  128 + def apply(key: Array[Byte]): Long = {
  129 + val PRIME: Long = 1099511628211L
  130 + var i = 0
  131 + val len = key.length
  132 + var rv: Long = 0xcbf29ce484222325L
  133 + while (i < len) {
  134 + rv = (rv ^ (key(i) & 0xff)) * PRIME
  135 + i += 1
  136 + }
  137 + rv
  138 + }
  139 +
  140 + override def toString = "FNV1A_64"
  141 + }
  142 +
  143 + /**
  144 + * Ketama's default hash algorithm: the first 4 bytes of the MD5 as a little-endian int.
  145 + * Wow, really? Who thought that was a good way to do it? :(
  146 + */
  147 + val MD5_LEInt = new Hashable[Array[Byte],Int] {
  148 + def apply(key: Array[Byte]): Int = {
  149 + val buffer = ByteBuffer.wrap(MD5(key))
  150 + buffer.order(ByteOrder.LITTLE_ENDIAN)
  151 + buffer.getInt
  152 + }
  153 + override def toString = "Ketama"
  154 + }
  155 + /** MD5 hash, not very fast, but needed for some compatibility
  156 + */
  157 + lazy val MD5 = new Hashable[Array[Byte],Array[Byte]] {
  158 + def apply(key: Array[Byte]) = {
  159 + val hasher = MessageDigest.getInstance("MD5")
  160 + hasher.update(key)
  161 + hasher.digest
  162 + }
  163 + override def toString = "MD5"
  164 + }
  165 + val MD5LongLong = new Hashable[Array[Byte], (Long,Long)] {
  166 + def apply(key: Array[Byte]) = {
  167 + val buffer = ByteBuffer.wrap(MD5(key))
  168 + (buffer.getLong, buffer.getLong)
  169 + }
  170 + override def toString = "MD5LongLong"
  171 + }
  172 +
  173 + /**
  174 + * The default memcache hash algorithm is the ITU-T variant of CRC-32.
  175 + */
  176 + val CRC32_ITU = new Hashable[Array[Byte],Int] {
  177 + def apply(key: Array[Byte]): Int = {
  178 + var i = 0
  179 + val len = key.length
  180 + var rv: Long = MaxUnsignedInt
  181 + while (i < len) {
  182 + rv = rv ^ (key(i) & 0xff)
  183 + var j = 0
  184 + while (j < 8) {
  185 + if ((rv & 1) != 0) {
  186 + rv = (rv >> 1) ^ 0xedb88320L
  187 + } else {
  188 + rv >>= 1
  189 + }
  190 + j += 1
  191 + }
  192 + i += 1
  193 + }
  194 + ((rv ^ MaxUnsignedInt) & MaxUnsignedInt).toInt
  195 + }
  196 +
  197 + override def toString = "CRC32_ITU"
  198 + }
  199 +
  200 + /**
  201 + * Paul Hsieh's hash function.
  202 + * http://www.azillionmonkeys.com/qed/hash.html
  203 + */
  204 + val HSIEH = new Hashable[Array[Byte],Int] {
  205 + override def apply(key: Array[Byte]): Int = {
  206 + var hash: Int = 0
  207 +
  208 + if (key.isEmpty)
  209 + return 0
  210 +
  211 + for (i <- 0 until key.length / 4) {
  212 + val b0 = key(i*4)
  213 + val b1 = key(i*4 + 1)
  214 + val b2 = key(i*4 + 2)
  215 + val b3 = key(i*4 + 3)
  216 + val s0 = (b1 << 8) | b0
  217 + val s1 = (b3 << 8) | b2
  218 +
  219 + hash += s0
  220 + val tmp = (s1 << 11) ^ hash
  221 + hash = (hash << 16) ^ tmp
  222 + hash += hash >>> 11
  223 + }
  224 +
  225 + val rem = key.length % 4
  226 + val offset = key.length - rem
  227 + rem match {
  228 + case 3 =>
  229 + val b0 = key(offset)
  230 + val b1 = key(offset + 1)
  231 + val b2 = key(offset + 2)
  232 + val s0 = b1 << 8 | b0
  233 + hash += s0
  234 + hash ^= hash << 16
  235 + hash ^= b2 << 18
  236 + hash += hash >>> 11
  237 + case 2 =>
  238 + val b0 = key(offset)
  239 + val b1 = key(offset + 1)
  240 + val s0 = b1 << 8 | b0
  241 + hash += s0
  242 + hash ^= hash << 11
  243 + hash += hash >>> 17
  244 + case 1 =>
  245 + val b0 = key(offset)
  246 + hash += b0
  247 + hash ^= hash << 10
  248 + hash += hash >>> 1
  249 + case 0 => ()
  250 + }
  251 +
  252 + hash ^= hash << 3
  253 + hash += hash >>> 5
  254 + hash ^= hash << 4
  255 + hash += hash >>> 17
  256 + hash ^= hash << 25
  257 + hash += hash >>> 6
  258 +
  259 + hash
  260 + }
  261 +
  262 + override def toString = "Hsieh"
  263 + }
  264 +
  265 + /**
  266 + * Jenkins Hash Function
  267 + * http://en.wikipedia.org/wiki/Jenkins_hash_function
  268 + */
  269 + val JENKINS = new Hashable[Array[Byte],Long] {
  270 + override def apply(key: Array[Byte]): Long = {
  271 + var a, b, c = 0xdeadbeef + key.size
  272 +
  273 + def rot(x: Int, k: Int) = (((x) << (k)) | ((x) >> (32 - (k))))
  274 +
  275 + def mix() {
  276 + a -= c; a ^= rot(c, 4); c += b
  277 + b -= a; b ^= rot(a, 6); a += c
  278 + c -= b; c ^= rot(b, 8); b += a
  279 + a -= c; a ^= rot(c, 16); c += b
  280 + b -= a; b ^= rot(a, 19); a += c
  281 + c -= b; c ^= rot(b, 4); b += a
  282 + }
  283 +
  284 + def fin() {
  285 + c ^= b; c -= rot(b, 14); a ^= c; a -= rot(c, 11)
  286 + b ^= a; b -= rot(a, 25); c ^= b; c -= rot(b, 16)
  287 + a ^= c; a -= rot(c, 4); b ^= a; b -= rot(a, 14)
  288 + c ^= b; c -= rot(b, 24)
  289 + }
  290 +
  291 + var block = 0
  292 + val numBlocks = (key.size - 1) / 12
  293 + while (block < numBlocks) {
  294 + val offset = block * 12
  295 + a += key(offset)
  296 + a += key(offset + 1) << 8
  297 + a += key(offset + 2) << 16
  298 + a += key(offset + 3) << 24
  299 +
  300 + b += key(offset + 4)
  301 + b += key(offset + 5) << 8
  302 + b += key(offset + 6) << 16
  303 + b += key(offset + 7) << 24
  304 +
  305 + c += key(offset + 8)
  306 + c += key(offset + 9) << 8
  307 + c += key(offset + 10) << 16
  308 + c += key(offset + 11) << 24
  309 +
  310 + mix()
  311 + block += 1
  312 + }
  313 +
  314 + val remaining = key.size - (numBlocks * 12)
  315 + val offset = numBlocks * 12
  316 +
  317 + if (remaining > 0) a += key(offset)
  318 + if (remaining > 1) a += key(offset + 1) << 8
  319 + if (remaining > 2) a += key(offset + 2) << 16
  320 + if (remaining > 3) a += key(offset + 3) << 24
  321 +
  322 + if (remaining > 4) b += key(offset + 4)
  323 + if (remaining > 5) b += key(offset + 5) << 8
  324 + if (remaining > 6) b += key(offset + 6) << 16
  325 + if (remaining > 7) b += key(offset + 7) << 24
  326 +
  327 + if (remaining > 8) c += key(offset + 8)
  328 + if (remaining > 9) c += key(offset + 9) << 8
  329 + if (remaining > 10) c += key(offset + 10) << 16
  330 + if (remaining > 11) c += key(offset + 11) << 24
  331 +
  332 + if (key.size > 0) fin()
  333 +
  334 + (b.toLong << 32) + c.toLong
  335 + }
  336 + }
  337 +
  338 +}
38 algebird-hash/src/main/scala/com/twitter/algebird/hash/MurmurHash.scala
... ... @@ -0,0 +1,38 @@
  1 +package com.twitter.algebird.hash
  2 +
  3 +import java.nio.ByteBuffer
  4 +
  5 +object Murmur {
  6 + val defaultSeed = 0L // There doesn't appear to be any risk of choosing any particular seed
  7 + // Compiler should remove these, just here for consistency:
  8 +
  9 + /** Murmur2 32 bits. Takes a long seed for consistency with the others.
  10 + */
  11 + def hash2_32(data: ByteBuffer, offset: Int, length: Int, seed: Long): Int =
  12 + CassandraMurmurHash.hash32(data, offset, length, (seed >> 32).toInt ^ (seed.toInt))
  13 +
  14 + def hash2_64(data: ByteBuffer, offset: Int, length: Int, seed: Long): Long =
  15 + CassandraMurmurHash.hash2_64(data, offset, length, seed)
  16 +
  17 + def hash3_128(buffer: ByteBuffer, offset : Int, length : Int, seed: Long): (Long,Long) = {
  18 + val longs = CassandraMurmurHash.hash3_x64_128(buffer, offset, length, seed)
  19 + (longs(0), longs(1))
  20 + }
  21 + /** Returns a hashable that hashes the entire ByteBuffer, from position to limit
  22 + */
  23 + def hash2Int(seed: Long = defaultSeed): Hashable[ByteBuffer, Int] =
  24 + new Hashable[ByteBuffer, Int] {
  25 + def apply(input: ByteBuffer) =
  26 + Murmur.hash2_32(input, input.position, input.limit, seed)
  27 + }
  28 + def hash2Long(seed: Long = defaultSeed): Hashable[ByteBuffer, Long] =
  29 + new Hashable[ByteBuffer, Long] {
  30 + def apply(input: ByteBuffer) =
  31 + Murmur.hash2_64(input, input.position, input.limit, seed)
  32 + }
  33 + def hash3(seed: Long = defaultSeed): Hashable[ByteBuffer,(Long,Long)] =
  34 + new Hashable[ByteBuffer, (Long,Long)] {
  35 + def apply(input: ByteBuffer) =
  36 + Murmur.hash3_128(input, input.position, input.limit, seed)
  37 + }
  38 +}
13 project/Build.scala
@@ -83,8 +83,9 @@ object AlgebirdBuild extends Build {
83 83 ).settings(
84 84 test := { }, // All tests reside in algebirdTest
85 85 name := "algebird-core",
86   - libraryDependencies += "com.googlecode.javaewah" % "JavaEWAH" % "0.6.6"
87   - )
  86 + libraryDependencies ++= Seq("com.googlecode.javaewah" % "JavaEWAH" % "0.6.6",
  87 + "com.twitter" %% "bijection-core" % "0.3.0")
  88 + ).dependsOn(algebirdHash)
88 89
89 90 lazy val algebirdTest = Project(
90 91 id = "algebird-test",
@@ -106,4 +107,12 @@ object AlgebirdBuild extends Build {
106 107 name := "algebird-util",
107 108 libraryDependencies += "com.twitter" %% "util-core" % "6.2.0"
108 109 ).dependsOn(algebirdCore, algebirdTest % "compile->test")
  110 +
  111 + lazy val algebirdHash = Project(
  112 + id = "algebird-hash",
  113 + base = file("algebird-hash"),
  114 + settings = sharedSettings
  115 + ).settings(
  116 + name := "algebird-hash"
  117 + )
109 118 }

Tip: You can add notes to lines in a file. Hover to the left of a line to make a note

Something went wrong with that request. Please try again.