Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Loading…

adds algebird-hash module, core depends on it. #138

Open
wants to merge 3 commits into from

2 participants

P. Oscar Boykin Sam Ritchie
P. Oscar Boykin
Collaborator

This is API breaking, so it is not a patch version bump when we publish next.

Count-min is still using its own hash approach. I think we should revisit that, and we should also consider adding something Avi suggested: a HashFamily[T,R] object, which I guess has a Seq[Hashable[T,R]]. The apply on T can return Seq[R]. Comments?

Sam Ritchie
Collaborator

@johnynek, do you want to slate this for 0.3.0?

P. Oscar Boykin johnynek commented on the diff
...c/main/scala/com/twitter/algebird/hash/Hashable.scala
((8 lines not shown))
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+package com.twitter.algebird.hash
+
+import java.nio.{ByteBuffer, ByteOrder}
+import java.security.MessageDigest
+
+/** Type-class for generic hashing
+ */
+trait Hashable[-T,+R] extends (T => R) { self =>
P. Oscar Boykin Collaborator

don't extend function, creates implicit conversions.

P. Oscar Boykin Collaborator

self has caused issues with serialization. Be explicit with val self = this where needed.

P. Oscar Boykin Collaborator

I think Hashable32, Hashable64, Hashable128 might be better approaches. There is a false generality here that blurs with Function in a non useful way. This trait is indistinguishable from Function, and that's silly.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Mar 8, 2013
  1. P. Oscar Boykin

    Checkpoint, REBASE

    johnynek authored
Commits on Mar 9, 2013
  1. P. Oscar Boykin
Commits on Mar 11, 2013
  1. P. Oscar Boykin

    Update to algebird-hash

    johnynek authored
This page is out of date. Refresh to see the latest.
6 algebird-core/src/main/scala/com/twitter/algebird/BloomFilter.scala
View
@@ -19,8 +19,12 @@ package com.twitter.algebird
import scala.collection.immutable.BitSet
import scala.collection.JavaConverters._
+import com.twitter.algebird.hash.Murmur
+
import com.googlecode.javaewah.{EWAHCompressedBitmap => CBitSet}
+import java.nio.ByteBuffer
+
object RichCBitSet {
def apply(x : Int*) = {
CBitSet.bitmapOf(x.sorted : _*)
@@ -326,7 +330,7 @@ case class BFHash(numHashes: Int, width: Int, seed: Long = 0L) extends Function1
Stream.empty
else{
val d = if(digested.isEmpty){
- val (a, b) = MurmurHash128(k)(bytes)
+ val (a, b) = Murmur.hash3(k)(ByteBuffer.wrap(bytes))
val (x1, x2) = splitLong(a)
val (x3, x4) = splitLong(b)
Seq(x1, x2, x3, x4)
6 algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala
View
@@ -16,10 +16,12 @@ limitations under the License.
package com.twitter.algebird
-import scala.collection.BitSet
+import com.twitter.algebird.hash.Murmur
import java.nio.ByteBuffer
+import scala.collection.BitSet
+
/** Implementation of the HyperLogLog approximate counting as a Monoid
* @link http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
*
@@ -29,7 +31,7 @@ import java.nio.ByteBuffer
object HyperLogLog {
def hash(input : Array[Byte]) : Array[Byte] = {
val seed = 12345678
- val (l0, l1) = MurmurHash128(seed)(input)
+ val (l0, l1) = Murmur.hash3(seed)(ByteBuffer.wrap(input))
val buf = new Array[Byte](16)
ByteBuffer
.wrap(buf)
74 algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala
View
@@ -1,6 +1,8 @@
package com.twitter.algebird
-import java.nio._
+import java.nio.ByteBuffer
+
+import com.twitter.algebird.hash.{Hashable, Murmur}
/**
* Instances of MinHasher can create, combine, and compare fixed-sized signatures of
@@ -14,7 +16,7 @@ import java.nio._
* You can also use a combination of the above to estimate the size of the intersection of
* two sets from their signatures.
* The more bytes in the signature, the more accurate all of the above will be.
- *
+ *
* You can also use these signatures to quickly find similar sets without doing
* n^2 comparisons. Each signature is assigned to several buckets; sets whose signatures
* end up in the same bucket are likely to be similar. The targetThreshold controls
@@ -27,6 +29,8 @@ import java.nio._
*
* This implementation is modeled after Chapter 3 of Ullman and Rajaraman's Mining of Massive Datasets:
* http://infolab.stanford.edu/~ullman/mmds/ch3a.pdf
+ *
+ * TODO: make a wrapper class, MinHashSignature, so this Monoid is not on a raw type (Array[Byte]).
**/
abstract class MinHasher[H](targetThreshold : Double, maxBytes : Int)(implicit n : Numeric[H]) extends Monoid[Array[Byte]] {
/** the number of bytes used for each hash in the signature */
@@ -44,10 +48,10 @@ abstract class MinHasher[H](targetThreshold : Double, maxBytes : Int)(implicit n
/** We always use a 128 bit hash function, so the number of hash functions is different
* (and usually smaller) than the number of hashes in the signature.
**/
- val hashFunctions = {
+ val hashFunctions: Seq[Hashable[ByteBuffer, (Long,Long)]] = {
val r = new scala.util.Random(seed)
val numHashFunctions = math.ceil(numBytes / 16.0).toInt
- (1 to numHashFunctions).map{i => MurmurHash128(r.nextLong)}
+ (1 to numHashFunctions).map{i => Murmur.hash3(r.nextLong)}
}
/** Signature for empty set, needed to be a proper Monoid */
@@ -57,34 +61,54 @@ abstract class MinHasher[H](targetThreshold : Double, maxBytes : Int)(implicit n
def plus(left : Array[Byte], right : Array[Byte]) = {
buildArray(left, right){(l,r) => n.min(l, r)}
}
-
+
/** Estimate Jaccard similarity (size of intersection / size of union) */
def similarity(left : Array[Byte], right : Array[Byte]) = {
val matching = buildArray(left,right){(l,r) => if(l == r) n.one else n.zero}
matching.map{_.toDouble}.sum / numHashes
- }
+ }
- /** Bucket keys to use for quickly finding other similar items via locality sensitive hashing */
- def buckets(sig : Array[Byte]) = {
+ /** Bucket keys to use for quickly finding other similar items via locality sensitive hashing
+ *
+ * Check other signatures who collide in at least one bucket for high similarity
+ * Put another way: sigs: Iterable[MinHashSignature] =>
+ * sigs.flatMap(s => buckets(s).map { (_, s)} )
+ * .groupBy { _._1 }
+ * .mapValues { candidates: Iterable[(Long, MinHashSignature)] =>
+ * val sigs = candidates.view.map { _._2 }
+ * for(c1 <- sigs;
+ * c2 <- sigs;
+ * if Ordering[MinHashSignature].lt(c1, c2)
+ * if jaccardSimilarity(c1, c2) >= query)
+ * yield (c1, c2)
+ * }
+ * But you probably want to do the above calculation in map-reduce
+ */
+ def buckets(sig : Array[Byte]): Seq[Long] = {
sig.grouped(numRows*hashSize).toList.map{band =>
- val (long1, long2) = hashFunctions.head(band)
+ val (long1, long2) = hashFunctions.head(ByteBuffer.wrap(band))
long1
}
}
/** Create a signature for a single Long value */
- def init(value : Long) : Array[Byte] = init{_(value)}
+ def init(value : Long) : Array[Byte] = init {
+ val buf = ByteBuffer.allocate(8)
+ buf.putLong(value)
+ buf.rewind
+ buf
+ }
/** Create a signature for a single String value */
- def init(value : String) : Array[Byte]= init{_(value)}
+ def init(value : String) : Array[Byte]= init(ByteBuffer.wrap(value.getBytes))
/** Create a signature for an arbitrary value */
- def init(fn : MurmurHash128 => (Long,Long)) : Array[Byte] = {
+ def init(serialized: ByteBuffer) : Array[Byte] = {
val bytes = new Array[Byte](numBytes)
val buffer = ByteBuffer.allocate(hashFunctions.size * 16)
val longBuffer = buffer.asLongBuffer
hashFunctions.foreach{h =>
- val (long1, long2) = fn(h)
+ val (long1, long2) = h(serialized)
longBuffer.put(long1)
longBuffer.put(long2)
}
@@ -93,10 +117,20 @@ abstract class MinHasher[H](targetThreshold : Double, maxBytes : Int)(implicit n
bytes
}
- /** useful for understanding the effects of numBands and numRows */
+ /** useful for understanding the effects of numBands and numRows
+ *
+ * For the LSH, the probability of becoming a candidate is a sigmoid with max
+ * slope at this value of jaccard similarity,
+ * put another way, this is where d(probabilityOfInclusion(s))/ds is at its max
+ */
val estimatedThreshold = math.pow(1.0/numBands, 1.0/numRows)
- /** useful for understanding the effects of numBands and numRows */
+ /** useful for understanding the effects of numBands and numRows
+ *
+ * This is the probability that two MinHashSignature objects will end up in at least
+ * one of the same buckets for consideration GIVEN that their true similarity
+ * is sim
+ */
def probabilityOfInclusion(sim : Double) = 1.0 - math.pow(1.0 - math.pow(sim, numRows), numBands)
/** numerically solve the inverse of estimatedThreshold, given numBands*numRows */
@@ -119,10 +153,10 @@ abstract class MinHasher[H](targetThreshold : Double, maxBytes : Int)(implicit n
}
class MinHasher32(t : Double, n : Int) extends MinHasher[Int](t,n) {
- def hashSize = 4
+ def hashSize = 4
def maxHash = Int.MaxValue
def buildArray(fn: => Int) : Array[Byte] = {
- val byteBuffer = ByteBuffer.allocate(numBytes)
+ val byteBuffer = ByteBuffer.allocate(numBytes)
val writeBuffer = byteBuffer.asIntBuffer
1.to(numHashes).foreach{i => writeBuffer.put(fn)}
byteBuffer.array
@@ -143,10 +177,10 @@ class MinHasher32(t : Double, n : Int) extends MinHasher[Int](t,n) {
}
class MinHasher16(t : Double, n : Int) extends MinHasher[Char](t,n) {
- def hashSize = 2
+ def hashSize = 2
def maxHash = Char.MaxValue
def buildArray(fn: => Char) : Array[Byte] = {
- val byteBuffer = ByteBuffer.allocate(numBytes)
+ val byteBuffer = ByteBuffer.allocate(numBytes)
val writeBuffer = byteBuffer.asCharBuffer
1.to(numHashes).foreach{i => writeBuffer.put(fn)}
byteBuffer.array
@@ -157,4 +191,4 @@ class MinHasher16(t : Double, n : Int) extends MinHasher[Char](t,n) {
val rightBuffer = ByteBuffer.wrap(right).asCharBuffer
buildArray{fn(leftBuffer.get, rightBuffer.get)}
}
-}
+}
35 algebird-core/src/main/scala/com/twitter/algebird/MurmurHash.scala
View
@@ -1,35 +0,0 @@
-package com.twitter.algebird
-
-import java.nio._
-
-case class MurmurHash128(seed : Long) {
- def apply(buffer : ByteBuffer, offset : Int, length : Int) : (Long,Long) = {
- val longs = CassandraMurmurHash.hash3_x64_128(buffer, offset, length, seed)
- (longs(0), longs(1))
- }
-
- def apply(bytes : Array[Byte]) : (Long, Long) = apply(ByteBuffer.wrap(bytes), 0, bytes.length)
- def apply(maxBytes : Int, fn : ByteBuffer => Unit) : (Long, Long) = {
- val buffer = ByteBuffer.allocate(maxBytes)
- fn(buffer)
- apply(buffer, 0, maxBytes)
- }
- def apply(array : Array[Char]) : (Long, Long) = apply(array.size * 2, {_.asCharBuffer.put(array)})
- def apply(array : Array[Short]) : (Long, Long) = apply(array.size * 2, {_.asShortBuffer.put(array)})
- def apply(array : Array[Int]) : (Long, Long) = apply(array.size * 4, {_.asIntBuffer.put(array)})
- def apply(array : Array[Float]) : (Long, Long) = apply(array.size * 4, {_.asFloatBuffer.put(array)})
- def apply(array : Array[Long]) : (Long, Long) = apply(array.size * 8, {_.asLongBuffer.put(array)})
- def apply(array : Array[Double]) : (Long, Long) = apply(array.size * 8, {_.asDoubleBuffer.put(array)})
-
- def apply(value : Char) : (Long, Long)= apply(2, {_.asCharBuffer.put(value)})
- def apply(value : Short) : (Long, Long) = apply(2, {_.asShortBuffer.put(value)})
- def apply(value : Int) : (Long, Long) = apply(4, {_.asIntBuffer.put(value)})
- def apply(value : Float) : (Long, Long) = apply(4, {_.asFloatBuffer.put(value)})
- def apply(value : Long) : (Long, Long) = apply(8, {_.asLongBuffer.put(value)})
- def apply(value : Double) : (Long, Long) = apply(8, {_.asDoubleBuffer.put(value)})
-
- def apply(string : CharSequence) : (Long, Long) = apply(string.length * 2, {buffer =>
- val charBuffer = buffer.asCharBuffer
- 0.to(string.length - 1).foreach{i => charBuffer.put(string.charAt(i))}
- })
-}
2  ...ava/com/twitter/algebird/CassandraMurmurHash.java → ...ava/com/twitter/algebird/CassandraMurmurHash.java
View
@@ -15,7 +15,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package com.twitter.algebird;
+package com.twitter.algebird.hash;
import java.nio.ByteBuffer;
338 algebird-hash/src/main/scala/com/twitter/algebird/hash/Hashable.scala
View
@@ -0,0 +1,338 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+package com.twitter.algebird.hash
+
+import java.nio.{ByteBuffer, ByteOrder}
+import java.security.MessageDigest
+
+/** Type-class for generic hashing
+ */
+trait Hashable[-T,+R] extends (T => R) { self =>
P. Oscar Boykin Collaborator

don't extend function, creates implicit conversions.

P. Oscar Boykin Collaborator

self has caused issues with serialization. Be explicit with val self = this where needed.

P. Oscar Boykin Collaborator

I think Hashable32, Hashable64, Hashable128 might be better approaches. There is a false generality here that blurs with Function in a non useful way. This trait is indistinguishable from Function, and that's silly.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ override def andThen[A](fn : (R) => A): Hashable[T,A] = new Hashable[T,A] {
+ override def apply(t: T) = fn(self.apply(t))
+ }
+ override def compose[A](fn : (A) => T): Hashable[A,R] = new Hashable[A,R] {
+ override def apply(a: A) = self.apply(fn(a))
+ }
+}
+
+/** Implicit derivations that build one Hashable out of another.
+ *
+ * The Hashable companion object extends this trait, so these derivations are
+ * found through the companion's implicit scope.
+ */
+trait LowPriorityHashable {
+  /** Narrows a 64-bit hash to 32 bits by XOR-ing the high 32 bits into the low 32 bits. */
+  implicit def toInt[T](implicit h: Hashable[T,Long]): Hashable[T,Int] =
+    h.andThen { long => (long>>32).toInt ^ long.toInt }
+
+  /** Narrows a 128-bit hash to 32 bits: first to 64 bits with toLong, then to 32 bits with toInt. */
+  implicit def toIntFromLongLong[T](implicit h: Hashable[T,(Long,Long)]): Hashable[T,Int] =
+    toInt(toLong(h))
+
+  /** Narrows a 128-bit hash to 64 bits by XOR-ing the two Long halves together. */
+  implicit def toLong[T](implicit h: Hashable[T,(Long,Long)]): Hashable[T,Long] =
+    h.andThen { tup => tup._1 ^ tup._2 }
+
+  /** Hashes a String through its UTF-8 bytes.
+   *
+   * The charset is named explicitly: a bare getBytes uses the JVM's default
+   * charset, which would make the hash of the same String differ between machines.
+   */
+  implicit def fromString[T](implicit h: Hashable[Array[Byte],T]): Hashable[String,T] =
+    h.compose { s: String => s.getBytes("UTF-8") }
+}
+
+/** Entry point for hashing with an implicit Hashable, plus a collection of
+ * ready-made byte-array hash functions (FNV, MD5-based, CRC32, Hsieh, Jenkins).
+ */
+object Hashable extends LowPriorityHashable {
+  /** Pull the implicit Hashable instance in scope to compute hash for this type.
+   *
+   * If in your scope, you set:
+   *   implicit def hasher[T]: Hashable[T,Int] = Hashable.hashCode // Bad choice, just an example
+   * you can just call:
+   *   hash("hey") : Int
+   * to get a hashvalue
+   */
+  def hash[T,R](t: T)(implicit hasher: Hashable[T,R]): R = hasher(t)
+
+  // Some standard hashing:
+  /** Hashable that simply delegates to the value's own hashCode method. */
+  def hashCode[T]: Hashable[T,Int] = new Hashable[T,Int] { def apply(t: T) = t.hashCode }
+
+  // Mask that keeps only the low 32 bits of a Long.
+  private[this] val MaxUnsignedInt: Long = 0xFFFFFFFFL
+
+  /**
+   * FNV fast hashing algorithm in 32 bits.
+   * @see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
+   */
+  val FNV1_32 = new Hashable[Array[Byte],Int] {
+    def apply(key: Array[Byte]): Int = {
+      val PRIME: Int = 16777619
+      var i = 0
+      val len = key.length
+      // The running value is held in a Long and cut down to its low 32 bits at the end.
+      var rv: Long = 0x811c9dc5L
+      while (i < len) {
+        // multiply first, then XOR in the next byte (taken as unsigned)
+        rv = (rv * PRIME) ^ (key(i) & 0xff)
+        i += 1
+      }
+      (rv & MaxUnsignedInt).toInt
+    }
+
+    override def toString = "FNV1_32"
+  }
+
+  /**
+   * FNV fast hashing algorithm in 32 bits, variant with operations reversed.
+   * @see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
+   */
+  val FNV1A_32 = new Hashable[Array[Byte],Int] {
+    def apply(key: Array[Byte]): Int = {
+      val PRIME: Int = 16777619
+      var i = 0
+      val len = key.length
+      var rv: Long = 0x811c9dc5L
+      while (i < len) {
+        // XOR in the next byte (taken as unsigned) first, then multiply
+        rv = (rv ^ (key(i) & 0xff)) * PRIME
+        i += 1
+      }
+      (rv & MaxUnsignedInt).toInt
+    }
+
+    override def toString = "FNV1A_32"
+  }
+
+  /**
+   * FNV fast hashing algorithm in 64 bits.
+   * @see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
+   */
+  val FNV1_64 = new Hashable[Array[Byte],Long] {
+    def apply(key: Array[Byte]): Long = {
+      val PRIME: Long = 1099511628211L
+      var i = 0
+      val len = key.length
+      var rv: Long = 0xcbf29ce484222325L
+      while (i < len) {
+        // multiply first, then XOR in the next byte (taken as unsigned)
+        rv = (rv * PRIME) ^ (key(i) & 0xff)
+        i += 1
+      }
+      rv
+    }
+
+    override def toString = "FNV1_64"
+  }
+
+  /**
+   * FNV fast hashing algorithm in 64 bits, variant with operations reversed.
+   * @see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
+   */
+  val FNV1A_64 = new Hashable[Array[Byte],Long] {
+    def apply(key: Array[Byte]): Long = {
+      val PRIME: Long = 1099511628211L
+      var i = 0
+      val len = key.length
+      var rv: Long = 0xcbf29ce484222325L
+      while (i < len) {
+        // XOR in the next byte (taken as unsigned) first, then multiply
+        rv = (rv ^ (key(i) & 0xff)) * PRIME
+        i += 1
+      }
+      rv
+    }
+
+    override def toString = "FNV1A_64"
+  }
+
+  /**
+   * Ketama's default hash algorithm: the first 4 bytes of the MD5 as a little-endian int.
+   * Wow, really? Who thought that was a good way to do it? :(
+   */
+  val MD5_LEInt = new Hashable[Array[Byte],Int] {
+    def apply(key: Array[Byte]): Int = {
+      val buffer = ByteBuffer.wrap(MD5(key))
+      buffer.order(ByteOrder.LITTLE_ENDIAN)
+      buffer.getInt
+    }
+    override def toString = "Ketama"
+  }
+
+  /** MD5 hash, not very fast, but needed for some compatibility.
+   *
+   * A new MessageDigest is obtained on every call; no digest state is kept
+   * between calls.
+   */
+  lazy val MD5 = new Hashable[Array[Byte],Array[Byte]] {
+    def apply(key: Array[Byte]): Array[Byte] = {
+      val hasher = MessageDigest.getInstance("MD5")
+      hasher.update(key)
+      hasher.digest
+    }
+    override def toString = "MD5"
+  }
+
+  /** The MD5 digest read as two consecutive Longs in the buffer's default byte order. */
+  val MD5LongLong = new Hashable[Array[Byte], (Long,Long)] {
+    def apply(key: Array[Byte]): (Long, Long) = {
+      val buffer = ByteBuffer.wrap(MD5(key))
+      (buffer.getLong, buffer.getLong)
+    }
+    override def toString = "MD5LongLong"
+  }
+
+  /**
+   * The default memcache hash algorithm is the ITU-T variant of CRC-32.
+   */
+  val CRC32_ITU = new Hashable[Array[Byte],Int] {
+    def apply(key: Array[Byte]): Int = {
+      var i = 0
+      val len = key.length
+      var rv: Long = MaxUnsignedInt
+      while (i < len) {
+        rv = rv ^ (key(i) & 0xff)
+        // process the byte one bit at a time
+        var j = 0
+        while (j < 8) {
+          if ((rv & 1) != 0) {
+            rv = (rv >> 1) ^ 0xedb88320L
+          } else {
+            rv >>= 1
+          }
+          j += 1
+        }
+        i += 1
+      }
+      // final inversion, truncated to 32 bits
+      ((rv ^ MaxUnsignedInt) & MaxUnsignedInt).toInt
+    }
+
+    override def toString = "CRC32_ITU"
+  }
+
+  /**
+   * Paul Hsieh's hash function.
+   * http://www.azillionmonkeys.com/qed/hash.html
+   *
+   * NOTE(review): key bytes are widened to Int without masking, so bytes >= 0x80
+   * are sign-extended before being combined -- confirm this matches the
+   * reference implementation before relying on cross-language compatibility.
+   */
+  val HSIEH = new Hashable[Array[Byte],Int] {
+    override def apply(key: Array[Byte]): Int = {
+      var hash: Int = 0
+
+      if (key.isEmpty)
+        return 0
+
+      // main loop: consume the key four bytes at a time, as two 16-bit halves
+      for (i <- 0 until key.length / 4) {
+        val b0 = key(i*4)
+        val b1 = key(i*4 + 1)
+        val b2 = key(i*4 + 2)
+        val b3 = key(i*4 + 3)
+        val s0 = (b1 << 8) | b0
+        val s1 = (b3 << 8) | b2
+
+        hash += s0
+        val tmp = (s1 << 11) ^ hash
+        hash = (hash << 16) ^ tmp
+        hash += hash >>> 11
+      }
+
+      // the 0 to 3 bytes left over after the main loop
+      val rem = key.length % 4
+      val offset = key.length - rem
+      rem match {
+        case 3 =>
+          val b0 = key(offset)
+          val b1 = key(offset + 1)
+          val b2 = key(offset + 2)
+          val s0 = b1 << 8 | b0
+          hash += s0
+          hash ^= hash << 16
+          hash ^= b2 << 18
+          hash += hash >>> 11
+        case 2 =>
+          val b0 = key(offset)
+          val b1 = key(offset + 1)
+          val s0 = b1 << 8 | b0
+          hash += s0
+          hash ^= hash << 11
+          hash += hash >>> 17
+        case 1 =>
+          val b0 = key(offset)
+          hash += b0
+          hash ^= hash << 10
+          hash += hash >>> 1
+        case 0 => ()
+      }
+
+      // final mixing rounds
+      hash ^= hash << 3
+      hash += hash >>> 5
+      hash ^= hash << 4
+      hash += hash >>> 17
+      hash ^= hash << 25
+      hash += hash >>> 6
+
+      hash
+    }
+
+    override def toString = "Hsieh"
+  }
+
+  /**
+   * Jenkins Hash Function
+   * http://en.wikipedia.org/wiki/Jenkins_hash_function
+   *
+   * The result packs the final b word into the high 32 bits and adds the final c word.
+   *
+   * NOTE(review): three places look like they deviate from an unsigned 32-bit
+   * implementation -- rot uses the arithmetic shift (>>), key bytes are added
+   * without an & 0xff mask, and c is sign-extended when widened to Long. They are
+   * left as-is so that existing hash values do not change; confirm against
+   * reference vectors before altering them.
+   */
+  val JENKINS = new Hashable[Array[Byte],Long] {
+    override def apply(key: Array[Byte]): Long = {
+      var a, b, c = 0xdeadbeef + key.length
+
+      def rot(x: Int, k: Int) = (((x) << (k)) | ((x) >> (32 - (k))))
+
+      // mixing step applied after every full 12-byte block
+      def mix(): Unit = {
+        a -= c; a ^= rot(c, 4); c += b
+        b -= a; b ^= rot(a, 6); a += c
+        c -= b; c ^= rot(b, 8); b += a
+        a -= c; a ^= rot(c, 16); c += b
+        b -= a; b ^= rot(a, 19); a += c
+        c -= b; c ^= rot(b, 4); b += a
+      }
+
+      // final mixing step, applied once after the trailing bytes are added
+      def fin(): Unit = {
+        c ^= b; c -= rot(b, 14); a ^= c; a -= rot(c, 11)
+        b ^= a; b -= rot(a, 25); c ^= b; c -= rot(b, 16)
+        a ^= c; a -= rot(c, 4); b ^= a; b -= rot(a, 14)
+        c ^= b; c -= rot(b, 24)
+      }
+
+      var block = 0
+      // (length - 1) / 12 leaves between 1 and 12 bytes for the tail whenever the key is non-empty
+      val numBlocks = (key.length - 1) / 12
+      while (block < numBlocks) {
+        val offset = block * 12
+        a += key(offset)
+        a += key(offset + 1) << 8
+        a += key(offset + 2) << 16
+        a += key(offset + 3) << 24
+
+        b += key(offset + 4)
+        b += key(offset + 5) << 8
+        b += key(offset + 6) << 16
+        b += key(offset + 7) << 24
+
+        c += key(offset + 8)
+        c += key(offset + 9) << 8
+        c += key(offset + 10) << 16
+        c += key(offset + 11) << 24
+
+        mix()
+        block += 1
+      }
+
+      val remaining = key.length - (numBlocks * 12)
+      val offset = numBlocks * 12
+
+      if (remaining > 0) a += key(offset)
+      if (remaining > 1) a += key(offset + 1) << 8
+      if (remaining > 2) a += key(offset + 2) << 16
+      if (remaining > 3) a += key(offset + 3) << 24
+
+      if (remaining > 4) b += key(offset + 4)
+      if (remaining > 5) b += key(offset + 5) << 8
+      if (remaining > 6) b += key(offset + 6) << 16
+      if (remaining > 7) b += key(offset + 7) << 24
+
+      if (remaining > 8) c += key(offset + 8)
+      if (remaining > 9) c += key(offset + 9) << 8
+      if (remaining > 10) c += key(offset + 10) << 16
+      if (remaining > 11) c += key(offset + 11) << 24
+
+      // an empty key skips the final mix
+      if (key.length > 0) fin()
+
+      (b.toLong << 32) + c.toLong
+    }
+
+    override def toString = "Jenkins"
+  }
+
+}
38 algebird-hash/src/main/scala/com/twitter/algebird/hash/MurmurHash.scala
View
@@ -0,0 +1,38 @@
+package com.twitter.algebird.hash
+
+import java.nio.ByteBuffer
+
+/** Murmur hash functions, implemented by delegating to CassandraMurmurHash. */
+object Murmur {
+  // There doesn't appear to be any risk of choosing any particular seed
+  val defaultSeed = 0L
+
+  // The three methods below are thin wrappers (original author's note:
+  // "Compiler should remove these, just here for consistency").
+
+  /** Murmur2 32 bits. Takes a long seed for consistency with the others;
+   * the seed is folded to 32 bits by XOR-ing its high and low halves.
+   */
+  def hash2_32(data: ByteBuffer, offset: Int, length: Int, seed: Long): Int =
+    CassandraMurmurHash.hash32(data, offset, length, (seed >> 32).toInt ^ (seed.toInt))
+
+  /** Murmur2 64 bits over `length` bytes of `data` starting at `offset`. */
+  def hash2_64(data: ByteBuffer, offset: Int, length: Int, seed: Long): Long =
+    CassandraMurmurHash.hash2_64(data, offset, length, seed)
+
+  /** Murmur3 128 bits over `length` bytes of `buffer` starting at `offset`,
+   * returned as the pair of Longs produced by CassandraMurmurHash.hash3_x64_128.
+   */
+  def hash3_128(buffer: ByteBuffer, offset : Int, length : Int, seed: Long): (Long,Long) = {
+    val longs = CassandraMurmurHash.hash3_x64_128(buffer, offset, length, seed)
+    (longs(0), longs(1))
+  }
+
+  /** Returns a hashable that hashes the entire ByteBuffer, from position to limit.
+   *
+   * The length passed down is `remaining` (limit - position), not `limit`:
+   * the two agree only when position is 0, and otherwise `limit` would make the
+   * hash read past the end of the buffer's content.
+   */
+  def hash2Int(seed: Long = defaultSeed): Hashable[ByteBuffer, Int] =
+    new Hashable[ByteBuffer, Int] {
+      def apply(input: ByteBuffer) =
+        Murmur.hash2_32(input, input.position, input.remaining, seed)
+    }
+
+  /** Like hash2Int, but with the 64-bit Murmur2 hash. */
+  def hash2Long(seed: Long = defaultSeed): Hashable[ByteBuffer, Long] =
+    new Hashable[ByteBuffer, Long] {
+      def apply(input: ByteBuffer) =
+        Murmur.hash2_64(input, input.position, input.remaining, seed)
+    }
+
+  /** Like hash2Int, but with the 128-bit Murmur3 hash. */
+  def hash3(seed: Long = defaultSeed): Hashable[ByteBuffer,(Long,Long)] =
+    new Hashable[ByteBuffer, (Long,Long)] {
+      def apply(input: ByteBuffer) =
+        Murmur.hash3_128(input, input.position, input.remaining, seed)
+    }
+}
13 project/Build.scala
View
@@ -83,8 +83,9 @@ object AlgebirdBuild extends Build {
).settings(
test := { }, // All tests reside in algebirdTest
name := "algebird-core",
- libraryDependencies += "com.googlecode.javaewah" % "JavaEWAH" % "0.6.6"
- )
+ libraryDependencies ++= Seq("com.googlecode.javaewah" % "JavaEWAH" % "0.6.6",
+ "com.twitter" %% "bijection-core" % "0.3.0")
+ ).dependsOn(algebirdHash)
lazy val algebirdTest = Project(
id = "algebird-test",
@@ -106,4 +107,12 @@ object AlgebirdBuild extends Build {
name := "algebird-util",
libraryDependencies += "com.twitter" %% "util-core" % "6.2.0"
).dependsOn(algebirdCore, algebirdTest % "compile->test")
+
+ lazy val algebirdHash = Project(
+ id = "algebird-hash",
+ base = file("algebird-hash"),
+ settings = sharedSettings
+ ).settings(
+ name := "algebird-hash"
+ )
}
Something went wrong with that request. Please try again.