In [None]:
# Source: https://explosion.ai/blog/bloom-embeddings

In [2]:
import numpy
import mmh3

def allocate(n_vectors, n_dimensions):
    """Create a randomly initialised embedding table.

    Args:
        n_vectors: number of rows in the table.
        n_dimensions: width of each row.

    Returns:
        A float32 array of shape (n_vectors, n_dimensions) whose entries are
        drawn uniformly between -0.1 and 0.1 from numpy's global random state.
    """
    shape = (n_vectors, n_dimensions)
    # Sample in double precision, then narrow to single precision ('f').
    noise = numpy.random.uniform(-0.1, 0.1, shape)
    return noise.astype('f')

def get_vector(table, word):
    """Look up the vector for `word` in a hashed embedding table.

    The word is hashed twice with mmh3 (seeds 0 and 1). Each hash, taken
    modulo the number of rows in `table`, selects one row, and the two
    selected rows are added together.

    Args:
        table: 2-d array of row vectors.
        word: the key to hash.

    Returns:
        The elementwise sum of the two selected rows.
    """
    n_rows = table.shape[0]
    first, second = [mmh3.hash(word, seed=s) % n_rows for s in (0, 1)]
    return table[first] + table[second]

def update_vector(table, word, d_vector, learn_rate=0.001):
    """Apply a gradient step to the rows that make up `word`'s vector.

    The same two rows used by the lookup (mmh3 seeds 0 and 1, modulo the
    number of rows) are each decremented in place by `learn_rate * d_vector`.

    Args:
        table: 2-d array of row vectors; modified in place.
        word: the key to hash.
        d_vector: gradient with respect to the word's (summed) vector.
        learn_rate: step size; defaults to 0.001.

    Returns:
        None.
    """
    n_rows = table.shape[0]
    # Update row by row rather than with one fancy-indexed assignment: when
    # both hashes select the same row it must receive the step twice, because
    # that row is counted twice in the summed vector.
    for seed in (0, 1):
        row = mmh3.hash(word, seed=seed) % n_rows
        table[row] -= learn_rate * d_vector

In [3]:
# Toy vocabulary of 20 food-related words used throughout the examples.
vocab = [
    'apple', 'strawberry', 'orange', 'juice', 'drink',
    'smoothie', 'eat', 'fruit', 'health', 'wellness',
    'steak', 'fries', 'ketchup', 'burger', 'chips',
    'lobster', 'caviar', 'service', 'waiter', 'chef',
]

In [4]:
# Two 2-d tables with entries drawn uniformly between -0.1 and 0.1:
# `normal` has 20 rows and `hashed` has 15.
# NOTE(review): the draws are unseeded, so the values differ on every run.
normal = numpy.random.uniform(low=-0.1, high=0.1, size=(20, 2))
hashed = numpy.random.uniform(low=-0.1, high=0.1, size=(15, 2))

In [6]:
# Display the 15x2 `hashed` table.
hashed

array([[ 0.05845019, -0.05989009],
       [ 0.07981738, -0.08461014],
       [-0.02075094, -0.05763877],
       [ 0.04728252, -0.01430769],
       [ 0.05705087,  0.07107201],
       [ 0.08800396,  0.03277128],
       [-0.05603557,  0.0692331 ],
       [-0.02030276,  0.09480356],
       [ 0.01767458, -0.0625011 ],
       [-0.07545846, -0.08678463],
       [ 0.01401482,  0.05783606],
       [-0.04656627,  0.02606531],
       [ 0.06772367,  0.06143871],
       [ 0.08303966,  0.00454126],
       [-0.04130344, -0.03400914]])

In [7]:
# Maps each word to the row it was assigned, in order of first appearance.
word2id = {}
def get_normal_vector(word, table):
    """Look up `word` in a conventional (one row per word) embedding table.

    Words are assigned consecutive row indices the first time they are seen;
    the assignment is remembered in the module-level `word2id` dict.

    Args:
        word: the key to look up.
        table: 2-d array of row vectors, indexed by the word's id.

    Returns:
        The row of `table` assigned to `word`.

    Raises:
        IndexError: if more distinct words have been seen than `table` has rows.
    """
    # Index the table that was passed in rather than a particular global, so
    # the function works with any table the caller supplies.
    row = word2id.setdefault(word, len(word2id))
    return table[row]

In [8]:
# Row index (0-14) selected for each vocabulary word by mmh3.hash with seed 1.
hashes1 = [mmh3.hash(token, 1) % 15 for token in vocab]
# Pin the expected rows so any change in the vocabulary or hashing is caught.
assert hashes1 == [
    3, 6, 4, 13, 8,
    3, 13, 1, 9, 12,
    11, 4, 2, 13, 5,
    10, 0, 2, 10, 13,
]

In [10]:
# Number of distinct rows the vocabulary occupies under seed 1.
len(set(hashes1))

13

In [11]:
from collections import Counter

# Row index (0-14) selected for each vocabulary word by mmh3.hash with seed 2.
hashes2 = [mmh3.hash(token, 2) % 15 for token in vocab]
# Under this seed the words fall into 12 distinct rows.
assert len(Counter(hashes2)) == 12

In [12]:
# Number of distinct rows the vocabulary occupies under seed 2.
len(set(hashes2))

12

In [15]:
# Every word gets its own ordered (seed-1 row, seed-2 row) pair.
# NOTE(review): this compares ordered pairs; a lookup that sums the two rows
# cannot tell a pair from its reverse, so reversed pairs still share a vector.
assert len(set(zip(hashes1, hashes2))) == 20

In [14]:
# Print the (seed-1 row, seed-2 row) pair for each word, in vocabulary order.
for i in zip(hashes1, hashes2):
    print(i)

(3, 9)
(6, 10)
(4, 6)
(13, 2)
(8, 1)
(3, 7)
(13, 5)
(1, 14)
(9, 6)
(12, 6)
(11, 4)
(4, 4)
(2, 9)
(13, 10)
(5, 11)
(10, 9)
(0, 11)
(2, 13)
(10, 2)
(13, 0)


In [17]:
# Print the 2-d vector each word resolves to in the `hashed` table.
# get_vector() does the hash-and-sum lookup (mmh3 seeds 0 and 1, modulo the
# table's row count), so the logic and the table size are not repeated here.
# NOTE(review): the collision checks on hashes1/hashes2 use seeds 1 and 2, so
# they describe a different pair of hash functions from this lookup; distinct
# words can still resolve to the same two rows here and print identical vectors.
for word in vocab:
    vector = get_vector(hashed, word)
    print(word, '%.3f %.3f' % tuple(vector))

apple 0.106 -0.074
strawberry -0.103 0.095
orange 0.071 0.129
juice 0.008 -0.082
drink 0.097 -0.147
smoothie -0.009 0.055
eat 0.101 -0.058
fruit 0.163 -0.080
health -0.061 -0.029
wellness 0.115 0.047
steak 0.033 -0.059
fries 0.145 0.104
ketchup -0.042 -0.115
burger 0.008 -0.082
chips 0.146 -0.027
lobster -0.006 0.153
caviar 0.106 -0.074
service -0.042 -0.115
waiter 0.082 0.119
chef 0.171 0.037
