In [1]:
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [2]:
import h5py
import pickle
from annoy import AnnoyIndex

In [3]:
# Open the precomputed index file read-only; the handle is used by the cells below.
f = h5py.File('array_index.h5py', mode='r')

In [4]:
# Peek at the first ten top-level key names stored in the file.
list(f)[:10]

['dim',
 'index_doc_id_100',
 'index_doc_id_1000',
 'index_doc_id_10000',
 'index_doc_id_10001',
 'index_doc_id_10002',
 'index_doc_id_10003',
 'index_doc_id_10004',
 'index_doc_id_10005',
 'index_doc_id_10006']

In [5]:
# Read the value stored under the 'dim' key (26098 in this run).
# Presumably the number of indexed vocabulary dimensions — TODO confirm.
f['dim'][()]

26098

In [6]:
# Unique trailing id (the text after the last underscore) of every key except "dim".
ids = {key.split("_")[-1] for key in f.keys() if key != "dim"}

In [7]:
from collections import defaultdict

In [8]:
from tqdm import tqdm

In [9]:
# passages[passage_id][id] -> weight, built by inverting the per-id arrays
# stored in the HDF5 file.
passages = defaultdict(dict)

for i in tqdm(ids):
    # Two parallel arrays per id: the passage ids and the matching values.
    doc_ids = f[f"index_doc_id_{i}"][:]
    doc_values = f[f"index_doc_value_{i}"][:]
    for position in range(len(doc_ids)):
        passage_key = str(doc_ids[position])
        passages[passage_key][i] = float(doc_values[position])

100%|██████████| 26098/26098 [00:34<00:00, 748.30it/s]


In [10]:
import numpy as np
def make_sparse(dense_rep, size_collection):
    """Expand an index -> value mapping into a full-length float32 vector.

    Args:
        dense_rep: Mapping whose keys are positions (ints or int-like strings)
            and whose values are the numbers to store at those positions.
        size_collection: Length of the returned vector.

    Returns:
        A 1-D ``float32`` array of length ``size_collection`` that is zero
        everywhere except at the positions named by ``dense_rep``.
    """
    vector = np.zeros(size_collection, dtype=np.float32)
    for position in dense_rep:
        vector[int(position)] = dense_rep[position]
    return vector

In [11]:
# Sanity check: expand passage "65527" into a full vocabulary-sized vector.
# Use .get() instead of passages[...]: `passages` is a defaultdict, so indexing
# a missing id would silently insert an empty passage that later ends up in the
# index.
pp = passages.get("65527", {})
make_sparse(pp, 30522)

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [12]:
# we could potentially use an encoder-decoder to make a denser representation
import math
def condense(sparse_rep, output_dim):
    """Shrink a vector by summing consecutive, fixed-width chunks.

    The chunk width is ``ceil(len(sparse_rep) / output_dim)``; the final chunk
    may be shorter. Because the width is rounded up, the result can contain
    fewer than ``output_dim`` entries.

    Args:
        sparse_rep: Sequence of numbers supporting ``len`` and slicing.
        output_dim: Target number of chunks used to derive the chunk width.

    Returns:
        ``np.ndarray`` holding one sum per chunk.
    """
    total_length = len(sparse_rep)
    chunk_width = math.ceil(total_length / output_dim)
    chunk_sums = []
    for start in range(0, total_length, chunk_width):
        chunk_sums.append(sum(sparse_rep[start:start + chunk_width]))
    return np.array(chunk_sums)

In [13]:
from collections import defaultdict

In [14]:
def condense_from_dense(dense_rep, output_dim, vocab_size=30522):
    """Fold a token-id -> weight mapping into ``output_dim`` buckets.

    The id range ``[0, vocab_size)`` is split into ``output_dim`` contiguous,
    (near-)equal-width buckets, and the weights of all token ids falling in the
    same bucket are summed.

    Args:
        dense_rep: Mapping whose keys are token ids (ints or int-like strings)
            and whose values are numeric weights.
        output_dim: Length of the returned vector.
        vocab_size: Exclusive upper bound on token ids.

    Returns:
        ``np.ndarray`` of shape ``(output_dim,)`` and dtype ``float16``.

    Raises:
        IndexError: If a token id lies outside ``[0, vocab_size)``.
    """
    # Accumulate in float32 so repeated additions are not rounded to float16
    # at every step; cast once at the end to keep the float16 return dtype.
    buckets = np.zeros(output_dim, dtype=np.float32)
    for k, v in dense_rep.items():
        token_id = int(k)
        if not 0 <= token_id < vocab_size:
            raise IndexError(
                f"token id {token_id} is outside the vocabulary range [0, {vocab_size})"
            )
        # Scale by output_dim (not a fixed 100) and floor instead of ceil, so
        # bucket indices cover exactly 0 .. output_dim - 1.
        bucket = token_id * output_dim // vocab_size
        buckets[bucket] += v
    return buckets.astype(np.float16)

# Make the Annoy Tree

In [15]:
# Consecutive integer position -> passage id, in the iteration order of `passages`.
passage_id_mapping = dict(enumerate(passages))

In [16]:
# Condensed 1000-dimensional vector for every passage, keyed by passage id.
condensed_passages = {
    passage_id: condense_from_dense(token_weights, 1000)
    for passage_id, token_weights in tqdm(passages.items())
}

100%|██████████| 276142/276142 [00:36<00:00, 7503.62it/s] 


In [17]:
# Length of each item vector that will be indexed.
# This is the condensed dimension (the output_dim passed to
# condense_from_dense), not the full vocabulary size.
item_index_length = 1000
# Create the index, using the 'dot' metric.
annoy_tree = AnnoyIndex(item_index_length, 'dot')

I think the vocabulary is too big for this model.  
We need to reduce the size of the item vector somehow.  
Ideally down to 100 dimensions.  

In [18]:
# Add every condensed passage vector to the index under its integer id.
for i, passage_id in tqdm(passage_id_mapping.items()):
    annoy_tree.add_item(i, condensed_passages[passage_id])

100%|██████████| 276142/276142 [00:27<00:00, 9960.44it/s] 


In [19]:
# Build the forest; the argument to build() is the number of trees.
# Per the Annoy documentation, more trees give better query precision at the
# cost of a larger index and a longer build.
# NOTE(review): the tree count reuses item_index_length (the vector length);
# these are unrelated settings — confirm that 1000 trees is intended.
annoy_tree.build(item_index_length)

True

In [20]:
# Write the built index to disk (path is relative to the working directory).
annoy_tree.save("passages_1000.ann")

True

In [21]:
# Save the passage id mapping (integer item id -> passage id) alongside the index.
# json.dump writes the integer keys as strings, so convert them back with int()
# after loading.
import json
# Do not reuse the name `f` for this handle: it is bound to the open HDF5 file
# earlier in the notebook, and rebinding it would break re-running those cells.
with open("passage_id_mapping_annot_1000.json", "w") as mapping_file:
    json.dump(passage_id_mapping, mapping_file)

In [22]:
# Print the working directory; the relative output paths above resolve against it.
!pwd

/home/ec2-user/SageMaker/splade/experiments/index-full-data/index
