In [139]:
import pandas as pd
import numpy as np
#
import faiss
import pyterrier as pt

In [140]:
# Initialise PyTerrier only if it has not already been started in this kernel.
if not pt.started():
    pt.init()

In [245]:
# Overlap between two result lists
def list_intersection(l1, l2):
    """Return the fraction of ids in ``l1`` that also appear in ``l2``.

    Both arguments must expose ``.tolist()`` and be 2-D (one row per query),
    like the id arrays this notebook gets back from ``index.search``.
    Only the first row of each argument is compared.

    Parameters
    ----------
    l1 : array-like with ``.tolist()``
        Reference result ids; its number of distinct ids is the denominator.
    l2 : array-like with ``.tolist()``
        Result ids to compare against the reference.

    Returns
    -------
    float
        ``|set(l1[0]) & set(l2[0])| / |set(l1[0])|``, or ``0.0`` when the
        reference row is empty (previously this raised ZeroDivisionError).
    """
    s1 = set(l1.tolist()[0])
    s2 = set(l2.tolist()[0])
    # An empty reference list has no overlap by definition; avoid dividing by zero.
    if not s1:
        return 0.0
    return len(s1 & s2) / len(s1)

In [141]:
# Get the Vaswani dataset handle and print what get_corpus() returns for it.
dataset = pt.get_dataset("vaswani")
print(f"Corpus Vaswani: {dataset.get_corpus()} ")

Corpus Vaswani: ['/home/jovyan/.pyterrier/corpora/vaswani/corpus/doc-text.trec'] 


In [142]:
# Materialise the corpus iterator into a DataFrame and check its dimensions.
documents = pd.DataFrame(dataset.get_corpus_iter())
documents.shape

(11429, 2)

In [143]:
# Preview the first rows of the corpus frame.
documents.head()

Unnamed: 0,docno,text
0,1,compact memories have flexible capacities a d...
1,2,an electronic analogue computer for solving sy...
2,3,electronic coordinate transformer circuit det...
3,4,the british computer society report of a conf...
4,5,millimicrosecond digital computer logic a sys...


In [255]:
%time
from sentence_transformers import SentenceTransformer, util
# model = SentenceTransformer('lambdaofgod/paperswithcode_word2vec') -> = 200
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.11 µs


In [145]:
%%time
doc_text = list(documents['text'].head(10))
demo_embeddings = model.encode(doc_text, convert_to_tensor=True)
demo_embeddings.shape

CPU times: user 1.38 s, sys: 0 ns, total: 1.38 s
Wall time: 243 ms


torch.Size([10, 512])

In [146]:
# Inspect the demo embeddings computed for the 10 sample documents.
demo_embeddings

tensor([[ 0.0614, -0.0944, -0.0086,  ...,  0.0350, -0.0895, -0.0111],
        [ 0.0996,  0.0078, -0.0476,  ...,  0.0371,  0.0203, -0.0325],
        [-0.0075,  0.0093,  0.0643,  ...,  0.0002,  0.0575,  0.0012],
        ...,
        [-0.0089,  0.0054, -0.0719,  ...,  0.0742,  0.0291, -0.0338],
        [-0.0508, -0.0300,  0.0256,  ...,  0.0251,  0.0359, -0.0189],
        [-0.0164, -0.0149, -0.0017,  ...,  0.0121, -0.0213, -0.0095]])

In [147]:
# Load the previously computed embeddings for all documents.
vaswani_docs_embeddings = np.load("./vaswani_docs_embeddings-512.npy")
vaswani_docs_embeddings.shape

(11429, 512)

In [148]:
# Get the 'topics' (queries) associated with the corpus.
topics = dataset.get_topics()
topics.head()

Unnamed: 0,qid,query
0,1,measurement of dielectric constant of liquids ...
1,2,mathematical analysis and design details of wa...
2,3,use of digital computers in the design of band...
3,4,systems of data coding for information transfer
4,5,use of programs in engineering testing of comp...


In [149]:
# Load the previously computed embeddings for all queries.
vaswani_query_embeddings = np.load("./vaswani_query_embeddings-512.npy")
vaswani_query_embeddings.shape

(93, 512)

### Indexación con FAISS (diferentes índices)
**Más sobre los tipos de índices en FAISS:** https://github.com/facebookresearch/faiss/wiki/Faiss-indexes

**Prueba 1 - Flat Index**  
Recordar: En este tipo de índice se mide la distancia L2 (euclídea) entre el vector de query 
y todos los vectores de documentos almacenados. Es simple y preciso (pero no demasiado rápido).

In [159]:
# Initialisation: take the vector dimensionality from the loaded embeddings
# instead of hard-coding 512, so the index always matches the data.
d = vaswani_docs_embeddings.shape[1]
indexFlat = faiss.IndexFlatL2(d)

# Check the number of documents currently stored in the index
indexFlat.ntotal

0

In [167]:
# Agrego los documentos al índice
%time
indexFlat.add(vaswani_docs_embeddings)
indexFlat.ntotal

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.34 µs


11429

In [168]:
# Training status of the flat index (no train() call is made for it in this notebook).
indexFlat.is_trained

True

In [169]:
# Retrieval example: ask for the 5 nearest documents to the first query.
k = 5
# Indexing with [[0]] yields a new 2-D array holding only the first query row.
query_vector = vaswani_query_embeddings[[0]]

In [170]:
%time
DFlat, rsFlat = indexFlat.search(query_vector, k)  # Búsqueda

CPU times: user 14 µs, sys: 3 µs, total: 17 µs
Wall time: 31.2 µs


In [171]:
# Second value returned by the flat search: the ids of the retrieved documents.
print(rsFlat)

[[1501 7125 1087 1896 1987]]


In [172]:
# First value returned by the flat search: one distance per retrieved id.
DFlat

array([[0.49462646, 0.76442146, 0.8314605 , 0.8432368 , 0.8441225 ]],
      dtype=float32)

In [209]:
# Persist the flat index to a file in the working directory.
faiss.write_index(indexFlat, "vaswani_faiss_flat.ndx")

**Prueba 2 - IVF Flat Index**  
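Recordar: En este tipo de índice se particiona el espacio de búsqueda en `nlist` celdas y, en cada consulta,
solo se exploran las `nprobe` celdas más cercanas, lo que da una búsqueda aproximada de vecinos más cercanos (ANN).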

In [180]:
nlist = 50  # Number of cells
# Flat L2 index handed to IndexIVFFlat as its quantizer.
quantizer = faiss.IndexFlatL2(d)
indexIVFFlat = faiss.IndexIVFFlat(quantizer, d, nlist)

In [184]:
# Training status of the IVF flat index.
# NOTE(review): this cell's execution count (184) is later than the train() cell's (183),
# so the recorded output reflects the already-trained index, not a fresh one.
indexIVFFlat.is_trained

True

In [183]:
# Preparo (train) las estructuras de datos del índice
%time
indexIVFFlat.train(vaswani_docs_embeddings)
indexIVFFlat.ntotal

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.87 µs


0

In [185]:
# Confirm the index reports itself as trained after the train() call.
indexIVFFlat.is_trained

True

In [188]:
# Add the documents to the index, then check how many vectors it holds.
indexIVFFlat.add(vaswani_docs_embeddings)
indexIVFFlat.ntotal

11429

In [189]:
# Retrieval example: ask for the 5 nearest documents to the first query.
k = 5
# Indexing with [[0]] yields a new 2-D array holding only the first query row.
query_vector = vaswani_query_embeddings[[0]]

In [190]:
%time
DIVFFlat, rsIVFFlat = indexIVFFlat.search(query_vector, k)  # Búsqueda

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.87 µs


In [191]:
# Ids retrieved by the IVF flat search.
rsIVFFlat

array([[8275, 4255, 8581, 8481,   57]])

In [246]:
# Fraction of the flat-index result ids that also appear in the IVF flat result.
# NOTE(review): this cell was executed late (count 246); the recorded 0.8 matches the
# nprobe = 10 result shown further down (4 of 5 ids shared), not the result displayed above.
list_intersection(rsFlat, rsIVFFlat)

0.8

In [194]:
indexIVFFlat.nprobe = 10 # Widen the search scope to 10 cells
DIVFFlat, rsIVFFlat = indexIVFFlat.search(query_vector, k)  # Search

In [195]:
# Ids retrieved by the IVF flat search after setting nprobe = 10.
rsIVFFlat

array([[1501, 7125, 1896, 1987, 4816]])

In [210]:
# Persist the IVF flat index to a file in the working directory.
faiss.write_index(indexIVFFlat, "vaswani_faiss_ivfflat.ndx")

**Prueba 3 - IVF Flat Index con Product Quantization**  
Recordar: En este tipo de índice se aplica Product Quantization para comprimir los vectores. 
Luego se ejecuta la búsqueda ANN sobre los vectores comprimidos.

In [199]:
m = 8  # number of centroid IDs in the final compressed vectors (presumably d must be divisible by m — TODO confirm)
bits = 8 # number of bits used to store each centroid ID

# The IVFPQ index is built on a fresh flat L2 quantizer, with the same d and nlist as before.
quantizer = faiss.IndexFlatL2(d)  # we keep the same L2 distance flat index
indexIVFPQ = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits) 

In [200]:
# Training status of the newly created IVFPQ index.
indexIVFPQ.is_trained

False

In [201]:
# Preparo (train) las estructuras de datos del índice
%time
indexIVFPQ.train(vaswani_docs_embeddings)
indexIVFPQ.ntotal

CPU times: user 15 µs, sys: 3 µs, total: 18 µs
Wall time: 33.4 µs


0

In [202]:
# Add the documents to the index, then check how many vectors it holds.
indexIVFPQ.add(vaswani_docs_embeddings)
indexIVFPQ.ntotal

11429

In [203]:
# Retrieval example: ask for the 5 nearest documents to the first query.
k = 5
# Indexing with [[0]] yields a new 2-D array holding only the first query row.
query_vector = vaswani_query_embeddings[[0]]

In [204]:
%time
DIVFPQ, rsIVFPQ = indexIVFPQ.search(query_vector, k)  # Búsqueda

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs


In [205]:
# Ids retrieved by the IVFPQ search.
rsIVFPQ

array([[1519, 8275, 3326, 4255, 8481]])

In [252]:
indexIVFPQ.nprobe = 10
%time
DIVFPQ, rsIVFPQ = indexIVFPQ.search(query_vector, k)  # Búsqueda

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs


In [253]:
# Ids retrieved by the IVFPQ search after setting nprobe = 10.
rsIVFPQ

array([[1501, 7125, 4249, 1519, 9829]])

In [254]:
# Persist the IVFPQ index to a file in the working directory.
faiss.write_index(indexIVFPQ, "vaswani_faiss_ivfpq.ndx")

### Tarea 
**Ejecutar todo el set de consultas y calcular el solapamiento promedio de las listas de 
resultados para nprobe = [1 .. 10]**