# Comparison

In [14]:
import timeit
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from functools import partial

from annoy import AnnoyIndex

## Loading artifacts

In [18]:
# Load the fitted artifacts produced elsewhere in the project.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load artifact files that come from a trusted source.
# Context managers make sure each file handle is closed right after loading
# instead of being left for the garbage collector.
with open("../artifacts/vectorizer.p", "rb") as artifact_file:
    vectorizer = pickle.load(artifact_file)
with open("../artifacts/svd.p", "rb") as artifact_file:
    svd = pickle.load(artifact_file)
with open("../artifacts/tfs.p", "rb") as artifact_file:
    tfs = pickle.load(artifact_file)
with open("../artifacts/tfs_truncated.p", "rb") as artifact_file:
    tfs_truncated = pickle.load(artifact_file)

# The index dimension must match the vectors stored in index.ann.
# NOTE(review): 512 is presumably the number of SVD components
# (i.e. tfs_truncated.shape[1]) -- confirm against the indexing code.
annoy_idx = AnnoyIndex(512, 'angular')
annoy_idx.load('../artifacts/index.ann')

True

In [7]:
# Sanity check: report the shapes of the full and the truncated matrices.
print(f'tfidf matrix shape {tfs.shape}')
print(f'truncated tfidf matrix shape {tfs_truncated.shape}')

(142570, 30000)

In [9]:
def tfidf(query, k=5):
    """Return the indices of the ``k`` rows of ``tfs`` most similar to ``query``.

    The query is transformed with the module-level ``vectorizer`` and scored
    against every row of ``tfs`` using cosine similarity (brute force).

    Parameters
    ----------
    query : iterable
        Passed unchanged to ``vectorizer.transform``. Intended to hold a
        single query (e.g. ``['global warming']``): the similarity matrix is
        flattened, so scores of several queries would be mixed together.
    k : int, default 5
        Number of results to return; must be non-negative.

    Returns
    -------
    Array of row indices into ``tfs``, ordered from most to least similar.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    query_tfs = vectorizer.transform(query)
    scores = cosine_similarity(tfs, query_tfs).flatten()

    # argsort is ascending, so reverse first and then keep the leading k.
    # Slicing after the reversal also behaves for k == 0, where a
    # ``[-k:]`` slice would return every index instead of none.
    return scores.argsort()[::-1][:k]

In [19]:
def tfidf_truncated(query, k=5):
    """Return the indices of the ``k`` rows of ``tfs_truncated`` closest to ``query``.

    The query is transformed with the module-level ``vectorizer``, projected
    with ``svd`` and scored against every row of ``tfs_truncated`` using
    cosine similarity (brute force).

    Parameters
    ----------
    query : iterable
        Passed unchanged to ``vectorizer.transform``. Intended to hold a
        single query (e.g. ``['global warming']``): the similarity matrix is
        flattened, so scores of several queries would be mixed together.
    k : int, default 5
        Number of results to return; must be non-negative.

    Returns
    -------
    Array of row indices into ``tfs_truncated``, ordered from most to least
    similar.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    query_tfs = vectorizer.transform(query)
    query_tfs_truncated = svd.transform(query_tfs)

    scores = cosine_similarity(tfs_truncated, query_tfs_truncated).flatten()

    # argsort is ascending, so reverse first and then keep the leading k.
    # Slicing after the reversal also behaves for k == 0, where a
    # ``[-k:]`` slice would return every index instead of none.
    return scores.argsort()[::-1][:k]

In [24]:
def tfidf_truncated_annoy(query, k=5):
    """Return the ``k`` nearest neighbours of ``query`` from the Annoy index.

    The query is transformed with the module-level ``vectorizer``, projected
    with ``svd`` and looked up in ``annoy_idx`` instead of being compared
    against every stored vector.

    Parameters
    ----------
    query : iterable
        Passed unchanged to ``vectorizer.transform``. Only the first
        transformed row is searched, so it should hold a single query
        (e.g. ``['global warming']``).
    k : int, default 5
        Number of neighbours to request from the index.

    Returns
    -------
    The item ids returned by ``annoy_idx.get_nns_by_vector``.
    NOTE(review): presumably these ids line up with the row indices of
    ``tfs_truncated`` -- confirm against the code that built the index.
    """
    query_tfs = vectorizer.transform(query)
    query_tfs_truncated = svd.transform(query_tfs)

    idxs = annoy_idx.get_nns_by_vector(query_tfs_truncated[0], k)

    # The result was previously computed but never returned, so every call
    # yielded None.
    return idxs

## Speed comparison

In [21]:
# A single query, wrapped in a list because the search functions hand it
# straight to vectorizer.transform.
query = ['global warming']

In [30]:
# Brute-force search on the full matrix. timeit.timeit returns the total
# time in seconds for all `number` calls, not the per-call average.
timeit.timeit(partial(tfidf, query), number=100)

29.74433896800474

In [31]:
# Brute-force search on the truncated matrix. timeit.timeit returns the
# total time in seconds for all `number` calls, not the per-call average.
timeit.timeit(partial(tfidf_truncated, query), number=100)

51.562847281005816

In [32]:
# Annoy index lookup on the truncated vectors. timeit.timeit returns the
# total time in seconds for all `number` calls, not the per-call average.
timeit.timeit(partial(tfidf_truncated_annoy, query), number=100)

4.994155715001398