In [155]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.preprocessing import normalize

In [138]:
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
    'The main part of a bodily structure or organ',
    'A distinct bodily mass or organ having a specific function.',
    'The principal of a bond.',
    'A human or animal body, esp. a dead one.'
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [140]:
query = [
    'this is the third one',
    'second document',
    'third one',
    'This is the first document.',
    'human',
    'bond',
    'my body is like a body !'
]
Y = vectorizer.transform(query)

In [141]:
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(X)
res = nbrs.kneighbors(Y, return_distance = False)



NearestNeighbors(algorithm='ball_tree', n_neighbors=1)

In [173]:
X_n = normalize(X, axis = 0)
Y_n = normalize(Y, axis = 0)

In [161]:
from sklearn.preprocessing import Normalizer

In [170]:
transformer_x = Normalizer().fit(X)
X_n = transformer_x.transform(X)

transformer_y = Normalizer().fit(Y)
Y_n = transformer_y.transform(Y)

In [169]:
X_n.shape

(8, 28)

In [174]:
X_n.sum()

34.84834398579813

In [175]:
X.sum()

19.018552610597162

In [176]:
cos_matrix = np.divide(
    Y.dot(X.transpose()),
    (Y_n*X_n.transpose()))
    

In [142]:
[[corpus[i] for i in x] for x in res]

[['And this is the third one.'],
 ['This document is the second document.'],
 ['And this is the third one.'],
 ['This is the first document.'],
 ['A human or animal body, esp. a dead one.'],
 ['The principal of a bond.'],
 ['A human or animal body, esp. a dead one.']]

In [196]:
index = [np.where(cos_matrix[i] == np.nanmax(cos_matrix[i]))[1][0] for i in range(cos_matrix.shape[0])]
[corpus[i] for i in index]

['And this is the third one.',
 'This is the first document.',
 'And this is the third one.',
 'This document is the second document.',
 'A human or animal body, esp. a dead one.',
 'The principal of a bond.',
 'This is the first document.']

In [197]:
query

['this is the third one',
 'second document',
 'third one',
 'This is the first document.',
 'human',
 'bond',
 'my body is like a body !']

In [107]:
from  sklearn.utils.extmath import safe_sparse_dot

In [111]:
safe_sparse_dot(X_n, Y_n.T, dense_output=False).toarray()

array([[0.07657396, 0.08310901, 0.        , 0.20617407],
       [0.05963573, 0.2883203 , 0.        , 0.14193783],
       [0.17304773, 0.        , 0.21903289, 0.05980174],
       [0.07657396, 0.08310901, 0.        , 0.20617407]])

In [127]:
normalize(X)

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [125]:
X.toarray()

array([[0.        , 0.21331533, 0.26348688, 0.17439926, 0.        ,
        0.        , 0.17439926, 0.        , 0.17439926],
       [0.        , 0.33225959, 0.        , 0.13582199, 0.        ,
        0.26027443, 0.13582199, 0.        , 0.13582199],
       [0.21903289, 0.        , 0.        , 0.11430045, 0.21903289,
        0.        , 0.11430045, 0.21903289, 0.11430045],
       [0.        , 0.21331533, 0.26348688, 0.17439926, 0.        ,
        0.        , 0.17439926, 0.        , 0.17439926]])