In [1]:
import numpy as np
import pandas as pd
#import faiss                   # make faiss available
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp

ModuleNotFoundError: No module named 'faiss'

## Topic 4: Efficient Vector Space Retrieval

Load data and vectorize using TfidfVectorizer
https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

TF-IDF weighting

In [7]:
corpus = pd.read_csv('nfcorpus/dev.docs', sep='\t', names=['ID', 'TEXT'])
corpus

Unnamed: 0,ID,TEXT
0,MED-118,alkylphenols human milk relations dietary habi...
1,MED-329,phosphate vascular toxin pubmed ncbi abstract ...
2,MED-330,dietary phosphorus acutely impairs endothelial...
3,MED-332,public health impact dietary phosphorus excess...
4,MED-334,differences total vitro digestible phosphorus ...
5,MED-335,differences total vitro digestible phosphorus ...
6,MED-398,grapefruit wine glass metabolic cardiovascular...
7,MED-557,dysmenorrhea pubmed ncbi abstract dysmenorrhea...
8,MED-666,role surgery treatment mastalgia pubmed ncbi a...
9,MED-691,ginger prevention nausea vomiting review pubme...


In [74]:

v = TfidfVectorizer(norm = None)
x = v.fit_transform(corpus['TEXT'])
vocab_size = len(v.get_feature_names())
print('vocabulary size(N):')
print(vocab_size)

print('\ntokens:')
features = v.get_feature_names()
print(features)

vocabulary size(N):
22019

tokens:


In [116]:
print(type(x[0]))

<class 'scipy.sparse.csr.csr_matrix'>


# Random Projections
**Hashing algorithm:**
<br>1.Choose a set of *$M$* random vectors ${r_1, r_2, ..., r_M}$ in the original high-dimensional vectors space (vector length $|V|$)
<br>2.For each document TF-IDF vector d do:
 - Compute the inner (dot) productof dand each random vector $r:θ(r, d) = \sum_{i}^{|𝑉|}𝑟_𝑖∗𝑑_𝑖$
 - Hash each inner product: $h(d, r_k) = 1$ if $θ(r, d) > t (treshold)$, else 0

3.Compute a new vector of hashes:
 - $d’ = [h(d, r_1), h(d, r_2), ..., h(d, r_M)]$
 - The number of selected random vectors, *$M$*, is the dimensionality of hashed vectors

In [104]:
#a function for  creating a set of M random vectors with the dimension dim
def get_random_vectors(dim,m):
    return np.random.randn(m, dim)

In [106]:
#test the get_random_vectors
np.random.seed(0)
m = 16
random_vectors = get_random_vectors(vocab_size, m)
print('dimension of the set of random vectors: ', random_vectors.shape)
random_vectors

dimension of the set of random vectors:  (16, 22019)


array([[ 1.76405235,  0.40015721,  0.97873798, ..., -0.00725045,
        -0.20480061, -0.87585082],
       [ 0.67949009, -0.52801718, -0.29644942, ..., -0.49891105,
        -0.03010458, -0.24500162],
       [ 1.18416672,  1.10555712, -0.3115909 , ...,  0.26205306,
         0.9489654 , -0.37826362],
       ...,
       [-0.54601508, -0.82112629, -0.64099457, ...,  2.78961432,
         0.31923957,  0.61385305],
       [-1.33838959,  1.38902032, -0.58269477, ..., -0.98244135,
        -1.48158788,  1.00555197],
       [-0.476383  ,  0.73037045,  0.44611114, ..., -0.45457409,
        -1.1455209 ,  0.97293566]])

In [110]:
# d - document vector
# rnd_vec - set of random vectors
# t - threshold

def compute_hash(doc, rnd_vec, t):
    for r in rnd_vec:
        # True if > t; False if < t
        inner_product = np.inner(r,doc)
        if inner_product>t:
            hashed_dot_product = np.append(hashed_dot_product,1)
        else:
            hashed_dot_product = np.append(hashed_dot_product,0)
    return hashed_dot_product

In [115]:
print(sum(r[0]*x[0]))

  (0, 700)	0.00926965356356899
  (0, 9416)	0.010564998175006056
  (0, 12517)	0.016477912938204383
  (0, 17163)	0.00858058035282135
  (0, 5506)	0.006023746486733122
  (0, 8824)	0.012267911549674668
  (0, 3139)	0.006021478971960161
  (0, 19773)	0.00858058035282135
  (0, 16503)	0.0017083130448625628
  (0, 13217)	0.0017171622126938384
  (0, 80)	0.0013578385177638011
  (0, 606)	0.006119372495682042
  (0, 19269)	0.002337367562786342
  (0, 5368)	0.004347081893433366
  (0, 4041)	0.00810778911066411
  (0, 13891)	0.024339926119567763
  (0, 13750)	0.008998960798774538
  (0, 13841)	0.028825985083841533
  (0, 14041)	0.009608661694613845
  (0, 14200)	0.044407938279582636
  (0, 17835)	0.0044482492397997534
  (0, 7106)	0.005106852280266836
  (0, 17157)	0.0037391542858521555
  (0, 7372)	0.0036135256693470156
  (0, 10040)	0.003823730939049321
  :	:
  (0, 524)	0.0035408304874664236
  (0, 2378)	0.004066501564699093
  (0, 11988)	0.004701775552187111
  (0, 10096)	0.00456218850676778
  (0, 2366)	0.0058929112

In [111]:
#test compute_hash
doc = x[0]
print('dimension of document vector: ', doc.shape)
teta = compute_hash(doc, random_vectors, 0)
print('dimension of new vector: ', teta.shape)
teta

dimension of document vector:  (1, 22019)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().

In [34]:
x[0]

<1x22019 sparse matrix of type '<class 'numpy.float64'>'
	with 67 stored elements in Compressed Sparse Row format>

In [45]:
for doc in x:
    new_doc_vector = compute_hash(doc,random_vectors,0)
    new_docs = np.append(new_docs, new_doc_vector)
    
new_docs

array([119.81986318,  15.86071673, -38.2328904 , ...,   1.        ,
         1.        ,   0.        ])