In [19]:
import pandas as pd
import math
import numpy as np 
import string
import time

from IPython import get_ipython

# Execute functions_helper.py with %run -i, i.e. inside this notebook's own
# namespace, so the helper functions it defines become notebook globals.
# run_line_magic(name, args) replaces the deprecated InteractiveShell.magic().
get_ipython().run_line_magic('run', '-i "functions_helper.py"')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\linar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data Preprocessing

In [2]:
# Load the dev corpus: a tab-separated file read into two columns, ID and TEXT.
corpus = pd.read_csv('nfcorpus/dev.docs', sep='\t', names=['ID', 'TEXT'])

# Preprocess with preprocess_corpus (not defined in this notebook; presumably
# provided by functions_helper.py). This rebinds `corpus`, so the raw frame is
# no longer reachable after this line.
corpus = preprocess_corpus(corpus)
# Preview the first rows.
corpus.head()

Unnamed: 0,ID,TEXT
0,MED-118,alkylphenol human milk relat dietari habit cen...
1,MED-329,phosphat vascular toxin pubm ncbi abstract ele...
2,MED-330,dietari phosphoru acut impair endotheli functi...
3,MED-332,public health impact dietari phosphoru excess ...
4,MED-334,differ total vitro digest phosphoru content pl...


In [20]:
# Load the dev queries: a tab-separated file read into two columns, ID and TEXT.
queries_text = pd.read_csv('nfcorpus/dev.all.queries', sep='\t', names=['ID', 'TEXT'])

# Preprocess the queries. `corpus` is passed in as well — presumably so queries
# can be aligned with the corpus vocabulary; confirm in functions_helper.py.
queries_text = preprocess_queries(corpus, queries_text, output_string = True)
# Preview the first ten queries.
queries_text.head(10)

Unnamed: 0,ID,TEXT
0,PLAIN-1,deep fri food may caus cancer latest studi die...
1,PLAIN-1007,ddt persist organ pollut industri toxin pestic...
2,PLAIN-101,treat multipl sclerosi diet multipl sclerosi u...
3,PLAIN-1017,detoxif cancer raw food heart health heart dis...
4,PLAIN-1027,dietari guidelin heart diseas cardiovascular d...
5,PLAIN-1038,dog meat anim product cat heart health tobacco...
6,PLAIN-1049,dr heart health heart diseas egg cholesterol s...
7,PLAIN-1065,dr walter mortal heart diseas heart health die...
8,PLAIN-1077,thyroid health hijiki sushi iodin sea veget sa...
9,PLAIN-1087,easter island mortal muscl strength morbid moo...


# Load Query-Doc Relevance

In [4]:
# Load the query-document relevance judgements (tab-separated qrel file).
# The second column is read under the literal column name '0'.
queries_relevance = pd.read_csv('nfcorpus/dev.2-1-0.qrel', sep='\t', names=['QUERY_ID', '0', 'DOC_ID', 'RELEVANCE_LEVEL'])
# Preview the first ten judgements.
queries_relevance.head(10)

Unnamed: 0,QUERY_ID,0,DOC_ID,RELEVANCE_LEVEL
0,PLAIN-1,0,MED-2421,2
1,PLAIN-1,0,MED-2422,2
2,PLAIN-1,0,MED-2416,2
3,PLAIN-1,0,MED-2423,2
4,PLAIN-1,0,MED-2417,2
5,PLAIN-1,0,MED-2418,2
6,PLAIN-1,0,MED-4451,2
7,PLAIN-1,0,MED-2420,2
8,PLAIN-1,0,MED-2414,1
9,PLAIN-1,0,MED-4070,1


## Create TF-IDF matrix for documents

In [5]:
# Build the TF-IDF matrix of the corpus in four helper steps:
# term frequencies -> inverse document frequencies -> TF-IDF weights -> matrix.
# (tf, idf, tf_idf and tf_idf_to_matrix are presumably defined in functions_helper.py.)
tf_dict = tf(corpus, column_name = 'TEXT')
idf_dict = idf(corpus, tf_dict)
tf_idf_dict = tf_idf(tf_dict, idf_dict)
tf_idf_matrix = tf_idf_to_matrix(tf_idf_dict)
# Preview the first rows; the displayed columns are vocabulary terms.
tf_idf_matrix.head()

Unnamed: 0,alkylphenol,human,milk,relat,dietari,habit,central,taiwan,pubm,ncbi,...,six-year,inchianti,tuscani,studies-depress,eurosav,self-inflict,eurostat,suicide-record,scarciti,trim-and-fil
0,6.122806,2.886416,6.547579,2.90854,2.095849,5.898499,3.473596,5.503767,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,2.59775,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.59775,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Create TF-IDF matrix for queries

In [6]:
# Compute TF-IDF vectors for the queries from the document matrix, the corpus
# IDF values and the preprocessed query text.
tf_idf_queries = queries_tf_idf(tf_idf_matrix, idf_dict, queries_text)
# Preview the first rows; the displayed columns match the document matrix terms.
tf_idf_queries.head()

  for col in tf_idf_queries.columns:


Unnamed: 0,alkylphenol,human,milk,relat,dietari,habit,central,taiwan,pubm,ncbi,...,six-year,inchianti,tuscani,studies-depress,eurosav,self-inflict,eurostat,suicide-record,scarciti,trim-and-fil
0,0.0,1.375393,0.0,0.0,4.206057,0.0,0.0,0.0,0.451235,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,3.119956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,3.589001,5.282545,4.099249,3.455757,0.0,3.473596,0.0,0.451235,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.237842,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Create vectors

In [7]:
# Extract the underlying NumPy arrays of the document and query TF-IDF frames.
doc_vectors = tf_idf_matrix.values
q_vectors = tf_idf_queries.values
# Smoke-test the basic retrieve helper with k=5
# (first argument 0 is presumably the query's row index — confirm).
retrieve(0, q_vectors, doc_vectors, k=5)

Unnamed: 0,ID,TEXT
1142,MED-2423,dietari pattern breast cancer risk women pubm ...
1138,MED-2418,consumpt deep-fri food risk prostat cancera b ...
956,MED-2195,influenc deep fri veget oil acrylamid format s...
1794,MED-3498,dietari acrylamid exposur french popul result ...
1141,MED-2422,statist regress model estim acrylamid concentr...


# Run basic retrieve

In [18]:
# Time the baseline evaluation over all queries (no pre-clustering).
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is the adjustable wall clock.
start = time.perf_counter()
vanilla_evaluation = full_evaluation(q_vectors, doc_vectors, k=5)
# Despite its name, `end` holds the elapsed time in seconds (name kept as-is).
end = time.perf_counter() - start
print(end)

Average precision across all queries = 0.3940000000000002
Mean Average Precision = 0.26045641025641053
Average nDCG = 0.3359780305894626
495.42166328430176


In [19]:
# Preview the per-query metrics of the baseline run.
vanilla_evaluation.head()

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,deep fri food may caus cancer latest studi die...,0.8,0.543333,0.595237
1,PLAIN-1007,ddt persist organ pollut industri toxin pestic...,0.2,0.04,0.131205
2,PLAIN-101,treat multipl sclerosi diet multipl sclerosi u...,0.4,0.233333,0.298776
3,PLAIN-1017,detoxif cancer raw food heart health heart dis...,0.0,0.0,0.0
4,PLAIN-1027,dietari guidelin heart diseas cardiovascular d...,0.0,0.0,0.0


# Basic pre-clustering

## Random state = 11

In [24]:
# Number of documents N (rows of the corpus) and the cluster count, taken as
# sqrt(N) rounded to the nearest integer.
no_of_docs = corpus.shape[0]
sqrt_n = round(math.sqrt(no_of_docs))

In [25]:
# Allocate documents to sqrt_n clusters (first argument 11 is the random state
# named in the section heading), then time only the evaluation step.
# NOTE(review): the other basic pre-clustering runs pass Faiss = False
# explicitly; this call relies on the helper's default — confirm they agree.
leaders, cluster_list = allocate_docs_to_clusters(11, sqrt_n, cosine = True)
# perf_counter() is the monotonic clock intended for measuring elapsed intervals.
start = time.perf_counter()
# evaluate_preclustering() takes no arguments, so it presumably reads
# leaders / cluster_list from the notebook globals.
evaluate_with_leaders_state_11 = evaluate_preclustering()
print("Evaluation execution time, sec = " + str(time.perf_counter()-start))

Average precision across all queries = 0.18907692307692314
Mean Average Precision = 0.11784957264957267
Average nDCG = 0.15597203856880254
Evaluation execution time, sec = 35.34845590591431


In [26]:
# Preview the per-query metrics for the first ten queries (random state 11).
evaluate_with_leaders_state_11.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,deep fri food may caus cancer latest studi die...,0.2,0.04,0.131205
1,PLAIN-1007,ddt persist organ pollut industri toxin pestic...,0.4,0.13,0.277273
2,PLAIN-101,treat multipl sclerosi diet multipl sclerosi u...,0.0,0.0,0.0
3,PLAIN-1017,detoxif cancer raw food heart health heart dis...,0.0,0.0,0.0
4,PLAIN-1027,dietari guidelin heart diseas cardiovascular d...,0.4,0.2,0.360055
5,PLAIN-1038,dog meat anim product cat heart health tobacco...,0.0,0.0,0.0
6,PLAIN-1049,dr heart health heart diseas egg cholesterol s...,0.2,0.066667,0.181542
7,PLAIN-1065,dr walter mortal heart diseas heart health die...,0.0,0.0,0.0
8,PLAIN-1077,thyroid health hijiki sushi iodin sea veget sa...,0.75,0.55,0.699215
9,PLAIN-1087,easter island mortal muscl strength morbid moo...,0.0,0.0,0.0


In [29]:
# Summary statistics of the numeric metric columns across all queries.
evaluate_with_leaders_state_11.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.189077,0.11785,0.155972
std,0.286094,0.232718,0.25224
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.25,0.1,0.213986
max,1.0,1.0,1.0


## Random state = 110 

In [30]:
# Allocate documents to sqrt_n clusters with random state 110 (cosine variant,
# Faiss disabled), then time only the evaluation step.
leaders, cluster_list = allocate_docs_to_clusters(110, sqrt_n, cosine = True, Faiss = False)
# perf_counter() is the monotonic clock intended for measuring elapsed intervals.
start = time.perf_counter()
# evaluate_preclustering() takes no arguments, so it presumably reads
# leaders / cluster_list from the notebook globals.
evaluate_with_leaders_state_110 = evaluate_preclustering()
print("Evaluation execution time, sec = " + str(time.perf_counter()-start))

Average precision across all queries = 0.18994871794871807
Mean Average Precision = 0.11945641025641023
Average nDCG = 0.15664189919470173
Evaluation execution time, sec = 36.15230631828308


In [31]:
# Preview the per-query metrics for the first ten queries (random state 110).
evaluate_with_leaders_state_110.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,deep fri food may caus cancer latest studi die...,0.4,0.333333,0.50874
1,PLAIN-1007,ddt persist organ pollut industri toxin pestic...,0.8,0.543333,0.66084
2,PLAIN-101,treat multipl sclerosi diet multipl sclerosi u...,0.0,0.0,0.0
3,PLAIN-1017,detoxif cancer raw food heart health heart dis...,0.0,0.0,0.0
4,PLAIN-1027,dietari guidelin heart diseas cardiovascular d...,0.2,0.05,0.146068
5,PLAIN-1038,dog meat anim product cat heart health tobacco...,0.8,0.543333,0.66084
6,PLAIN-1049,dr heart health heart diseas egg cholesterol s...,0.0,0.0,0.0
7,PLAIN-1065,dr walter mortal heart diseas heart health die...,0.0,0.0,0.0
8,PLAIN-1077,thyroid health hijiki sushi iodin sea veget sa...,0.0,0.0,0.0
9,PLAIN-1087,easter island mortal muscl strength morbid moo...,0.2,0.05,0.146068


In [32]:
# Summary statistics of the numeric metric columns across all queries.
evaluate_with_leaders_state_110.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.189949,0.119456,0.156642
std,0.288398,0.238078,0.250659
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.25,0.1,0.21837
max,1.0,1.0,1.0


## Random state = 1100 

In [33]:
# Allocate documents to sqrt_n clusters with random state 1100 (cosine variant,
# Faiss disabled). The results are stored in the globals leaders / cluster_list.
leaders, cluster_list = allocate_docs_to_clusters(1100, sqrt_n, cosine = True, Faiss = False)

In [34]:
# Time the pre-clustering evaluation for the current leaders / cluster_list.
# perf_counter() is the monotonic clock intended for measuring elapsed intervals.
start = time.perf_counter()
# evaluate_preclustering() takes no arguments, so it presumably reads
# leaders / cluster_list from the notebook globals.
evaluate_with_leaders_state_1100 = evaluate_preclustering()
print("Evaluation execution time, sec = " + str(time.perf_counter()-start))

Average precision across all queries = 0.22235897435897453
Mean Average Precision = 0.1457384615384616
Average nDCG = 0.18726216676986415
Evaluation execution time, sec = 39.06650996208191


In [35]:
# Preview the per-query metrics for the first ten queries (random state 1100).
evaluate_with_leaders_state_1100.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,deep fri food may caus cancer latest studi die...,0.8,0.543333,0.41521
1,PLAIN-1007,ddt persist organ pollut industri toxin pestic...,0.0,0.0,0.0
2,PLAIN-101,treat multipl sclerosi diet multipl sclerosi u...,0.0,0.0,0.0
3,PLAIN-1017,detoxif cancer raw food heart health heart dis...,0.0,0.0,0.0
4,PLAIN-1027,dietari guidelin heart diseas cardiovascular d...,0.2,0.04,0.131205
5,PLAIN-1038,dog meat anim product cat heart health tobacco...,0.0,0.0,0.0
6,PLAIN-1049,dr heart health heart diseas egg cholesterol s...,0.0,0.0,0.0
7,PLAIN-1065,dr walter mortal heart diseas heart health die...,0.0,0.0,0.0
8,PLAIN-1077,thyroid health hijiki sushi iodin sea veget sa...,0.8,0.76,0.853932
9,PLAIN-1087,easter island mortal muscl strength morbid moo...,0.2,0.066667,0.16958


In [36]:
# Summary statistics of the numeric metric columns across all queries.
evaluate_with_leaders_state_1100.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.222359,0.145738,0.187262
std,0.311055,0.264541,0.278859
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.4,0.166667,0.298776
max,1.0,1.0,1.0


# Faiss pre-clustering

## Random state = 11 

In [37]:
# Faiss variant of the pre-clustering run, random state 11.
# NOTE(review): the recorded run of this cell stopped with
# "NameError: name 'faiss' is not defined", i.e. the faiss module was not in
# scope in that session. Presumably an `import faiss` is missing from
# functions_helper.py or from this notebook — confirm before re-running.
leaders, cluster_list = allocate_docs_to_clusters(11, sqrt_n, cosine = False, Faiss = True)
# Neither helper below takes arguments, so they presumably read notebook globals.
index, indices = set_indeces_for_faiss()
evaluate_with_leaders_state_11_faiss = evaluate_preclustering_faiss()

NameError: name 'faiss' is not defined

In [None]:
# Preview the Faiss evaluation result (random state 11); no output is recorded for this cell.
evaluate_with_leaders_state_11_faiss.head()

In [None]:
# Summary statistics of the Faiss evaluation result (random state 11); no output is recorded.
evaluate_with_leaders_state_11_faiss.describe()

## Random state = 110 

In [None]:
# Faiss variant of the pre-clustering run, random state 110.
# NOTE(review): this cell has no recorded execution; it presumably needs the
# faiss module in scope, like the random-state-11 cell that failed with NameError.
leaders, cluster_list = allocate_docs_to_clusters(110, sqrt_n, cosine = False, Faiss = True)
# Neither helper below takes arguments, so they presumably read notebook globals.
index, indices = set_indeces_for_faiss()
evaluate_with_leaders_state_110_faiss = evaluate_preclustering_faiss()

In [None]:
# Preview the first ten rows of the Faiss evaluation result (random state 110).
evaluate_with_leaders_state_110_faiss.head(10)

In [None]:
# Summary statistics of the Faiss evaluation result (random state 110).
evaluate_with_leaders_state_110_faiss.describe()

## Random state = 1100 

In [None]:
# Faiss variant of the pre-clustering setup, random state 1100.
# NOTE(review): this cell has no recorded execution; it presumably needs the
# faiss module in scope, like the random-state-11 cell that failed with NameError.
leaders, cluster_list = allocate_docs_to_clusters(1100, sqrt_n, cosine = False, Faiss = True)
# set_indeces_for_faiss() takes no arguments, so it presumably reads notebook globals.
index, indices = set_indeces_for_faiss()

In [None]:
%%time
# Run the Faiss evaluation; the helper takes no arguments, so it presumably
# relies on the globals set up in the previous cell (index, indices, ...).
evaluate_with_leaders_state_1100_faiss = evaluate_preclustering_faiss()

In [None]:
# Preview the first ten rows of the Faiss evaluation result (random state 1100).
evaluate_with_leaders_state_1100_faiss.head(10)

In [38]:
# Summary statistics of the Faiss evaluation result (random state 1100).
# NOTE(review): the recorded run raised NameError because
# evaluate_with_leaders_state_1100_faiss had not been created in that session;
# the cell that assigns it must run successfully first.
evaluate_with_leaders_state_1100_faiss.describe()

NameError: name 'evaluate_with_leaders_state_1100_faiss' is not defined

# KMeans pre-clustering

## Random state = 11 

In [41]:
# Fit k-means with sqrt_n clusters on the document TF-IDF matrix (random_state = 11).
# NOTE(review): KMeans is not imported in this notebook's import cell; it is
# presumably brought into scope by functions_helper.py — confirm.
estimator = KMeans(n_clusters = sqrt_n, random_state = 11)
model = estimator.fit(tf_idf_matrix)
# Display the fitted estimator's parameters.
model

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=57, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=11, tol=0.0001, verbose=0)

In [42]:
# Generate cluster predictions for the documents and store them in y_hat.
y_hat = estimator.predict(tf_idf_matrix) # cluster label for each row of tf_idf_matrix (documents, not queries)
y_hat # array of cluster labels, one per document

array([31, 31,  2, ..., 31,  8,  2])

In [43]:
# Group document row indices by their k-means cluster label:
# cluster_list_kmeans[c] holds, in ascending order, every i with y_hat[i] == c.
# Each document is appended straight to its own cluster's list in one pass,
# instead of comparing its label against all sqrt_n cluster ids.
# Labels come from a KMeans fitted with n_clusters = sqrt_n, so they lie in
# range(sqrt_n).
cluster_list_kmeans = [[] for _ in range(sqrt_n)]

for i in range(no_of_docs):
    cluster_list_kmeans[y_hat[i]].append(i)

In [44]:
# With 57 clusters in the recorded run, a query vector is compared with the 57
# cluster centroids rather than with every document.
# The centroids are stored in the fitted model's cluster_centers_ attribute.
centers = np.array(model.cluster_centers_)

In [45]:
# Evaluate retrieval with k-means pre-clustering (random state 11). The helper
# takes no arguments, so it presumably reads centers / cluster_list_kmeans from
# the notebook globals — confirm in functions_helper.py.
evaluate_kmeans_random_state_11 = evaluate_preclustering_kmeans()

Average precision across all queries = 0.3051282051282056
Mean Average Precision = 0.20255641025641039
Average nDCG = 0.26325800853754705


In [46]:
# Preview the per-query metrics for the first ten queries (k-means, random state 11).
evaluate_kmeans_random_state_11.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,deep fri food may caus cancer latest studi die...,0.6,0.286667,0.308217
1,PLAIN-1007,ddt persist organ pollut industri toxin pestic...,0.0,0.0,0.0
2,PLAIN-101,treat multipl sclerosi diet multipl sclerosi u...,0.2,0.05,0.146068
3,PLAIN-1017,detoxif cancer raw food heart health heart dis...,0.0,0.0,0.0
4,PLAIN-1027,dietari guidelin heart diseas cardiovascular d...,0.0,0.0,0.0
5,PLAIN-1038,dog meat anim product cat heart health tobacco...,1.0,1.0,1.0
6,PLAIN-1049,dr heart health heart diseas egg cholesterol s...,0.0,0.0,0.0
7,PLAIN-1065,dr walter mortal heart diseas heart health die...,0.0,0.0,0.0
8,PLAIN-1077,thyroid health hijiki sushi iodin sea veget sa...,0.25,0.05,0.146068
9,PLAIN-1087,easter island mortal muscl strength morbid moo...,0.2,0.04,0.131205


In [47]:
# Summary statistics of the numeric metric columns across all queries.
evaluate_kmeans_random_state_11.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.305128,0.202556,0.263258
std,0.327641,0.295193,0.303929
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.2,0.05,0.16958
75%,0.5,0.28,0.38939
max,1.0,1.0,1.0


## Random state = 110 

In [48]:
# Fit k-means with sqrt_n clusters on the document TF-IDF matrix (random_state = 110).
# NOTE(review): KMeans is not imported in this notebook's import cell; it is
# presumably brought into scope by functions_helper.py — confirm.
estimator = KMeans(n_clusters = sqrt_n, random_state = 110)
model = estimator.fit(tf_idf_matrix)
# Display the fitted estimator's parameters.
model

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=57, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=110, tol=0.0001, verbose=0)

In [49]:
# Generate cluster predictions for the documents and store them in y_hat.
y_hat = estimator.predict(tf_idf_matrix) # cluster label for each row of tf_idf_matrix (documents, not queries)
y_hat # array of cluster labels, one per document

array([22, 42,  9, ..., 42, 14,  9])

In [50]:
# Group document row indices by their k-means cluster label:
# cluster_list_kmeans[c] holds, in ascending order, every i with y_hat[i] == c.
# Each document is appended straight to its own cluster's list in one pass,
# instead of comparing its label against all sqrt_n cluster ids.
# Labels come from a KMeans fitted with n_clusters = sqrt_n, so they lie in
# range(sqrt_n).
cluster_list_kmeans = [[] for _ in range(sqrt_n)]

for i in range(no_of_docs):
    cluster_list_kmeans[y_hat[i]].append(i)

In [51]:
# With 57 clusters in the recorded run, a query vector is compared with the 57
# cluster centroids rather than with every document.
# The centroids are stored in the fitted model's cluster_centers_ attribute.
centers = np.array(model.cluster_centers_)

In [52]:
# Evaluate retrieval with k-means pre-clustering (random state 110). The helper
# takes no arguments, so it presumably reads centers / cluster_list_kmeans from
# the notebook globals — confirm in functions_helper.py.
evaluate_kmeans_random_state_110 = evaluate_preclustering_kmeans()

Average precision across all queries = 0.2943076923076925
Mean Average Precision = 0.18946666666666692
Average nDCG = 0.24741120294171953


In [53]:
# Preview the per-query metrics for the first ten queries (k-means, random state 110).
evaluate_kmeans_random_state_110.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,deep fri food may caus cancer latest studi die...,0.4,0.13,0.277273
1,PLAIN-1007,ddt persist organ pollut industri toxin pestic...,0.4,0.166667,0.315648
2,PLAIN-101,treat multipl sclerosi diet multipl sclerosi u...,0.2,0.05,0.146068
3,PLAIN-1017,detoxif cancer raw food heart health heart dis...,0.0,0.0,0.0
4,PLAIN-1027,dietari guidelin heart diseas cardiovascular d...,0.2,0.05,0.146068
5,PLAIN-1038,dog meat anim product cat heart health tobacco...,0.25,0.05,0.146068
6,PLAIN-1049,dr heart health heart diseas egg cholesterol s...,0.0,0.0,0.0
7,PLAIN-1065,dr walter mortal heart diseas heart health die...,0.0,0.0,0.0
8,PLAIN-1077,thyroid health hijiki sushi iodin sea veget sa...,0.0,0.0,0.0
9,PLAIN-1087,easter island mortal muscl strength morbid moo...,0.2,0.1,0.213986


In [54]:
# Summary statistics of the numeric metric columns across all queries.
evaluate_kmeans_random_state_110.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.294308,0.189467,0.247411
std,0.321899,0.278827,0.290343
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.2,0.05,0.156324
75%,0.5,0.25,0.383566
max,1.0,1.0,1.0


## Random state = 1100 

In [55]:
# Fit k-means with sqrt_n clusters on the document TF-IDF matrix (random_state = 1100).
# NOTE(review): KMeans is not imported in this notebook's import cell; it is
# presumably brought into scope by functions_helper.py — confirm.
estimator = KMeans(n_clusters = sqrt_n, random_state = 1100)
model = estimator.fit(tf_idf_matrix)
# Display the fitted estimator's parameters.
model

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=57, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=1100, tol=0.0001, verbose=0)

In [56]:
# Generate cluster predictions for the documents and store them in y_hat.
y_hat = estimator.predict(tf_idf_matrix) # cluster label for each row of tf_idf_matrix (documents, not queries)
y_hat # array of cluster labels, one per document

array([45, 45, 56, ...,  3,  3, 45])

In [57]:
# Group document row indices by their k-means cluster label:
# cluster_list_kmeans[c] holds, in ascending order, every i with y_hat[i] == c.
# Each document is appended straight to its own cluster's list in one pass,
# instead of comparing its label against all sqrt_n cluster ids.
# Labels come from a KMeans fitted with n_clusters = sqrt_n, so they lie in
# range(sqrt_n).
cluster_list_kmeans = [[] for _ in range(sqrt_n)]

for i in range(no_of_docs):
    cluster_list_kmeans[y_hat[i]].append(i)

In [58]:
# With 57 clusters in the recorded run, a query vector is compared with the 57
# cluster centroids rather than with every document.
# The centroids are stored in the fitted model's cluster_centers_ attribute.
centers = np.array(model.cluster_centers_)

In [59]:
%%time
# Evaluate retrieval with k-means pre-clustering (random state 1100). The helper
# takes no arguments, so it presumably reads centers / cluster_list_kmeans from
# the notebook globals — confirm in functions_helper.py.
evaluate_kmeans_random_state_1100 = evaluate_preclustering_kmeans()

Average precision across all queries = 0.31564102564102586
Mean Average Precision = 0.2192623931623934
Average nDCG = 0.2715072399669933
Wall time: 3min 24s


In [60]:
# Preview the per-query metrics for the first ten queries (k-means, random state 1100).
evaluate_kmeans_random_state_1100.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,deep fri food may caus cancer latest studi die...,0.6,0.286667,0.308217
1,PLAIN-1007,ddt persist organ pollut industri toxin pestic...,0.5,0.2,0.360055
2,PLAIN-101,treat multipl sclerosi diet multipl sclerosi u...,0.4,0.166667,0.242614
3,PLAIN-1017,detoxif cancer raw food heart health heart dis...,0.0,0.0,0.0
4,PLAIN-1027,dietari guidelin heart diseas cardiovascular d...,0.0,0.0,0.0
5,PLAIN-1038,dog meat anim product cat heart health tobacco...,0.25,0.05,0.146068
6,PLAIN-1049,dr heart health heart diseas egg cholesterol s...,0.0,0.0,0.0
7,PLAIN-1065,dr walter mortal heart diseas heart health die...,0.0,0.0,0.0
8,PLAIN-1077,thyroid health hijiki sushi iodin sea veget sa...,0.6,0.6,0.722727
9,PLAIN-1087,easter island mortal muscl strength morbid moo...,0.2,0.05,0.146068


In [61]:
# Summary statistics of the numeric metric columns across all queries.
evaluate_kmeans_random_state_1100.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.315641,0.219262,0.271507
std,0.343845,0.308828,0.314057
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.2,0.0625,0.168954
75%,0.6,0.333333,0.469279
max,1.0,1.0,1.0


# Random projections

In [None]:
# Get the text of each query as a string.
# NOTE(review): this rebinds queries_text from its own previous value, so
# re-running the cell applies change_to_string to already-converted data —
# confirm the helper is idempotent. The queries were also loaded with
# output_string = True earlier, so this step may be redundant; verify.

queries_text = change_to_string(queries_text)

In [None]:
# Create normalized random vectors for the random projections.

# Seed NumPy's global RNG (this makes the vectors reproducible only if
# get_random_vectors draws from np.random — confirm).
np.random.seed(0)
# Vocabulary size = number of term columns in the document TF-IDF matrix.
vocab_size = len(tf_idf_matrix.columns)
random_vectors = get_random_vectors(vocab_size, m = 15000)

# Sanity checks: overall shape, one sample vector, and that vector's Euclidean
# norm (expected to be about 1 if the vectors are normalized as stated above).
print('dimension of the set of random vectors: ', random_vectors.shape)
print(random_vectors[1])
print(np.linalg.norm(random_vectors[1]))


# Create new document vectors with reduced dimensionality

In [None]:
# Compute new document vectors with reduced dimensionality by hashing the
# norm()-processed document vectors against the random vectors.
# NOTE(review): the meaning of the third argument (0) is not visible here —
# confirm in functions_helper.py.
doc_projections = compute_hash(norm(doc_vectors), random_vectors, 0)
# Show the projection at index 1 as a spot check.
doc_projections[1]

# Create new query vectors with reduced dimensionality

In [None]:
# Compute new query vectors with reduced dimensionality, using the same random
# vectors and the same third argument (0) as for the documents.
q_projections = compute_hash(norm(q_vectors),random_vectors, 0)

# Random projections evaluation

In [None]:
%%time
# Evaluate retrieval on the projected vectors; random_projections = True is
# passed to the same full_evaluation helper used for the baseline run.
rand_proj_evaluation = full_evaluation(q_projections, doc_projections, k=5, random_projections = True)

In [None]:
# Preview the first rows of the random-projection evaluation result.
rand_proj_evaluation.head()

In [None]:
# Sweep the number of random vectors m. For each value, record the mean
# retrieval metrics and the time spent in each stage.
# perf_counter() is used for the timings because it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
m_values = [1000, 5000, 10000, 15000]  # same m values the plotting cells use on the x-axis

# One entry per m value, in the order of m_values.
precision = []
MAP = []
nDCG = []
time_get_rand_vec = []
time_hash_norm_doc = []
time_hash_norm_q = []
time_evaluation = []
for i in m_values:
    # Stage 1: generate i random vectors over the vocabulary.
    start_time_rand_vect = time.perf_counter()
    random_vectors_test = get_random_vectors(vocab_size, m = i)
    time_get_rand_vec.append(time.perf_counter() - start_time_rand_vect)

    # Stage 2: project the documents (the timing includes the norm() call).
    start_hash_norm_doc = time.perf_counter()
    doc_projections_test = compute_hash(norm(doc_vectors), random_vectors_test, 0)
    time_hash_norm_doc.append(time.perf_counter() - start_hash_norm_doc)

    # Stage 3: project the queries (the timing includes the norm() call).
    start_hash_norm_q = time.perf_counter()
    q_projections_test = compute_hash(norm(q_vectors), random_vectors_test, 0)
    time_hash_norm_q.append(time.perf_counter() - start_hash_norm_q)

    # Stage 4: evaluate retrieval on the projected vectors.
    start_evaluation = time.perf_counter()
    rand_proj_evaluation_test = full_evaluation(q_projections_test, doc_projections_test, k=5, random_projections = True)
    time_evaluation.append(time.perf_counter() - start_evaluation)

    # Mean of each per-query metric column for this m.
    precision.append(rand_proj_evaluation_test['Precision'].mean())
    MAP.append(rand_proj_evaluation_test['Average Precision'].mean())
    nDCG.append(rand_proj_evaluation_test['nDCG'].mean())


In [None]:
import matplotlib.pyplot as plt

# Plot the mean retrieval metrics against the number of random vectors m.
# The x values must match the m values used in the sweep loop.
m_values = [1000, 5000, 10000, 15000]
plt.xlabel("m")
plt.plot(m_values, precision, label = "Precision")
plt.plot(m_values, MAP, label = "MAP")
plt.plot(m_values, nDCG, label = "nDCG")
plt.legend()
# plt.show must be called; a bare `plt.show` only references the function
# and renders nothing by itself.
plt.show()

In [None]:
# Plot the time spent in each stage against the number of random vectors m.
x_m = [1000, 5000, 10000, 15000]
plt.xlabel("m")
plt.ylabel("time, sec")
for series, series_label in (
    (time_get_rand_vec, "Generate random vectors"),
    # (time_hash_norm_doc, "Project doc vectors"),  # disabled: document projection timing is not plotted
    (time_hash_norm_q, "Project query vectors"),
    (time_evaluation, "Evaluation"),
):
    plt.plot(x_m, series, label = series_label)
plt.legend()
plt.show()

In [None]:
# Print the raw per-stage timing lists (seconds), one entry per m value in the
# order the sweep loop visited them.
print(time_get_rand_vec)
print(time_hash_norm_doc)
print(time_hash_norm_q)
print(time_evaluation)
