In [1]:
import pandas as pd
import math
import numpy as np 
import string
import time

from IPython import get_ipython

# Execute functions_helper.py inside this notebook's namespace (`%run -i`).
# The helpers called by the cells below (preprocess_corpus, tf, idf, retrieve,
# ...) are not defined in this notebook and presumably come from that script.
# `InteractiveShell.magic()` is deprecated; `run_line_magic` is the supported
# equivalent and takes the magic name and its argument string separately.
get_ipython().run_line_magic('run', '-i "functions_helper.py"')

## Data Preprocessing

In [2]:
# Read the tab-separated document collection (column names are supplied
# explicitly, so the first line of the file is read as data) and pass the
# resulting frame straight to the preprocessing helper.
corpus = (
    pd.read_csv('nfcorpus/dev.docs', sep='\t', names=['ID', 'TEXT'])
    .pipe(preprocess_corpus)
)
# Show the first rows of the preprocessed corpus.
corpus.head()

Unnamed: 0,ID,TEXT
0,MED-118,alkylphenol human milk relat dietari habit cen...
1,MED-329,phosphat vascular toxin pubm ncbi abstract ele...
2,MED-330,dietari phosphoru acut impair endotheli functi...
3,MED-332,public health impact dietari phosphoru excess ...
4,MED-334,differ total vitro digest phosphoru content pl...


In [3]:
# Read the tab-separated test queries and preprocess them against the corpus;
# output_string=True is forwarded to the helper unchanged.
queries_text = preprocess_queries(
    corpus,
    pd.read_csv('nfcorpus/dev.all.queries', sep='\t', names=['ID', 'TEXT']),
    output_string=True,
)
# Preview the first ten preprocessed queries.
queries_text.head(10)

Unnamed: 0,ID,TEXT
0,PLAIN-1,deep fri food may caus cancer latest studi die...
1,PLAIN-1007,ddt persist organ pollut industri toxin pestic...
2,PLAIN-101,treat multipl sclerosi diet multipl sclerosi u...
3,PLAIN-1017,detoxif cancer raw food heart health heart dis...
4,PLAIN-1027,dietari guidelin heart diseas cardiovascular d...
5,PLAIN-1038,dog meat anim product cat heart health tobacco...
6,PLAIN-1049,dr heart health heart diseas egg cholesterol s...
7,PLAIN-1065,dr walter mortal heart diseas heart health die...
8,PLAIN-1077,thyroid health hijiki sushi iodin sea veget sa...
9,PLAIN-1087,easter island mortal muscl strength morbid moo...


# Load Query-Doc Relevance

In [None]:
# Load the query-document relevance judgements from the tab-separated qrel file.
queries_relevance = pd.read_csv(
    'nfcorpus/dev.2-1-0.qrel',
    sep='\t',
    names=['QUERY_ID', '0', 'DOC_ID', 'RELEVANCE_LEVEL'],
)
# Preview the first ten judgements.
queries_relevance.head(10)

## Create TF-IDF matrix for documents

In [4]:
# Build the TF-IDF matrix of the corpus in four helper calls. The helpers are
# not defined in this notebook (presumably they come from functions_helper.py):
#   tf_dict       <- tf() over the 'TEXT' column of the corpus
#   idf_dict      <- idf() from the corpus and tf_dict; it is indexed by term
#                    when the query matrix is built further down
#   tf_idf_dict   <- tf_idf() combining tf_dict and idf_dict
#   tf_idf_matrix <- tf_idf_to_matrix(); used below as a DataFrame
#                    (.head(), .columns, .values)
tf_dict = tf(corpus, column_name = 'TEXT')
idf_dict = idf(corpus, tf_dict)
tf_idf_dict = tf_idf(tf_dict, idf_dict)
tf_idf_matrix = tf_idf_to_matrix(tf_idf_dict)
# Preview the first rows; per the rendered output the columns are vocabulary terms.
tf_idf_matrix.head()

Unnamed: 0,alkylphenol,human,milk,relat,dietari,habit,central,taiwan,pubm,ncbi,...,six-year,inchianti,tuscani,studies-depress,eurosav,self-inflict,eurostat,suicide-record,scarciti,trim-and-fil
0,6.122806,2.886416,6.547579,2.90854,2.095849,5.898499,3.473596,5.503767,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,2.59775,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.59775,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Create TF-IDF matrix for queries

In [None]:
%%time
#create tf-idf matrix of queries
tf_idf_queries = tf_idf_matrix[0:0]
tf_idf_queries.head()

for i in range(len(queries_text)):
    tf_idf_queries = tf_idf_queries.append(pd.Series(0, index=tf_idf_queries.columns), ignore_index=True)
    #count occurances
    for token in (queries_text['TEXT'][i]).split():
        for col in tf_idf_queries.columns:
            if token == col:
                tf_idf_queries[col][i] = tf_idf_queries[col][i] + 1
                
#calculate log tf
tf_idf_queries = np.log(tf_idf_queries) + 1 

tf_idf_queries = tf_idf_queries.replace(-np.inf,0)

for i in range(len(queries_text)):
    for col in tf_idf_queries.columns:
        tf_idf_queries[col][i] = tf_idf_queries[col][i] * idf_dict[col]

tf_idf_queries.head()

# Create vectors

In [69]:
# Extract the underlying value arrays of the two TF-IDF DataFrames
# (.values drops the index and the column labels).
doc_vectors = tf_idf_matrix.values
q_vectors = tf_idf_queries.values
# Smoke-test the retrieve helper with k=20.
# NOTE(review): retrieve is not defined in this notebook (presumably it comes
# from functions_helper.py); the first argument 0 is presumably the position of
# the query to run — confirm.
retrieve(0, q_vectors, doc_vectors, k=20)

Unnamed: 0,ID,TEXT
1142,MED-2423,dietari pattern breast cancer risk women pubm ...
1138,MED-2418,consumpt deep-fri food risk prostat cancera b ...
956,MED-2195,influenc deep fri veget oil acrylamid format s...
1794,MED-3498,dietari acrylamid exposur french popul result ...
1141,MED-2422,statist regress model estim acrylamid concentr...
303,MED-1363,toward healthier mediterranean diet pubm ncbi ...
658,MED-1814,dietari pattern risk pancreat cancer larg popu...
3004,MED-5088,mitig strategi reduc acrylamid format fri pota...
1135,MED-2414,review epidemiolog studi dietari acrylamid int...
2521,MED-4482,meat consumpt cook practic meat mutagen risk p...


In [76]:
%%time
# Evaluate retrieval on the full TF-IDF vectors with k=5. Per the recorded
# output the helper prints the aggregate metrics; its return value is kept.
# NOTE(review): the random-projection evaluation further down is run with k=10,
# so the two sets of printed metrics are presumably not measured at the same
# cut-off — confirm this is intended.
vanilla_evaluation = full_evaluation(q_vectors, doc_vectors, k=5)

Average precision across all queries = 0.3940000000000002
Mean Average Precision = 0.26045641025641053
Average nDCG = 0.3359780305894626
Wall time: 7min 24s


In [None]:
# Preview the first rows of the baseline (full TF-IDF) evaluation result.
vanilla_evaluation.head()

# Random projections

In [111]:
# Generate the random projection vectors, one component per vocabulary term.
# The NumPy seed is fixed first; presumably get_random_vectors draws from
# NumPy's global generator, which would make the draw repeatable.
np.random.seed(0)
vocab_size = tf_idf_matrix.shape[1]
random_vectors = get_random_vectors(vocab_size, m=15000)

# Sanity checks: overall shape, then one sample vector and its Euclidean norm.
print('dimension of the set of random vectors: ', random_vectors.shape)
sample_vector = random_vectors[1]
print(sample_vector)
print(np.linalg.norm(sample_vector))


dimension of the set of random vectors:  (15000, 19930)
[0.00509512 0.00581969 0.00195364 ... 0.00109644 0.00322468 0.00769748]
1.0


In [17]:
# NOTE(review): `m` is not assigned anywhere in the visible notebook, so this
# cell relies on leftover kernel state and will raise NameError after a
# Restart & Run All unless functions_helper.py defines it — confirm, and define
# `m` explicitly in this notebook.
# The recorded mean (0.04506...) is close to the 0.045 constant passed to
# compute_hash for the documents.
print(np.mean(m))

0.045068840456878735


# Create new document vectors with reduced dimensionality

In [112]:
# Compute the reduced-dimensionality document vectors: compute_hash receives
# norm(doc_vectors), the random vectors and the constant 0.045.
# NOTE(review): 0.045 looks like the rounded mean printed by the cell above
# (0.04506...), presumably used as a threshold — confirm, and consider naming
# the constant.
doc_projections = compute_hash(norm(doc_vectors), random_vectors, 0.045)
# Inspect the projection stored at position 1.
doc_projections[1]

array([1, 1, 1, ..., 1, 1, 1])

# Create new query vectors with reduced dimensionality

In [117]:
# Compute the reduced-dimensionality query vectors with the same random vectors
# and the same 0.045 constant that is used for the documents.
# NOTE(review): the mean printed by the following cell is 0.0677..., not 0.045 —
# confirm whether the queries are meant to share the documents' constant.
q_projections = compute_hash(norm(q_vectors),random_vectors, 0.045)

In [130]:
# NOTE(review): `m` is not assigned anywhere in the visible notebook, so this
# cell relies on leftover kernel state and will raise NameError after a
# Restart & Run All unless functions_helper.py defines it — confirm, and define
# `m` explicitly in this notebook.
# The recorded mean here (0.0677...) differs from the 0.045 constant passed to
# compute_hash for the queries.
print(np.mean(m))

0.06775348863385784


# Random projections evaluation

In [126]:
%%time
# Evaluate retrieval on the projected vectors with k=10 and
# random_projections=True. Per the recorded output the helper prints the
# aggregate metrics; its return value is kept.
# NOTE(review): the baseline evaluation on the full TF-IDF vectors is run with
# k=5, so the two sets of printed metrics are presumably not measured at the
# same cut-off — confirm this is intended.
rand_proj_evaluation = full_evaluation(q_projections, doc_projections, k=10, random_projections = True)

Average precision across all queries = 0.009803418803418805
Mean Average Precision = 0.0022924297924297923
Average nDCG = 0.007394730080254836
Wall time: 19.9 s


In [127]:
# Preview the first rows of the random-projection evaluation result; the
# rendered output shows Precision, Average Precision and nDCG per query.
rand_proj_evaluation.head()

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,deep fri food may caus cancer latest studi die...,0.0,0.0,0.0
1,PLAIN-1007,ddt persist organ pollut industri toxin pestic...,0.0,0.0,0.0
2,PLAIN-101,treat multipl sclerosi diet multipl sclerosi u...,0.0,0.0,0.0
3,PLAIN-1017,detoxif cancer raw food heart health heart dis...,0.0,0.0,0.0
4,PLAIN-1027,dietari guidelin heart diseas cardiovascular d...,0.0,0.0,0.0
