In [2]:
import numpy as np
import pandas as pd
#import faiss                   # make faiss available
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
import collections
import math
import copy
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize


## Topic 4: Efficient Vector Space Retrieval

Load data and vectorize using TfidfVectorizer
https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

TF-IDF weighting

In [3]:
corpus = pd.read_csv('nfcorpus/dev.docs', sep='\t', names=['ID', 'TEXT'])
corpus

Unnamed: 0,ID,TEXT
0,MED-118,alkylphenols human milk relations dietary habi...
1,MED-329,phosphate vascular toxin pubmed ncbi abstract ...
2,MED-330,dietary phosphorus acutely impairs endothelial...
3,MED-332,public health impact dietary phosphorus excess...
4,MED-334,differences total vitro digestible phosphorus ...
...,...,...
3188,MED-5367,relationship plasma carotenoids depressive sym...
3189,MED-5368,suicide mortality relation dietary intake num ...
3190,MED-5369,suicide mortality european union pubmed ncbi a...
3191,MED-5370,long chain omega num fatty acids intake fish c...


In [105]:
#find the id of med-2421
med_id = corpus['ID'] == "MED-2421"
med_id = corpus[med_id]
med_id

Unnamed: 0,ID,TEXT
1140,MED-2421,birth weight head circumference prenatal expos...


In [4]:

# create token list out of document
def tokenize(string):
    return string.split()

# apply term frequencies for each a single string (document)
def tf(string): 
    # create bag of words from the string
    bow = tokenize(string)
    
    tf_dict = {}
    for word in bow:
        if word in tf_dict:
            tf_dict[word] += 1
        else:
            tf_dict[word] = 1
            
    for word in tf_dict:
        tf_dict[word] = tf_dict[word] / len(bow)
    
    return tf_dict

In [5]:

# We then call our function on every doc and store all these tf dictionaries. 
tf_dict = {}
for index, row in corpus.iterrows():
    doc_dict = tf(row['TEXT'])
    tf_dict[index] = doc_dict

# test if tfDict was created correctly
tf_dict[0]["alkylphenols"]
# alkylphenols for doc 0 : 0.008547008547008548

0.008547008547008548

In [6]:
# total number of documents in corpus
no_of_docs = len(corpus.index)
print(no_of_docs)

3193


In [7]:
# term - key, number of docs term occured in
def count_occurances():
    count_dict = {}
    for key in tf_dict:
        for key in tf_dict[key]:
            if key in count_dict:
                count_dict[key] += 1
            else:
                count_dict[key] = 1
    return count_dict

# test if count_occurances works
count_oc = count_occurances()
count_oc["alkylphenols"] # checked with Elina, good

# number of alkylphenols occurence in entire corpus = 7

7

In [8]:
# having total number of documents and number of occurances of each word in entire corpus we can calculate 
# idf for each term as log(total # of documents / # of documents with term in it)

# idf is calculated per each term, thus we create dictionary with term as a key and idf as a value
def idf():
    
    idf_dict = {}
    for key in count_oc:
        idf_dict[key] = math.log(no_of_docs/count_oc[key])
    return idf_dict

idf = idf()

# test if idf function works
idf["alkylphenols"]

# alkylphenols idf = 6.122806043659469

6.122806043659469

In [9]:
# cosntructing the final tf-idf dictionary; tf-idf is calculated as tf-idf(t, d) = tf(t, d) * idf(t)
# so for each key in tf dict we have to miltiply it with corresponsinf idf value

def tf_idf():
    d = copy.deepcopy(tf_dict)
    for doc, value in d.items():
        for word, value in d[doc].items():
            d[doc][word] = value * idf[word]
    return d

# test if tf_idf works
a = tf_idf()
print('Result from def:')
print(a[0]["alkylphenols"])

# excpected result for (term, doc) --> (alkylphenols, 0) =  0.008547008547008548 * 6.122806043659469 = 0.05
print('Manual result:')
idf["alkylphenols"] * tf_dict[0]["alkylphenols"]

# it works :)

Result from def:
0.05233167558683307
Manual result:


0.05233167558683307

In [10]:

# First we have to build TF-IDF matrix based on obtain dictionary. 
# Rows will correspond to docs in the corpus, while columns will represent unique words

#              word1       ...          wordn
#  doc1   tf_idf_value   ...      tf_idf_value
#  ...    tf_idf_value   ...      tf_idf_value
#  docn   tf_idf_value   ...      tf_idf_value
#

tf_idf_matrix = pd.DataFrame.from_dict(a, orient = 'index').fillna(0) # if word does not appear in doc we change NaN to 0
tf_idf_matrix = tf_idf_matrix.sort_index()
tf_idf_matrix.head()

Unnamed: 0,alkylphenols,human,milk,relations,dietary,habits,central,taiwan,pubmed,ncbi,...,tuscany,studies-depression,suicides,eurosave,self-inflicted,eurostat,upward,suicide-recording,scarcity,trim-and-fill
0,0.052332,0.041372,0.079999,0.046407,0.021178,0.060818,0.029952,0.047041,0.002278,0.002334,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001777,0.00182,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.028372,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.022663,0.0,0.0,0.0,0.001625,0.001665,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001549,0.001588,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Now we have to compare docs by computing cosine similarity between each vector (row) in dataframe
# For that we need to obtain 1. vector magnitude 2. dot product between two vectors

def vector_magnitude(v):
    return np.linalg.norm(v)

def dot_product(v1, v2):
    return np.dot(v1,v2)

# Creating cosine similarity table (should be 3193 x 3193)
def cosine_similarity(v1, v2):
    return dot_product(v1, v2)/ (vector_magnitude(v1) * vector_magnitude(v2))
print(tf_idf_matrix.iloc[0])
cosine_similarity(tf_idf_matrix.iloc[0],tf_idf_matrix.iloc[0])

alkylphenols         0.052332
human                0.041372
milk                 0.079999
relations            0.046407
dietary              0.021178
                       ...   
eurostat             0.000000
upward               0.000000
suicide-recording    0.000000
scarcity             0.000000
trim-and-fill        0.000000
Name: 0, Length: 26951, dtype: float64


0.9999999999999996

# Random Projections
**Hashing algorithm:**
<br>1.Choose a set of *$M$* random vectors ${r_1, r_2, ..., r_M}$ in the original high-dimensional vectors space (vector length $|V|$)
<br>2.For each document TF-IDF vector d do:
 - Compute the inner (dot) product of doc and each random vector $r:θ(r, d) = \sum_{i}^{|𝑉|}𝑟_𝑖∗𝑑_𝑖$
 - Hash each inner product: $h(d, r_k) = 1$ if $θ(r, d) > t$ (treshold), else 0

3.Compute a new vector of hashes:
 - $d’ = [h(d, r_1), h(d, r_2), ..., h(d, r_M)]$
 - The number of selected random vectors, *$M$*, is the dimensionality of hashed vectors

In [49]:
#a function for  creating a set of M random vectors with the dimension dim
def get_random_vectors(dim,m):
    return np.random.random((m, dim))

In [145]:
#test the get_random_vectors
vocab_size = len(tf_idf_matrix.columns)
np.random.seed(0)
m = 1000
random_vectors = get_random_vectors(vocab_size, m)
print('dimension of the set of random vectors: ', random_vectors.shape)
random_vectors[1]

dimension of the set of random vectors:  (1000, 26951)


array([0.06284314, 0.51746825, 0.9356375 , ..., 0.81412037, 0.24848416,
       0.09751716])

In [14]:
# d - document vector
# rnd_vec - set of random vectors
# t - threshold
def norm(vec):
    return vec/np.linalg.norm(vec)
    
    
def compute_hash(docs, rnd_vec, t):
    hashed_doc_vectors = []
    #for each document in document collection
    for doc in docs:
        hashed_dot_product = []
        inner_product = doc.dot(rnd_vec.transpose())
        for i in inner_product:  
            if i>t:
                hashed_dot_product.append(1)
            else:
                hashed_dot_product.append(0)
        hashed_doc_vectors.append(hashed_dot_product)
    return np.array(hashed_doc_vectors)

In [141]:
print(np.linalg.norm(doc_vectors[0]))
print(np.linalg.norm(random_vectors[0]))

0.4477014678356913
94.52892786806335


In [146]:
#normalize doc and random vectors
doc_vectors = tf_idf_matrix.values
norm_doc_vectors = []
for doc in doc_vectors:
    norm_doc_vectors.append(norm(doc))
norm_doc_vectors = np.array(norm_doc_vectors)
norm_rand_vectors = []
for rand in random_vectors:
    norm_rand_vectors.append(norm(rand))
norm_rand_vectors = np.array(norm_rand_vectors)
print(norm_doc_vectors[0])
print(np.linalg.norm(norm_doc_vectors[0]))
print(norm_rand_vectors[0])
print(np.linalg.norm(norm_rand_vectors[0]))

[0.11688967 0.09241034 0.178688   ... 0.         0.         0.        ]
0.9999999999999999
[0.00580577 0.00756583 0.0063765  ... 0.00439207 0.00476961 0.00070877]
1.0


In [151]:
#### test compute_hash
inn = norm_doc_vectors[120].dot(norm_rand_vectors.transpose())
h = []
for i in inn:
    if i>0.03:
        h.append(1)
    else:
        h.append(0)
print(inn)
print(h)

[0.03694885 0.03347691 0.03509818 0.03908683 0.03716179 0.03458965
 0.03233641 0.0318238  0.03385612 0.03688681 0.03829495 0.03366506
 0.03345653 0.0405509  0.03487201 0.0398219  0.0369301  0.03136072
 0.03701938 0.03681286 0.03807685 0.03580785 0.03408752 0.0365292
 0.03662782 0.03549769 0.04245463 0.03677506 0.03613482 0.04192047
 0.0339268  0.03569237 0.03441085 0.0394335  0.02881328 0.03814459
 0.03799954 0.03364659 0.03205626 0.03212065 0.03256582 0.04031167
 0.03182015 0.03658475 0.03345521 0.03114847 0.03237542 0.03913713
 0.03877613 0.03799149 0.03301124 0.03434014 0.03582192 0.03694325
 0.03342712 0.03569465 0.03696214 0.03057352 0.03821359 0.03847459
 0.0323195  0.03541804 0.03646257 0.03153249 0.03507998 0.03491626
 0.03095135 0.0397747  0.03678468 0.03572512 0.03517627 0.03644772
 0.03632206 0.03460287 0.03848196 0.0331872  0.04109994 0.03824099
 0.03728287 0.03753215 0.03939116 0.03947001 0.0380501  0.02880263
 0.02990037 0.03736004 0.03731446 0.03771752 0.03996156 0.03367

In [148]:
%%time
random_projections = compute_hash(norm_doc_vectors, norm_rand_vectors, 0.03)
random_projections[0]

Wall time: 36.7 s


array([1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,

# Use random projections

In [152]:
#read queries 
queries_text = pd.read_csv('nfcorpus/dev.all.queries', sep='\t', names=['ID', 'TEXT'])
queries_text.head()

Unnamed: 0,ID,TEXT
0,PLAIN-1,why deep fried foods may cause cancer in the l...
1,PLAIN-1007,"ddt - - persistent organic pollutants , indust..."
2,PLAIN-101,how to treat multiple sclerosis with diet mult...
3,PLAIN-1017,"detoxification - - cancer , raw food , heart h..."
4,PLAIN-1027,"dietary guidelines - - heart disease , cardiov..."


In [19]:
def vectorize_query(string):
    tokenized_query = string.split()
    df_query = tf_idf_matrix[0:0] #dataframe of tf-idf weights of a query
    df_query = df_query.append(pd.Series(0, index=df_query.columns), ignore_index=True)
    for token in tokenized_query:
        for col in df_query.columns:
            if token == col:
                df_query[col][0] = df_query[col][0] + 1 #raw term frequency
    
    df_query = df_query.replace(0, np.nan)
    
    df_query = np.log(df_query) + 1 #log term freq(as in the slides)
    
    df_query = df_query.fillna(0)
    
    for col in df_query.columns:
        df_query[col][0] = df_query[col][0] * idf[col]
        
    return df_query

In [159]:
queries = []
for index, row in queries_text.iterrows():
    queries.append(norm(vectorize_query(row['TEXT'])).values)
queries = np.array(queries)

In [None]:
query = norm(vectorize_query(queries_text['TEXT'][0])).values


In [172]:
print(queries[0])
print(np.linalg.norm(queries[0]))

[[0.        0.0074371 0.        ... 0.        0.        0.       ]]
1.0


In [184]:
#use random projections on query
query_projection = compute_hash(queries[0],norm_rand_vectors, 0.03)
query_projection

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [182]:
inn = queries[3][0].dot(norm_rand_vectors.transpose())
h = []
for i in inn:
    if i>0.03:
        h.append(1)
    else:
        h.append(0)
print(inn)
print(h)

[0.01418965 0.01312186 0.02024626 0.0151166  0.01711882 0.01474291
 0.02001731 0.01987751 0.01506695 0.01631306 0.02229158 0.02160481
 0.0197475  0.01924086 0.02121038 0.01692146 0.01361659 0.01874481
 0.01875865 0.02152752 0.01949002 0.02404756 0.0190541  0.01526021
 0.02133627 0.01762899 0.01601999 0.02082219 0.01608068 0.01799404
 0.01764885 0.02008633 0.01997723 0.02103125 0.01126997 0.0222526
 0.01824178 0.01600529 0.01816196 0.02189278 0.01901328 0.01861716
 0.01905788 0.01938079 0.01745998 0.02195585 0.02136404 0.02282045
 0.02137035 0.01927101 0.02318791 0.0268834  0.0218458  0.01664822
 0.01925513 0.01584503 0.02334391 0.01853832 0.01943791 0.02389146
 0.02256401 0.0198177  0.02492088 0.01892332 0.02234661 0.01925965
 0.02373871 0.02394984 0.01434891 0.01819844 0.02037914 0.02501464
 0.01632429 0.02132532 0.02264144 0.02847278 0.01673142 0.02757921
 0.02040803 0.01772402 0.01790145 0.01929002 0.01821828 0.01692696
 0.01796749 0.01744559 0.01413976 0.01675808 0.0172711  0.02639

In [185]:
cosine_similarity(query_projection[0],random_projections[0])

0.9214119599831555

# Evaluation

In [None]:
def evaluate_random_projections()