In [1]:
import numpy as np
import pandas as pd
#import faiss                   # make faiss available
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
import collections
import math
import copy
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize


## Topic 4: Efficient Vector Space Retrieval

Load data and vectorize using TfidfVectorizer
https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

TF-IDF weighting

In [2]:
corpus = pd.read_csv('nfcorpus/dev.docs', sep='\t', names=['ID', 'TEXT'])
corpus

Unnamed: 0,ID,TEXT
0,MED-118,alkylphenols human milk relations dietary habi...
1,MED-329,phosphate vascular toxin pubmed ncbi abstract ...
2,MED-330,dietary phosphorus acutely impairs endothelial...
3,MED-332,public health impact dietary phosphorus excess...
4,MED-334,differences total vitro digestible phosphorus ...
5,MED-335,differences total vitro digestible phosphorus ...
6,MED-398,grapefruit wine glass metabolic cardiovascular...
7,MED-557,dysmenorrhea pubmed ncbi abstract dysmenorrhea...
8,MED-666,role surgery treatment mastalgia pubmed ncbi a...
9,MED-691,ginger prevention nausea vomiting review pubme...


In [3]:

# create token list out of document
def tokenize(string):
    return string.split()

# apply term frequencies for each a single string (document)
def tf(string): 
    # create bag of words from the string
    bow = tokenize(string)
    
    tf_dict = {}
    for word in bow:
        if word in tf_dict:
            tf_dict[word] += 1
        else:
            tf_dict[word] = 1
            
    for word in tf_dict:
        tf_dict[word] = tf_dict[word] / len(bow)
    
    return tf_dict

In [4]:

# We then call our function on every doc and store all these tf dictionaries. 
tf_dict = {}
for index, row in corpus.iterrows():
    doc_dict = tf(row['TEXT'])
    tf_dict[index] = doc_dict

# test if tfDict was created correctly
tf_dict[0]["alkylphenols"]
# alkylphenols for doc 0 : 0.008547008547008548

0.008547008547008548

In [5]:
# total number of documents in corpus
no_of_docs = len(corpus.index)
print(no_of_docs)

3193


In [6]:
# term - key, number of docs term occured in
def count_occurances():
    count_dict = {}
    for key in tf_dict:
        for key in tf_dict[key]:
            if key in count_dict:
                count_dict[key] += 1
            else:
                count_dict[key] = 1
    return count_dict

# test if count_occurances works
count_oc = count_occurances()
count_oc["alkylphenols"] # checked with Elina, good

# number of alkylphenols occurence in entire corpus = 7

7

In [7]:
# having total number of documents and number of occurances of each word in entire corpus we can calculate 
# idf for each term as log(total # of documents / # of documents with term in it)

# idf is calculated per each term, thus we create dictionary with term as a key and idf as a value
def idf():
    
    idf_dict = {}
    for key in count_oc:
        idf_dict[key] = math.log(no_of_docs/count_oc[key])
    return idf_dict

idf = idf()

# test if idf function works
idf["alkylphenols"]

# alkylphenols idf = 6.122806043659469

6.122806043659469

In [8]:
# cosntructing the final tf-idf dictionary; tf-idf is calculated as tf-idf(t, d) = tf(t, d) * idf(t)
# so for each key in tf dict we have to miltiply it with corresponsinf idf value

def tf_idf():
    d = copy.deepcopy(tf_dict)
    for doc, value in d.items():
        for word, value in d[doc].items():
            d[doc][word] = value * idf[word]
    return d

# test if tf_idf works
a = tf_idf()
print('Result from def:')
print(a[0]["alkylphenols"])

# excpected result for (term, doc) --> (alkylphenols, 0) =  0.008547008547008548 * 6.122806043659469 = 0.05
print('Manual result:')
idf["alkylphenols"] * tf_dict[0]["alkylphenols"]

# it works :)

Result from def:
0.05233167558683307
Manual result:


0.05233167558683307

In [9]:

# First we have to build TF-IDF matrix based on obtain dictionary. 
# Rows will correspond to docs in the corpus, while columns will represent unique words

#              word1       ...          wordn
#  doc1   tf_idf_value   ...      tf_idf_value
#  ...    tf_idf_value   ...      tf_idf_value
#  docn   tf_idf_value   ...      tf_idf_value
#

tf_idf_matrix = pd.DataFrame.from_dict(a, orient = 'index').fillna(0) # if word does not appear in doc we change NaN to 0
tf_idf_matrix = tf_idf_matrix.sort_index()
tf_idf_matrix.head()

Unnamed: 0,alkylphenols,human,milk,relations,dietary,habits,central,taiwan,pubmed,ncbi,...,tuscany,studies-depression,suicides,eurosave,self-inflicted,eurostat,upward,suicide-recording,scarcity,trim-and-fill
0,0.052332,0.041372,0.079999,0.046407,0.021178,0.060818,0.029952,0.047041,0.002278,0.002334,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001777,0.00182,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.028372,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.022663,0.0,0.0,0.0,0.001625,0.001665,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001549,0.001588,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Now we have to compare docs by computing cosine similarity between each vector (row) in dataframe
# For that we need to obtain 1. vector magnitude 2. dot product between two vectors

def vector_magnitude(v):
    return np.linalg.norm(v)

def dot_product(v1, v2):
    return np.dot(v1,v2)

# Creating cosine similarity table (should be 3193 x 3193)
def cosine_similarity(v1, v2):
    return dot_product(v1, v2)/ (vector_magnitude(v1) * vector_magnitude(v2))
print(tf_idf_matrix.iloc[0])
cosine_similarity(tf_idf_matrix.iloc[0],tf_idf_matrix.iloc[0])

alkylphenols             0.052332
human                    0.041372
milk                     0.079999
relations                0.046407
dietary                  0.021178
habits                   0.060818
central                  0.029952
taiwan                   0.047041
pubmed                   0.002278
ncbi                     0.002334
abstract                 0.000056
aims                     0.031299
study                    0.006299
determine                0.019019
concentrations           0.034312
num                      0.017474
nonylphenol              0.050184
np                       0.110415
octylphenol              0.055208
op                       0.193874
samples                  0.019661
examine                  0.023895
related                  0.018125
factors                  0.014389
including                0.015695
mothers                  0.084329
demographics             0.045818
women                    0.015158
consumed                 0.041914
median        

0.9999999999999996

# Random Projections
**Hashing algorithm:**
<br>1.Choose a set of *$M$* random vectors ${r_1, r_2, ..., r_M}$ in the original high-dimensional vectors space (vector length $|V|$)
<br>2.For each document TF-IDF vector d do:
 - Compute the inner (dot) product of doc and each random vector $r:θ(r, d) = \sum_{i}^{|𝑉|}𝑟_𝑖∗𝑑_𝑖$
 - Hash each inner product: $h(d, r_k) = 1$ if $θ(r, d) > t$ (treshold), else 0

3.Compute a new vector of hashes:
 - $d’ = [h(d, r_1), h(d, r_2), ..., h(d, r_M)]$
 - The number of selected random vectors, *$M$*, is the dimensionality of hashed vectors

In [11]:
#a function for  creating a set of M random vectors with the dimension dim
def get_random_vectors(dim,m):
    return np.random.random_sample((m, dim))

In [12]:
#test the get_random_vectors
vocab_size = len(tf_idf_matrix.columns)
np.random.seed(0)
m = 100
random_vectors = get_random_vectors(vocab_size, m)
print('dimension of the set of random vectors: ', random_vectors.shape)
random_vectors[1]

dimension of the set of random vectors:  (100, 26951)


array([0.06284314, 0.51746825, 0.9356375 , ..., 0.81412037, 0.24848416,
       0.09751716])

In [13]:
# d - document vector
# rnd_vec - set of random vectors
# t - threshold
def norm(vec):
    return vec/vector_magnitude(vec)
    
    
def compute_hash(docs, rnd_vec, t):
    hashed_doc_vectors = []
    #for each document in document collection
    for doc in docs:
        hashed_dot_product = []
        inner_product = doc.dot(rnd_vec.transpose())
        for i in inner_product:  
            if i>t:
                hashed_dot_product.append(1)
            else:
                hashed_dot_product.append(0)
        hashed_doc_vectors.append(hashed_dot_product)
    return np.array(hashed_doc_vectors)

In [14]:
#test compute_hash
doc_vectors = tf_idf_matrix.values
inn = doc_vectors[0].dot(random_vectors.transpose())
h = []
for i in inn:
    if i>1.2:
        h.append(1)
    else:
        h.append(0)
print(inn)
print(h)

[1.52935202 1.40158704 1.37560483 1.40460678 1.25290627 1.56556019
 1.19668373 1.50428146 1.32872635 1.70703534 1.1779663  1.3402548
 1.43852473 1.39830539 1.26146833 1.47158566 1.29046752 1.50346167
 1.21262387 1.46803553 1.48094456 1.30494526 1.23609255 1.63103709
 1.2354678  1.32023814 1.40655926 1.28845768 1.54915213 1.28072208
 1.60531398 1.34482263 1.4346691  1.45906407 1.39824523 1.51955422
 1.28433195 1.26968334 1.31652239 1.35567704 1.32194379 1.39671996
 1.45932261 1.23691701 1.18106196 1.33183443 1.47699914 1.26385761
 1.57970796 1.23144514 1.33786753 1.37532657 1.28572843 1.2927297
 1.36897493 1.51295204 1.70902037 1.16520591 1.23202941 1.52018073
 1.33785418 1.56110283 1.48080842 1.39851598 1.38365494 1.44223707
 1.33886896 1.32950293 1.41556142 1.56642664 1.47891912 1.3567732
 1.43461308 1.44973598 1.40005237 1.44120087 1.37411145 1.2967823
 1.29051957 1.34881912 1.47508042 1.68026367 1.46225666 1.30655028
 1.22878692 1.54757521 1.45004802 1.31474515 1.56693045 1.26252524

In [15]:
random_projections = compute_hash(doc_vectors, random_vectors, 1.2)
random_projections[0]

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Use random projections

In [16]:
#read queries 
queries = pd.read_csv('nfcorpus/dev.titles.queries', sep='\t', names=['ID', 'TEXT'])
queries.head()

Unnamed: 0,ID,TEXT
0,PLAIN-1,why deep fried foods may cause cancer
1,PLAIN-1007,ddt
2,PLAIN-101,how to treat multiple sclerosis with diet
3,PLAIN-1017,detoxification
4,PLAIN-1027,dietary guidelines


In [21]:
#create vocab
vocab = tf_idf_matrix.columns
vocab

Index(['alkylphenols', 'human', 'milk', 'relations', 'dietary', 'habits',
       'central', 'taiwan', 'pubmed', 'ncbi',
       ...
       'tuscany', 'studies-depression', 'suicides', 'eurosave',
       'self-inflicted', 'eurostat', 'upward', 'suicide-recording', 'scarcity',
       'trim-and-fill'],
      dtype='object', length=26951)

In [74]:
q_vect = np.zeros(vocab.shape)
q_vect.shape

(26951,)

In [75]:
%%time
#create query vectors
query_vectors = {}
for index, row in queries.iterrows():    
    q_vect = np.zeros(vocab.shape)
    t_q = tokenize(q)
    q = row['TEXT']
    i=0
    for t in vocab:
        if t in t_q:
            q_vect[i] = 1
        i+=1
    query_vectors[index] = [row['ID'],q_vect]

Wall time: 2.11 s


In [78]:
print(sum(query_vectors[0][1]))

1.0


In [52]:
s = "may"
if s in vocab:
    print("the word exists in the vocab")
else:
    print("the word does not exist in the vocab")
    
#some of the words that exist in the query don't exist in our vocabulary

the word does not exist in the vocab


In [54]:
%%time
#calculate cosine similarity for q[0] and other documents
for doc in doc_vectors:
    print(cosine_similarity(doc, query_vectors[0][1]))

ValueError: shapes (26951,) and (188657,) not aligned: 26951 (dim 0) != 188657 (dim 0)