In [267]:
import numpy as np
import pandas as pd
import faiss                   # make faiss available
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
import collections
import math
import copy
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


## Topic 4: Efficient Vector Space Retrieval

In [442]:
corpus = pd.read_csv('nfcorpus/dev.docs', sep='\t', names=['ID', 'TEXT'])
corpus.head()

Unnamed: 0,ID,TEXT
0,MED-118,alkylphenols human milk relations dietary habi...
1,MED-329,phosphate vascular toxin pubmed ncbi abstract ...
2,MED-330,dietary phosphorus acutely impairs endothelial...
3,MED-332,public health impact dietary phosphorus excess...
4,MED-334,differences total vitro digestible phosphorus ...


In [448]:
# create token list out of document
def tokenize(string):
    return string.split()

# apply term frequencies for each a single string (document)
def tf(string): 
    # create bag of words from the string
    bow = tokenize(string)
    
    tf_dict = {}
    for word in bow:
        if word in tf_dict:
            tf_dict[word] += 1
        else:
            tf_dict[word] = 1
            
    for word in tf_dict:
        tf_dict[word] = 1 + math.log(tf_dict[word])
    
    return tf_dict


In [450]:
# We then call our function on every doc and store all these tf dictionaries. 
tf_dict = {}
for index, row in corpus.iterrows():
    doc_dict = tf(row['TEXT'])
    tf_dict[index] = doc_dict

# test if tfDict was created correctly
tf_dict[0]["alkylphenols"]
# alkylphenols for doc 0 : 1

1.0

In [451]:
# total number of documents in corpus
no_of_docs = len(corpus.index)
print(no_of_docs)

3193


In [452]:
# term - key, number of docs term occured in
def count_occurances():
    count_dict = {}
    for key in tf_dict:
        for key in tf_dict[key]:
            if key in count_dict:
                count_dict[key] += 1
            else:
                count_dict[key] = 1
    return count_dict

# test if count_occurances works
count_oc = count_occurances()
count_oc["alkylphenols"] # checked with Elina, good

# number of alkylphenols occurence in entire corpus = 7

7

In [453]:
# having total number of documents and number of occurances of each word in entire corpus we can calculate 
# idf for each term as log(total # of documents / # of documents with term in it)

# idf is calculated per each term, thus we create dictionary with term as a key and idf as a value
def idf():
    
    idf_dict = {}
    for key in count_oc:
        idf_dict[key] = math.log(no_of_docs/count_oc[key])
    return idf_dict

idf = idf()

# test if idf function works
idf["alkylphenols"]

# alkylphenols idf = 6.122806043659469

6.122806043659469

In [454]:
# cosntructing the final tf-idf dictionary; tf-idf is calculated as tf-idf(t, d) = tf(t, d) * idf(t)
# so for each key in tf dict we have to miltiply it with corresponsinf idf value

def tf_idf():
    d = copy.deepcopy(tf_dict)
    for doc, value in d.items():
        for word, value in d[doc].items():
            d[doc][word] = value * idf[word]
    return d

# test if tf_idf works
a = tf_idf()
print('Result from def:')
print(a[0]["alkylphenols"])

# excpected result for (term, doc) --> (alkylphenols, 0) =  0.008547008547008548 * 6.122806043659469 = 0.05
print('Manual result:')
idf["alkylphenols"] * tf_dict[0]["alkylphenols"]

# it works :)

Result from def:
6.122806043659469
Manual result:


6.122806043659469

In [455]:
# First we have to build TF-IDF matrix based on obtain dictionary. 
# Rows will correspond to docs in the corpus, while columns will represent unique words

#              word1       ...          wordn
#  doc1   tf_idf_value   ...      tf_idf_value
#  ...    tf_idf_value   ...      tf_idf_value
#  docn   tf_idf_value   ...      tf_idf_value
#

tf_idf_matrix = pd.DataFrame.from_dict(a, orient = 'index').fillna(0) # if word does not appear in doc we change NaN to
tf_idf_matrix = tf_idf_matrix.sort_index()
tf_idf_matrix.head()

Unnamed: 0,alkylphenols,human,milk,relations,dietary,habits,central,taiwan,pubmed,ncbi,...,tuscany,studies-depression,suicides,eurosave,self-inflicted,eurostat,upward,suicide-recording,scarcity,trim-and-fill
0,6.122806,3.386148,6.547579,5.429659,2.097678,6.023975,3.504368,5.503767,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,2.600018,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.600018,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [457]:
# Now we have to compare docs by computing cosine similarity between each vector (row) in dataframe
# For that we need to obtain 1. vector magintude 2. dot product between two vectors

def vector_magnitude(v):
    return np.linalg.norm(v)

def dot_product(v1, v2):
    return np.dot(v1,v2)

# Creating cosine similarity table (should be 3193 x 3193)
def cosine_similarity(v1, v2):
    return dot_product(v1, v2)/ (vector_magnitude(v1) * vector_magnitude(v2))
print(tf_idf_matrix.iloc[0])
cosine_similarity(tf_idf_matrix.iloc[0],tf_idf_matrix.iloc[0])

alkylphenols         6.122806
human                3.386148
milk                 6.547579
relations            5.429659
dietary              2.097678
                       ...   
eurostat             0.000000
upward               0.000000
suicide-recording    0.000000
scarcity             0.000000
trim-and-fill        0.000000
Name: 0, Length: 26951, dtype: float64


0.9999999999999997

## Preclustering suggested in the lecture

In [456]:
#Set number of clusters at initialisation time
sqrt_n = round(math.sqrt(no_of_docs))

#we randomly select sqrt(N) documents from the corpus, which we call leaders
leaders = tf_idf_matrix.sample(sqrt_n)
leaders = leaders.sort_index()
leaders.head()

Unnamed: 0,alkylphenols,human,milk,relations,dietary,habits,central,taiwan,pubmed,ncbi,...,tuscany,studies-depression,suicides,eurosave,self-inflicted,eurostat,upward,suicide-recording,scarcity,trim-and-fill
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [458]:
# For every other document in the collection
# 1. Compute the similarities (cosine of the angle between TF-IDF vectors) with all leaders
# 2. Add the document to the cluster of the most similar leader

cluster_list = []

for i in range(sqrt_n):
    cluster_list.append([])

for i in range(no_of_docs):
    cosines = []
    for j in leaders.index:
        cosines.append(cosine_similarity(tf_idf_matrix.loc[i],leaders.loc[j]))
    m = max(cosines)
    index_of_max = [l for l, b in enumerate(cosines) if b == m]
    cluster_list[index_of_max[0]].append(i) #if there are two equal max values of cosine similarity use the smaller index by default
        

In [438]:
cluster_list[:1]

[[6,
  22,
  27,
  155,
  158,
  199,
  229,
  395,
  601,
  604,
  895,
  1020,
  1058,
  1060,
  1071,
  1197,
  1352,
  1374,
  1377,
  1489,
  1521,
  1539,
  1569,
  1603,
  1604,
  1605,
  1606,
  1607,
  1608,
  1609,
  1610,
  1611,
  1613,
  1614,
  1795,
  1838,
  1998,
  2306,
  2599,
  2652,
  2836,
  2960]]

In [459]:
# check of total docs (every doc should be included in exactly one cluster)
total = 0
for i in range(len(cluster_list)):
    total = total + len(cluster_list[i])

if total == no_of_docs:
    print('all docs are distributed to the clusters')

all docs are distributed to the clusters


In [525]:
#construct function, which uses query q(should be already in the vector form) as input, required similarity of the doc to be retrieved - threshold, and
#necessary number of documents to be retrieved - K (5 most similar docs in the cluster by default)

def ir_preclustering(q, threshold = 0, K = 5): 
    sim_to_leaders = [] #array of cosine similarities of q to leaders
    retrieved_docs = [] #array of the most similar docs to be returned by the function
    
    for i in range(len(leaders.index)):
        sim_to_leaders.append(cosine_similarity(q,leaders.iloc[i]))
    m = max(sim_to_leaders)
    index_of_max = [l for l, b in enumerate(sim_to_leaders) if b == m] #odinal number of most similar leader => use this cluster
    
    sim_to_docs = [] #array of cosine similarities of q to all docs in the chosen cluster
    for doc in cluster_list[index_of_max[0]]:
        sim_to_docs.append(cosine_similarity(q,tf_idf_matrix.iloc[doc]))
        
    ins = np.argsort(sim_to_docs) #returns the indices that would sort an array of similarities to docs in ascending order
    ins = ins[::-1] #but we need descending (most similar in the beginning of the list)
    
    if threshold == 0: #proceed only with K
        for k in range(K):
            retrieved_docs.append(cluster_list[index_of_max[0]][ins[k]])

        
    else:
        if sim_to_docs[ins[0]] < threshold:
            print('no documents satisfy necessary level of threshold similarity')
            return None
        
        for sim in sim_to_docs:
            if sim >= threshold:
                retrieved_docs.append(cluster_list[index_of_max[0]][sim_to_docs.index(sim)])
            if len(retrieved_docs) < K:
                print('number of documents that satisfy threshold similarity is less than required \(less than K\)')
            
        
    return corpus.iloc[retrieved_docs]

# repeat this procedure using FAISS instead of cosine similarity

In [293]:
#FAISS works only with type float32
print(type(tf_idf_matrix.loc[0][0]))
tf_idf_matrix = tf_idf_matrix.astype('float32')
print(type(tf_idf_matrix.loc[0][0]))

numpy.float32

In [307]:
#we are going to use the same leaders
leaders = leaders.astype('float32')
index = faiss.IndexFlatL2(len(leaders.columns))
index.add(np.ascontiguousarray(leaders.values))

#find the nearest leader
def ir_preclustering_faiss(q, K = 5):
    D, I = index.search(q, 1) #returning distance and index of the nearest leader
    
    index2 = faiss.IndexFlatL2(len(leaders.columns)) #train index of the cluster with nearest leader
    index2.add(np.ascontiguousarray(cluster_list[I]))
    
    if len(cluster_list[I]) < K:
        print('asked number of documents to be retrieved is larger than the number of documents in the cluster; \nall documents in the cluster are retrieved')
        return tf_idf_matrix.iloc[cluster_list[I]]
    
    else:
        DD, II = index.search(q, K) #returning distances and indexes of the nearest documents (sorted by distance)
        return tf_idf_matrix.iloc[II]


## K-means clustering

In [242]:
#Set number of clusters at initialisation time
sqrt_n = round(math.sqrt(no_of_docs))

#Run the clustering algorithm
estimator = KMeans(n_clusters = sqrt_n)
model = estimator.fit(tf_idf_matrix)
model

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=57, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [244]:
#Generate cluster predictions and store in y_hat
y_hat = estimator.predict(tf_idf_matrix) #predicting to which cluster the query belongs
y_hat #array of belongings of docs to cluster

array([33, 33, 13, ..., 16,  9,  9], dtype=int32)

In [248]:
cluster_list_kmeans = []
for i in range(sqrt_n):
    cluster_list_kmeans.append([])

for i in range(no_of_docs):
    for j in range(sqrt_n):
        if y_hat[i] == j:
            cluster_list_kmeans[j].append(i)


In [253]:
cluster_list_kmeans[0:2] #in one of the runs of kmeans not very balanced clusters: only one doc in cluster 0 and 1

[[2559], [164]]

In [254]:
#Since we have 57 clusters, we are going to compare the query vector with 57 vectors of cluster centroids
#All of cluster centroids are stored in the attribute cluster_centers
centers = np.array(model.cluster_centers_)
centers

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.71050543e-20,  4.22399631e-03,  8.67361738e-19, ...,
        -3.38813179e-21, -1.69406589e-21,  0.00000000e+00]])

In [495]:
#construct function, which uses query q(should be already in the vector form) as input, required similarity of the doc to be retrieved - threshold, and
#necessary number of documents to be retrieved - K (5 most similar docs in the cluster by default)

def ir_preclustering_kmeans(q, threshold = 0, K = 5): 
    sim_to_centers = [] #array of cosine similarities of q to leaders
    retrieved_docs = [] #array of the most similar docs to be returned by the function
    
    for i in leaders.index:
        sim_to_centers.append(cosine_similarity(q,leaders.iloc[i]))
    m = max(sim_to_centers)
    index_of_max = [l for l, b in enumerate(sim_to_centers) if b == m] #odinal number of most similar leader => use this cluster
    index_of_max = index_of_max[0]
    
    sim_to_docs = [] #array of cosine similarities of q to all docs in the chosen cluster
    for doc in cluster_list[index_of_max]:
        sim_to_docs.append(cosine_similarity(q,tf_idf_matrix.iloc[doc]))
        
    ins = np.argsort(sim_to_docs) #returns the indices that would sort an array of similarities to docs in accending order
    
    if threshold == 0: #proceed only with K
        for k in range(K):
            retrieved_docs.append(cluster_list_kmeans[m][-k-1])

        df_retrieved_docs = tf_idf_matrix.iloc[retrieved_docs] #construct the dataframe of retrieved docs to be returned by the function
    
    else:
        if sim_to_docs[ins[0]] < threshold:
            print('no documents satisfy necessary level of threshold similarity')
            return None
        
        for sim in sim_to_docs:
            if sim >= threshold:
                retrieved_docs.append(cluster_list_kmeans[m][sim_to_docs.index(sim)])
            if len(retrieved_docs) < K:
                print('number of documents that satisfy threshold similarity is less than required \(less than K\)')
            df_retrieved_docs = tf_idf_matrix.iloc[retrieved_docs]
        
    return df_retrieved_docs

## Vectorize query 

In [308]:
queries_relevance = pd.read_csv('nfcorpus/dev.2-1-0.qrel', sep='\t', names=['QUERY_ID', '0', 'DOC_ID', 'RELEVANCE_LEVEL'])
queries_relevance.head(10)

Unnamed: 0,QUERY_ID,0,DOC_ID,RELEVANCE_LEVEL
0,PLAIN-1,0,MED-2421,2
1,PLAIN-1,0,MED-2422,2
2,PLAIN-1,0,MED-2416,2
3,PLAIN-1,0,MED-2423,2
4,PLAIN-1,0,MED-2417,2
5,PLAIN-1,0,MED-2418,2
6,PLAIN-1,0,MED-4451,2
7,PLAIN-1,0,MED-2420,2
8,PLAIN-1,0,MED-2414,1
9,PLAIN-1,0,MED-4070,1


In [309]:
queries_text = pd.read_csv('nfcorpus/dev.all.queries', sep='\t', names=['ID', 'TEXT'])
queries_text.head(10)

Unnamed: 0,ID,TEXT
0,PLAIN-1,why deep fried foods may cause cancer in the l...
1,PLAIN-1007,"ddt - - persistent organic pollutants , indust..."
2,PLAIN-101,how to treat multiple sclerosis with diet mult...
3,PLAIN-1017,"detoxification - - cancer , raw food , heart h..."
4,PLAIN-1027,"dietary guidelines - - heart disease , cardiov..."
5,PLAIN-1038,"dogs - - meat , animal products , cats , heart..."
6,PLAIN-1049,"dr. david spence - - heart health , heart dise..."
7,PLAIN-1065,"dr. walter kempner - - mortality , heart disea..."
8,PLAIN-1077,"dulse - - thyroid health , hijiki , sushi , io..."
9,PLAIN-1087,"easter island - - mortality , muscle strength ..."


In [329]:
tf_idf_queries = tf_idf_matrix[0:0]
tf_idf_queries.head()

Unnamed: 0,alkylphenols,human,milk,relations,dietary,habits,central,taiwan,pubmed,ncbi,...,tuscany,studies-depression,suicides,eurosave,self-inflicted,eurostat,upward,suicide-recording,scarcity,trim-and-fill


In [330]:
for i in range(3): #range(len(queries_text)):
    tf_idf_queries = tf_idf_queries.append(pd.Series(0, index=tf_idf_queries.columns), ignore_index=True)
    tokenized_query = queries_text['TEXT'][0].split()
    for token in tokenized_query:
        for col in tf_idf_queries.columns:
            if token == col:
                tf_idf_queries[col][i] = tf_idf_queries[col][i] + 1

In [331]:
tf_idf_queries.head()

Unnamed: 0,alkylphenols,human,milk,relations,dietary,habits,central,taiwan,pubmed,ncbi,...,tuscany,studies-depression,suicides,eurosave,self-inflicted,eurostat,upward,suicide-recording,scarcity,trim-and-fill
0,0.0,1.0,0.0,0.0,11.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,11.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,11.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [431]:
def vectorize_query(string):
    tokenized_query = string.split()
    df_query = tf_idf_matrix[0:0] #dataframe of tf-idf weights of a query
    df_query = df_query.append(pd.Series(0, index=df_query.columns), ignore_index=True)
    for token in tokenized_query:
        for col in df_query.columns:
            if token == col:
                df_query[col][0] = df_query[col][0] + 1 #raw term frequency
    
    df_query = df_query.replace(0, np.nan)
    
    df_query = np.log(df_query) + 1 #log term freq(as in the slides)
    
    df_query = df_query.fillna(0)
    
    for col in df_query.columns:
        df_query[col][0] = df_query[col][0] * idf[col]
        
    return df_query
    

## Information retrieval

In [520]:
def retrieve_with_preclustering(string_query, k = 5):
    print("")
    vector_q = vectorize_query(string_query)
    return ir_preclustering(vector_q.iloc[0], K = k)

In [526]:
retrieve_with_preclustering(queries_text['TEXT'][0], k=10)




Unnamed: 0,ID,TEXT
1142,MED-2423,dietary patterns breast cancer risk women pubm...
1138,MED-2418,consumption deep-fried foods risk prostate can...
956,MED-2195,influence deep frying vegetable oils acrylamid...
1794,MED-3498,dietary acrylamide exposure french population ...
3004,MED-5088,mitigation strategies reduce acrylamide format...
1136,MED-2416,chronic intake potato chips humans increases p...
3011,MED-5095,bioequivalence docosahexaenoic acid algal oils...
3001,MED-5085,factors dominating adhesion nacl potato chips ...
1139,MED-2420,acrylamide foods review science future conside...
954,MED-2191,effects baking boiling nutritional antioxidant...


## Find all relevant documents for the query

In [513]:
queries_relevance.loc[queries_relevance['QUERY_ID'].isin(['PLAIN-1']) & queries_relevance['RELEVANCE_LEVEL'].isin([1, 2, 3])]

Unnamed: 0,QUERY_ID,0,DOC_ID,RELEVANCE_LEVEL
0,PLAIN-1,0,MED-2421,2
1,PLAIN-1,0,MED-2422,2
2,PLAIN-1,0,MED-2416,2
3,PLAIN-1,0,MED-2423,2
4,PLAIN-1,0,MED-2417,2
5,PLAIN-1,0,MED-2418,2
6,PLAIN-1,0,MED-4451,2
7,PLAIN-1,0,MED-2420,2
8,PLAIN-1,0,MED-2414,1
9,PLAIN-1,0,MED-4070,1


## Evaluate performance by:
 MAP
 P@k
 R-precision 
 nDCG

Accuracy? Precision? Recall?