In [97]:
import numpy as np
import pandas as pd
import faiss                   # make faiss available
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
import collections
import math
import copy
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

import itertools
import more_itertools as mit
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import re
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/MacBook/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Topic 4: Efficient Vector Space Retrieval

In [2]:
corpus = pd.read_csv('nfcorpus/dev.docs', sep='\t', names=['ID', 'TEXT'])
corpus.head()

Unnamed: 0,ID,TEXT
0,MED-118,alkylphenols human milk relations dietary habi...
1,MED-329,phosphate vascular toxin pubmed ncbi abstract ...
2,MED-330,dietary phosphorus acutely impairs endothelial...
3,MED-332,public health impact dietary phosphorus excess...
4,MED-334,differences total vitro digestible phosphorus ...


In [89]:
# function for corpus preprocessing: stemming

def preprocess_corpus(data):

    ps = PorterStemmer()

    def stemSentence(sentence,ps):
        token_words = word_tokenize(sentence)
        stem_sentence = []
        for word in token_words:
            stem_sentence.append(ps.stem(word))
            stem_sentence.append(" ")
        return "".join(stem_sentence)

    data['TEXT'] = data.apply(lambda x: stemSentence(x['TEXT'],ps), axis=1)

    return data

In [98]:
corpus = preprocess_corpus(corpus)
corpus.head()

Unnamed: 0,ID,TEXT
0,MED-118,alkylphenol human milk relat dietari habit cen...
1,MED-329,phosphat vascular toxin pubm ncbi abstract ele...
2,MED-330,dietari phosphoru acut impair endotheli functi...
3,MED-332,public health impact dietari phosphoru excess ...
4,MED-334,differ total vitro digest phosphoru content pl...


In [99]:
# apply term frequencies for each a single string (document)
def tf(qstring):
    
    
    # create token list out of document
    def tokenize(qstring):
        return qstring.split()

    # create bag of words from the string
    bow = tokenize(qstring)
    
    tf_dict = {}
    for word in bow:
        if word in tf_dict:
            tf_dict[word] += 1
        else:
            tf_dict[word] = 1
            
    for word in tf_dict:
        tf_dict[word] = 1 + math.log(tf_dict[word])
    
    return tf_dict


In [102]:
# We then call our function on every doc and store all these tf dictionaries. 
tf_dict = {}
for index, row in corpus.iterrows():
    doc_dict = tf(row['TEXT'])
    tf_dict[index] = doc_dict

# test if tfDict was created correctly
tf_dict[0]["alkylphenol"]
# alkylphenols for doc 0 : 1

1.0

In [5]:
# total number of documents in corpus
no_of_docs = len(corpus.index)
print(no_of_docs)

3193


In [103]:
# term - key, number of docs term occured in
def count_occurances():
    count_dict = {}
    for key in tf_dict:
        for key in tf_dict[key]:
            if key in count_dict:
                count_dict[key] += 1
            else:
                count_dict[key] = 1
    return count_dict

# test if count_occurances works
count_oc = count_occurances()
count_oc["alkylphenol"] 

# number of alkylphenols occurence in entire corpus = 7

7

In [104]:
# having total number of documents and number of occurances of each word in entire corpus we can calculate 
# idf for each term as log(total # of documents / # of documents with term in it)

# idf is calculated per each term, thus we create dictionary with term as a key and idf as a value
def idf():
    
    idf_dict = {}
    for key in count_oc:
        idf_dict[key] = math.log(no_of_docs/count_oc[key])
    return idf_dict

idf = idf()

# test if idf function works
idf["alkylphenol"]

# alkylphenols idf = 6.122806043659469

6.122806043659469

In [105]:
# cosntructing the final tf-idf dictionary; tf-idf is calculated as tf-idf(t, d) = tf(t, d) * idf(t)
# so for each key in tf dict we have to miltiply it with corresponsinf idf value

def tf_idf():
    d = copy.deepcopy(tf_dict)
    for doc, value in d.items():
        for word, value in d[doc].items():
            d[doc][word] = value * idf[word]
    return d

# test if tf_idf works
a = tf_idf()
print('Result from def:')
print(a[0]["alkylphenol"])

# excpected result for (term, doc) --> (alkylphenols, 0) =  0.008547008547008548 * 6.122806043659469 = 0.05
print('Manual result:')
idf["alkylphenol"] * tf_dict[0]["alkylphenol"]

# it works :)

Result from def:
6.122806043659469
Manual result:


6.122806043659469

In [106]:
# First we have to build TF-IDF matrix based on obtain dictionary. 
# Rows will correspond to docs in the corpus, while columns will represent unique words

#              word1       ...          wordn
#  doc1   tf_idf_value   ...      tf_idf_value
#  ...    tf_idf_value   ...      tf_idf_value
#  docn   tf_idf_value   ...      tf_idf_value
#

tf_idf_matrix = pd.DataFrame.from_dict(a, orient = 'index').fillna(0) # if word does not appear in doc we change NaN to
tf_idf_matrix = tf_idf_matrix.sort_index()
tf_idf_matrix.head()

Unnamed: 0,alkylphenol,human,milk,relat,dietari,habit,central,taiwan,pubm,ncbi,...,six-year,inchianti,tuscani,studies-depress,eurosav,self-inflict,eurostat,suicide-record,scarciti,trim-and-fil
0,6.122806,2.886416,6.547579,2.90854,2.095849,5.898499,3.473596,5.503767,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,2.59775,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.59775,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266507,0.27307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [107]:
# Now we have to compare docs by computing cosine similarity between each vector (row) in dataframe
# For that we need to obtain 1. vector magintude 2. dot product between two vectors

def vector_magnitude(v):
    return np.linalg.norm(v)

def dot_product(v1, v2):
    return np.dot(v1,v2)

# Creating cosine similarity table (should be 3193 x 3193)
def cosine_similarity(v1, v2):
    return dot_product(v1, v2)/ (vector_magnitude(v1) * vector_magnitude(v2))
print(tf_idf_matrix.iloc[0])
cosine_similarity(tf_idf_matrix.iloc[0],tf_idf_matrix.iloc[0])

alkylphenol       6.122806
human             2.886416
milk              6.547579
relat             2.908540
dietari           2.095849
                    ...   
self-inflict      0.000000
eurostat          0.000000
suicide-record    0.000000
scarciti          0.000000
trim-and-fil      0.000000
Name: 0, Length: 19930, dtype: float64


1.0

## Preclustering suggested in the lecture

In [245]:
def allocate_docs_to_clusters(rs, cosine = False, Faiss = False):
    
    def set_leaders(random_state = rs):
        #Set number of clusters at initialisation time
        sqrt_n = round(math.sqrt(no_of_docs))
        #we randomly select sqrt(N) documents from the corpus, which we call leaders
        leaders = tf_idf_matrix.sample(sqrt_n, random_state = rs)
        leaders = leaders.sort_index()
        return leaders

    leaders = set_leaders(random_state = rs)
    
    # For every other document in the collection
    # 1. Compute the similarities (cosine of the angle between TF-IDF vectors) with all leaders
    # 2. Add the document to the cluster of the most similar leader
    if (cosine and Faiss) or ((not cosine) and (not Faiss)):
        print('both true')
        return [], []
    
    
    elif cosine == True:
        cluster_list = []

        for i in range(sqrt_n):
            cluster_list.append([])

        for i in range(no_of_docs):
            cosines = []
            for j in leaders.index:
                cosines.append(cosine_similarity(tf_idf_matrix.loc[i],leaders.loc[j]))
            m = max(cosines)
            index_of_max = [l for l, b in enumerate(cosines) if b == m]
            cluster_list[index_of_max[0]].append(i) #if there are two equal max values of cosine similarity use the smaller index by default
        return leaders, cluster_list
    
    elif Faiss == True:
        
        leaders = leaders.astype('float32')
        index = faiss.IndexFlatL2(len(leaders.columns))
        index.add(np.ascontiguousarray(leaders.values))
        
        cluster_list = []

        for i in range(sqrt_n):
            cluster_list.append([])

        for i in range(no_of_docs):
            doc = np.array([tf_idf_matrix.loc[i]])
            D, I = index.search(doc, 1)
            cluster_list[I[0][0]].append(i)
        
        return leaders, cluster_list
    
    else:
        print('exeption')
        return [],[]


In [249]:
leaders, cluster_list = allocate_docs_to_clusters(11, cosine = False, Faiss = True)
print(leaders)
print(cluster_list)

      alkylphenol     human      milk     relat   dietari     habit   central  \
276           0.0  1.375393  0.000000  0.000000  1.237842  0.000000  0.000000   
400           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
518           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
523           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
544           0.0  0.000000  0.000000  0.000000  3.230072  0.000000  0.000000   
659           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
713           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
746           0.0  0.000000  0.000000  1.717830  0.000000  0.000000  0.000000   
755           0.0  0.000000  0.000000  0.000000  3.455757  0.000000  0.000000   
870           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
930           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
983           0.0  0.000000 

In [250]:
leaders, cluster_list = allocate_docs_to_clusters(11, cosine = True, Faiss = False)
print(leaders)
print(cluster_list)

      alkylphenol     human      milk     relat   dietari     habit   central  \
276           0.0  1.375393  0.000000  0.000000  1.237842  0.000000  0.000000   
400           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
518           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
523           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
544           0.0  0.000000  0.000000  0.000000  3.230072  0.000000  0.000000   
659           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
713           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
746           0.0  0.000000  0.000000  1.717830  0.000000  0.000000  0.000000   
755           0.0  0.000000  0.000000  0.000000  3.455757  0.000000  0.000000   
870           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
930           0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
983           0.0  0.000000 

In [14]:
#construct function, which uses query q(should be already in the vector form) as input, required similarity of the doc to be retrieved - threshold, and
#necessary number of documents to be retrieved - K (5 most similar docs in the cluster by default)

def ir_preclustering(q, threshold = 0, K = 5): 
    
    sim_to_leaders = [] #array of cosine similarities of q to leaders
    retrieved_docs = [] #array of the most similar docs to be returned by the function
    
    for i in range(len(leaders.index)):
        sim_to_leaders.append(cosine_similarity(q,leaders.iloc[i]))
        m = max(sim_to_leaders)
        index_of_max = [l for l, b in enumerate(sim_to_leaders) if b == m] #odinal number of most similar leader => use this cluster
    
    sim_to_docs = [] #array of cosine similarities of q to all docs in the chosen cluster
    for doc in cluster_list[index_of_max[0]]:
        sim_to_docs.append(cosine_similarity(q,tf_idf_matrix.iloc[doc]))
        
    ins = np.argsort(sim_to_docs) #returns the indices that would sort an array of similarities to docs in ascending order
    ins = ins[::-1] #but we need descending (most similar in the beginning of the list)
    
    if threshold == 0: #proceed only with K
        if len(ins)>=K:
            for k in range(K):
                retrieved_docs.append(cluster_list[index_of_max[0]][ins[k]])
        else:
            K=len(ins)
            for k in range(K):
                retrieved_docs.append(cluster_list[index_of_max[0]][ins[k]])

        
    else:
        if sim_to_docs[ins[0]] < threshold:
            print('no documents satisfy necessary level of threshold similarity')
            return None
        
        for sim in sim_to_docs:
            if sim >= threshold:
                retrieved_docs.append(cluster_list[index_of_max[0]][sim_to_docs.index(sim)])
            if len(retrieved_docs) < K:
                print('number of documents that satisfy threshold similarity is less than required \(less than K\)')
    
    return corpus.iloc[retrieved_docs]

# repeat this procedure using FAISS instead of cosine similarity

In [148]:
#FAISS works only with type float32
tf_idf_matrix = tf_idf_matrix.astype('float32')

In [394]:
def set_indeces_for_faiss():
    index = faiss.IndexFlatL2(len(leaders.columns))
    index.add(np.ascontiguousarray(leaders.values))
    
    indices = []
    for i in range(len(leaders)):
        index2 = faiss.IndexFlatL2(len(leaders.columns))
        index2.add(np.ascontiguousarray(tf_idf_matrix.loc[cluster_list[i]]))
        indices.append(index2)
        
    return index, indices

In [401]:
#we are going to use the same leaders
leaders = leaders.astype('float32')
#find the nearest leader
def ir_preclustering_faiss(q, K = 5, threshold = 0):
    
    

    q = np.array([q])
    D, I = index.search(q, 1) #returning distance and index of the nearest leader
    
    index2 = indices[I[0][0]]
    
    if threshold == 0: #proceed only with K
        
        if len(cluster_list[I[0][0]]) < K:
            print('asked number of documents to be retrieved is larger than the number of documents in the cluster; \nall documents in the cluster are retrieved')
            return corpus.iloc[cluster_list[I[0][0]]]   
        else:
            DD, II = index2.search(q, K) #returning distances and indexes of the nearest documents in the cluster (sorted by distance)
            return corpus.iloc[II[0]]
            
        
    else:
        DD, II = index2.search(q, K)
        DD = [1 - x for x in DD] #now DD are not distances, but similarities
        
        if DD[0] < threshold:
            return None
        
        for sim in DD:
            if sim < threshold:
                DD.pop(sim)
        
        if len(DD) < K:
                print('number of documents that satisfy threshold similarity is less than required \(less than K\)')
       
        return corpus.iloc[II[0]]


In [354]:
ir_preclustering_faiss(tf_idf_queries.loc[0])

Unnamed: 0,ID,TEXT
1029,MED-2279,role cyclooxygenas gastric mucos protect pubm ...
2772,MED-4800,detect character clostridium difficil retail c...
2042,MED-3789,cholin intak risk lethal prostat cancer incid ...
1030,MED-2280,cyclooxygenas inhibitori antioxid cyanidin gly...
1028,MED-2278,anti-inflammatori anti-oxid effect cherri freu...


In [189]:
true_relevant_docs(queries_text.loc[0].TEXT)

Unnamed: 0,QUERY_ID,0,DOC_ID,RELEVANCE_LEVEL
0,PLAIN-1,0,MED-2421,2
1,PLAIN-1,0,MED-2422,2
2,PLAIN-1,0,MED-2416,2
3,PLAIN-1,0,MED-2423,2
4,PLAIN-1,0,MED-2417,2
5,PLAIN-1,0,MED-2418,2
6,PLAIN-1,0,MED-4451,2
7,PLAIN-1,0,MED-2420,2
8,PLAIN-1,0,MED-2414,1
9,PLAIN-1,0,MED-4070,1


## K-means clustering

In [342]:
#Set number of clusters at initialisation time
sqrt_n = round(math.sqrt(no_of_docs))

#Run the clustering algorithm
estimator = KMeans(n_clusters = sqrt_n, random_state = 1100)
model = estimator.fit(tf_idf_matrix)
model

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=57, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=1100, tol=0.0001, verbose=0)

In [343]:
#Generate cluster predictions and store in y_hat
y_hat = estimator.predict(tf_idf_matrix) #predicting to which cluster the query belongs
y_hat #array of belongings of docs to cluster

array([45, 45, 56, ...,  3,  3, 45], dtype=int32)

In [344]:
cluster_list_kmeans = []
for i in range(sqrt_n):
    cluster_list_kmeans.append([])

for i in range(no_of_docs):
    for j in range(sqrt_n):
        if y_hat[i] == j:
            cluster_list_kmeans[j].append(i)


In [345]:
cluster_list_kmeans[0:2] #in one of the runs of kmeans not very balanced clusters: only one doc in cluster 0 and 1

[[2728], [3112]]

In [346]:
#Since we have 57 clusters, we are going to compare the query vector with 57 vectors of cluster centroids
#All of cluster centroids are stored in the attribute cluster_centers
centers = np.array(model.cluster_centers_)
centers

array([[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [-5.5879354e-09,  1.9648463e-01,  3.1178948e-01, ...,
        -2.3283064e-10, -2.3283064e-10, -2.3283064e-10],
       ...,
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [ 0.0000000e+00, -5.9604645e-08,  7.2584724e-01, ...,
        -2.3283064e-10, -2.3283064e-10, -2.3283064e-10]], dtype=float32)

In [416]:
#construct function, which uses query q(should be already in the vector form) as input, required similarity of the doc to be retrieved - threshold, and
#necessary number of documents to be retrieved - K (5 most similar docs in the cluster by default)

def ir_preclustering_kmeans(q, threshold = 0, K = 5):
#     q = [q]
    sim_to_centers = [] #array of cosine similarities of q to centers
    retrieved_docs = [] #array of the most similar docs to be returned by the function
    
    for i in range(len(centers)):
        sim_to_centers.append(cosine_similarity(q,centers[i]))
    m = max(sim_to_centers)
    index_of_max = [l for l, b in enumerate(sim_to_centers) if b == m] #odinal number of most similar leader => use this cluster
    
    sim_to_docs = [] #array of cosine similarities of q to all docs in the chosen cluster
    for doc in cluster_list_kmeans[index_of_max[0]]:
        sim_to_docs.append(cosine_similarity(q,tf_idf_matrix.iloc[doc]))
        
    ins = np.argsort(sim_to_docs) #returns the indices that would sort an array of similarities to docs in ascending order
    ins = ins[::-1] #but we need descending (most similar in the beginning of the list)
    
    if threshold == 0: #proceed only with K
        if len(ins)>=K:
            for k in range(K):
                retrieved_docs.append(cluster_list_kmeans[index_of_max[0]][ins[k]])
        else:
            K=len(ins)
            for k in range(K):
                retrieved_docs.append(cluster_list_kmeans[index_of_max[0]][ins[k]])

        
    else:
        if sim_to_docs[ins[0]] < threshold:
            print('no documents satisfy necessary level of threshold similarity')
            return None
        
        for sim in sim_to_docs:
            if sim >= threshold:
                retrieved_docs.append(cluster_list_kmeans[index_of_max[0]][sim_to_docs.index(sim)])
            if len(retrieved_docs) < K:
                print('number of documents that satisfy threshold similarity is less than required \(less than K\)')
    
    return corpus.iloc[retrieved_docs]

In [415]:
ir_preclustering_kmeans(tf_idf_queries.iloc[0])

Unnamed: 0,ID,TEXT
1142,MED-2423,dietari pattern breast cancer risk women pubm ...
3026,MED-5111,dietari factor breast cancer risk case control...
2229,MED-4070,intak fri meat risk cancer follow-up studi fin...
1143,MED-2424,preschool diet adult risk breast cancer pubm n...
1138,MED-2418,consumpt deep-fri food risk prostat cancera b ...


In [417]:
%%time
ir_preclustering_kmeans(tf_idf_queries.iloc[0])

CPU times: user 1.19 s, sys: 14.8 ms, total: 1.2 s
Wall time: 651 ms


Unnamed: 0,ID,TEXT
1142,MED-2423,dietari pattern breast cancer risk women pubm ...
3026,MED-5111,dietari factor breast cancer risk case control...
2229,MED-4070,intak fri meat risk cancer follow-up studi fin...
1143,MED-2424,preschool diet adult risk breast cancer pubm n...
1138,MED-2418,consumpt deep-fri food risk prostat cancera b ...


## Vectorize query 

In [111]:
queries_relevance = pd.read_csv('nfcorpus/dev.2-1-0.qrel', sep='\t', names=['QUERY_ID', '0', 'DOC_ID', 'RELEVANCE_LEVEL'])
queries_relevance.head(10)

Unnamed: 0,QUERY_ID,0,DOC_ID,RELEVANCE_LEVEL
0,PLAIN-1,0,MED-2421,2
1,PLAIN-1,0,MED-2422,2
2,PLAIN-1,0,MED-2416,2
3,PLAIN-1,0,MED-2423,2
4,PLAIN-1,0,MED-2417,2
5,PLAIN-1,0,MED-2418,2
6,PLAIN-1,0,MED-4451,2
7,PLAIN-1,0,MED-2420,2
8,PLAIN-1,0,MED-2414,1
9,PLAIN-1,0,MED-4070,1


In [112]:
queries_text = pd.read_csv('nfcorpus/dev.all.queries', sep='\t', names=['ID', 'TEXT'])
queries_text.head(10)

Unnamed: 0,ID,TEXT
0,PLAIN-1,why deep fried foods may cause cancer in the l...
1,PLAIN-1007,"ddt - - persistent organic pollutants , indust..."
2,PLAIN-101,how to treat multiple sclerosis with diet mult...
3,PLAIN-1017,"detoxification - - cancer , raw food , heart h..."
4,PLAIN-1027,"dietary guidelines - - heart disease , cardiov..."
5,PLAIN-1038,"dogs - - meat , animal products , cats , heart..."
6,PLAIN-1049,"dr. david spence - - heart health , heart dise..."
7,PLAIN-1065,"dr. walter kempner - - mortality , heart disea..."
8,PLAIN-1077,"dulse - - thyroid health , hijiki , sushi , io..."
9,PLAIN-1087,"easter island - - mortality , muscle strength ..."


In [114]:
## preprocessing of the text of the queries

def preprocess_queries(corpus, queries):

    def remove_punctuations(text): # remove punctuation
        for punctuation in string.punctuation:
            text = text.replace(punctuation, '')
        return text

    def remove_numbers(text): # remove numbers
        return re.sub('[0-9]+', '', text)

    def lower_case(text): # lower case
        text = text.lower()
        return text

    def tokenize(text): #tokenize
        return word_tokenize(text)

    stop = set(stopwords.words('english'))
    def stop_words(tokens): # stop words
        filtered_words = []
        for word in tokens:
            if word not in stop:
                filtered_words.append(word)
        return filtered_words

    ps = PorterStemmer()
    def stemming(tokens, ps): # stemming
        return [ps.stem(w) for w in tokens]

    def corpus_vocab(corpus):
        vocab = []
        corpus_tokens = corpus.apply(lambda x: word_tokenize(x['TEXT']), axis=1)
        for i, j in corpus_tokens.iteritems():
            for token in j:
                if token not in vocab:
                    vocab.append(token)
        return vocab

    v = corpus_vocab(corpus)
    def filter_query(tokens):
        t = []
        for token in tokens:
            if token in v:
                t.append(token)
        return t

    # apply functions
    queries['TEXT'] = queries.apply(lambda x: remove_punctuations(x['TEXT']), axis=1)
    queries['TEXT'] = queries.apply(lambda x: remove_numbers(x['TEXT']), axis=1)
    queries['TEXT'] = queries.apply(lambda x: lower_case(x['TEXT']), axis=1)
    queries['TEXT'] = queries.apply(lambda x: tokenize(x['TEXT']), axis=1)
    queries['TEXT'] = queries.apply(lambda x: stop_words(x['TEXT']), axis=1)
    queries['TEXT'] = queries.apply(lambda x: stemming(x['TEXT'],ps), axis=1)
    queries['TEXT'] = queries.apply(lambda x: filter_query(x['TEXT']), axis=1)

    return queries

In [115]:
queries_text = preprocess_queries(corpus, queries_text)
queries_text.head()

Unnamed: 0,ID,TEXT
0,PLAIN-1,"[deep, fri, food, may, caus, cancer, latest, s..."
1,PLAIN-1007,"[ddt, persist, organ, pollut, industri, toxin,..."
2,PLAIN-101,"[treat, multipl, sclerosi, diet, multipl, scle..."
3,PLAIN-1017,"[detoxif, cancer, raw, food, heart, health, he..."
4,PLAIN-1027,"[dietari, guidelin, heart, diseas, cardiovascu..."


In [118]:
queries_text['TEXT'][0]

['deep',
 'fri',
 'food',
 'may',
 'caus',
 'cancer',
 'latest',
 'studi',
 'dietari',
 'pattern',
 'breast',
 'cancer',
 'risk',
 'women',
 'healthier',
 'eat',
 'associ',
 'elimin',
 'odd',
 'breast',
 'cancer',
 'healthi',
 'eat',
 'associ',
 'time',
 'odd',
 'includ',
 'unhealthi',
 'eat',
 'pattern',
 'consumpt',
 'food',
 'previous',
 'link',
 'breast',
 'cancer',
 'pancreat',
 'cancer',
 'lung',
 'cancer',
 'oral',
 'throat',
 'cancer',
 'esophag',
 'cancer',
 'cancer',
 'deep',
 'fri',
 'food',
 'southern',
 'bell',
 'deep',
 'fri',
 'food',
 'tradit',
 'southern',
 'diet',
 'character',
 'high',
 'intak',
 'cook',
 'green',
 'bean',
 'legum',
 'cabbag',
 'sweet',
 'potato',
 'may',
 'reduc',
 'risk',
 'invas',
 'breast',
 'cancer',
 'significantli',
 'consumpt',
 'food',
 'risk',
 'prostat',
 'cancer',
 'research',
 'fred',
 'hutchinson',
 'cancer',
 'research',
 'center',
 'univers',
 'washington',
 'found',
 'eat',
 'french',
 'fri',
 'fri',
 'chicken',
 'fri',
 'fish',
 'do

In [116]:
tf_idf_queries = tf_idf_matrix[0:0]
tf_idf_queries.head()

Unnamed: 0,alkylphenol,human,milk,relat,dietari,habit,central,taiwan,pubm,ncbi,...,six-year,inchianti,tuscani,studies-depress,eurosav,self-inflict,eurostat,suicide-record,scarciti,trim-and-fil


In [119]:
for i in range(len(queries_text)):
    tf_idf_queries = tf_idf_queries.append(pd.Series(0, index=tf_idf_queries.columns), ignore_index=True)
    for token in queries_text['TEXT'][i]:
        for col in tf_idf_queries.columns:
            if token == col:
                tf_idf_queries[col][i] = tf_idf_queries[col][i] + 1

In [120]:
tf_idf_queries.head()
tf_idf_queries = tf_idf_queries.astype('float32')
tf_idf_queries.head()

Unnamed: 0,alkylphenol,human,milk,relat,dietari,habit,central,taiwan,pubm,ncbi,...,six-year,inchianti,tuscani,studies-depress,eurosav,self-inflict,eurostat,suicide-record,scarciti,trim-and-fil
0,0.0,1.0,0.0,0.0,11.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,5.0,2.0,4.0,6.0,0.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
def vectorize_query(textstring):
    if type(textstring) == str:
        tokenized_query = textstring.split()
    else:
        tokenized_query = textstring
        
    df_query = tf_idf_matrix[0:0] #dataframe of tf-idf weights of a query
    df_query = df_query.append(pd.Series(0, index=df_query.columns), ignore_index=True)
    for token in tokenized_query:
        for col in df_query.columns:
            if token == col:
                df_query[col][0] = df_query[col][0] + 1 #raw term frequency
    
    df_query = df_query.replace(0, np.nan)
    
    df_query = np.log(df_query) + 1 #log term freq(as in the slides)
    
    df_query = df_query.fillna(0)
    
    for col in df_query.columns:
        df_query[col][0] = df_query[col][0] * idf[col]
        
    return df_query
    

## Information retrieval

In [122]:
def retrieve_with_preclustering(string_query, k = 5, IDs_of_retrieved_docs = False):
    vector_q = vectorize_query(string_query)
    return ir_preclustering(vector_q.iloc[0], K = k)


In [123]:
retrieve_with_preclustering(queries_text['TEXT'][0], k=10)

Unnamed: 0,ID,TEXT
1142,MED-2423,dietari pattern breast cancer risk women pubm ...
303,MED-1363,toward healthier mediterranean diet pubm ncbi ...
169,MED-1196,dietari pattern depress symptom middl age abst...
755,MED-1955,matern dietari pattern preterm deliveri result...
966,MED-2209,relationship process method glycem indic ten s...
1200,MED-2511,okinawan diet health implic low-calori nutrien...
1419,MED-2852,prospect studi dietari pattern meat intak risk...
2746,MED-4765,dietari predictor num year waist circumfer pub...
959,MED-2202,sweet potato review past present futur role hu...
1140,MED-2421,birth weight head circumfer prenat exposur acr...


## Find all relevant documents for the query

In [124]:
def true_relevant_docs(string_query):
    query_row = (queries_text.loc[queries_text['TEXT'].isin([string_query])])
    query_id = query_row.iloc[0]["ID"]
    relevance_lvl = [1, 2]
    return queries_relevance.loc[queries_relevance['QUERY_ID'].isin([query_id]) & queries_relevance['RELEVANCE_LEVEL'].isin(relevance_lvl)]

In [154]:
true_relevant_docs(queries_text['TEXT'][0])

Unnamed: 0,QUERY_ID,0,DOC_ID,RELEVANCE_LEVEL
0,PLAIN-1,0,MED-2421,2
1,PLAIN-1,0,MED-2422,2
2,PLAIN-1,0,MED-2416,2
3,PLAIN-1,0,MED-2423,2
4,PLAIN-1,0,MED-2417,2
5,PLAIN-1,0,MED-2418,2
6,PLAIN-1,0,MED-4451,2
7,PLAIN-1,0,MED-2420,2
8,PLAIN-1,0,MED-2414,1
9,PLAIN-1,0,MED-4070,1


## Evaluate performance by:
 Precision
 MAP 
 nDCG

In [125]:
def apk(actual, predicted):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), len(predicted))

def mapk(actual, predicted, k=5):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [126]:
def dcg(element_list):
    """
    Discounted Cumulative Gain (DCG)
    Parameters:
        element_list - a list of ranks Ex: [5,4,2,2,1]
    Returns:
        score
    """
    score = 0.0
    for order, rank in enumerate(element_list):
        score += float(rank)/math.log((order+2))
    return score


def ndcg(reference, hypothesis):
    """
    Normalized Discounted Cumulative Gain (nDCG)
    Normalized version of DCG:
        nDCG = DCG(hypothesis)/DCG(reference)
    Parameters:
        reference   - a gold standard (perfect) ordering Ex: [5,4,3,2,1]
        hypothesis  - a proposed ordering Ex: [5,2,2,3,1]
    Returns:
        ndcg_score  - normalized score
    """
    if dcg(reference) == 0:
        return 0 
    else:
        return dcg(hypothesis)/dcg(reference)

In [367]:
def evaluate_retrieve_with_preclustering(position):
    
    ## takes ordinal number of a query as an input 
    ## returns the triple (Precision, Average Precision, Normalized Discounted Cumulative Gain)
    
    retrieved_df = ir_preclustering(tf_idf_queries.iloc[position])
    ids_retrieved = []
    for i in range(len(retrieved_df)):
        ids_retrieved.append(retrieved_df.iloc[i].ID)
    ids_retrieved.sort()
    
    relevant = true_relevant_docs(queries_text['TEXT'][position])
    ids_true_relevant = []
    for i in range(len(relevant)):
        ids_true_relevant.append(relevant.iloc[i].DOC_ID)
    ids_true_relevant.sort()
    
    #count true positives and false positives
    tp = 0
    fp = 0
    for i in ids_retrieved:
        for j in ids_true_relevant:
            if i == j:
                tp += 1 
                break
            else:
                if i < j:
                    fp += 1 
                    break
                else:
                    continue
    if (tp == 0) & (fp == 0):
        precision = 0
    else:
        precision = tp/(tp+fp)
    #cannot calculate recall, since we predefined the number of retrieved documents => apriori algorithm cannot retrieve all documents
    
    #then calculate Average precision across retrieved documents
    ap = apk(ids_true_relevant, ids_retrieved)
    
    #since we have graded relevance annotations, we can also calculate Normalized Discounted Cumulative Gain
    list_of_ranks_of_retrieved_docs = []
    for i in ids_retrieved:
        if i in ids_true_relevant:
            list_of_ranks_of_retrieved_docs.append(relevant.loc[relevant['DOC_ID'].isin([i])].RELEVANCE_LEVEL.iloc[0])
        else:
            list_of_ranks_of_retrieved_docs.append(0)

                                               
    list_of_ranks_of_relevant_docs = []
    for i in ids_true_relevant:
        list_of_ranks_of_relevant_docs.append(relevant.loc[relevant['DOC_ID'].isin([i])].RELEVANCE_LEVEL.iloc[0])
    list_of_ranks_of_relevant_docs.sort(reverse = True)
    
    k=len(list_of_ranks_of_retrieved_docs)
    list_of_ranks_of_relevant_docs = list_of_ranks_of_relevant_docs[:k]
        
    return precision, ap, ndcg(list_of_ranks_of_relevant_docs, list_of_ranks_of_retrieved_docs)       
                
    

In [364]:
def evaluate_preclustering():
    evaluation = queries_text.copy()
    evaluation.insert(2, "Precision", 0)
    evaluation.insert(3, "Average Precision", 0)
    evaluation.insert(4, "nDCG", 0)
    
    for i in range(len(evaluation)):

        p, a, n = evaluate_retrieve_with_preclustering(i,)
        evaluation.loc[i, 'Precision'] = p
        evaluation.loc[i, 'Average Precision'] = a
        evaluation.loc[i, 'nDCG'] = n
    
    print('Average precision across all queries = ' + str(evaluation['Precision'].mean()))
    print('Mean Average Precision = ' + str(evaluation['Average Precision'].mean()))
    print('Average nDCG = ' + str(evaluation['nDCG'].mean()))
    
    return evaluation

In [253]:
leaders, cluster_list = allocate_docs_to_clusters(11, cosine = True, Faiss = False)
evaluate_with_leaders_state_11 = evaluate_preclustering()

Average precision across all queries = 0.18907692307692314
Mean Average Precision = 0.11784957264957267
Average nDCG = 0.15597203856880254


In [254]:
evaluate_with_leaders_state_11.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,"[deep, fri, food, may, caus, cancer, latest, s...",0.2,0.04,0.131205
1,PLAIN-1007,"[ddt, persist, organ, pollut, industri, toxin,...",0.4,0.13,0.277273
2,PLAIN-101,"[treat, multipl, sclerosi, diet, multipl, scle...",0.0,0.0,0.0
3,PLAIN-1017,"[detoxif, cancer, raw, food, heart, health, he...",0.0,0.0,0.0
4,PLAIN-1027,"[dietari, guidelin, heart, diseas, cardiovascu...",0.4,0.2,0.360055
5,PLAIN-1038,"[dog, meat, anim, product, cat, heart, health,...",0.0,0.0,0.0
6,PLAIN-1049,"[dr, heart, health, heart, diseas, egg, choles...",0.2,0.066667,0.181542
7,PLAIN-1065,"[dr, walter, mortal, heart, diseas, heart, hea...",0.0,0.0,0.0
8,PLAIN-1077,"[thyroid, health, hijiki, sushi, iodin, sea, v...",0.75,0.55,0.699215
9,PLAIN-1087,"[easter, island, mortal, muscl, strength, morb...",0.0,0.0,0.0


In [255]:
evaluate_with_leaders_state_11.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.189077,0.11785,0.155972
std,0.286094,0.232718,0.25224
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.25,0.1,0.213986
max,1.0,1.0,1.0


In [256]:
leaders, cluster_list = allocate_docs_to_clusters(110, cosine = True, Faiss = False)
evaluate_with_leaders_state_110 = evaluate_preclustering()

Average precision across all queries = 0.18994871794871807
Mean Average Precision = 0.11945641025641023
Average nDCG = 0.15664189919470173


In [257]:
evaluate_with_leaders_state_110.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,"[deep, fri, food, may, caus, cancer, latest, s...",0.4,0.333333,0.50874
1,PLAIN-1007,"[ddt, persist, organ, pollut, industri, toxin,...",0.8,0.543333,0.66084
2,PLAIN-101,"[treat, multipl, sclerosi, diet, multipl, scle...",0.0,0.0,0.0
3,PLAIN-1017,"[detoxif, cancer, raw, food, heart, health, he...",0.0,0.0,0.0
4,PLAIN-1027,"[dietari, guidelin, heart, diseas, cardiovascu...",0.2,0.05,0.146068
5,PLAIN-1038,"[dog, meat, anim, product, cat, heart, health,...",0.8,0.543333,0.66084
6,PLAIN-1049,"[dr, heart, health, heart, diseas, egg, choles...",0.0,0.0,0.0
7,PLAIN-1065,"[dr, walter, mortal, heart, diseas, heart, hea...",0.0,0.0,0.0
8,PLAIN-1077,"[thyroid, health, hijiki, sushi, iodin, sea, v...",0.0,0.0,0.0
9,PLAIN-1087,"[easter, island, mortal, muscl, strength, morb...",0.2,0.05,0.146068


In [258]:
evaluate_with_leaders_state_110.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.189949,0.119456,0.156642
std,0.288398,0.238078,0.250659
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.25,0.1,0.21837
max,1.0,1.0,1.0


In [369]:
leaders, cluster_list = allocate_docs_to_clusters(1100, cosine = True, Faiss = False)

In [370]:
%%time
evaluate_with_leaders_state_1100 = evaluate_preclustering()

Average precision across all queries = 0.22158974358974376
Mean Average Precision = 0.1494820512820514
Average nDCG = 0.187614632924556
CPU times: user 1min 54s, sys: 1.57 s, total: 1min 56s
Wall time: 1min 6s


In [260]:
evaluate_with_leaders_state_1100.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,"[deep, fri, food, may, caus, cancer, latest, s...",0.8,0.543333,0.41521
1,PLAIN-1007,"[ddt, persist, organ, pollut, industri, toxin,...",0.0,0.0,0.0
2,PLAIN-101,"[treat, multipl, sclerosi, diet, multipl, scle...",0.0,0.0,0.0
3,PLAIN-1017,"[detoxif, cancer, raw, food, heart, health, he...",0.0,0.0,0.0
4,PLAIN-1027,"[dietari, guidelin, heart, diseas, cardiovascu...",0.2,0.04,0.131205
5,PLAIN-1038,"[dog, meat, anim, product, cat, heart, health,...",0.0,0.0,0.0
6,PLAIN-1049,"[dr, heart, health, heart, diseas, egg, choles...",0.0,0.0,0.0
7,PLAIN-1065,"[dr, walter, mortal, heart, diseas, heart, hea...",0.0,0.0,0.0
8,PLAIN-1077,"[thyroid, health, hijiki, sushi, iodin, sea, v...",0.8,0.76,0.853932
9,PLAIN-1087,"[easter, island, mortal, muscl, strength, morb...",0.2,0.066667,0.16958


In [261]:
evaluate_with_leaders_state_1100.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.222359,0.145738,0.187262
std,0.311055,0.264541,0.278859
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.4,0.166667,0.298776
max,1.0,1.0,1.0


## Evaluate FAISS

In [132]:
def retrieve_with_preclustering_faiss(string_query, k = 5, IDs_of_retrieved_docs = False):
    vector_q = vectorize_query(string_query)
    return ir_preclustering_faiss(vector_q.iloc[0].astype('float32'), K = k)

In [377]:
def evaluate_retrieve_with_preclustering_faiss(position):
    ## returns the triple (Precision, Average Precision, Normalized Discounted Cumulative Gain)
    
    retrieved_df = ir_preclustering_faiss(tf_idf_queries.iloc[position].astype('float32'))
    ids_retrieved = []
    
    for i in range(len(retrieved_df)):
        ids_retrieved.append(retrieved_df.iloc[i]['ID'])
    ids_retrieved.sort()
    
    relevant = true_relevant_docs(queries_text['TEXT'][position])
    ids_true_relevant = []
    for i in range(len(relevant)):
        ids_true_relevant.append(relevant.iloc[i].DOC_ID)
    ids_true_relevant.sort()
    
    #count true positives and false positives
    tp = 0
    fp = 0
    for i in ids_retrieved:
        for j in ids_true_relevant:
            if i == j:
                tp += 1 
                break
            else:
                if i < j:
                    fp += 1 
                    break
                else:
                    continue
    if (tp == 0) & (fp == 0):
        precision = 0
    else:
        precision = tp/(tp+fp)
    #cannot calculate recall, since we predefined the number of retrieved documents => apriori algorithm cannot retrieve all documents
    
    #then calculate Average precision across retrieved documents
    ap = apk(ids_true_relevant, ids_retrieved)
    
    #since we have graded relevance annotations, we can also calculate Normalized Discounted Cumulative Gain
    list_of_ranks_of_retrieved_docs = []
    for i in ids_retrieved:
        if i in ids_true_relevant:
            list_of_ranks_of_retrieved_docs.append(relevant.loc[relevant['DOC_ID'].isin([i])].RELEVANCE_LEVEL.iloc[0])
        else:
            list_of_ranks_of_retrieved_docs.append(0)

                                               
    list_of_ranks_of_relevant_docs = []
    for i in ids_true_relevant:
        list_of_ranks_of_relevant_docs.append(relevant.loc[relevant['DOC_ID'].isin([i])].RELEVANCE_LEVEL.iloc[0])
    list_of_ranks_of_relevant_docs.sort(reverse = True)
    
    k=len(list_of_ranks_of_retrieved_docs)
    list_of_ranks_of_relevant_docs = list_of_ranks_of_relevant_docs[:k]
        
    return precision, ap, ndcg(list_of_ranks_of_relevant_docs, list_of_ranks_of_retrieved_docs) 

In [382]:
def evaluate_preclustering_faiss():
    evaluation = queries_text.copy()
    evaluation.insert(2, "Precision", 0)
    evaluation.insert(3, "Average Precision", 0)
    evaluation.insert(4, "nDCG", 0)
    
    for i in range(len(evaluation)):

        p, a, n = evaluate_retrieve_with_preclustering_faiss(i)
        evaluation.loc[i, 'Precision'] = p
        evaluation.loc[i, 'Average Precision'] = a
        evaluation.loc[i, 'nDCG'] = n
    
    print('Average precision across all queries = ' + str(evaluation['Precision'].mean()))
    print('Mean Average Precision = ' + str(evaluation['Average Precision'].mean()))
    print('Average nDCG = ' + str(evaluation['nDCG'].mean()))
    
    return evaluation

In [262]:
leaders, cluster_list = allocate_docs_to_clusters(11, cosine = False, Faiss = True)
index, indices = set_indeces_for_faiss()
evaluate_with_leaders_state_11_faiss = evaluate_preclustering_faiss()

asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number 

In [263]:
evaluate_with_leaders_state_11_faiss.head()

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,"[deep, fri, food, may, caus, cancer, latest, s...",0.0,0.0,0.0
1,PLAIN-1007,"[ddt, persist, organ, pollut, industri, toxin,...",0.0,0.0,0.0
2,PLAIN-101,"[treat, multipl, sclerosi, diet, multipl, scle...",0.0,0.0,0.0
3,PLAIN-1017,"[detoxif, cancer, raw, food, heart, health, he...",0.0,0.0,0.0
4,PLAIN-1027,"[dietari, guidelin, heart, diseas, cardiovascu...",0.0,0.0,0.0


In [264]:
evaluate_with_leaders_state_11_faiss.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.024308,0.018351,0.020855
std,0.124139,0.11591,0.111971
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


In [265]:
leaders, cluster_list = allocate_docs_to_clusters(110, cosine = False, Faiss = True)
index, indices = set_indeces_for_faiss()
evaluate_with_leaders_state_110_faiss = evaluate_preclustering_faiss()

asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number 

In [266]:
evaluate_with_leaders_state_110_faiss.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,"[deep, fri, food, may, caus, cancer, latest, s...",0.0,0.0,0.0
1,PLAIN-1007,"[ddt, persist, organ, pollut, industri, toxin,...",0.0,0.0,0.0
2,PLAIN-101,"[treat, multipl, sclerosi, diet, multipl, scle...",0.0,0.0,0.0
3,PLAIN-1017,"[detoxif, cancer, raw, food, heart, health, he...",0.0,0.0,0.0
4,PLAIN-1027,"[dietari, guidelin, heart, diseas, cardiovascu...",0.0,0.0,0.0
5,PLAIN-1038,"[dog, meat, anim, product, cat, heart, health,...",0.0,0.0,0.0
6,PLAIN-1049,"[dr, heart, health, heart, diseas, egg, choles...",0.0,0.0,0.0
7,PLAIN-1065,"[dr, walter, mortal, heart, diseas, heart, hea...",0.0,0.0,0.0
8,PLAIN-1077,"[thyroid, health, hijiki, sushi, iodin, sea, v...",0.0,0.0,0.0
9,PLAIN-1087,"[easter, island, mortal, muscl, strength, morb...",0.0,0.0,0.0


In [267]:
evaluate_with_leaders_state_110_faiss.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.024205,0.019015,0.019873
std,0.132568,0.118751,0.099579
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,0.88268


In [403]:
leaders, cluster_list = allocate_docs_to_clusters(1100, cosine = False, Faiss = True)
index, indices = set_indeces_for_faiss()

In [404]:
%%time
evaluate_with_leaders_state_1100_faiss = evaluate_preclustering_faiss()

asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number of documents to be retrieved is larger than the number of documents in the cluster; 
all documents in the cluster are retrieved
asked number 

In [269]:
evaluate_with_leaders_state_1100_faiss.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,"[deep, fri, food, may, caus, cancer, latest, s...",0.0,0.0,0.0
1,PLAIN-1007,"[ddt, persist, organ, pollut, industri, toxin,...",0.2,0.05,0.146068
2,PLAIN-101,"[treat, multipl, sclerosi, diet, multipl, scle...",0.0,0.0,0.0
3,PLAIN-1017,"[detoxif, cancer, raw, food, heart, health, he...",0.0,0.0,0.0
4,PLAIN-1027,"[dietari, guidelin, heart, diseas, cardiovascu...",0.0,0.0,0.0
5,PLAIN-1038,"[dog, meat, anim, product, cat, heart, health,...",0.0,0.0,0.0
6,PLAIN-1049,"[dr, heart, health, heart, diseas, egg, choles...",0.0,0.0,0.0
7,PLAIN-1065,"[dr, walter, mortal, heart, diseas, heart, hea...",0.0,0.0,0.0
8,PLAIN-1077,"[thyroid, health, hijiki, sushi, iodin, sea, v...",0.0,0.0,0.0
9,PLAIN-1087,"[easter, island, mortal, muscl, strength, morb...",0.0,0.0,0.0


In [270]:
evaluate_with_leaders_state_1100_faiss.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.033692,0.024544,0.029255
std,0.149522,0.130307,0.129226
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


## Evaluate K-Means

In [281]:
def retrieve_with_preclustering_kmeans(string_query, k = 5, IDs_of_retrieved_docs = False):
    vector_q = vectorize_query(string_query)
    return ir_preclustering_kmeans(vector_q.iloc[0].astype('float32'), K = k)

In [407]:
def evaluate_retrieve_with_preclustering_kmeans(position):
    ## returns the triple (Precision, Average Precision, Normalized Discounted Cumulative Gain)
    
    retrieved_df = ir_preclustering_kmeans(tf_idf_queries.iloc[position].astype('float32'))
    ids_retrieved = []
    for i in range(len(retrieved_df)):
        ids_retrieved.append(retrieved_df.iloc[i].ID)
    ids_retrieved.sort()
    
    relevant = true_relevant_docs(queries_text['TEXT'][position])
    ids_true_relevant = []
    for i in range(len(relevant)):
        ids_true_relevant.append(relevant.iloc[i].DOC_ID)
    ids_true_relevant.sort()
    
    #count true positives and false positives
    tp = 0
    fp = 0
    for i in ids_retrieved:
        for j in ids_true_relevant:
            if i == j:
                tp += 1 
                break
            else:
                if i < j:
                    fp += 1 
                    break
                else:
                    continue
    if (tp == 0) & (fp == 0):
        precision = 0
    else:
        precision = tp/(tp+fp)
    #cannot calculate recall, since we predefined the number of retrieved documents => apriori algorithm cannot retrieve all documents
    
    #then calculate Average precision across retrieved documents
    ap = apk(ids_true_relevant, ids_retrieved)
    
    #since we have graded relevance annotations, we can also calculate Normalized Discounted Cumulative Gain
    list_of_ranks_of_retrieved_docs = []
    for i in ids_retrieved:
        if i in ids_true_relevant:
            list_of_ranks_of_retrieved_docs.append(relevant.loc[relevant['DOC_ID'].isin([i])].RELEVANCE_LEVEL.iloc[0])
        else:
            list_of_ranks_of_retrieved_docs.append(0)

                                               
    list_of_ranks_of_relevant_docs = []
    for i in ids_true_relevant:
        list_of_ranks_of_relevant_docs.append(relevant.loc[relevant['DOC_ID'].isin([i])].RELEVANCE_LEVEL.iloc[0])
    list_of_ranks_of_relevant_docs.sort(reverse = True)
    
    k=len(list_of_ranks_of_retrieved_docs)
    list_of_ranks_of_relevant_docs = list_of_ranks_of_relevant_docs[:k]
        
    return precision, ap, ndcg(list_of_ranks_of_relevant_docs, list_of_ranks_of_retrieved_docs) 

In [409]:
def evaluate_preclustering_kmeans():
    evaluation = queries_text.copy()
    evaluation.insert(2, "Precision", 0)
    evaluation.insert(3, "Average Precision", 0)
    evaluation.insert(4, "nDCG", 0)
    
    for i in range(len(evaluation)):

        p, a, n = evaluate_retrieve_with_preclustering_kmeans(i)
        evaluation.loc[i, 'Precision'] = p
        evaluation.loc[i, 'Average Precision'] = a
        evaluation.loc[i, 'nDCG'] = n
    
    print('Average precision across all queries = ' + str(evaluation['Precision'].mean()))
    print('Mean Average Precision = ' + str(evaluation['Average Precision'].mean()))
    print('Average nDCG = ' + str(evaluation['nDCG'].mean()))
    
    return evaluation

In [330]:
evaluate_kmeans_random_state_11 = evaluate_preclustering_kmeans()

Average precision across all queries = 0.3063076923076928
Mean Average Precision = 0.2033153846153847
Average nDCG = 0.2634461343458222


In [331]:
evaluate_kmeans_random_state_11.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,"[deep, fri, food, may, caus, cancer, latest, s...",0.6,0.286667,0.308217
1,PLAIN-1007,"[ddt, persist, organ, pollut, industri, toxin,...",0.0,0.0,0.0
2,PLAIN-101,"[treat, multipl, sclerosi, diet, multipl, scle...",0.2,0.05,0.146068
3,PLAIN-1017,"[detoxif, cancer, raw, food, heart, health, he...",0.0,0.0,0.0
4,PLAIN-1027,"[dietari, guidelin, heart, diseas, cardiovascu...",0.0,0.0,0.0
5,PLAIN-1038,"[dog, meat, anim, product, cat, heart, health,...",1.0,1.0,1.0
6,PLAIN-1049,"[dr, heart, health, heart, diseas, egg, choles...",0.0,0.0,0.0
7,PLAIN-1065,"[dr, walter, mortal, heart, diseas, heart, hea...",0.0,0.0,0.0
8,PLAIN-1077,"[thyroid, health, hijiki, sushi, iodin, sea, v...",0.25,0.05,0.146068
9,PLAIN-1087,"[easter, island, mortal, muscl, strength, morb...",0.2,0.04,0.131205


In [332]:
evaluate_kmeans_random_state_11.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.306308,0.203315,0.263446
std,0.328387,0.294969,0.303882
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.2,0.05,0.168954
75%,0.5,0.286667,0.395498
max,1.0,1.0,1.0


In [339]:
evaluate_kmeans_random_state_110 = evaluate_preclustering_kmeans()

Average precision across all queries = 0.2912820512820515
Mean Average Precision = 0.18891282051282074
Average nDCG = 0.24501749716945692


In [340]:
evaluate_kmeans_random_state_110.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,"[deep, fri, food, may, caus, cancer, latest, s...",0.4,0.13,0.277273
1,PLAIN-1007,"[ddt, persist, organ, pollut, industri, toxin,...",0.4,0.166667,0.315648
2,PLAIN-101,"[treat, multipl, sclerosi, diet, multipl, scle...",0.2,0.05,0.146068
3,PLAIN-1017,"[detoxif, cancer, raw, food, heart, health, he...",0.0,0.0,0.0
4,PLAIN-1027,"[dietari, guidelin, heart, diseas, cardiovascu...",0.2,0.05,0.146068
5,PLAIN-1038,"[dog, meat, anim, product, cat, heart, health,...",0.25,0.05,0.146068
6,PLAIN-1049,"[dr, heart, health, heart, diseas, egg, choles...",0.0,0.0,0.0
7,PLAIN-1065,"[dr, walter, mortal, heart, diseas, heart, hea...",0.0,0.0,0.0
8,PLAIN-1077,"[thyroid, health, hijiki, sushi, iodin, sea, v...",0.0,0.0,0.0
9,PLAIN-1087,"[easter, island, mortal, muscl, strength, morb...",0.0,0.0,0.0


In [341]:
evaluate_kmeans_random_state_110.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.291282,0.188913,0.245017
std,0.32131,0.279955,0.291543
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.2,0.05,0.146068
75%,0.5,0.25,0.383566
max,1.0,1.0,1.0


In [418]:
%%time
evaluate_kmeans_random_state_1100 = evaluate_preclustering_kmeans()

Average precision across all queries = 0.3293846153846156
Mean Average Precision = 0.23286581196581202
Average nDCG = 0.28418444388847286
CPU times: user 10min 7s, sys: 6.93 s, total: 10min 14s
Wall time: 5min 39s


In [349]:
evaluate_kmeans_random_state_1100.head(10)

Unnamed: 0,ID,TEXT,Precision,Average Precision,nDCG
0,PLAIN-1,"[deep, fri, food, may, caus, cancer, latest, s...",0.6,0.286667,0.308217
1,PLAIN-1007,"[ddt, persist, organ, pollut, industri, toxin,...",0.5,0.2,0.360055
2,PLAIN-101,"[treat, multipl, sclerosi, diet, multipl, scle...",0.4,0.166667,0.242614
3,PLAIN-1017,"[detoxif, cancer, raw, food, heart, health, he...",0.0,0.0,0.0
4,PLAIN-1027,"[dietari, guidelin, heart, diseas, cardiovascu...",0.0,0.0,0.0
5,PLAIN-1038,"[dog, meat, anim, product, cat, heart, health,...",0.25,0.05,0.146068
6,PLAIN-1049,"[dr, heart, health, heart, diseas, egg, choles...",0.0,0.0,0.0
7,PLAIN-1065,"[dr, walter, mortal, heart, diseas, heart, hea...",0.0,0.0,0.0
8,PLAIN-1077,"[thyroid, health, hijiki, sushi, iodin, sea, v...",0.6,0.6,0.722727
9,PLAIN-1087,"[easter, island, mortal, muscl, strength, morb...",0.2,0.05,0.146068


In [350]:
evaluate_kmeans_random_state_1100.describe()

Unnamed: 0,Precision,Average Precision,nDCG
count,325.0,325.0,325.0
mean,0.314,0.217416,0.268998
std,0.344673,0.30816,0.313711
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.2,0.05,0.168128
75%,0.6,0.333333,0.466232
max,1.0,1.0,1.0
