In [32]:
# imports

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import scipy.sparse
import pandas as pd
import csv

In [2]:
# load files

# Read the 1400 document files ./d/1.txt ... ./d/1400.txt into a list, in file-number
# order, so that documents[k] holds the text of file number k+1.
documents = []
for d in range(1400):
    # `with` closes each handle right away instead of leaving 1400 files open.
    with open("./d/"+str(d+1)+".txt") as f:
        documents.append(f.read())

def get_queries(count=225, directory="./q"):
    """Read the query files `<directory>/1.txt` ... `<directory>/<count>.txt`.

    Parameters
    ----------
    count : int, optional
        Number of query files to read, numbered from 1. Defaults to 225.
    directory : str, optional
        Folder that holds the query files. Defaults to "./q".

    Returns
    -------
    list of str
        The full text of each file, in file-number order (index k holds file k+1).

    Raises
    ------
    OSError
        If one of the expected files cannot be opened.
    """
    queries = []
    for q in range(count):
        # `with` guarantees the handle is closed even if read() fails.
        with open("{}/{}.txt".format(directory, q + 1)) as f:
            queries.append(f.read())
    return queries
    
def get_relevant(query, directory="./r"):
    """Read the integer ids listed, one per line, in `<directory>/<query>.txt`.

    Parameters
    ----------
    query : int or str
        Number of the query; used verbatim as the file name stem.
    directory : str, optional
        Folder that holds the relevance files. Defaults to "./r".

    Returns
    -------
    list of int
        The ids in file order. Blank lines are ignored.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a non-blank line is not an integer.
    """
    with open("{}/{}.txt".format(directory, query)) as f:
        # Skip whitespace-only lines (e.g. a trailing empty line), which int() rejects.
        return [int(line) for line in f if line.strip()]

In [3]:
# euclidean and cosine distances

def euclid(data, length):
    """Rank rows 0..length-1 of `data` by Euclidean distance to row `length`.

    Returns an array of 1-based row numbers, closest row first.
    """
    target_row = data[length]
    candidate_rows = data[:length]
    # euclidean_distances returns a 1 x length table; take its only row.
    distances = np.array(euclidean_distances(target_row, candidate_rows)[0])
    # argsort is 0-based; shift by one to get 1-based row numbers.
    return np.argsort(distances) + 1

def cosin(data, length):
    """Rank rows 0..length-1 of `data` by cosine similarity to row `length`.

    Returns an array of 1-based row numbers, most similar row first.
    """
    target_row = data[length]
    candidate_rows = data[:length]
    # cosine_similarity returns a 1 x length table; take its only row.
    similarities = np.array(cosine_similarity(target_row, candidate_rows)[0])
    ascending = np.argsort(similarities)
    # Reverse so the highest similarity comes first, then shift to 1-based numbers.
    return ascending[::-1] + 1

In [4]:
# term-weighting schemes: binary, term frequency, TF-IDF

def bin_weight(query):
    """Rank the documents against `query` using binary (present/absent) term weights.

    The query is appended to the global `documents` list as the last text, all texts
    are vectorised together, and the documents are ranked against that last row.

    Returns a tuple `(euclidean_ranking, cosine_ranking)` of 1-based document numbers.
    """
    corpus = documents + [query]
    query_row = len(corpus) - 1

    matrix = CountVectorizer(binary=True).fit_transform(corpus)

    by_distance = euclid(matrix, query_row)
    by_similarity = cosin(matrix, query_row)
    return by_distance, by_similarity

def tf_weight(query):
    """Rank the documents against `query` using length-normalised term frequencies.

    The query is appended to the global `documents` list as the last text, raw term
    counts are computed for all texts, and each row is divided by its total count.
    The documents are then ranked against the query row.

    Returns a tuple `(euclidean_ranking, cosine_ranking)` of 1-based document numbers.
    """
    data = documents.copy()
    data.append(query)

    tf_vectorizer = CountVectorizer()
    tf_matrix = tf_vectorizer.fit_transform(data)

    # Total count per text as an (n, 1) float column so it broadcasts across each row.
    # (Named row_totals rather than `sum` to avoid shadowing the builtin.)
    row_totals = np.asarray(tf_matrix.sum(axis=1), dtype=float)
    # A text with no counted tokens has total 0; divide by 1 instead so the row simply
    # stays all-zero and no divide-by-zero warning / inf factor is produced.
    row_totals[row_totals == 0] = 1.0
    # CSR because the ranking helpers slice rows out of the matrix.
    new_tf_matrix = scipy.sparse.csr_matrix(tf_matrix.multiply(1.0 / row_totals))

    return euclid(new_tf_matrix, len(data)-1), cosin(new_tf_matrix, len(data)-1)

def tfidf_weight(query):
    """Rank the documents against `query` using TF-IDF term weights.

    The query is appended to the global `documents` list as the last text, all texts
    are vectorised together, and the documents are ranked against that last row.

    Returns a tuple `(euclidean_ranking, cosine_ranking)` of 1-based document numbers.
    """
    corpus = documents + [query]
    query_row = len(corpus) - 1

    matrix = TfidfVectorizer().fit_transform(corpus)

    by_distance = euclid(matrix, query_row)
    by_similarity = cosin(matrix, query_row)
    return by_distance, by_similarity


In [5]:
# evaluation metrics: precision, recall, F-measure

def precision(my_pick, right):
    """Fraction of the picked items that are also in `right`.

    Parameters
    ----------
    my_pick : sequence
        Retrieved items; every occurrence is counted.
    right : iterable
        Items considered correct (must be hashable).

    Returns
    -------
    float
        hits / len(my_pick), or 0.0 when `my_pick` is empty.
    """
    if len(my_pick) == 0:
        # Nothing was picked: report 0.0 instead of dividing by zero.
        return 0.0
    relevant = set(right)  # constant-time membership tests
    # A list length is used rather than the builtin sum(): a later cell of this
    # notebook rebinds the global name `sum` to a list.
    hits = len([pick for pick in my_pick if pick in relevant])
    return hits / len(my_pick)

def recall(my_pick, right):
    """Number of picked items found in `right`, divided by the size of `right`.

    Parameters
    ----------
    my_pick : sequence
        Retrieved items; every occurrence is counted.
    right : sequence
        Items considered correct (must be hashable).

    Returns
    -------
    float
        hits / len(right), or 0.0 when `right` is empty.
    """
    if len(right) == 0:
        # No correct items exist: report 0.0 instead of dividing by zero.
        return 0.0
    relevant = set(right)  # constant-time membership tests
    # A list length is used rather than the builtin sum(): a later cell of this
    # notebook rebinds the global name `sum` to a list.
    hits = len([pick for pick in my_pick if pick in relevant])
    return hits / len(right)

def fmeas(my_pick, right):
    """F-measure of `my_pick` against `right`: twice p*r / (p + r).

    `p` and `r` come from `precision` and `recall`; when both are zero the result
    is 0 rather than a division by zero.
    """
    p = precision(my_pick, right)
    r = recall(my_pick, right)
    if not (p != 0 or r != 0):
        return 0
    ratio = (p * r) / (p + r)
    return 2 * ratio

In [41]:
# Evaluate every query under the three weightings and both rankings, write one CSV row
# of 18 metrics per query to output.csv, and keep the same rows in `results`.
queries = get_queries()
results = []

# Only the N highest-ranked documents of each ranking are evaluated.
N = 20

# Column order: for each ranking (Euclidean, then Cosine) and each weighting
# (binary, term frequency, TF-IDF): precision, recall, F-measure.
head = ['Euclidean binary precision',
'Euclidean binary recall',
'Euclidean binary F-measure',
'Euclidean Term Frequency precision',
'Euclidean Term Frequency recall',
'Euclidean Term Frequency F-measure',
'Euclidean TF-IDF precision',
'Euclidean TF-IDF recall',
'Euclidean TF-IDF F-measure',
'Cosine binary precision',
'Cosine binary recall',
'Cosine binary F-measure',
'Cosine Term Frequency precision',
'Cosine Term Frequency recall',
'Cosine Term Frequency F-measure',
'Cosine TF-IDF precision',
'Cosine TF-IDF recall',
'Cosine TF-IDF F-measure']

# newline='' lets the csv writer control line endings itself. The writer also ends
# each row, so no explicit '\n' field is appended (that produced a spurious extra
# column and put a string into every row of `results`).
with open('output.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(head)

    for i, q in enumerate(queries, 1):
        ebin_res, cbin_res = bin_weight(q)
        etf_res, ctf_res = tf_weight(q)
        etfidf_res, ctfidf_res = tfidf_weight(q)

        relevant = get_relevant(i)

        # Same order as `head`: the three Euclidean rankings, then the three cosine ones.
        result = []
        for ranking in (ebin_res, etf_res, etfidf_res, cbin_res, ctf_res, ctfidf_res):
            top = ranking[:N]
            result += [precision(top, relevant),
                       recall(top, relevant),
                       fmeas(top, relevant)]

        writer.writerow(result)
        # Flush per query so partial results are on disk while the long loop runs.
        csvFile.flush()

        results.append(result)
    
 



In [23]:
# Column-wise totals of the 18 metric columns over all queries.
# NOTE: this rebinds the builtin name `sum` for the rest of the notebook; the name is
# kept because the averages cell below reads the totals as `sum[...]`.
sum = [0] * 18
for res in results:
    # Walk only the 18 metric columns: a row may carry extra trailing entries (the
    # evaluation cell appends '\n'), which previously caused an IndexError here.
    for i in range(len(sum)):
        sum[i] += res[i]

In [31]:
# Print the per-query average of each metric column: label on one line, value on the next.
print('Average')
labels = ('Euclidean binary precision',
          'Euclidean binary recall',
          'Euclidean binary F-measure',
          'Euclidean Term Frequency precision',
          'Euclidean Term Frequency recall',
          'Euclidean Term Frequency F-measure',
          'Euclidean TF-IDF precision',
          'Euclidean TF-IDF recall',
          'Euclidean TF-IDF F-measure',
          'Cosine binary precision',
          'Cosine binary recall',
          'Cosine binary F-measure',
          'Cosine Term Frequency precision',
          'Cosine Term Frequency recall',
          'Cosine Term Frequency F-measure',
          'Cosine TF-IDF precision',
          'Cosine TF-IDF recall',
          'Cosine TF-IDF F-measure')
for idx, label in enumerate(labels):
    print(label)
    # `sum` is the list of column totals built in the previous cell, not the builtin.
    print(sum[idx] / len(queries))

Average
Euclidean binary precision
0.012888888888888882
Euclidean binary recall
0.03553423630435125
Euclidean binary F-measure
0.01781858858312141
Euclidean Term Frequency precision
0.07511111111111118
Euclidean Term Frequency recall
0.1996613270347871
Euclidean Term Frequency F-measure
0.1025703211020432
Euclidean TF-IDF precision
0.1535555555555556
Euclidean TF-IDF recall
0.42292367119323115
Euclidean TF-IDF F-measure
0.21211325919259827
Cosine binary precision
0.09955555555555567
Cosine binary recall
0.28692851358442995
Cosine binary F-measure
0.13944457562772075
Cosine Term Frequency precision
0.08911111111111118
Cosine Term Frequency recall
0.2446701014268041
Cosine Term Frequency F-measure
0.12233182853841017
Cosine TF-IDF precision
0.15977777777777788
Cosine TF-IDF recall
0.44331181646643375
Cosine TF-IDF F-measure
0.22104669332118243
