# Getting started

In [1]:
from whoosh.index import create_in
from whoosh.fields import *
import math

In [2]:
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

In [3]:
print(schema)

<Schema: ['content', 'path', 'title']>


Make sure you have the directory "indexdir" created beforehand in the folder where you start the notebook

In [4]:
index = create_in("indexdir", schema)
writer = index.writer()

In [5]:
writer.add_document(title=u"First document", path=u"/a",content=u"This is the first document we've added!")
writer.add_document(title=u"Second document", path=u"/b",content=u"The second one is even more interesting!")
writer.commit()

In [6]:
from whoosh.qparser import QueryParser

the query we will use now is "first"

In [7]:
with index.searcher() as searcher:
    query = QueryParser("content", index.schema).parse("first")
    results = searcher.search(query)
    print(results[0], results[0].score)

<Hit {'path': '/a', 'title': 'First document'}> 1.047619047619048


# Custom ranking functions

In [8]:
from whoosh import scoring

In [9]:
w = scoring.TF_IDF()

In [10]:
with index.searcher(weighting =w) as searcher:
    query = QueryParser("content", index.schema).parse("first")
    results = searcher.search(query)
    print(results[0], results[0].score)

<Hit {'path': '/a', 'title': 'First document'}> 1.0


You can define a custom scoring function too. pos_score_fn computes a score for a given document using only one field. Here the score is based on the first occurence (position) of the query term.

In [11]:
def pos_score_fn(searcher, fieldname, text, matcher):
    poses = matcher.value_as("positions")
    return 1.0 / (poses[0] + 1)

pos_weighting = scoring.FunctionWeighting(pos_score_fn)

In [12]:
with index.searcher(weighting =pos_weighting) as searcher:
    query = QueryParser("content", index.schema).parse("first document")
    results = searcher.search(query)
    print(results[0], results[0].score)

<Hit {'path': '/a', 'title': 'First document'}> 0.45


# Indexing a collection and computing metrics

In [13]:
import csv

In [14]:
def unicode_csv_reader(utf8_data, dialect=csv.excel, **kwargs):
    csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
    for row in csv_reader:
        yield [unicode(cell, 'utf-8') for cell in row]

In [15]:
def read_file(file_path, delimiter='\t'):
    with open(file_path, 'r', encoding='utf8') as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter, quotechar='|', quoting=csv.QUOTE_MINIMAL)
        doc_list = []
        for row in reader:
            doc_list.append((row[0],row[1], row[2].replace('\n',' ')))

    return doc_list

In [16]:
doc_list = read_file("collection.tsv")

In [17]:
print('#docs: ',len(doc_list))

#docs:  4154


Define our own tf-idf-function: (4.a.)

In [18]:
def tf_idf_fn(searcher, fieldname, text, matcher):
    tf = matcher.value_as("frequency")
    idf = searcher.idf(fieldname, text)
    return tf*idf

In [19]:
tf_idf_weighting = scoring.FunctionWeighting(tf_idf_fn)

Now define 'another ranking function that scores each document as the
sum of term frequency multiplied by term position' (4.a)

In [20]:
def tf_tp_fn(searcher, fieldname, text, matcher):
    tf = matcher.value_as("frequency")
    tp = matcher.value_as("positions")[0]
    return tf*tp

In [21]:
tf_tp_weighting = scoring.FunctionWeighting(tf_tp_fn)

In [22]:
schema = Schema(id=ID(stored=True), content=TEXT)
index = create_in("cw_index", schema)
writer = index.writer()

In [23]:
for doc in doc_list:
    writer.add_document(id=doc[0],content=doc[2])
writer.commit()

# Read QRels

In [24]:
def read_qrels(file_path, delimiter=' '):
    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=' ')
        qrels = {}
        for row in reader:
            qrels[(row[0].replace("_", " "),row[1])] = int(row[2])

    return qrels

In [25]:
qrels_hash_map = read_qrels("q5.web.qrels.txt")

In [26]:
def precision(doc_list, query, qrels, k=10):
    f = lambda x: qrels[(x,query)] if (x,query) in qrels else 0
    vals = list(map(lambda q: 1 if q>0 else 0, map(f, doc_list[:k])))
    return sum(vals)/k

In [27]:
def avg_relevance(doc_list, query, qrels):
    f = lambda x: qrels[(x,query)] if (x,query) in qrels else 0
    vals = list(map(f, doc_list))
    print(vals)
    return sum(vals)/k

In [28]:
def precision_at_10(doc_list, query, qrels):
    return precision(doc_list, query, qrels, k=10)

In [29]:
# calculate total number of relevant docs for a given query
def total_relevant(query, qrels):
    return sum(1 if key[0]==query and value>0 else 0 for key,value in qrels_hash_map.items())

In [30]:
def recall(doc_list, query, qrels):
    f = lambda x: qrels[(x,query)] if (x,query) in qrels else 0
    vals = list(map(lambda q: 1 if q>0 else 0, map(f, doc_list)))
    return sum(vals)/total_relevant(query, qrels)

In [31]:
def ndcg_at_10(doc_list, query, qrels):
    total_eval = 0
    for i in range(10):
        gain = qrels[(doc_list[i], query)] if (doc_list[i], query) in qrels else 0
        if i>0:
            total_eval += gain * 1.0/math.log2(i+1)
        else:
            total_eval += gain
            
    return total_eval

In [32]:
def map_at_10(doc_list, query, qrels):
    total_eval = 0
    relevant_count = 0
    for i in range(10):
        if (qrels[(doc_list[i], query)] if (doc_list[i], query) in qrels else 0) > 0:
            relevant_count += 1
            total_eval += precision(doc_list, query, qrels, k=i+1)
    
    if relevant_count > 0:
        return total_eval/relevant_count
    else:
        return 0.0

In [33]:
def apply_evaluation(doc_list, query, qrels):
    print ("precision@10: ", precision_at_10(doc_list, query, qrels))
    print ("recall: ", recall(doc_list, query, qrels))
    print ("NDCG@10: ", ndcg_at_10(doc_list, query, qrels))
    print ("MAP@10: ", map_at_10(doc_list, query, qrels))

In [41]:
print("tf-idf weighting:")
result_list = []
with index.searcher(weighting = tf_idf_weighting) as searcher:
    query = QueryParser("content", index.schema).parse(query_strings[3])
    results = searcher.search(query, limit=None)
    print("Results found:", len(results))
    for result in results:
        result_list.append(result['id'])
        


tf-idf weighting:
Results found: 535


In [35]:
query_strings = ["obama family tree", "french lick resort and casino","getting organized","toilet","mitchell college"]

In [40]:
for query_str in query_strings:
    print("\n\n#######################################")
    print("query: ", query_str)
    print("")
    
    print("tf-idf weighting:")
    result_list = []
    with index.searcher(weighting = tf_idf_weighting) as searcher:
        query = QueryParser("content", index.schema).parse(query_str)
        results = searcher.search(query, limit=None)
        print("Results found:", len(results))
        for result in results:
            result_list.append(result['id'])
        
        apply_evaluation(result_list, query_str, qrels_hash_map)
        
    print("")
    print("tf-tp weighting:")
    result_list = []
    with index.searcher(weighting =tf_tp_weighting) as searcher:
        query = QueryParser("content", index.schema).parse(query_str)
        results = searcher.search(query, limit=None)
        print("Results found:", len(results))
        for result in results:
            result_list.append(result['id'])
            
        apply_evaluation(result_list, query_str, qrels_hash_map)



#######################################
query:  obama family tree

tf-idf weighting:
Results found: 85
precision@10:  0.0
recall:  0.0
NDCG@10:  0.0
MAP@10:  0.0

tf-tp weighting:
Results found: 85
precision@10:  0.0
recall:  0.0
NDCG@10:  0.0
MAP@10:  0.0


#######################################
query:  french lick resort and casino

tf-idf weighting:
Results found: 83
precision@10:  0.0
recall:  0.0
NDCG@10:  0.0
MAP@10:  0.0

tf-tp weighting:
Results found: 83
precision@10:  0.0
recall:  0.0
NDCG@10:  0.0
MAP@10:  0.0


#######################################
query:  getting organized

tf-idf weighting:
Results found: 468
precision@10:  0.0
recall:  0.0
NDCG@10:  0.0
MAP@10:  0.0

tf-tp weighting:
Results found: 468
precision@10:  0.0
recall:  0.0
NDCG@10:  0.0
MAP@10:  0.0


#######################################
query:  toilet

tf-idf weighting:
Results found: 535
precision@10:  0.0
recall:  0.0
NDCG@10:  0.0
MAP@10:  0.0

tf-tp weighting:
Results found: 535
precision@10:  0.0