In [1]:
!pip install whoosh
!pip install pytrec_eval
!pip install wget
!pip install sentence_transformers
!pip install chromadb

Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.8/468.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: whoosh
Successfully installed whoosh-2.7.4
Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=308203 sha256=14d523be761ee674e818a229a586d13ed8493a903116663fe5c8cfa8ac17cbac
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5
Collecting wget
  Downloading wg

In [2]:
import os

import wget

DATASET_URL = "https://github.com/MIE451-2024/course-datasets/raw/main/product_search.zip"
DATASET_ZIP = "product_search.zip"

# Only fetch the archive when it is not already on disk, so that re-running
# the notebook does not download the dataset again.
if not os.path.exists(DATASET_ZIP):
    wget.download(DATASET_URL, DATASET_ZIP)

'product_search.zip'

In [3]:
!unzip product_search.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: product_search/documents/172338  
  inflating: product_search/documents/1032505  
  inflating: product_search/documents/1498799  
  inflating: product_search/documents/1177621  
  inflating: product_search/documents/514580  
  inflating: product_search/documents/772851  
  inflating: product_search/documents/1654026  
  inflating: product_search/documents/1105665  
  inflating: product_search/documents/939032  
  inflating: product_search/documents/1021744  
  inflating: product_search/documents/1133909  
  inflating: product_search/documents/1142390  
  inflating: product_search/documents/188428  
  inflating: product_search/documents/27993  
  inflating: product_search/documents/865667  
  inflating: product_search/documents/787967  
  inflating: product_search/documents/267299  
  inflating: product_search/documents/1584876  
  inflating: product_search/documents/1261105  
  inflating: product_search/docum

In [4]:
# imports
from whoosh import index, writing
from whoosh.searching import Results
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
from whoosh import qparser, query
from whoosh import scoring
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import torch
import abc
from abc import abstractmethod
from whoosh.analysis import Filter
import numpy as np
import chromadb
import json
from sentence_transformers import SentenceTransformer
import nltk
from nltk.stem import *
nltk.download("wordnet")

# Put all your imports here

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
class IRSystem(metaclass=abc.ABCMeta):
    """
    Abstract class which is inherited by other IR system

    DO NOT ADD OR MODIFY THIS CLASS

    Subclasses implement create_index, add_files, create_parser_searcher and
    perform_search; this base class supplies the shared reporting and
    evaluation helpers (print_rel_name, py_trec_eval).
    """

    def __init__(self, data_dir):
        """
        Resolve the dataset paths under data_dir, load the precomputed document
        embeddings, then build the index and the parser/searcher.

        INPUT:
            data_dir: directory containing product.topics, product.qrels,
                      corpus_embeddings.json and a documents/ sub-directory
        """
        # DON'T change the following names,topic_file, qrels_file, document_dir, file_list, text_embedding
        self.topic_file = os.path.join(data_dir, "product.topics")
        self.qrels_file = os.path.join(data_dir, "product.qrels")
        self.document_dir = os.path.join(data_dir, "documents")
        self.file_list = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]
        # this is the text embedding of the documents
        # (opened with a context manager so the handle is closed right after loading)
        with open(os.path.join(data_dir, "corpus_embeddings.json"), "r") as embedding_file:
            self.text_embedding = json.load(embedding_file)

        self.create_index()
        self.create_parser_searcher()

    @abstractmethod
    def create_index(self):
        pass

    @abstractmethod
    def add_files(self):
        pass

    @abstractmethod
    def create_parser_searcher(self):
        pass

    @abstractmethod
    def perform_search(self, topic_phrase):
        pass

    @staticmethod
    def post_process_score(score):
        """Hook applied to every score printed by print_rel_name; identity by default."""
        return score

    @staticmethod
    def print_trec_eval_result(results):
        """
        Print every per-query measure followed by the aggregate ('all') rows.

        INPUT:
            results: mapping query_id -> {measure_name: value}, as returned by
                     the pytrec_eval evaluator in py_trec_eval
        """

        if not results:
            print('empty results')
            return

        def print_line(name, scope, num):
            print('{:25s}{:8s}{:.4f}'.format(name, scope, num))

        for query_id, query_measures in results.items():
            for measure, value in query_measures.items():
                if measure == "runid":
                    continue
                print_line(measure, query_id, value)

        # `query_measures` still refers to the last query of the loop above;
        # its keys supply the measure names for the aggregate rows.
        for measure in query_measures.keys():
            if measure == "runid":
                continue
            print_line(
                measure,
                'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [per_query[measure]
                     for per_query in results.values()]))

    def print_rel_name(self, q_id):
        """
        Print the retrieved documents and the relevant documents (rel == "1")
        for the topic whose id equals q_id.

        INPUT:
            q_id: topic id as a string, compared against the first
                  space-separated field of each topic / qrels line
        """
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()
        for topic in topics:
            if not topic.strip():
                # tolerate blank lines in the topic file instead of failing on unpacking
                continue
            topic_id, topic_phrase = tuple(topic.split(" ", 1))
            if topic_id == q_id:
                print("---------------------------Topic_id and Topic_phrase----------------------------------")
                print(topic_id, topic_phrase)
                # get search result
                topicResults = self.perform_search(topic_phrase)
                print("---------------------------Return documents----------------------------------")
                if isinstance(topicResults, dict):
                    # format the result for chroma search
                    for (docnum, result) in enumerate(topicResults['ids'][0]):
                        score = topicResults['distances'][0][docnum]
                        score = self.post_process_score(score)
                        print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result), docnum, score))
                else:
                    # format the result for whoosh search
                    for (docnum, result) in enumerate(topicResults):
                        score = topicResults.score(docnum)
                        score = self.post_process_score(score)
                        print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
                print("---------------------------Relevant documents----------------------------------")
                with open(self.qrels_file, 'r') as f_qrel:
                    for line in f_qrel:
                        fields = line.split()
                        if len(fields) != 4:
                            # skip blank or malformed qrels lines
                            continue
                        qid, _, doc, rel = fields
                        if qid == q_id and rel == "1":
                            print(line.rstrip())

    def py_trec_eval(self):
        """
        Run every topic through perform_search, write the rankings to a
        temporary run file in TREC_EVAL format, evaluate them against the
        qrels with pytrec_eval and print all supported measures.
        """

        self.create_parser_searcher()
        # Load topic file - a list of topics(search phrases) used for evalutation
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()

        # create an output file to which we'll write our results.
        # mkstemp returns an already-open OS-level descriptor; wrap it so it is
        # closed, and delete the file once the run has been parsed.
        fd, temp_output_file = tempfile.mkstemp()
        try:
            with os.fdopen(fd, "w") as outputTRECFile:
                # for each evaluated topic:
                # build a query and record the results in the file in TREC_EVAL format
                for topic in topics:
                    if not topic.strip():
                        # tolerate blank lines in the topic file
                        continue
                    topic_id, topic_phrase = tuple(topic.split(" ", 1))
                    # get search result
                    topicResults = self.perform_search(topic_phrase)
                    if isinstance(topicResults, dict):
                        # format the result for chroma search
                        for (docnum, result) in enumerate(topicResults['ids'][0]):
                            # distances are negated, presumably so that smaller distances rank higher
                            score = -topicResults['distances'][0][docnum]
                            outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id,
                            os.path.basename(result), docnum, score))
                    else:
                        # format the result for whoosh search
                        for (docnum, result) in enumerate(topicResults):
                            score = topicResults.score(docnum)
                            outputTRECFile.write(
                                "%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

            with open(self.qrels_file, 'r') as f_qrel:
                qrel = pytrec_eval.parse_qrel(f_qrel)

            with open(temp_output_file, 'r') as f_run:
                run = pytrec_eval.parse_run(f_run)
        finally:
            os.remove(temp_output_file)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)

        results = evaluator.evaluate(run)

        self.print_trec_eval_result(results)

In [6]:
# Dont change this! Use it as-is in your code
# This filter will run for both the index and the query
class CustomFilter(Filter):
    """
    Whoosh filter that rewrites each token's text with a user-supplied function.

    INPUT (constructor):
        filterFunc: callable invoked as filterFunc(token_text, *args, **kwargs);
                    its return value replaces the token text
        *args, **kwargs: extra arguments forwarded to filterFunc on every call
    """
    # NOTE(review): presumably marks this filter as morphological for Whoosh — confirm against the Filter docs
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        # Two CustomFilters compare equal when they are of the same class.
        # (`other` was missing from the signature, so any `==` raised TypeError.)
        return (other is not None
                and self.__class__ is other.__class__)

    def __call__(self, tokens):
        for t in tokens:
            # The same transformation is applied whether the token comes from
            # the query parser (mode 'query') or from the indexer (mode 'index').
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

# Dont change this! Use it as-is in your code if you rerank your results using a non-Whoosh scoring function
# (please refer to the optional section of lab material)
class NeuralResults():
    '''
    Container for reranked documents that mimics the parts of
    whoosh.searching.Results used by pyTrecEval: iteration over hits that
    expose 'file_path', and score lookup by position.
    '''
    def __init__(self, booleansearchdocs, scores, rankings, file_list):
        # `rankings` holds positions into `booleansearchdocs` / `scores`, in
        # final rank order. An empty shape tuple (0-d array) yields no results.
        if rankings.shape:
            self.results = [
                {'file_path': file_list[booleansearchdocs[rank]], 'score': scores[rank]}
                for rank in rankings
            ]
        else:
            self.results = []

    def score(self, docnum):
        # score of the hit at rank position `docnum`
        hit = self.results[docnum]
        return hit['score']

    def __iter__(self):
        return iter(self.results)

## Question 5

**1. The auto-grader will extract and use the following variables, DON'T change their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined functions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

**6. If you are doing neural IR you should precompute your corpus embeddings and save them in the corpus_embeddings.json file. If you do this, please keep the code used to generate the embeddings somewhere in this notebook**

In [7]:
class IRQ5(IRSystem):
    """
    Whoosh-based IR system for Question 5: stemmed, lowercased, stop-word
    filtered index over the document contents, OR-grouped query parsing and
    BM25F scoring.
    """

    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.index_sys which should have either type
            whoosh.index.FileIndex or chromadb.collection.Collection
            (if you are using the chroma db, please name your collection as `ir_assignment_q5_[UTORid]`)
        """
        # a fresh temporary directory holds the on-disk index
        indexDir = tempfile.mkdtemp()

        # NOTE(review): StemmingAnalyzer presumably already lowercases and removes
        # stop words before stemming; the two trailing filters then run a second
        # time on the stemmed tokens. Kept as-is so retrieval results do not change.
        mySchema = Schema(file_path = ID(stored=True),
                          file_content = TEXT(analyzer = StemmingAnalyzer() | LowercaseFilter() | StopFilter()))

        # DON't change the name of 'index_sys'
        self.index_sys = index.create_in(indexDir, mySchema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Add buffer to self.index_sys for whoosh
            or add embeddings to self.index_sys for chroma db

        After indexing, the parser/searcher are rebuilt so that
        perform_search sees the newly added documents.
        """
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)
        filesToIndex = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

        try:
            # write each file to index
            for docNum, filePath in enumerate(filesToIndex):
                with open(filePath, "r", encoding="utf-8") as f:
                    fileContent = f.read()
                writer.add_document(file_path = filePath,
                                    file_content = fileContent)

                # print status every 1000 documents
                if (docNum+1) % 1000 == 0:
                    print("already indexed:", docNum+1)
            print("done indexing.")

        finally:
            # close the index
            writer.close()

        # The searcher created in __init__ was opened on the still-empty index;
        # reopen it so searches issued right after add_files() see the documents.
        self.create_parser_searcher()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Please update self.query_parser and self.searcher which should have
        1) type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        2) type sentence_transformers.SentenceTransformer and chromadb.collection.Collection respectively
        """
        # DON't change the names of 'query_parser' and 'searcher'
        # OrGroup: a document matches when it contains ANY of the query terms
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema, group=qparser.OrGroup)
        self.searcher = self.index_sys.searcher(weighting=scoring.BM25F())

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults:
            whoosh.searching.Results (whoosh)
            OR NeuralResults (whoosh with embedding rerank)
            OR dict (chroma db)

        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        # lowercase the raw phrase before it is parsed
        topic_phrase = topic_phrase.lower()
        sampleQuery = self.query_parser.parse(topic_phrase)
        # limit=None returns every matching document rather than a top-N cut
        topicResults = self.searcher.search(sampleQuery, limit=None)
        return topicResults

In [8]:
# Build the Q5 system on the unzipped dataset (creates the index and the parser/searcher)
q5 = IRQ5("product_search")
# Index every document under product_search/documents
q5.add_files()
# Evaluate all topics; py_trec_eval() rebuilds the parser/searcher itself before searching
q5.py_trec_eval()

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
already indexed: 5000
already indexed: 6000
done indexing.
num_q                    100246  1.0000
num_ret                  100246  658.0000
num_rel                  100246  102.0000
num_rel_ret              100246  102.0000
map                      100246  0.9751
gm_map                   100246  -0.0253
Rprec                    100246  0.9118
bpref                    100246  0.9767
recip_rank               100246  1.0000
iprec_at_recall_0.00     100246  1.0000
iprec_at_recall_0.10     100246  1.0000
iprec_at_recall_0.20     100246  1.0000
iprec_at_recall_0.30     100246  1.0000
iprec_at_recall_0.40     100246  1.0000
iprec_at_recall_0.50     100246  0.9870
iprec_at_recall_0.60     100246  0.9870
iprec_at_recall_0.70     100246  0.9870
iprec_at_recall_0.80     100246  0.9651
iprec_at_recall_0.90     100246  0.9388
iprec_at_recall_1.00     100246  0.8031
P_5                      100246  1.0000
P_10  

### Please answer the following questions here
(a) A clear list of all final modifications made.  
- Scoring Function:
  - Implemented the BM25F scoring function for improving retrieval effectiveness in information retrieval systems
- Query Parsing:
  - Used qparser.OrGroup in the query parser to expand the search to include more relevant documents by allowing the use of an OR group for query terms.

(b)  Why each modification was made – how did it help?  
- BM25F scoring function:
  - This didn't change the average performance in this case, because BM25F is already Whoosh's default weighting. However, setting it explicitly could be helpful for other datasets since BM25F is a variant of BM25 that allows different weights to be assigned to different fields of the document (like title and content). It’s highly effective for handling short and long documents and generally provides better precision and recall.
- Query Parsing:
  - This significantly increased the average MAP performance.
  - This is because the qparser.OrGroup modification allows the retrieval system to return documents that match any of the query terms (as opposed to all query terms, which would be the case with an AND group). This increases recall, especially for queries where not all terms are crucial. By including more relevant documents, this also helps to increase the MAP score.

(c)  The  final  MAP  performance  that  these  modifications  attained.
- 0.7733

### Q5 Validation

In [9]:
from whoosh.index import FileIndex
from whoosh.qparser import QueryParser
from whoosh.searching import Searcher
import os.path

In [10]:
# Type validation for the attributes the auto-grader extracts.
# A fresh IRQ5 is enough here: the checks only look at attribute types, not at index contents.
q5 = IRQ5("product_search")
# Each check accepts either the Whoosh type or the Chroma / sentence-transformers alternative.
# `or` short-circuits, so the second isinstance is only evaluated when the Whoosh check fails.
# NOTE(review): confirm `chromadb.collection.Collection` resolves in the installed chromadb version.
assert(isinstance(q5.index_sys, FileIndex) or isinstance(q5.index_sys, chromadb.collection.Collection)), "Index Type"
assert(isinstance(q5.query_parser, QueryParser) or isinstance(q5.query_parser, SentenceTransformer)), "Query Parser Type"
assert(isinstance(q5.searcher, Searcher) or isinstance(q5.searcher, chromadb.collection.Collection)), "Searcher Type"
print("Q5 Types Validated")

Q5 Types Validated
