# Assignment 2: IR

## Preparations
* Put all your imports, and path constants in the next cells

In [1]:
# Install the third-party packages used by this notebook (shell commands run via `!`).
# NOTE(review): the versions are not pinned, so a fresh run may resolve different
# releases than the ones shown in the captured output below.
!pip install whoosh
!pip install pytrec_eval
!pip install wget
!pip install chromadb
!pip install sentence-transformers

Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.8/468.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: whoosh
Successfully installed whoosh-2.7.4
Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=308212 sha256=3bfef98b877724e37c50c26b9700dca21d758bc7c12f9f6e19b9d395237e795b
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5
Collecting wget
  Downloading wg

In [2]:
# Download the course dataset archive and save it as product_search.zip
# in the current working directory.
import wget
wget.download("https://github.com/MIE451-2024/course-datasets/raw/main/product_search.zip", "product_search.zip")

'product_search.zip'

In [3]:
# Extract the archive; the captured output shows files landing under product_search/documents/.
# The IR systems below are constructed with "product_search" as their data directory.
!unzip product_search.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: product_search/documents/172338  
  inflating: product_search/documents/1032505  
  inflating: product_search/documents/1498799  
  inflating: product_search/documents/1177621  
  inflating: product_search/documents/514580  
  inflating: product_search/documents/772851  
  inflating: product_search/documents/1654026  
  inflating: product_search/documents/1105665  
  inflating: product_search/documents/939032  
  inflating: product_search/documents/1021744  
  inflating: product_search/documents/1133909  
  inflating: product_search/documents/1142390  
  inflating: product_search/documents/188428  
  inflating: product_search/documents/27993  
  inflating: product_search/documents/865667  
  inflating: product_search/documents/787967  
  inflating: product_search/documents/267299  
  inflating: product_search/documents/1584876  
  inflating: product_search/documents/1261105  
  inflating: product_search/docum

In [4]:
# imports DO NOT MODIFY
from whoosh import index, writing
from whoosh.searching import Results
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
from whoosh import qparser, query
from whoosh import scoring
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import abc
from abc import abstractmethod
from whoosh.analysis import Filter
import numpy as np
import chromadb
import json
from sentence_transformers import SentenceTransformer
import nltk
from nltk.stem import *
nltk.download("wordnet")

# Put all your imports here

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
class IRSystem(metaclass=abc.ABCMeta):
    """
    Abstract base class shared by the IR systems in this notebook.

    Subclasses supply the index, the parser/searcher and the search routine;
    this class resolves the data paths, loads the precomputed document
    embeddings, and provides the evaluation/printing helpers.

    DO NOT ADD OR MODIFY THIS CLASS
    """

    def __init__(self, data_dir):
        """
        Resolve the dataset paths under `data_dir`, load the document
        embeddings, then call create_index() and create_parser_searcher().

        add_files() is NOT called here; the index is empty until the caller
        invokes it explicitly.
        """
        # DON'T change the following names,topic_file, qrels_file, document_dir, file_list, text_embedding
        self.topic_file = os.path.join(data_dir, "product.topics")
        self.qrels_file = os.path.join(data_dir, "product.qrels")
        self.document_dir = os.path.join(data_dir, "documents")
        # Every regular file found recursively under the documents directory, as str paths.
        self.file_list = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]
        # this is the text embedding of the documents for Question 4 Only
        # NOTE(review): the file object passed to json.load is never explicitly closed.
        self.text_embedding = json.load(open(os.path.join(data_dir, "corpus_embeddings.json"), "r"))

        self.create_index()
        self.create_parser_searcher()

    @abstractmethod
    def create_index(self):
        """Create the backing index and store it on self.index_sys (subclass responsibility)."""
        pass

    @abstractmethod
    def add_files(self):
        """Add the documents to self.index_sys (subclass responsibility)."""
        pass

    @abstractmethod
    def create_parser_searcher(self):
        """Set self.query_parser and self.searcher (subclass responsibility)."""
        pass

    @abstractmethod
    def perform_search(self, topic_phrase):
        """
        Run a search for `topic_phrase` and return the results.

        The helpers below accept either a dict with 'ids' and 'distances'
        entries (Chroma branch) or an object that is iterable and exposes
        .score(docnum) (Whoosh branch).
        """
        pass

    @staticmethod
    def post_process_score(score):
        """Hook applied to each score printed by print_rel_name; returns the score unchanged here."""
        return score

    @staticmethod
    def print_trec_eval_result(results):
        """
        Print every per-query measure in `results`, followed by the value of
        each measure aggregated over all queries. Prints 'empty results' and
        returns when `results` is empty.
        """

        if not results:
            print('empty results')
            return

        def print_line(name, scope, num):
            # Fixed-width columns: measure name, scope (query id or 'all'), value to 4 decimals.
            print('{:25s}{:8s}{:.4f}'.format(name, scope, num))

        for query_id, query_measures in results.items():
            for measure, value in query_measures.items():
                if measure == "runid":
                    continue
                print_line(measure, query_id, value)

        # `query_measures` here is the value left over from the LAST iteration of the
        # loop above; it is only used for its measure names.
        for measure in query_measures.keys():
            if measure == "runid":
                continue
            print_line(
                measure,
                'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [query_measures[measure]
                     for query_measures in results.values()]))

    def print_rel_name(self, q_id):
        """
        For the topic whose id equals `q_id`, print the topic line, the
        documents returned by perform_search (one TREC-style line each), and
        the qrels lines for that topic whose relevance label is "1".

        Uses the current self.searcher indirectly through perform_search;
        unlike py_trec_eval it does not call create_parser_searcher() first.
        """
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()
        for topic in topics:
            # Each topic line is "<topic_id> <topic phrase>"; split on the first space only.
            topic_id, topic_phrase = tuple(topic.split(" ", 1))
            if topic_id == q_id:
                print("---------------------------Topic_id and Topic_phrase----------------------------------")
                print(topic_id, topic_phrase)
                 # get search result
                topicResults = self.perform_search(topic_phrase)
                print("---------------------------Return documents----------------------------------")
                if isinstance(topicResults, dict):
                    # format the result for chroma search
                    for (docnum, result) in enumerate(topicResults['ids'][0]):
                        # The raw distance is printed (after post_process_score); it is not
                        # negated here, unlike in py_trec_eval.
                        score = topicResults['distances'][0][docnum]
                        score = self.post_process_score(score)
                        print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result), docnum, score))
                else:
                    # format the result for whoosh search
                    for (docnum, result) in enumerate(topicResults):
                        score = topicResults.score(docnum)
                        score = self.post_process_score(score)
                        print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
                print("---------------------------Relevant documents----------------------------------")
                with open(self.qrels_file, 'r') as f_qrel:
                    qrels = f_qrel.readlines()
                    for i in qrels:
                        # Each qrels line has four space-separated fields; the second is ignored.
                        qid, _, doc, rel = i.rstrip().split(" ")
                        if qid == q_id and rel == "1":
                            print(i.rstrip())

    def py_trec_eval(self):
        """
        Run every topic in the topic file through perform_search, write the
        results to a temporary TREC run file, evaluate them against the qrels
        with all pytrec_eval.supported_measures, and print the results.
        """

        # Re-create the parser/searcher first, so documents added after __init__ are picked up.
        self.create_parser_searcher()
        # Load topic file - a list of topics(search phrases) used for evalutation
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()

            # create an output file to which we'll write our results
        # mkstemp() returns (fd, path); only the path is kept.
        # NOTE(review): the returned fd is never closed and the file is never removed.
        temp_output_file = tempfile.mkstemp()[1]
        with open(temp_output_file, "w") as outputTRECFile:
            # for each evaluated topic:
            # build a query and record the results in the file in TREC_EVAL format
            for topic in topics:
                topic_id, topic_phrase = tuple(topic.split(" ", 1))
                # get search result
                topicResults = self.perform_search(topic_phrase)
                if isinstance(topicResults, dict):
                    # format the result for chroma search
                    for (docnum, result) in enumerate(topicResults['ids'][0]):
                        # Distances are negated so that a smaller distance becomes a larger score.
                        score = -topicResults['distances'][0][docnum]
                        outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id,
                        os.path.basename(result), docnum, score))
                else:
                    # format the result for whoosh search
                    for (docnum, result) in enumerate(topicResults):
                        score = topicResults.score(docnum)
                        outputTRECFile.write(
                            "%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

        with open(self.qrels_file, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(temp_output_file, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)

        results = evaluator.evaluate(run)

        self.print_trec_eval_result(results)


In [6]:
# Dont change this! Use it as-is in your code
# This filter will run for both the index and the query
class CustomFilter(Filter):
    """
    Whoosh filter that rewrites each token's text with a user-supplied function.

    `filterFunc` is called as filterFunc(token_text, *args, **kwargs) and its
    return value replaces the token text. The same transformation is applied
    in both 'query' and 'index' mode.
    """
    is_morph = True
    def __init__(self, filterFunc, *args, **kwargs):
        # Keep the callable and its extra arguments for use in __call__.
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs
    # NOTE(review): this method is declared without an `other` parameter but its body
    # references `other`, so evaluating `filter_a == filter_b` would raise a TypeError
    # (Python passes two arguments). Left untouched because the assignment forbids
    # changing this class.
    def __eq__(self):
        return (other
                and self.__class__ is other.__class__)
    def __call__(self, tokens):
        # Both branches perform the same rewrite; they differ only in which mode triggered them.
        for t in tokens:
            if t.mode == 'query': # if called by query parser
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t
            else: # == 'index' if called by indexer
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t

## Question 1


### Q1 (a): Provide answer to Q1 (a) here [markdown cell]

P_10

### Q1 (b): Provide answer to Q1 (b) here [markdown cell]

Because users searching for products typically look only at the first few results, high precision at the top of the ranking is crucial.

## Question 2

### Q2 (a): Write your code below

**1. The auto-grader will extract and use the following variables, DON'T change their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined functions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

In [10]:
class IRQ2(IRSystem):
    """Baseline Whoosh system: documents are tokenized with a plain RegexTokenizer."""

    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Sets self.index_sys (type whoosh.index.FileIndex) to a new, empty
              index created in a fresh temporary directory.
        """
        # file_path is stored so it can be read back from search hits;
        # file_content is indexed for search but not stored.
        schema = Schema(
            file_path=ID(stored=True),
            file_content=TEXT(analyzer=RegexTokenizer()),
        )
        index_dir = tempfile.mkdtemp()

        # DON't change the name of 'index_sys'
        self.index_sys = index.create_in(index_dir, schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Reads every file under self.document_dir and adds it to
              self.index_sys through a buffered writer.
        """
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)
        paths = [str(p) for p in Path(self.document_dir).glob("**/*") if p.is_file()]

        try:
            for count, path in enumerate(paths, start=1):
                with open(path, "r", encoding="utf-8") as handle:
                    text = handle.read()
                writer.add_document(file_path=path, file_content=text)

                # report progress every 1000 documents
                if count % 1000 == 0:
                    print("already indexed:", count)
            print("done indexing.")
        finally:
            # always release the writer, even if a file could not be read
            writer.close()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Sets self.query_parser (type whoosh.qparser.default.QueryParser)
              and self.searcher (type whoosh.searching.Searcher).
        """
        # DON't change the names of 'query_parser' and 'searcher'

        # Queries are parsed against the "file_content" field of the index schema.
        index_schema = self.index_sys.schema
        self.query_parser = QueryParser("file_content", schema=index_schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results

        NOTE: Parses topic_phrase with self.query_parser and searches with
              self.searcher; limit=None asks for all matching documents.
        """
        parsed_query = self.query_parser.parse(topic_phrase)
        return self.searcher.search(parsed_query, limit=None)

In [11]:
# Build the Q2 system: the constructor creates an empty index and the parser/searcher,
# but does not add any documents yet.
q2 = IRQ2("product_search")

In [12]:
# Index every document under product_search/documents into the Q2 index.
q2.add_files()

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
already indexed: 5000
already indexed: 6000
done indexing.


In [13]:
# Evaluate Q2 on all topics; py_trec_eval re-creates the searcher first,
# so the documents indexed above are visible to the search.
q2.py_trec_eval()

num_q                    100246  1.0000
num_ret                  100246  2.0000
num_rel                  100246  102.0000
num_rel_ret              100246  2.0000
map                      100246  0.0196
gm_map                   100246  -3.9318
Rprec                    100246  0.0196
bpref                    100246  0.0196
recip_rank               100246  1.0000
iprec_at_recall_0.00     100246  1.0000
iprec_at_recall_0.10     100246  0.0000
iprec_at_recall_0.20     100246  0.0000
iprec_at_recall_0.30     100246  0.0000
iprec_at_recall_0.40     100246  0.0000
iprec_at_recall_0.50     100246  0.0000
iprec_at_recall_0.60     100246  0.0000
iprec_at_recall_0.70     100246  0.0000
iprec_at_recall_0.80     100246  0.0000
iprec_at_recall_0.90     100246  0.0000
iprec_at_recall_1.00     100246  0.0000
P_5                      100246  0.4000
P_10                     100246  0.2000
P_15                     100246  0.1333
P_20                     100246  0.1000
P_30                     100246  0.06

In [14]:
# Show the returned vs. relevant documents for topic 100246
# (uses the searcher left in place by the py_trec_eval call above).
q2.print_rel_name('100246')

---------------------------Topic_id and Topic_phrase----------------------------------
100246 SPLOTY Tire Inflator Air Compressor Portable 12V
---------------------------Return documents----------------------------------
100246 Q0 1555783 0 54.335138 test
100246 Q0 1489405 1 53.903616 test
---------------------------Relevant documents----------------------------------
100246 0 10037 1
100246 0 1030471 1
100246 0 103372 1
100246 0 1044 1
100246 0 1044436 1
100246 0 106086 1
100246 0 1079438 1
100246 0 1086239 1
100246 0 1093412 1
100246 0 10966 1
100246 0 1105375 1
100246 0 1107134 1
100246 0 1137791 1
100246 0 1140524 1
100246 0 1148601 1
100246 0 116655 1
100246 0 1176585 1
100246 0 117671 1
100246 0 1187066 1
100246 0 1191987 1
100246 0 120310 1
100246 0 1214125 1
100246 0 1214399 1
100246 0 121627 1
100246 0 1216577 1
100246 0 1267443 1
100246 0 128561 1
100246 0 1294856 1
100246 0 1306587 1
100246 0 1307993 1
100246 0 1349887 1
100246 0 1384746 1
100246 0 138718 1
100246 0 1418253 

In [15]:
# Snapshot the Q2 objects under fixed names (these names are read by the auto-grader).
INDEX_Q2 = q2.index_sys
QP_Q2 = q2.query_parser
SEARCHER_Q2 = q2.searcher

### Q2 (b): Provide answer to Q2 (b) here [markdown cell]

The baseline Whoosh system evaluated a total of 10 queries/topics. The average P_10 score across these topics is 0.28.

### Q2 (c): Provide answer to Q2(c) here [markdown cell]

The system performed very well on topic ID 200013, achieving a P_10 score of 1.0. However, it performed very poorly on topic ID 100436, with a P_10 score of 0.0.


## Question 3

### Q3 (a): Provide answer to Q3 (a) here [markdown cell]

In [16]:
# Inspect the baseline's returned vs. relevant documents for topic 100436.
q2.print_rel_name('100436')

---------------------------Topic_id and Topic_phrase----------------------------------
100436 Microsoft Surface Laptop 2
---------------------------Return documents----------------------------------
100436 Q0 1454895 0 32.313010 test
100436 Q0 1018392 1 31.286254 test
100436 Q0 1531654 2 30.105474 test
100436 Q0 222971 3 29.902118 test
100436 Q0 1211358 4 29.846285 test
100436 Q0 1611328 5 29.846285 test
100436 Q0 848659 6 29.814829 test
100436 Q0 838597 7 29.814829 test
100436 Q0 446132 8 29.791898 test
100436 Q0 1581323 9 29.791898 test
100436 Q0 1505502 10 29.630750 test
100436 Q0 55856 11 29.289351 test
100436 Q0 619911 12 29.258819 test
100436 Q0 1180732 13 29.025775 test
100436 Q0 154576 14 28.993166 test
100436 Q0 790458 15 28.942390 test
100436 Q0 1519025 16 28.905020 test
100436 Q0 546888 17 28.900627 test
100436 Q0 517899 18 28.528829 test
100436 Q0 518791 19 28.243789 test
100436 Q0 850038 20 28.225207 test
100436 Q0 541719 21 28.164424 test
100436 Q0 1427560 22 27.734011 te

Since P_10 is 0 for query 100436 (Microsoft Surface Laptop 2), none of the top 10 documents returned by the system is relevant.
- A false positive example is therefore document 1454895, which was ranked first but is not listed among the relevant documents for this query.
- A false negative example is document 10260, which is relevant to this query but was not retrieved within the top 10.


### Q3 (b): Write your code below

**1. The auto-grader will extract and use the following variables, DON'T change their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined functions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

In [17]:
class IRQ3(IRSystem):
    """Whoosh system that lowercases tokens at index and query time."""

    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Sets self.index_sys (type whoosh.index.FileIndex) to a new, empty
              index created in a fresh temporary directory.
        """
        # Tokenize, then lowercase every token.
        # (the exact code to implement a lowercase filter was from chatgpt)
        lowercase_analyzer = RegexTokenizer() | LowercaseFilter()

        # stored=True keeps the field's original content in the index as well.
        schema = Schema(
            file_path=ID(stored=True),
            file_content=TEXT(analyzer=lowercase_analyzer, stored=True),
        )
        index_dir = tempfile.mkdtemp()

        # DON't change the name of 'index_sys'
        self.index_sys = index.create_in(index_dir, schema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Reads every file under self.document_dir and adds it to
              self.index_sys through a buffered writer.
        """
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)
        paths = [str(p) for p in Path(self.document_dir).glob("**/*") if p.is_file()]

        try:
            for count, path in enumerate(paths, start=1):
                with open(path, "r", encoding="utf-8") as handle:
                    text = handle.read()
                writer.add_document(file_path=path, file_content=text)

                # report progress every 1000 documents
                if count % 1000 == 0:
                    print("already indexed:", count)
            print("done indexing.")
        finally:
            # always release the writer, even if a file could not be read
            writer.close()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Sets self.query_parser (type whoosh.qparser.default.QueryParser)
              and self.searcher (type whoosh.searching.Searcher).
        """
        # DON't change the names of 'query_parser' and 'searcher'
        index_schema = self.index_sys.schema
        self.query_parser = QueryParser("file_content", schema=index_schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results

        NOTE: Lowercases topic_phrase, parses it with self.query_parser and
              searches with self.searcher; limit=None asks for all matching
              documents.
        """
        parsed_query = self.query_parser.parse(topic_phrase.lower())
        return self.searcher.search(parsed_query, limit=None)

In [18]:
# Build the Q3 system: the constructor creates an empty index and the parser/searcher,
# but does not add any documents yet.
q3 = IRQ3("product_search")

In [19]:
# Index every document under product_search/documents into the Q3 index.
q3.add_files()

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
already indexed: 5000
already indexed: 6000
done indexing.


In [20]:
# Evaluate Q3 on all topics; py_trec_eval re-creates the searcher first,
# so the documents indexed above are visible to the search.
q3.py_trec_eval()

num_q                    100246  1.0000
num_ret                  100246  2.0000
num_rel                  100246  102.0000
num_rel_ret              100246  2.0000
map                      100246  0.0196
gm_map                   100246  -3.9318
Rprec                    100246  0.0196
bpref                    100246  0.0196
recip_rank               100246  1.0000
iprec_at_recall_0.00     100246  1.0000
iprec_at_recall_0.10     100246  0.0000
iprec_at_recall_0.20     100246  0.0000
iprec_at_recall_0.30     100246  0.0000
iprec_at_recall_0.40     100246  0.0000
iprec_at_recall_0.50     100246  0.0000
iprec_at_recall_0.60     100246  0.0000
iprec_at_recall_0.70     100246  0.0000
iprec_at_recall_0.80     100246  0.0000
iprec_at_recall_0.90     100246  0.0000
iprec_at_recall_1.00     100246  0.0000
P_5                      100246  0.4000
P_10                     100246  0.2000
P_15                     100246  0.1333
P_20                     100246  0.1000
P_30                     100246  0.06

In [21]:
# Show the returned vs. relevant documents for topic 100246 under the Q3 system.
q3.print_rel_name('100246')

---------------------------Topic_id and Topic_phrase----------------------------------
100246 SPLOTY Tire Inflator Air Compressor Portable 12V
---------------------------Return documents----------------------------------
100246 Q0 1489405 0 57.418609 test
100246 Q0 1555783 1 56.632846 test
---------------------------Relevant documents----------------------------------
100246 0 10037 1
100246 0 1030471 1
100246 0 103372 1
100246 0 1044 1
100246 0 1044436 1
100246 0 106086 1
100246 0 1079438 1
100246 0 1086239 1
100246 0 1093412 1
100246 0 10966 1
100246 0 1105375 1
100246 0 1107134 1
100246 0 1137791 1
100246 0 1140524 1
100246 0 1148601 1
100246 0 116655 1
100246 0 1176585 1
100246 0 117671 1
100246 0 1187066 1
100246 0 1191987 1
100246 0 120310 1
100246 0 1214125 1
100246 0 1214399 1
100246 0 121627 1
100246 0 1216577 1
100246 0 1267443 1
100246 0 128561 1
100246 0 1294856 1
100246 0 1306587 1
100246 0 1307993 1
100246 0 1349887 1
100246 0 1384746 1
100246 0 138718 1
100246 0 1418253 

In [22]:
# Snapshot the Q3 objects under fixed names (these names are read by the auto-grader).
INDEX_Q3 = q3.index_sys
QP_Q3 = q3.query_parser
SEARCHER_Q3 = q3.searcher

### Q3 (c): Provide answer to Q3 (c) here [markdown cell]

Modifications:

- I added a LowercaseFilter to the text processing pipeline for both indexing and querying. This ensures that all content is converted to lowercase, allowing for consistent matching regardless of the original case of the text in documents or queries.

Improvements:

- The modification of using LowercaseFilter improved the consistency in matching queries with the document content. This change potentially increased the P_10 score for queries where case sensitivity might have previously caused mismatches.
- Specifically, for queries like "Microsoft Surface Laptop 2," where case differences could have led to missed matches, this modification likely improved recall and precision, particularly at the top-ranked results (e.g., P_10).


### Q3 (d): Provide answer to Q3 (d) here [markdown cell]

Yes

### Q3 (e): Provide answer to Q3 (e) here [markdown cell]

Yes

### Q3 (f): Provide answer to Q3 (f) here [markdown cell]

The idea of applying a lowercase filter was good because it standardizes the text and queries, reducing the chances of mismatches caused by case differences. This is especially beneficial in cases where users might enter queries in different cases or where document content varies in case usage. However, while this approach generally improves recall and consistency, it could potentially decrease precision in scenarios where case distinctions are important. Overall, I believe the benefits outweigh the downsides.

## Question 4

### Q4a) Write your code below

**1. The auto-grader will extract and use the following variables, DON'T change their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir
      self.text_embedding
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined functions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

In [23]:
class IRQ4(IRSystem):
    """
    Dense-retrieval system: documents are stored in a Chroma collection with
    their precomputed embeddings (self.text_embedding), and queries are
    embedded with the TAS-B sentence-transformers model.
    """

    # Number of documents sent to Chroma per add() call. Batching avoids one
    # round-trip per document; 1000 also matches the progress-report interval.
    _ADD_BATCH_SIZE = 1000

    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Sets self.index_sys (type chromadb.Collection) to a new, empty
              collection named `ir_assignment_[utorid]`. Any existing
              collection with that name is deleted first.
        """
        chroma_client = chromadb.Client()

        # Modify collection_name as `ir_assignment_[utorid]`
        name = 'ir_assignment_kimdabbi'

        ## DON't Modify Following Line
        if name in [collection.name for collection in
                               chroma_client.list_collections()]:
            chroma_client.delete_collection(name)

        # Your Code Here, DON't change the name of 'index_sys'

        self.index_sys = chroma_client.create_collection(name=name)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Adds the content of each document under self.document_dir,
              together with its embedding from self.text_embedding, to
              self.index_sys. Documents are added in batches of
              _ADD_BATCH_SIZE instead of one add() call per document.

        RAISES:
            KeyError if a document has no entry in self.text_embedding.
        """
        fileList = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

        batch_ids, batch_docs, batch_embeddings = [], [], []

        def flush():
            # Send the pending batch to Chroma (no-op when nothing is pending).
            if batch_ids:
                self.index_sys.add(ids=list(batch_ids),
                                   documents=list(batch_docs),
                                   embeddings=list(batch_embeddings))
                batch_ids.clear()
                batch_docs.clear()
                batch_embeddings.clear()

        for docNum, filePath in enumerate(fileList, start=1):
            # The document id is the file name, which is also the key into text_embedding.
            doc_id = os.path.basename(filePath)
            embedding = self.text_embedding[doc_id]
            with open(filePath, "r", encoding="utf-8") as f:
                fileContent = f.read()

            batch_ids.append(doc_id)
            batch_docs.append(fileContent)
            batch_embeddings.append(embedding)

            if docNum % self._ADD_BATCH_SIZE == 0:
                flush()
                print("already indexed:", docNum)

        # Add whatever is left over after the last full batch.
        flush()
        print("done indexing.")

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None

        NOTE: Sets self.query_parser (type
              sentence_transformers.SentenceTransformer, TAS-B model) and
              self.searcher (the Chroma collection). The model is loaded only
              once per instance: py_trec_eval calls this method again, and
              reloading the model each time is unnecessary.
        """
        # DON't change the names of 'query_parser' and 'searcher'
        if not isinstance(getattr(self, "query_parser", None), SentenceTransformer):
            self.query_parser = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')
        # DON't modify the following line
        self.searcher = self.index_sys

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: dict

        NOTE: Embeds topic_phrase with self.query_parser and queries
              self.searcher with that embedding. The number of results is left
              at the collection's default (the recorded evaluation output shows
              10 documents returned per topic).
        """
        query_embedding = self.query_parser.encode(topic_phrase).tolist()
        topicResults = self.searcher.query(query_embeddings=query_embedding)
        return topicResults

In [24]:
# Build the Q4 system (creates the Chroma collection and loads the TAS-B model),
# then add every document with its precomputed embedding.
q4= IRQ4("product_search")
q4.add_files()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
already indexed: 5000
already indexed: 6000
done indexing.


In [25]:
# Evaluate Q4 on all topics; for Chroma results py_trec_eval writes the negated
# distance as the score, so closer documents rank higher.
q4.py_trec_eval()

num_q                    100246  1.0000
num_ret                  100246  10.0000
num_rel                  100246  102.0000
num_rel_ret              100246  7.0000
map                      100246  0.0555
gm_map                   100246  -2.8918
Rprec                    100246  0.0686
bpref                    100246  0.0678
recip_rank               100246  1.0000
iprec_at_recall_0.00     100246  1.0000
iprec_at_recall_0.10     100246  0.0000
iprec_at_recall_0.20     100246  0.0000
iprec_at_recall_0.30     100246  0.0000
iprec_at_recall_0.40     100246  0.0000
iprec_at_recall_0.50     100246  0.0000
iprec_at_recall_0.60     100246  0.0000
iprec_at_recall_0.70     100246  0.0000
iprec_at_recall_0.80     100246  0.0000
iprec_at_recall_0.90     100246  0.0000
iprec_at_recall_1.00     100246  0.0000
P_5                      100246  0.6000
P_10                     100246  0.7000
P_15                     100246  0.4667
P_20                     100246  0.3500
P_30                     100246  0.2

In [26]:
# Show the returned vs. relevant documents for topic 100246.
# The score column here is the raw Chroma distance (smaller means closer).
q4.print_rel_name('100246')

---------------------------Topic_id and Topic_phrase----------------------------------
100246 SPLOTY Tire Inflator Air Compressor Portable 12V
---------------------------Return documents----------------------------------
100246 Q0 1489405 0 27.039255 test
100246 Q0 1555783 1 28.156231 test
100246 Q0 963936 2 29.083698 test
100246 Q0 1640998 3 30.070866 test
100246 Q0 1138844 4 31.560390 test
100246 Q0 746471 5 31.703403 test
100246 Q0 766020 6 32.279701 test
100246 Q0 444309 7 32.787178 test
100246 Q0 969940 8 33.005508 test
100246 Q0 272645 9 33.311699 test
---------------------------Relevant documents----------------------------------
100246 0 10037 1
100246 0 1030471 1
100246 0 103372 1
100246 0 1044 1
100246 0 1044436 1
100246 0 106086 1
100246 0 1079438 1
100246 0 1086239 1
100246 0 1093412 1
100246 0 10966 1
100246 0 1105375 1
100246 0 1107134 1
100246 0 1137791 1
100246 0 1140524 1
100246 0 1148601 1
100246 0 116655 1
100246 0 1176585 1
100246 0 117671 1
100246 0 1187066 1
10024

In [27]:
# Snapshot the Q4 objects under fixed names (these names are read by the auto-grader).
INDEX_Q4 = q4.index_sys
QP_Q4 = q4.query_parser
SEARCHER_Q4 = q4.searcher

### Q4 (b): Provide answer to Q4 (b) here [markdown cell]


Yes

### Q4 (c): Provide answer to Q4 (c) here [markdown cell]


Yes

### Q4 (d): Provide answer to Q4 (d) here [markdown cell]

The system’s P_10 improved a lot compared with the Whoosh systems (for example, from 0.2 to 0.7 on topic 100246), indicating better alignment between user queries and relevant documents. This is because the combination of the TAS-B model and Chroma DB indexing allows semantic matching between queries and documents rather than exact term matching, which improves precision metrics such as P_10.


## Validation

In [None]:
# Run the following cells to make sure your code returns the correct value types

In [28]:
from whoosh.index import FileIndex
from whoosh.qparser import QueryParser
from whoosh.searching import Searcher
import os.path

### Q2 Validation

In [29]:
# Type check only. NOTE: this rebinds `q2` to a fresh instance whose index is empty
# (add_files is not called), so re-run the Q2 cells before searching with `q2` again.
q2 = IRQ2("product_search")
assert(isinstance(q2.index_sys, FileIndex)), "Index Type"
assert(isinstance(q2.query_parser, QueryParser)), "Query Parser Type"
assert(isinstance(q2.searcher, Searcher)), "Searcher Type"
print("Q2 Types Validated")

Q2 Types Validated


### Q3 Validation

In [30]:
# Type check only. NOTE: this rebinds `q3` to a fresh instance whose index is empty
# (add_files is not called), so re-run the Q3 cells before searching with `q3` again.
q3 = IRQ3("product_search")
assert(isinstance(q3.index_sys, FileIndex)), "Index Type"
assert(isinstance(q3.query_parser, QueryParser)), "Query Parser Type"
assert(isinstance(q3.searcher, Searcher)), "Searcher Type"
print("Q3 Types Validated")

Q3 Types Validated


### Q4 Validation

In [31]:
# Type check only. NOTE: this rebinds `q4` to a fresh instance with an empty collection
# (add_files is not called).
# NOTE(review): create_index deletes any existing collection with the same name, so -
# assuming chromadb.Client() instances share in-memory state (TODO confirm) - this
# also invalidates the collection captured earlier as INDEX_Q4 / SEARCHER_Q4.
q4 = IRQ4("product_search")
assert(isinstance(q4.index_sys, chromadb.Collection)), "Collection Type"
assert(isinstance(q4.query_parser, SentenceTransformer)), " SentenceTransformer Type"
assert(isinstance(q4.searcher, chromadb.Collection)), "Collection Type"
print("Q4 Types Validated")

Q4 Types Validated
