In [13]:
!pip install python-terrier

import random
from pathlib import Path
import pyterrier as pt
if not pt.started():
    pt.init()



In [14]:
# Example of loading a dataset from PyTerrier
dataset = pt.get_dataset('irds:msmarco-passage/trec-dl-2020')

In [15]:
topics = dataset.get_topics()
topics.head()

Unnamed: 0,qid,query
0,1030303,who is aziz hashim
1,1037496,who is rep scalise
2,1043135,who killed nicholas ii of russia
3,1045109,who owns barnhart crane
4,1049519,who said no one can make you feel inferior


In [16]:
qrels = dataset.get_qrels()
qrels.head()

Unnamed: 0,qid,docno,label,iteration
0,23849,1020327,2,0
1,23849,1034183,3,0
2,23849,1120730,0,0
3,23849,1139571,1,0
4,23849,1143724,0,0


In [17]:
corpus_iter = dataset.get_corpus_iter()

# Convert to an iterator
corpus_iterator = iter(corpus_iter)

first_doc = next(corpus_iterator)
print(first_doc)

msmarco-passage/trec-dl-2020 documents:   0%|          | 0/8841823 [00:00<?, ?it/s]

{'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.', 'docno': '0'}


In [18]:
import pandas as pd
import spacy
from spellchecker import SpellChecker
import pyterrier as pt

class AdvancedQueryRewriting(pt.Transformer):
    def __init__(self):
        # Load English tokenizer, tagger, parser, NER, and word vectors
        self.nlp = spacy.load("en_core_web_sm")
        self.spellchecker = SpellChecker()
        super().__init__()

    def _correct_spelling(self, query: str) -> str:
        corrected_query = []
        for word in query.split():
            # Ensure correction or original word is added, preventing None
            corrected_word = self.spellchecker.correction(word) or word
            corrected_query.append(corrected_word)
        return " ".join(corrected_query)

    def _extract_keywords(self, query: str) -> str:
        doc = self.nlp(query)
        keywords = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
        # Ensure keywords list is not empty; return original query if it is
        return " ".join(keywords) if keywords else query

    def _expand_synonyms(self, query: str) -> str:
        doc = self.nlp(query)
        expanded_query = []
        for token in doc:
            # Ensure lemma or original word is added, preventing None
            expanded_word = token.lemma_ or token.text
            expanded_query.append(expanded_word)
        return " ".join(expanded_query)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        df_new = df.copy()
        df_new["original_query"] = df_new["query"].astype(str)
        # Apply transformations with assurance that each step returns a string
        df_new["query"] = df_new["original_query"].map(self._correct_spelling)
        df_new["query"] = df_new["query"].map(self._extract_keywords)
        # df_new["query"] = df_new["query"].map(self._expand_synonyms)
        return df_new


In [19]:
# Instantiate the query rewriting transformer
query_rewriter = AdvancedQueryRewriting()

In [20]:
# Transform the topics with the query rewriter
rewritten_topics = query_rewriter.transform(topics)

In [21]:
# Compare the original and rewritten queries
for i in range(10):
    # where i is random number
    i = random.randint(0, len(topics) - 1)
    print(f"Original: {topics['query'][i]}")
    print(f"Rewritten: {rewritten_topics['query'][i]}\n")

Original: who said no one can make you feel inferior
Rewritten: said feel inferior

Original: how much weight on usps letter
Rewritten: weight uses letter

Original: estar meaning
Rewritten: star meaning

Original: what amino produces carnitine
Rewritten: amino produces carnotite

Original: what is chronometer who invented it
Rewritten: chronometer invented

Original: why did the ancient egyptians call their land kemet or black land
Rewritten: ancient egyptians land black land

Original: what causes muscles to tear
Rewritten: causes muscles tear

Original: why do some places on my scalp feel sore
Rewritten: places scalp feel sore

Original: how long do you stay contagious with the flu
Rewritten: long stay contagious flu

Original: what is onboarding for credit unions
Rewritten: boarding credit unions


In [22]:
indexer = pt.IterDictIndexer(str(Path("index").absolute()))
index_ref = indexer.index(corpus_iter)

msmarco-passage/trec-dl-2020 documents:   6%|▌         | 497955/8841823 [00:31<05:54, 23518.73it/s]



msmarco-passage/trec-dl-2020 documents: 100%|██████████| 8841823/8841823 [05:49<00:00, 25290.70it/s]
Exception in thread Thread-6 (_write_fifos):
Traceback (most recent call last):
  File "/Users/jasperbruin/anaconda3/envs/IR/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/Users/jasperbruin/anaconda3/envs/IR/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/jasperbruin/anaconda3/envs/IR/lib/python3.10/site-packages/pyterrier/index.py", line 1093, in _write_fifos
    json.dump(doc, fifo)
  File "/Users/jasperbruin/anaconda3/envs/IR/lib/python3.10/json/__init__.py", line 180, in dump
    fp.write(chunk)
BrokenPipeError: [Errno 32] Broken pipe
msmarco-passage/trec-dl-2020 documents:   0%|          | 69/8841823 [18:12<38885:27:08, 15.83s/it]


17:17:22.517 [ForkJoinPool-3-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 5 empty documents


In [23]:
del index_ref

index = pt.IndexFactory.of(str(Path("index")))
tf_idf = pt.BatchRetrieve(index, wmodel="TF_IDF")
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

JavaException: JVM exception occurred: Could not load an index for ref index, even though IndexLoader org.terrier.structures.IndexOnDisk$DiskIndexLoader could support that type of index. It may be your ref had a wrong location; Terrier logs may have more information. java.lang.IllegalArgumentException

In [None]:
from pyterrier.measures import RR, nDCG, MAP
from pathlib import Path

results_dir = Path("results")
results_dir.mkdir(exist_ok=True)

pt.Experiment(
    [tf_idf, bm25],
    dataset.get_topics(),
    dataset.get_qrels(),
    names=["TF-IDF", "BM25"],
    eval_metrics=[RR @ 10, nDCG @ 20, MAP, nDCG @ 10],
    save_dir=str(results_dir),
)

In [None]:
pt.Experiment(
    [tf_idf, bm25],
    rewritten_topics,  
    dataset.get_qrels(),
    names=["TF-IDF", "BM25"],
    eval_metrics=[RR @ 10, nDCG @ 20, MAP, nDCG @ 10],
    save_dir=str(results_dir),
)