# Step 1: Import Libraries

In [1]:
# Install necessary libraries
!pip3 install tira ir-datasets python-terrier nltk

[0m

In [2]:
# Import necessary libraries
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client

import nltk

import pandas as pd
import pyterrier as pt
from pyterrier.measures import *
import math

In [3]:
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
# Initialize PyTerrier
if not pt.started():
    pt.init()

In [5]:
# The dataset: the union of the IR Anthology and the ACL Anthology
# This line creates an IRDSDataset object and registers it under the name provided as an argument.
dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

# A (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', dataset)

# Step 2: Data Loading

In [6]:
# Display some example data
#topics = dataset.get_topics('text')
#qrels = dataset.get_qrels()


In [7]:
#print(topics.head())
#print(qrels.head())

In [8]:
# Step 3: Data Preprocessing
# Here we would include steps to clean and preprocess the text, such as removing stop words, stemming, etc.
# For this example, we'll assume the data is already preprocessed.

# Step 4: Information Retrieval Models

In [9]:
# BM25
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [10]:
# TF-IDF
tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")

# Step 5: Evaluation

In [12]:
# # Evaluate BM25
bm25_results = bm25.transform(dataset.get_topics('text'))
# bm25_metrics = pt.Experiment(
#     [bm25],
#     dataset.get_topics(), 
#     dataset.get_qrels(), 
#     eval_metrics=["map", "ndcg", "recip_rank"]
# )

In [13]:
# # Evaluate TF-IDF
tfidf_results = tfidf.transform(dataset.get_topics('text'))
# tfidf_metrics = pt.Experiment(
#     [tfidf],
#     topics,
#     qrels,
#     eval_metrics=["map", "ndcg", "recip_rank"]
# )

# Step 6: Conclusion

In [14]:
run = bm25_results
persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../runs')

run = tfidf_results
persist_and_normalize_run(run, system_name='tfidf-baseline', default_output='../runs')

The run file is normalized outside the TIRA sandbox, I will store it at "../runs".
Done. run file is stored under "../runs/run.txt".
The run file is normalized outside the TIRA sandbox, I will store it at "../runs".
Done. run file is stored under "../runs/run.txt".


In [None]:
# # Summarize findings
# print("BM25 Metrics:", bm25_metrics)
# print("TF-IDF Metrics:", tfidf_metrics)

# Step 7 : Benchmarks

### bm25

In [None]:
# # Local test of BM25
# bm25_experiment = pt.Experiment(
#     [bm25],
#     topics,
#     qrels,
#     eval_metrics=['P_1000', 'map', 'recip_rank'],
#     names=['BM25'],
#     baseline=0
# )

In [None]:
# # Display BM25 results
# print("BM25 Experiment Results")
# print(bm25_experiment)

### TF-IDF

In [None]:
# # Local test of TF-IDF
# tfidf_experiment = pt.Experiment(
#     [tfidf],
#     topics,
#     qrels,
#     eval_metrics=['P_1000', 'map', 'recip_rank'],
#     names=['TF-IDF'],
#     baseline=0
# )

In [None]:
# # Display TF-IDF results
# print("TF-IDF Experiment Results")
# print(tfidf_experiment)