# IR Lab SoSe 2024: Combined Retrieval System

This Jupyter notebook implements an improved retrieval system that combines components from both provided notebooks.
We will use a corpus of scientific papers (titles + abstracts) from the fields of information retrieval and natural language processing (the [IR Anthology](https://ir.webis.de/anthology/) and the [ACL Anthology](https://aclanthology.org/)). This notebook acts as a retrieval system, i.e., it takes a set of information needs (topics) and a corpus as input and produces a run file as output. Please run evaluations in a separate, dedicated notebook.

### Step 1: Import Libraries

In [None]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
import logging

# Configure the root logger so the INFO-level progress messages below are shown.
logging.basicConfig(level=logging.INFO)

# Download the NLTK tokenizer data used by `word_tokenize`.
# Older NLTK releases load the 'punkt' resource, while newer releases
# (NLTK 3.9 and later) load 'punkt_tab' instead and raise a LookupError when
# only 'punkt' is installed. Fetching both keeps the notebook working
# regardless of the installed NLTK version.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Initialize PyTerrier and TIRA client
ensure_pyterrier_is_loaded()
tira = Client()

logging.info("Libraries imported successfully.")

### Step 2: Load the Dataset and the Index

In [None]:
# Load the corpus and its pre-built index. Any failure is logged and then
# re-raised, because the rest of the notebook cannot run without both.
try:
    # Corpus: the union of the IR Anthology and the ACL Anthology.
    pt_dataset = pt.get_dataset(
        'irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training'
    )
    logging.info("Dataset loaded successfully.")

    # Pre-built PyTerrier index, obtained through the TIRA client.
    index = tira.pt.index(
        'ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)',
        pt_dataset,
    )
except Exception as exc:
    logging.error(f"An error occurred while loading the dataset or index: {str(exc)}")
    raise
else:
    logging.info("Index loaded successfully.")

### Step 3: Define the Retrieval Pipeline

In [None]:
# Three first-stage retrievers over the same index, one per weighting model.
# Note that tf_idf and dirichletLM are independent retrievers whose scores are
# fused below; they do not rerank the BM25 output.
bm25, tf_idf, dirichletLM = (
    pt.BatchRetrieve(index, wmodel=weighting_model)
    for weighting_model in ("BM25", "TF_IDF", "DirichletLM")
)

# Bo1 query expansion (configured with fb_docs=10, fb_terms=20): retrieve with
# BM25, rewrite the query from that result, then retrieve again with BM25.
bo1_expansion = pt.rewrite.Bo1QueryExpansion(index, fb_terms=20, fb_docs=10)
bm25_bo1 = bm25 >> bo1_expansion >> bm25

# Score fusion: PyTerrier's `+` adds the scores of two result lists and
# `scalar *` scales them, so TF_IDF and DirichletLM each get weight 2.
# NOTE(review): the three models score on different scales and no
# normalization is applied here - confirm the weights are intended as-is.
combined_pipeline = (bm25_bo1 + 2 * tf_idf) + 2 * dirichletLM

logging.info("Retrieval pipeline defined successfully.")

### Step 4: Create the Run

In [None]:
# Fetch the 'text' variant of the topics and show the first three rows as a
# quick sanity check.
print('First, we have a short look at the first three topics:')
topics = pt_dataset.get_topics('text')
print(topics.iloc[:3])

# Simple tokenization function
def tokenize_query(query):
    """Lowercase and tokenize a query, keeping only alphanumeric terms.

    The query is lowercased and split with NLTK's ``word_tokenize``. That
    tokenizer keeps punctuation, either as separate tokens (``?``, ``:``) or
    inside tokens (``'s``, ``cross-lingual``). Such characters can make
    Terrier's query parser fail or be interpreted as query operators, so every
    non-alphanumeric character is treated as a separator and dropped.

    Args:
        query: The raw query string.

    Returns:
        The cleaned terms joined by single spaces; an empty string when the
        query contains no alphanumeric characters.
    """
    terms = []
    for token in word_tokenize(query.lower()):
        # Turn punctuation into spaces so "cross-lingual" yields two terms and
        # pure-punctuation tokens such as "?" disappear entirely.
        cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in token)
        terms.extend(cleaned.split())
    return ' '.join(terms)

# Build a tokenized copy of the topics; the original `topics` frame is left
# untouched.
print('\nTokenizing the queries...')
tokenized_topics = topics.assign(query=topics['query'].map(tokenize_query))
print(tokenized_topics.iloc[:3])

# Run the fused pipeline over all tokenized topics.
print('\nNow we do the retrieval...')
run = combined_pipeline.transform(tokenized_topics)

# Show a small sample of the result.
print('\nDone. Here are the first 10 entries of the run')
print(run.iloc[:10])

# Define possible output directories
output_dirs = [
    os.environ.get('outputDir', '/output'),  # TIRA-specific directory
    '../runs',  # Local directory outside the sandbox
    '.'  # Current directory as fallback
]

# Reduce the result frame to the six columns of a TREC run file
# (qid, Q0, docno, rank, score, tag). Dumping the raw frame would also write
# helper columns such as the query text, which evaluation tools cannot parse.
trec_run = pd.DataFrame({
    'qid': run['qid'],
    'iter': 'Q0',
    'docno': run['docno'],
    'rank': run['rank'],
    'score': run['score'],
    'tag': 'combined-bm25-bo1-tfidf-dirichlet',
})

# Try to write to each directory until successful
for output_dir in output_dirs:
    try:
        os.makedirs(output_dir, exist_ok=True)
        run_file_path = os.path.join(output_dir, 'run.txt')
        trec_run.to_csv(run_file_path, sep='\t', index=False, header=False)
        logging.info(f"Results saved to {run_file_path}")
        break  # Exit the loop if writing was successful
    except OSError as e:
        # A directory that cannot be created or written is not fatal by
        # itself; fall through to the next candidate.
        logging.warning(f"Could not save to {output_dir}: {str(e)}")
else:
    # The loop finished without `break`, so no candidate was writable.
    logging.error("Failed to save results to any output directory")
    raise RuntimeError("No writable output directory found")

# Best-effort hand-off to TIRA's helper; presumably it writes the run in the
# format TIRA expects - confirm against the TIRA documentation. A failure here
# is only logged as a warning and does not stop the notebook.
try:
    persist_and_normalize_run(
        run,
        default_output='../runs',
        system_name='combined-bm25-bo1-tfidf-dirichlet',
    )
except Exception as exc:
    logging.warning(f"Could not persist and normalize run: {str(exc)}")