In [2]:
# IR Lab SoSe 2024: Baseline Retrieval System

# You only need to execute this cell if you are using Google Colab.
!pip3 install tira ir-datasets python-terrier scikit-learn pandas

# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import os

# Create a TIRA REST client that talks to a service on localhost, port 12345.
# NOTE(review): `client` is not referenced again in the visible part of this
# notebook — confirm it is still needed.
client = Client(base_url='http://localhost:12345')

# Make PyTerrier available via the TIRA helper, then fall back to a plain
# `pt.init()` if PyTerrier still reports that it has not been started.
ensure_pyterrier_is_loaded()
if not pt.started():
    pt.init()

# Load the dataset by its PyTerrier identifier (the `irds:` prefix presumably
# selects the ir_datasets integration — confirm against the PyTerrier docs).
dataset = pt.get_dataset('irds:antique/train')

# Build the Terrier index once and reuse it on later runs.
# The index counts as present only when its `data.properties` file exists.
# Testing for the bare directory would treat a directory left behind by an
# interrupted indexing run as a finished index and then fail when opening it.
index_dir = './index'
if not os.path.exists(os.path.join(index_dir, 'data.properties')):
    # exist_ok=True: the directory may already exist without a usable index in it.
    os.makedirs(index_dir, exist_ok=True)
    pt.index.IterDictIndexer(index_dir).index(dataset.get_corpus_iter())
index = pt.IndexFactory.of(index_dir)

# Initialize the BM25 retriever on top of the index
bm25 = pt.BatchRetrieve(index, wmodel='BM25')

# Query expansion with Bo1 (the Bose-Einstein 1 model from the
# Divergence-from-Randomness framework). Bo1 is a different method from
# Rocchio relevance feedback.
qe = pt.rewrite.Bo1QueryExpansion(index)

# Pseudo-relevance-feedback pipeline: the first BM25 pass supplies feedback
# documents, Bo1 rewrites the query from them, and the second BM25 pass
# retrieves with the rewritten query.
pipeline = bm25 >> qe >> bm25

# Run the pipeline over the dataset's topics (requesting the 'text' variant)
topics = dataset.get_topics('text')
expanded_run = pipeline(topics)

# Segment the expanded queries for better retrieval (mock-up of a real segmentation step)
def segment_query(query):
    """Return the whitespace-delimited tokens of ``query`` as a list.

    Placeholder for a real query-segmentation step. Runs of whitespace act as
    a single separator and leading/trailing whitespace is ignored, so an empty
    or all-whitespace query yields an empty list.
    """
    segments = query.split()
    return segments

# Overwrite the `query` column of the run with the result of `segment_query`
# for each row.
# NOTE(review): this runs after retrieval (`expanded_run` has already been
# produced by the pipeline), so it changes only the `query` column and cannot
# influence ranks or scores.
expanded_run['query'] = expanded_run['query'].apply(segment_query)

# TF-IDF Vectorization
def tfidf_vectorize(corpus):
    """Compute a dense TF-IDF table for ``corpus``.

    A fresh ``TfidfVectorizer`` with default settings is fitted on ``corpus``
    (an iterable of document strings). The result is a ``pandas.DataFrame``
    with one row per document and one column per vocabulary term, holding the
    TF-IDF weight of that term in that document.
    """
    tfidf = TfidfVectorizer()
    # The sparse matrix is densified, which is fine for small demo corpora.
    weights = tfidf.fit_transform(corpus).toarray()
    terms = tfidf.get_feature_names_out()
    return pd.DataFrame(weights, columns=terms)

# Example usage of tfidf_vectorize on a two-document toy corpus.
# This demo is independent of `expanded_run`; it only illustrates the weighting.
documents = [
    "this is a sample",
    "this is another example example"
]
tfidf_df = tfidf_vectorize(documents)
# Print a label followed by the weight table (one row per document, one column per term)
print("tf-idf")
print(tfidf_df)

# Print status messages and preview the first 10 rows of the run.
# NOTE(review): retrieval already ran when `expanded_run` was computed, so the
# "Now we do the retrieval..." message is printed after the fact.
print('Now we do the retrieval with query expansion and segmentation...')
print('Done. Here are the first 10 entries of the expanded and segmented run')
print(expanded_run.head(10))

### Step 5: Persist the run file for subsequent evaluations
# The output of a prototypical retrieval system is a run file. This run file can later (optimally in a different notebook) be statistically evaluated.

# Hand the run to the TIRA helper under the system name 'bm25-qe-segmented'.
# `default_output` is presumably the directory used when no TIRA-provided
# output location is available — TODO confirm against the TIRA docs.
persist_and_normalize_run(expanded_run, system_name='bm25-qe-segmented', default_output='../runs')

# The two lines below look like console messages copied from an earlier run
# of the call above (presumably printed when running outside the TIRA sandbox):
#   The run file is normalized outside the TIRA sandbox, I will store it at "../runs".
#   Done. Run file is stored under "../runs/run.txt".


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
tf-idf
    another   example        is    sample      this
0  0.000000  0.000000  0.501549  0.704909  0.501549
1  0.407824  0.815648  0.290170  0.000000  0.290170
Now we do the retrieval with query expansion and segmentation...
Done. Here are the first 10 entries of the expanded and segmented run
       qid  docid  docno  rank      score  \
0  1000063    451    452     0  16.881727   
1  1000063   8981   8982     1  13.971172   
2  1000063   3127   3128     2  12.668903   
3  1000063   9843   9844     3  10.877479   
4  1000063   7234   7235     4  10.554410   
5  1000063   7817   7818     5   8.203563   
6  1000063    117    118     6   5.894464   
7  1000063   9669   9670     7   5.889050   
8  1000063  10124  10125     8  