In [1]:
# Install the packages this notebook imports below (TIRA client, ir_datasets, PyTerrier).
!pip3 install tira ir-datasets python-terrier

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run, ir_datasets
import pyterrier as pt
import os
# Make sure PyTerrier is loaded before any pt.* functionality is used in later cells.
ensure_pyterrier_is_loaded()
from tira.rest_api_client import Client
from glob import glob
import pandas as pd
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
tira = Client()

[0m

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
# Corpus used throughout this notebook: the union of the IR Anthology and the ACL Anthology.
# PyTerrier resolves the 'irds:'-prefixed identifier to the corresponding dataset object.
dataset = 'ir-lab-sose-2024/ir-acl-anthology-20240504-training'
pt_dataset = pt.get_dataset('irds:' + dataset)

pt_index_path = './terrier-index'
# The presence of this file is what marks an index as already built.
index_properties = pt_index_path + "/data.properties"

if os.path.exists(index_properties):
    # An index from an earlier run is on disk: reuse it instead of re-indexing.
    index_ref = pt.IndexRef.of(index_properties)
else:
    # No index yet: build one with the IterDictIndexer, with blocks enabled.
    indexer = pt.index.IterDictIndexer(pt_index_path, blocks=True)

    # The corpus iterator of the dataset is handed to the indexer directly;
    # only 'docno' is recorded as document metadata.
    index_ref = indexer.index(
        pt_dataset.get_corpus_iter(),
        meta=('docno',),
    )

# Open the index (freshly built or pre-existing) for retrieval.
index = pt.IndexFactory.of(index_ref)

In [3]:
# BM25 parameters passed to the `bm25` retriever below.
best_b = 0.1
best_k_1 = 1.9

bm25_controls = {"bm25.b": best_b, "bm25.k_1": best_k_1}
bm25 = pt.BatchRetrieve(index, wmodel="BM25", controls=bm25_controls)

# SDM query rewriter; it is constructed here but no later cell shown in this notebook uses it.
sdm = pt.rewrite.SDM()

In [4]:
# Load the dataset through TIRA's ir_datasets integration and read its topics (TREC-XML format).
dataset_tira = ir_datasets.load(dataset)
topics_path = ir_datasets.topics_file(dataset)
queries = pt.io.read_topics(topics_path, format='trecxml')


def _load_expansion(approach):
    """Return the TIRA query transformer for `approach` on `dataset`, using the 'llm_expansion_' prefix."""
    return tira.pt.transform_queries(approach, dataset, prefix='llm_expansion_')


# LLM expansions produced with GPT-3.5.
# Not loaded: 'workshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-cot' (gpt_cot).
gpt_sq_fs = _load_expansion('workshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-sq-fs')
gpt_sq_zs = _load_expansion('ir-benchmarks/tu-dresden-03/qe-gpt3.5-sq-zs')

# LLM expansions produced with Llama.
# Not loaded: 'ir-benchmarks/tu-dresden-03/qe-llama-sq-fs' (llama_sq_fs).
llama_cot = _load_expansion('ir-benchmarks/tu-dresden-03/qe-llama-cot')
llama_sq_zs = _load_expansion('ir-benchmarks/tu-dresden-03/qe-llama-sq-zs')

# LLM expansions produced with Flan-UL2.
# Not loaded: 'ir-benchmarks/tu-dresden-03/qe-flan-ul2-cot' (flan_cot) and
# 'ir-benchmarks/tu-dresden-03/qe-flan-ul2-sq-zs' (flan_sq_zs).
flan_sq_fs = _load_expansion('ir-benchmarks/tu-dresden-03/qe-flan-ul2-sq-fs')

In [5]:
# Title variant of the dataset's topics.
topics = pt_dataset.get_topics(variant='title')

# Terrier's Java Tokeniser class, reached through PyTerrier's autoclass bridge;
# getTokeniser() supplies the tokeniser instance used by pt_tokenize.
_tokeniser_cls = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser")
tokeniser = _tokeniser_cls.getTokeniser()

def pt_tokenize(text):
    """Tokenise `text` with the module-level Terrier tokeniser and return the tokens joined by single spaces."""
    tokens = tokeniser.getTokens(text)
    return ' '.join(tokens)

def expand_query(topic):
    """Build the expanded query string for one topic.

    The topic's 'query' is repeated five times and followed by its
    'llm_expansion_query'; the space-joined result is then passed through
    pt_tokenize and returned.
    """
    original_query = topic['query']
    # Five copies of the original query, then a single copy of the LLM expansion
    # (presumably so the original terms outweigh the expansion terms).
    parts = [original_query] * 5
    parts.append(topic['llm_expansion_query'])

    # apply the tokenization
    return pt_tokenize(' '.join(parts))

# Turn expand_query into a PyTerrier query-rewriting transformer.
# Documentation: https://pyterrier.readthedocs.io/en/latest/apply.html
pt_expand_query = pt.apply.query(expand_query)

# Every pipeline has the same three stages:
#   LLM expansion lookup -> query rewriting with expand_query -> BM25 retrieval.

# GPT-3.5 expansions
pipeline_gpt_sq_fs = gpt_sq_fs >> pt_expand_query >> bm25
pipeline_gpt_sq_zs = gpt_sq_zs >> pt_expand_query >> bm25

# Llama expansions
pipeline_llama_cot = llama_cot >> pt_expand_query >> bm25
pipeline_llama_sq_zs = llama_sq_zs >> pt_expand_query >> bm25

# Flan-UL2 expansions
pipeline_flan_sq_fs = flan_sq_fs >> pt_expand_query >> bm25

In [6]:
# Start PyTerrier with the terrier-prf boot package, but only if it has not been started yet.
# NOTE(review): ensure_pyterrier_is_loaded() runs in the first cell, so pt.started() is presumably
# already true here and this pt.init() call is skipped — confirm.
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])
    
# Title variant of the topics; this re-assigns `topics` using the same call as the previous cell.
topics = pt_dataset.get_topics(variant='title')

# Seed shared by both splits below so they are reproducible.
SEED=42

# scikit-learn provides train_test_split, imported right after installation.
!pip3 install scikit-learn

from sklearn.model_selection import train_test_split

# Two-stage split of the topics: first hold out the test topics, then split the remainder
# into training and validation topics. test_size is given as an integer in both calls,
# which scikit-learn treats as an absolute number of samples (15 test, then 5 validation).
tr_va_topics, test_topics = train_test_split(topics, test_size=15, random_state=SEED)
train_topics, valid_topics =  train_test_split(tr_va_topics, test_size=5, random_state=SEED)

# First stage: BM25 retrieval, followed by fetching the "text" and "doc_id" fields for each result.
first_stage = bm25 >> pt.text.get_text(pt_dataset, ["text", "doc_id"])

# Two index-based features that do not rely on LLM expansions.
js_kls_feature = pt.BatchRetrieve(index, wmodel="Js_KLs")
kl_expansion_feature = pt.BatchRetrieve(index, wmodel="BM25", controls={"qe":"on", "qemodel" : "KL"})

# Feature union (the ** operator); the order here is the order of the feature names in `fnames`.
ltr_features = (
    js_kls_feature            # Js_KLs weighting model
    ** kl_expansion_feature   # BM25 with KL query expansion
    ** pipeline_gpt_sq_fs     # ChatGPT similar queries, few-shot
    ** pipeline_gpt_sq_zs     # ChatGPT similar queries, zero-shot
    ** pipeline_llama_cot     # Llama-2 chain of thought
    ** pipeline_llama_sq_zs   # Llama-2 similar queries, zero-shot
    ** pipeline_flan_sq_fs    # Flan-UL2 similar queries, few-shot
)

ltr_feats1 = first_stage >> ltr_features

# Feature names, kept in the same order as the union above, for reference.
fnames = [
    "Js_KLs",
    "KL",
    "gpt_sq_fs",
    "gpt_sq_zs",
    "llama_cot",
    "llama_sq_zs",
    "flan_sq_fs",
]

[0m

In [7]:
from sklearn.ensemble import RandomForestRegressor
!pip3 install joblib
import joblib

# Location of the pre-trained learning-to-rank model file.
model_path = "/app/baseline-retrieval-system/model9.joblib"

# Fail early and name the missing file, so a wrong path or a missing model is
# easy to diagnose. FileNotFoundError is still an Exception subclass, so any
# existing `except Exception` handling keeps working.
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model not found: {model_path}")

# NOTE: joblib.load unpickles arbitrary Python objects; only load model files
# that come from a trusted source.
model = joblib.load(model_path)
print("Model loaded from file.")

# Final ranking pipeline: compute the LTR features, then score them with the loaded model.
trained_pipe = ltr_feats1 >> pt.ltr.apply_learned_model(model)

Model loaded from file.


In [8]:
# Apply the trained pipeline to the 'text' topics of the dataset, then hand the
# resulting run to TIRA's helper, which stores it under '../runs' by default.
run_topics = pt_dataset.get_topics('text')
run = trained_pipe(run_topics)
persist_and_normalize_run(
    run,
    system_name="RandomForestRegressor",
    default_output='../runs',
)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:    0.9s finished


The run file is normalized outside the TIRA sandbox, I will store it at "../runs".
Done. run file is stored under "../runs/run.txt".
