In [1]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
import pyterrier as pt
import os

In [2]:
# Make sure PyTerrier is loaded before any `pt.*` call below, using the helper
# imported from tira.third_party_integrations.
# NOTE(review): no REST client object is created in this cell — the call's return
# value is not kept.
ensure_pyterrier_is_loaded()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
# Corpus: the union of the IR Anthology and the ACL Anthology (training split),
# obtained through PyTerrier's dataset registry under its "irds:" name.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

pt_index_path = './terrier-index'
index_properties = pt_index_path + "/data.properties"

if os.path.exists(index_properties):
  # An index already exists on disk from a previous run: just reference it.
  index_ref = pt.IndexRef.of(index_properties)
else:
  # No index yet: build one by streaming the corpus iterator straight into an
  # IterDictIndexer, keeping 'docno' as document metadata.
  # blocks=True is passed to the indexer — presumably so that term positions are
  # recorded for the proximity-based SDM rewrite used later; confirm in the docs.
  indexer = pt.index.IterDictIndexer(pt_index_path, blocks=True)
  index_ref = indexer.index(pt_dataset.get_corpus_iter(), meta=('docno',))

# Open the index (freshly built or pre-existing) for use by the retrievers below.
index = pt.IndexFactory.of(index_ref)

In [4]:
# Initialise PyTerrier with the terrier-prf package only if it has not been started
# yet; when PyTerrier is already running this call is skipped entirely.
# NOTE(review): ensure_pyterrier_is_loaded() runs in an earlier cell and its output
# shows PyTerrier loading, so this branch is presumably never taken — confirm
# whether terrier-prf is actually required/available for the components below.
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

# BM25 parameters handed to Terrier as controls.
# NOTE(review): b=0.1 / k_1=2.5 lie on the grid of the commented-out grid search
# further down — presumably its best setting; confirm.
configuration={"bm25.b": 0.1, "bm25.k_1": 2.5}
# First-stage ranker; also reused as the scorer for the SDM-rewritten query.
bm25 = pt.BatchRetrieve(index, wmodel="BM25", controls=configuration)
# Sequential dependence model query rewriter.
sdm = pt.rewrite.SDM()
# Bo1 query expansion over the index.
# NOTE(review): `qe` is not referenced anywhere else in the visible notebook.
qe = pt.rewrite.Bo1QueryExpansion(index)

In [11]:
topics = pt_dataset.get_topics(variant='title')

SEED=42

!pip3 install scikit-learn

from sklearn.model_selection import train_test_split

tr_va_topics, test_topics = train_test_split(topics, test_size=15, random_state=SEED)
train_topics, valid_topics =  train_test_split(tr_va_topics, test_size=5, random_state=SEED)

ltr_feats1 = bm25 >> pt.text.get_text(pt_dataset, ["text", "doc_id"]) >> (
    pt.transformer.IdentityTransformer()
    ** # sequential dependence
    (sdm >> bm25)
    ** # score of text (not originally indexed)
    (pt.text.scorer(body_attr="text", wmodel="TF_IDF", background_index=index)) 
    ** # abstract coordinate match
    pt.BatchRetrieve(index, wmodel="CoordinateMatch")
    ** 
    pt.BatchRetrieve(index, wmodel="Js_KLs")
)

# for reference, lets record the feature names here too
fnames=["BM25", "SDM", 'text', "CoordinateMatch"]

[0m

In [12]:
from sklearn.ensemble import RandomForestRegressor
!pip3 install joblib
import joblib

model_path = "./model.joblib"

if os.path.exists(model_path):
    # Load the model
    model = joblib.load(model_path)
    print("Model loaded from file.")
    rf_pipe = ltr_feats1 >> pt.ltr.apply_learned_model(model)
else:
    
    model = RandomForestRegressor(n_estimators=400, verbose=1, random_state=SEED, n_jobs=2)
    rf_pipe = ltr_feats1 >> pt.ltr.apply_learned_model(model)
    rf_pipe.fit(train_topics, pt_dataset.get_qrels())
    joblib.dump(model, model_path)
    print("Model trained and saved to file.")

print(model.feature_importances_)



[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.4s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   27.5s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:   54.0s finished


Model trained and saved to file.
[0.2235218  0.22497675 0.27938835 0.02204656 0.25006655]


In [13]:
def save_results(system, name, output_dir='../runs'):
    """Run ``system`` on the dataset topics and store the run as ``<output_dir>/<name>.txt``.

    Args:
        system: Callable retrieval pipeline. It is called with the topics returned
            by ``pt_dataset.get_topics('text')`` and its result is handed to
            ``persist_and_normalize_run``.
        name: System name recorded for the run; also the file name (without
            extension) under which the run file is finally stored.
        output_dir: Directory passed to ``persist_and_normalize_run`` as
            ``default_output`` and in which the run file is renamed.
            Defaults to ``'../runs'``.

    Raises:
        OSError: if ``<output_dir>/run.txt`` is missing after persisting (e.g.
            ``FileNotFoundError``) or cannot be moved.
    """
    run = system(pt_dataset.get_topics('text'))
    persist_and_normalize_run(run, system_name=name, default_output=output_dir)
    # NOTE(review): this assumes the run was written to <output_dir>/run.txt, as the
    # notebook output shows when running outside the TIRA sandbox — confirm the
    # location used inside the sandbox.
    # os.replace overwrites an existing target on every platform, so re-running the
    # cell with the same name does not fail where os.rename would (Windows).
    os.replace(os.path.join(output_dir, 'run.txt'),
               os.path.join(output_dir, name + '.txt'))

# Run the learned random-forest pipeline on the dataset topics and store the run
# file under ../runs, named after the system name given here.
save_results(rf_pipe, 'trained with js_kls')



[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.8s finished


The run file is normalized outside the TIRA sandbox, I will store it at "../runs".
Done. run file is stored under "../runs/run.txt".


In [8]:
# def run_bm25_grid_search_run(index, output_dir, queries):
#     """
#         defaults: http://terrier.org/docs/current/javadoc/org/terrier/matching/models/BM25.html
#         k_1 = 1.2d, k_3 = 8d, b = 0.75d
#         We do not tune parameter k_3, as this parameter only impacts queries with redundant terms.
#     """
#     for b in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
#         for k_1 in [0.5, 1.0, 1.5, 2.0, 2.5]:
#             system = f'bm25-b={b}-k_1={k_1}'
#             configuration = {"bm25.b" : b, "bm25.k_1": k_1}
#             run_output_dir = output_dir + '/' + system
#             !rm -Rf {run_output_dir}
#             !mkdir -p {run_output_dir}
#             print(f'Run {system}')
#             BM25 = pt.BatchRetrieve(index, wmodel="BM25", controls=configuration, verbose=True)
#             run = BM25(queries)
#             persist_and_normalize_run(run, system, run_output_dir)

# run_bm25_grid_search_run(index, 'new-grid-search', pt_dataset.get_topics('text'))

In [9]:
# !pip3 install trectools

# from glob import glob
# from trectools import TrecRun, TrecQrel, TrecEval
# import pandas as pd
# from tira.rest_api_client import Client

# tira = Client()

# def evaluate_run(run_dir, qrels):
#     run = TrecRun(run_dir + '/run.txt')
#     trec_eval = TrecEval(run, qrels)

#     return {
#         'run': run.get_runid(),
#         'nDCG@10': trec_eval.get_ndcg(depth=10),
#         'nDCG@10 (unjudgedRemoved)': trec_eval.get_ndcg(depth=10, removeUnjudged=True),
#         'MAP': trec_eval.get_map(depth=10),
#         'MRR': trec_eval.get_reciprocal_rank(),
#         'P@10': trec_eval.get_precision(depth=10)
#     }

# def load_qrels(dataset):
#     return TrecQrel(tira.download_dataset('ir-lab-sose-2024', dataset, truth_dataset=True) + '/qrels.txt')

# training_qrels = load_qrels('ir-acl-anthology-20240504-training')

# df = []
# for r in glob('grid-search/training/bm25*'):
#     df += [evaluate_run(r, training_qrels)]
# df = pd.DataFrame(df)
# df.sort_values('nDCG@10', ascending=False)