In [2]:
import json
import os
from collections import defaultdict
import numpy as np
import pandas as pd
import gzip
import joblib

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [4]:
ensure_pyterrier_is_loaded()
tira = Client()
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')
topics = pt_dataset.get_topics('text')

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [5]:
if not os.path.isfile('data/docs_with_all_info.json'):
    documents = tira.get_run_output('ir-lab-sose-2024/augsburg-information-retrieval/enriched-documents', 'ir-lab-sose-2024/ir-acl-anthology-20240504-training')
    os.makedirs('data', exist_ok=True)
    with gzip.open(f'{documents}/enriched-documents.json.gz', 'rt') as compressed_documents, open('data/docs_with_all_info.json', 'w') as documents:
        documents.write(compressed_documents.read())

with open('data/docs_with_all_info.json', 'r') as file:
    documents = json.load(file)

In [6]:
df_docs = pd.DataFrame(documents)
df_docs = df_docs.transpose()

df_docs['docno'] = df_docs.index

df_docs['text'] = df_docs['title'] + ' ' + df_docs['abstract']
df_docs.drop(columns=['pagerank', 'title', 'abstract', 'discounted_pagerank', 'pub_date'], inplace=True)

df_docs.reset_index(drop=True, inplace=True)

df_docs = df_docs.to_dict(orient='records')

In [7]:
#Create the index using PyTerrier
indexer = pt.IterDictIndexer(
    "../index/index_retrieved_docs",
    overwrite=True,
    fields=["text"],
    meta=["docno"]
)

# Index the documents
index = indexer.index(df_docs)

#Retrieve documents using BM25
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [8]:
bm25_run = bm25(pt_dataset.get_topics('text'))

In [9]:
print(len(bm25_run))

66391


In [10]:
bm25_dict = defaultdict(dict)

for i, row in bm25_run.iterrows():
    bm25_dict[row['qid']][row['docno']] = {'bm25_score': row['score'], 
                                           'bm25_rank': row['rank'], 
                                           'pagerank_score': documents[row['docno']]['pagerank'],
                                           'discounted_pagerank_score': documents[row['docno']]['discounted_pagerank']}

In [11]:
bm25_imputation = np.mean([scores['bm25_rank'] for doc_dict in bm25_dict.values() for scores in doc_dict.values()])
pagerank_imputation = np.mean([documents[docno]['pagerank'] for docno in documents])
discounted_pagerank_imputation = np.mean([documents[docno]['discounted_pagerank'] for docno in documents])

print("BM25 Imputation: ", bm25_imputation)
print("Pagerank Imputation: ", pagerank_imputation)
print("Discounted Pagerank Imputation: ", discounted_pagerank_imputation)

BM25 Imputation:  497.66620475666883
Pagerank Imputation:  2.0335444620543548e-06
Discounted Pagerank Imputation:  8.441582033060812e-07


In [12]:
logistic_regression_model = joblib.load('linear_regression_model.joblib')
logistic_regression_model.coef_

array([[-1.68452733e-03,  2.78602961e-07]])

In [25]:
data_dict = defaultdict(dict)

for qid, doc_dict in bm25_dict.items():
    for docno, label in doc_dict.items():
        data_dict[qid][docno] = {}
        try:
            data_dict[qid][docno]['bm25_score'] = bm25_dict[qid][docno]['bm25_score']
        except:
            data_dict[qid][docno]['bm25_score'] =  bm25_imputation

        try:
            data_dict[qid][docno]['pagerank_score'] = bm25_dict[qid][docno]['pagerank_score']
        except:
            data_dict[qid][docno]['pagerank_score'] = pagerank_imputation

        try:
            data_dict[qid][docno]['discounted_pagerank_score'] = bm25_dict[qid][docno]['discounted_pagerank_score']
        except:
            data_dict[qid][docno]['discounted_pagerank_score'] = discounted_pagerank_imputation


In [27]:
#Flatten the data
flattened_data = []
for qid, docs in data_dict.items():
    for docno, values in docs.items():
        flattened_data.append({
            'qid': qid,
            'docno': docno,
            'bm25': values['bm25_score'],
            'pagerank': values['pagerank_score'],
            'discounted_pagerank': values['discounted_pagerank_score']
        })

df = pd.DataFrame(flattened_data)

In [28]:
df

Unnamed: 0,qid,docno,bm25,pagerank,discounted_pagerank
0,1,2004.jasis_journal-ir0volumeA55A10.2,14.091164,0.000004,2.281242e-06
1,1,2004.cikm_conference-2004.47,13.753853,0.000002,2.930823e-07
2,1,2009.sigirconf_conference-2009.113,13.082214,0.000003,5.852006e-07
3,1,1982.sigirjournals_journal-ir0volumeA16A3.3,12.968918,0.000002,1.011560e-06
4,1,X93-1008,12.910271,0.000003,1.252759e-07
...,...,...,...,...,...
66386,18,2015.sigirconf_conference-2015.191,6.033528,0.000002,6.098719e-07
66387,18,2002.sigirjournals_journal-ir0volumeA36A2.7,6.029317,0.000002,1.684238e-07
66388,18,P85-1017,6.029111,0.000004,6.062971e-08
66389,18,2016.ismir_conference-2016.53,6.029111,0.000002,9.866248e-07


In [29]:
#Prepare training data
X = df[['bm25', 'discounted_pagerank']]

In [30]:
# Predict relevance scores
df['predicted_relevance'] = logistic_regression_model.predict_proba(X)[:, 1]

In [31]:
# Select top 10 documents for each query
top_10_docs = df.sort_values(by=['qid', 'predicted_relevance'], ascending=[True, False]).groupby('qid').head(10)

In [32]:
# Save the resulting document orders in a JSON file
result = {}
for qid, group in top_10_docs.groupby('qid'):
    result[qid] = group[['docno', 'predicted_relevance']].to_dict(orient='records')

with open('top_10_docs.json', 'w') as f:
    json.dump(result, f, indent=4)

# Print learned weights
print("Learned weights for BM25 and PageRank:", logistic_regression_model.coef_)

Learned weights for BM25 and PageRank: [[-1.68452733e-03  2.78602961e-07]]


In [33]:
run = df[['qid', 'docno', 'predicted_relevance', 'bm25']]

run.rename(columns={'predicted_relevance': 'score'}, inplace=True)

run.sort_values(by=['qid', 'score'], ascending=[True, False], inplace=True)

run['rank'] = run.groupby('qid').cumcount()

print(run.head(5))

    qid                         docno     score      bm25  rank
999   1  2013.trec_conference-2013.52  0.575540  8.710547     0
998   1  2005.clef_workshop-2005w.103  0.575540  8.712382     1
997   1  2001.cikm_conference-2001.72  0.575539  8.714200     2
996   1                      P98-2205  0.575538  8.715427     3
995   1                      P12-1029  0.575538  8.717390     4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run.rename(columns={'predicted_relevance': 'score'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run.sort_values(by=['qid', 'score'], ascending=[True, False], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run['rank'] = run.groupby('qid').cumcount()


In [36]:
persist_and_normalize_run(run, system_name='BM25_and_discounted_pagerank', default_output='.')

The run file is normalized outside the TIRA sandbox, I will store it at ".".
Done. run file is stored under "./run.txt".
