In [1]:
import json
import os
from collections import defaultdict
import numpy as np
import pandas as pd

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
ensure_pyterrier_is_loaded()
tira = Client()
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')
topics = pt_dataset.get_topics('text')

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
qrels_df = pt_dataset.get_qrels()

# qrels_dict: {qid: {docno: label}}
qrels_dict = defaultdict(dict)

for i, row in qrels_df.iterrows():
    qrels_dict[row['qid']][row['docno']] = row['label']

print(len(qrels_df))

2623


In [5]:
if os.getcwd().endswith('pagerank'):
    os.chdir('..')

In [6]:
with open('data/docs_with_all_info.json', 'r') as file:
    documents = json.load(file)

In [7]:
df_docs = pd.DataFrame(documents)
df_docs = df_docs.transpose()

df_docs['docno'] = df_docs.index

df_docs['text'] = df_docs['title'] + ' ' + df_docs['abstract']
df_docs.drop(columns=['pagerank', 'title', 'abstract', 'discounted_pagerank', 'pub_date'], inplace=True)

df_docs.reset_index(drop=True, inplace=True)

df_docs = df_docs.to_dict(orient='records')

In [8]:
print(os.getcwd())

#Create the index using PyTerrier
indexer = pt.IterDictIndexer(
    "../index/index_retrieved_docs",
    overwrite=True,
    fields=["text"],
    meta=["docno"]
)

# Index the documents
index = indexer.index(df_docs)

#Retrieve documents using BM25
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

/workspaces/SE-shared-task-v2/ir-lab-sose-2024-augsburg-information-retrieval


In [9]:
if os.getcwd().endswith('pagerank'):
    os.chdir('..')

print(os.getcwd())

/workspaces/SE-shared-task-v2/ir-lab-sose-2024-augsburg-information-retrieval


In [10]:
bm25_run = bm25(pt_dataset.get_topics('text'))

In [11]:
print(len(bm25_run))

66391


In [12]:
bm25_dict = defaultdict(dict)

for i, row in bm25_run.iterrows():
    bm25_dict[row['qid']][row['docno']] = {'bm25_score': row['score'], 
                                           'bm25_rank': row['rank'], 
                                           'pagerank_score': documents[row['docno']]['pagerank'],
                                           'discounted_pagerank_score': documents[row['docno']]['discounted_pagerank']}

In [13]:
bm25_imputation = np.mean([scores['bm25_rank'] for doc_dict in bm25_dict.values() for scores in doc_dict.values()])
pagerank_imputation = np.mean([documents[docno]['pagerank'] for docno in documents])
discounted_pagerank_imputation = np.mean([documents[docno]['discounted_pagerank'] for docno in documents])

print("BM25 Imputation: ", bm25_imputation)
print("Pagerank Imputation: ", pagerank_imputation)
print("Discounted Pagerank Imputation: ", discounted_pagerank_imputation)

BM25 Imputation:  497.66620475666883
Pagerank Imputation:  2.0335444620543548e-06
Discounted Pagerank Imputation:  8.441582033060812e-07


In [14]:
data_dict = defaultdict(dict)

for qid, doc_dict in qrels_dict.items():
    for docno, label in doc_dict.items():
        data_dict[qid][docno] = {}
        try:
            data_dict[qid][docno]['bm25_score'] = bm25_dict[qid][docno]['bm25_score']
        except:
            data_dict[qid][docno]['bm25_score'] =  bm25_imputation

        try:
            data_dict[qid][docno]['pagerank_score'] = bm25_dict[qid][docno]['pagerank_score']
        except:
            data_dict[qid][docno]['pagerank_score'] = pagerank_imputation

        try:
            data_dict[qid][docno]['discounted_pagerank_score'] = bm25_dict[qid][docno]['discounted_pagerank_score']
        except:
            data_dict[qid][docno]['discounted_pagerank_score'] = discounted_pagerank_imputation
        
        try:
            data_dict[qid][docno]['label'] = qrels_dict[qid][docno]
        except:
            data_dict[qid][docno]['label'] = 0

In [15]:
#Flatten the data
flattened_data = []
for qid, docs in data_dict.items():
    for docno, values in docs.items():
        flattened_data.append({
            'qid': qid,
            'docno': docno,
            'bm25': values['bm25_score'],
            'pagerank': values['pagerank_score'],
            'discounted_pagerank': values['discounted_pagerank_score'],
            'label': values['label']
        })

df = pd.DataFrame(flattened_data)

In [16]:
#Prepare training data
X = df[['bm25', 'discounted_pagerank']]
y = df['label']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=420)

model = LogisticRegression()
model.fit(X, y)

In [18]:
# Predict relevance scores
df['predicted_relevance'] = model.predict_proba(X)[:, 1]

In [19]:
# Select top 10 documents for each query
top_10_docs = df.sort_values(by=['qid', 'predicted_relevance'], ascending=[True, False]).groupby('qid').head(10)

In [20]:
# Save the resulting document orders in a JSON file
result = {}
for qid, group in top_10_docs.groupby('qid'):
    result[qid] = group[['docno', 'predicted_relevance']].to_dict(orient='records')

with open('top_10_docs.json', 'w') as f:
    json.dump(result, f, indent=4)

# Print learned weights
print("Learned weights for BM25 and PageRank:", model.coef_)

Learned weights for BM25 and PageRank: [[-1.68452733e-03  2.78602961e-07]]


In [58]:
run = df[['qid', 'docno', 'predicted_relevance', 'bm25']]

run.rename(columns={'predicted_relevance': 'score'}, inplace=True)

run.sort_values(by=['qid', 'score'], ascending=[True, False], inplace=True)

run['rank'] = run.groupby('qid').cumcount()

print(run.head(5))

   qid                               docno     score      bm25  rank
1    1  2019.tois_journal-ir0volumeA37A1.2  0.575487  8.840352     0
3    1   2015.ipm_journal-ir0volumeA51A5.7  0.575381  9.098508     1
39   1     1998.sigirconf_conference-98.22  0.575288  9.323570     2
17   1        2008.cikm_conference-2008.59  0.575184  9.576583     3
40   1       2009.cikm_conference-2009.277  0.575121  9.730872     4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run.rename(columns={'predicted_relevance': 'score'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run.sort_values(by=['qid', 'score'], ascending=[True, False], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run['rank'] = run.groupby('qid').cumcount()


In [59]:
eval = pt.Evaluate(run, qrels_df, metrics=["map", "ndcg", "ndcg_cut.10", "recip_rank", "recall_100"])
print("Evaluation Metrics:")
print(eval)

Evaluation Metrics:
{'map': 0.547369892273752, 'ndcg': 0.7323267767648299, 'ndcg_cut.10': 0.45129355824419753, 'recip_rank': 0.5550540146128383, 'recall_100': 0.9852941176470589}


In [60]:
persist_and_normalize_run(run, system_name='BM25_and_discounted_pagerank', default_output='runs/bm25_and_discounted_pagerank_run.txt')

The run file is normalized outside the TIRA sandbox, I will store it at "runs/bm25_and_discounted_pagerank_run.txt".
Done. run file is stored under "runs/bm25_and_discounted_pagerank_run.txt".


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run['qid'] = run['qid'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run['system'] = system_name


In [61]:
################################################
# Optionally use rerank

from ReRank.ReRank import ReRank

# Create a df with the text documents
documents = pt_dataset.get_corpus_iter() 
# Extract docno and text into a DataFrame
doc_list = []
for doc in documents:
    doc_list.append(doc)
documents = pd.DataFrame(doc_list)

queries = pt_dataset.get_topics('text')
run['qid'] = run['qid'].astype(str)


run = run.merge(documents, on='docno', how='left') # Merge the run and the documents
run = run.merge(queries, on='qid', how='left') # Merge the run and the queries
print('Done. Here are the first 10 entries of the run')
run.head(3)

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:01<00:00, 67870.56it/s]


Done. Here are the first 10 entries of the run


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  run['qid'] = run['qid'].astype(str)


Unnamed: 0,qid,docno,score,bm25,rank,system,text,query
0,1,2019.tois_journal-ir0volumeA37A1.2,0.575487,8.840352,0,BM25_and_discounted_pagerank,Learning to Adaptively Rank Document Retrieval...,retrieval system improving effectiveness
1,1,2015.ipm_journal-ir0volumeA51A5.7,0.575381,9.098508,1,BM25_and_discounted_pagerank,Statistical comparisons of non-deterministic I...,retrieval system improving effectiveness
2,1,1998.sigirconf_conference-98.22,0.575288,9.32357,2,BM25_and_discounted_pagerank,"Aspect Windows, 3-D Visualizations, and Indire...",retrieval system improving effectiveness


In [62]:


# Create ReRank object
ReRank_object = ReRank()

# Rerank the documents
run = ReRank_object.rerank_documents(df=run, top_n=5)



In [63]:
persist_and_normalize_run(run, system_name='BM25_and_discounted_pagerank_reranked', default_output='runs/bm25_and_discounted_pagerank_reranked_run.txt')

The run file is normalized outside the TIRA sandbox, I will store it at "runs/bm25_and_discounted_pagerank_reranked_run.txt".
Done. run file is stored under "runs/bm25_and_discounted_pagerank_reranked_run.txt".
