In [56]:
import json
import os
from collections import defaultdict
import numpy as np
import pandas as pd

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [58]:
ensure_pyterrier_is_loaded()
tira = Client()
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')
topics = pt_dataset.get_topics('text')

In [59]:
qrels_df = pt_dataset.get_qrels()

# qrels_dict: {qid: {docno: label}}
qrels_dict = defaultdict(dict)

for i, row in qrels_df.iterrows():
    qrels_dict[row['qid']][row['docno']] = row['label']

print(len(qrels_df))

2623


In [60]:
if os.getcwd().endswith('pagerank'):
    os.chdir('..')

In [61]:
with open('data/docs_with_all_info.json', 'r') as file:
    documents = json.load(file)

In [62]:
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

bm25 = pt.BatchRetrieve(index, wmodel="BM25")

bm25_run = bm25(pt_dataset.get_topics('text'))
print(bm25_run.head())

  qid   docid                               docno  rank      score  \
0   1   94858        2004.cikm_conference-2004.47     0  15.681777   
1   1  125137   1989.ipm_journal-ir0volumeA25A4.2     1  15.047380   
2   1  125817  2005.ipm_journal-ir0volumeA41A5.11     2  14.144223   
3   1    5868                            W05-0704     3  14.025748   
4   1   84876       2016.ntcir_conference-2016.90     4  13.947994   

                                      query  
0  retrieval system improving effectiveness  
1  retrieval system improving effectiveness  
2  retrieval system improving effectiveness  
3  retrieval system improving effectiveness  
4  retrieval system improving effectiveness  


In [63]:
print(len(bm25_run))

66283


In [64]:
bm25_dict = defaultdict(dict)

for i, row in bm25_run.iterrows():
    bm25_dict[row['qid']][row['docno']] = {'bm25_score': row['score'], 
                                           'bm25_rank': row['rank'], 
                                           'pagerank_score': documents[row['docno']]['pagerank']}

In [65]:
bm25_imputation = np.mean([scores['bm25_rank'] for doc_dict in bm25_dict.values() for scores in doc_dict.values()])
pagerank_imputation = np.mean([documents[docno]['pagerank'] for docno in documents])

print("BM25 Imputation: ", bm25_imputation)
print("Pagerank Imputation: ", pagerank_imputation)

BM25 Imputation:  497.95668572635515
Pagerank Imputation:  2.0335464700349214e-06


In [66]:
data_dict = defaultdict(dict)

for qid, doc_dict in qrels_dict.items():
    for docno, label in doc_dict.items():
        data_dict[qid][docno] = {}
        try:
            data_dict[qid][docno]['bm25_score'] = bm25_dict[qid][docno]['bm25_score']
        except:
            data_dict[qid][docno]['bm25_score'] =  bm25_imputation

        try:
            data_dict[qid][docno]['pagerank_score'] = bm25_dict[qid][docno]['pagerank_score']
        except:
            data_dict[qid][docno]['pagerank_score'] = pagerank_imputation
        
        try:
            data_dict[qid][docno]['label'] = qrels_dict[qid][docno]
        except:
            data_dict[qid][docno]['label'] = 0
        

In [67]:
#Flatten the data
flattened_data = []
for qid, docs in data_dict.items():
    for docno, values in docs.items():
        flattened_data.append({
            'qid': qid,
            'docno': docno,
            'bm25': values['bm25_score'],
            'pagerank': values['pagerank_score'],
            'label': values['label']
        })

df = pd.DataFrame(flattened_data)

print(flattened_data)

[{'qid': '1', 'docno': '2005.ipm_journal-ir0volumeA41A1.7', 'bm25': 11.444355719402566, 'pagerank': 3.123153015824058e-06, 'label': 1}, {'qid': '1', 'docno': '2019.tois_journal-ir0volumeA37A1.2', 'bm25': 10.496959906589126, 'pagerank': 1.6112608969823924e-06, 'label': 1}, {'qid': '1', 'docno': '2008.sigirconf_conference-2008.127', 'bm25': 12.034894938308014, 'pagerank': 1.571803899539206e-06, 'label': 1}, {'qid': '1', 'docno': '2015.ipm_journal-ir0volumeA51A5.7', 'bm25': 10.365889225118261, 'pagerank': 1.8235531534885565e-06, 'label': 0}, {'qid': '1', 'docno': '2008.tois_journal-ir0volumeA27A1.1', 'bm25': 9.563732905465484, 'pagerank': 1.6185283952887325e-05, 'label': 0}, {'qid': '1', 'docno': '1999.ntcir_workshop-1999.31', 'bm25': 12.338898222636612, 'pagerank': 1.956256695139485e-06, 'label': 1}, {'qid': '1', 'docno': '2001.sigirconf_workshop-2001w1.0', 'bm25': 497.95668572635515, 'pagerank': 2.0335464700349214e-06, 'label': 0}, {'qid': '1', 'docno': '2018.wsdm_conference-2018.47', '

In [68]:
#Prepare training data
X = df[['bm25', 'pagerank']]
y = df['label']

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#Train logistic regression model
model = LogisticRegression()
model.fit(X, y)

In [70]:
# Predict relevance scores
df['predicted_relevance'] = model.predict_proba(X)[:, 1]

In [71]:
# Select top 10 documents for each query
top_10_docs = df.sort_values(by=['qid', 'predicted_relevance'], ascending=[True, False]).groupby('qid').head(10)

In [72]:
# Dump the resulting document orders into a JSON file
result = {}
for qid, group in top_10_docs.groupby('qid'):
    result[qid] = group[['docno', 'predicted_relevance']].to_dict(orient='records')

with open('top_10_docs.json', 'w') as f:
    json.dump(result, f, indent=4)

# Display the results
for qid, group in top_10_docs.groupby('qid'):
    print(f"Top 10 documents for query {qid}:")
    print(group[['docno', 'bm25', 'pagerank', 'predicted_relevance']])
    print("\n")

# Print learned weights
print("Learned weights for BM25 and PageRank:", model.coef_)

Top 10 documents for query 1:
                                           docno       bm25  pagerank  \
34                   2014.cikm_conference-2014.9   9.110971  0.000002   
39               1998.sigirconf_conference-98.22   9.452847  0.000003   
4             2008.tois_journal-ir0volumeA27A1.1   9.563733  0.000016   
29              2012.wwwconf_conference-2012c.84   9.630197  0.000002   
3              2015.ipm_journal-ir0volumeA51A5.7  10.365889  0.000002   
40                 2009.cikm_conference-2009.277  10.452920  0.000002   
1             2019.tois_journal-ir0volumeA37A1.2  10.496960  0.000002   
35  2018.sigirjournals_journal-ir0volumeA52A2.23  10.527326  0.000002   
12                  2008.cikm_conference-2008.76  10.763251  0.000002   
38             2003.sigirconf_conference-2003.49  10.779634  0.000004   

    predicted_relevance  
34             0.562589  
39             0.562465  
4              0.562424  
29             0.562400  
3              0.562132  
40        

In [73]:
# # Predict probabilities for each document
# probabilities = model.predict_proba(X_test)

# # Get the index of the document with the highest probability for each query
# best_document_indices = probabilities.argmax(axis=1)

# # Retrieve the best documents
# best_documents = [documents[i] for i in best_document_indices]

# # Print or return the best documents
# for query, doc in zip(queries, best_documents):
#     print(f"Query: {query}\nBest Document: {doc}\n")