In [4]:
import sys

import json
import pyterrier as pt

from dotenv import dotenv_values
import sqlalchemy

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
import sys

# Start PyTerrier once; the guard skips init() when it is already running.
if not pt.started():
    pt.init()

# Read the database connection settings from the project's .env file.
# The keys used later in this notebook are USER, PASSWORD, ADDRESS, PORT and DB.
db_vals = dotenv_values("/workspaces/CORD19_Plus/.env")

from cord19_plus.data_model.model import Document
from cord19_plus.data_model.model import Table


  if not pt.started():


In [5]:
# Collect the id of every document listed in the index file: one JSON object
# per line, id = the 'pdf_path' value up to its first '.'.
with open('/workspaces/CORD19_Plus/data/index.jsonl', 'r') as index_file:
    avail_ids = [
        json.loads(raw_line.strip())['pdf_path'].split('.')[0]
        for raw_line in index_file
    ]


In [6]:
# Dataset handle that later cells use to fetch topics and qrels.
dataset = pt.get_dataset('irds:cord19/fulltext/trec-covid')

In [7]:
# Assemble the connection URL from its parts instead of interpolating them into
# a string: URL.create escapes the credentials, so a password containing
# characters such as '@', ':' or '/' no longer yields a malformed URL.
db_url = sqlalchemy.engine.URL.create(
    drivername="postgresql+psycopg2",
    username=db_vals['USER'],
    password=db_vals['PASSWORD'],
    host=db_vals['ADDRESS'],
    port=int(db_vals['PORT']),  # .env values are strings
    database=db_vals['DB'],
)
engine = create_engine(db_url, echo=False)

# Session used by the following cells to query the ORM models.
session = Session(engine)

In [8]:
# Query over all stored Table rows; it is iterated by the comprehension below.
result = session.query(Table)

# Flatten every table into a plain dict. The table's ir_tab_id (as a string)
# becomes "docno"; content_json is not part of the record.
result_dict = [
    {
        "docno": str(table.ir_tab_id),
        "ir_id": table.ir_id,
        "header": table.header,
        "content": table.content,
        "caption": table.caption,
        "references": table.references,
    }
    for table in result
]

In [None]:
import json
def map_content_json_to_string(result_dict):
    """
    Replaces dict-valued 'content_json' entries with their string representation.

    The records are modified in place. Records that have no 'content_json' key,
    or whose value is not a dict, are left untouched.

    :param result_dict: List of record dictionaries to update.
    :return: None.
    """
    for record in result_dict:
        # .get() instead of indexing: records are not guaranteed to carry the key.
        content_json = record.get('content_json')
        if isinstance(content_json, dict):
            record['content_json'] = dict_to_string(content_json)
        
def dict_to_string(d, indent=0):
    """
    Renders a (possibly nested) dictionary as indented "key: value" lines.

    Every entry produces one line terminated by a newline. A dict value
    produces a "key:" line followed by its own entries, indented by two
    further spaces per nesting level.

    :param d: Dictionary to render.
    :param indent: Nesting level of ``d``; each level adds two spaces.
    :return: The rendered text ("" for an empty dictionary).
    """
    pad = "  " * indent
    pieces = []
    for key, value in d.items():
        if not isinstance(value, dict):
            pieces.append(f"{pad}{key}: {value}\n")
            continue
        # Nested dictionary: emit the key on its own line, then recurse one level deeper.
        pieces.append(f"{pad}{key}:\n")
        pieces.append(dict_to_string(value, indent + 1))
    return "".join(pieces)

def load_table_qrels(path):
    """
    Loads a table qrels file into a pandas DataFrame.

    Every non-blank line must hold four whitespace-separated fields:
    qid, qo, docno and label. Fields may be separated by spaces or tabs
    (also repeated ones); blank lines are ignored.

    :param path: Path to the table qrels file.
    :return: DataFrame with the columns "qid", "qo", "docno" and "label";
             "label" is converted to a numeric dtype, the other columns
             keep the field text.
    """
    with open(path, 'r') as f:
        # split() without arguments tolerates tabs and repeated spaces and
        # returns an empty list for blank lines, which are dropped here.
        entries = [fields for fields in (line.split() for line in f) if fields]

    qrels = pd.DataFrame(entries, columns=["qid", "qo", "docno", "label"])
    qrels['label'] = pd.to_numeric(qrels['label'], downcast='integer')
    return qrels

In [None]:
# Convert dict-valued 'content_json' entries of the records to strings, in place.
# NOTE(review): the records built for result_dict carry no 'content_json' key — confirm this step is still needed.
map_content_json_to_string(result_dict)

In [12]:
index_folder = "/workspaces/CORD19_Plus/retrieval_api/app/indices"
# Numeric shorthand for the record keys that can be selected for indexing.
field_dict = {0 : 'docno', 1 : 'ir_id', 2 : 'content', 3 : 'header', 4 : 'caption', 5 : 'references'}

# Each entry selects the fields of one index to build.
# Other configurations that were listed here but overwritten before use:
# [0,1,2], [0,1,3], [0,1,2,3], [0,1,2,3,4].
field_configs = [[1,2,3,4,5]]

index_paths = []

for config in field_configs:
    fields = [field_dict[c] for c in config]
    # The index directory is named after the fields it contains.
    current_path = "_".join(fields)
    full_path = f"{index_folder}/{current_path}"
    index_paths.append(full_path)
    # overwrite=True replaces an index that already exists at this path.
    indexer = pt.IterDictIndexer(full_path, meta={'docno': 1024, 'ir_id': 1024}, overwrite=True)
    index_ref = indexer.index(result_dict, fields=fields)


In [13]:
# Sort the paths, then open one index per path in that same order.
index_paths = sorted(index_paths)
indices = list(map(pt.IndexFactory.of, index_paths))

# Print the collection statistics of every opened index.
for index in indices:
    collection_stats = index.getCollectionStatistics()
    print(collection_stats.toString())

Number of documents: 137
Number of terms: 5757
Number of postings: 18047
Number of fields: 5
Number of tokens: 39341
Field names: [ir_id, content, header, caption, references]
Positions:   false


In [14]:
# One BM25 retriever per index, keyed by the index's position in `indices`.
engines = {
    position: pt.BatchRetrieve(opened_index, wmodel="BM25")
    for position, opened_index in enumerate(indices)
}

  engines = {i : pt.BatchRetrieve(indices[i], wmodel="BM25") for i in range(len(indices))}


In [29]:
# Relevance judgements (qrels) provided by the dataset.
qrels = dataset.get_qrels()

In [31]:
# Inspect the column dtypes of the qrels frame.
qrels.dtypes

qid          object
docno        object
label         int64
iteration    object
dtype: object

In [32]:
# Rebind `qrels` to the judgements read from the table qrels JSON file;
# the qrels previously fetched from the dataset are no longer referenced by this name.
qrels_path = "/workspaces/CORD19_Plus/data/clean/table_qrels.json"
qrels = pd.read_json(qrels_path)

In [34]:
# Store qid as strings (modifies `qrels` in place).
# NOTE(review): presumably needed so the qids compare equal to the topics' qids — confirm.
qrels['qid'] = qrels['qid'].astype(str)

In [46]:
# Remove the ".json" file extension from every docno. The vectorised string
# method replaces the per-element apply; regex=False makes the '.' a literal dot.
qrels['docno'] = qrels['docno'].str.replace(".json", "", regex=False)

In [47]:
# Display the cleaned qrels frame.
qrels

Unnamed: 0,qid,docno,label
0,1,006k39tj_13_0,0
1,1,006k39tj_14_0,0
2,1,00m2g55u_4_1,0
3,1,006k39tj_12_0,0
4,1,011k6mm0_14_0,0
...,...,...,...
195,10,01es0zv4_3_0,0
196,10,011k6mm0_13_1,0
197,10,01d8cqn4_4_0,2
198,10,01eyo422_2_0,1


In [48]:
topics = dataset.get_topics('title')

# Compare the ids found in the index file with the ids that carry relevance labels.
# NOTE(review): docno values such as "006k39tj_13_0" carry extra "_n_m" suffixes,
# so they may never equal an entry of avail_ids — confirm whether the overlap
# should be computed on a document-level id instead.
available_docs = set(avail_ids)
labelled_docnos = set(qrels['docno'].unique())

print(f"We currently have {len(available_docs)} docs")
print(f"There are {len(labelled_docnos)} unique documents with relevance labels")
print(f"We have {len(labelled_docnos.intersection(available_docs))} unique documents which also have official relevance labels")

# The retrieval experiment is evaluated against these qrels.
# (Alternative source: load_table_qrels("/workspaces/CORD19_Plus/data/tables.qrels").)
table_qrels = qrels

We currently have 63318 docs
There are 20 unique documents with relance labels
We have 0 unique documents which also have official relevance labels


In [61]:
# Rebind `topics` to the 'description' variant of the topics.
topics = dataset.get_topics('description')

In [62]:
# Retrievers in ascending key order, with display names derived from the index
# directory names (prefix "docno_ir_id_" removed).
retrievers = [engines[key] for key in sorted(engines)]
run_names = [path.split("/")[-1].replace("docno_ir_id_", "") for path in index_paths]

# Evaluate every retriever on the first ten topics against the table qrels.
pt.Experiment(
    retrievers,
    topics[:10],
    table_qrels,
    eval_metrics=['P_10', 'P_20', 'P_50','P_100', 'map', 'ndcg_cut_10','ndcg_cut_50','recall_100', 'mrt'],
    names=run_names,
)

Unnamed: 0,name,P_10,P_20,P_50,P_100,map,ndcg_cut_10,ndcg_cut_50,recall_100,mrt
0,ir_id_content_header_caption_references,0.08,0.065,0.046,0.039,0.113002,0.119331,0.217633,0.7175,5.281291


In [76]:
# Fetch the three topic variants separately.
topics_t = dataset.get_topics('title')
topics_d = dataset.get_topics('description')
topics_n = dataset.get_topics('narrative')

# Starts out as the title topics; its 'query' column is overwritten in a later cell.
topics_all = dataset.get_topics('title')

In [74]:
# Show the description query of the row labelled 0.
topics_d.loc[0]['query']

'what is the origin of covid 19'

In [83]:
# Build one combined query per topic: title + description + narrative text.
# Each variant's 'query' column is looked up by qid, so only the query text is
# concatenated (formatting a whole row would embed the row's printed
# representation, including its qid and dtype footer, into the query).
_queries_by_qid = [
    variant.set_index('qid')['query'] for variant in (topics_t, topics_d, topics_n)
]
topics_all['query'] = topics_all['qid'].map(
    lambda qid: " ".join(queries[qid] for queries in _queries_by_qid)
)

In [78]:
# NOTE(review): `topic` is not assigned in any cell visible in this notebook — this
# looks like a leftover inspection cell relying on kernel state; confirm and remove.
topic

'1'