In [None]:
import pandas as pd
import os
import numpy as np
from dotenv import load_dotenv


import pyterrier as pt
from tqdm import tqdm
import numpy as np
# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
# NOTE(review): no `progress_*` call appears in the cells visible here — confirm this is still needed.
tqdm.pandas()

from retrieval_utils import faiss_search_pipe, build_faiss_index, apply_cutoff

# Start the PyTerrier backend only when it is not already running, so that
# re-executing this cell in a live kernel does not try to initialise it twice.
if not pt.started():
    pt.init()

# Load variables from the .env file into the process environment, then read the key.
# NOTE(review): this path is under "/workspace", whereas the data paths used
# elsewhere in this notebook are under "/workspaces" — confirm the location.
load_dotenv("/workspace/.env")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # None when the variable is not set

# TREC-COVID topics/qrels over the CORD-19 full-text collection (ir_datasets id).
dataset = pt.get_dataset('irds:cord19/fulltext/trec-covid')

In [None]:
# Load the three topic fields of every TREC-COVID topic as separate frames.
topics_t = dataset.get_topics('title')
topics_d = dataset.get_topics('description')
topics_n = dataset.get_topics('narrative')

# Index each field by 'qid' so the fields are joined on the topic id itself
# rather than on the assumption that topic N is stored under row label N-1.
# A qid missing from any field raises KeyError instead of silently pairing
# text from the wrong topic.
title_by_qid = topics_t.set_index('qid')['query']
description_by_qid = topics_d.set_index('qid')['query']
narrative_by_qid = topics_n.set_index('qid')['query']

# Start from a copy of the title topics (avoids loading them a second time and
# leaves `topics_t` untouched), then replace each query with
# "<title>. <description>. <narrative>".
topics_all = topics_t.copy()
topics_all['query'] = topics_all['qid'].map(
    lambda qid: f"{title_by_qid[qid]}. {description_by_qid[qid]}. {narrative_by_qid[qid]}"
)

In [None]:
# Read the precomputed query embeddings and stack the 'emb' column into a
# single float32 NumPy array.
# NOTE(review): assumes every 'emb' entry is a list of the same length, so the
# result is a 2-D (n_queries, dim) matrix — confirm against the JSON file.
query_embs = pd.read_json("/workspaces/CORD19_Plus/retrieval/query_embeddings.json")
queries = np.array(query_embs["emb"].tolist()).astype("float32")

In [None]:
# Load the table embeddings, discard every row that has a missing value, and
# renumber the rows 0..n-1 so the index is contiguous after the drop.
df = (
    pd.read_json("/workspaces/CORD19_Plus/retrieval/table_embeddings.json")
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Build the search index from the cleaned table-embedding frame.
# NOTE(review): `build_faiss_index` lives in retrieval_utils; which column(s) of
# `df` it reads and which FAISS index type it creates are not visible here.
index = build_faiss_index(df)

In [None]:
# Run all query vectors against the index; `df` is passed along as well.
# NOTE(review): presumably `df` is used to map hits back to document ids and the
# result is a PyTerrier-style frame (qid, docno, score, rank) — confirm in
# retrieval_utils, since `res` is later fed to pt.Experiment and write_results.
res = faiss_search_pipe(index, queries, df)

In [None]:
# Post-process the raw search results with the project's cutoff helper.
# NOTE(review): this rebinds `res` from itself, so re-running the cell applies
# the cutoff to already-cut results; whether that is harmless depends on
# `apply_cutoff` (retrieval_utils), which is not visible here.
res = apply_cutoff(res)

In [None]:
# Relevance judgements for the table collection.
qrels_path = "/workspaces/CORD19_Plus/data/clean/table_qrels.json"

# Normalise the two join keys right after loading:
#   * qid   -> string
#   * docno -> every ".json" occurrence removed from the identifier
qrels = pd.read_json(qrels_path).assign(
    qid=lambda frame: frame['qid'].astype(str),
    docno=lambda frame: frame['docno'].map(lambda name: name.replace(".json", "")),
)

# Second name bound to the same frame (an alias, not a copy).
table_qrels = qrels

In [None]:
# Score the precomputed run against the table qrels. The call is the last
# expression of the cell so the resulting metrics table is displayed.
# NOTE(review): `res` is a precomputed result rather than a transformer that
# gets timed, so the 'mrt' (mean response time) column is presumably not
# meaningful for this run — confirm.
pt.Experiment(
    retr_systems=[res],
    topics=topics_all,
    qrels=qrels,
    eval_metrics=[
        'P_10', 'P_20',
        'map',
        'ndcg_cut_10', 'ndcg_cut_50',
        'recall_100',
        'mrt',
    ],
)

In [None]:
# Persist the run as a TREC-format file. The run tag and the file name are
# derived from one constant so they cannot drift apart, and the output
# directory is a named constant rather than a literal embedded in an f-string.
RUN_NAME = "text-embedding-3-small"
RANKINGS_DIR = "/workspaces/CORD19_Plus/retrieval/rankings"

pt.io.write_results(
    res,
    f"{RANKINGS_DIR}/{RUN_NAME}.trec",  # same path as before: <rankings dir>/<run name>.trec
    format='trec',
    run_name=RUN_NAME,
)