In [1]:
import json
import os
import pandas as pd
from tqdm import tqdm
from mibi.modules import DocumentsModule, Question
from test_utils import (
    get_snippets,
    PubMedApiRetrieve,
    rerank_biencoder,
    rerank_crossencoder,
    get_snippets,
    get_offset,
    response_exact_answer,
    response_ideal_answer,
    flat_list,
    get_snippets_blablador,
    retrieve_bm25,
)

In [2]:
TEST_FILE = "../../data/bioasq-data/training12b_new.json"

with open(TEST_FILE, "r") as f:
    data = json.load(f)

data = data["questions"][:5]  # for the test purpose, take one instance
print(data[0].keys())

dict_keys(['body', 'documents', 'ideal_answer', 'concepts', 'type', 'id', 'snippets'])


In [3]:
RETRIEVE_TOP_K = 200
RERANK_BM25 = 50
RERANK_CROSS = 25
RERANK_BI = 10
ABSTRACTS_FOR_ANSWER = 3

In [13]:
N = 0
questions = [
    Question(
        id=q["id"],
        type=q["type"],
        body=q["body"],
    )
    for q in data
]
retrieve_abstracts = PubMedApiRetrieve(verbose=True, num_results=RETRIEVE_TOP_K)

for question in tqdm(questions):
    N += 1
    retrieved = retrieve_abstracts.transform([question])
    retrieved["bm25"] = retrieve_bm25(question, retrieved)
    reranked = retrieved.sort_values("bm25", ascending=False).head(RERANK_BM25)
    # reranked.to_csv(f"temp/reranked/bm25_{question.id}.csv", index=False)

    reranked["cos_sim"] = rerank_crossencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_CROSS)
    reranked = reranked.drop("cos_sim", axis=1)

    reranked["cos_sim"] = rerank_biencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_BI)

    reranked["question"] = [question.body] * len(reranked)
    reranked["questionno"] = [question.id] * len(reranked)

    reranked.to_csv(f"temp/reranked/{question.id}.csv", index=False)
    # retrieved.to_csv(f"temp/reranked/retrieved_{question.id}.csv", index=False)

100%|██████████| 5/5 [00:50<00:00, 10.07s/it]


### OpenAI API for snippets


In [12]:
questions = []

IN_DIR = "temp/reranked/"

for file in tqdm(os.listdir(IN_DIR)):
    reranked = pd.read_csv(os.path.join(IN_DIR, file))
    reranked["snippets"] = reranked.apply(
        lambda x: get_snippets(x["question"], x["title"], x["text"]), axis=1
    )
    reranked["title_snippets"] = [x[0] for x in reranked["snippets"]]
    reranked["abstract_snippets"] = [x[1] for x in reranked["snippets"]]
    reranked["offset_title"] = reranked.apply(
        lambda x: get_offset(x["title_snippets"], x["text"]), axis=1
    )
    reranked["offset_abstract"] = reranked.apply(
        lambda x: get_offset(x["abstract_snippets"], x["text"]), axis=1
    )
    reranked.to_csv(f"temp/snippets/openai/{file}", index=False)

  0%|          | 0/5 [00:00<?, ?it/s]


KeyError: 'question'

In [None]:
import test question, match with with results

In [6]:
# reranked["snippets_blablador_gpt"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="gpt-3.5-turbo"
#     ),
#     axis=1,
# )
# reranked["snippets_blablador_mistral"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="Mistral-7B-Instruct-v0.2"
#     ),
#     axis=1,
# )
# reranked["title_snippets_gpt"] = [x[0] for x in reranked["snippets_blablador_gpt"]]
# reranked["abstract_snippets_gpt"] = [
#     x[1] for x in reranked["snippets_blablador_gpt"]
# ]
# reranked["title_snippets_mistral"] = [
#     x[0] for x in reranked["snippets_blablador_mistral"]
# ]
# reranked["abstract_snippets_mistral"] = [
#     x[1] for x in reranked["snippets_blablador_mistral"]
# ]

# reranked["offset_title_gpt"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_gpt"], x["text"]), axis=1
# )
# reranked["offset_abstract_gpt"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_gpt"], x["text"]), axis=1
# )

# reranked["offset_title_mistral"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_mistral"], x["text"]), axis=1
# )
# reranked["offset_abstract_mistral"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_mistral"], x["text"]), axis=1
# )


# reranked["answer_abstracts_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_abstracts_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_snippets_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)
# reranked["answer_snippets_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)