In [1]:
import json
import os
import pandas as pd
from tqdm import tqdm
from mibi.modules import DocumentsModule, Question
from test_utils import (
    get_snippets,
    PubMedApiRetrieve,
    rerank_biencoder,
    rerank_crossencoder,
    get_snippets,
    get_offset,
    response_exact_answer,
    response_ideal_answer,
    flat_list,
    get_snippets_blablador,
    retrieve_bm25,
)

In [22]:
TEST_FILE = "../../data/bioasq-data/training12b_new.json"

with open(TEST_FILE, "r") as f:
    data = json.load(f)

data = data["questions"]
for d in data:
    if d["type"] == "yesno":
        print(d)
        break

{'body': 'Is the protein Papilin secreted?', 'documents': ['http://www.ncbi.nlm.nih.gov/pubmed/3320045', 'http://www.ncbi.nlm.nih.gov/pubmed/7515725', 'http://www.ncbi.nlm.nih.gov/pubmed/20805556', 'http://www.ncbi.nlm.nih.gov/pubmed/19297413', 'http://www.ncbi.nlm.nih.gov/pubmed/19724244', 'http://www.ncbi.nlm.nih.gov/pubmed/15094122', 'http://www.ncbi.nlm.nih.gov/pubmed/12666201', 'http://www.ncbi.nlm.nih.gov/pubmed/21784067', 'http://www.ncbi.nlm.nih.gov/pubmed/11076767', 'http://www.ncbi.nlm.nih.gov/pubmed/15094110'], 'ideal_answer': ['Yes,  papilin is a secreted protein'], 'exact_answer': 'yes', 'type': 'yesno', 'id': '54e25eaaae9738404b000017', 'snippets': [{'offsetInBeginSection': 1085, 'offsetInEndSection': 1307, 'text': 'Using expression analysis, we identify three genes that are transcriptionally regulated by HLH-2: the protocadherin cdh-3, and two genes encoding secreted extracellular matrix proteins, mig-6/papilin and him-4/hemicentin. ', 'beginSection': 'abstract', 'docume

In [32]:
for d in data[2:3]:
    print(d["type"])
    print(d["id"])

yesno
54e25eaaae9738404b000017


In [4]:
TEST_FILE = "../../data/bioasq-data/training12b_new.json"

with open(TEST_FILE, "r") as f:
    data = json.load(f)

data = data["questions"][:5]  # for the test purpose, take one instance
print(data[0].keys())

dict_keys(['body', 'documents', 'ideal_answer', 'concepts', 'type', 'id', 'snippets'])


In [3]:
RETRIEVE_TOP_K = 200
RERANK_BM25 = 50
RERANK_CROSS = 25
RERANK_BI = 10

In [6]:
questions = [
    Question(
        id=q["id"],
        type=q["type"],
        body=q["body"],
    )
    for q in data
]
retrieve_abstracts = PubMedApiRetrieve(verbose=True, num_results=RETRIEVE_TOP_K)

for question in tqdm(questions):
    retrieved = retrieve_abstracts.transform([question])
    retrieved["bm25"] = retrieve_bm25(question, retrieved)
    reranked = retrieved.sort_values("bm25", ascending=False).head(RERANK_BM25)
    # reranked.to_csv(f"temp/reranked/bm25_{question.id}.csv", index=False)

    reranked["cos_sim"] = rerank_crossencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_CROSS)
    reranked = reranked.drop("cos_sim", axis=1)

    reranked["cos_sim"] = rerank_biencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_BI)

    reranked["question"] = [question.body] * len(reranked)
    reranked["questionno"] = [question.id] * len(reranked)
    reranked["questiontype"] = [question.type] * len(reranked)

    reranked.to_csv(f"temp/reranked/{question.id}.csv", index=False)
    # retrieved.to_csv(f"temp/reranked/retrieved_{question.id}.csv", index=False)

100%|██████████| 5/5 [00:54<00:00, 10.90s/it]


### OpenAI API for snippets


In [7]:
questions = []

IN_DIR = "temp/reranked/"

for file in tqdm(os.listdir(IN_DIR)):
    reranked = pd.read_csv(os.path.join(IN_DIR, file))
    reranked["snippets"] = reranked.apply(
        lambda x: get_snippets(x["question"], x["title"], x["text"]), axis=1
    )
    reranked["title_snippets"] = [x[0] for x in reranked["snippets"]]
    reranked["abstract_snippets"] = [x[1] for x in reranked["snippets"]]
    reranked["offset_title"] = reranked.apply(
        lambda x: get_offset(x["title_snippets"], x["text"]), axis=1
    )
    reranked["offset_abstract"] = reranked.apply(
        lambda x: get_offset(x["abstract_snippets"], x["text"]), axis=1
    )
    reranked.to_csv(f"temp/snippets/openai/{file}", index=False)

100%|██████████| 5/5 [01:38<00:00, 19.73s/it]


### OpenAI API for answers


In [2]:
IN_DIR = "temp/snippets/openai/"
ABSTRACTS_FOR_ANSWER = 3

for file in tqdm(os.listdir(IN_DIR)):
    reranked = pd.read_csv(os.path.join(IN_DIR, file))
    reranked["answer_abstracts_exact"] = [
        response_exact_answer(
            reranked["question"].tolist()[0],
            reranked["questiontype"].tolist()[0],
            " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
        )
    ] * len(reranked)
    reranked["answer_abstracts_ideal"] = [
        response_ideal_answer(
            reranked["question"].tolist()[0],
            reranked["questiontype"].tolist()[0],
            " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
        )
    ] * len(reranked)
    reranked["answer_snippets_exact"] = [
        response_exact_answer(
            reranked["question"].tolist()[0],
            reranked["questiontype"].tolist()[0],
            "".join(
                flat_list(reranked["abstract_snippets"].tolist())
                + flat_list(reranked["title_snippets"])
            ),
        )
    ] * len(reranked)
    reranked["answer_snippets_ideal"] = [
        response_ideal_answer(
            reranked["question"].tolist()[0],
            reranked["questiontype"].tolist()[0],
            "".join(
                flat_list(reranked["abstract_snippets"].tolist())
                + flat_list(reranked["title_snippets"])
            ),
        )
    ] * len(reranked)
    reranked.to_csv(f"temp/answers/openai/{file}", index=False)

100%|██████████| 5/5 [00:29<00:00,  5.89s/it]


### Prepare submission


In [8]:
IN_DIR1 = "temp/answers/openai/"
IN_DIR2 = "temp/snippets/gpt"
IN_DIR3 = "temp/snippets/mistral"

for file in tqdm(os.listdir(IN_DIR1)):
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))
    reranked3 = pd.read_csv(os.path.join(IN_DIR3, file))
    rearnked_merged = pd.concat([reranked1, reranked2, reranked3], axis=1)

    # if reranked["questiontype"].tolist()[0] == "list":
    #     break

100%|██████████| 5/5 [00:00<00:00, 312.97it/s]


In [13]:
print(rearnked_merged.columns.tolist())

['docno', 'score', 'rank', 'title', 'text', 'url', 'bm25', 'cos_sim', 'question', 'questionno', 'questiontype', 'snippets', 'title_snippets', 'abstract_snippets', 'offset_title', 'offset_abstract', 'answer_abstracts_exact', 'answer_abstracts_ideal', 'answer_snippets_exact', 'answer_snippets_ideal', 'docno', 'score', 'rank', 'title', 'text', 'url', 'bm25', 'cos_sim', 'question', 'questionno', 'questiontype', 'snippets_blablador_gpt', 'title_snippets_gpt', 'abstract_snippets_gpt', 'offset_title_gpt', 'offset_abstract_gpt', 'docno', 'score', 'rank', 'title', 'text', 'url', 'bm25', 'cos_sim', 'question', 'questionno', 'questiontype', 'snippets_blablador_mistral', 'title_snippets_mistral', 'abstract_snippets_mistral', 'offset_title_mistral', 'offset_abstract_mistral']


In [6]:
# reranked["snippets_blablador_gpt"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="gpt-3.5-turbo"
#     ),
#     axis=1,
# )
# reranked["snippets_blablador_mistral"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="Mistral-7B-Instruct-v0.2"
#     ),
#     axis=1,
# )
# reranked["title_snippets_gpt"] = [x[0] for x in reranked["snippets_blablador_gpt"]]
# reranked["abstract_snippets_gpt"] = [
#     x[1] for x in reranked["snippets_blablador_gpt"]
# ]
# reranked["title_snippets_mistral"] = [
#     x[0] for x in reranked["snippets_blablador_mistral"]
# ]
# reranked["abstract_snippets_mistral"] = [
#     x[1] for x in reranked["snippets_blablador_mistral"]
# ]

# reranked["offset_title_gpt"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_gpt"], x["text"]), axis=1
# )
# reranked["offset_abstract_gpt"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_gpt"], x["text"]), axis=1
# )

# reranked["offset_title_mistral"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_mistral"], x["text"]), axis=1
# )
# reranked["offset_abstract_mistral"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_mistral"], x["text"]), axis=1
# )


# reranked["answer_abstracts_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_abstracts_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_snippets_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)
# reranked["answer_snippets_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)