In [1]:
import ast
import json
import os
import pandas as pd
from tqdm import tqdm
from mibi.modules import DocumentsModule, Question
from test_utils import (
    get_snippets,
    PubMedApiRetrieve,
    rerank_biencoder,
    rerank_crossencoder,
    get_snippets,
    get_offset,
    response_exact_answer,
    response_ideal_answer,
    flat_list,
    get_snippets_blablador,
    retrieve_bm25,
    remove_stopwords_and_punctuation,
)

In [10]:
TEST_FILE = "../../data/bioasq-data/training12b_new.json"
N_QUESTIONS = 10  # size of the smoke-test subset; later entries are ignored

# JSON text is UTF-8; pass the encoding explicitly so loading does not depend
# on the platform's default locale encoding.
with open(TEST_FILE, "r", encoding="utf-8") as f:
    bioasq = json.load(f)

# Keep only the first N_QUESTIONS questions for a quick test run. The raw
# payload keeps its own name so re-running this cell is idempotent.
data = bioasq["questions"][:N_QUESTIONS]
print(data[0].keys())
print(len(data))

dict_keys(['body', 'documents', 'ideal_answer', 'concepts', 'type', 'id', 'snippets'])
10


In [11]:
# Cut-offs for the retrieve -> rerank cascade used by the retrieval cell.
# Each stage keeps at most this many rows of the previous stage's output.
RETRIEVE_TOP_K = 200  # num_results requested from PubMedApiRetrieve
RERANK_BM25 = 50  # rows kept after sorting by the "bm25" score
RERANK_CROSS = 25  # rows kept after sorting by the rerank_crossencoder score
RERANK_BI = 10  # rows kept after sorting by the rerank_biencoder score

In [12]:
# Wrap every loaded entry in a Question object for the retrieval helpers.
questions = [
    Question(id=q["id"], type=q["type"], body=q["body"])
    for q in data
]
retrieve_abstracts = PubMedApiRetrieve(verbose=True, num_results=RETRIEVE_TOP_K)

# DataFrame.to_csv does not create missing directories.
os.makedirs("temp/reranked", exist_ok=True)

for question in tqdm(questions):
    # The fallbacks below overwrite question.body with shorter queries;
    # remember the question as asked so it can be restored afterwards.
    original_body = question.body

    retrieved = retrieve_abstracts.transform([question])
    if retrieved.empty:
        # Fallback 1: retry with stop words and punctuation removed.
        question.body = remove_stopwords_and_punctuation(question.body)
        print(f"trying new query {question.body}")
        # The result must be bound to `retrieved`; otherwise this retry is
        # thrown away and the loop below always truncates the query.
        retrieved = retrieve_abstracts.transform([question])

    # Fallback 2: drop the last word until something is found. Stop once a
    # single word is left, otherwise an exhausted query would loop forever.
    while retrieved.empty and len(question.body.split()) > 1:
        question.body = " ".join(question.body.split()[:-1])
        print(f"trying new query {question.body}")
        retrieved = retrieve_abstracts.transform([question])

    # Rerankers and the saved CSV must see the real question, not the
    # shortened search query (the CSV "question" column later becomes the
    # submission "body").
    question.body = original_body

    if retrieved.empty:
        print(f"no documents retrieved for question {question.id}, skipping")
        continue

    # Stage 1: BM25 score, keep the best RERANK_BM25 rows.
    retrieved["bm25"] = retrieve_bm25(question, retrieved)
    # .copy() so that the column assignments below write to an independent
    # frame instead of a slice of `retrieved`.
    reranked = (
        retrieved.sort_values("bm25", ascending=False).head(RERANK_BM25).copy()
    )
    # reranked.to_csv(f"temp/reranked/bm25_{question.id}.csv", index=False)

    # Stage 2: cross-encoder score, keep the best RERANK_CROSS rows.
    reranked["cos_sim"] = rerank_crossencoder(question, reranked)
    reranked = (
        reranked.sort_values("cos_sim", ascending=False).head(RERANK_CROSS).copy()
    )
    reranked = reranked.drop("cos_sim", axis=1)

    # Stage 3: bi-encoder score, keep the best RERANK_BI rows.
    reranked["cos_sim"] = rerank_biencoder(question, reranked)
    reranked = (
        reranked.sort_values("cos_sim", ascending=False).head(RERANK_BI).copy()
    )

    # Question metadata is repeated on every row so that later cells can
    # recover it from the CSV alone.
    reranked["question"] = [question.body] * len(reranked)
    reranked["questionno"] = [question.id] * len(reranked)
    reranked["questiontype"] = [question.type] * len(reranked)

    reranked.to_csv(f"temp/reranked/{question.id}.csv", index=False)
    # retrieved.to_csv(f"temp/reranked/retrieved_{question.id}.csv", index=False)

 50%|█████     | 5/10 [00:48<00:49,  9.93s/it]

trying new query metformin interfere thyroxine absorption
trying new query metformin interfere thyroxine


 90%|█████████ | 9/10 [01:31<00:10, 10.34s/it]

trying new query List human genes encoding dishevelled proteins
trying new query List human genes encoding dishevelled
trying new query List human genes encoding


100%|██████████| 10/10 [01:49<00:00, 11.00s/it]


### OpenAI API for snippets (SKIP)


In [7]:
questions = []

IN_DIR = "temp/reranked/"

# For every reranked CSV: extract snippets per document, split the result into
# its title part (index 0) and abstract part (index 1), compute offsets for
# both, and write the enriched frame to temp/snippets/openai/.
for file in tqdm(os.listdir(IN_DIR)):
    source_path = os.path.join(IN_DIR, file)
    reranked = pd.read_csv(source_path)

    def _extract(row):
        return get_snippets(row["question"], row["title"], row["text"])

    reranked["snippets"] = reranked.apply(_extract, axis=1)
    reranked["title_snippets"] = [pair[0] for pair in reranked["snippets"]]
    reranked["abstract_snippets"] = [pair[1] for pair in reranked["snippets"]]

    # NOTE(review): title offsets are computed against the "text" column, same
    # as the abstract offsets -- confirm this is intended and not "title".
    def _title_offsets(row):
        return get_offset(row["title_snippets"], row["text"])

    def _abstract_offsets(row):
        return get_offset(row["abstract_snippets"], row["text"])

    reranked["offset_title"] = reranked.apply(_title_offsets, axis=1)
    reranked["offset_abstract"] = reranked.apply(_abstract_offsets, axis=1)
    reranked.to_csv(f"temp/snippets/openai/{file}", index=False)

100%|██████████| 5/5 [01:38<00:00, 19.73s/it]


### OpenAI API for answers


In [2]:
# Number of top-ranked abstracts concatenated into the abstract-based prompt.
ABSTRACTS_FOR_ANSWER = 3

# Snippet source -> (abstract snippet column, title snippet column).
# Every entry has the same flat shape so any `choice` can be unpacked the
# same way.
COLUMNS_FOR_ANSWER = {
    "openai": ["abstract_snippets", "title_snippets"],
    "gpt": ["abstract_snippets_gpt", "title_snippets_gpt"],
    "mistral": ["abstract_snippets_mistral", "title_snippets_mistral"],
}

choice = "mistral"

IN_DIR = f"temp/snippets/{choice}/"

abstract_snippets_col, title_snippets_col = COLUMNS_FOR_ANSWER[choice]

# DataFrame.to_csv does not create missing directories.
os.makedirs(f"temp/answers/openai/{choice}", exist_ok=True)

for file in tqdm(os.listdir(IN_DIR)):
    reranked = pd.read_csv(os.path.join(IN_DIR, file))

    # Each file holds one question; its text and type are repeated on every
    # row, so the first row is representative.
    question_body = reranked["question"].tolist()[0]
    question_type = reranked["questiontype"].tolist()[0]

    # Context 1: the top ABSTRACTS_FOR_ANSWER abstracts, space-separated.
    abstracts_context = " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER])

    # Context 2: abstract snippets followed by title snippets. Both prompts
    # use the same context; the ideal-answer prompt used to receive the
    # abstract snippets twice and never saw the title snippets.
    # NOTE(review): after the CSV round-trip these cells are presumably the
    # string form of lists; confirm flat_list handles that as intended
    # before changing the "" separator.
    snippets_context = "".join(
        flat_list(reranked[abstract_snippets_col].tolist())
        + flat_list(reranked[title_snippets_col].tolist())
    )

    # One answer per question, repeated on every row of the frame.
    reranked["answer_abstracts_exact"] = [
        response_exact_answer(question_body, question_type, abstracts_context)
    ] * len(reranked)
    reranked["answer_abstracts_ideal"] = [
        response_ideal_answer(question_body, question_type, abstracts_context)
    ] * len(reranked)
    reranked["answer_snippets_exact"] = [
        response_exact_answer(question_body, question_type, snippets_context)
    ] * len(reranked)
    reranked["answer_snippets_ideal"] = [
        response_ideal_answer(question_body, question_type, snippets_context)
    ] * len(reranked)

    reranked.to_csv(f"temp/answers/openai/{choice}/{file}", index=False)

 20%|██        | 2/10 [00:11<00:43,  5.40s/it]

Trying another way


100%|██████████| 10/10 [01:11<00:00,  7.17s/it]


### Prepare submission


In [3]:
def return_exact_answer(row, ABSTRACTS_OR_SNIPPETS):
    """Pick the exact answer for one question row.

    Args:
        row: Row exposing a ``questiontype`` attribute and the column
            ``answer_<ABSTRACTS_OR_SNIPPETS>_exact``.
        ABSTRACTS_OR_SNIPPETS: Infix of the answer column to read
            (the notebook passes ``"abstracts"`` or ``"snippets"``).

    Returns:
        The first element of the stored answer for "yesno" and "summary"
        questions; the stored answer unchanged for every other type.
    """
    takes_first_element = row.questiontype in ("yesno", "summary")
    stored_answer = row[f"answer_{ABSTRACTS_OR_SNIPPETS}_exact"]
    return stored_answer[0] if takes_first_element else stored_answer


def _section_snippets(row, choice, section):
    """Build the snippet entries of one row for one section.

    Args:
        row: Mapping-like row with the columns ``url``,
            ``<section>_snippets_<choice>`` and ``offset_<section>_<choice>``.
        choice: Column suffix selecting the snippet source.
        section: ``"abstract"`` or ``"title"``.

    Returns:
        A list of snippet dicts. Snippets whose offsets are missing or
        malformed are left out.
    """
    snippets = row[f"{section}_snippets_{choice}"]
    if not snippets:
        return []

    entries = []
    for i, snippet in enumerate(snippets):
        try:
            offsets = row[f"offset_{section}_{choice}"][i]
            entry = {
                "document": row["url"],
                "text": snippet,
                "offsetInBeginSection": offsets[0],
                "offsetInEndSection": offsets[1],
                "beginSection": section,
                "endSection": section,
            }
        except (IndexError, KeyError, TypeError):
            # Best effort: a snippet without a usable (begin, end) pair is
            # skipped. Only lookup/shape errors are swallowed, so interrupts
            # and unrelated failures still surface.
            continue
        entries.append(entry)
    return entries


def return_snippets(reranked, choice):
    """Collect the snippet entries for all documents of one question.

    For each row the abstract snippets are used; title snippets are used
    only when no abstract snippet could be produced for that row.

    Args:
        reranked: Object whose ``iterrows()`` yields ``(index, row)`` pairs,
            e.g. a pandas DataFrame.
        choice: Column suffix selecting the snippet source.

    Returns:
        A flat list of snippet dicts in row order.
    """
    output = []
    for _, row in reranked.iterrows():
        entries = _section_snippets(row, choice, "abstract")
        if not entries:
            entries = _section_snippets(row, choice, "title")
        output.extend(entries)
    return output

### Answers based on snippets


In [4]:
# NOTE: `choice` is defined by the answers cell earlier in the notebook.
IN_DIR1 = f"temp/answers/openai/{choice}"
IN_DIR2 = "temp/snippets/gpt"
IN_DIR3 = "temp/snippets/mistral"

ABSTRACTS_OR_SNIPPETS = "snippets"

# Columns that are parsed back into Python objects with ast.literal_eval
# after being read from CSV.
literal_columns = [
    f"title_snippets_{choice}",
    f"abstract_snippets_{choice}",
    f"offset_title_{choice}",
    f"offset_abstract_{choice}",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_exact",
]

json_list = []

# sorted() makes the question order in the submission reproducible;
# os.listdir returns entries in arbitrary order.
for file in tqdm(sorted(os.listdir(IN_DIR1))):
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))
    reranked3 = pd.read_csv(os.path.join(IN_DIR3, file))

    # Put the three frames side by side and keep the first occurrence of
    # every column name. .copy() detaches the result so the assignments
    # below do not write into a slice.
    reranked_merged = pd.concat([reranked1, reranked2, reranked3], axis=1)
    reranked_merged = reranked_merged.loc[
        :, ~reranked_merged.columns.duplicated()
    ].copy()

    for column in literal_columns:
        reranked_merged[column] = reranked_merged[column].apply(ast.literal_eval)

    # Question-level fields are repeated on every row; read them from the
    # first one.
    json_list.append(
        {
            "type": reranked_merged["questiontype"].tolist()[0],
            "body": reranked_merged["question"].tolist()[0],
            "id": reranked_merged["questionno"].tolist()[0],
            "ideal_answer": reranked_merged[
                f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"
            ].tolist()[0][0],
            "exact_answer": return_exact_answer(
                reranked_merged.iloc[0], ABSTRACTS_OR_SNIPPETS
            ),
            "documents": reranked_merged["url"].tolist(),
            "snippets": return_snippets(reranked_merged, choice),
        }
    )

# Built once after the loop so that it is defined (and not stale from a
# previous run) even when IN_DIR1 contains no files.
json_out = {"questions": json_list}

100%|██████████| 10/10 [00:00<00:00, 177.12it/s]


In [5]:
# Persist the snippet-based submission as pretty-printed JSON.
with open("temp/submission/system_1.json", "w") as submission_file:
    submission_file.write(json.dumps(json_out, indent=4))

### Answers based on abstracts


In [6]:
# NOTE: `choice` is defined by the answers cell earlier in the notebook.
IN_DIR1 = f"temp/answers/openai/{choice}"
IN_DIR2 = "temp/snippets/gpt"
IN_DIR3 = "temp/snippets/mistral"

ABSTRACTS_OR_SNIPPETS = "abstracts"

# Columns that are parsed back into Python objects with ast.literal_eval
# after being read from CSV.
literal_columns = [
    f"title_snippets_{choice}",
    f"abstract_snippets_{choice}",
    f"offset_title_{choice}",
    f"offset_abstract_{choice}",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_exact",
]

json_list = []

# sorted() makes the question order in the submission reproducible;
# os.listdir returns entries in arbitrary order.
for file in tqdm(sorted(os.listdir(IN_DIR1))):
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))
    reranked3 = pd.read_csv(os.path.join(IN_DIR3, file))

    # Put the three frames side by side and keep the first occurrence of
    # every column name. .copy() detaches the result so the assignments
    # below do not write into a slice.
    reranked_merged = pd.concat([reranked1, reranked2, reranked3], axis=1)
    reranked_merged = reranked_merged.loc[
        :, ~reranked_merged.columns.duplicated()
    ].copy()

    for column in literal_columns:
        reranked_merged[column] = reranked_merged[column].apply(ast.literal_eval)

    # Question-level fields are repeated on every row; read them from the
    # first one.
    json_list.append(
        {
            "type": reranked_merged["questiontype"].tolist()[0],
            "body": reranked_merged["question"].tolist()[0],
            "id": reranked_merged["questionno"].tolist()[0],
            "ideal_answer": reranked_merged[
                f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"
            ].tolist()[0][0],
            "exact_answer": return_exact_answer(
                reranked_merged.iloc[0], ABSTRACTS_OR_SNIPPETS
            ),
            "documents": reranked_merged["url"].tolist(),
            "snippets": return_snippets(reranked_merged, choice),
        }
    )

# Built once after the loop so that it is defined (and not stale from a
# previous run) even when IN_DIR1 contains no files.
json_out = {"questions": json_list}

100%|██████████| 10/10 [00:00<00:00, 174.59it/s]


In [7]:
# Persist the abstract-based submission as pretty-printed JSON.
with open("temp/submission/system_2.json", "w") as submission_file:
    submission_file.write(json.dumps(json_out, indent=4))

In [6]:
# reranked["snippets_blablador_gpt"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="gpt-3.5-turbo"
#     ),
#     axis=1,
# )
# reranked["snippets_blablador_mistral"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="Mistral-7B-Instruct-v0.2"
#     ),
#     axis=1,
# )
# reranked["title_snippets_gpt"] = [x[0] for x in reranked["snippets_blablador_gpt"]]
# reranked["abstract_snippets_gpt"] = [
#     x[1] for x in reranked["snippets_blablador_gpt"]
# ]
# reranked["title_snippets_mistral"] = [
#     x[0] for x in reranked["snippets_blablador_mistral"]
# ]
# reranked["abstract_snippets_mistral"] = [
#     x[1] for x in reranked["snippets_blablador_mistral"]
# ]

# reranked["offset_title_gpt"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_gpt"], x["text"]), axis=1
# )
# reranked["offset_abstract_gpt"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_gpt"], x["text"]), axis=1
# )

# reranked["offset_title_mistral"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_mistral"], x["text"]), axis=1
# )
# reranked["offset_abstract_mistral"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_mistral"], x["text"]), axis=1
# )


# reranked["answer_abstracts_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_abstracts_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_snippets_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)
# reranked["answer_snippets_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)