In [1]:
import ast
import copy
import json
import os
import pandas as pd
from tqdm import tqdm
from mibi.modules import DocumentsModule, Question
from test_utils import (
    get_snippets,
    PubMedApiRetrieve,
    rerank_biencoder,
    rerank_crossencoder,
    get_snippets,
    get_offset,
    response_exact_answer,
    response_ideal_answer,
    flat_list,
    get_snippets_blablador,
    retrieve_bm25,
    remove_stopwords_and_punctuation,
)

In [None]:
# Provenance note (from the BioASQ organisers): the originally published
# test-set file had a wrong *name* (its contents were fine); the corrected name
# is "BioASQ-task12bPhaseA-testset1".
# NOTE(review): the path below still uses the "task11b" name — confirm that it
# points at the intended file.

TEST_FILE = "temp/test_set/BioASQ-task11bPhaseA-testset1"

# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(TEST_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# Keep only the list of question records; quick sanity check of keys and size.
data = data["questions"]
print(data[0].keys())
print(len(data))

In [None]:
# Candidate-set sizes for the retrieval / re-ranking cascade in the next cell.
RETRIEVE_TOP_K = 200  # passed as `num_results` to PubMedApiRetrieve
RERANK_BM25 = 50  # rows kept after sorting by the "bm25" score
RERANK_CROSS = 25  # rows kept after sorting by the cross-encoder score
RERANK_BI = 10  # rows kept after sorting by the bi-encoder score

In [None]:
# Build one Question per test-set record.
questions = [
    Question(
        id=q["id"],
        type=q["type"],
        body=q["body"],
    )
    for q in data
]
retrieve_abstracts = PubMedApiRetrieve(verbose=True, num_results=RETRIEVE_TOP_K)

for question in tqdm(questions):
    retrieved = retrieve_abstracts.transform([question])

    if retrieved.empty:
        # First fallback: retry with stop words and punctuation removed.
        question_new = copy.deepcopy(question)
        question_new.body = remove_stopwords_and_punctuation(question_new.body)
        print(f"trying new query {question_new.body}")
        print(f"original question: {question.body}")
        # The result must be bound to `retrieved`; otherwise this retry is
        # thrown away and the loop below shortens the query straight away.
        retrieved = retrieve_abstracts.transform([question_new])

        # Second fallback: drop the last word until something is found. Stop
        # when only one word is left so the loop cannot spin forever on an
        # empty query.
        while retrieved.empty and len(question_new.body.split()) > 1:
            question_new.body = " ".join(question_new.body.split()[:-1])
            print(f"trying new query {question_new.body}")
            print(f"original question: {question.body}")
            retrieved = retrieve_abstracts.transform([question_new])

    if retrieved.empty:
        # Nothing to re-rank for this question; report it and move on.
        print(f"no documents retrieved for question {question.id}, skipping")
        continue

    # Stage 1: BM25 score against the original question, keep the best rows.
    retrieved["bm25"] = retrieve_bm25(question, retrieved)
    reranked = retrieved.sort_values("bm25", ascending=False).head(RERANK_BM25)
    # reranked.to_csv(f"temp/reranked/bm25_{question.id}.csv", index=False)

    # Stage 2: cross-encoder score; the column is dropped so it can be reused.
    reranked["cos_sim"] = rerank_crossencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_CROSS)
    reranked = reranked.drop("cos_sim", axis=1)

    # Stage 3: bi-encoder score.
    reranked["cos_sim"] = rerank_biencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_BI)

    # Attach the question metadata to every row so each CSV is self-contained.
    reranked["question"] = [question.body] * len(reranked)
    reranked["questionno"] = [question.id] * len(reranked)
    reranked["questiontype"] = [question.type] * len(reranked)

    reranked.to_csv(f"temp/reranked/{question.id}.csv", index=False)
    # retrieved.to_csv(f"temp/reranked/retrieved_{question.id}.csv", index=False)

### OpenAI API for snippets (SKIP)


In [None]:
# NOTE(review): `questions` is reset here but is not used in this cell.
questions = []

IN_DIR = "temp/reranked/"

# For every CSV in IN_DIR: extract snippets per row with `get_snippets`,
# compute offsets with `get_offset`, and write the enriched frame to
# temp/snippets/openai/ under the same file name.
for file in tqdm(os.listdir(IN_DIR)):
    reranked = pd.read_csv(os.path.join(IN_DIR, file))
    reranked["snippets"] = reranked.apply(
        lambda x: get_snippets(x["question"], x["title"], x["text"]), axis=1
    )
    # Element 0 of each result is stored as the title snippets, element 1 as
    # the abstract snippets.
    reranked["title_snippets"] = [x[0] for x in reranked["snippets"]]
    reranked["abstract_snippets"] = [x[1] for x in reranked["snippets"]]
    # NOTE(review): the title-snippet offsets are computed against x["text"],
    # not x["title"] — confirm this is what `get_offset` expects.
    reranked["offset_title"] = reranked.apply(
        lambda x: get_offset(x["title_snippets"], x["text"]), axis=1
    )
    reranked["offset_abstract"] = reranked.apply(
        lambda x: get_offset(x["abstract_snippets"], x["text"]), axis=1
    )
    reranked.to_csv(f"temp/snippets/openai/{file}", index=False)

### OpenAI API for answers


In [None]:
# Name interpolated into the input/output directory paths of the cells below.
# NOTE(review): the "Mistral" cell also uses it as a key into
# COLUMNS_FOR_ANSWER, whose keys are "openai", "gpt" and "mistral" —
# "pyterrier" is not one of them.
choice = "pyterrier"

### PyTerrier snippets


In [None]:
IN_DIR = f"temp/batch_2/snippets/{choice}/"
# OUT_DIR = f"temp/batch_2/answers/openai/{choice}/"

# Files already present in the output directory are not answered again.
processed_files = os.listdir("temp/batch_2/answers/openai/pyterrier_snippets/")

for file in tqdm(os.listdir(IN_DIR)):
    if file in processed_files:
        continue

    reranked = pd.read_csv(os.path.join(IN_DIR, file))

    # Question text and type are read from the first row; the context is the
    # "text" column of all rows joined with single spaces.
    question_text = reranked["question"].tolist()[0]
    question_kind = reranked["questiontype"].tolist()[0]
    context = " ".join(reranked["text"].tolist())
    n_rows = len(reranked)

    # One answer per file, repeated on every row of the frame.
    exact_response = response_exact_answer(question_text, question_kind, context)
    reranked["answer_snippets_exact"] = [exact_response] * n_rows

    ideal_response = response_ideal_answer(question_text, question_kind, context)
    reranked["answer_snippets_ideal"] = [ideal_response] * n_rows

    out_path = f"temp/batch_2/answers/openai/pyterrier_snippets/{file}"
    reranked.to_csv(out_path, index=False)

### PyTerrier abstracts


In [None]:
IN_DIR = f"temp/batch_2/reranked/{choice}/"
# Files already present in the output directory are not answered again.
processed_files = os.listdir("temp/batch_2/answers/openai/pyterrier_docs/")

ABSTRACTS_FOR_ANSWER = 3

for file in tqdm(os.listdir(IN_DIR)):
    if file in processed_files:
        continue

    reranked = pd.read_csv(os.path.join(IN_DIR, file))

    question_text = reranked["question"].tolist()[0]
    question_kind = reranked["questiontype"].tolist()[0]
    # Drop abstracts whose string form is "nan" (missing values after the CSV
    # round trip), then keep the first ABSTRACTS_FOR_ANSWER in file order.
    usable_abstracts = [a for a in reranked["abstract"].tolist() if str(a) != "nan"]
    context = " ".join(usable_abstracts[:ABSTRACTS_FOR_ANSWER])
    n_rows = len(reranked)

    # One answer per file, repeated on every row of the frame.
    exact_response = response_exact_answer(question_text, question_kind, context)
    reranked["answer_abstracts_exact"] = [exact_response] * n_rows

    ideal_response = response_ideal_answer(question_text, question_kind, context)
    reranked["answer_abstracts_ideal"] = [ideal_response] * n_rows

    out_path = f"temp/batch_2/answers/openai/pyterrier_docs/{file}"
    reranked.to_csv(out_path, index=False)

### Mistral


In [None]:
ABSTRACTS_FOR_ANSWER = 3

# Snippet columns per snippet source, always as [abstract column, title column].
# Every entry has the same flat shape so that the lookup below works for all
# keys (a nested list for one key only made indexing the others return single
# characters of the column name).
COLUMNS_FOR_ANSWER = {
    "openai": ["abstract_snippets", "title_snippets"],
    "gpt": ["abstract_snippets_gpt", "title_snippets_gpt"],
    "mistral": ["abstract_snippets_mistral", "title_snippets_mistral"],
}

# Raises KeyError when `choice` is not one of the keys above.
abstract_col, title_col = COLUMNS_FOR_ANSWER[choice]

# NOTE(review): IN_DIR and processed_files are not defined in this cell; they
# are whatever an earlier cell left in the kernel — confirm they match `choice`.
for file in tqdm(os.listdir(IN_DIR)):
    if file not in processed_files:
        reranked = pd.read_csv(os.path.join(IN_DIR, file))

        question_text = reranked["question"].tolist()[0]
        question_kind = reranked["questiontype"].tolist()[0]
        n_rows = len(reranked)

        # Context 1: the first ABSTRACTS_FOR_ANSWER texts that are not "nan".
        abstracts_context = " ".join(
            [a for a in reranked["text"].tolist() if str(a) != "nan"][
                :ABSTRACTS_FOR_ANSWER
            ]
        )
        # Context 2: all abstract snippets followed by all title snippets.
        # NOTE(review): the pieces are concatenated without a separator, which
        # fuses adjacent snippets — confirm "" (rather than " ") is intended.
        snippets_context = "".join(
            flat_list(reranked[abstract_col].tolist())
            + flat_list(reranked[title_col].tolist())
        )

        # One answer per file and context, repeated on every row.
        reranked["answer_abstracts_exact"] = [
            response_exact_answer(question_text, question_kind, abstracts_context)
        ] * n_rows
        reranked["answer_abstracts_ideal"] = [
            response_ideal_answer(question_text, question_kind, abstracts_context)
        ] * n_rows
        reranked["answer_snippets_exact"] = [
            response_exact_answer(question_text, question_kind, snippets_context)
        ] * n_rows
        reranked["answer_snippets_ideal"] = [
            response_ideal_answer(question_text, question_kind, snippets_context)
        ] * n_rows
        reranked.to_csv(f"temp/answers/openai/{choice}/{file}", index=False)

### Prepare submission


#### for PyTerrier


##### Answers based on snippets


In [None]:
def return_exact_answer(row, ABSTRACTS_OR_SNIPPETS):
    """Pick the value to report as the exact answer for one question row.

    Args:
        row: Record exposing ``questiontype`` as an attribute and the
            ``answer_<ABSTRACTS_OR_SNIPPETS>_exact`` / ``_ideal`` entries by key.
        ABSTRACTS_OR_SNIPPETS: Infix of the answer columns to read.

    Returns:
        For "summary" the first element of the ideal answer, for "yesno" the
        first element of the exact answer, otherwise the exact answer as is.
    """
    kind = row.questiontype
    if kind == "summary":
        return row[f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"][0]

    exact = row[f"answer_{ABSTRACTS_OR_SNIPPETS}_exact"]
    return exact[0] if kind == "yesno" else exact


def return_snippets(reranked):
    """Convert each row of ``reranked`` into a snippet dictionary.

    Args:
        reranked: DataFrame with the columns "url", "text",
            "snippet_offset_in_begin_section", "snippet_offset_in_end_section",
            "snippet_begin_section" and "snippet_end_section".

    Returns:
        One dictionary per row, in row order, with the keys "document", "text",
        "offsetInBeginSection", "offsetInEndSection", "beginSection" and
        "endSection".
    """
    # Output key -> source column; insertion order fixes the key order.
    column_for_key = {
        "document": "url",
        "text": "text",
        "offsetInBeginSection": "snippet_offset_in_begin_section",
        "offsetInEndSection": "snippet_offset_in_end_section",
        "beginSection": "snippet_begin_section",
        "endSection": "snippet_end_section",
    }
    return [
        {key: row[column] for key, column in column_for_key.items()}
        for _, row in reranked.iterrows()
    ]


def fix_list(l, max_items=5):
    """Normalise raw exact answers into unique one-element lists.

    Args:
        l: Iterable of answers. A string ``s`` becomes ``[s]``; a one-element
            list is kept as it is; a longer list is split into one
            ``[subitem]`` entry per element. Empty lists and items of any other
            type are ignored.
        max_items: Maximum number of entries to return (default 5).

    Returns:
        The first ``max_items`` distinct entries, in order of first appearance.
    """
    l_out = []
    for item in l:
        if isinstance(item, str):
            candidates = [[item]]
        elif isinstance(item, list):
            # A one-element list already has the target shape.
            candidates = [item] if len(item) == 1 else [[sub] for sub in item]
        else:
            candidates = []

        for candidate in candidates:
            # Compare the entry itself (not a re-wrapped copy of it) so that a
            # repeated one-element list is recognised as a duplicate.
            if candidate not in l_out:
                l_out.append(candidate)

    return l_out[:max_items]

In [None]:
IN_DIR1 = "temp/batch_2/answers/openai/pyterrier_snippets/"
IN_DIR2 = "temp/batch_2/reranked/pyterrier/"

ABSTRACTS_OR_SNIPPETS = "snippets"

json_list = []

for file in tqdm(os.listdir(IN_DIR1)):
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    # reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))

    # The answer columns come back from CSV as strings; parse them into Python
    # objects again (ideal first, then exact).
    for answer_col in (
        f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal",
        f"answer_{ABSTRACTS_OR_SNIPPETS}_exact",
    ):
        reranked1[answer_col] = reranked1[answer_col].apply(ast.literal_eval)

    # All rows of a file carry the same question, so the first row is used.
    if reranked1["questiontype"].tolist()[0] in ("factoid", "list"):
        exact_answer = fix_list(
            return_exact_answer(reranked1.iloc[0], ABSTRACTS_OR_SNIPPETS)
        )
    else:
        exact_answer = return_exact_answer(reranked1.iloc[0], ABSTRACTS_OR_SNIPPETS)
    json_list.append(
        {
            "type": reranked1["questiontype"].tolist()[0],
            "body": reranked1["question"].tolist()[0],
            "id": reranked1["questionno"].tolist()[0],
            "ideal_answer": reranked1[f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"].tolist()[
                0
            ][0],
            "exact_answer": exact_answer,
            # "documents": reranked2["url"].tolist(),
            # "snippets": return_snippets(reranked1),
        }
    )

# Built once after the loop so that `json_out` always reflects this cell's
# `json_list` — also when IN_DIR1 is empty — and never a value left over from
# another cell.
json_out = {"questions": json_list}

with open("temp/batch_2/submission/mibi_rag_snippet.json", "w") as f:
    json.dump(json_out, f, indent=4)

##### Answers based on abstracts


In [None]:
IN_DIR1 = "temp/batch_2/answers/openai/pyterrier_docs/"

ABSTRACTS_OR_SNIPPETS = "abstracts"

json_list = []

for file in tqdm(os.listdir(IN_DIR1)):
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    # reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))

    # The answer columns come back from CSV as strings; parse them into Python
    # objects again (ideal first, then exact).
    for answer_col in (
        f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal",
        f"answer_{ABSTRACTS_OR_SNIPPETS}_exact",
    ):
        reranked1[answer_col] = reranked1[answer_col].apply(ast.literal_eval)

    # All rows of a file carry the same question, so the first row is used.
    if reranked1["questiontype"].tolist()[0] in ("factoid", "list"):
        exact_answer = fix_list(
            return_exact_answer(reranked1.iloc[0], ABSTRACTS_OR_SNIPPETS)
        )
    else:
        exact_answer = return_exact_answer(reranked1.iloc[0], ABSTRACTS_OR_SNIPPETS)
    json_list.append(
        {
            "type": reranked1["questiontype"].tolist()[0],
            "body": reranked1["question"].tolist()[0],
            "id": reranked1["questionno"].tolist()[0],
            "ideal_answer": reranked1[f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"].tolist()[
                0
            ][0],
            "exact_answer": exact_answer,
            # "documents": reranked1["url"].tolist(),
            # "snippets": return_snippets(reranked1),
        }
    )

# Built once after the loop so that `json_out` always reflects this cell's
# `json_list` — also when IN_DIR1 is empty — and never a value left over from
# another cell.
json_out = {"questions": json_list}

with open("temp/batch_2/submission/mibi_rag_abstract.json", "w") as f:
    json.dump(json_out, f, indent=4)

In [None]:
def return_exact_answer(row, ABSTRACTS_OR_SNIPPETS):
    """Pick the value to report as the exact answer for one question row.

    This definition replaces the one from the earlier cell of the same name
    once this cell has run; unlike that one, it reads the *exact* column for
    "summary" questions as well.

    Args:
        row: Record exposing ``questiontype`` as an attribute and the
            ``answer_<ABSTRACTS_OR_SNIPPETS>_exact`` entry by key.
        ABSTRACTS_OR_SNIPPETS: Infix of the answer column to read.

    Returns:
        The first element of the exact answer for "yesno" and "summary"
        questions, otherwise the exact answer as is.
    """
    exact = row[f"answer_{ABSTRACTS_OR_SNIPPETS}_exact"]
    if row.questiontype in ("yesno", "summary"):
        return exact[0]
    return exact


def _section_snippets(row, choice, section):
    """Build the snippet dictionaries of one row for one section.

    ``section`` is "abstract" or "title"; snippets are read from
    ``<section>_snippets_<choice>`` and offsets from ``offset_<section>_<choice>``.
    Snippets whose entry cannot be built are skipped.
    """
    entries = []
    snippets = row[f"{section}_snippets_{choice}"]
    if not snippets:
        return entries

    for i, snippet in enumerate(snippets):
        try:
            entry = {
                "document": row["url"],
                "text": snippet,
                "offsetInBeginSection": row[f"offset_{section}_{choice}"][i][0],
                "offsetInEndSection": row[f"offset_{section}_{choice}"][i][1],
                "beginSection": section,
                "endSection": section,
            }
        except (IndexError, KeyError, TypeError):
            # Missing or malformed offsets for this snippet: best effort, skip
            # it. Anything else (e.g. KeyboardInterrupt) is allowed to surface.
            continue
        entries.append(entry)
    return entries


def return_snippets(reranked, choice):
    """Collect snippet dictionaries for every row of ``reranked``.

    Args:
        reranked: DataFrame with "url" and, for the given ``choice``, the
            columns ``abstract_snippets_<choice>``, ``offset_abstract_<choice>``,
            ``title_snippets_<choice>`` and ``offset_title_<choice>`` holding
            already-parsed Python sequences.
        choice: Suffix selecting which snippet/offset columns to read.

    Returns:
        A flat list of dictionaries with the keys "document", "text",
        "offsetInBeginSection", "offsetInEndSection", "beginSection" and
        "endSection". Per row, abstract snippets are used; title snippets are
        only a fallback when no abstract snippet could be emitted.
    """
    output = []
    for _, row in reranked.iterrows():
        found = _section_snippets(row, choice, "abstract")
        if not found:
            found = _section_snippets(row, choice, "title")
        output.extend(found)
    return output


def fix_list(l, max_items=5):
    """Normalise raw exact answers into unique one-element lists.

    Args:
        l: Iterable of answers. A string ``s`` becomes ``[s]``; a one-element
            list is kept as it is; a longer list is split into one
            ``[subitem]`` entry per element. Empty lists and items of any other
            type are ignored.
        max_items: Maximum number of entries to return (default 5).

    Returns:
        The first ``max_items`` distinct entries, in order of first appearance.
    """
    l_out = []
    for item in l:
        if isinstance(item, str):
            candidates = [[item]]
        elif isinstance(item, list):
            # A one-element list already has the target shape.
            candidates = [item] if len(item) == 1 else [[sub] for sub in item]
        else:
            candidates = []

        for candidate in candidates:
            # Compare the entry itself (not a re-wrapped copy of it) so that a
            # repeated one-element list is recognised as a duplicate.
            if candidate not in l_out:
                l_out.append(candidate)

    return l_out[:max_items]

### Answers based on snippets


In [None]:
IN_DIR1 = f"temp/answers/openai/{choice}"
IN_DIR2 = "temp/snippets/gpt"
IN_DIR3 = "temp/snippets/mistral"

ABSTRACTS_OR_SNIPPETS = "snippets"

# Columns stored in the CSVs as strings that must be parsed back into Python
# objects, in the order they are processed.
literal_columns = (
    f"title_snippets_{choice}",
    f"abstract_snippets_{choice}",
    f"offset_title_{choice}",
    f"offset_abstract_{choice}",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_exact",
)

json_list = []

for file in tqdm(os.listdir(IN_DIR1)):
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    # reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))
    reranked3 = pd.read_csv(os.path.join(IN_DIR3, file))
    # reranked_merged = pd.concat([reranked1, reranked2, reranked3], axis=1)
    # Put the answer frame and the snippet frame side by side, then keep only
    # the first occurrence of every column name.
    reranked_merged = pd.concat([reranked1, reranked3], axis=1)
    reranked_merged = reranked_merged.loc[:, ~reranked_merged.columns.duplicated()]
    for literal_col in literal_columns:
        reranked_merged[literal_col] = reranked_merged[literal_col].apply(
            ast.literal_eval
        )

    # All rows of a file carry the same question, so the first row is used.
    if reranked_merged["questiontype"].tolist()[0] in ("factoid", "list"):
        exact_answer = fix_list(
            return_exact_answer(reranked_merged.iloc[0], ABSTRACTS_OR_SNIPPETS)
        )
    else:
        exact_answer = return_exact_answer(
            reranked_merged.iloc[0], ABSTRACTS_OR_SNIPPETS
        )
    json_list.append(
        {
            "type": reranked_merged["questiontype"].tolist()[0],
            "body": reranked_merged["question"].tolist()[0],
            "id": reranked_merged["questionno"].tolist()[0],
            "ideal_answer": reranked_merged[
                f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"
            ].tolist()[0][0],
            "exact_answer": exact_answer,
            "documents": reranked_merged["url"].tolist(),
            "snippets": return_snippets(reranked_merged, choice)[:10],  # not good
        }
    )

# Built once after the loop so that `json_out` always reflects this cell's
# `json_list` — also when IN_DIR1 is empty — and never a value left over from
# another cell.
json_out = {"questions": json_list}

In [None]:
# Write the `json_out` assembled in the previous cell as the system_1 submission.
with open("temp/submission/system_1.json", "w") as f:
    json.dump(json_out, f, indent=4)

### Answers based on abstracts


In [None]:
IN_DIR1 = f"temp/answers/openai/{choice}"
IN_DIR2 = "temp/snippets/gpt"
IN_DIR3 = "temp/snippets/mistral"

ABSTRACTS_OR_SNIPPETS = "abstracts"

# Columns stored in the CSVs as strings that must be parsed back into Python
# objects, in the order they are processed.
literal_columns = (
    f"title_snippets_{choice}",
    f"abstract_snippets_{choice}",
    f"offset_title_{choice}",
    f"offset_abstract_{choice}",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_exact",
)

json_list = []

for file in tqdm(os.listdir(IN_DIR1)):
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    # reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))
    reranked3 = pd.read_csv(os.path.join(IN_DIR3, file))
    # reranked_merged = pd.concat([reranked1, reranked2, reranked3], axis=1)
    # Put the answer frame and the snippet frame side by side, then keep only
    # the first occurrence of every column name.
    reranked_merged = pd.concat([reranked1, reranked3], axis=1)
    reranked_merged = reranked_merged.loc[:, ~reranked_merged.columns.duplicated()]
    for literal_col in literal_columns:
        reranked_merged[literal_col] = reranked_merged[literal_col].apply(
            ast.literal_eval
        )

    # All rows of a file carry the same question, so the first row is used.
    if reranked_merged["questiontype"].tolist()[0] in ("factoid", "list"):
        exact_answer = fix_list(
            return_exact_answer(reranked_merged.iloc[0], ABSTRACTS_OR_SNIPPETS)
        )
    else:
        exact_answer = return_exact_answer(
            reranked_merged.iloc[0], ABSTRACTS_OR_SNIPPETS
        )

    json_list.append(
        {
            "type": reranked_merged["questiontype"].tolist()[0],
            "body": reranked_merged["question"].tolist()[0],
            "id": reranked_merged["questionno"].tolist()[0],
            "ideal_answer": reranked_merged[
                f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"
            ].tolist()[0][0],
            "exact_answer": exact_answer,
            "documents": reranked_merged["url"].tolist(),
            "snippets": return_snippets(reranked_merged, choice)[:10],  # not good
        }
    )

# Built once after the loop so that `json_out` always reflects this cell's
# `json_list` — also when IN_DIR1 is empty — and never a value left over from
# another cell.
json_out = {"questions": json_list}

In [None]:
# Write the `json_out` assembled in the previous cell as the system_2 submission.
with open("temp/submission/system_2.json", "w") as f:
    json.dump(json_out, f, indent=4)

# Phase B


In [3]:
# Phase B test set (batch 4). Note that this rebinds TEST_FILE and `data`
# from the Phase A cells.
TEST_FILE = "temp/batch_4/test_set/BioASQ-task12bPhaseB-testset4"

# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(TEST_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# Keep only the list of question records; quick sanity check of keys and size.
data = data["questions"]
print(data[0].keys())
print(len(data))

dict_keys(['documents', 'snippets', 'id', 'type', 'body'])
85


In [4]:
# data[1]

In [5]:
def get_snippets_from_df(row):
    """Return the "text" entry of every snippet in ``row.snippets``, in order."""
    texts = []
    for snippet in row.snippets:
        texts.append(snippet["text"])
    return texts


def fix_list(l, max_items=100):
    """Normalise raw exact answers into unique one-element lists.

    Args:
        l: Iterable of answers. A string ``s`` becomes ``[s]``; a one-element
            list is kept as it is; a longer list is split into one
            ``[subitem]`` entry per element. Empty lists and items of any other
            type are ignored.
        max_items: Maximum number of entries to return (default 100).

    Returns:
        The first ``max_items`` distinct entries, in order of first appearance.
    """
    l_out = []
    for item in l:
        if isinstance(item, str):
            candidates = [[item]]
        elif isinstance(item, list):
            # A one-element list already has the target shape.
            candidates = [item] if len(item) == 1 else [[sub] for sub in item]
        else:
            candidates = []

        for candidate in candidates:
            # Compare the entry itself (not a re-wrapped copy of it) so that a
            # repeated one-element list is recognised as a duplicate.
            if candidate not in l_out:
                l_out.append(candidate)

    return l_out[:max_items]

### Answers based on snippets


In [7]:
for d in tqdm(data):
    # One-row frame per question; the column names are aligned with the ones
    # used by the Phase A CSVs.
    df = pd.json_normalize(d)
    df["snippets_extracted"] = df.apply(get_snippets_from_df, axis=1)
    df = df.rename(
        columns={"body": "question", "id": "questionno", "type": "questiontype"}
    )

    question_text = df["question"].values[0]
    question_kind = df["questiontype"].values[0]
    # Context: the snippet texts of the question joined with single spaces.
    context = " ".join(df.snippets_extracted.tolist()[0])
    n_rows = len(df)

    exact_response = response_exact_answer(question_text, question_kind, context)
    df["answer_snippets_exact"] = [exact_response] * n_rows

    ideal_response = response_ideal_answer(question_text, question_kind, context)
    df["answer_snippets_ideal"] = [ideal_response] * n_rows

    out_path = (
        f"temp/batch_4/phase_b/answers/openai/snippets/{df.questionno.values[0]}.csv"
    )
    df.to_csv(out_path, index=False)

100%|██████████| 85/85 [09:37<00:00,  6.79s/it]


### Prepare submission


In [8]:
json_list = []

IN_DIR = "temp/batch_4/phase_b/answers/openai/snippets/"

for file in os.listdir(IN_DIR):
    df = pd.read_csv(os.path.join(IN_DIR, file))
    # The answer columns come back from CSV as strings; parse them again.
    df["answer_snippets_exact"] = df["answer_snippets_exact"].apply(ast.literal_eval)
    df["answer_snippets_ideal"] = df["answer_snippets_ideal"].apply(ast.literal_eval)

    question_type = df["questiontype"].values[0]

    # Keys are inserted in the order type, body, id, ideal_answer,
    # [exact_answer], documents, snippets.
    entry = {
        "type": question_type,
        "body": df["question"].values[0],
        "id": df["questionno"].values[0],
        "ideal_answer": df["answer_snippets_ideal"].values[0][0],
    }
    # "summary" entries carry no "exact_answer" key.
    if question_type != "summary":
        if question_type in ("factoid", "list"):
            entry["exact_answer"] = fix_list(df["answer_snippets_exact"].values[0])
        else:
            entry["exact_answer"] = df["answer_snippets_exact"].values[0][0]
    entry["documents"] = df["documents"].apply(ast.literal_eval).tolist()[0]
    entry["snippets"] = df["snippets"].apply(ast.literal_eval).tolist()[0]

    json_list.append(entry)

# Built once after the loop so that `json_out` always reflects this cell's
# `json_list` — also when IN_DIR is empty — and never a value left over from
# another cell.
json_out = {"questions": json_list}

In [9]:
# Write the `json_out` assembled in the previous cell; ensure_ascii=False keeps
# non-ASCII characters unescaped in the output.
with open("temp/batch_4/phase_b/submission/mibi_rag_snippet.json", "w") as f:
    json.dump(json_out, f, indent=4, ensure_ascii=False)

In [None]:
# for d in json_list:
#     try:
#         print(d["exact_answer"])
#     except:
#         print("NO ANSWER")
#     print(d["ideal_answer"])
#     print()

### Answers based on abstracts


In [None]:
# Cut-offs for the Phase B re-ranking in the next cell.
# NOTE(review): these rebind the RERANK_CROSS / RERANK_BI values set for the
# Phase A cascade earlier in the notebook, so Phase A cells re-run after this
# point pick up these values.
RERANK_CROSS = 10
RERANK_BI = 3

In [None]:
from get_pubmed_documents import get_title_abstract

ABSTRACTS_FOR_ANSWER = 3

for d in tqdm(data):
    # One-row frame per question; the column names are aligned with the ones
    # used by the Phase A CSVs.
    df = pd.json_normalize(d)
    df["snippets_extracted"] = df.apply(get_snippets_from_df, axis=1)
    df.rename(
        columns={"body": "question", "id": "questionno", "type": "questiontype"},
        inplace=True,
    )

    # Fetch (title, abstract) for every document listed for the question and
    # split the pairs in a single pass.
    titles_abstracts = [get_title_abstract(url) for url in df["documents"].tolist()[0]]
    titles = [pair[0] for pair in titles_abstracts]
    abstracts = [pair[1] for pair in titles_abstracts]

    question = Question(
        id=df["questionno"].values[0],
        type=df["questiontype"].values[0],
        body=df["question"].values[0],
    )

    # The abstract goes into the "text" column that is read back below.
    reranked = pd.DataFrame({"title": titles, "text": abstracts})
    reranked["cos_sim"] = rerank_crossencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_CROSS)
    reranked = reranked.drop("cos_sim", axis=1)

    reranked["cos_sim"] = rerank_biencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_BI)

    # Context: the best-ranked abstracts, at most ABSTRACTS_FOR_ANSWER of them.
    context = " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER])

    df["answer_abstracts_exact"] = [
        response_exact_answer(
            df["question"].values[0], df["questiontype"].values[0], context
        )
    ] * len(df)
    df["answer_abstracts_ideal"] = [
        response_ideal_answer(
            df["question"].values[0], df["questiontype"].values[0], context
        )
    ] * len(df)

    df.to_csv(
        f"temp/batch_4/phase_b/answers/openai/abstracts/{df.questionno.values[0]}.csv",
        index=False,
    )

### Prepare submission


In [None]:
json_list = []

IN_DIR = "temp/batch_4/phase_b/answers/openai/abstracts/"

for file in os.listdir(IN_DIR):
    df = pd.read_csv(os.path.join(IN_DIR, file))
    # The answer columns come back from CSV as strings; parse them again.
    df["answer_abstracts_exact"] = df["answer_abstracts_exact"].apply(ast.literal_eval)
    df["answer_abstracts_ideal"] = df["answer_abstracts_ideal"].apply(ast.literal_eval)

    question_type = df["questiontype"].values[0]

    # Keys are inserted in the order type, body, id, ideal_answer,
    # [exact_answer], documents, snippets.
    entry = {
        "type": question_type,
        "body": df["question"].values[0],
        "id": df["questionno"].values[0],
        "ideal_answer": df["answer_abstracts_ideal"].values[0][0],
    }
    # "summary" entries carry no "exact_answer" key.
    if question_type != "summary":
        if question_type in ("factoid", "list"):
            entry["exact_answer"] = fix_list(df["answer_abstracts_exact"].values[0])
        else:
            entry["exact_answer"] = df["answer_abstracts_exact"].values[0][0]
    entry["documents"] = df["documents"].apply(ast.literal_eval).tolist()[0]
    entry["snippets"] = df["snippets"].apply(ast.literal_eval).tolist()[0]

    json_list.append(entry)

# Built once after the loop so that `json_out` always reflects this cell's
# `json_list` — also when IN_DIR is empty — and never a value left over from
# another cell.
json_out = {"questions": json_list}

In [None]:
# Write the `json_out` assembled in the previous cell; ensure_ascii=False keeps
# non-ASCII characters unescaped in the output.
with open("temp/batch_4/phase_b/submission/mibi_rag_abstract.json", "w") as f:
    json.dump(json_out, f, indent=4, ensure_ascii=False)

In [None]:
# for d in json_list:
#     print(d["body"])
#     try:
#         print(d["exact_answer"])
#     except:
#         print("NO ANSWER")
#     print(d["ideal_answer"])
#     print()

In [None]:
# reranked["snippets_blablador_gpt"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="gpt-3.5-turbo"
#     ),
#     axis=1,
# )
# reranked["snippets_blablador_mistral"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="Mistral-7B-Instruct-v0.2"
#     ),
#     axis=1,
# )
# reranked["title_snippets_gpt"] = [x[0] for x in reranked["snippets_blablador_gpt"]]
# reranked["abstract_snippets_gpt"] = [
#     x[1] for x in reranked["snippets_blablador_gpt"]
# ]
# reranked["title_snippets_mistral"] = [
#     x[0] for x in reranked["snippets_blablador_mistral"]
# ]
# reranked["abstract_snippets_mistral"] = [
#     x[1] for x in reranked["snippets_blablador_mistral"]
# ]

# reranked["offset_title_gpt"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_gpt"], x["text"]), axis=1
# )
# reranked["offset_abstract_gpt"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_gpt"], x["text"]), axis=1
# )

# reranked["offset_title_mistral"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_mistral"], x["text"]), axis=1
# )
# reranked["offset_abstract_mistral"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_mistral"], x["text"]), axis=1
# )


# reranked["answer_abstracts_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_abstracts_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_snippets_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)
# reranked["answer_snippets_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)