In [1]:
import ast
import copy
import json
import os
import pandas as pd
from tqdm import tqdm
from mibi.modules import DocumentsModule, Question
from test_utils import (
    get_snippets,
    PubMedApiRetrieve,
    rerank_biencoder,
    rerank_crossencoder,
    get_snippets,
    get_offset,
    response_exact_answer,
    response_ideal_answer,
    flat_list,
    get_snippets_blablador,
    retrieve_bm25,
    remove_stopwords_and_punctuation,
)

In [6]:
# Quoted reply about the test-set file name:
#   Hi Alexander,
#   thanks for contacting us. Indeed, there was a mistake in the filename, not the
#   contents of the file, which is now fixed.
#   New file (name) BioASQ - task12bPhaseA - testset1
# NOTE(review): the path below says "task11b" while the reply names "task12b" —
# confirm which file name is the intended one.

TEST_FILE = "temp/test_set/BioASQ-task11bPhaseA-testset1"

# Parse the JSON test set and keep only its list of question records.
with open(TEST_FILE, "r") as f:
    data = json.load(f)["questions"]

# Quick sanity check: fields of the first record and the number of questions.
print(data[0].keys())
print(len(data))

dict_keys(['id', 'type', 'body'])
85


In [3]:
# Passed as `num_results` to `PubMedApiRetrieve` (initial retrieval size per question).
RETRIEVE_TOP_K = 200
# Rows kept after sorting the retrieved documents by their BM25 score.
RERANK_BM25 = 50
# Rows kept after sorting by the score returned by `rerank_crossencoder`.
RERANK_CROSS = 25
# Rows kept after sorting by the score returned by `rerank_biencoder` (final cut written to CSV).
RERANK_BI = 10

In [7]:
# Wrap each raw test-set record in a `Question` object for the retrieval module.
questions = [Question(id=q["id"], type=q["type"], body=q["body"]) for q in data]
retrieve_abstracts = PubMedApiRetrieve(verbose=True, num_results=RETRIEVE_TOP_K)

for question in tqdm(questions):
    retrieved = retrieve_abstracts.transform([question])

    if retrieved.empty:
        # Fallback 1: retry with stop words and punctuation stripped from the question.
        question_new = copy.deepcopy(question)
        question_new.body = remove_stopwords_and_punctuation(question_new.body)
        print(f"trying new query {question_new.body}")
        print(f"original question: {question.body}")
        # Fix: this result used to be bound to a misspelled name (`etrieved`) and was
        # therefore always discarded, forcing the word-trimming loop below to run.
        retrieved = retrieve_abstracts.transform([question_new])

        # Fallback 2: drop the trailing word of the simplified query until something
        # is retrieved. Stop once no words are left; without this guard the loop
        # could never terminate for a question that yields no results at all.
        while retrieved.empty:
            remaining_words = question_new.body.split()[:-1]
            if not remaining_words:
                break
            question_new.body = " ".join(remaining_words)
            print(f"trying new query {question_new.body}")
            print(f"original question: {question.body}")
            retrieved = retrieve_abstracts.transform([question_new])

    if retrieved.empty:
        # Nothing to rerank for this question; no CSV is written for it.
        print(f"no documents retrieved for question {question.id}; skipping")
        continue

    # Stage 1: score with BM25 against the ORIGINAL question and keep the best rows.
    retrieved["bm25"] = retrieve_bm25(question, retrieved)
    # `.copy()` so the column assignments below act on an independent frame rather
    # than on a slice of `retrieved`.
    reranked = retrieved.sort_values("bm25", ascending=False).head(RERANK_BM25).copy()
    # reranked.to_csv(f"temp/reranked/bm25_{question.id}.csv", index=False)

    # Stage 2: rerank with the cross-encoder score, then drop the temporary column.
    reranked["cos_sim"] = rerank_crossencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_CROSS)
    reranked = reranked.drop("cos_sim", axis=1)

    # Stage 3: rerank with the bi-encoder score; this score stays in the output.
    reranked["cos_sim"] = rerank_biencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_BI).copy()

    # Attach the question metadata to every row so each CSV is self-describing.
    reranked["question"] = question.body
    reranked["questionno"] = question.id
    reranked["questiontype"] = question.type

    reranked.to_csv(f"temp/reranked/{question.id}.csv", index=False)
    # retrieved.to_csv(f"temp/reranked/retrieved_{question.id}.csv", index=False)

  8%|▊         | 7/85 [00:58<10:26,  8.04s/it]

trying new query MDMA ecstasy successfully used treat PTSD disorder
original question: Has MDMA(ecstasy) been successfully used to treat PTSD disorder?
trying new query MDMA ecstasy successfully used treat PTSD
original question: Has MDMA(ecstasy) been successfully used to treat PTSD disorder?
trying new query MDMA ecstasy successfully used treat
original question: Has MDMA(ecstasy) been successfully used to treat PTSD disorder?


 15%|█▌        | 13/85 [02:03<11:28,  9.56s/it]

trying new query measles immunisation best public health approach reduce incidence measles worldwide
original question: Is measles immunisation the best public health approach to reduce incidence of measles worldwide?
trying new query measles immunisation best public health approach reduce incidence measles
original question: Is measles immunisation the best public health approach to reduce incidence of measles worldwide?


 19%|█▉        | 16/85 [02:29<09:47,  8.51s/it]

trying new query proportion alteration NTRK genes attributable colorectal cancer
original question: What proportion of alteration in NTRK genes are attributable to colorectal cancer?
trying new query proportion alteration NTRK genes attributable colorectal
original question: What proportion of alteration in NTRK genes are attributable to colorectal cancer?
trying new query proportion alteration NTRK genes attributable
original question: What proportion of alteration in NTRK genes are attributable to colorectal cancer?
trying new query proportion alteration NTRK genes
original question: What proportion of alteration in NTRK genes are attributable to colorectal cancer?


 20%|██        | 17/85 [02:45<12:25, 10.97s/it]

trying new query Please list Janus Kinase inhibitors used treat Inflammatory Bowel Disease IBD
original question: Please list the Janus Kinase inhibitors used to treat Inflammatory Bowel Disease(IBD)
trying new query Please list Janus Kinase inhibitors used treat Inflammatory Bowel Disease
original question: Please list the Janus Kinase inhibitors used to treat Inflammatory Bowel Disease(IBD)
trying new query Please list Janus Kinase inhibitors used treat Inflammatory Bowel
original question: Please list the Janus Kinase inhibitors used to treat Inflammatory Bowel Disease(IBD)
trying new query Please list Janus Kinase inhibitors used treat Inflammatory
original question: Please list the Janus Kinase inhibitors used to treat Inflammatory Bowel Disease(IBD)
trying new query Please list Janus Kinase inhibitors used treat
original question: Please list the Janus Kinase inhibitors used to treat Inflammatory Bowel Disease(IBD)
trying new query Please list Janus Kinase inhibitors used
origina

 21%|██        | 18/85 [03:22<20:46, 18.61s/it]

trying new query babies young children risk severe malaria endemic areas
original question: Are only babies and young children at risk of severe malaria in endemic areas?
trying new query babies young children risk severe malaria endemic
original question: Are only babies and young children at risk of severe malaria in endemic areas?


 22%|██▏       | 19/85 [03:38<19:36, 17.82s/it]

trying new query types glucosteroids used management Duchenne muscular dystrophy
original question: What types of glucosteroids are used for the management of Duchenne muscular dystrophy?
trying new query types glucosteroids used management Duchenne muscular
original question: What types of glucosteroids are used for the management of Duchenne muscular dystrophy?
trying new query types glucosteroids used management Duchenne
original question: What types of glucosteroids are used for the management of Duchenne muscular dystrophy?
trying new query types glucosteroids used management
original question: What types of glucosteroids are used for the management of Duchenne muscular dystrophy?
trying new query types glucosteroids used
original question: What types of glucosteroids are used for the management of Duchenne muscular dystrophy?


 25%|██▍       | 21/85 [04:05<16:10, 15.17s/it]

trying new query oncogene somatic mutations associated situ carcinoma evolution colonic polyp adenomas
original question: Which oncogene somatic mutations are associated to in situ carcinoma evolution from colonic polyp adenomas?
trying new query oncogene somatic mutations associated situ carcinoma evolution colonic polyp
original question: Which oncogene somatic mutations are associated to in situ carcinoma evolution from colonic polyp adenomas?
trying new query oncogene somatic mutations associated situ carcinoma evolution colonic
original question: Which oncogene somatic mutations are associated to in situ carcinoma evolution from colonic polyp adenomas?
trying new query oncogene somatic mutations associated situ carcinoma evolution
original question: Which oncogene somatic mutations are associated to in situ carcinoma evolution from colonic polyp adenomas?
trying new query oncogene somatic mutations associated situ carcinoma
original question: Which oncogene somatic mutations are a

 29%|██▉       | 25/85 [04:55<11:59, 12.00s/it]

trying new query medication tested PEMMELA trial
original question: What medication were tested in the PEMMELA trial?
trying new query medication tested PEMMELA
original question: What medication were tested in the PEMMELA trial?
trying new query medication tested
original question: What medication were tested in the PEMMELA trial?


 31%|███       | 26/85 [05:16<14:10, 14.42s/it]

trying new query pathophysiological mechanism microbiota produce malignant lesions colonic mucosa
original question: What is the pathophysiological mechanism by which the microbiota produce malignant lesions in colonic mucosa?
trying new query pathophysiological mechanism microbiota produce malignant lesions colonic
original question: What is the pathophysiological mechanism by which the microbiota produce malignant lesions in colonic mucosa?
trying new query pathophysiological mechanism microbiota produce malignant lesions
original question: What is the pathophysiological mechanism by which the microbiota produce malignant lesions in colonic mucosa?
trying new query pathophysiological mechanism microbiota produce malignant
original question: What is the pathophysiological mechanism by which the microbiota produce malignant lesions in colonic mucosa?
trying new query pathophysiological mechanism microbiota produce
original question: What is the pathophysiological mechanism by which the

 32%|███▏      | 27/85 [05:39<16:28, 17.03s/it]

trying new query carotenodermia caused excess lycopene diet
original question: is carotenodermia caused by an excess of lycopene in the diet?
trying new query carotenodermia caused excess lycopene
original question: is carotenodermia caused by an excess of lycopene in the diet?
trying new query carotenodermia caused excess
original question: is carotenodermia caused by an excess of lycopene in the diet?


 33%|███▎      | 28/85 [05:54<15:37, 16.46s/it]

trying new query two main active ingredients standard PrEP Pre-Exposure Prophylaxis
original question: Which are the two main active ingredients of the standard PrEP (Pre-Exposure Prophylaxis)?
trying new query two main active ingredients standard PrEP Pre-Exposure
original question: Which are the two main active ingredients of the standard PrEP (Pre-Exposure Prophylaxis)?
trying new query two main active ingredients standard PrEP
original question: Which are the two main active ingredients of the standard PrEP (Pre-Exposure Prophylaxis)?
trying new query two main active ingredients standard
original question: Which are the two main active ingredients of the standard PrEP (Pre-Exposure Prophylaxis)?


 54%|█████▍    | 46/85 [08:50<06:30, 10.02s/it]

trying new query sectoral Heterochromia associated Crohn 's disease
original question: Is sectoral Heterochromia associated with Crohn's disease?
trying new query sectoral Heterochromia associated Crohn 's
original question: Is sectoral Heterochromia associated with Crohn's disease?
trying new query sectoral Heterochromia associated Crohn
original question: Is sectoral Heterochromia associated with Crohn's disease?
trying new query sectoral Heterochromia associated
original question: Is sectoral Heterochromia associated with Crohn's disease?


 55%|█████▌    | 47/85 [09:07<07:43, 12.19s/it]

trying new query List effective treatment methods Madelung disease
original question: List the most effective treatment methods for Madelung disease.
trying new query List effective treatment methods Madelung
original question: List the most effective treatment methods for Madelung disease.
trying new query List effective treatment methods
original question: List the most effective treatment methods for Madelung disease.


 56%|█████▋    | 48/85 [09:25<08:38, 14.01s/it]

trying new query common side effects progesterone-only pill POP
original question: What are common side effects of the progesterone-only pill (POP)?
trying new query common side effects progesterone-only pill
original question: What are common side effects of the progesterone-only pill (POP)?


 59%|█████▉    | 50/85 [09:43<06:34, 11.27s/it]

trying new query Explain difference eosinophilic esophagitis reflux-induced esophagitis
original question: Explain the difference between eosinophilic esophagitis and reflux-induced esophagitis.
trying new query Explain difference eosinophilic esophagitis reflux-induced
original question: Explain the difference between eosinophilic esophagitis and reflux-induced esophagitis.
trying new query Explain difference eosinophilic esophagitis
original question: Explain the difference between eosinophilic esophagitis and reflux-induced esophagitis.


 74%|███████▍  | 63/85 [11:53<03:06,  8.47s/it]

trying new query find individual suspect overdosed
original question: What should you do if you find an individual that you suspect has overdosed?
trying new query find individual suspect
original question: What should you do if you find an individual that you suspect has overdosed?


 76%|███████▋  | 65/85 [12:18<03:19, 10.00s/it]

trying new query Please summarize MuSK Antibody positive Myasthenia Gravis
original question: Please summarize MuSK Antibody positive Myasthenia Gravis.
trying new query Please summarize MuSK Antibody positive Myasthenia
original question: Please summarize MuSK Antibody positive Myasthenia Gravis.
trying new query Please summarize MuSK Antibody positive
original question: Please summarize MuSK Antibody positive Myasthenia Gravis.
trying new query Please summarize MuSK Antibody
original question: Please summarize MuSK Antibody positive Myasthenia Gravis.
trying new query Please summarize MuSK
original question: Please summarize MuSK Antibody positive Myasthenia Gravis.


 78%|███████▊  | 66/85 [12:36<03:59, 12.59s/it]

trying new query advisable bring fever supposedly meant assist fight disease
original question: Is it advisable to bring the fever down when it is supposedly meant to assist in the fight of disease?
trying new query advisable bring fever supposedly meant assist fight
original question: Is it advisable to bring the fever down when it is supposedly meant to assist in the fight of disease?
trying new query advisable bring fever supposedly meant assist
original question: Is it advisable to bring the fever down when it is supposedly meant to assist in the fight of disease?
trying new query advisable bring fever supposedly meant
original question: Is it advisable to bring the fever down when it is supposedly meant to assist in the fight of disease?
trying new query advisable bring fever supposedly
original question: Is it advisable to bring the fever down when it is supposedly meant to assist in the fight of disease?
trying new query advisable bring fever
original question: Is it advisable t

 91%|█████████ | 77/85 [14:28<01:14,  9.36s/it]

trying new query Please list congenital fibrinogen disorders
original question: Please list the congenital fibrinogen disorders.
trying new query Please list congenital fibrinogen
original question: Please list the congenital fibrinogen disorders.
trying new query Please list congenital
original question: Please list the congenital fibrinogen disorders.


 93%|█████████▎| 79/85 [14:52<01:02, 10.44s/it]

trying new query administration route zavegepant
original question: What is the administration route of zavegepant?
trying new query administration route
original question: What is the administration route of zavegepant?


 94%|█████████▍| 80/85 [15:07<00:59, 11.88s/it]

trying new query Brunner 's gland hamartoma BGH often asymptomatic usually diagnosed
original question: Brunner's gland hamartoma (BGH) is often asymptomatic and so how is it usually diagnosed?
trying new query Brunner 's gland hamartoma BGH often asymptomatic usually
original question: Brunner's gland hamartoma (BGH) is often asymptomatic and so how is it usually diagnosed?


 95%|█████████▌| 81/85 [15:18<00:46, 11.73s/it]

trying new query peritoneal dialysis best option infants kidney failure
original question: Is peritoneal dialysis the best option for infants with kidney failure?
trying new query peritoneal dialysis best option infants kidney
original question: Is peritoneal dialysis the best option for infants with kidney failure?
trying new query peritoneal dialysis best option infants
original question: Is peritoneal dialysis the best option for infants with kidney failure?


100%|██████████| 85/85 [16:01<00:00, 11.31s/it]


### OpenAI API for snippets (SKIP)


In [7]:
questions = []

IN_DIR = "temp/reranked/"

# For every reranked CSV: extract snippets per row, split them into their title and
# abstract parts, compute offsets for both, and write the enriched frame back out.
for file in tqdm(os.listdir(IN_DIR)):
    out_path = f"temp/snippets/openai/{file}"
    reranked = pd.read_csv(os.path.join(IN_DIR, file))

    reranked["snippets"] = reranked.apply(
        lambda row: get_snippets(row["question"], row["title"], row["text"]),
        axis=1,
    )
    # Element 0 of each result is used as the title snippets, element 1 as the
    # abstract snippets.
    reranked["title_snippets"] = [pair[0] for pair in reranked["snippets"]]
    reranked["abstract_snippets"] = [pair[1] for pair in reranked["snippets"]]

    # NOTE(review): the title offsets are computed against `text`, same as the
    # abstract offsets — confirm this is intended rather than `title`.
    reranked["offset_title"] = reranked.apply(
        lambda row: get_offset(row["title_snippets"], row["text"]),
        axis=1,
    )
    reranked["offset_abstract"] = reranked.apply(
        lambda row: get_offset(row["abstract_snippets"], row["text"]),
        axis=1,
    )
    reranked.to_csv(out_path, index=False)

100%|██████████| 5/5 [01:38<00:00, 19.73s/it]


### OpenAI API for answers


In [2]:
# Which snippet source the answers are generated from; selects the input folder,
# the output folder and the snippet column names used in later cells.
choice = "mistral"

IN_DIR = f"temp/snippets/{choice}/"
OUT_DIR = f"temp/answers/openai/{choice}/"

# Create the output folder on first use so that listing it (and later writing
# into it) does not fail on a fresh checkout.
os.makedirs(OUT_DIR, exist_ok=True)

# Files already present in OUT_DIR; the answers cell skips these.
processed_files = os.listdir(OUT_DIR)

In [15]:
# Number of (non-missing) abstracts concatenated into the abstract-based context.
ABSTRACTS_FOR_ANSWER = 3

# Snippet columns per snippet source, always as [abstract column, title column].
# Fix: the "mistral" entry used to be nested one level deeper than the others and
# the code indexed `[choice][0][0]`, which only worked for "mistral" (for the flat
# entries it would have selected a single character of the column name).
COLUMNS_FOR_ANSWER = {
    "openai": ["abstract_snippets", "title_snippets"],
    "gpt": ["abstract_snippets_gpt", "title_snippets_gpt"],
    "mistral": ["abstract_snippets_mistral", "title_snippets_mistral"],
}
abstract_snippets_col, title_snippets_col = COLUMNS_FOR_ANSWER[choice]

for file in tqdm(os.listdir(IN_DIR)):
    # Skip files that already had an answer file when `processed_files` was listed.
    if file in processed_files:
        continue

    reranked = pd.read_csv(os.path.join(IN_DIR, file))

    # Every row of a file carries the same question, so the first row is enough.
    question_body = reranked["question"].tolist()[0]
    question_type = reranked["questiontype"].tolist()[0]

    # Context 1: the first ABSTRACTS_FOR_ANSWER abstracts, ignoring missing ones
    # (a missing CSV cell is read back as NaN, whose string form is "nan").
    abstracts_context = " ".join(
        [a for a in reranked["text"].tolist() if str(a) != "nan"][
            :ABSTRACTS_FOR_ANSWER
        ]
    )
    # Context 2: all abstract snippets followed by all title snippets.
    # NOTE(review): joined WITHOUT a separator, exactly as before; whether a space
    # is wanted depends on what `flat_list` returns for these CSV columns — confirm.
    snippets_context = "".join(
        flat_list(reranked[abstract_snippets_col].tolist())
        + flat_list(reranked[title_snippets_col].tolist())
    )

    # One model call per (context, answer kind); the single result is repeated on
    # every row so the output keeps one row per document.
    n_rows = len(reranked)
    reranked["answer_abstracts_exact"] = [
        response_exact_answer(question_body, question_type, abstracts_context)
    ] * n_rows
    reranked["answer_abstracts_ideal"] = [
        response_ideal_answer(question_body, question_type, abstracts_context)
    ] * n_rows
    reranked["answer_snippets_exact"] = [
        response_exact_answer(question_body, question_type, snippets_context)
    ] * n_rows
    reranked["answer_snippets_ideal"] = [
        response_ideal_answer(question_body, question_type, snippets_context)
    ] * n_rows

    # Write next to the files listed in `processed_files` (OUT_DIR) instead of
    # rebuilding the same path by hand.
    reranked.to_csv(os.path.join(OUT_DIR, file), index=False)

 95%|█████████▌| 81/85 [00:29<00:02,  1.93it/s]

Trying another way


100%|██████████| 85/85 [01:03<00:00,  1.33it/s]


### Prepare submission


In [16]:
def return_exact_answer(row, ABSTRACTS_OR_SNIPPETS):
    """Pick the exact answer stored on ``row`` for the given answer source.

    Args:
        row: Record exposing ``questiontype`` as an attribute and the column
            ``answer_<ABSTRACTS_OR_SNIPPETS>_exact`` by key.
        ABSTRACTS_OR_SNIPPETS: Middle part of the answer column name
            (the notebook uses ``"abstracts"`` and ``"snippets"``).

    Returns:
        The first element of the stored answer for ``"yesno"`` and ``"summary"``
        questions; the stored answer unchanged for every other question type.
    """
    takes_first_element = row.questiontype in ("yesno", "summary")
    stored_answer = row[f"answer_{ABSTRACTS_OR_SNIPPETS}_exact"]
    if takes_first_element:
        return stored_answer[0]
    return stored_answer


def _section_snippet_entries(row, choice, section):
    """Build the snippet dicts of one row for one section ("abstract" or "title").

    Reads ``<section>_snippets_<choice>`` and ``offset_<section>_<choice>`` from
    ``row``. A snippet whose offsets cannot be looked up is skipped (best effort).
    """
    entries = []
    snippets = row[f"{section}_snippets_{choice}"]
    if not snippets:
        return entries
    offsets_key = f"offset_{section}_{choice}"
    for i, snippet in enumerate(snippets):
        try:
            entries.append(
                {
                    "document": row["url"],
                    "text": snippet,
                    "offsetInBeginSection": row[offsets_key][i][0],
                    "offsetInEndSection": row[offsets_key][i][1],
                    "beginSection": section,
                    "endSection": section,
                }
            )
        except (IndexError, KeyError, TypeError):
            # Missing or malformed offsets for this snippet: skip it. Only lookup
            # errors are swallowed now; a bare `except` also hid interrupts and
            # genuine programming errors.
            continue
    return entries


def return_snippets(reranked, choice):
    """Collect snippet dicts for all rows of ``reranked``.

    For each row the abstract snippets are used; the title snippets are used only
    when no abstract snippet could be emitted for that row.

    Args:
        reranked: Object with ``iterrows()`` whose rows provide ``url`` plus the
            ``*_snippets_<choice>`` and ``offset_*_<choice>`` columns.
        choice: Suffix of the snippet/offset column names.

    Returns:
        A flat list of dicts with the keys ``document``, ``text``,
        ``offsetInBeginSection``, ``offsetInEndSection``, ``beginSection`` and
        ``endSection``, in row order.
    """
    output = []
    for _, row in reranked.iterrows():
        entries = _section_snippet_entries(row, choice, "abstract")
        if not entries:
            entries = _section_snippet_entries(row, choice, "title")
        output.extend(entries)
    return output


def fix_list(l):
    """Normalise a list of answers into at most five unique single-item lists.

    Args:
        l: Iterable whose items are strings or lists. A string ``s`` becomes
            ``[s]``; a one-element list is kept as is; a longer list is split
            into one single-item list per element. Empty lists and items of any
            other type are ignored.

    Returns:
        The first five distinct single-item lists, in order of first appearance.
    """
    l_out = []
    for item in l:
        if isinstance(item, list):
            candidates = [item] if len(item) == 1 else [[subitem] for subitem in item]
        elif isinstance(item, str):
            candidates = [[item]]
        else:
            candidates = []
        for candidate in candidates:
            # Fix: one-element lists used to be checked as `[item]` but appended
            # as `item`, so the membership test never matched and duplicates
            # slipped through.
            if candidate not in l_out:
                l_out.append(candidate)

    return l_out[:5]

### Answers based on snippets


In [21]:
IN_DIR1 = f"temp/answers/openai/{choice}"
IN_DIR2 = "temp/snippets/gpt"
IN_DIR3 = "temp/snippets/mistral"

# Which generated answers go into this submission ("snippets" or "abstracts").
ABSTRACTS_OR_SNIPPETS = "snippets"

# Columns that hold Python literals stored as text in the CSVs; they are parsed
# back into Python objects before use.
literal_columns = [
    f"title_snippets_{choice}",
    f"abstract_snippets_{choice}",
    f"offset_title_{choice}",
    f"offset_abstract_{choice}",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_exact",
]

json_list = []

for file in tqdm(os.listdir(IN_DIR1)):
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    # reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))
    reranked3 = pd.read_csv(os.path.join(IN_DIR3, file))
    # reranked_merged = pd.concat([reranked1, reranked2, reranked3], axis=1)
    # Put the answer file and the snippet file side by side and keep only the
    # first occurrence of every column name.
    reranked_merged = pd.concat([reranked1, reranked3], axis=1)
    reranked_merged = reranked_merged.loc[:, ~reranked_merged.columns.duplicated()]
    for column in literal_columns:
        reranked_merged[column] = reranked_merged[column].apply(ast.literal_eval)

    question_type = reranked_merged["questiontype"].tolist()[0]
    exact_answer = return_exact_answer(reranked_merged.iloc[0], ABSTRACTS_OR_SNIPPETS)
    if question_type in ("factoid", "list"):
        # Factoid/list answers are normalised to unique single-item lists.
        exact_answer = fix_list(exact_answer)

    json_list.append(
        {
            "type": question_type,
            "body": reranked_merged["question"].tolist()[0],
            "id": reranked_merged["questionno"].tolist()[0],
            "ideal_answer": reranked_merged[
                f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"
            ].tolist()[0][0],
            "exact_answer": exact_answer,
            "documents": reranked_merged["url"].tolist(),
            "snippets": return_snippets(reranked_merged, choice)[:10],  # not good
        }
    )

# Built once after the loop, so it is defined (and not left over from a previous
# cell) even when the input directory contains no files.
json_out = {"questions": json_list}

100%|██████████| 85/85 [00:00<00:00, 243.02it/s]


In [23]:
# Persist the current `json_out` as pretty-printed JSON (4-space indent).
with open("temp/submission/system_1.json", mode="w") as f:
    json.dump(json_out, fp=f, indent=4)

### Answers based on abstracts


In [24]:
IN_DIR1 = f"temp/answers/openai/{choice}"
IN_DIR2 = "temp/snippets/gpt"
IN_DIR3 = "temp/snippets/mistral"

# Which generated answers go into this submission ("snippets" or "abstracts").
ABSTRACTS_OR_SNIPPETS = "abstracts"

# Columns that hold Python literals stored as text in the CSVs; they are parsed
# back into Python objects before use.
literal_columns = [
    f"title_snippets_{choice}",
    f"abstract_snippets_{choice}",
    f"offset_title_{choice}",
    f"offset_abstract_{choice}",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_exact",
]

json_list = []

for file in tqdm(os.listdir(IN_DIR1)):
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    # reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))
    reranked3 = pd.read_csv(os.path.join(IN_DIR3, file))
    # reranked_merged = pd.concat([reranked1, reranked2, reranked3], axis=1)
    # Put the answer file and the snippet file side by side and keep only the
    # first occurrence of every column name.
    reranked_merged = pd.concat([reranked1, reranked3], axis=1)
    reranked_merged = reranked_merged.loc[:, ~reranked_merged.columns.duplicated()]
    for column in literal_columns:
        reranked_merged[column] = reranked_merged[column].apply(ast.literal_eval)

    question_type = reranked_merged["questiontype"].tolist()[0]
    exact_answer = return_exact_answer(reranked_merged.iloc[0], ABSTRACTS_OR_SNIPPETS)
    if question_type in ("factoid", "list"):
        # Factoid/list answers are normalised to unique single-item lists.
        exact_answer = fix_list(exact_answer)

    json_list.append(
        {
            "type": question_type,
            "body": reranked_merged["question"].tolist()[0],
            "id": reranked_merged["questionno"].tolist()[0],
            "ideal_answer": reranked_merged[
                f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"
            ].tolist()[0][0],
            "exact_answer": exact_answer,
            "documents": reranked_merged["url"].tolist(),
            "snippets": return_snippets(reranked_merged, choice)[:10],  # not good
        }
    )

# Built once after the loop, so it is defined (and not left over from a previous
# cell) even when the input directory contains no files.
json_out = {"questions": json_list}

100%|██████████| 85/85 [00:00<00:00, 214.13it/s]


In [25]:
# Persist the current `json_out` as pretty-printed JSON (4-space indent).
with open("temp/submission/system_2.json", mode="w") as f:
    json.dump(json_out, fp=f, indent=4)

# Phase B


In [9]:
TEST_FILE = "../../data/bioasq-data/training12b_new.json"

# Parse the file and keep only the first 10 question records for a quick test run.
with open(TEST_FILE, "r") as f:
    data = json.load(f)["questions"][:10]

# Quick sanity check: fields of the first record and the number of records kept.
print(data[0].keys())
print(len(data))

dict_keys(['body', 'documents', 'ideal_answer', 'concepts', 'type', 'id', 'snippets'])
10


In [24]:
def get_snippets_from_df(row):
    """Return the ``"text"`` value of every entry in ``row.snippets``, in order."""
    texts = []
    for snippet in row.snippets:
        texts.append(snippet["text"])
    return texts

### Answers based on snippets


In [13]:
# Flatten only the first question record into a one-row DataFrame.
for d in data[:1]:
    df = pd.json_normalize(d)

In [25]:
# Row-wise: store the snippet texts returned by `get_snippets_from_df` in a new column.
df["snippets_extracted"] = df.apply(get_snippets_from_df, axis=1)

In [35]:
# Display the first row, including the new `snippets_extracted` column.
df.head(1)

Unnamed: 0,body,documents,ideal_answer,concepts,type,id,snippets,snippets_extracted
0,Is Hirschsprung disease a mendelian or a multi...,"[http://www.ncbi.nlm.nih.gov/pubmed/15858239, ...","[Coding sequence mutations in RET, GDNF, EDNRB...",[http://www.disease-ontology.org/api/metadata/...,summary,55031181e9bde69634000014,"[{'offsetInBeginSection': 131, 'offsetInEndSec...",[Hirschsprung disease (HSCR) is a multifactori...


In [34]:
# " ".join(df.snippets_extracted.tolist()[0])

In [6]:
# reranked["snippets_blablador_gpt"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="gpt-3.5-turbo"
#     ),
#     axis=1,
# )
# reranked["snippets_blablador_mistral"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="Mistral-7B-Instruct-v0.2"
#     ),
#     axis=1,
# )
# reranked["title_snippets_gpt"] = [x[0] for x in reranked["snippets_blablador_gpt"]]
# reranked["abstract_snippets_gpt"] = [
#     x[1] for x in reranked["snippets_blablador_gpt"]
# ]
# reranked["title_snippets_mistral"] = [
#     x[0] for x in reranked["snippets_blablador_mistral"]
# ]
# reranked["abstract_snippets_mistral"] = [
#     x[1] for x in reranked["snippets_blablador_mistral"]
# ]

# reranked["offset_title_gpt"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_gpt"], x["text"]), axis=1
# )
# reranked["offset_abstract_gpt"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_gpt"], x["text"]), axis=1
# )

# reranked["offset_title_mistral"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_mistral"], x["text"]), axis=1
# )
# reranked["offset_abstract_mistral"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_mistral"], x["text"]), axis=1
# )


# reranked["answer_abstracts_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_abstracts_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_snippets_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)
# reranked["answer_snippets_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)