In [1]:
import ast
import copy
import json
import os
import pandas as pd
from tqdm import tqdm
from mibi.modules import DocumentsModule, Question
from test_utils import (
    get_snippets,
    PubMedApiRetrieve,
    rerank_biencoder,
    rerank_crossencoder,
    get_snippets,
    get_offset,
    response_exact_answer,
    response_ideal_answer,
    flat_list,
    get_snippets_blablador,
    retrieve_bm25,
    remove_stopwords_and_punctuation,
)

In [6]:
# Reply received after reporting the test-set file name (addressed to Alexander):
# only the file NAME was wrong, the contents were fine, and it has been fixed.
# The corrected name is BioASQ-task12bPhaseA-testset1.
# NOTE(review): the path below still uses the earlier "task11b" name -- presumably
# the local copy was never renamed; confirm before re-downloading.

TEST_FILE = "temp/test_set/BioASQ-task11bPhaseA-testset1"

# Only the list stored under "questions" is needed downstream.
with open(TEST_FILE, "r") as test_file:
    data = json.load(test_file)["questions"]

# Quick sanity check: fields of the first question and the number of questions.
print(data[0].keys())
print(len(data))

dict_keys(['id', 'type', 'body'])
85


In [3]:
# Number of results requested per question (passed as num_results to the retriever).
RETRIEVE_TOP_K = 200
# Rows kept after sorting the retrieved documents by their BM25 score.
RERANK_BM25 = 50
# Rows kept after the cross-encoder re-ranking step.
RERANK_CROSS = 25
# Rows kept after the final bi-encoder re-ranking step.
RERANK_BI = 10

In [7]:
# Wrap every raw question dict in the project's Question model.
questions = [
    Question(
        id=q["id"],
        type=q["type"],
        body=q["body"],
    )
    for q in data
]
retrieve_abstracts = PubMedApiRetrieve(verbose=True, num_results=RETRIEVE_TOP_K)

# For each question: retrieve documents, fall back to simpler queries when
# nothing is found, then re-rank in three stages (BM25 -> cross-encoder ->
# bi-encoder) and store the surviving rows as one CSV per question.
for question in tqdm(questions):
    retrieved = retrieve_abstracts.transform([question])
    if retrieved.empty:
        # Fallback 1: retry with stop words and punctuation stripped from the query.
        question_new = copy.deepcopy(question)
        question_new.body = remove_stopwords_and_punctuation(question_new.body)
        print(f"trying new query {question_new.body}")
        print(f"original question: {question.body}")
        # The result must be bound to `retrieved`; otherwise this query is
        # wasted and the loop below drops a word before anything is used.
        retrieved = retrieve_abstracts.transform([question_new])

    # Fallback 2: drop the trailing word until something is found.
    while retrieved.empty:
        shortened_body = " ".join(question_new.body.split()[:-1])
        if not shortened_body:
            # Nothing left to query with; stop instead of looping forever.
            break
        question_new.body = shortened_body
        print(f"trying new query {question_new.body}")
        print(f"original question: {question.body}")
        retrieved = retrieve_abstracts.transform([question_new])

    if retrieved.empty:
        print(f"no documents retrieved for question {question.id}, skipping")
        continue

    # Scores are always computed against the ORIGINAL question, not the
    # simplified fallback query.
    retrieved["bm25"] = retrieve_bm25(question, retrieved)
    reranked = retrieved.sort_values("bm25", ascending=False).head(RERANK_BM25)
    # reranked.to_csv(f"temp/reranked/bm25_{question.id}.csv", index=False)

    reranked["cos_sim"] = rerank_crossencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_CROSS)
    # Drop the cross-encoder score so the column can hold the bi-encoder score.
    reranked = reranked.drop("cos_sim", axis=1)

    reranked["cos_sim"] = rerank_biencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_BI)

    # Repeat the question metadata on every row so each CSV is self-contained.
    reranked["question"] = [question.body] * len(reranked)
    reranked["questionno"] = [question.id] * len(reranked)
    reranked["questiontype"] = [question.type] * len(reranked)

    reranked.to_csv(f"temp/reranked/{question.id}.csv", index=False)
    # retrieved.to_csv(f"temp/reranked/retrieved_{question.id}.csv", index=False)

  8%|▊         | 7/85 [00:58<10:26,  8.04s/it]

trying new query MDMA ecstasy successfully used treat PTSD disorder
original question: Has MDMA(ecstasy) been successfully used to treat PTSD disorder?
trying new query MDMA ecstasy successfully used treat PTSD
original question: Has MDMA(ecstasy) been successfully used to treat PTSD disorder?
trying new query MDMA ecstasy successfully used treat
original question: Has MDMA(ecstasy) been successfully used to treat PTSD disorder?


 15%|█▌        | 13/85 [02:03<11:28,  9.56s/it]

trying new query measles immunisation best public health approach reduce incidence measles worldwide
original question: Is measles immunisation the best public health approach to reduce incidence of measles worldwide?
trying new query measles immunisation best public health approach reduce incidence measles
original question: Is measles immunisation the best public health approach to reduce incidence of measles worldwide?


 19%|█▉        | 16/85 [02:29<09:47,  8.51s/it]

trying new query proportion alteration NTRK genes attributable colorectal cancer
original question: What proportion of alteration in NTRK genes are attributable to colorectal cancer?
trying new query proportion alteration NTRK genes attributable colorectal
original question: What proportion of alteration in NTRK genes are attributable to colorectal cancer?
trying new query proportion alteration NTRK genes attributable
original question: What proportion of alteration in NTRK genes are attributable to colorectal cancer?
trying new query proportion alteration NTRK genes
original question: What proportion of alteration in NTRK genes are attributable to colorectal cancer?


 20%|██        | 17/85 [02:45<12:25, 10.97s/it]

trying new query Please list Janus Kinase inhibitors used treat Inflammatory Bowel Disease IBD
original question: Please list the Janus Kinase inhibitors used to treat Inflammatory Bowel Disease(IBD)
trying new query Please list Janus Kinase inhibitors used treat Inflammatory Bowel Disease
original question: Please list the Janus Kinase inhibitors used to treat Inflammatory Bowel Disease(IBD)
trying new query Please list Janus Kinase inhibitors used treat Inflammatory Bowel
original question: Please list the Janus Kinase inhibitors used to treat Inflammatory Bowel Disease(IBD)
trying new query Please list Janus Kinase inhibitors used treat Inflammatory
original question: Please list the Janus Kinase inhibitors used to treat Inflammatory Bowel Disease(IBD)
trying new query Please list Janus Kinase inhibitors used treat
original question: Please list the Janus Kinase inhibitors used to treat Inflammatory Bowel Disease(IBD)
trying new query Please list Janus Kinase inhibitors used
origina

 21%|██        | 18/85 [03:22<20:46, 18.61s/it]

trying new query babies young children risk severe malaria endemic areas
original question: Are only babies and young children at risk of severe malaria in endemic areas?
trying new query babies young children risk severe malaria endemic
original question: Are only babies and young children at risk of severe malaria in endemic areas?


 22%|██▏       | 19/85 [03:38<19:36, 17.82s/it]

trying new query types glucosteroids used management Duchenne muscular dystrophy
original question: What types of glucosteroids are used for the management of Duchenne muscular dystrophy?
trying new query types glucosteroids used management Duchenne muscular
original question: What types of glucosteroids are used for the management of Duchenne muscular dystrophy?
trying new query types glucosteroids used management Duchenne
original question: What types of glucosteroids are used for the management of Duchenne muscular dystrophy?
trying new query types glucosteroids used management
original question: What types of glucosteroids are used for the management of Duchenne muscular dystrophy?
trying new query types glucosteroids used
original question: What types of glucosteroids are used for the management of Duchenne muscular dystrophy?


 25%|██▍       | 21/85 [04:05<16:10, 15.17s/it]

trying new query oncogene somatic mutations associated situ carcinoma evolution colonic polyp adenomas
original question: Which oncogene somatic mutations are associated to in situ carcinoma evolution from colonic polyp adenomas?
trying new query oncogene somatic mutations associated situ carcinoma evolution colonic polyp
original question: Which oncogene somatic mutations are associated to in situ carcinoma evolution from colonic polyp adenomas?
trying new query oncogene somatic mutations associated situ carcinoma evolution colonic
original question: Which oncogene somatic mutations are associated to in situ carcinoma evolution from colonic polyp adenomas?
trying new query oncogene somatic mutations associated situ carcinoma evolution
original question: Which oncogene somatic mutations are associated to in situ carcinoma evolution from colonic polyp adenomas?
trying new query oncogene somatic mutations associated situ carcinoma
original question: Which oncogene somatic mutations are a

 29%|██▉       | 25/85 [04:55<11:59, 12.00s/it]

trying new query medication tested PEMMELA trial
original question: What medication were tested in the PEMMELA trial?
trying new query medication tested PEMMELA
original question: What medication were tested in the PEMMELA trial?
trying new query medication tested
original question: What medication were tested in the PEMMELA trial?


 31%|███       | 26/85 [05:16<14:10, 14.42s/it]

trying new query pathophysiological mechanism microbiota produce malignant lesions colonic mucosa
original question: What is the pathophysiological mechanism by which the microbiota produce malignant lesions in colonic mucosa?
trying new query pathophysiological mechanism microbiota produce malignant lesions colonic
original question: What is the pathophysiological mechanism by which the microbiota produce malignant lesions in colonic mucosa?
trying new query pathophysiological mechanism microbiota produce malignant lesions
original question: What is the pathophysiological mechanism by which the microbiota produce malignant lesions in colonic mucosa?
trying new query pathophysiological mechanism microbiota produce malignant
original question: What is the pathophysiological mechanism by which the microbiota produce malignant lesions in colonic mucosa?
trying new query pathophysiological mechanism microbiota produce
original question: What is the pathophysiological mechanism by which the

 32%|███▏      | 27/85 [05:39<16:28, 17.03s/it]

trying new query carotenodermia caused excess lycopene diet
original question: is carotenodermia caused by an excess of lycopene in the diet?
trying new query carotenodermia caused excess lycopene
original question: is carotenodermia caused by an excess of lycopene in the diet?
trying new query carotenodermia caused excess
original question: is carotenodermia caused by an excess of lycopene in the diet?


 33%|███▎      | 28/85 [05:54<15:37, 16.46s/it]

trying new query two main active ingredients standard PrEP Pre-Exposure Prophylaxis
original question: Which are the two main active ingredients of the standard PrEP (Pre-Exposure Prophylaxis)?
trying new query two main active ingredients standard PrEP Pre-Exposure
original question: Which are the two main active ingredients of the standard PrEP (Pre-Exposure Prophylaxis)?
trying new query two main active ingredients standard PrEP
original question: Which are the two main active ingredients of the standard PrEP (Pre-Exposure Prophylaxis)?
trying new query two main active ingredients standard
original question: Which are the two main active ingredients of the standard PrEP (Pre-Exposure Prophylaxis)?


 54%|█████▍    | 46/85 [08:50<06:30, 10.02s/it]

trying new query sectoral Heterochromia associated Crohn 's disease
original question: Is sectoral Heterochromia associated with Crohn's disease?
trying new query sectoral Heterochromia associated Crohn 's
original question: Is sectoral Heterochromia associated with Crohn's disease?
trying new query sectoral Heterochromia associated Crohn
original question: Is sectoral Heterochromia associated with Crohn's disease?
trying new query sectoral Heterochromia associated
original question: Is sectoral Heterochromia associated with Crohn's disease?


 55%|█████▌    | 47/85 [09:07<07:43, 12.19s/it]

trying new query List effective treatment methods Madelung disease
original question: List the most effective treatment methods for Madelung disease.
trying new query List effective treatment methods Madelung
original question: List the most effective treatment methods for Madelung disease.
trying new query List effective treatment methods
original question: List the most effective treatment methods for Madelung disease.


 56%|█████▋    | 48/85 [09:25<08:38, 14.01s/it]

trying new query common side effects progesterone-only pill POP
original question: What are common side effects of the progesterone-only pill (POP)?
trying new query common side effects progesterone-only pill
original question: What are common side effects of the progesterone-only pill (POP)?


 59%|█████▉    | 50/85 [09:43<06:34, 11.27s/it]

trying new query Explain difference eosinophilic esophagitis reflux-induced esophagitis
original question: Explain the difference between eosinophilic esophagitis and reflux-induced esophagitis.
trying new query Explain difference eosinophilic esophagitis reflux-induced
original question: Explain the difference between eosinophilic esophagitis and reflux-induced esophagitis.
trying new query Explain difference eosinophilic esophagitis
original question: Explain the difference between eosinophilic esophagitis and reflux-induced esophagitis.


 74%|███████▍  | 63/85 [11:53<03:06,  8.47s/it]

trying new query find individual suspect overdosed
original question: What should you do if you find an individual that you suspect has overdosed?
trying new query find individual suspect
original question: What should you do if you find an individual that you suspect has overdosed?


 76%|███████▋  | 65/85 [12:18<03:19, 10.00s/it]

trying new query Please summarize MuSK Antibody positive Myasthenia Gravis
original question: Please summarize MuSK Antibody positive Myasthenia Gravis.
trying new query Please summarize MuSK Antibody positive Myasthenia
original question: Please summarize MuSK Antibody positive Myasthenia Gravis.
trying new query Please summarize MuSK Antibody positive
original question: Please summarize MuSK Antibody positive Myasthenia Gravis.
trying new query Please summarize MuSK Antibody
original question: Please summarize MuSK Antibody positive Myasthenia Gravis.
trying new query Please summarize MuSK
original question: Please summarize MuSK Antibody positive Myasthenia Gravis.


 78%|███████▊  | 66/85 [12:36<03:59, 12.59s/it]

trying new query advisable bring fever supposedly meant assist fight disease
original question: Is it advisable to bring the fever down when it is supposedly meant to assist in the fight of disease?
trying new query advisable bring fever supposedly meant assist fight
original question: Is it advisable to bring the fever down when it is supposedly meant to assist in the fight of disease?
trying new query advisable bring fever supposedly meant assist
original question: Is it advisable to bring the fever down when it is supposedly meant to assist in the fight of disease?
trying new query advisable bring fever supposedly meant
original question: Is it advisable to bring the fever down when it is supposedly meant to assist in the fight of disease?
trying new query advisable bring fever supposedly
original question: Is it advisable to bring the fever down when it is supposedly meant to assist in the fight of disease?
trying new query advisable bring fever
original question: Is it advisable t

 91%|█████████ | 77/85 [14:28<01:14,  9.36s/it]

trying new query Please list congenital fibrinogen disorders
original question: Please list the congenital fibrinogen disorders.
trying new query Please list congenital fibrinogen
original question: Please list the congenital fibrinogen disorders.
trying new query Please list congenital
original question: Please list the congenital fibrinogen disorders.


 93%|█████████▎| 79/85 [14:52<01:02, 10.44s/it]

trying new query administration route zavegepant
original question: What is the administration route of zavegepant?
trying new query administration route
original question: What is the administration route of zavegepant?


 94%|█████████▍| 80/85 [15:07<00:59, 11.88s/it]

trying new query Brunner 's gland hamartoma BGH often asymptomatic usually diagnosed
original question: Brunner's gland hamartoma (BGH) is often asymptomatic and so how is it usually diagnosed?
trying new query Brunner 's gland hamartoma BGH often asymptomatic usually
original question: Brunner's gland hamartoma (BGH) is often asymptomatic and so how is it usually diagnosed?


 95%|█████████▌| 81/85 [15:18<00:46, 11.73s/it]

trying new query peritoneal dialysis best option infants kidney failure
original question: Is peritoneal dialysis the best option for infants with kidney failure?
trying new query peritoneal dialysis best option infants kidney
original question: Is peritoneal dialysis the best option for infants with kidney failure?
trying new query peritoneal dialysis best option infants
original question: Is peritoneal dialysis the best option for infants with kidney failure?


100%|██████████| 85/85 [16:01<00:00, 11.31s/it]


### OpenAI API for snippets (SKIP)


In [7]:
# Rebind `questions` to an empty list before processing the re-ranked files.
questions = []

IN_DIR = "temp/reranked/"

# For every re-ranked CSV: extract snippets per row, split them into title and
# abstract snippets, compute their offsets and write the enriched frame out.
for file in tqdm(os.listdir(IN_DIR)):
    reranked = pd.read_csv(os.path.join(IN_DIR, file))

    snippets_per_row = reranked.apply(
        lambda row: get_snippets(row["question"], row["title"], row["text"]),
        axis=1,
    )
    reranked["snippets"] = snippets_per_row

    # Element 0 of each result goes to the title column, element 1 to the abstract column.
    reranked["title_snippets"] = list(
        map(lambda pair: pair[0], reranked["snippets"])
    )
    reranked["abstract_snippets"] = list(
        map(lambda pair: pair[1], reranked["snippets"])
    )

    # NOTE(review): title offsets are computed against row["text"], not
    # row["title"] -- confirm this is what get_offset expects.
    reranked["offset_title"] = reranked.apply(
        lambda row: get_offset(row["title_snippets"], row["text"]), axis=1
    )
    reranked["offset_abstract"] = reranked.apply(
        lambda row: get_offset(row["abstract_snippets"], row["text"]), axis=1
    )

    reranked.to_csv(f"temp/snippets/openai/{file}", index=False)

100%|██████████| 5/5 [01:38<00:00, 19.73s/it]


### OpenAI API for answers


In [2]:
# Pipeline variant name; interpolated into the directory paths used by the cells below.
choice = "pyterrier"

### PyTerrier snippets


In [3]:
IN_DIR = f"temp/batch_2/snippets/{choice}/"

# Files that already have answers are skipped, so the cell can be re-run
# after an interruption without repeating finished work.
processed_files = os.listdir("temp/batch_2/answers/openai/pyterrier_snippets/")

for file in tqdm(os.listdir(IN_DIR)):
    if file in processed_files:
        continue

    reranked = pd.read_csv(os.path.join(IN_DIR, file))

    # Question text and type are read from the first row of the file.
    question_body = reranked["question"].tolist()[0]
    question_type = reranked["questiontype"].tolist()[0]
    # All snippet texts of the file, joined with spaces, form the answer context.
    snippet_context = " ".join(reranked["text"].tolist())

    # One answer per file, repeated on every row of the frame.
    exact_answer = response_exact_answer(question_body, question_type, snippet_context)
    reranked["answer_snippets_exact"] = [exact_answer] * len(reranked)

    ideal_answer = response_ideal_answer(question_body, question_type, snippet_context)
    reranked["answer_snippets_ideal"] = [ideal_answer] * len(reranked)

    reranked.to_csv(
        f"temp/batch_2/answers/openai/pyterrier_snippets/{file}", index=False
    )

 16%|█▋        | 14/85 [00:44<03:26,  2.91s/it]

Trying another way


 28%|██▊       | 24/85 [01:14<02:49,  2.79s/it]

Trying another way


 29%|██▉       | 25/85 [01:18<03:20,  3.34s/it]

Trying another way


 32%|███▏      | 27/85 [01:24<02:46,  2.87s/it]

Trying another way


 35%|███▌      | 30/85 [01:31<02:28,  2.69s/it]

Trying another way


 44%|████▎     | 37/85 [01:55<02:10,  2.73s/it]

Trying another way


 87%|████████▋ | 74/85 [03:52<00:40,  3.69s/it]

Trying another way


100%|██████████| 85/85 [04:35<00:00,  3.24s/it]


### PyTerrier abstracts


In [4]:
IN_DIR = f"temp/batch_2/reranked/{choice}/"
# Files that already have answers are skipped on re-runs.
processed_files = os.listdir("temp/batch_2/answers/openai/pyterrier_docs/")

# Number of non-missing abstracts concatenated into the answer context.
ABSTRACTS_FOR_ANSWER = 3

for file in tqdm(os.listdir(IN_DIR)):
    if file in processed_files:
        continue

    reranked = pd.read_csv(os.path.join(IN_DIR, file))

    # Question text and type are read from the first row of the file.
    question_body = reranked["question"].tolist()[0]
    question_type = reranked["questiontype"].tolist()[0]

    # Drop missing abstracts (they stringify to "nan"), then keep the first few.
    usable_abstracts = [
        a for a in reranked["abstract"].tolist() if str(a) != "nan"
    ]
    abstract_context = " ".join(usable_abstracts[:ABSTRACTS_FOR_ANSWER])

    # One answer per file, repeated on every row of the frame.
    exact_answer = response_exact_answer(question_body, question_type, abstract_context)
    reranked["answer_abstracts_exact"] = [exact_answer] * len(reranked)

    ideal_answer = response_ideal_answer(question_body, question_type, abstract_context)
    reranked["answer_abstracts_ideal"] = [ideal_answer] * len(reranked)

    reranked.to_csv(
        f"temp/batch_2/answers/openai/pyterrier_docs/{file}", index=False
    )

  9%|▉         | 8/85 [00:30<05:28,  4.26s/it]

Trying another way


 21%|██        | 18/85 [01:06<03:48,  3.40s/it]

Trying another way


 28%|██▊       | 24/85 [01:25<02:52,  2.83s/it]

Trying another way


 65%|██████▍   | 55/85 [03:04<01:20,  2.69s/it]

Trying another way


 88%|████████▊ | 75/85 [04:04<00:28,  2.85s/it]

Trying another way


 89%|████████▉ | 76/85 [04:09<00:32,  3.57s/it]

Trying another way


100%|██████████| 85/85 [04:37<00:00,  3.27s/it]


### Mistral


In [15]:
# Number of non-missing texts concatenated into the abstract-based context.
ABSTRACTS_FOR_ANSWER = 3

# Maps a snippet source to a list of [abstract_snippets_column, title_snippets_column]
# pairs. Every entry uses the same nesting so that the `[choice][0][0]` /
# `[choice][0][1]` lookups below work for all sources, not only for "mistral".
COLUMNS_FOR_ANSWER = {
    "openai": [
        ["abstract_snippets", "title_snippets"],
    ],
    "gpt": [
        ["abstract_snippets_gpt", "title_snippets_gpt"],
    ],
    "mistral": [
        ["abstract_snippets_mistral", "title_snippets_mistral"],
    ],
}

# NOTE(review): IN_DIR and processed_files are not set in this cell; they are
# whatever an earlier cell left behind. `choice` must also be one of the
# COLUMNS_FOR_ANSWER keys -- confirm all three before running.
for file in tqdm(os.listdir(IN_DIR)):
    if file not in processed_files:
        reranked = pd.read_csv(os.path.join(IN_DIR, file))

        # Question text and type are read from the first row of the file.
        question_body = reranked["question"].tolist()[0]
        question_type = reranked["questiontype"].tolist()[0]

        # Context 1: the first few non-missing texts (missing values stringify to "nan").
        usable_texts = [a for a in reranked["text"].tolist() if str(a) != "nan"]
        abstract_context = " ".join(usable_texts[:ABSTRACTS_FOR_ANSWER])

        reranked["answer_abstracts_exact"] = [
            response_exact_answer(question_body, question_type, abstract_context)
        ] * len(reranked)
        reranked["answer_abstracts_ideal"] = [
            response_ideal_answer(question_body, question_type, abstract_context)
        ] * len(reranked)

        # Context 2: all abstract snippets followed by all title snippets,
        # concatenated without a separator (as before).
        abstract_column = COLUMNS_FOR_ANSWER[choice][0][0]
        title_column = COLUMNS_FOR_ANSWER[choice][0][1]
        snippet_context = "".join(
            flat_list(reranked[abstract_column].tolist())
            + flat_list(reranked[title_column].tolist())
        )

        reranked["answer_snippets_exact"] = [
            response_exact_answer(question_body, question_type, snippet_context)
        ] * len(reranked)
        reranked["answer_snippets_ideal"] = [
            response_ideal_answer(question_body, question_type, snippet_context)
        ] * len(reranked)

        reranked.to_csv(f"temp/answers/openai/{choice}/{file}", index=False)

 95%|█████████▌| 81/85 [00:29<00:02,  1.93it/s]

Trying another way


100%|██████████| 85/85 [01:03<00:00,  1.33it/s]


### Prepare submission


#### For PyTerrier


##### Answers based on snippets


In [40]:
def return_exact_answer(row, ABSTRACTS_OR_SNIPPETS):
    """Pick the value to report as the exact answer for one question row.

    Args:
        row: Row exposing ``questiontype`` as an attribute and the
            ``answer_<kind>_exact`` / ``answer_<kind>_ideal`` values by key.
        ABSTRACTS_OR_SNIPPETS: The ``<kind>`` part of the column names.

    Returns:
        For "summary" questions the first element of the ideal answer, for
        "yesno" questions the first element of the exact answer, and for any
        other type the exact answer unchanged.
    """
    question_type = row.questiontype
    if question_type == "summary":
        return row[f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"][0]

    exact_value = row[f"answer_{ABSTRACTS_OR_SNIPPETS}_exact"]
    return exact_value[0] if question_type == "yesno" else exact_value


def return_snippets(reranked):
    """Convert every row of ``reranked`` into a snippet dict.

    Args:
        reranked: Object whose ``iterrows()`` yields ``(index, row)`` pairs;
            each row must provide the ``url``, ``text`` and ``snippet_*`` keys
            read below.

    Returns:
        A list with one dict per row, in row order, using the keys
        ``document``, ``text``, ``offsetInBeginSection``,
        ``offsetInEndSection``, ``beginSection`` and ``endSection``.
    """
    return [
        {
            "document": row["url"],
            "text": row["text"],
            "offsetInBeginSection": row["snippet_offset_in_begin_section"],
            "offsetInEndSection": row["snippet_offset_in_end_section"],
            "beginSection": row["snippet_begin_section"],
            "endSection": row["snippet_end_section"],
        }
        for _, row in reranked.iterrows()
    ]


def fix_list(l):
    """Normalise an answer list into at most five unique one-element lists.

    Each element of ``l`` may be a string or a list. Strings are wrapped in a
    one-element list, lists are split into one one-element list per entry, and
    elements of any other type are ignored. Entries are de-duplicated while
    keeping first-seen order.

    Args:
        l: Iterable of strings and/or lists.

    Returns:
        A list of one-element lists, truncated to the first five entries.
    """
    l_out = []
    for item in l:
        if isinstance(item, list):
            candidates = [[subitem] for subitem in item]
        elif isinstance(item, str):
            candidates = [[item]]
        else:
            candidates = []

        for candidate in candidates:
            # Compare the wrapped entry itself, so a one-element list that was
            # already collected is recognised as a duplicate.
            if candidate not in l_out:
                l_out.append(candidate)

    return l_out[:5]

In [42]:
IN_DIR1 = "temp/batch_2/answers/openai/pyterrier_snippets/"
IN_DIR2 = "temp/batch_2/reranked/pyterrier/"

# Selects the answer_<kind>_exact / answer_<kind>_ideal columns to read.
ABSTRACTS_OR_SNIPPETS = "snippets"

json_list = []

# Build one submission entry per answer file.
for file in tqdm(os.listdir(IN_DIR1)):
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    # reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))

    # The answer columns are parsed with ast.literal_eval so they can be
    # indexed as Python objects below.
    for answer_column in (
        f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal",
        f"answer_{ABSTRACTS_OR_SNIPPETS}_exact",
    ):
        reranked1[answer_column] = reranked1[answer_column].apply(ast.literal_eval)

    # Every row repeats the same question and answers, so the first row is enough.
    first_row = reranked1.iloc[0]
    exact_answer = return_exact_answer(first_row, ABSTRACTS_OR_SNIPPETS)
    if reranked1["questiontype"].tolist()[0] in ("factoid", "list"):
        exact_answer = fix_list(exact_answer)

    json_list.append(
        {
            "type": reranked1["questiontype"].tolist()[0],
            "body": reranked1["question"].tolist()[0],
            "id": reranked1["questionno"].tolist()[0],
            "ideal_answer": reranked1[f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"].tolist()[
                0
            ][0],
            "exact_answer": exact_answer,
            # "documents": reranked2["url"].tolist(),
            # "snippets": return_snippets(reranked1),
        }
    )

# Built after the loop so it is always defined for THIS cell, even when the
# input directory is empty (previously a stale json_out from another cell
# could be written, or a NameError raised).
json_out = {"questions": json_list}

with open("temp/batch_2/submission/mibi_rag_snippet.json", "w") as f:
    json.dump(json_out, f, indent=4)

100%|██████████| 85/85 [00:00<00:00, 489.74it/s]


##### Answers based on abstracts


In [41]:
IN_DIR1 = "temp/batch_2/answers/openai/pyterrier_docs/"

# Selects the answer_<kind>_exact / answer_<kind>_ideal columns to read.
ABSTRACTS_OR_SNIPPETS = "abstracts"

json_list = []

# Build one submission entry per answer file.
for file in tqdm(os.listdir(IN_DIR1)):
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    # reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))

    # The answer columns are parsed with ast.literal_eval so they can be
    # indexed as Python objects below.
    for answer_column in (
        f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal",
        f"answer_{ABSTRACTS_OR_SNIPPETS}_exact",
    ):
        reranked1[answer_column] = reranked1[answer_column].apply(ast.literal_eval)

    # Every row repeats the same question and answers, so the first row is enough.
    first_row = reranked1.iloc[0]
    exact_answer = return_exact_answer(first_row, ABSTRACTS_OR_SNIPPETS)
    if reranked1["questiontype"].tolist()[0] in ("factoid", "list"):
        exact_answer = fix_list(exact_answer)

    json_list.append(
        {
            "type": reranked1["questiontype"].tolist()[0],
            "body": reranked1["question"].tolist()[0],
            "id": reranked1["questionno"].tolist()[0],
            "ideal_answer": reranked1[f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"].tolist()[
                0
            ][0],
            "exact_answer": exact_answer,
            # "documents": reranked1["url"].tolist(),
            # "snippets": return_snippets(reranked1),
        }
    )

# Built after the loop so it is always defined for THIS cell, even when the
# input directory is empty (previously a stale json_out from another cell
# could be written, or a NameError raised).
json_out = {"questions": json_list}

with open("temp/batch_2/submission/mibi_rag_abstract.json", "w") as f:
    json.dump(json_out, f, indent=4)

100%|██████████| 85/85 [00:00<00:00, 542.30it/s]


In [16]:
def return_exact_answer(row, ABSTRACTS_OR_SNIPPETS):
    """Pick the value to report as the exact answer for one question row.

    Args:
        row: Row exposing ``questiontype`` as an attribute and the
            ``answer_<kind>_exact`` value by key.
        ABSTRACTS_OR_SNIPPETS: The ``<kind>`` part of the column name.

    Returns:
        The first element of the exact answer for "yesno" and "summary"
        questions; the exact answer unchanged for every other type.
    """
    question_type = row.questiontype
    exact_value = row[f"answer_{ABSTRACTS_OR_SNIPPETS}_exact"]
    if question_type in ("yesno", "summary"):
        return exact_value[0]
    return exact_value


def return_snippets(reranked, choice):
    """Build snippet dicts from the per-row snippet and offset columns.

    For every row the abstract snippets are used when at least one of them can
    be paired with its offsets; otherwise the title snippets are tried. A
    snippet whose offsets are missing or malformed is skipped.

    Args:
        reranked: Object whose ``iterrows()`` yields ``(index, row)`` pairs.
            Each row must provide ``url`` plus the ``abstract_snippets_<choice>``,
            ``offset_abstract_<choice>``, ``title_snippets_<choice>`` and
            ``offset_title_<choice>`` values.
        choice: Suffix selecting which snippet/offset columns are read.

    Returns:
        A list of dicts with the keys ``document``, ``text``,
        ``offsetInBeginSection``, ``offsetInEndSection``, ``beginSection`` and
        ``endSection``.
    """
    # Only lookup failures caused by missing or malformed offsets are
    # tolerated; anything else (including KeyboardInterrupt) must propagate.
    skippable_errors = (IndexError, KeyError, TypeError)

    output = []
    for _, row in reranked.iterrows():
        is_success = False
        if row[f"abstract_snippets_{choice}"]:
            for i, snippet in enumerate(row[f"abstract_snippets_{choice}"]):
                try:
                    d = {
                        "document": row["url"],
                        "text": snippet,
                        "offsetInBeginSection": row[f"offset_abstract_{choice}"][i][0],
                        "offsetInEndSection": row[f"offset_abstract_{choice}"][i][1],
                        "beginSection": "abstract",
                        "endSection": "abstract",
                    }
                except skippable_errors:
                    continue
                output.append(d)
                is_success = True
        # Fall back to title snippets only when no abstract snippet was usable.
        if not is_success and row[f"title_snippets_{choice}"]:
            for i, snippet in enumerate(row[f"title_snippets_{choice}"]):
                try:
                    d = {
                        "document": row["url"],
                        "text": snippet,
                        "offsetInBeginSection": row[f"offset_title_{choice}"][i][0],
                        "offsetInEndSection": row[f"offset_title_{choice}"][i][1],
                        "beginSection": "title",
                        "endSection": "title",
                    }
                except skippable_errors:
                    continue
                output.append(d)
    return output


def fix_list(l, max_items=5):
    """Normalize an exact-answer list into unique single-element lists.

    Strings are wrapped as ``[item]``, one-element lists are kept as they
    are, and longer lists are split into one single-element list per member.
    Empty lists and items of any other type are ignored. Duplicates are
    dropped, keeping the first occurrence.

    Args:
        l: Iterable of strings and/or lists.
        max_items: Maximum number of entries to return (default 5).

    Returns:
        A list of at most ``max_items`` single-element lists.
    """
    l_out = []
    for item in l:
        if isinstance(item, list):
            if len(item) == 1:
                candidates = [item]
            else:
                candidates = [[subitem] for subitem in item]
        elif isinstance(item, str):
            candidates = [[item]]
        else:
            continue

        for candidate in candidates:
            # Compare the candidate itself: the earlier check wrapped an
            # already-wrapped one-element list again, so it never matched and
            # duplicates were kept.
            if candidate not in l_out:
                l_out.append(candidate)

    return l_out[:max_items]

### Answers based on snippets


In [21]:
IN_DIR1 = f"temp/answers/openai/{choice}"
IN_DIR2 = "temp/snippets/gpt"
IN_DIR3 = "temp/snippets/mistral"

ABSTRACTS_OR_SNIPPETS = "snippets"

# Columns that are stored in the CSV files as Python-literal strings and have
# to be parsed back into Python objects before use.
literal_columns = [
    f"title_snippets_{choice}",
    f"abstract_snippets_{choice}",
    f"offset_title_{choice}",
    f"offset_abstract_{choice}",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_exact",
]

json_list = []

# sorted(): os.listdir returns names in arbitrary order; sorting makes the
# order of questions in the submission reproducible.
for file in tqdm(sorted(os.listdir(IN_DIR1))):
    # The same file name is read from the answers and the mistral snippet dirs.
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    # reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))
    reranked3 = pd.read_csv(os.path.join(IN_DIR3, file))
    # reranked_merged = pd.concat([reranked1, reranked2, reranked3], axis=1)
    reranked_merged = pd.concat([reranked1, reranked3], axis=1)
    # Columns present in both frames are kept only once (first occurrence).
    reranked_merged = reranked_merged.loc[:, ~reranked_merged.columns.duplicated()]
    for column in literal_columns:
        reranked_merged[column] = reranked_merged[column].apply(ast.literal_eval)

    question_type = reranked_merged["questiontype"].tolist()[0]
    exact_answer = return_exact_answer(reranked_merged.iloc[0], ABSTRACTS_OR_SNIPPETS)
    if question_type in ("factoid", "list"):
        exact_answer = fix_list(exact_answer)

    # Question-level fields are read from the first row of the merged frame.
    json_list.append(
        {
            "type": question_type,
            "body": reranked_merged["question"].tolist()[0],
            "id": reranked_merged["questionno"].tolist()[0],
            "ideal_answer": reranked_merged[
                f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"
            ].tolist()[0][0],
            "exact_answer": exact_answer,
            "documents": reranked_merged["url"].tolist(),
            # Marked "not good" by the original author: the snippet list is
            # simply cut to its first 10 entries.
            "snippets": return_snippets(reranked_merged, choice)[:10],
        }
    )

# Built once after the loop so it is always defined and never left over from
# an earlier cell, even when IN_DIR1 contains no files.
json_out = {"questions": json_list}

100%|██████████| 85/85 [00:00<00:00, 243.02it/s]


In [23]:
# Store the snippet-based submission as pretty-printed JSON.
with open("temp/submission/system_1.json", "w") as submission_file:
    json.dump(json_out, submission_file, indent=4)

### Answers based on abstracts


In [24]:
IN_DIR1 = f"temp/answers/openai/{choice}"
IN_DIR2 = "temp/snippets/gpt"
IN_DIR3 = "temp/snippets/mistral"

ABSTRACTS_OR_SNIPPETS = "abstracts"

# Columns that are stored in the CSV files as Python-literal strings and have
# to be parsed back into Python objects before use.
literal_columns = [
    f"title_snippets_{choice}",
    f"abstract_snippets_{choice}",
    f"offset_title_{choice}",
    f"offset_abstract_{choice}",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal",
    f"answer_{ABSTRACTS_OR_SNIPPETS}_exact",
]

json_list = []

# sorted(): os.listdir returns names in arbitrary order; sorting makes the
# order of questions in the submission reproducible.
for file in tqdm(sorted(os.listdir(IN_DIR1))):
    # The same file name is read from the answers and the mistral snippet dirs.
    reranked1 = pd.read_csv(os.path.join(IN_DIR1, file))
    # reranked2 = pd.read_csv(os.path.join(IN_DIR2, file))
    reranked3 = pd.read_csv(os.path.join(IN_DIR3, file))
    # reranked_merged = pd.concat([reranked1, reranked2, reranked3], axis=1)
    reranked_merged = pd.concat([reranked1, reranked3], axis=1)
    # Columns present in both frames are kept only once (first occurrence).
    reranked_merged = reranked_merged.loc[:, ~reranked_merged.columns.duplicated()]
    for column in literal_columns:
        reranked_merged[column] = reranked_merged[column].apply(ast.literal_eval)

    question_type = reranked_merged["questiontype"].tolist()[0]
    exact_answer = return_exact_answer(reranked_merged.iloc[0], ABSTRACTS_OR_SNIPPETS)
    if question_type in ("factoid", "list"):
        exact_answer = fix_list(exact_answer)

    # Question-level fields are read from the first row of the merged frame.
    json_list.append(
        {
            "type": question_type,
            "body": reranked_merged["question"].tolist()[0],
            "id": reranked_merged["questionno"].tolist()[0],
            "ideal_answer": reranked_merged[
                f"answer_{ABSTRACTS_OR_SNIPPETS}_ideal"
            ].tolist()[0][0],
            "exact_answer": exact_answer,
            "documents": reranked_merged["url"].tolist(),
            # Marked "not good" by the original author: the snippet list is
            # simply cut to its first 10 entries.
            "snippets": return_snippets(reranked_merged, choice)[:10],
        }
    )

# Built once after the loop so it is always defined and never left over from
# an earlier cell, even when IN_DIR1 contains no files.
json_out = {"questions": json_list}

100%|██████████| 85/85 [00:00<00:00, 214.13it/s]


In [25]:
# Store the abstract-based submission as pretty-printed JSON.
with open("temp/submission/system_2.json", "w") as submission_file:
    json.dump(json_out, submission_file, indent=4)

# Phase B


In [2]:
TEST_FILE = "temp/test_set/BioASQ-task12bPhaseB-testset1"

# Read the Phase B test set and keep only the list stored under "questions".
with open(TEST_FILE, "r") as test_file:
    data = json.load(test_file)["questions"]

# Quick sanity check: keys of the first question and the number of questions.
print(data[0].keys())
print(len(data))

dict_keys(['documents', 'snippets', 'id', 'type', 'body'])
85


In [3]:
def get_snippets_from_df(row):
    """Return the ``"text"`` value of every snippet dict in ``row.snippets``."""
    texts = []
    for snippet in row.snippets:
        texts.append(snippet["text"])
    return texts


def fix_list(l, max_items=5):
    """Normalize an exact-answer list into unique single-element lists.

    Strings are wrapped as ``[item]``, one-element lists are kept as they
    are, and longer lists are split into one single-element list per member.
    Empty lists and items of any other type are ignored. Duplicates are
    dropped, keeping the first occurrence.

    Args:
        l: Iterable of strings and/or lists.
        max_items: Maximum number of entries to return (default 5).

    Returns:
        A list of at most ``max_items`` single-element lists.
    """
    l_out = []
    for item in l:
        if isinstance(item, list):
            if len(item) == 1:
                candidates = [item]
            else:
                candidates = [[subitem] for subitem in item]
        elif isinstance(item, str):
            candidates = [[item]]
        else:
            continue

        for candidate in candidates:
            # Compare the candidate itself: the earlier check wrapped an
            # already-wrapped one-element list again, so it never matched and
            # duplicates were kept.
            if candidate not in l_out:
                l_out.append(candidate)

    return l_out[:max_items]

### Answers based on snippets


In [4]:
# For every Phase B question: answer it from the provided snippets and store
# the result as one CSV file per question.
for d in tqdm(data):
    df = pd.json_normalize(d)
    df["snippets_extracted"] = df.apply(get_snippets_from_df, axis=1)
    df = df.rename(
        columns={"body": "question", "id": "questionno", "type": "questiontype"}
    )

    question_text = df["question"].values[0]
    question_type = df["questiontype"].values[0]
    # All snippet texts of the first row joined into a single context string.
    snippet_context = " ".join(df.snippets_extracted.tolist()[0])
    row_count = len(df)

    exact_response = response_exact_answer(question_text, question_type, snippet_context)
    df["answer_snippets_exact"] = [exact_response] * row_count

    ideal_response = response_ideal_answer(question_text, question_type, snippet_context)
    df["answer_snippets_ideal"] = [ideal_response] * row_count

    df.to_csv(
        f"temp/phase_b/answers/openai/snippets/{df.questionno.values[0]}.csv",
        index=False,
    )

  4%|▎         | 3/85 [00:08<04:01,  2.94s/it]

Trying another way


 25%|██▍       | 21/85 [01:06<02:26,  2.29s/it]

Trying another way


 32%|███▏      | 27/85 [01:27<03:11,  3.31s/it]

Trying another way


 74%|███████▍  | 63/85 [03:15<00:58,  2.68s/it]Incomplete output detected, should increase max_tokens


Trying another way (exact answer)


 82%|████████▏ | 70/85 [04:03<00:55,  3.68s/it]

Trying another way


100%|██████████| 85/85 [04:47<00:00,  3.39s/it]


### Prepare submission


In [24]:
json_list = []

IN_DIR = "temp/phase_b/answers/openai/snippets/"

# sorted(): os.listdir returns names in arbitrary order; sorting makes the
# order of questions in the submission reproducible.
for file in sorted(os.listdir(IN_DIR)):
    df = pd.read_csv(os.path.join(IN_DIR, file))
    # Answers were written to CSV as Python-literal strings; parse them back.
    df["answer_snippets_exact"] = df["answer_snippets_exact"].apply(ast.literal_eval)
    df["answer_snippets_ideal"] = df["answer_snippets_ideal"].apply(ast.literal_eval)

    question_type = df["questiontype"].values[0]
    entry = {
        "type": question_type,
        "body": df["question"].values[0],
        "id": df["questionno"].values[0],
        "ideal_answer": df["answer_snippets_ideal"].values[0][0],
    }

    # "summary" entries carry no "exact_answer" key; all other types do.
    if question_type != "summary":
        if question_type in ("factoid", "list"):
            exact_answer = fix_list(df["answer_snippets_exact"].values[0])
        else:
            exact_answer = df["answer_snippets_exact"].values[0][0]
        entry["exact_answer"] = exact_answer

    # Added last so the key order matches the previous output files.
    entry["documents"] = df["documents"].apply(ast.literal_eval).tolist()[0]
    entry["snippets"] = df["snippets"].apply(ast.literal_eval).tolist()[0]
    json_list.append(entry)

# Built once after the loop so it is always defined and never left over from
# an earlier cell, even when IN_DIR contains no files.
json_out = {"questions": json_list}

In [25]:
# Store the Phase B snippet-based submission as pretty-printed JSON.
with open("temp/phase_b/submission/system_1.json", "w") as submission_file:
    json.dump(json_out, submission_file, indent=4)

In [7]:
# for d in json_list:
#     try:
#         print(d["exact_answer"])
#     except:
#         print("NO ANSWER")
#     print(d["ideal_answer"])
#     print()

### Answers based on abstracts


In [9]:
# Number of documents kept after sorting by the cross-encoder score.
RERANK_CROSS = 25
# Number of documents kept after sorting by the subsequent bi-encoder score.
RERANK_BI = 10

In [10]:
from get_pubmed_documents import get_title_abstract

# Number of top-ranked abstracts that are joined into the answering context.
ABSTRACTS_FOR_ANSWER = 3

# For every Phase B question: fetch the listed documents, rerank them, answer
# from the best abstracts and store the result as one CSV file per question.
for d in tqdm(data):
    df = pd.json_normalize(d)
    df["snippets_extracted"] = df.apply(get_snippets_from_df, axis=1)
    df = df.rename(
        columns={"body": "question", "id": "questionno", "type": "questiontype"}
    )

    question_text = df["question"].values[0]
    question_type = df["questiontype"].values[0]

    # get_title_abstract is assumed to return a (title, abstract) pair per
    # URL -- TODO confirm. Indexing each pair (instead of transposing with
    # zip) also works when the question lists no documents.
    titles_abstracts = [get_title_abstract(url) for url in df["documents"].tolist()[0]]
    titles = [title_abstract[0] for title_abstract in titles_abstracts]
    abstracts = [title_abstract[1] for title_abstract in titles_abstracts]

    question = Question(
        id=df["questionno"].values[0],
        type=question_type,
        body=question_text,
    )

    # Stage 1: score with the cross-encoder and keep the top RERANK_CROSS rows.
    reranked = pd.DataFrame({"title": titles, "text": abstracts})
    reranked["cos_sim"] = rerank_crossencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_CROSS)
    reranked = reranked.drop("cos_sim", axis=1)

    # Stage 2: score the survivors with the bi-encoder and keep the top RERANK_BI.
    reranked["cos_sim"] = rerank_biencoder(question, reranked)
    reranked = reranked.sort_values("cos_sim", ascending=False).head(RERANK_BI)

    # Both answers are generated from the same context, so build it once.
    abstract_context = " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER])

    df["answer_abstracts_exact"] = [
        response_exact_answer(question_text, question_type, abstract_context)
    ] * len(df)
    df["answer_abstracts_ideal"] = [
        response_ideal_answer(question_text, question_type, abstract_context)
    ] * len(df)

    df.to_csv(
        f"temp/phase_b/answers/openai/abstracts/{df.questionno.values[0]}.csv",
        index=False,
    )

  4%|▎         | 3/85 [00:35<15:35, 11.40s/it]

Trying another way


 14%|█▍        | 12/85 [02:41<17:33, 14.43s/it]

Trying another way


 32%|███▏      | 27/85 [05:56<12:55, 13.37s/it]

Trying another way


 38%|███▊      | 32/85 [07:07<12:39, 14.33s/it]

Trying another way


 82%|████████▏ | 70/85 [15:42<02:55, 11.71s/it]

Trying another way


 92%|█████████▏| 78/85 [17:39<01:53, 16.17s/it]

Trying another way


 98%|█████████▊| 83/85 [18:56<00:31, 15.51s/it]

Trying another way


100%|██████████| 85/85 [19:28<00:00, 13.75s/it]


### Prepare submission


In [26]:
json_list = []

IN_DIR = "temp/phase_b/answers/openai/abstracts/"

# sorted(): os.listdir returns names in arbitrary order; sorting makes the
# order of questions in the submission reproducible.
for file in sorted(os.listdir(IN_DIR)):
    df = pd.read_csv(os.path.join(IN_DIR, file))
    # Answers were written to CSV as Python-literal strings; parse them back.
    df["answer_abstracts_exact"] = df["answer_abstracts_exact"].apply(ast.literal_eval)
    df["answer_abstracts_ideal"] = df["answer_abstracts_ideal"].apply(ast.literal_eval)

    question_type = df["questiontype"].values[0]
    entry = {
        "type": question_type,
        "body": df["question"].values[0],
        "id": df["questionno"].values[0],
        "ideal_answer": df["answer_abstracts_ideal"].values[0][0],
    }

    # "summary" entries carry no "exact_answer" key; all other types do.
    if question_type != "summary":
        if question_type in ("factoid", "list"):
            exact_answer = fix_list(df["answer_abstracts_exact"].values[0])
        else:
            exact_answer = df["answer_abstracts_exact"].values[0][0]
        entry["exact_answer"] = exact_answer

    # Added last so the key order matches the previous output files.
    entry["documents"] = df["documents"].apply(ast.literal_eval).tolist()[0]
    entry["snippets"] = df["snippets"].apply(ast.literal_eval).tolist()[0]
    json_list.append(entry)

# Built once after the loop so it is always defined and never left over from
# an earlier cell, even when IN_DIR contains no files.
json_out = {"questions": json_list}

In [27]:
# Store the Phase B abstract-based submission as pretty-printed JSON.
with open("temp/phase_b/submission/system_2.json", "w") as submission_file:
    json.dump(json_out, submission_file, indent=4)

In [14]:
# for d in json_list:
#     print(d["body"])
#     try:
#         print(d["exact_answer"])
#     except:
#         print("NO ANSWER")
#     print(d["ideal_answer"])
#     print()

In [6]:
# reranked["snippets_blablador_gpt"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="gpt-3.5-turbo"
#     ),
#     axis=1,
# )
# reranked["snippets_blablador_mistral"] = reranked.apply(
#     lambda x: get_snippets_blablador(
#         question.body, x["title"], x["text"], model="Mistral-7B-Instruct-v0.2"
#     ),
#     axis=1,
# )
# reranked["title_snippets_gpt"] = [x[0] for x in reranked["snippets_blablador_gpt"]]
# reranked["abstract_snippets_gpt"] = [
#     x[1] for x in reranked["snippets_blablador_gpt"]
# ]
# reranked["title_snippets_mistral"] = [
#     x[0] for x in reranked["snippets_blablador_mistral"]
# ]
# reranked["abstract_snippets_mistral"] = [
#     x[1] for x in reranked["snippets_blablador_mistral"]
# ]

# reranked["offset_title_gpt"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_gpt"], x["text"]), axis=1
# )
# reranked["offset_abstract_gpt"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_gpt"], x["text"]), axis=1
# )

# reranked["offset_title_mistral"] = reranked.apply(
#     lambda x: get_offset(x["title_snippets_mistral"], x["text"]), axis=1
# )
# reranked["offset_abstract_mistral"] = reranked.apply(
#     lambda x: get_offset(x["abstract_snippets_mistral"], x["text"]), axis=1
# )


# reranked["answer_abstracts_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_abstracts_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_snippets_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)
# reranked["answer_snippets_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)