In [1]:
import ast
import json
import os
import pandas as pd
from tqdm import tqdm
from mibi.modules import DocumentsModule, Question
from test_utils import (
    get_snippets,
    PubMedApiRetrieve,
    rerank_biencoder,
    rerank_crossencoder,
    get_snippets,
    get_offset,
    response_exact_answer,
    response_ideal_answer,
    flat_list,
    get_snippets_blablador,
    retrieve_bm25,
    response_exact_answer_mistral,
    response_ideal_answer_mistral,
)

### Blablador Mistral API for snippets


In [3]:
# Extract Mistral snippets for every reranked result file in IN_DIR and write
# the enriched frame (snippets + offsets) to OUT_DIR under the same file name.
IN_DIR = "temp/batch_2/reranked/pyterrier"
OUT_DIR = "temp/batch_2/snippets/mistral"
SNIPPET_MODEL = "Mistral-7B-Instruct-v0.2"

# Create the target directory up front: DataFrame.to_csv does not create
# missing parent directories, so a missing folder would otherwise only fail
# after all the remote snippet calls for the first file had already been made.
os.makedirs(OUT_DIR, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for reproducible runs.
for file in tqdm(sorted(os.listdir(IN_DIR))):
    reranked = pd.read_csv(os.path.join(IN_DIR, file))

    # One get_snippets_blablador call per row (question, title, text).
    # NOTE(review): the other snippet cell in this notebook passes
    # x["abstract"] as the third argument instead of x["text"] — confirm which
    # column is the intended input.
    reranked["snippets_blablador_mistral"] = reranked.apply(
        lambda x: get_snippets_blablador(
            x["question"], x["title"], x["text"], model=SNIPPET_MODEL
        ),
        axis=1,
    )

    # Each per-row result is indexed positionally: [0] is stored as the title
    # snippets, [1] as the abstract snippets.
    reranked["title_snippets_mistral"] = [
        x[0] for x in reranked["snippets_blablador_mistral"]
    ]
    reranked["abstract_snippets_mistral"] = [
        x[1] for x in reranked["snippets_blablador_mistral"]
    ]

    # NOTE(review): title-snippet offsets are computed against x["text"], not
    # x["title"] — confirm this is what get_offset expects.
    reranked["offset_title_mistral"] = reranked.apply(
        lambda x: get_offset(x["title_snippets_mistral"], x["text"]), axis=1
    )
    reranked["offset_abstract_mistral"] = reranked.apply(
        lambda x: get_offset(x["abstract_snippets_mistral"], x["text"]), axis=1
    )

    reranked.to_csv(os.path.join(OUT_DIR, file), index=False)

  0%|          | 0/5 [03:04<?, ?it/s]


InternalServerError: <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>502 Proxy Error</title>
</head><body>
<h1>Proxy Error</h1>
<p>The proxy server received an invalid
response from an upstream server.<br />
The proxy server could not handle the request<p>Reason: <strong>Error reading from remote server</strong></p></p>
</body></html>

### PyTerrier


In [2]:
# Extract Mistral snippets for every PyTerrier-reranked result file in IN_DIR
# and write the enriched frame (snippets + offsets) to OUT_DIR under the same
# file name.
IN_DIR = "temp/batch_2/reranked/pyterrier"
OUT_DIR = "temp/batch_2/snippets/mistral"
SNIPPET_MODEL = "Mistral-7B-Instruct-v0.2"

# Create the target directory up front: DataFrame.to_csv does not create
# missing parent directories, so a missing folder would otherwise only fail
# after all the remote snippet calls for the first file had already been made.
os.makedirs(OUT_DIR, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for reproducible runs.
for file in tqdm(sorted(os.listdir(IN_DIR))):
    reranked = pd.read_csv(os.path.join(IN_DIR, file))

    # One get_snippets_blablador call per row (question, title, abstract).
    # NOTE(review): the offsets below are computed against x["text"], while
    # the snippets here are extracted from x["abstract"] — confirm both columns
    # hold the same content.
    reranked["snippets_blablador_mistral"] = reranked.apply(
        lambda x: get_snippets_blablador(
            x["question"], x["title"], x["abstract"], model=SNIPPET_MODEL
        ),
        axis=1,
    )

    # Each per-row result is indexed positionally: [0] is stored as the title
    # snippets, [1] as the abstract snippets.
    reranked["title_snippets_mistral"] = [
        x[0] for x in reranked["snippets_blablador_mistral"]
    ]
    reranked["abstract_snippets_mistral"] = [
        x[1] for x in reranked["snippets_blablador_mistral"]
    ]

    # NOTE(review): title-snippet offsets are computed against x["text"], not
    # x["title"] — confirm this is what get_offset expects.
    reranked["offset_title_mistral"] = reranked.apply(
        lambda x: get_offset(x["title_snippets_mistral"], x["text"]), axis=1
    )
    reranked["offset_abstract_mistral"] = reranked.apply(
        lambda x: get_offset(x["abstract_snippets_mistral"], x["text"]), axis=1
    )

    reranked.to_csv(os.path.join(OUT_DIR, file), index=False)

  0%|          | 0/5 [03:03<?, ?it/s]


InternalServerError: <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>502 Proxy Error</title>
</head><body>
<h1>Proxy Error</h1>
<p>The proxy server received an invalid
response from an upstream server.<br />
The proxy server could not handle the request<p>Reason: <strong>Error reading from remote server</strong></p></p>
</body></html>

### Blablador Mistral API for answers (SKIP)


In [2]:
# Generate the exact and ideal Mistral answers for one selected question file,
# using the previously extracted abstract and title snippets as context.
# NOTE(review): the answers are only attached to the in-memory `reranked`
# frame; nothing in this cell writes them back to disk.
IN_DIR = "temp/snippets/mistral/"
ANSWER_MODEL = "Mistral-7B-Instruct-v0.2"
QUESTION_ID = "54e25eaaae9738404b000017"

for file in tqdm(os.listdir(IN_DIR)):
    # Only the file whose name contains the selected question id is processed.
    if QUESTION_ID not in file:
        continue

    reranked = pd.read_csv(os.path.join(IN_DIR, file))

    # Question text and type are taken from the first row of the file.
    question_text = reranked["question"].tolist()[0]
    question_type = reranked["questiontype"].tolist()[0]

    # Build the snippet context once and share it between both answer types.
    # The exact-answer call previously used `list + Series`, which pandas
    # evaluates element-wise (pairing abstract and title snippets row by row)
    # instead of concatenating; it now uses the same flat_list-based
    # concatenation as the ideal-answer call.
    # NOTE(review): the snippet columns come straight from read_csv, so each
    # cell is presumably a string here — confirm flat_list handles that.
    # NOTE(review): snippets are joined with "" (no separator) — confirm a
    # space separator is not wanted.
    snippet_context = "".join(
        flat_list(reranked["abstract_snippets_mistral"].tolist())
        + flat_list(reranked["title_snippets_mistral"])
    )

    # One answer per file, repeated on every row of the frame.
    reranked["answer_snippets_exact_mistral"] = [
        response_exact_answer_mistral(
            question_text,
            question_type,
            snippet_context,
            model=ANSWER_MODEL,
        )
    ] * len(reranked)

    reranked["answer_snippets_ideal_mistral"] = [
        response_ideal_answer_mistral(
            question_text,
            question_type,
            snippet_context,
            model=ANSWER_MODEL,
        )
    ] * len(reranked)

#

100%|██████████| 5/5 [00:01<00:00,  2.97it/s]


In [4]:
# reranked

In [6]:
# NOTE(review): this whole cell is disabled (commented out). It builds exact
# and ideal answers from the first ABSTRACTS_FOR_ANSWER abstracts and from the
# abstract/title snippets, repeating each answer on every row of `reranked`.
# It depends on `question` and `ABSTRACTS_FOR_ANSWER`, neither of which is
# defined in the visible part of this notebook, so it cannot be re-enabled
# as is.
# reranked["answer_abstracts_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_abstracts_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_snippets_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)
# reranked["answer_snippets_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)