In [1]:
import ast
import json
import os
import pandas as pd
from tqdm import tqdm
from mibi.modules import DocumentsModule, Question
from test_utils import (
    get_snippets,
    PubMedApiRetrieve,
    rerank_biencoder,
    rerank_crossencoder,
    get_snippets,
    get_offset,
    response_exact_answer,
    response_ideal_answer,
    flat_list,
    get_snippets_blablador,
    retrieve_bm25,
    response_exact_answer_mistral,
    response_ideal_answer_mistral,
)

### Blablador Mistral API for snippets


In [20]:
IN_DIR = "temp/reranked/"

for file in tqdm(os.listdir(IN_DIR)):
    reranked = pd.read_csv(os.path.join(IN_DIR, file))
    reranked["snippets_blablador_mistral"] = reranked.apply(
        lambda x: get_snippets_blablador(
            x["question"], x["title"], x["text"], model="Mistral-7B-Instruct-v0.2"
        ),
        axis=1,
    )

    reranked["title_snippets_mistral"] = [
        x[0] for x in reranked["snippets_blablador_mistral"]
    ]
    reranked["abstract_snippets_mistral"] = [
        x[1] for x in reranked["snippets_blablador_mistral"]
    ]

    reranked["offset_title_mistral"] = reranked.apply(
        lambda x: get_offset(x["title_snippets_mistral"], x["text"]), axis=1
    )
    reranked["offset_abstract_mistral"] = reranked.apply(
        lambda x: get_offset(x["abstract_snippets_mistral"], x["text"]), axis=1
    )

    reranked.to_csv(f"temp/snippets/mistral/{file}", index=False)

100%|██████████| 5/5 [05:23<00:00, 64.65s/it]


In [2]:
IN_DIR = "temp/snippets/mistral/"

for file in tqdm(os.listdir(IN_DIR)):
    if "54e25eaaae9738404b000017" in file:
        reranked = pd.read_csv(os.path.join(IN_DIR, file))

        reranked["answer_snippets_exact_mistral"] = [
            response_exact_answer_mistral(
                reranked["question"].tolist()[0],
                reranked["questiontype"].tolist()[0],
                "".join(
                    reranked["abstract_snippets_mistral"].tolist()
                    + reranked["title_snippets_mistral"]
                ),
                model="Mistral-7B-Instruct-v0.2",
            )
        ] * len(reranked)

        reranked["answer_snippets_ideal_mistral"] = [
            response_ideal_answer_mistral(
                reranked["question"].tolist()[0],
                reranked["questiontype"].tolist()[0],
                "".join(
                    flat_list(reranked["abstract_snippets_mistral"].tolist())
                    + flat_list(reranked["title_snippets_mistral"])
                ),
                model="Mistral-7B-Instruct-v0.2",
            )
        ] * len(reranked)

#

100%|██████████| 5/5 [00:01<00:00,  2.97it/s]


In [3]:
reranked

Unnamed: 0,docno,score,rank,title,text,url,bm25,cos_sim,question,questionno,questiontype,snippets_blablador_mistral,title_snippets_mistral,abstract_snippets_mistral,offset_title_mistral,offset_abstract_mistral,answer_snippets_exact_mistral,answer_snippets_ideal_mistral
0,15094122,0,15,"Papilin, a novel component of basement membran...","Papilins are homologous, secreted extracellula...",https://pubmed.ncbi.nlm.nih.gov/15094122/,0.0,0.628675,Is the protein Papilin secreted?,54e25eaaae9738404b000017,yesno,"(['Papilin, a novel component of basement memb...","['Papilin, a novel component of basement membr...","['They occur widely, from nematodes to man,"" ""...",[()],[()],Yes.,"Yes, papilin is a secreted extracellular matr..."
1,11076767,0,18,Papilin in development; a pericellular protein...,Papilin is an extracellular matrix glycoprotei...,https://pubmed.ncbi.nlm.nih.gov/11076767/,0.0,0.587269,Is the protein Papilin secreted?,54e25eaaae9738404b000017,yesno,(['Papilin in development; a pericellular prot...,['Papilin in development; a pericellular prote...,['Determination of its cDNA sequence led to th...,[()],"[(273, 375), (376, 556), (685, 784), (785, 901...",Yes.,"Yes, papilin is a secreted extracellular matr..."
2,12666201,0,17,Alternative splicing of papilin and the divers...,Papilins are extracellular matrix proteins tha...,https://pubmed.ncbi.nlm.nih.gov/12666201/,0.0,0.515122,Is the protein Papilin secreted?,54e25eaaae9738404b000017,yesno,(['Alternative splicing of papilin and the div...,['Alternative splicing of papilin and the dive...,['Papilins are extracellular matrix proteins t...,[()],"[(0, 109), (286, 441), (442, 507), (508, 768)]",Yes.,"Yes, papilin is a secreted extracellular matr..."
3,38284126,0,1,Secreted ADAMTS-like proteins as regulators of...,The extracellular matrix (ECM) determines func...,https://pubmed.ncbi.nlm.nih.gov/38284126/,0.0,0.411736,Is the protein Papilin secreted?,54e25eaaae9738404b000017,yesno,"([], ['The secreted a disintegrin and metallop...",[],['The secreted a disintegrin and metalloprotei...,[],"[(), (584, 671)]",Yes.,"Yes, papilin is a secreted extracellular matr..."
4,32585132,0,5,Comprehensive Endogenous Tagging of Basement M...,Basement membranes (BMs) are supramolecular ma...,https://pubmed.ncbi.nlm.nih.gov/32585132/,0.0,0.403068,Is the protein Papilin secreted?,54e25eaaae9738404b000017,yesno,"([], ['Through photobleaching studies, we show...",[],"['Through photobleaching studies, we show that...",[],"[(477, 626)]",Yes.,"Yes, papilin is a secreted extracellular matr..."
5,20805556,0,13,Genetics of extracellular matrix remodeling du...,The organs of animal embryos are typically cov...,https://pubmed.ncbi.nlm.nih.gov/20805556/,0.0,0.397381,Is the protein Papilin secreted?,54e25eaaae9738404b000017,yesno,(['The organs of animal embryos are typically ...,['The organs of animal embryos are typically c...,['We previously described the twisting of the ...,"[(), (), (), (), (725, 843)]","[(), (), (), (725, 843), (1116, 1415)]",Yes.,"Yes, papilin is a secreted extracellular matr..."
6,15094110,0,16,The thrombospondin type 1 repeat superfamily.,The TSR superfamily is a diverse family of ext...,https://pubmed.ncbi.nlm.nih.gov/15094110/,0.0,0.364708,Is the protein Papilin secreted?,54e25eaaae9738404b000017,yesno,(['The thrombospondin type 1 repeat superfamil...,['The thrombospondin type 1 repeat superfamily.'],['Many of which have functions related to regu...,[()],[()],Yes.,"Yes, papilin is a secreted extracellular matr..."
7,34240827,0,3,Primary angle closure glaucoma is characterize...,To characterize the proteome of the iris in pr...,https://pubmed.ncbi.nlm.nih.gov/34240827/,0.0,0.360284,Is the protein Papilin secreted?,54e25eaaae9738404b000017,yesno,(['Primary angle closure glaucoma is character...,['Primary angle closure glaucoma is characteri...,['The main outcome was proteins with a log2 fo...,[()],"[(), ()]",Yes.,"Yes, papilin is a secreted extracellular matr..."
8,19297413,0,14,C. elegans mig-6 encodes papilin isoforms that...,The gonad arms of C. elegans hermaphrodites ac...,https://pubmed.ncbi.nlm.nih.gov/19297413/,0.0,0.333149,Is the protein Papilin secreted?,54e25eaaae9738404b000017,yesno,(['C. elegans mig-6 encodes papilin isoforms t...,['C. elegans mig-6 encodes papilin isoforms th...,['The gonad arms of C. elegans hermaphrodites ...,[()],"[(), (), (394, 539), (540, 713), (714, 835), (...",Yes.,"Yes, papilin is a secreted extracellular matr..."
9,7515725,0,19,"Differentiation, extracellular matrix synthesi...","Two contrasting substrates, Drosophila laminin...",https://pubmed.ncbi.nlm.nih.gov/7515725/,0.0,0.330635,Is the protein Papilin secreted?,54e25eaaae9738404b000017,yesno,"([], ['Collagen IV, laminin, glutactin, papili...",[],"['Collagen IV, laminin, glutactin, papilin, an...",[],"[(732, 881)]",Yes.,"Yes, papilin is a secreted extracellular matr..."


In [None]:
reranked

In [6]:
# reranked["answer_abstracts_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_abstracts_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(reranked["text"].tolist()[:ABSTRACTS_FOR_ANSWER]),
#     )
# ] * len(reranked)
# reranked["answer_snippets_exact"] = [
#     response_exact_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)
# reranked["answer_snippets_ideal"] = [
#     response_ideal_answer(
#         question.body,
#         question.type,
#         " ".join(
#             flat_list(reranked["abstract_snippets"].tolist())
#             + flat_list(reranked["title_snippets"])
#         ),
#     )
# ] * len(reranked)