# mediRAG Pipeline

In [1]:
import os
import torch
from prompts import *
import evaluate
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.schema.runnable import RunnablePassthrough

## Load Model and Tokenizer

In [2]:
model_name='mistralai/Mistral-7B-Instruct-v0.1'

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="models")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    cache_dir="models"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Load Data

In [3]:
dataset = load_dataset("bigbio/pubmed_qa", cache_dir="data")

dataset

DatasetDict({
    train: Dataset({
        features: ['QUESTION', 'CONTEXTS', 'LABELS', 'MESHES', 'YEAR', 'reasoning_required_pred', 'reasoning_free_pred', 'final_decision', 'LONG_ANSWER'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['QUESTION', 'CONTEXTS', 'LABELS', 'MESHES', 'YEAR', 'reasoning_required_pred', 'reasoning_free_pred', 'final_decision', 'LONG_ANSWER'],
        num_rows: 11269
    })
})

In [4]:
page_content_column = "CONTEXTS"

def preprocess(dataset):
    for split in dataset.keys():
        for contexts in dataset[split][page_content_column]:
            for sentence in contexts:
                yield Document(page_content=sentence)

data = list(preprocess(dataset))  # 655055

data[0]

Document(page_content='In previous work we (Fisher et al., 2011) examined the emergence of neurobehavioral disinhibition (ND) in adolescents with prenatal substance exposure. We computed ND factor scores at three age points (8/9, 11 and 13/14 years) and found that both prenatal substance exposure and early adversity predicted ND. The purpose of the current study was to determine the association between these ND scores and initiation of substance use between ages 8 and 16 in this cohort as early initiation of substance use has been related to later substance use disorders. Our hypothesis was that prenatal cocaine exposure predisposes the child to ND, which, in turn, is associated with initiation of substance use by age 16.')

## Setting up FAISS

In [5]:
embedding_model = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device':'cuda'}
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model,   
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs, 
    cache_folder="models"
)

if os.path.exists("faiss_index_pubmed"):
    db = FAISS.load_local("faiss_index_pubmed", embeddings)
else:
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
    docs = text_splitter.split_documents(data)  # 676307

    db = FAISS.from_documents(docs, embeddings)
    db.save_local("faiss_index_pubmed")

In [6]:
question = dataset['train']["QUESTION"][0]
context = dataset['train']["CONTEXTS"][0]

retrieved_docs = db.similarity_search(question)  # db.similarity_search_with_score(question)

print(f"Question:\n{question}")
print(f"\nContext:\n{context}")
print(f"\nRetrieved document:\n{retrieved_docs[0].page_content}")

Question:
Does neurobehavioral disinhibition predict initiation of substance use in children with prenatal cocaine exposure?

Context:
['In previous work we (Fisher et al., 2011) examined the emergence of neurobehavioral disinhibition (ND) in adolescents with prenatal substance exposure. We computed ND factor scores at three age points (8/9, 11 and 13/14 years) and found that both prenatal substance exposure and early adversity predicted ND. The purpose of the current study was to determine the association between these ND scores and initiation of substance use between ages 8 and 16 in this cohort as early initiation of substance use has been related to later substance use disorders. Our hypothesis was that prenatal cocaine exposure predisposes the child to ND, which, in turn, is associated with initiation of substance use by age 16.', "We studied 386 cocaine exposed and 517 unexposed children followed since birth in a longitudinal study. Five dichotomous variables were computed based 

## Initializing Pipeline


In [7]:
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    max_new_tokens=300,
    do_sample=False,
)

mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Create prompt from prompt template 
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=PROMPT_TEMPLATE_QA_EXPLAINER,
)

# Create llm chain 
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

## Running Queries

In [8]:
question = dataset['train'][0]["QUESTION"]
context = dataset['train'][0]["CONTEXTS"]
long_answer = dataset['train'][0]["LONG_ANSWER"]
final_decision = dataset['train'][0]["final_decision"]

In [9]:
# QA without retrieval 
input_ids = tokenizer.encode(question, return_tensors="pt").to("cuda")

with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        do_sample=False,
        return_dict_in_generate=True,
        max_new_tokens=300,
    )

output = tokenizer.decode(output.sequences[0][len(input_ids[0]):])

output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'\n\nAbstract:\n\nBackground: Neurobehavioral disinhibition (NBD) is a neurodevelopmental disorder characterized by impulsivity, hyperactivity, and poor inhibitory control. It has been linked to prenatal cocaine exposure (PCE) and has been proposed as a risk factor for substance use initiation. However, there is limited research on whether NBD predicts substance use initiation in children with PCE.\n\nMethods: This study examined whether NBD predicted substance use initiation in a sample of children with PCE. Participants were recruited from a longitudinal study of children born to cocaine-using mothers. NBD was assessed using the Behavior Rating Inventory of Executive Function (BRIEF) at age 5 and substance use was assessed using self-report questionnaires at age 12. Logistic regression analyses were conducted to examine the relationship between NBD and substance use initiation.\n\nResults: The sample included 100 children with PCE and 100 children without PCE. NBD was significantly h

In [10]:
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 3}
)

# retriever = db.as_retriever(search_type="similarity_score_threshold", 
#                                  search_kwargs={"score_threshold": .5, 
#                                                 "k": top_k})

rag_chain = ({"context": retriever, "question": RunnablePassthrough()} | llm_chain)

# QA with retrieval
qa_retrieval_result = rag_chain.invoke(question)

qa_retrieval_result

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'context': [Document(page_content='In previous work we (Fisher et al., 2011) examined the emergence of neurobehavioral disinhibition (ND) in adolescents with prenatal substance exposure. We computed ND factor scores at three age points (8/9, 11 and 13/14 years) and found that both prenatal substance exposure and early adversity predicted ND. The purpose of the current study was to determine the association between these ND scores and initiation of substance use between ages 8 and 16 in this cohort as early initiation of substance use has been related to later substance use disorders. Our hypothesis was that prenatal cocaine exposure predisposes the child to ND, which, in turn, is associated with initiation of substance use by age 16.'),
  Document(page_content='The authors derived an index of neurobehavioral disinhibition from measures of affect, behavior, and cognition. The neurobehavioral disinhibition score was used to discriminate youth at high and low average risk for substance u

# Evaluation

In [11]:
bleu = evaluate.load("bleu", cache_dir="evaluation_metrics")

bleu_score = bleu.compute(predictions=[output], references=[long_answer])
print(f"BLEU Score: {bleu_score}")

bleu_score = bleu.compute(predictions=[qa_retrieval_result["text"]], references=[long_answer])
print(f"BLEU Score: {bleu_score}")

BLEU Score: {'bleu': 0.0150853715603116, 'precisions': [0.05741626794258373, 0.019230769230769232, 0.00966183574879227, 0.0048543689320388345], 'brevity_penalty': 1.0, 'length_ratio': 9.08695652173913, 'translation_length': 209, 'reference_length': 23}
BLEU Score: {'bleu': 0.01746600805465613, 'precisions': [0.07853403141361257, 0.021052631578947368, 0.010582010582010581, 0.005319148936170213], 'brevity_penalty': 1.0, 'length_ratio': 8.304347826086957, 'translation_length': 191, 'reference_length': 23}


In [12]:
bertscore = evaluate.load("bertscore", cache_dir="evaluation_metrics")

bert_score = bertscore.compute(predictions=[output], references=[long_answer], lang="en")
print(f"BERTScore: {bert_score}")

bert_score = bertscore.compute(predictions=[qa_retrieval_result["text"]], references=[long_answer], lang="en")
print(f"BERTScore: {bert_score}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: {'precision': [0.7908798456192017], 'recall': [0.853207528591156], 'f1': [0.8208622932434082], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.35.1)'}
BERTScore: {'precision': [0.8336621522903442], 'recall': [0.8714827299118042], 'f1': [0.8521530032157898], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.35.1)'}
