<a href="https://colab.research.google.com/github/vvarss/Medical_Chatbot/blob/main/Medical_Chatbot_using_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load the google drive


In [10]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install langchain sentence-transformers chromadb llama-cpp-python langchain_community pypdf



In [3]:
import langchain
print(langchain.__version__)


1.1.0


In [11]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings       #for generating sentence embedings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import LlamaCpp
#from langchain.chains import RetrievalQA




In [14]:
import pkgutil
import inspect

def find_class_in_package(package_name, class_name):
    found_paths = []
    try:
        package = __import__(package_name)
        # Iterate through submodules of the package
        for importer, modname, ispkg in pkgutil.walk_packages(package.__path__, package.__name__ + '.'):
            try:
                # Try to import the module and check for the class
                module = __import__(modname, fromlist=[class_name])
                if hasattr(module, class_name):
                    obj = getattr(module, class_name)
                    if inspect.isclass(obj):
                        found_paths.append(modname)
            except (ImportError, AttributeError):
                # Continue if module cannot be imported or class not found in it
                continue
    except ImportError:
        print(f"Package '{package_name}' not found.")
        pass
    return found_paths

# Search in langchain
langchain_paths = find_class_in_package('langchain', 'RetrievalQA')
print(f"RetrievalQA found in 'langchain' at: {langchain_paths}")

# Search in langchain_community
langchain_community_paths = find_class_in_package('langchain_community', 'RetrievalQA')
print(f"RetrievalQA found in 'langchain_community' at: {langchain_community_paths}")

RetrievalQA found in 'langchain' at: []




RetrievalQA found in 'langchain_community' at: []


In [12]:
loader = PyPDFDirectoryLoader("/content/drive/MyDrive/Data")      # No need for entire path
docs = loader.load()

In [15]:
len(docs)

95

In [16]:
docs[6]  #content in page 6

Document(metadata={'producer': 'Acrobat Distiller 6.0.1 for Macintosh', 'creator': 'QuarkXPress(tm) 6.5', 'creationdate': '2006-02-16T11:30:29-05:00', 'subject': 'Heart disease', 'author': 'NHLBI', 'keywords': 'heart disease, prevention, risk factors, chd, coronary artery disease, corornary heart disease, cad', 'moddate': '2006-02-23T09:58:15-05:00', 'title': 'Your Guide to A Healthy Heart', 'source': '/content/drive/MyDrive/Data/healthyheart.pdf', 'total_pages': 95, 'page': 6, 'page_label': '7'}, page_content='2\nThese facts may seem frightening, but they need not be. The good\nnews is that you have a lot of power to protect and improve your\nheart health. This guidebook will help you find out your own risk\nof heart disease and take steps to prevent it.\n“But,” you may still be thinking, “I take pretty good care of myself.\nI’m unlikely to get heart disease.” Yet a recent national survey shows\nthat only 3 percent of U.S. adults practice all of the “Big Four”\nhabits that help to pre

## Chunking


In [17]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300,chunk_overlap=50)  # each chunk into 300 characters
chunks = text_splitter.split_documents(docs)

In [18]:
len(chunks)

585

In [19]:
chunks[16]

Document(metadata={'producer': 'Acrobat Distiller 6.0.1 for Macintosh', 'creator': 'QuarkXPress(tm) 6.5', 'creationdate': '2006-02-16T11:30:29-05:00', 'subject': 'Heart disease', 'author': 'NHLBI', 'keywords': 'heart disease, prevention, risk factors, chd, coronary artery disease, corornary heart disease, cad', 'moddate': '2006-02-23T09:58:15-05:00', 'title': 'Your Guide to A Healthy Heart', 'source': '/content/drive/MyDrive/Data/healthyheart.pdf', 'total_pages': 95, 'page': 3, 'page_label': '4'}, page_content='Stress . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 36\nAlcohol . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 36')

In [20]:
chunks[17]

Document(metadata={'producer': 'Acrobat Distiller 6.0.1 for Macintosh', 'creator': 'QuarkXPress(tm) 6.5', 'creationdate': '2006-02-16T11:30:29-05:00', 'subject': 'Heart disease', 'author': 'NHLBI', 'keywords': 'heart disease, prevention, risk factors, chd, coronary artery disease, corornary heart disease, cad', 'moddate': '2006-02-23T09:58:15-05:00', 'title': 'Your Guide to A Healthy Heart', 'source': '/content/drive/MyDrive/Data/healthyheart.pdf', 'total_pages': 95, 'page': 3, 'page_label': '4'}, page_content='Sleep Apnea . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 39\nMenopausal Hormone Therapy . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 39')

## Embedding Creations

In [21]:
import os
os.environ['HUGGINGFACEHUB_API_TOKEN']="YOUR_API_KEY"

In [22]:
embeddings = SentenceTransformerEmbeddings(model_name="NeuML/pubmedbert-base-embeddings")

  embeddings = SentenceTransformerEmbeddings(model_name="NeuML/pubmedbert-base-embeddings")


## Vector store creation

In [24]:
vectorstore = Chroma.from_documents(chunks,embeddings)

In [25]:
query = "Who is at risk of heart disease?"
search_results = vectorstore.similarity_search(query)
search_results

[Document(metadata={'title': 'Your Guide to A Healthy Heart', 'page_label': '9', 'producer': 'Acrobat Distiller 6.0.1 for Macintosh', 'moddate': '2006-02-23T09:58:15-05:00', 'total_pages': 95, 'creator': 'QuarkXPress(tm) 6.5', 'keywords': 'heart disease, prevention, risk factors, chd, coronary artery disease, corornary heart disease, cad', 'author': 'NHLBI', 'creationdate': '2006-02-16T11:30:29-05:00', 'source': '/content/drive/MyDrive/Data/healthyheart.pdf', 'subject': 'Heart disease', 'page': 8}, page_content='4\nWho Is at Risk?\nRisk factors are conditions or habits that make a person more likely\nto develop a disease. They can also increase the chances that an\nexisting disease will get worse. Important risk factors for heart dis-\nease that you can do something about are cigarette smoking, high'),
 Document(metadata={'page_label': '9', 'subject': 'Heart disease', 'creator': 'QuarkXPress(tm) 6.5', 'total_pages': 95, 'page': 8, 'producer': 'Acrobat Distiller 6.0.1 for Macintosh', 'm

In [26]:
retriever = vectorstore.as_retriever(search_kwags={'k':5})
                   # Can use a retriever instead of search as well. in form of dictionary, retrieves top 4 documents.

In [27]:
retriever.invoke(query)

[Document(metadata={'author': 'NHLBI', 'keywords': 'heart disease, prevention, risk factors, chd, coronary artery disease, corornary heart disease, cad', 'subject': 'Heart disease', 'page_label': '9', 'title': 'Your Guide to A Healthy Heart', 'moddate': '2006-02-23T09:58:15-05:00', 'source': '/content/drive/MyDrive/Data/healthyheart.pdf', 'creationdate': '2006-02-16T11:30:29-05:00', 'total_pages': 95, 'creator': 'QuarkXPress(tm) 6.5', 'page': 8, 'producer': 'Acrobat Distiller 6.0.1 for Macintosh'}, page_content='4\nWho Is at Risk?\nRisk factors are conditions or habits that make a person more likely\nto develop a disease. They can also increase the chances that an\nexisting disease will get worse. Important risk factors for heart dis-\nease that you can do something about are cigarette smoking, high'),
 Document(metadata={'page_label': '9', 'keywords': 'heart disease, prevention, risk factors, chd, coronary artery disease, corornary heart disease, cad', 'creationdate': '2006-02-16T11:3

## LLM Model Loading

In [None]:
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("BioMistral/BioMistral-7B")
model = AutoModel.from_pretrained("BioMistral/BioMistral-7B")


In [3]:
from huggingface_hub import hf_hub_download

model_file = hf_hub_download(
    repo_id="BioMistral/BioMistral-7B-GGUF",
    filename="ggml-model-Q4_K_M.gguf"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


ggml-model-Q4_K_M.gguf:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

In [4]:
from huggingface_hub import list_repo_files

repo_id = "BioMistral/BioMistral-7B-GGUF"
files = list_repo_files(repo_id=repo_id)
print(f"Files in {repo_id}:")
for file in files:
    print(file)

Files in BioMistral/BioMistral-7B-GGUF:
.gitattributes
README.md
config.json
ggml-model-Q2_K.gguf
ggml-model-Q3_K_L.gguf
ggml-model-Q3_K_M.gguf
ggml-model-Q3_K_S.gguf
ggml-model-Q4_0.gguf
ggml-model-Q4_K_M.gguf
ggml-model-Q4_K_S.gguf
ggml-model-Q5_K_M.gguf
ggml-model-Q5_K_S.gguf
ggml-model-Q6_K.gguf
ggml-model-Q8_0.gguf


In [31]:
from langchain_community.llms import LlamaCpp

llm = LlamaCpp(model_path =model_file,temperature=0.2,n_ctx=4096,max_tokens=512,top_p=1)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /root/.cache/huggingface/hub/models--BioMistral--BioMistral-7B-GGUF/snapshots/de8c2dfcead24fd23ccb33f6ca5ff015e9ecdb4b/ggml-model-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = models
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension

## Using LLM,Retirver and query to generate final response

In [6]:
template = """
<|context|>
You are a medical assistant, that follows the instructions and generate the accurate response based on the query and the context provided.
Please be truthful and give direct answers.
</s>
<|user|>
{query}
</s>
<|assistant|>
"""


In [7]:

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate


In [8]:
prompt = ChatPromptTemplate.from_template(template)

In [33]:


from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

prompt = ChatPromptTemplate.from_template("""
Use the context below to answer the question:

{context}

Question: {question}
""")

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [34]:
response = rag_chain.invoke(query)

llama_perf_context_print:        load time =  475204.32 ms
llama_perf_context_print: prompt eval time =  475203.94 ms /  1115 tokens (  426.19 ms per token,     2.35 tokens per second)
llama_perf_context_print:        eval time =   13889.44 ms /    18 runs   (  771.64 ms per token,     1.30 tokens per second)
llama_perf_context_print:       total time =  489118.48 ms /  1133 tokens
llama_perf_context_print:    graphs reused =        120


In [35]:
response

'Answer: Anyone who has one or more risk factors is at risk of heart disease.'

In [None]:
import sys
while True:
  user_input = input(f"Input query: ")
  if user_input == "exit":
    sys.exit()
  if user_input == "":
    continue
  result = rag_chain.invoke(user_input)
  print("Answer:",result)


Llama.generate: 21 prefix-match hit, remaining 1136 prompt tokens to eval
llama_perf_context_print:        load time =  475204.32 ms
llama_perf_context_print: prompt eval time =  488496.14 ms /  1136 tokens (  430.01 ms per token,     2.33 tokens per second)
llama_perf_context_print:        eval time =   31448.06 ms /    40 runs   (  786.20 ms per token,     1.27 tokens per second)
llama_perf_context_print:       total time =  520002.75 ms /  1176 tokens
llama_perf_context_print:    graphs reused =        143


Answer: Answer: people who smoke, have diabetes or high risk score of more than 20 percent, two or more risk factors, menopause, middle age, family history of early heart disease


In [None]:
!pip install nbstripout
!nbstripout your_notebook.ipynb
