<a href="https://colab.research.google.com/github/vaibhav34777/medical-rag-chatbot/blob/main/demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qq requests feedparser chromadb pypdf torch langchain_community langchain langchain_core langchain_google_genai sentence-transformers

In [None]:
import os
import requests
import feedparser
from urllib.parse import quote
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.load import dumps, loads
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import warnings
import logging
import re

### Loading Data and Storing it in VectorStore

In [None]:
# Keep notebook output readable: only ERROR-level pypdf log records are
# emitted and Python warnings are ignored.
logging.getLogger("pypdf").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# arXiv category filter: the categories below are OR-ed into one search query.
MEDICAL_QUERY = " OR ".join(
    f"cat:{category}"
    for category in (
        "q-bio.TO",
        "q-bio.CB",
        "q-bio.NC",
        "q-bio.BM",
        "physics.med-ph",
    )
)
MAX_RESULTS = 10  # how many papers the ingest step asks arXiv for
DATA_DIR = "data"  # folder the downloaded PDFs are stored in and read from

def fetch_arxiv_papers(query, max_results=10):
    """Query the arXiv export API and list the most recently submitted matches.

    Args:
        query: arXiv ``search_query`` expression; it is percent-encoded before
            being placed in the request URL.
        max_results: Value sent as the ``max_results`` URL parameter.

    Returns:
        One ``{"title": ..., "pdf_url": ...}`` dict per feed entry, in feed
        order. ``pdf_url`` is ``None`` when an entry has no link whose type is
        ``application/pdf``.
    """

    def summarize(entry):
        # An entry carries several links; keep the first one typed as a PDF.
        pdf_href = None
        for link in entry.links:
            if link.type == "application/pdf":
                pdf_href = link.href
                break
        return {"title": entry.title, "pdf_url": pdf_href}

    search = quote(query)
    api_url = f"http://export.arxiv.org/api/query?search_query={search}&sortBy=submittedDate&sortOrder=descending&max_results={max_results}"
    return [summarize(entry) for entry in feedparser.parse(api_url).entries]

def download_pdf(url, filename):
    """Download ``url`` and save the response body to ``filename``.

    Args:
        url: Address of the file to fetch.
        filename: Destination path. It is created/overwritten only when the
            server answers with HTTP 200.

    Returns:
        ``True`` if the file was written, ``False`` if the server answered
        with any other status (nothing is written in that case), so callers
        can tell a failed download from a successful one.

    Exceptions raised by ``requests.get`` (the request uses a 15 second
    timeout) are not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=15)
    if response.status_code != 200:
        return False
    with open(filename, "wb") as f:
        f.write(response.content)
    return True

def load_and_chunk_pdfs(data_dir):
    """Load every ``.pdf`` in ``data_dir`` and split its pages into text chunks.

    Each page is split independently with a recursive character splitter
    (chunk size 1500, overlap 300), so a chunk never spans two pages.

    Args:
        data_dir: Directory to scan (non-recursively) for ``.pdf`` files.

    Returns:
        A list of ``Document`` objects. Each one carries ``source`` (the PDF
        file name) and, when the loader reports it, ``page`` in its metadata
        so that answers can cite where a chunk came from.
    """
    all_chunks = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
    # os.listdir order is arbitrary; sorting keeps the chunk order reproducible.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith(".pdf"):
            continue
        loader = PyPDFLoader(os.path.join(data_dir, filename))
        for page in loader.load():
            metadata = {"source": filename}
            # NOTE(review): PyPDFLoader is expected to put the page index under
            # "page" -- confirm against the installed version. The key is
            # omitted when absent so no None value reaches the vector store.
            page_number = page.metadata.get("page")
            if page_number is not None:
                metadata["page"] = page_number
            for chunk in text_splitter.split_text(page.page_content):
                # Copy the dict so chunks never share one mutable metadata object.
                all_chunks.append(Document(page_content=chunk, metadata=dict(metadata)))
    return all_chunks

if __name__ == "__main__":
    # Ingest pipeline: list papers -> download PDFs -> chunk -> embed into Chroma.
    os.makedirs(DATA_DIR, exist_ok=True)

    papers = fetch_arxiv_papers(MEDICAL_QUERY, MAX_RESULTS)

    for i, paper in enumerate(papers, start=1):
        # Entries without a PDF link have nothing to download.
        if not paper['pdf_url']:
            continue
        # File name: two-digit index + title reduced to alphanumerics/underscores
        # (spaces become underscores), truncated to 50 characters.
        title_clean = "".join(c for c in paper['title'] if c.isalnum() or c in (" ", "_")).replace(" ", "_")[:50]
        filename = os.path.join(DATA_DIR, f"{i:02d}_{title_clean}.pdf")
        try:
            download_pdf(paper['pdf_url'], filename)
        except requests.RequestException as exc:
            # One unreachable paper should not abort the whole ingest run.
            print(f"Skipping {paper['pdf_url']}: {exc}")

    chunks = load_and_chunk_pdfs(DATA_DIR)

    if chunks:
        embedding_fn = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding_fn, persist_directory="./chroma_db")
    else:
        # Without chunks no vector store is created, so later cells that use
        # `vectorstore` cannot run; say so here instead of failing silently.
        print(f"No PDF chunks found in '{DATA_DIR}'; the vector store was not created.")

### Loading the retriever and LLM

In [None]:
def load_llm():
    """Build the Gemini chat model used by the rest of the notebook.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable so
    that no credential is stored in the notebook. Any key that was previously
    committed in this file should be treated as leaked and revoked.

    Returns:
        A ``ChatGoogleGenerativeAI`` instance configured with temperature 0.3.

    Raises:
        RuntimeError: If ``GOOGLE_API_KEY`` is unset or empty.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GOOGLE_API_KEY is not set; export it (or set os.environ['GOOGLE_API_KEY']) "
            "before calling load_llm()."
        )
    # NOTE(review): this is a dated preview model id -- confirm it is still served.
    return ChatGoogleGenerativeAI(
        model="gemini-2.5-flash-preview-05-20",
        google_api_key=api_key,
        temperature=0.3,
    )
# Retriever over the Chroma store created by the ingest cell, configured with
# k=6 results per query. `vectorstore` only exists if that cell produced chunks.
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})
# Single chat model shared by query generation, answering and summarisation.
llm = load_llm()

### Generating Multiple Queries from a single user query

In [None]:
def generate_multi_queries(question):
    """Ask the LLM for alternative phrasings of ``question``.

    The prompt requests five newline-separated rewrites. The model's reply is
    split on newlines; each line is stripped and blank lines are dropped, so
    callers receive only usable query strings.

    Args:
        question: The user's original question.

    Returns:
        A list of non-empty query strings (possibly empty if the model
        returned no text).
    """
    template = """You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant medical research documents.
    By generating multiple perspectives on the user question, your goal is to help
    overcome limitations of distance-based similarity search.
    Provide these alternative questions separated by newlines. Original question: {question}"""

    prompt = ChatPromptTemplate.from_template(template)
    generate_queries = (
        prompt
        | llm
        | StrOutputParser()
        # Models often put blank lines between alternatives; discard them here.
        | (lambda text: [line.strip() for line in text.split("\n") if line.strip()])
    )
    return generate_queries.invoke({"question": question})

### Multi Query Retrieval

In [None]:
def get_unique_union(documents):
    """Flatten per-query result lists into one list without duplicates.

    Documents are compared by their serialized form. The first occurrence of
    each document is kept and first-seen order is preserved, so the retriever's
    ranking survives de-duplication. A plain ``set`` would return the documents
    in arbitrary order, which matters because only the leading documents are
    later used as answer context.

    Args:
        documents: Iterable of lists of documents (one list per query).

    Returns:
        A list of unique documents in first-seen order; ``[]`` for empty input.
    """
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while keeping insertion order.
    return [loads(doc) for doc in dict.fromkeys(serialized)]

def retrieve_documents(question):
    """Retrieve documents for ``question`` using LLM-generated query variants.

    Each non-blank generated query is sent to the retriever and the result
    lists are merged with duplicates removed. If the LLM produced no usable
    query, the original question is searched instead so the caller is not left
    with an empty context.

    Args:
        question: The user's question.

    Returns:
        A list of unique retrieved documents.
    """
    cleaned = (query.strip() for query in generate_multi_queries(question))
    # `invoke` is the current retriever entry point; `get_relevant_documents`
    # is deprecated in recent LangChain releases.
    all_docs = [retriever.invoke(query) for query in cleaned if query]
    if not all_docs:
        all_docs.append(retriever.invoke(question))
    return get_unique_union(all_docs)

### Updating Conversation Summary Memory

In [None]:
# Per-session conversation state held in a plain module-level dict (nothing is
# persisted). Keys are session ids; values are {"messages": [...], "summary": "..."}.
chat_histories = {}
def update_conversation_summary(session_id, new_question, new_answer):
    """Fold the latest question/answer pair into the session's running summary.

    A session record is created on first use. When the session already has a
    summary, the LLM is asked to merge it with the new exchange; otherwise it is
    asked to summarise the exchange on its own. The model's reply replaces the
    stored summary.

    Args:
        session_id: Key of the session in ``chat_histories``.
        new_question: The user's latest question.
        new_answer: The assistant's answer to that question.

    Returns:
        None. ``chat_histories[session_id]["summary"]`` is updated in place.
    """
    record = chat_histories.setdefault(session_id, {"messages": [], "summary": ""})
    previous_summary = record["summary"]

    if not previous_summary:
        # First exchange of the session: nothing to merge with yet.
        summary_prompt = f"""Summarize this conversation exchange:

        User: {new_question}
        Assistant: {new_answer}

        Generate a concise summary:"""
    else:
        summary_prompt = f"""Given the existing conversation summary and the new exchange,
        generate a new summary of the conversation. Maintain as much relevant information as possible.

        Existing summary:
        {previous_summary}

        New exchange:
        User: {new_question}
        Assistant: {new_answer}

        Generate the updated summary:"""

    record["summary"] = llm.invoke(summary_prompt).content


### Generating the answer with all the context and user query

In [None]:
def generate_answer(question, docs, session_id):
    """Answer ``question`` from retrieved documents plus the session summary.

    Only the first five documents are concatenated into the prompt context.
    The session record is created if it does not exist yet; when its summary is
    empty the prompt receives "No previous conversation." instead.

    Args:
        question: The user's question.
        docs: Retrieved documents, each exposing ``page_content``.
        session_id: Key of the session in ``chat_histories``.

    Returns:
        A tuple ``(answer, docs)``: the model's text answer and the full,
        unmodified ``docs`` list that was passed in.
    """
    session = chat_histories.setdefault(session_id, {"messages": [], "summary": ""})

    # Blank-line separated text of the leading documents.
    context = "\n\n".join(doc.page_content for doc in docs[:5])

    template = """You are a medical research assistant. Answer the question based on the provided research context and conversation history.

Conversation Summary:
{summary}

Research Context:
{context}

Question: {question}

Provide a detailed, accurate answer based on the research papers. If the answer is not in the context, say so."""

    chain = ChatPromptTemplate.from_template(template) | llm | StrOutputParser()
    inputs = {
        "summary": session["summary"] or "No previous conversation.",
        "context": context,
        "question": question,
    }
    return chain.invoke(inputs), docs

### Final RAG Pipeline

In [None]:
def rag_pipeline(question, session_id="demo_user"):
    """Run one full question/answer turn and print the results.

    Steps: retrieve documents (query variants are generated inside
    ``retrieve_documents``, so they are not generated a second time here),
    generate the answer, print the answer and up to three source previews,
    then update and print the session's conversation summary.

    Args:
        question: The user's question.
        session_id: Conversation to attach this turn to.

    Returns:
        A tuple ``(answer, source_docs)`` as returned by ``generate_answer``.
    """
    docs = retrieve_documents(question)
    answer, source_docs = generate_answer(question, docs, session_id)

    print(f"\nAnswer:\n{answer}")

    print("Source Documents:")

    # Show a short preview of the first three sources only.
    for i, doc in enumerate(source_docs[:3], 1):
        print(f"\nSource {i}:")
        print(f"  File: {doc.metadata.get('source', 'Unknown')}")
        print(f"  Page: {doc.metadata.get('page', 'Unknown')}")
        print(f"  Content Preview: {doc.page_content[:300]}...")
        print("-" * 80)

    print("\n Updating conversation summary...")
    update_conversation_summary(session_id, question, answer)

    if session_id in chat_histories:
        print(f"\nConversation Summary:\n{chat_histories[session_id]['summary']}")

    return answer, source_docs


## Demo Question Answering

In [None]:
# First turn of session "user456": runs retrieval, answering and summary update.
question3 = "What are the two primary strategies explored in this study to optimize Motor Imagery-based Brain-Computer Interface (MI-BCI) rehabilitation protocols for stroke patients, and how does the experimental evidence for each strategy demonstrate an improvement in BCI classification performance compared to conventional approaches?"
answer3, docs3 = rag_pipeline(question3, session_id="user456")


Answer:
The study explored two primary strategies to optimize Motor Imagery-based Brain-Computer Interface (MI-BCI) rehabilitation protocols for stroke patients:

1.  **Task Design:** The study proposed an "affected hand movement versus rest" motor imagery paradigm as an alternative to the conventional left-versus-right motor imagery. This approach aimed to simplify the task and address the weak contralateral activation often observed in stroke patients.
    *   **Experimental Evidence:** The experimental results demonstrated that the "MI vs. rest" approach generally showed better classification performance compared to the "left vs. right" approach. Specifically, the MI vs. rest condition exhibited superior classification performance. Paired sample t-tests confirmed that the differences were statistically significant (p < 0.05) for both L:Rest and R:Rest compared to L:R. EEGNet showed significant improvements in both healthy and stroke patient groups, while FBCSP was significant in al

### Generated Answer
The study explored two primary strategies to optimize Motor Imagery-based Brain-Computer Interface (MI-BCI) rehabilitation protocols for stroke patients:

1.  **Task Design:** The study proposed an "affected hand movement versus rest" motor imagery paradigm as an alternative to the conventional left-versus-right motor imagery. This approach aimed to simplify the task and address the weak contralateral activation often observed in stroke patients.
    *   **Experimental Evidence:** The experimental results demonstrated that the "MI vs. rest" approach generally showed better classification performance compared to the "left vs. right" approach. Specifically, the MI vs. rest condition exhibited superior classification performance. Paired sample t-tests confirmed that the differences were statistically significant (p < 0.05) for both L:Rest and R:Rest compared to L:R. EEGNet showed significant improvements in both healthy and stroke patient groups, while FBCSP was significant in all conditions except R:Rest in the healthy group. These findings indicate that this proposed task design enhances MI classification in stroke patients.

2.  **Training Protocol (Session Duration):** The study investigated the impact of session duration on BCI performance.
    *   **Experimental Evidence:** The findings revealed that shorter training sessions produced better BCI performance than longer sessions. Furthermore, the research suggested that using data from multiple short training sessions or implementing adaptive training protocols could further enhance MI classification performance compared to using the entire dataset at once or single, longer sessions.
Source Documents:

### Expected Answer
The study explored two primary strategies to optimize MI-BCI protocols for stroke rehabilitation: optimizing the task design and reducing the training duration.

Optimized Task Design: Affected-Hand MI versus Rest
The conventional task for upper-limb rehabilitation is the left versus right-hand motor imagery (L:R) paradigm, which often underperforms in stroke patients due to the loss of lateralization and the presence of compensatory ipsilateral activation.


The proposed alternative is the affected-hand movement versus rest (MI:Rest) paradigm.



Mechanism: This simpler approach requires the patient to perform only one type of motor imagery, aiming to improve BCI performance by focusing on a less complex discrimination task.


Empirical Evidence: The MI versus rest condition showed superior classification performance compared to the left versus right condition across all tested stroke patient groups and most classifiers. For example, using the FBCSP algorithm, the mean accuracy significantly improved for stroke patients with:




Left-Hand Paralysis (LHP): From 50.7% (L:R) to 64.3% (L:Rest).



Right-Hand Paralysis (RHP): From 53.1% (L:R) to 63.3% (R:Rest).


These differences were statistically significant (p<0.05).


Optimized Training Duration: Shorter Sessions
The second strategy involved investigating the impact of reducing the length of training sessions.


Mechanism: This strategy suggests that data variability in longer sessions may hinder MI performance, and that shorter, more focused sessions can lead to better results.


Empirical Evidence: The results showed that using smaller subsets of data (shorter sessions) generally produced higher accuracy than using the entire dataset at once.


This finding suggests that multiple short training sessions or adaptive training protocols that adjust the classification models with new samples may enhance MI classification performance during rehabilitation.

### Follow-up Question

In [None]:
# Second turn reuses session "user456", so the summary stored for that session
# is included in the answer prompt. Note that question3/answer3/docs3 are overwritten.
question3 = "what was the conventional paradigm that was replaced, and for which specific group of stroke patients (LHP or RHP) did the FBCSP classifier show the highest mean accuracy using the proposed MI versus rest paradigm?"
answer3, docs3 = rag_pipeline(question3, session_id="user456")


Answer:
The conventional paradigm that was replaced was the **left-versus-right motor imagery** approach.

For the FBCSP classifier using the proposed MI versus rest paradigm, the highest mean accuracy was shown by the **Stroke Subjects Left Hand Affected (LHP)** group, with a mean accuracy of 65.8% in the R:Rest condition.
Source Documents:

Source 1:
  File: 05_Optimizing_BCI_Rehabilitation_Protocols_for_Stroke.pdf
  Page: Unknown
  Content Preview: data improves MI performance compared to using the entire
dataset at once. This suggests that multiple short training
sessions or adaptive training protocols may enhance MI clas-
sification. Overall, these findings highlight the importance of
both task design and training protocol in optimizing BCI-...
--------------------------------------------------------------------------------

Source 2:
  File: 05_Optimizing_BCI_Rehabilitation_Protocols_for_Stroke.pdf
  Page: Unknown
  Content Preview: movement is right-hand MI, while for LHP patie

### Generated Answer
The conventional paradigm that was replaced was the **left-versus-right motor imagery** approach.

For the FBCSP classifier using the proposed MI versus rest paradigm, the highest mean accuracy was shown by the **Stroke Subjects Left Hand Affected (LHP)** group, with a mean accuracy of 65.8% in the R:Rest condition


### Expected Answer
The conventional paradigm that was replaced by the affected-hand MI versus rest approach was the left versus right-hand motor imagery (L:R) paradigm.




The FBCSP classifier showed the highest mean accuracy using the proposed MI versus rest paradigm for the Stroke Subjects Left Hand Affected (LHP) group.



The specific results are:


LHP (Left-Hand Paralysis): Mean FBCSP accuracy for L:Rest was 64.3%.




RHP (Right-Hand Paralysis): Mean FBCSP accuracy for R:Rest was 63.3%.