In [12]:
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage, BaseMessage
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from typing import Optional, List, Union
from pydantic import BaseModel, Field, field_validator
from typing import List, Optional
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import PydanticOutputParser
import json
import os
import re
import glob
import json
from json import JSONEncoder
import requests, urllib.parse
from langchain_core.tools import tool


In [13]:
# Pick via provider:model string → works across providers
llm = init_chat_model(model="gemini-2.5-flash",
                      model_provider="google_genai",
                      temperature=0)

In [5]:
#global_path  = "/Users/ilboukil/Library/CloudStorage/OneDrive-SIBSwissInstituteofBioinformatics/Trainings-cb-402/ML_summer_school_code/"
global_path  = "/Users/SJp/Documents/project_local/VIB-LLM-SS/ml-summerschool-2025/topic-1_data-integration-and-llms/project/results/"

patient_id = "MM082"
# Path to your JSON file (e.g., patient PKG or classification output)
json_path = f"{global_path}/{patient_id}.json"

with open(json_path, "r") as f:
    patient_json = json.load(f)


In [7]:
class GNN_prediction_report(BaseModel):
    patient_ID: Optional[Union[str, int]]  # "555-1234", 5551234, or None
    disease_type: str = Field(description="e.g., Melanoma")
    recomended_drug_name: str = Field(description = "e.g., Pembrolizumab")

In [6]:
from json import JSONEncoder
class MyEncoder(JSONEncoder):
    def default(self, o):
        return o.__dict__


In [8]:
@tool
def get_openfda_label(ingredient):
    """
    Fetches drug purpose, indications, usages, adverse reactions, warning, and dosage and administration information from the FDA API.

    Args:
        drug_name (str): The name of the drug (e.g., "aspirin").

    Returns:
        dict: The  response drug purpose, indications, usages, adverse reactions, warning, and dosage and administration information
    """
    # Standardize ingredient name
    ingredient = urllib.parse.quote(ingredient)
    print(ingredient)
    base = "https://api.fda.gov/drug/label.json"
    q = f'openfda.substance_name:"{ingredient}"'
    r = requests.get(base, params={"search": q, "limit": 1})
    r.raise_for_status()
    res = r.json().get("results", [])
    if not res:
        q2 = f'openfda.brand_name:"{ingredient}"'
        r = requests.get(base, params={"search": q2, "limit": 1})
        r.raise_for_status()
        res = r.json().get("results", [])
        if not res:
            return None
    return res[0]

# item = get_openfda_label("ibuprofen")

# if item:
#     for key in ("purpose","indications_and_usage", "adverse_reactions", "warnings", "dosage_and_administration"):
#         if key in item:
#             print(f"\n=== {key} ===\n{item[key][0][:800]}")
# else:
#     print("Cannot find record on openFDA for that name.")

In [9]:
# bioassistant.py

import json
from typing import List, Dict
from collections import defaultdict

from gseapy import enrichr
from langchain.chat_models import init_chat_model
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import tool


# -------------------------------
# Tool: Enrichment
# -------------------------------
@tool
def enrichr_query(gene_list: List[str]):
    """Run enrichment analysis on a list of genes using gseapy (GO Biological Process)."""
    enr = enrichr(
        gene_list=gene_list,
        gene_sets='GO_Biological_Process_2021',
        organism='Human',
        outdir=None,
        cutoff=0.05
    )
    return enr.results  # DataFrame


# -------------------------------
# LLM setup
# -------------------------------
def get_llm_with_tools(model: str = "gemini-2.5-flash", provider: str = "google_genai"):
    """Initialize the chat model and bind the enrichment tool."""
    llm = init_chat_model(model=model, model_provider=provider, temperature=0.2)
    return llm.bind_tools([enrichr_query])


def get_prompt_chain(llm_with_tools):
    """Return a chain with system+human prompt bound to the LLM with tools."""
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful bioinformatics assistant. Use tools when needed."),
        ("human", "{question}")
    ])
    return prompt | llm_with_tools


# -------------------------------
# SHAP → Gene Sets → Enrichment → Summarization
# -------------------------------
def analyze_patient(patient_json: Dict, patient_id: str, chain):
    """
    Collect SHAP features per predicted class, run enrichment, and ask LLM to summarize.
    
    Parameters
    ----------
    patient_json : dict
        JSON object with structure { patient_id: { "Drugs": {...}} }
    patient_id : str
        Patient ID key in patient_json
    chain : LangChain runnable (prompt | llm_with_tools)
    
    Returns
    -------
    results_by_class : dict
        { predicted_class: { "positive": enrichment_df, "negative": enrichment_df } }
    """
    drug_keys = list(patient_json[patient_id]["Drugs"].keys())
    class_features = defaultdict(lambda: {"positive": [], "negative": []})

    # Step 1: Collect SHAP features by predicted class
    for drug in drug_keys:
        predicted_class = patient_json[patient_id]["Drugs"][drug]["Predicted_Class"]
        pos_features = patient_json[patient_id]["Drugs"][drug]['SHAP']['Top_Positive']
        neg_features = patient_json[patient_id]["Drugs"][drug]['SHAP']['Top_Negative']

        class_features[predicted_class]["positive"].extend(
            [item["Feature"].split("_", 1)[1] if "_" in item["Feature"] else item["Feature"]
             for item in pos_features]
        )
        class_features[predicted_class]["negative"].extend(
            [item["Feature"].split("_", 1)[1] if "_" in item["Feature"] else item["Feature"]
             for item in neg_features]
        )

    # Step 2: Run enrichment
    results_by_class = {}
    for cls, feats in class_features.items():
        results_by_class[cls] = {}
        results_by_class[cls]["positive"] = (
            enrichr_query({"gene_list": list(set(feats["positive"]))}) if feats["positive"] else None
        )
        results_by_class[cls]["negative"] = (
            enrichr_query({"gene_list": list(set(feats["negative"]))}) if feats["negative"] else None
        )

    # Step 3: Summarize with LLM
    summaries = {}
    for cls, res in results_by_class.items():
        question = f"Predicted class: {cls}\nSummarize functional biology or pathways of SHAP features.\n"

        if res["positive"] is not None and not res["positive"].empty:
            question += f"\nPositive SHAP features (supporting {cls}):\n{res['positive'].head(10).to_string(index=False)}\n"
        if res["negative"] is not None and not res["negative"].empty:
            question += f"\nNegative SHAP features (against {cls}):\n{res['negative'].head(10).to_string(index=False)}\n"

        ai_msg = chain.invoke({"question": question})
        summaries[cls] = ai_msg.content

    return results_by_class, summaries


# bioassistant.py (add at the bottom)

import pandas as pd
def save_patient_summary_html(patient_id: str,
                              results_by_class: dict,
                              summaries: dict,
                              out_path: str = None):
    """
    Save the enrichment results + LLM summaries into an HTML report.
    
    Parameters
    ----------
    patient_id : str
        Patient identifier
    results_by_class : dict
        Output from analyze_patient (enrichment results)
    summaries : dict
        Output from analyze_patient (LLM summaries)
    out_path : str
        File path for the HTML file (default = f"{patient_id}_summary.html")
    """
    if out_path is None:
        out_path = f"{patient_id}_summary.html"

    html_parts = [f"<h1>Patient {patient_id} – Pathway Analysis Report</h1>"]

    for cls, summary in summaries.items():
        html_parts.append(f"<h2>Predicted Class: {cls}</h2>")
        html_parts.append(f"<p><strong>LLM Summary:</strong><br>{summary}</p>")

        # Insert enrichment tables
        for direction in ["positive", "negative"]:
            df = results_by_class[cls].get(direction)
            if df is not None and not df.empty:
                html_parts.append(f"<h3>{direction.title()} SHAP Features Enrichment</h3>")
                html_parts.append(df.head(15).to_html(index=False, escape=False))
    
    html = "\n".join(html_parts)
    with open(out_path, "w") as f:
        f.write(html)
    print(f"✅ HTML report saved to {out_path}")

In [10]:
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a biomedical-AI assistant that interprets predictions from a AI-powered predictive model for clinicians.
You are given a JSON with some information regarding the patient and 2 drugs. In the JSON, you are given:
- Patient ID
- Disease type
- For each drug, you will be given:
    - The drug name
    - The predicted class. This class can be one of three options: 
        - No effect if the drug is predicted as having no effect on treating the patient disease
        - Positive response if the drug is predicted as having a positive effect on treating the patient disease
        - Adverse effects if the drug is predicted as having a negative effect on the patient disease
    - Each predicted class has an associated probability
    - Each predicted class has associated features, that are responsible for the prediction. To reflect the importance of these features on the prediction we have the SHAP values. We have the top positive SHAP values and the top negative SHAP values.

Taking into account this JSON and the information explained above, I want you as a smart biomedical-AI assistant to pick the best of the two drugs. I don't want you to talk about the accuracy of the predicition for more than one sentence.
Once you have  picked the best drug for my patient, I want you to write a small report on the chosen drug, please include both positive an negative points about the drug, make it as straigthforward as possible (a maximum of 10 bullet points in total), and targeted towards clinicians.
After that, please write a short paragraph about the features involved in the decision making process, and look in the litterature for information about the relationship between these features and the disease the patient has."""),
    ("human", "{JSON_input}")
])

In [11]:
chain = prompt | llm | StrOutputParser()
response = chain.invoke({"JSON_input": patient_json})

In [12]:
response

"For patient MM082, diagnosed with Multiple Myeloma, the AI-powered predictive model recommends the **Dexamethasone+Bortezomib+Pomalidomide (Proteasome Inhibitor regime)**. This drug combination is predicted to elicit a positive response with a probability of 88.6%, making it the most promising option compared to the other drugs which were predicted to have no effect.\n\n---\n\n### Clinical Report: Dexamethasone+Bortezomib+Pomalidomide (Proteasome Inhibitor regime) for Patient MM082\n\n*   **Drug Recommendation:** Dexamethasone+Bortezomib+Pomalidomide (Proteasome Inhibitor regime) is the recommended treatment for patient MM082.\n*   **Predicted Efficacy:** The model predicts a positive therapeutic response for Multiple Myeloma in this patient.\n*   **Confidence Level:** This positive effect is predicted with a high probability of 88.6%.\n*   **Key Positive Indicator: Prot_YY1:** Elevated levels of the transcription factor YY1 are a strong positive indicator, significantly contributing 

In [13]:
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a biomedical-AI assistant that interprets predictions from a AI-powered predictive model for clinicians.
You are given a JSON with some information regarding the patient and 2 drugs. In the JSON, you are given:
- Patient ID
- Disease type
- For each drug, you will be given:
    - The drug name
    - The predicted class. This class can be one of three options: 
        - No effect if the drug is predicted as having no effect on treating the patient disease
        - Positive response if the drug is predicted as having a positive effect on treating the patient disease
        - Adverse effects if the drug is predicted as having a negative effect on the patient disease
    - Each predicted class has an associated probability
    - Each predicted class has associated features, that are responsible for the prediction. To reflect the importance of these features on the prediction we have the SHAP values. We have the top positive SHAP values and the top negative SHAP values.

Taking into account this JSON and the information explained above, I want you as a smart biomedical-AI assistant to pick the best of the two drugs. I don't want you to talk about the accuracy of the predicition for more than one sentence.""" ), ("human", "{JSON_input}")
])

chain = prompt | llm | StrOutputParser()
response = chain.invoke({"JSON_input": patient_json})

In [14]:
parser = PydanticOutputParser(pydantic_object=GNN_prediction_report)
format_instructions = parser.get_format_instructions()
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract per schema:\n{format_instructions}"),
    ("human", "{text}"),
]).partial(format_instructions=format_instructions)

parsing_llm = prompt | llm | parser

# if `drug_text` is an AIMessage, use .content; otherwise pass the raw string
result = parsing_llm.invoke({"text": response})


In [15]:
from json import JSONEncoder
class MyEncoder(JSONEncoder):
    def default(self, o):
        return o.__dict__


In [17]:
result1 = MyEncoder().encode(result)

In [18]:
json.loads(result1)

{'patient_ID': 'MM082',
 'disease_type': 'Multiple Myeloma',
 'recomended_drug_name': 'Dexamethasone+Bortezomib+Pomalidomide_Proteasome Inhibitor regime'}

In [19]:
llm_with_tools = llm.bind_tools([get_openfda_label])

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful bioinformatics assistant. Use tools when needed."),
    ("human", "{question}")
])

chain = prompt | llm_with_tools 

user_question = (
    f"""Please tell me about each of recommended drugs: {json.loads(result1)['recomended_drug_name']} in the context of  {json.loads(result1)['disease_type']} """
)

ai_msg = chain.invoke({"question": user_question})

In [20]:
tool_calls = getattr(ai_msg, "tool_calls", [])
print("Tool calls:", tool_calls)
# Execute each tool call by name with provided arguments
tools_by_name = {enrichr_query.name: enrichr_query}

tool_results = []
for call in tool_calls:
    name = call["name"]
    args = call.get("args", {})
    if name in tools_by_name:
        result = tools_by_name[name].invoke(args)
        tool_resul

Tool calls: [{'name': 'get_openfda_label', 'args': {'ingredient': 'Dexamethasone'}, 'id': '0fd66b8b-7b67-4299-b7c0-cb713662bdcb', 'type': 'tool_call'}, {'name': 'get_openfda_label', 'args': {'ingredient': 'Bortezomib'}, 'id': '0ad19e2c-5e79-4e56-9995-438cbe76f4b5', 'type': 'tool_call'}, {'name': 'get_openfda_label', 'args': {'ingredient': 'Pomalidomide'}, 'id': 'c4268c65-f132-49ee-9dae-b06535d51d57', 'type': 'tool_call'}]


In [21]:
messages = []
messages.extend(prompt.format_messages(question=user_question))
messages.append(ai_msg)

# Attach tool outputs so the model can read and summarize them
for idx, tr in enumerate(tool_results):
    # Convert complex objects to strings or JSON for reliability
    payload_str = str(tr["output"])  # or json.dumps(...)
    messages.append(
        ToolMessage(content=payload_str, name=tr["tool"], tool_call_id=ai_msg.tool_calls[idx]["id"]) 
    )

final_answer = llm.invoke(messages)
print(final_answer.content)

This regimen, combining Dexamethasone, Bortezomib, and Pomalidomide, is a powerful triple-drug approach often used in the treatment of Multiple Myeloma, particularly in the relapsed or refractory setting. Each drug targets myeloma cells through distinct mechanisms, leading to a synergistic effect.

Here's a breakdown of each recommended drug:

### 1. Dexamethasone

*   **Class:** Corticosteroid
*   **Mechanism of Action:** Dexamethasone is a potent synthetic corticosteroid. In Multiple Myeloma, it acts in several ways:
    *   **Direct Cytotoxicity:** It can directly induce apoptosis (programmed cell death) in myeloma cells.
    *   **Immunosuppression/Anti-inflammatory:** It reduces inflammation and suppresses the immune system, which can be beneficial in managing certain symptoms or side effects.
    *   **Synergistic Effect:** It enhances the anti-myeloma activity of many other drugs, including proteasome inhibitors (like Bortezomib) and immunomodulatory drugs (like Pomalidomide).
*

Making sure of the added value of the tools

In [52]:

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful bioinformatics assistant.."),
    ("human", "{text}")
])

chain = prompt | llm | StrOutputParser()

chain.invoke({"text": user_question})


"The combination of Dexamethasone, Bortezomib, and Pomalidomide (often abbreviated as **DVP** or **PVd**) is a highly effective regimen used in the treatment of Multiple Myeloma (MM), particularly in the relapsed/refractory setting, but also sometimes in newly diagnosed patients, especially those with high-risk features.\n\nThis regimen combines three distinct classes of drugs, each targeting different pathways crucial for myeloma cell survival and proliferation, leading to synergistic anti-myeloma activity.\n\nLet's break down each drug:\n\n---\n\n### 1. Dexamethasone (Corticosteroid)\n\n*   **Drug Class:** Corticosteroid\n*   **Mechanism of Action (MoA) in Multiple Myeloma:**\n    *   **Direct Cytotoxicity:** Dexamethasone induces apoptosis (programmed cell death) in myeloma cells by binding to glucocorticoid receptors, which then translocate to the nucleus and alter gene expression. This leads to the upregulation of pro-apoptotic genes and downregulation of anti-apoptotic genes.\n  

## Functional Analysis of SHAP Features by Predicted Drug Response Class

## SHAP values interpretation

In [24]:
# notebook.ipynb
import json

# --- Setup LLM + chain ---
llm_with_tools = get_llm_with_tools()
chain = get_prompt_chain(llm_with_tools)

# --- Analyze patient ---
results, summaries = analyze_patient(patient_json, patient_id, chain)

# --- Show results ---
for cls, summary in summaries.items():
    print(f"\n### Drugs Predicted as {cls}")
    print(summary)

  enrichr_query({"gene_list": list(set(feats["positive"]))}) if feats["positive"] else None



### Drugs Predicted as no_effect
The positive SHAP features, which support a "no_effect" prediction, are primarily associated with the regulation of immune responses, specifically involving T cell proliferation and activation (genes like IL15, CD3E, ICAM3). Other notable functions include cell-cell adhesion mediated by integrins (CD3E, ADA), posttranscriptional regulation of gene expression (NUDT21, RBM4B), and protein-containing complex assembly (CUL1, SNX9, CD3E). There's also an indication of substantia nigra development (GLUD1, CNP) and cellular response to indole-3-methanol (CTNNA1).

Conversely, the negative SHAP features, which argue against a "no_effect" prediction, are strongly linked to various aspects of immune response, particularly neutrophil-mediated immunity, degranulation, and activation (genes such as CTSZ, HLA-B, HP, SRP14, SERPINB6). These features also highlight the positive regulation of T cell and leukocyte-mediated cytotoxicity and immunity (HLA-B, HLA-DRA), as 

## Patient Knowledge Graph Integration

In [6]:
patient_id = "MM082"
# Path to your JSON file (e.g., patient PKG or classification output)
KG_json_path = f"{global_path}/patient_graphs_json/{patient_id}_KG.json"

with open(KG_json_path, "r") as f:
    patient_KG_json = json.load(f)

In [15]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser

patient_id = "MM082"
KG_json_path = f"{global_path}/patient_graphs_json/{patient_id}_KG.json"

with open(KG_json_path, "r") as f:
    patient_KG_json = json.load(f)

KG_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        """You are a helpful biomedical-AI assistant that interprets patient knowledge graphs for clinicians.

TASK:
Given the PKG and the training links, for patient {patient_id}:

1. For each regimen (drug/treatment), list the training patients whose **Training_True_Class** matches {patient_id}’s **Predicted_Class** (“same effect”).
2. Summarize clinically relevant metadata for those matched patients:
   - Age
   - Sex
   - Treatment stage and prior lines
   - Isotype and light chain
   - Phenogroup
   - sFLC range
   - Key cytogenetics (del17p, t(11;14), t(4;14), 1q gain, hyperdiploidy, hypodiploidy)

FORMAT:
- Organize the output by regimen.
- Keep the summaries concise and geared toward treatment decisions.
"""
    ),
    (
        "user",
        """Here is the patient KG JSON:
{JSON_input}"""
    )
])

# Build chain
chain = KG_prompt | llm | StrOutputParser()

# Invoke chain, filling in the placeholders
KG_response = chain.invoke({
    "patient_id": patient_id,
    "JSON_input": json.dumps(patient_KG_json)
})


In [19]:
KG_response

"Here is the summary of training patients whose treatment outcomes match MM082's predicted class for each regimen:\n\n**Regimen: Lenalidomide_Corticosteroid + IMID** (MM082 Predicted Class: no_effect)\n- **Patient MM021**\n  - Age: 71, Sex: f\n  - Treatment Stage: untreated, Prior Lines: 0\n  - Isotype/Light Chain: IgG-Kappa\n  - Phenogroup: Group2\n  - sFLC range: None reported as present mg/L\n  - Key Cytogenetics: 1q gain: Present\n- **Patient MM023**\n  - Age: 69, Sex: m\n  - Treatment Stage: treatment control, Prior Lines: 1\n  - Isotype/Light Chain: LC-Kappa\n  - Phenogroup: Group1\n  - sFLC range: 712.5 mg/L\n  - Key Cytogenetics: Hyperdiploidy: Present\n- **Patient MM025**\n  - Age: 65, Sex: m\n  - Treatment Stage: treatment control, Prior Lines: 1\n  - Isotype/Light Chain: IgG-Kappa\n  - Phenogroup: Group3\n  - sFLC range: 53 mg/L\n  - Key Cytogenetics: None reported as present\n- **Patient MM026**\n  - Age: 59, Sex: m\n  - Treatment Stage: untreated, Prior Lines: 0\n  - Isoty

In [None]:
def print_regimen_summary(summary_text: str):
    """
    Takes the structured markdown-style text (from LLM) and prints 
    it with clear section separators for readability.
    """
    regimens = summary_text.split("**Regimen:")
    for block in regimens:
        block = block.strip()
        if not block:
            continue
        print("="*80)
        print("REGIMEN:", block.split("**")[0].strip())  
        print("="*80)
        # Print the rest
        lines = block.split("\n")
        for line in lines[1:]:
            print(line)
        print("\n")


In [None]:
summary_text = KG_response  
print_regimen_summary(summary_text)

REGIMEN: Here is the summary of training patients whose treatment outcomes match MM082's predicted class for each regimen:


REGIMEN: Lenalidomide_Corticosteroid + IMID
- **Patient MM021**
  - Age: 71, Sex: f
  - Treatment Stage: untreated, Prior Lines: 0
  - Isotype/Light Chain: IgG-Kappa
  - Phenogroup: Group2
  - sFLC range: None reported as present mg/L
  - Key Cytogenetics: 1q gain: Present
- **Patient MM023**
  - Age: 69, Sex: m
  - Treatment Stage: treatment control, Prior Lines: 1
  - Isotype/Light Chain: LC-Kappa
  - Phenogroup: Group1
  - sFLC range: 712.5 mg/L
  - Key Cytogenetics: Hyperdiploidy: Present
- **Patient MM025**
  - Age: 65, Sex: m
  - Treatment Stage: treatment control, Prior Lines: 1
  - Isotype/Light Chain: IgG-Kappa
  - Phenogroup: Group3
  - sFLC range: 53 mg/L
  - Key Cytogenetics: None reported as present
- **Patient MM026**
  - Age: 59, Sex: m
  - Treatment Stage: untreated, Prior Lines: 0
  - Isotype/Light Chain: IgG-Lambda
  - Phenogroup: Group1
  - sFL