In [1]:
# run this cell once
!pip install --quiet scikit-fuzzy deap xgboost shap openai pyarrow


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m920.8/920.8 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os, glob, math, random, time
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve
import xgboost as xgb

# fuzzy
import skfuzzy as fuzz

# GA
from deap import base, creator, tools, algorithms

# LLM (optional)
import os
OPENAI_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_KEY:
    import openai
    openai.api_key = OPENAI_KEY

# reproducible
RSEED = 42
random.seed(RSEED)
np.random.seed(RSEED)


In [3]:
# This searches for the files you uploaded under /kaggle/input
INPUT_ROOT = '/kaggle/input'
files = {}
expected = [
 'admissions_subset.csv','chartevents_filtered.csv','diagnoses_withstayids.csv',
 'discharge_notes_clean.csv','icustay_subset.csv','labevents_filtered.csv',
 'patients_subset.csv','radiology_notes_clean.csv'
]
for root, dirs, filenames in os.walk(INPUT_ROOT):
    for f in filenames:
        if f in expected:
            files[f] = os.path.join(root, f)

print("Found files:")
for f in expected:
    print(f, "->", files.get(f, "MISSING"))


Found files:
admissions_subset.csv -> /kaggle/input/mimic-iv/MIMIC-Subset/admissions_subset.csv
chartevents_filtered.csv -> /kaggle/input/mimic-iv/MIMIC-Subset/chartevents_filtered.csv
diagnoses_withstayids.csv -> /kaggle/input/mimic-iv/MIMIC-Subset/diagnoses_withstayids.csv
discharge_notes_clean.csv -> /kaggle/input/mimic-iv/MIMIC-Subset/discharge_notes_clean.csv
icustay_subset.csv -> /kaggle/input/mimic-iv/MIMIC-Subset/icustay_subset.csv
labevents_filtered.csv -> /kaggle/input/mimic-iv/MIMIC-Subset/labevents_filtered.csv
patients_subset.csv -> /kaggle/input/mimic-iv/MIMIC-Subset/patients_subset.csv
radiology_notes_clean.csv -> /kaggle/input/mimic-iv/MIMIC-Subset/radiology_notes_clean.csv


In [4]:
import pandas as pd
import os

DATA_DIR = "/kaggle/input/mimic-iv/MIMIC-Subset"

char = pd.read_csv(f"{DATA_DIR}/chartevents_filtered.csv")
lab = pd.read_csv(f"{DATA_DIR}/labevents_filtered.csv")
icu = pd.read_csv(f"{DATA_DIR}/icustay_subset.csv")
pat = pd.read_csv(f"{DATA_DIR}/patients_subset.csv")
adm = pd.read_csv(f"{DATA_DIR}/admissions_subset.csv")
diag = pd.read_csv(f"{DATA_DIR}/diagnoses_withstayids.csv")
dnote = pd.read_csv(f"{DATA_DIR}/discharge_notes_clean.csv")
rnote = pd.read_csv(f"{DATA_DIR}/radiology_notes_clean.csv")

print("Loaded all datasets!")


Loaded all datasets!


In [5]:
# Load small tables into memory (should be small)
icustay = pd.read_csv(files['icustay_subset.csv'])
patients = pd.read_csv(files['patients_subset.csv'])
admissions = pd.read_csv(files['admissions_subset.csv'])
diagnoses = pd.read_csv(files['diagnoses_withstayids.csv'])
discharge_notes = pd.read_csv(files['discharge_notes_clean.csv'])
radiology_notes = pd.read_csv(files['radiology_notes_clean.csv'])

# Quick checks
print("icustay:", icustay.shape)
print("patients:", patients.shape)
print("admissions:", admissions.shape)
print("diagnoses:", diagnoses.shape)
print("discharge notes:", discharge_notes.shape)
print("radiology notes:", radiology_notes.shape)


icustay: (10000, 3)
patients: (10000, 6)
admissions: (10000, 16)
diagnoses: (193485, 4)
discharge notes: (7831, 4)
radiology notes: (71534, 4)


In [6]:
# Make sure char is loaded from the previous cell

# Vital sign itemid → easy labels
vital_map = {
    211:'HR', 220045:'HR',
    51:'SBP', 220050:'SBP',
    8368:'DBP', 220051:'DBP',
    52:'MAP', 220052:'MAP',
    618:'RR', 220210:'RR',
    223761:'Temp',
    646:'SpO2', 220277:'SpO2'
}

# Apply mapping safely
char['label'] = char['itemid'].map(vital_map)

# Keep only mapped rows
char = char.dropna(subset=['label'])

# Keep important columns
char = char[['stay_id','charttime','label','valuenum']]
char['charttime'] = pd.to_datetime(char['charttime'])

# Aggregate VITALS per stay
vitals = (
    char.groupby(['stay_id','label'])['valuenum']
         .agg(['mean','max','min'])
         .reset_index()
)

# pivot so each stay_id gets HR_mean, RR_max, etc.
vitals = vitals.pivot(index='stay_id', columns='label')
vitals.columns = [f"{stat}_{vital}" for stat, vital in vitals.columns]
vitals.reset_index(inplace=True)

print("Vitals feature table ready!")
vitals.head()


Vitals feature table ready!


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,stay_id,mean_DBP,mean_HR,mean_MAP,mean_RR,mean_SBP,mean_SpO2,mean_Temp,max_DBP,max_HR,...,max_SBP,max_SpO2,max_Temp,min_DBP,min_HR,min_MAP,min_RR,min_SBP,min_SpO2,min_Temp
0,30000831,,106.229885,,25.313953,,94.534884,99.035294,,157.0,...,,99.0,101.2,,80.0,,17.0,,88.0,97.8
1,30002548,57.454545,75.241379,75.590909,16.448276,115.5,98.413793,98.255556,77.0,84.0,...,160.0,100.0,99.2,45.0,65.0,59.0,0.0,82.0,96.0,97.2
2,30003087,,91.53913,,17.054054,,97.154639,98.187097,,141.0,...,,100.0,98.9,,66.0,,11.0,,90.0,96.6
3,30003306,43.5,60.594937,61.25,16.341772,104.75,96.367089,98.261538,44.0,80.0,...,108.0,100.0,99.0,43.0,60.0,60.0,9.0,103.0,76.0,97.5
4,30005085,55.666667,83.733333,76.666667,15.822222,129.277778,94.932584,98.75,76.0,101.0,...,174.0,100.0,101.8,45.0,67.0,61.0,6.0,97.0,89.0,96.4


In [7]:
lab_map = {
    51300:'WBC',
    50813:'Lactate',
    50912:'Creatinine'
}

lab['label'] = lab['itemid'].map(lab_map)
lab = lab.dropna(subset=['label'])

lab['charttime'] = pd.to_datetime(lab['charttime'])

labs = lab.groupby(['stay_id','label'])['valuenum'].agg(['mean','max','min']).reset_index()
labs = labs.pivot(index='stay_id', columns='label')
labs.columns = [f"{a}_{b}" for a,b in labs.columns]
labs.reset_index(inplace=True)

labs.head()


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,stay_id,mean_Creatinine,mean_Lactate,mean_WBC,max_Creatinine,max_Lactate,max_WBC,min_Creatinine,min_Lactate,min_WBC
0,30000831,1.969048,2.08,,3.1,7.4,,1.5,1.0,
1,30002548,0.966667,1.25,,1.2,1.6,,0.8,0.8,
2,30003087,0.671429,1.4,,0.7,1.4,,0.6,1.4,
3,30003306,3.540625,1.583333,,5.2,2.2,,1.9,0.7,
4,30005085,0.828571,1.65,,1.0,1.8,,0.7,1.5,


In [8]:
df = icu.merge(pat, on='subject_id', how='left')
df = df.merge(adm, on='hadm_id', how='left')
df = df.merge(vitals, on='stay_id', how='left')
df = df.merge(labs, on='stay_id', how='left')

# dummy target label: ICU mortality (0/1)
if 'hospital_expire_flag' in df.columns:
    df['label'] = df['hospital_expire_flag']
else:
    df['label'] = 0  # will adjust later if needed

df = df.fillna(df.mean(numeric_only=True))

print(df.shape)
df.head()


(12158, 54)


Unnamed: 0,stay_id,hadm_id,subject_id_x,gender,anchor_age,anchor_year,anchor_year_group,dod,subject_id_y,admittime,...,mean_Creatinine,mean_Lactate,mean_WBC,max_Creatinine,max_Lactate,max_WBC,min_Creatinine,min_Lactate,min_WBC,label
0,30000831,22744101,15726459,M,78,2140,2020 - 2022,,15726459,2140-04-17 21:25:00,...,1.969048,2.08,8.430833,3.1,7.4,8.659167,1.5,1.0,8.2025,0
1,30002548,24622512,14311522,M,70,2111,2017 - 2019,,14311522,2111-08-15 20:24:00,...,0.966667,1.25,8.430833,1.2,1.6,8.659167,0.8,0.8,8.2025,0
2,30003087,20035892,10682002,M,55,2132,2017 - 2019,,10682002,2132-12-01 12:28:00,...,0.671429,1.4,8.430833,0.7,1.4,8.659167,0.6,1.4,8.2025,0
3,30003306,28956560,16235911,M,91,2186,2017 - 2019,2189-01-04,16235911,2188-06-04 21:57:00,...,3.540625,1.583333,8.430833,5.2,2.2,8.659167,1.9,0.7,8.2025,0
4,30005085,23247126,14289094,M,60,2136,2008 - 2010,,14289094,2136-01-24 14:32:00,...,0.828571,1.65,8.430833,1.0,1.8,8.659167,0.7,1.5,8.2025,0


In [9]:
# CELL 9 (CORRECTED) — Apply Fuzzy Logic

import numpy as np
import skfuzzy as fuzz

def fuzzy_score(row):
    score = 0

    # Heart rate high
    hr = row.get('mean_HR', np.nan)
    if not np.isnan(hr):
        hr_x = np.arange(30, 201, 1)
        hr_high = fuzz.trimf(hr_x, [110, 140, 200])
        score += fuzz.interp_membership(hr_x, hr_high, hr)

    # SBP low
    sbp = row.get('mean_SBP', np.nan) 
    if not np.isnan(sbp):
        sbp_x = np.arange(40, 221, 1)
        sbp_low = fuzz.trimf(sbp_x, [40, 70, 100])
        score += fuzz.interp_membership(sbp_x, sbp_low, sbp)

    # RR high
    rr = row.get('mean_RR', np.nan) 
    if not np.isnan(rr):
        rr_x = np.arange(5, 41, 1)
        rr_high = fuzz.trimf(rr_x, [20, 28, 40])
        score += fuzz.interp_membership(rr_x, rr_high, rr)

    # SpO2 low
    spo2 = row.get('mean_SpO2', np.nan) 
    if not np.isnan(spo2):
        spo2_x = np.arange(60, 101, 1)
        spo2_low = fuzz.trimf(spo2_x, [60, 88, 94])
        score += fuzz.interp_membership(spo2_x, spo2_low, spo2)

    return np.tanh(score)  # squash 0–1

# Now apply the CORRECTED function
df['fuzzy_score'] = df.apply(fuzzy_score, axis=1)

print("Fuzzy score calculation complete.")
df[['stay_id','fuzzy_score']].head()

Fuzzy score calculation complete.


Unnamed: 0,stay_id,fuzzy_score
0,30000831,0.581181
1,30002548,0.0
2,30003087,0.0
3,30003306,0.0
4,30005085,0.0


In [10]:
from deap import base, creator, tools, algorithms
from sklearn.metrics import roc_auc_score
import random

# Use ICU mortality as label (or proxy)
y = df['label'].values

def evaluate_individual(ind):
    hr_cut = ind[0]
    sbp_cut = ind[1]
    
    # recompute fuzzy score with new cutpoints
    scores = []
    for _, row in df.iterrows():
        hr = row.get('HR_mean', np.nan)
        sbp = row.get('SBP_mean', np.nan)
        
        score = 0
        if not np.isnan(hr):
            hr_x = np.arange(30,201,1)
            hr_high = fuzz.trimf(hr_x, [hr_cut, hr_cut+20, 200])
            score += fuzz.interp_membership(hr_x, hr_high, hr)
        
        if not np.isnan(sbp):
            sbp_x = np.arange(40,221,1)
            sbp_low = fuzz.trimf(sbp_x, [40, sbp_cut, sbp_cut+20])
            score += fuzz.interp_membership(sbp_x, sbp_low, sbp)
        
        scores.append(np.tanh(score))
    
    try:
        return (roc_auc_score(y, scores),)
    except:
        return (0.5,)

# GA setup
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_hr", random.uniform, 80, 140)
toolbox.register("attr_sbp", random.uniform, 50, 100)
toolbox.register("individual", tools.initCycle, creator.Individual,
                 (toolbox.attr_hr, toolbox.attr_sbp), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate_individual)
toolbox.register("mate", tools.cxBlend, alpha=0.3)
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=5, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

# run GA
pop = toolbox.population(n=10)
hof = tools.HallOfFame(1)
algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                    ngen=6, halloffame=hof, verbose=True)

print("Best GA parameters:", hof[0])


gen	nevals
0  	10    
1  	4     
2  	9     
3  	5     
4  	10    
5  	5     
6  	9     
Best GA parameters: [118.36560790747302, 51.250537761133344]


In [11]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("GEMINI_API_KEY")


In [12]:
import google.generativeai as genai
import json
import os
import pandas as pd

# --- Setup: Load API Key from Kaggle Secrets ---
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    GEMINI_KEY = user_secrets.get_secret("GEMINI_API_KEY")
    genai.configure(api_key=GEMINI_KEY)
except (ImportError, KeyError):
    GEMINI_KEY = None
    print("Kaggle secrets not found or API key not set. Please set the GEMINI_API_KEY secret.")


# --- Dummy DataFrame for Testing ---
data = {'stay_id': [1], 'HR_mean': [85], 'SBP_mean': [110], 'RR_mean': [18], 'Temp_mean': [37.0], 'SpO2_mean': [98], 'Lactate_mean': [1.5], 'fuzzy_score': [0.3]}
df = pd.DataFrame(data)


def agentic_explanation(stay_id):
    """
    Generates a clinical explanation for a given stay_id using the Gemini API.
    """
    if not GEMINI_KEY:
        return "NO API KEY SET — Unable to call LLM."

    try:
        row = df[df['stay_id'] == stay_id].iloc[0]
    except IndexError:
        return f"Error: stay_id {stay_id} not found in the DataFrame."

    prompt = f"""
    You are a clinical AI agent.

    Analyze the following patient data and provide a concise summary. You must also tell me sepsis score.

    Vital signs and lab summary:
    - Heart Rate (mean): {row.get('HR_mean', 'N/A')} bpm
    - Systolic Blood Pressure (mean): {row.get('SBP_mean', 'N/A')} mmHg
    - Respiratory Rate (mean): {row.get('RR_mean', 'N/A')} breaths/min
    - Temperature (mean): {row.get('Temp_mean', 'N/A')} °C
    - Oxygen Saturation (SpO2 mean): {row.get('SpO2_mean', 'N/A')}%
    - Lactate (mean): {row.get('Lactate_mean', 'N/A')} mmol/L
    - Sepsis Risk Score (fuzzy_score): {row.get('fuzzy_score', 'N/A')}

    Based on this data, explain in simple language:
    1.  **Clinical Interpretation:** What does this summary of vitals and labs indicate?
    2.  **Risk Assessment:** Why is the patient's risk currently high, moderate, or low?
    3.  **Immediate Check:** What is one immediate clinical check a nurse or doctor should perform?
    4.  **Follow-up Action:** What is one logical follow-up action or test to consider?
    """

    try:
        # CORRECTED LINE: Use a more recent and widely available model
        model = genai.GenerativeModel('gemini-2.0-flash')

        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                max_output_tokens=350,
                temperature=0.7,
            )
        )
        return response.text

    except Exception as e:
        return f"An error occurred while calling the Gemini API: {e}"

# --- Test the function on one stay ---
if not df.empty:
    first_stay_id = df['stay_id'].iloc[0]
    explanation = agentic_explanation(first_stay_id)
    print(explanation)
else:
    print("The DataFrame is empty. Cannot run the test.")

Okay, here's an analysis of the patient data:

**Summary:**

The patient presents with relatively stable vital signs: a normal heart rate, adequate blood pressure, normal respiratory rate, normal temperature, and good oxygen saturation. The lactate level is slightly elevated, and the sepsis risk score (fuzzy_score) is 0.3.

**1. Clinical Interpretation:**

The patient's vital signs are generally within normal limits, suggesting they are currently stable. However, the slightly elevated lactate level suggests there may be some degree of tissue hypoperfusion or anaerobic metabolism occurring. This could be due to a variety of factors and warrants further investigation.

**2. Risk Assessment:**

The patient's sepsis risk score (fuzzy_score) is 0.3. This is considered a low risk, but it is not zero. The elevated lactate is the primary driver of this risk.

**3. Immediate Check:**

*   **Assess Fluid Status/Volume Status:** Check for signs of dehydration (e.g., skin turgor, mucous membrane m

In [13]:
import google.generativeai as genai
from kaggle_secrets import UserSecretsClient

# Configure the API key first
try:
    user_secrets = UserSecretsClient()
    GEMINI_KEY = user_secrets.get_secret("GEMINI_API_KEY")
    genai.configure(api_key=GEMINI_KEY)

    print("Available models that support 'generateContent':\n")
    # List models that support the 'generateContent' method
    for m in genai.list_models():
      if 'generateContent' in m.supported_generation_methods:
        print(m.name)

except (ImportError, KeyError):
    print("Could not configure API key. Please set the GEMINI_API_KEY secret.")
except Exception as e:
    print(f"An error occurred: {e}")

Available models that support 'generateContent':

models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash
models/gemini-2.5-pro-preview-05-06
models/gemini-2.5-pro-preview-06-05
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01-21
models/gemini-2.0-flash-thinking-exp
models/gemini-2.0-flash-thinking-exp-1219
models/gemini-2.5-flash-preview-tts
models/gemini-2.5-pro-preview-tts
models/learnlm-2.0-flash-experimental
models/gemma-3-1b-it
models/gemma-3-4b-it
models/gemma-3-12b-it
models/gemma-3-27b-it
models/gemma-3n-e4b-it
models/gemma-3n-e2b-it
models/gemini-flash-latest
models/gemini-flash-lite-latest
models/gemini-pro-latest
mo