In [1]:
pip install pandas numpy scikit-learn tensorflow keras matplotlib seaborn imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from imblearn.over_sampling import SMOTE

In [3]:
mimic_path = "C:/Users/SYYAD/Documents/MSAI/AI in Healthcare/NLP/"  # Change this to your path

In [4]:
labevents_file = os.path.join(mimic_path, "LABEVENTS.csv")

# Load only needed columns
df_labs = pd.read_csv(labevents_file, usecols=["SUBJECT_ID", "HADM_ID", "ITEMID", "VALUENUM"])

In [5]:
micro_file = os.path.join(mimic_path, "MICROBIOLOGYEVENTS.csv")

# Load data with pathogen information
df_micro = pd.read_csv(micro_file, usecols=["SUBJECT_ID", "HADM_ID", "ORG_NAME"])

In [6]:
prescriptions_file = os.path.join(mimic_path, "PRESCRIPTIONS.csv")

# Load only required columns
df_meds = pd.read_csv(prescriptions_file, usecols=["SUBJECT_ID", "HADM_ID", "DRUG"])

In [7]:
# Select relevant lab tests for infection risk
infection_markers = {
    50861: "wbc_count",  # White Blood Cell Count
    50912: "creatinine",  # Kidney Function
    50822: "crp",  # C-Reactive Protein (infection marker)
    50813: "lactate"  # Lactate (infection/sepsis indicator)
}

df_labs = df_labs[df_labs["ITEMID"].isin(infection_markers.keys())]
df_labs["ITEMID"] = df_labs["ITEMID"].map(infection_markers)

# Pivot table to structure the data
df_labs = df_labs.pivot_table(index=["SUBJECT_ID", "HADM_ID"], columns="ITEMID", values="VALUENUM").reset_index()

In [8]:
# Identify common hospital-acquired infections (e.g., MRSA, Klebsiella, Pseudomonas)
hai_pathogens = ["STAPH AUREUS COAG", "ESCHERICHIA COLI", "STAPHYLOCOCCUS, COAGULASE NEGATIVE", "KLEBSIELLA PNEUMONIAE", "PSEUDOMONAS AERUGINOSA"]

# Create infection flag
df_micro["hai_infection"] = df_micro["ORG_NAME"].apply(lambda x: 1 if any(pathogen in str(x) for pathogen in hai_pathogens) else 0)

# Aggregate to patient level (1 if any infection was found)
df_micro = df_micro.groupby(["SUBJECT_ID", "HADM_ID"])["hai_infection"].max().reset_index()

In [9]:
# Identify common **antibiotics used for hospital-acquired infections**
antibiotics = ["Vancomycin", "Ceftriaxone", "Piperacillin", "Meropenem", "Linezolid"]

# Create antibiotic treatment flag
df_meds["antibiotic_treatment"] = df_meds["DRUG"].apply(lambda x: 1 if any(abx in str(x) for abx in antibiotics) else 0)

# Aggregate to patient level (1 if any antibiotic was given)
df_meds = df_meds.groupby(["SUBJECT_ID", "HADM_ID"])["antibiotic_treatment"].max().reset_index()

In [10]:
# Merge lab results, microbiology data, and medication history
df = df_labs.merge(df_micro, on=["SUBJECT_ID", "HADM_ID"], how="left")
df = df.merge(df_meds, on=["SUBJECT_ID", "HADM_ID"], how="left")

# Fill missing values with median
df.fillna(df.median(), inplace=True)

# Drop ID columns
# df = df.drop(columns=["SUBJECT_ID", "HADM_ID"])

In [11]:
# Features & target variable
features = ["wbc_count", "creatinine", "crp", "lactate", "antibiotic_treatment"]
X = df[features]
y = df["hai_infection"]  # Target variable


In [12]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("After SMOTE:", pd.Series(y_resampled).value_counts())  # Check new balance

After SMOTE: hai_infection
1.0    38691
0.0    38691
Name: count, dtype: int64


In [13]:
# Normalize data
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)

# Split dataset (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

In [14]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluation
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

# Check if there are at least two classes in y_train before calculating AUC
if len(np.unique(y_train)) > 1:
	print("Random Forest AUC:", roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1]))
else:
	print("Random Forest AUC cannot be calculated because there is only one class in the target variable.")

print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.7794146152355108
Random Forest AUC: 0.8571119595089083
              precision    recall  f1-score   support

         0.0       0.79      0.76      0.78      7739
         1.0       0.77      0.80      0.78      7738

    accuracy                           0.78     15477
   macro avg       0.78      0.78      0.78     15477
weighted avg       0.78      0.78      0.78     15477



In [15]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Logistic Regression AUC:", roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1]))

Logistic Regression Accuracy: 0.6795244556438587
Logistic Regression AUC: 0.7048819256413132


In [16]:
TIME_STEPS = 16  # Use past 5 records for each prediction

# Convert data into sequences
def create_sequences(df, time_steps):
    X_seq, y_seq = [], []
    
    for patient_id in df["SUBJECT_ID"].unique():
        patient_data = df[df["SUBJECT_ID"] == patient_id]
        features = patient_data.drop(columns=["hai_infection"])  # Drop target column
        
        for i in range(len(features) - time_steps):
            X_seq.append(features.iloc[i:i+time_steps].values)  # Use past `time_steps` records
            y_seq.append(patient_data["hai_infection"].iloc[i+time_steps])  # Predict infection at last step
            
    return np.array(X_seq), np.array(y_seq)

X, y = create_sequences(df, TIME_STEPS)

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Print shapes
print(f" X_train shape: {X_train.shape}")  # (samples, time_steps, features)
print(f" y_train shape: {y_train.shape}")  # (samples,)


 X_train shape: (88, 16, 7)
 y_train shape: (88,)


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Build LSTM model
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.3),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32, return_sequences=False),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=15, batch_size=32)


  super().__init__(**kwargs)


Epoch 1/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 357ms/step - accuracy: 0.6879 - loss: 0.6532 - val_accuracy: 0.7826 - val_loss: 0.5403
Epoch 2/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.7752 - loss: 0.5491 - val_accuracy: 0.7826 - val_loss: 0.5258
Epoch 3/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.7674 - loss: 0.5425 - val_accuracy: 0.7826 - val_loss: 0.5278
Epoch 4/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.7674 - loss: 0.5418 - val_accuracy: 0.7826 - val_loss: 0.5236
Epoch 5/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.7635 - loss: 0.5449 - val_accuracy: 0.7826 - val_loss: 0.5241
Epoch 6/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.7479 - loss: 0.5618 - val_accuracy: 0.7826 - val_loss: 0.5254
Epoch 7/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━