In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import re

In [None]:
# Load the raw surgical case log from the Excel workbook.
info = pd.read_excel(
    "data/info.xlsx",
)

In [None]:
# Keep only cases with complete OR and anesthesia timestamps and a named
# primary procedure, then renumber the surviving rows from zero.
required_columns = [
    'IN_OR_DTTM',
    'OUT_OR_DTTM',
    'AN_START_DATETIME',
    'AN_STOP_DATETIME',
    'PRIMARY_PROCEDURE_NM',
]
info = info.dropna(subset=required_columns).reset_index(drop=True)

In [None]:
# Restrict the frame to the columns used by the rest of the analysis,
# in the order they are listed here.
analysis_columns = [
    'MRN', 'LOG_ID',
    'HOSP_ADMSN_TIME', 'HOSP_DISCH_TIME',
    'SRG_PLN_TIME', 'SRG_CNL_TIME',
    'IN_OR_DTTM', 'OUT_OR_DTTM',
    'AN_START_DATETIME', 'AN_STOP_DATETIME',
    'ICU_ADMIN_FLAG', 'BIRTH_DATE', 'SEX',
    'PRIMARY_ANES_TYPE_NM', 'ASA_RATING', 'ASA_RATING_C',
    'PRIMARY_PROCEDURE_NM',
]
info = info[analysis_columns]

In [None]:
# Identifier and categorical text columns use the pandas nullable string dtype.
for column in ('MRN', 'LOG_ID', 'ICU_ADMIN_FLAG', 'SEX',
               'PRIMARY_ANES_TYPE_NM', 'ASA_RATING', 'PRIMARY_PROCEDURE_NM'):
    info[column] = info[column].astype("string")

# Every timestamp column is parsed with the same month/day/2-digit-year layout.
for column in ('HOSP_ADMSN_TIME', 'HOSP_DISCH_TIME', 'SRG_PLN_TIME', 'SRG_CNL_TIME',
               'IN_OR_DTTM', 'OUT_OR_DTTM', 'AN_START_DATETIME', 'AN_STOP_DATETIME'):
    info[column] = pd.to_datetime(info[column], format="%m/%d/%y %H:%M")

# Nullable integer: the ASA code may be missing.
info["ASA_RATING_C"] = info["ASA_RATING_C"].astype('Int64')

# BIRTH_DATE is cast to a plain integer and exposed as AGE from here on.
# NOTE(review): presumably the source column already holds an age in years - confirm.
info["BIRTH_DATE"] = info["BIRTH_DATE"].astype("int64")
info = info.rename(columns={'BIRTH_DATE':'AGE'})

In [None]:
# Order cases per patient, then by admission time, planned surgery time and log id.
sort_keys = ['MRN', 'HOSP_ADMSN_TIME', 'SRG_PLN_TIME', 'LOG_ID']
info = info.sort_values(by=sort_keys)

In [None]:
# Surgery categories based on user-defined classification.
#
# Maps a category name to the upper-case keywords that identify it. The
# categoriser walks the categories and keywords in the order written here and
# returns the first hit, so ordering matters when a keyword appears in more
# than one category (e.g. "LAPAROTOMY" in both "General" and "Trauma").
surgeries = {
    "Bariatric": [
        "GASTRIC BYPASS", "SLEEVE GASTRECTOMY", "GASTRIC BANDING", "ROUX-EN-Y GASTROENTEROSTOMY"
    ],
    "Cardiothoracic": [
        "BYPASS GRAFT", "CABG", "VALVE REPLACEMENT", "THORACOTOMY", 
        "VATS", "THORACOSCOPIC", "LOBECTOMY", "PLEURODESIS", "CARDIOVERSION", "CATHETERIZATION",
        "DECORTICATION", "THORACENTESIS", "PERICARDIOCENTESIS", "AORTIC ANEURYSM REPAIR",
        "CARDIAC CATHETERIZATION", "ELECTROPHYSIOLOGY STUDY", "BYPASS", "ABLATION", "COX MAZE PROCEDURE",
        "PERICARDIAL WINDOW", "ECHOCARDIOGRAM"
    ],
    "ENT (Ear, Nose, and Throat)": [
        "TONSILLECTOMY", "SINUS", "SEPTOPLASTY", "RHINOPLASTY", "LARYNGOSCOPY", "FLOOR OF MOUTH OR TONGUE",
        "TYMPANOPLASTY", "STAPEDECTOMY", "PAROTIDECTOMY", "THYROIDECTOMY", "PARATHYROIDECTOMY",
        "GLOSSECTOMY", "NOSE", "VESTIBULE", "SOFT PALATE", "UVULA", "CANALOPLASTY", "EPISTAXIS",
        "CRYOABLATION", "TRANSORAL SURGERY"
    ],
    "General": [
        "APPENDECTOMY", "CHOLECYSTECTOMY", "HERNIORRHAPHY", "COLECTOMY", "GASTRECTOMY",
        "BOWEL RESECTION", "LAPAROTOMY", "LAPAROSCOPY", "WHIPPLE PROCEDURE", "PANCREATECTOMY",
        "SPLENECTOMY", "JEJUNOSTOMY", "GASTROSTOMY", "ABDOMINOPLASTY", "ILEOSTOMY", "RESECTION", "ANTERIOR", 
        "DRAINAGE", "ABSCESS", "CYST", "WHIPPLE", "EXCISION", "LESION", "NEPHROSTOMY", "ANESTHESIA", 
        "ANOPLASTY", "ANOSCOPY", "APPENDICOVESICOSTOMY", "CHOLECYSTOSTOMY", "ABDOMINAL WALL", "COLOSTOMY",
        # A second, redundant "COLOSTOMY" entry used to follow "BURR HOLE"; it never affected matching.
        "FASCIOTOMY", "TRACHEOSTOMY", "WOUND", "COCCYGECTOMY", "BURR HOLE", "CYSTOGASTROSTOMY",
        "HERNIA", "DECOMPRESSION, SPINE, LUMBAR", "DECOMPRESSION, SPINE, LUMBAR, POSTERIOR APPROACH, WITH POSTERIOR COLUMN FUSION",
        "DECOMPRESSION, ULNAR NERVE", "DECORTICATION, LUNG, TOTAL, USING VATS", "ESOPHAGECTOMY", "ESOPHAGOGASTRECTOMY",
        "EVISCERATION", "TUNNELED CENTRAL VENOUS DEVICE", "TUNNELED CUFFED HEMODIALYSIS CATHETER", "VAGUS NERVE STIMULATOR",
        "INTRAMEDULLARY RODDING", "BILIARY", "CATHETER", "DRAIN"
    ],
    "Gynecological": [
        "CURETTAGE", "HYSTERECTOMY", "SALPINGO-OOPHORECTOMY", "SALPINGECTOMY", "MYOMECTOMY", "D&C", "COLPORRHAPHY",
        "COLPOCLEISIS", "HYSTEROSCOPY", "OOPHORECTOMY", "SACROCOLPOPEXY", "ENDOMETRIAL ABLATION", "CESAREAN",
        "CERCLAGE", "COLPOSCOPY", "MIDURETHRAL SLING", "DILATION EVACUATION"
    ],
    "Neurosurgical": [
        "CRANIOTOMY", "CRANIECTOMY", "LAMINECTOMY", "SPINAL FUSION", "DISCECTOMY",
        "VENTRICULOPERITONEAL SHUNT", "BRAIN TUMOR RESECTION", "SUBDURAL HEMATOMA EVACUATION",
        "DEEP BRAIN STIMULATOR", "NEUROSTIMULATOR", "CERVICAL DECOMPRESSION"
    ],
    "Oncological": [
        "TUMOR RESECTION", "MASTECTOMY", "LYMPHADENECTOMY", "NEPHRECTOMY", "PANCREATECTOMY",
        "GLOSSECTOMY", "PAROTIDECTOMY", "FLAP", "ADRENALECTOMY", "BRACHYTHERAPY", "CHEMOTHERAPY",
        "CYSTOPROSTATECTOMY", "LYMPH NODE", "DECORTICATION, CYST, KIDNEY"
    ],
    "Ophthalmological": [
        "CATARACT EXTRACTION", "VITRECTOMY", "RETINAL DETACHMENT REPAIR", "GLAUCOMA SURGERY",
        "KERATOPLASTY", "ENUCLEATION", "CATARACT", "RETINAL DETACHMENT", "RETINAL"
    ],
    "Orthopedic": [
        "ORIF", "ARTHROPLASTY", "ARTHROSCOPY", "SPINAL FUSION", "LAMINECTOMY", "DISCECTOMY", "ARTHRODESIS", "ARTHROTOMY",
        "AMPUTATION", "DUPUYTREN'S CONTRACTURE", "CARPAL TUNNEL", "TENDON REPAIR", "FRACTURE REPAIR", "CLOSED REDUCTION",
        "ADJUSTMENT", "ARTHROCENTESIS", "EXTERNAL FIXATION DEVICE", "CAST OR SPLINT", "BASIC FOOT", "BASIC HAND",
        "BUNIONECTOMY", "BURSECTOMY", "CARPECTOMY", "FRACTURE", "PREVIOUS HIP SURGERY", "PREVIOUS HIP SURGERY TO TOTAL HIP REPLACEMENT",
        "DISARTICULATION", "INTRAMEDULLARY NAIL", "OMMAYA RESERVOIR", "PENILE PROSTHESIS"
    ],
    "Pediatric": [
        "CIRCUMCISION", "ORCHIOPEXY", "HERNIA REPAIR", "APPENDECTOMY", "PYELOPLASTY", "CYSTOURETHROGRAM"
    ],
    "Plastic and Reconstructive": [
        "BREAST RECONSTRUCTION", "MASTECTOMY", "ABDOMINOPLASTY", "RHINOPLASTY", "RHYTIDECTOMY", "GRAFT",
        "LIPOSUCTION", "SKIN GRAFTING", "SCAR REVISION", "FLAP RECONSTRUCTION", "BLEPHAROPLASTY", "ALLOGRAFT",
        "MAXILLOMANDIBULAR FIXATION", "BLEPHARORRHAPHY", "BRACHIOPLASTY", "BROW LIFT", "CAPSULECTOMY",
        "EAR AURICLE", "CONJUNCTIVOPLASTY", "LAGOPHTHALMOS", "CRANIOPLASTY", "FACELIFT"
    ],
    "Specialized": [
        # The comma after "ENDOSCOPIC" was missing, which silently fused it with
        # the next literal into the single keyword "ENDOSCOPICLUMBAR PUNCTURE".
        "ELECTROCONVULSIVE THERAPY", "BRONCHOSCOPY", "ERCP", "COLONOSCOPY", "EGD", "ENDOSCOPIC",
        "LUMBAR PUNCTURE", "PERITONEAL DIALYSIS", "ASPIRATION", "BIOPSY", "CONE BIOPSY", "ENDOSCOPY",
        "CHOLANGIOGRAM"
    ],
    "Trauma": [
        "LAPAROTOMY", "FRACTURE REPAIR", "IRRIGATION AND DEBRIDEMENT", "DEBRIDEMENT", "IRRIGATION", "THORACOTOMY"
    ],
    "Urological": [
        "CYSTOSCOPY", "URETEROSCOPY", "RETROGRADE PYELOGRAM", "HOLMIUM LASER LITHOTRIPSY",
        "PERCUTANEOUS NEPHROLITHOTOMY", "TURBT", "CYSTECTOMY", "NEPHRECTOMY",
        "URETERAL STENT REPLACEMENT", "PROSTATECTOMY", "TURP", "CYSTOLITHOLAPAXY",
        "URETHROPLASTY", "NEPHROURETERECTOMY", "INDIANA POUCH", "URETHRAL SLING", "PROSTATE",
        "CYSTOLITHOTOMY", "CYSTOSTOMY", "CYSTOURETEROSCOPY", "CYSTOURETHROPLASTY"
    ],
    "Vascular": [
        "ENDARTERECTOMY", "ANGIOGRAM", "FISTULA", "FISTULOGRAM", "THROMBECTOMY", "EMBOLIZATION",
        "VARICOSE VEIN ABLATION", "AORTIC ANEURYSM REPAIR", "ARTERIOGRAM", "ANGIO", "ANASTOMOSIS", 
        "ANGIOPLASTY", "DECOMPRESSION, THORACIC OUTLET", "EMBOLECTOMY"
    ],
}

In [None]:
# Function to extract surgery name and categorize it
def extract_and_categorize_surgery(procedure, categories=None):
    """Map a free-text procedure name to a ``(keyword, category)`` pair.

    Matching is case-insensitive and tried in three tiers, from most to least
    specific. Within a tier, categories and their keywords are scanned in
    definition order and the first hit wins:

    1. the whole procedure string equals a keyword;
    2. one of the comma-separated segments of the procedure equals a keyword;
    3. one of the individual tokens (split on commas, whitespace, ``-``, ``\\``
       and ``/``, with filler words removed) equals a keyword.

    An earlier revision ran a fourth pass between tiers 2 and 3 that repeated
    tier 2 on the comma segments minus stop words. Removing segments can never
    produce a match that tier 2 missed, so that pass was dropped.

    Parameters
    ----------
    procedure : str
        Procedure description, e.g. ``"CYSTOSCOPY, WITH STENT INSERTION"``.
    categories : dict[str, list[str]], optional
        Mapping of category name to keywords. Defaults to the module-level
        ``surgeries`` table.

    Returns
    -------
    tuple[str, str]
        The matched keyword (as spelled in ``categories``) and its category.
        When nothing matches, the first remaining token of ``procedure`` (or
        ``procedure`` itself if no token remains) paired with ``"Other"``.
    """
    if categories is None:
        categories = surgeries

    stop_words = {"av", "with", "for", "and", "of", "or", "the", "using", "to", "by", "da", "vinci", "xi", "si", "ir", "gi", "insertion"}

    # Tier 2 candidates: non-empty comma-separated segments.
    segments = {part.strip().lower() for part in procedure.split(",") if part.strip()}

    # Tier 3 candidates: individual tokens with filler words removed. The
    # original casing and order are kept because the fallback returns tokens[0].
    raw_tokens = re.split(r"[,\s\-\\/]+| AND | OR | OF | WITH | FOR ", procedure, flags=re.IGNORECASE)
    tokens = [tok.strip() for tok in raw_tokens if tok.strip() and tok.strip().lower() not in stop_words]
    token_set = {tok.lower() for tok in tokens}

    # Candidate sets are lowered once up front rather than once per keyword.
    for candidates in ({procedure.lower()}, segments, token_set):
        for category, keywords in categories.items():
            for keyword in keywords:
                if keyword.lower() in candidates:
                    return keyword, category  # Return best surgery name and category

    # Fallback: return the first word and categorize as 'Other'
    return (tokens[0] if tokens else procedure), "Other"

In [None]:
# Apply extraction and categorization.
# Collect the (name, category) tuples in one pass and assign them as plain
# lists; the previous apply(lambda x: pd.Series(...)) built a Series object
# for every row, which is very slow on large logs.
surgery_matches = [extract_and_categorize_surgery(procedure) for procedure in info["PRIMARY_PROCEDURE_NM"]]
info["SURG_NAME"] = [name for name, _ in surgery_matches]
info["SURG_CATE"] = [category for _, category in surgery_matches]

In [None]:
# Feature engineering: elapsed time between pairs of timestamps, always stored
# as a non-negative magnitude (the sign of the difference is discarded).
# Each entry maps the new feature name to its (start, end) timestamp columns.
minute_intervals = {
    "Surgery_Duration": ("IN_OR_DTTM", "OUT_OR_DTTM"),
    "Anesthesia_Duration": ("AN_START_DATETIME", "AN_STOP_DATETIME"),
    "Delay_From_Planned": ("SRG_PLN_TIME", "IN_OR_DTTM"),
    "Anesthesia_Induction": ("IN_OR_DTTM", "AN_START_DATETIME"),
    "Recovery_Duration": ("OUT_OR_DTTM", "HOSP_DISCH_TIME"),
}
for feature_name, (start_col, end_col) in minute_intervals.items():
    elapsed_seconds = (info[end_col] - info[start_col]).dt.total_seconds()
    info[feature_name] = elapsed_seconds.abs() / 60

# Length of the hospital stay is the only interval expressed in hours.
stay_seconds = (info["HOSP_DISCH_TIME"] - info["HOSP_ADMSN_TIME"]).dt.total_seconds()
info["Hospital_Stay"] = stay_seconds.abs() / (60 * 60)

In [None]:
# Box plot of time in the operating room, split by patient sex.
fig, ax = plt.subplots()
sns.boxplot(data=info, x='SEX', y='Surgery_Duration', width=0.5, ax=ax)
ax.set_title("Surgery Duration by Gender")
ax.set_xlabel("Gender")
ax.set_ylabel("Surgery Duration in minutes")

# Tick labels stay horizontal and centred under their box.
plt.setp(ax.get_xticklabels(), rotation=0, ha='center')

# Tighten the layout so labels are not clipped.
fig.tight_layout()
plt.show()

In [None]:
# ASA class code -> multi-line tick label (class name plus a short description).
asa_label_map = {
    1: "ASA I\nHealthy",
    2: "ASA II\nMild\nSystemic\nDisease",
    3: "ASA III\nSevere\nSystemic\nDisease",
    4: "ASA IV\nIncapacitating\nDisease",
    5: "ASA V\nMoribund",
    6: "ASA VI\nBrain-Dead"
}

# The dict is written in ascending ASA order, so its values double as the
# category order for the x-axis.
asa_order = list(asa_label_map.values())

# Ordered categorical label column; codes missing from the map become NaN.
info['ASA_LABEL'] = pd.Categorical(
    info['ASA_RATING_C'].map(asa_label_map),
    categories=asa_order,
    ordered=True,
)

# Plot
fig, ax = plt.subplots()
sns.boxplot(data=info, x='ASA_LABEL', y='Surgery_Duration', width=0.5, ax=ax)
ax.set_xlabel("")
ax.set_ylabel("Surgery Duration (minutes)")

# Keep labels centered
plt.setp(ax.get_xticklabels(), rotation=0, ha='center')

# Tidy layout
fig.tight_layout()
plt.show()

In [None]:
# Create 10-year age bins: 0-9, 10-19, ..., 90-99.
# The edges previously started at 10, so every patient younger than 10 was
# silently given a NaN age group, contradicting the intent stated above.
# Labels are derived from the edges so the two can never drift apart.
# Intervals are left-closed (right=False): an age of 10 falls in "10-19".
# Ages of 100 or more still fall outside the last bin and become NaN.
age_bin_edges = list(range(0, 101, 10))
age_bin_labels = [f'{low}-{high - 1}' for low, high in zip(age_bin_edges[:-1], age_bin_edges[1:])]
info['AGE_GROUP'] = pd.cut(info['AGE'], bins=age_bin_edges, right=False, labels=age_bin_labels)

# Plot using the new age group column
sns.boxplot(data=info, x='AGE_GROUP', y='Surgery_Duration', width=0.5)
# plt.title("Surgery Duration by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Surgery Duration in minutes")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of the absolute gap between planned and actual OR entry (minutes).
delay_ax = sns.histplot(info['Delay_From_Planned'], bins=50, kde=True)
delay_ax.set_title("Surgery Start Delays (Planned vs. Actual)")
plt.show()

In [None]:
# Per-category summary statistics of time spent in the operating room.
duration_stats = ['min', 'mean', 'median', 'max', 'std', 'count']
kpi_df = (
    info.groupby('SURG_CATE')['Surgery_Duration']
    .agg(duration_stats)
    .reset_index()
)

# Friendlier headers, in the same order as the statistics above.
kpi_df.columns = ['Procedure', 'Min', 'Avg', 'Median', 'Max', 'Std Dev', 'Case Count']

# Persist the summary so the chart cell can be run on its own.
kpi_df.to_csv("data/surgery_kpi_summary.csv", index=False)

In [None]:
# Reload the KPI summary written by the previous cell (adjust path if needed).
kpi_df = pd.read_csv("data/surgery_kpi_summary.csv")

# Style and canvas
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 7))

# Two bars per category, placed side by side around each integer position.
bar_width = 0.35
x = range(len(kpi_df))
avg_positions = [pos - bar_width/2 for pos in x]
median_positions = [pos + bar_width/2 for pos in x]

# Mean duration, with the standard deviation drawn as an error bar.
bars_avg = ax.bar(
    avg_positions,
    kpi_df["Avg"],
    width=bar_width,
    yerr=kpi_df["Std Dev"],
    capsize=5,
    color='skyblue',
    label="Avg Duration",
)

# Median duration.
bars_median = ax.bar(
    median_positions,
    kpi_df["Median"],
    width=bar_width,
    color='steelblue',
    label="Median Duration",
)

# Axis labels, title and category tick labels.
ax.set_xlabel("Procedure Type")
ax.set_ylabel("Duration (minutes)")
ax.set_title("Surgery KPI Summary by Procedure Type")
ax.set_xticks(x)
ax.set_xticklabels(kpi_df["Procedure"], rotation=45, ha='right')
ax.legend(loc="upper left")

# Annotate each category with its case count and standard deviation. The text
# is anchored above the taller of the two bars, centred on the average bar.
for avg_bar, median_bar, count, std_dev in zip(bars_avg, bars_median, kpi_df["Case Count"], kpi_df["Std Dev"]):
    tallest = max(avg_bar.get_height(), median_bar.get_height())
    label_x = avg_bar.get_x() + bar_width / 2
    ax.text(label_x, tallest + 5, f'Cases\n{count}', ha='center', fontsize=9, color='darkgreen')
    ax.text(label_x, tallest + 40, f'Std. Dev.\n±{std_dev:.1f}', ha='center', fontsize=8, color='red')

fig.tight_layout()
plt.show()



In [None]:
# Operating-room time per surgical category, one box per category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=info,
    x='SURG_CATE',
    y='Surgery_Duration',
    hue='SURG_CATE',
    palette='Blues',
    width=0.5,
    dodge=False,
    ax=ax,
)

# Labels: category names are long, so they are drawn vertically.
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('Surgery Categories', fontsize=14)
ax.set_ylabel('Time Spent (in minutes)', fontsize=12)
ax.set_title('Duration in Operating Room', fontsize=16)

# Render
fig.tight_layout()
plt.show()

In [None]:
# Columns to plot and the panel title for each, in left-to-right order.
duration_types = ['Surgery_Duration', 'Recovery_Duration']
titles = ['Surgery Duration', 'Recovery Duration']

# One panel per duration; the y-axes are independent because the scales differ.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 6), sharey=False)

for panel_index, panel in enumerate(axes):
    sns.boxplot(data=info, x='SURG_CATE', y=duration_types[panel_index], ax=panel)
    panel.set_title(titles[panel_index])
    panel.set_xlabel('Surgical Category')
    panel.set_ylabel('Duration (min)')
    panel.tick_params(axis='x', rotation=90)

fig.tight_layout()
plt.show()



🔁 Operating Room Turnover Time

In [None]:
# The data has no operating-room identifier, so one is inferred: cases are
# processed in order of OR entry time and each is placed in a room whose
# previous case has already left, opening a new room only when none is free.
df_sorted = info.sort_values(by="IN_OR_DTTM")

import heapq

# List of (room_end_time, room_id) tuples, re-heapified after every case.
# Only rooms[0] is guaranteed to be the earliest-ending room; the scan below
# walks the list in its current heap-array order, not in sorted order.
rooms = []  # Min-heap to track room availability: [(room_end_time, room_id)]
room_assignments = []  # Room ID for each surgery
room_count = 0

for index, row in df_sorted.iterrows():
    # NOTE: `exit` shadows the Python builtin of the same name within this cell.
    entry, exit = row["IN_OR_DTTM"], row["OUT_OR_DTTM"]
    
    # Reuse a room if possible: take the first room (in list order) whose
    # last case ended at or before this case's entry time.
    assigned = False
    for i in range(len(rooms)):
        if rooms[i][0] <= entry:
            # Replace the tuple in the heap
            room_id = rooms[i][1]
            rooms[i] = (exit, room_id)
            assigned = True
            break
    
    # No room is free at `entry`: open a new one with the next sequential id.
    if not assigned:
        room_count += 1
        room_id = room_count
        rooms.append((exit, room_id))
    
    # Restore the heap ordering after the in-place replacement or append.
    heapq.heapify(rooms)
    room_assignments.append(room_id)

# room_assignments was filled in df_sorted row order, so positions line up.
df_sorted["INFERRED_OR"] = room_assignments

import plotly.express as px

# Gantt-style timeline: one row per inferred room, bars coloured by category.
fig = px.timeline(df_sorted, x_start="IN_OR_DTTM", x_end="OUT_OR_DTTM", y="INFERRED_OR", color="SURG_CATE")
fig.update_yaxes(autorange="reversed")
fig.show()

In [None]:
# Turnover time = gap between a case entering a room and the previous case
# leaving that same room.
df_or_tunover = df_sorted.sort_values(['INFERRED_OR', 'IN_OR_DTTM'])

# Shift within each inferred room. A plain shift(1) over the whole frame paired
# the first case of every room with the last case of the previous room, which
# produced a meaningless turnover value at each room boundary. With the grouped
# shift, the first case of a room has no predecessor and gets NaT / NaN.
df_or_tunover['prev_out_or_time'] = df_or_tunover.groupby('INFERRED_OR')['OUT_OR_DTTM'].shift(1)
df_or_tunover['turnover_time_min'] = (df_or_tunover['IN_OR_DTTM'] - df_or_tunover['prev_out_or_time']).dt.total_seconds() / 60

# Optional: Filter by OR if you have OR_ID or ROOM column

🔍 Outlier Detection & Root Cause

Statistical outlier detection:

In [None]:
# Flag durations outside the Tukey fences (1.5 x IQR beyond the quartiles).
surgery_minutes = info['Surgery_Duration']
Q1 = surgery_minutes.quantile(0.25)
Q3 = surgery_minutes.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
info['Surgery_Duration_Outlier'] = surgery_minutes.lt(lower_fence) | surgery_minutes.gt(upper_fence)

Root cause analysis:

In [None]:
# Number of duration outliers per ASA rating.
duration_outliers = info.loc[info['Surgery_Duration_Outlier']]
duration_outliers.groupby('ASA_RATING').size()

In [None]:
# Number of duration outliers per surgical category.
duration_outliers = info.loc[info['Surgery_Duration_Outlier']]
duration_outliers.groupby('SURG_CATE').size()

🤖 Clustering & Predictive Modeling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# 🔄 Clustering on four numeric features; rows with any missing value are left out.
cluster_inputs = ['Surgery_Duration', 'Anesthesia_Duration', 'Delay_From_Planned', 'AGE']
features = info[cluster_inputs].dropna()

# Standardise so no single feature dominates the Euclidean distance.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

# Write labels back only for the rows that were clustered; the rest stay NaN.
info.loc[features.index, 'cluster'] = clusters

# 📊 Visualize Clusters
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(data=info, x='Surgery_Duration', y='Delay_From_Planned', hue='cluster', palette='viridis', ax=ax)
ax.set_title("Clustering: Surgery Duration vs Delay from Planned")
ax.set_xlabel("Surgery Duration (min)")
ax.set_ylabel("Delay from Planned (min)")
plt.show()

In [None]:
# One-hot encode the categorical columns as 0/1 integer indicator columns.
categorical_columns = [
    "AGE_GROUP",
    'SEX',
    'ICU_ADMIN_FLAG',
    'PRIMARY_ANES_TYPE_NM',
    'ASA_RATING',
    'SURG_CATE',
]
info_dummy = pd.get_dummies(info, columns=categorical_columns, dtype='int')

In [None]:
# Label historical delays: a case is an outlier when its recovery duration
# lies outside the 1.5 x IQR fences computed over all cases.
target = "Recovery_Duration"
recovery_minutes = info_dummy[target]

Q1 = recovery_minutes.quantile(0.25)
Q3 = recovery_minutes.quantile(0.75)
IQR = Q3 - Q1
info_dummy['recovery_outlier'] = recovery_minutes.lt(Q1 - 1.5 * IQR) | recovery_minutes.gt(Q3 + 1.5 * IQR)

# Model inputs: four timing features followed by every column whose name
# starts with one of the prefixes below, in frame column order.
timing_features = ['Delay_From_Planned', 'Anesthesia_Induction', 'Anesthesia_Duration', 'Surgery_Duration']
encoded_prefixes = ("AGE_GROUP_", "SEX_", "ICU_", "PRIMARY_ANES_", "ASA_RATING", "SURG_CATE")
feature_cols = timing_features + [col for col in info_dummy.columns if col.startswith(encoded_prefixes)]

In [None]:
# Drop the raw ASA code: it is caught by the "ASA_RATING" prefix but duplicates
# the one-hot ASA_RATING_* indicators. It used to be removed with
# feature_cols.pop(4), which depended on column order and removed a different,
# genuine feature every time the cell was re-run. Removing by name is
# order-independent and safe to re-run.
if "ASA_RATING_C" in feature_cols:
    feature_cols.remove("ASA_RATING_C")

In [None]:
# Feature matrix and binary target for the recovery-delay classifier.
target_cls = "recovery_outlier"
X, y = info_dummy[feature_cols], info_dummy[target_cls]

In [None]:
# Hold out 20% of the cases for evaluation; the seed makes the split repeatable.
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits so no test information leaks into training.
scaler = StandardScaler().fit(X_train_cls)
X_train = scaler.transform(X_train_cls)
X_test = scaler.transform(X_test_cls)

In [None]:
# Fit a random forest; class_weight='balanced' compensates for outliers being
# the minority class.
classifier = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train_cls)

# Predictions on the held-out split, used by the evaluation cells.
y_pred_cls = classifier.predict(X_test)

In [None]:
# Confusion matrix is kept for the heatmap cell; accuracy is reported as a percentage.
conf_matrix = confusion_matrix(y_test_cls, y_pred_cls)

accuracy = accuracy_score(y_test_cls, y_pred_cls)
print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
# Share of each class in the full label vector, to judge class imbalance.
class_shares = y.value_counts(normalize=True)
print(class_shares)

In [None]:
# Accuracy is misleading on imbalanced classes, so also report per-class
# precision, recall and F1.
from sklearn.metrics import classification_report

report_text = classification_report(y_test_cls, y_pred_cls)
print(report_text)

In [None]:
import numpy as np

def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    Plot an sklearn confusion matrix as an annotated Seaborn heatmap.

    A new matplotlib figure is created and drawn on; nothing is returned and
    ``plt.show()`` is not called.

    Arguments
    ---------
    cf:            confusion matrix to be passed in (a 2-D numpy array).

    group_names:   List of strings that represent the labels row by row to be shown in each square.
                   Ignored unless its length equals the number of cells in cf.

    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'

    count:         If True, show the raw number in the confusion matrix. Default is True.

    percent:       If True, show each cell as a share of all observations. Default is True.

    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.

    xyticks:       If True, show x and y ticks. Default is True.

    xyplotlabels:  If True, show 'True Label' and 'Predicted Label' on the figure. Default is True.

    sum_stats:     If True, display summary statistics below the figure. Default is True.
                   For a 2x2 matrix these are accuracy, precision, recall and F1; a statistic
                   whose denominator is zero is reported as nan.

    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.

    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html
                   
    title:         Title for the heatmap. Default is None.

    '''


    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = ['' for _ in range(cf.size)]

    if group_names and len(group_names)==cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(value) for value in cf.flatten()/np.sum(cf)]
    else:
        group_percentages = blanks

    # Each cell's annotation is "<name>\n<count>\n<percent>", minus any part switched off.
    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels,group_counts,group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0],cf.shape[1])


    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        #Accuracy is sum of diagonal divided by total observations
        accuracy  = np.trace(cf) / float(np.sum(cf))

        #if it is a binary confusion matrix, show some more stats
        if len(cf)==2:
            #Metrics for Binary Confusion Matrices
            undefined = float('nan')
            predicted_positive = cf[:,1].sum()
            actual_positive = cf[1,:].sum()
            # Guard the divisions: with no predicted (or no actual) positives the
            # ratio is undefined. The unguarded form produced the same nan but
            # only after emitting a numpy RuntimeWarning.
            precision = cf[1,1] / predicted_positive if predicted_positive else undefined
            recall    = cf[1,1] / actual_positive if actual_positive else undefined
            # A nan sum is truthy, so nan inputs still propagate to a nan F1.
            f1_score  = 2*precision*recall / (precision + recall) if (precision + recall) else undefined
            stats_text = "\n\nAccuracy={:0.4f} Precision={:0.4f} Recall={:0.4f} F1 Score={:0.4f}".format(
                accuracy,precision,recall,f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.4f}".format(accuracy)
    else:
        stats_text = ""


    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        #Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        #Do not show categories if xyticks is False
        categories=False


    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf,annot=box_labels,fmt="",cmap=cmap,cbar=cbar,xticklabels=categories,yticklabels=categories)

    # The summary statistics ride along in the x-axis label so they render under the plot.
    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)
    
    if title:
        plt.title(title)

In [None]:
# Cell names in row-major order of the 2x2 matrix, and the axis class names.
labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
categories = ['No Delay', 'Delayed']
make_confusion_matrix(
    conf_matrix,
    group_names=labels,
    categories=categories,
    cmap='Blues',
    title='',
)

In [None]:
# Pair each model input with its random-forest importance, most important first.
importances = classifier.feature_importances_
features_df = (
    pd.DataFrame({'feature': feature_cols, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Horizontal bars for the 20 highest-ranked features (the frame is already sorted).
top_features = features_df.head(20)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=top_features.importance, y=top_features.feature, ax=ax)
ax.set_title("")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature")
plt.show()

In [None]:
# Prepare data for ML models
# NOTE: info_kmeans is another name for info_dummy, not a copy, so the
# 'cluster' column written below is visible through both names.
info_kmeans = info_dummy
k_feature_cols = ['Delay_From_Planned', 'Anesthesia_Induction', 'Anesthesia_Duration', 'Surgery_Duration', 'Recovery_Duration', 'AGE', 'ASA_RATING_C'] 
k_feature_cols += [col for col in info_kmeans.columns if col.startswith(("SEX_", "SURG_CATE"))]
k_features = info_kmeans[k_feature_cols].dropna()

# 🔄 Clustering
scaler = StandardScaler()
scaled_features = scaler.fit_transform(k_features)

kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

# Clear any labels left over from the earlier four-feature clustering first.
# This run drops more rows (it requires more non-null columns), and without the
# reset those rows kept their old cluster ids, mixing two unrelated clusterings
# in one column and in the scatter plot.
info_kmeans['cluster'] = float('nan')
info_kmeans.loc[k_features.index, 'cluster'] = clusters

In [None]:
# 📊 Visualize Clusters: pick any two columns to project the clusters onto.
x_axis = "AGE"
y_axis = "Surgery_Duration"

fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(data=info_kmeans, x=x_axis, y=y_axis, hue='cluster', palette='viridis', ax=ax)
ax.set_title("Clustering")
ax.set_xlabel(x_axis)
ax.set_ylabel(y_axis)
plt.show()

In [None]:
# Distribution of time spent in the operating room (Surgery_Duration, minutes).
plt.figure(figsize=(8, 5))
sns.histplot(info['Surgery_Duration'], bins=25, kde=True)
# The title was missing its closing parenthesis.
plt.title("Surgery Duration Histogram (bins=25)")
plt.xlabel("Duration (Minutes)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Step 1: Outlier detection by SURG_CATE using IQR on 'Recovery_Duration'
def detect_outliers_by_group(df, group_col, target_col):
    """Flag rows whose target value is an IQR outlier within their own group.

    For every distinct value of ``group_col`` the quartiles of ``target_col``
    are computed over that group's rows only, and a row is flagged when its
    value lies more than 1.5 x IQR below the first or above the third quartile.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    group_col : str
        Column that defines the groups.
    target_col : str
        Numeric column that is tested for outliers.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a boolean ``'recovery_outlier'`` column. Rows
        with a missing group or a missing target value are flagged ``False``.
    """
    flagged = df.copy()

    # Per-group quartiles, broadcast back onto the rows through the group key.
    per_group = flagged.groupby(group_col)[target_col]
    first_quartile = flagged[group_col].map(per_group.quantile(0.25))
    third_quartile = flagged[group_col].map(per_group.quantile(0.75))
    spread = third_quartile - first_quartile

    values = flagged[target_col]
    below = values < (first_quartile - 1.5 * spread)
    above = values > (third_quartile + 1.5 * spread)
    # Comparisons against NaN fences are False, so unmatched rows stay unflagged.
    flagged['recovery_outlier'] = below | above

    return flagged

# Apply outlier detection: recovery outliers are judged within each surgical
# category rather than against the whole population.
info_with_outliers = detect_outliers_by_group(info, group_col='SURG_CATE', target_col='Recovery_Duration')

# Step 2: Prepare features
feature_cols = ['Delay_From_Planned', 'Anesthesia_Induction', 'Anesthesia_Duration', 'Surgery_Duration']
# The "ASA_RATING" prefix also matches the raw ASA_RATING_C code. It is
# excluded explicitly so this model uses the same inputs as the first
# classifier, which drops that column from its feature list.
feature_cols += [
    col for col in info_dummy.columns
    if col.startswith(("AGE_GROUP_", "SEX_", "ICU_", "PRIMARY_ANES_", "ASA_RATING", "SURG_CATE_"))
    and col != "ASA_RATING_C"
]

# Merge outlier info with encoded features (overwrites the global-IQR label).
info_dummy['recovery_outlier'] = info_with_outliers['recovery_outlier']

# Define features and labels
X = info_dummy[feature_cols]
y = info_dummy['recovery_outlier']

# Train-test split
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X, y, test_size=0.2, random_state=42)

# Cast boolean indicator columns (numpy bool or pandas nullable boolean) to
# integers in BOTH splits. Previously only the training split was converted,
# leaving train and test with potentially different dtypes.
X_train_cls = X_train_cls.apply(lambda col: col.astype(int) if pd.api.types.is_bool_dtype(col) else col)
X_test_cls = X_test_cls.apply(lambda col: col.astype(int) if pd.api.types.is_bool_dtype(col) else col)

# Feature scaling: statistics are learned from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_cls)
X_test = scaler.transform(X_test_cls)

# Train Random Forest classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train_cls)

# Predict and evaluate
y_pred_cls = classifier.predict(X_test)
accuracy = accuracy_score(y_test_cls, y_pred_cls)
report = classification_report(y_test_cls, y_pred_cls)
conf_matrix = confusion_matrix(y_test_cls, y_pred_cls)

# Check class distribution:
print(y.value_counts(normalize=True))
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)


In [None]:
# Cell names in row-major order of the 2x2 matrix, and the axis class names.
labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
categories = ['No Delay', 'Delayed']
make_confusion_matrix(
    conf_matrix,
    group_names=labels,
    categories=categories,
    cmap='Blues',
    title='',
)

In [None]:
# Rank the model inputs by random-forest importance, most important first.
importances = classifier.feature_importances_
features_df = pd.DataFrame({'feature': feature_cols, 'importance': importances})
features_df = features_df.sort_values(by='importance', ascending=False)

# Plot the 20 highest-ranked features. The title previously said "Top 10"
# while 20 bars were drawn; it is now built from the same count.
top_n = 20
top_features = features_df.head(top_n)

plt.figure(figsize=(8, 5))
sns.barplot(x=top_features.importance, y=top_features.feature)
plt.title(f"Top {top_n} Feature Importances")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.show()