# =========================================
# Prototype B – With Generative AI
# =========================================

# -----------------------------
### 1. Import libraries
# -----------------------------

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
import pickle, os


# -----------------------------
### 2. Load data
# -----------------------------

In [21]:

df = pd.read_csv("../data/2022_Q1_OR_Utilization.csv")

# Converting the date string to date time object
df['Date'] = pd.to_datetime(df['Date'], format="%m/%d/%y")
for col in ['Wheels In','Start Time','End Time','Wheels Out']:
    df[col] = pd.to_datetime(df[col], errors='coerce')

print(df.columns)

# Creating the columns procedure_minutes, setup_minutes, exit_minutes
df['procedure_minutes'] = (df['End Time'] - df['Start Time']).dt.total_seconds()/60
df['setup_minutes'] = (df['Start Time'] - df['Wheels In']).dt.total_seconds()/60
df['exit_minutes']  = (df['Wheels Out'] - df['End Time']).dt.total_seconds()/60

# Fiiltering the dataset based on  hourly basis
df = df[(df['procedure_minutes'] > 0) & (df['procedure_minutes'] < 360)]

# Find the shape of the dataset
print(df.shape)

print("Adding the extra columns")

# explore the columns
print(df.columns)


Index(['index', 'Encounter ID', 'Date', 'OR Suite', 'Service', 'CPT Code',
       'CPT Description', 'Booked Time (min)', 'OR Schedule', 'Wheels In',
       'Start Time', 'End Time', 'Wheels Out'],
      dtype='object')
(2172, 16)
Adding the extra columns
Index(['index', 'Encounter ID', 'Date', 'OR Suite', 'Service', 'CPT Code',
       'CPT Description', 'Booked Time (min)', 'OR Schedule', 'Wheels In',
       'Start Time', 'End Time', 'Wheels Out', 'procedure_minutes',
       'setup_minutes', 'exit_minutes'],
      dtype='object')


  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')


# -----------------------------
### 3. Generative AI-Assisted Feature Engineering
# -----------------------------

In [22]:
# Here the extra columns such as weekday , month and is_morning is added extra using GPT
df['weekday'] = df['Date'].dt.weekday
df['month'] = df['Date'].dt.month
df['is_morning'] = (df['Start Time'].dt.hour < 12).astype(int)

# -----------------------------
### 4. Generative AI-Assisted Data Augmentation
# -----------------------------
# Simulate "AI-generated" synthetic cases for rare long surgeries

### Inserting the systhetically generated data to the dataset

In [23]:
long_cases = df[df['procedure_minutes'] > df['procedure_minutes'].quantile(0.85)]
rng = np.random.default_rng(42)
synthetic = []

for _ in range(min(300, len(long_cases)*2)):
    r = long_cases.sample(1, replace=True).iloc[0].copy()
    r['Booked Time (min)'] += rng.integers(-10, 11)
    r['procedure_minutes'] += rng.normal(0, 5)
    r['setup_minutes'] += rng.normal(0, 2)
    r['exit_minutes'] += rng.normal(0, 2)
    synthetic.append(r)

df = pd.concat([df, pd.DataFrame(synthetic)], ignore_index=True)

print("Dataset after GenAI augmentation:", df.shape)


Dataset after GenAI augmentation: (2472, 19)


# -----------------------------
### 5. Define features & target
# -----------------------------

In [24]:

features = [
    'Booked Time (min)', 'Service', 'OR Suite',
    'weekday', 'month', 'is_morning',
    'setup_minutes', 'exit_minutes'
]
X = df[features]
y = df['procedure_minutes']

# -----------------------------
### 6. Train-test split
# -----------------------------

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------------
### 7. Preprocess & Train Model
# -----------------------------

In [26]:
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['Service']),
    ('num', 'passthrough', ['Booked Time (min)', 'OR Suite', 'weekday', 'month', 'is_morning', 'setup_minutes', 'exit_minutes'])
])

model = Pipeline([
    ('prep', preprocessor),
    ('rf', RandomForestRegressor(n_estimators=400, random_state=42))
])

model.fit(X_train, y_train)

0,1,2
,steps,"[('prep', ...), ('rf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,400
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [27]:

# -----------------------------
# 8. Evaluate
# -----------------------------
pred = model.predict(X_test)
mae = mean_absolute_error(y_test, pred)
mape = mean_absolute_percentage_error(y_test, pred)
r2 = r2_score(y_test, pred)

print("\n=== Prototype B Results (With GenAI) ===")
print(f"Mean Absolute Error (MAE): {mae:.2f} minutes")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.4f}")
print(f"R² Score: {r2:.4f}")


=== Prototype B Results (With GenAI) ===
Mean Absolute Error (MAE): 0.71 minutes
Mean Absolute Percentage Error (MAPE): 0.0075
R² Score: 0.9911
