# =========================================
# Prototype A – Without Generative AI
# =========================================

# -----------------------------
### 1. Import libraries
# -----------------------------

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# -----------------------------
### 2. Load and explore dataset
# -----------------------------

In [4]:
# Read the Q1 2022 operating-room utilization export into a DataFrame
df = pd.read_csv("../data/2022_Q1_OR_Utilization.csv")

# Quick inspection: (rows, columns), then the first five rows followed by the column labels
print("Shape of dataset:", df.shape)
print(df.head(), df.columns, sep="\n")


Shape of dataset: (2172, 13)
   index  Encounter ID      Date  OR Suite      Service  CPT Code  \
0      0         10001  01/03/22         1     Podiatry     28110   
1      1         10002  01/03/22         1     Podiatry     28055   
2      2         10003  01/03/22         1     Podiatry     28297   
3      3         10004  01/03/22         1     Podiatry     28296   
4      4         10005  01/03/22         2  Orthopedics     27445   

                             CPT Description  Booked Time (min)  \
0   Partial ostectomy, fifth metatarsal head                 90   
1  Neurectomy, intrinsic musculature of foot                 60   
2                       Lapidus bunionectomy                150   
3         Bunionectomy with distal osteotomy                120   
4        Arthroplasty, knee, hinge prothesis                120   

         OR Schedule          Wheels In         Start Time           End Time  \
0  01/03/22 07:00 AM  01/03/22 07:05 AM  01/03/22 07:32 AM  01/03/22 09:

# -----------------------------
### 3. Convert time columns to datetime
# -----------------------------

In [None]:
# Convert the text date/time columns to datetime objects.
# Explicit formats are used everywhere so parsing does not depend on pandas'
# format inference (which varies between pandas versions) and cannot swap
# month and day.
DATE_FORMAT = "%m/%d/%y"
# The df.head() preview shows timestamps such as "01/03/22 07:00 AM".
# NOTE(review): 'Wheels Out' is not visible in the preview; it is assumed to
# use the same layout as the other timestamp columns - confirm.
TIMESTAMP_FORMAT = "%m/%d/%y %I:%M %p"
TIMESTAMP_COLS = ['Wheels In', 'Start Time', 'End Time', 'Wheels Out']

df['Date'] = pd.to_datetime(df['Date'], format=DATE_FORMAT)
for col in TIMESTAMP_COLS:
    parsed = pd.to_datetime(df[col], format=TIMESTAMP_FORMAT, errors='coerce')
    # errors='coerce' turns unparseable values into NaT; report them here so
    # they do not vanish unnoticed when durations are filtered later on.
    n_failed = int((parsed.isna() & df[col].notna()).sum())
    if n_failed:
        print(f"Warning: {n_failed} value(s) in '{col}' could not be parsed and were set to NaT")
    df[col] = parsed

# -----------------------------
### 4. Create duration features
# -----------------------------

In [10]:
# Derive three durations (in minutes) from the timestamp columns:
#   procedure_minutes = End Time   - Start Time
#   setup_minutes     = Start Time - Wheels In
#   exit_minutes      = Wheels Out - End Time
df['procedure_minutes'] = (df['End Time'] - df['Start Time']).dt.total_seconds() / 60
df['setup_minutes'] = (df['Start Time'] - df['Wheels In']).dt.total_seconds() / 60
df['exit_minutes'] = (df['Wheels Out'] - df['End Time']).dt.total_seconds() / 60

# Exclusive upper bound for a realistic procedure duration: 360 min = 6 hours
MAX_PROCEDURE_MINUTES = 360

# Keep only rows with a positive duration below the cap. Rows whose duration
# is NaN fail both comparisons and are therefore dropped as well.
# .copy() makes the result an independent frame, so later column assignments
# (including re-running this cell) do not raise SettingWithCopyWarning.
df = df[(df['procedure_minutes'] > 0) & (df['procedure_minutes'] < MAX_PROCEDURE_MINUTES)].copy()

# Print the shape of the cleaned dataset
print("Dataset after cleaning", df.shape)
# List the columns again before choosing the features and the target
print(df.columns)

Dataset after cleaning (2172, 16)
Index(['index', 'Encounter ID', 'Date', 'OR Suite', 'Service', 'CPT Code',
       'CPT Description', 'Booked Time (min)', 'OR Schedule', 'Wheels In',
       'Start Time', 'End Time', 'Wheels Out', 'procedure_minutes',
       'setup_minutes', 'exit_minutes'],
      dtype='object')


# -----------------------------
### 5. Define features and targets
# -----------------------------

In [12]:
# Model inputs: booked slot length, surgical service and OR suite.
# Model target: the procedure duration in minutes derived from the timestamps.
features = ['Booked Time (min)', 'Service', 'OR Suite']
X, y = df[features], df['procedure_minutes']

# -----------------------------
### 6. Split data
# -----------------------------

In [16]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# training/testing split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# -----------------------------
# 7. Preprocessing
# -----------------------------


In [None]:
# 'Service' holds specialty names (e.g. Orthopedics, Podiatry) that a model
# cannot consume as text, so it is one-hot encoded into 0/1 indicator columns.
# handle_unknown='ignore' encodes a service unseen during fitting as all zeros
# instead of raising an error.
# The remaining two columns are forwarded unchanged.
# NOTE(review): 'OR Suite' is passed through as a number although it looks
# like a room identifier - confirm that treating it as numeric is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Service']),
        ('num', 'passthrough', ['Booked Time (min)', 'OR Suite']),
    ]
)