In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta

In [None]:
# Load the event log from the Excel workbook. Later cells read the columns
# "concept:name", "time:timestamp", "case:concept:name" and "log" from this frame.
transitions = pd.read_excel("data/transitions.xlsx")

In [None]:
# Activity labels that delimit the interval being measured
start_event = "Operating Room Enter"
finish_event = "Operating Room Exit"

# Split the log by activity and give each side's timestamp its own column name
is_start = transitions["concept:name"] == start_event
is_finish = transitions["concept:name"] == finish_event
start_events = transitions.loc[is_start].rename(columns={"time:timestamp": "start_time"})
finish_events = transitions.loc[is_finish].rename(columns={"time:timestamp": "finish_time"})

# Pair start and finish rows that share the same case and log; the inner join
# drops cases that lack either event.
# NOTE(review): a case with several start/finish rows yields every combination — confirm
# that each (case, log) has at most one of each.
duration_df = start_events.merge(finish_events, on=["case:concept:name", "log"], how="inner")

# Elapsed time between the two events, expressed in hours
elapsed = duration_df["finish_time"] - duration_df["start_time"]
duration_df["duration"] = elapsed.dt.total_seconds() / 3600

# Keep only the columns used by the rest of the analysis
result = duration_df[["case:concept:name", "log", "start_time", "finish_time", "duration"]]

In [None]:
# One boxplot summarising the start-to-finish duration over all matched cases
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(data=result, y='duration', ax=ax)

# Axis labels and title (set after plotting so they override seaborn's defaults)
ax.set_xlabel('All Data', fontsize=14)
ax.set_ylabel('Time Spent', fontsize=14)
ax.set_title(f'Duration Between {start_event} - {finish_event} in Hour', fontsize=16)

# Render
fig.tight_layout()
plt.show()

***Distribution of Events over the Hours***



In [None]:
# Function to calculate the hourly distribution for each patient
def calculate_hourly_distribution_multi_day(df):
    """Split every interval into per-day, per-clock-hour occupancy fractions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``start_time`` and ``finish_time`` columns whose values
        support ``.hour``, ``.date()``, ``.replace()`` and timedelta arithmetic.

    Returns
    -------
    list of dict
        One dict per row of ``df``, in row order. Each dict maps a calendar
        date to a length-24 NumPy array; entry ``h`` holds the fraction of the
        clock hour ``h`` on that date that lies inside the interval. A row whose
        start is not strictly before its finish produces an empty dict.
    """
    one_hour = timedelta(hours=1)
    per_row = []

    for begin, end in zip(df['start_time'], df['finish_time']):
        by_day = {}
        cursor = begin

        # Walk the interval one clock hour at a time
        while cursor < end:
            # Top of the next clock hour after the cursor
            boundary = (cursor + one_hour).replace(minute=0, second=0, microsecond=0)

            # Part of the current clock hour that falls inside the interval, in minutes
            segment_end = min(end, boundary)
            minutes_here = (segment_end - cursor).total_seconds() / 60.0

            # Lazily create the 24-slot array for a date the first time it is seen
            day = cursor.date()
            if day not in by_day:
                by_day[day] = np.zeros(24)

            # Accumulate as a fraction of an hour
            by_day[day][cursor.hour] += minutes_here / 60.0

            cursor = boundary

        per_row.append(by_day)

    return per_row

# Per-case occupancy, split by calendar date and clock hour
hourly_distributions = calculate_hourly_distribution_multi_day(result)

# Build a long-format table: one record per (case, date, hour) with non-zero occupancy
records = []
case_ids = result['case:concept:name']

for i, distribution in enumerate(hourly_distributions):
    # The case id only changes per case, so look it up once here instead of
    # rebuilding the whole row for every hour slot
    case_id = case_ids.iloc[i]
    for day, hours in distribution.items():
        for hour, time_spent in enumerate(hours):
            if time_spent > 0:
                records.append({
                    'case:concept:name': case_id,
                    'Date': day,
                    'Hour': hour,
                    'Time_Spent': time_spent
                })

# Explicit columns keep the schema intact even when no record was produced
op_df_long = pd.DataFrame(records, columns=['case:concept:name', 'Date', 'Hour', 'Time_Spent'])

plt.figure(figsize=(12, 6))
sns.boxplot(x='Hour', y='Time_Spent', hue='Hour', data=op_df_long, palette='Blues', dodge=False, legend=False)

# Label the plot
plt.xlabel('Hour of the Day', fontsize=14)
plt.ylabel('Time Spent (in hours)', fontsize=14)
plt.title('Distribution of Time Spent Between ' + start_event + ' and ' + finish_event + ' by Hour (Across Multiple Days)', fontsize=14)

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Function to create a new DataFrame with hourly distribution
def create_hourly_df(df, hourly_distributions):
    """Build a wide table with one row per (case, date) and 24 hourly columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; must provide ``case:concept:name``, ``start_time`` and
        ``finish_time``. Row ``i`` (by position) belongs to
        ``hourly_distributions[i]``.
    hourly_distributions : list of dict
        One dict per row of ``df`` mapping a date to a sequence of at least
        24 values (indexes 0-23 are read).

    Returns
    -------
    pandas.DataFrame
        Columns ``case:concept:name``, ``Date``, ``start_time``,
        ``finish_time`` and ``Hour_00`` ... ``Hour_23``. The columns are
        present even when there is nothing to report, so downstream reshaping
        does not fail on a column-less frame.
    """
    id_columns = ['case:concept:name', 'Date', 'start_time', 'finish_time']
    hour_columns = [f'Hour_{hour:02}' for hour in range(24)]

    records = []

    for i, distribution in enumerate(hourly_distributions):
        # Materialise the source row once per case rather than once per field and per day
        source_row = df.iloc[i]
        case_id = source_row['case:concept:name']
        in_time = source_row['start_time']
        out_time = source_row['finish_time']

        for day, hours in distribution.items():
            row = {
                'case:concept:name': case_id,
                'Date': day,
                'start_time': in_time,
                'finish_time': out_time
            }

            # Copy the 24 hourly values into Hour_00 ... Hour_23
            for hour, column in enumerate(hour_columns):
                row[column] = hours[hour]

            records.append(row)

    # Passing the column list fixes both the order and the schema of empty results
    hourly_df = pd.DataFrame(records, columns=id_columns + hour_columns)

    return hourly_df

# Build the wide table (one row per case and date, one column per clock hour)
# from the per-case hourly distributions computed earlier
hourly_df = create_hourly_df(result, hourly_distributions)

In [None]:
# Melt the wide table to long format so that each hour column becomes a row
hour_columns = [f'Hour_{h:02}' for h in range(24)]
hourly_df_long = pd.melt(hourly_df,
                         id_vars=['case:concept:name', 'Date', 'start_time', 'finish_time'],
                         value_vars=hour_columns,
                         var_name='Hour',
                         value_name='Time_Spent')

# Readable hour labels, derived from the hour range itself rather than from
# whatever `hours` happens to hold in the notebook namespace
labels = [f'{h:02}:00' for h in range(24)]

# Relabel 'Hour_00' ... 'Hour_23' as '00:00' ... '23:00'
hourly_df_long['Hour'] = hourly_df_long['Hour'].map(dict(zip(hour_columns, labels)))

# NOTE(review): rows with Time_Spent == 0 are kept, so hours in which a case was
# not present contribute zeros to every box. Filter on Time_Spent > 0 to look
# at occupied hours only.

# Plot the boxplot for each hour
plt.figure(figsize=(14, 7))
sns.boxplot(x='Hour', y='Time_Spent', hue='Hour', data=hourly_df_long, palette='Blues', dodge=False, legend=False)

# Label the plot
plt.xlabel('Hour of the Day', fontsize=14)
plt.ylabel('Time Spent (in hours)', fontsize=14)
plt.title('Distribution of Time Spent Between ' + start_event + ' and ' + finish_event + ' by Hour (Across Multiple Days)', fontsize=14)

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
import plotly.express as px

# Re-melt hourly_df inside this cell so that 'Hour' is numeric here,
# independent of any relabelling done elsewhere
hour_columns = [f'Hour_{h:02}' for h in range(24)]
hourly_df_long = hourly_df.melt(
    id_vars=['case:concept:name', 'Date', 'start_time', 'finish_time'],
    value_vars=hour_columns,
    var_name='Hour',
    value_name='Time_Spent',
)

# 'Hour_00' -> 0, ..., 'Hour_23' -> 23
hour_numbers = hourly_df_long['Hour'].str.replace('Hour_', '').astype(int)
hourly_df_long['Hour'] = hour_numbers

# Interactive boxplot, one box per hour of the day
plot_title = f'Distribution of Time Spent Between {start_event} and {finish_event} by Hour'
axis_labels = {'Hour': 'Hour of the Day', 'Time_Spent': 'Time Spent (in hours)'}
fig = px.box(
    hourly_df_long,
    x='Hour',
    y='Time_Spent',
    title=plot_title,
    labels=axis_labels,
    template='plotly_dark',
)

# Render the figure
fig.show()

In [None]:
# Mean occupancy per clock hour: column-wise average of the hourly fractions
# over every row of hourly_df
slot_columns = [f'Hour_{h:02}' for h in range(24)]
hourly_totals = hourly_df[slot_columns].mean()

# Colour every bar by its own value, scaled between the smallest and largest mean
cmap = plt.cm.viridis
norm = plt.Normalize(vmin=hourly_totals.min(), vmax=hourly_totals.max())
colors = cmap(norm(hourly_totals))

# X positions 0-23 and their HH:00 tick labels
hours = np.arange(24)
tick_labels = [f'{h:02}:00' for h in hours]

# Bar chart of the per-hour means
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(hours, hourly_totals, color=colors)

# Colorbar sharing the bars' colormap and normalisation
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array(hourly_totals)
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label('Density (Mean Time in Hours)', fontsize=12)

# Axis labels, title and ticks
ax.set_xlabel('Hour of the Day', fontsize=14)
ax.set_ylabel('Mean Time (in hours)', fontsize=14)
ax.set_title(f'Occupancy of Processes between {start_event} and {finish_event} by Hour of the Day (Density Based)', fontsize=14)
ax.set_xticks(hours)
ax.set_xticklabels(tick_labels, rotation=45)

# Render
fig.tight_layout()
plt.show()

In [None]:
# Per clock hour, count the rows of hourly_df with a non-zero value in that hour
slot_columns = [f'Hour_{h:02}' for h in range(24)]
hourly_totals = np.count_nonzero(hourly_df[slot_columns], axis=0)

# Colour every bar by its own count, scaled between the smallest and largest count
cmap = plt.cm.viridis
norm = plt.Normalize(vmin=hourly_totals.min(), vmax=hourly_totals.max())
colors = cmap(norm(hourly_totals))

# X positions 0-23 and their HH:00 tick labels
hours = np.arange(24)
tick_labels = [f'{h:02}:00' for h in hours]

# Bar chart of the per-hour counts
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(hours, hourly_totals, color=colors)

# Colorbar sharing the bars' colormap and normalisation
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array(hourly_totals)
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label('Density (Number of Operations)', fontsize=12)

# Axis labels, title and ticks
ax.set_xlabel('Hour of the Day', fontsize=14)
ax.set_ylabel('Number of Process', fontsize=14)
ax.set_title(f'Distribution of Processes in {start_event} and {finish_event} by Hour', fontsize=14)
ax.set_xticks(hours)
ax.set_xticklabels(tick_labels, rotation=45)

# Render
fig.tight_layout()
plt.show()