In [1]:
import datetime as dt
import json
from pathlib import Path

import pandas as pd

In [2]:
# Load the patient/surgery table. dtype=str reads every column as text, so no value is
# type-inferred at load time; explicit conversions are applied in the cleaning cell below.
info = pd.read_csv("data/patient_information.csv", dtype=str)

In [3]:
# Drop exact duplicate source rows, then keep only the columns used downstream.
# .copy() gives an independent frame, so the column assignments below never write
# into a view of the raw table (avoids SettingWithCopyWarning).
info = info.drop_duplicates()
info = info[['MRN', 'LOG_ID', 'HOSP_ADMSN_TIME', 'HOSP_DISCH_TIME', 'SURGERY_DATE',
             'IN_OR_DTTM', 'OUT_OR_DTTM', 'AN_START_DATETIME', 'AN_STOP_DATETIME',
             'DISCH_DISP_C', 'DISCH_DISP', 'ICU_ADMIN_FLAG',
             'BIRTH_DATE', 'HEIGHT', 'WEIGHT', 'SEX',
             'PRIMARY_ANES_TYPE_NM', 'ASA_RATING_C', 'ASA_RATING',
             'PATIENT_CLASS_GROUP', 'PATIENT_CLASS_NM', 'PRIMARY_PROCEDURE_NM']].copy()

# Columns are replaced with `info[col] = ...` on purpose: since pandas 2.0,
# `info.loc[:, col] = values` first tries to write into the existing (object-dtype)
# column in place, which silently discards the converted dtype.
datetime_format = "%m/%d/%y %H:%M"
datetime_columns = ['HOSP_ADMSN_TIME', 'HOSP_DISCH_TIME', 'SURGERY_DATE',
                    'IN_OR_DTTM', 'OUT_OR_DTTM', 'AN_START_DATETIME', 'AN_STOP_DATETIME']
for column in datetime_columns:
    info[column] = pd.to_datetime(info[column], format=datetime_format)

string_columns = ['MRN', 'LOG_ID', 'DISCH_DISP_C', 'DISCH_DISP', 'ICU_ADMIN_FLAG',
                  'HEIGHT', 'SEX', 'PRIMARY_ANES_TYPE_NM', 'ASA_RATING_C', 'ASA_RATING',
                  'PATIENT_CLASS_GROUP', 'PATIENT_CLASS_NM', 'PRIMARY_PROCEDURE_NM']
for column in string_columns:
    info[column] = info[column].astype("string")

# Unparseable values become NaN (errors="coerce"); the result is downcast to the
# smallest integer dtype when no value is missing.
info['BIRTH_DATE'] = pd.to_numeric(info['BIRTH_DATE'], downcast="integer", errors="coerce")

# NOTE(review): WEIGHT is not converted and stays as the text read from the CSV,
# while HEIGHT is kept as a string on purpose above -- confirm this is intended.

# Restrict the study to laparoscopic cholecystectomies. .copy() so that columns can be
# added to the filtered frame later without chained-assignment warnings.
info = info.loc[info['PRIMARY_PROCEDURE_NM'].eq('CHOLECYSTECTOMY, LAPAROSCOPIC')].copy()

In [4]:
def surgery_times(row):
    """Classify one surgery row and derive its planning and cancellation times.

    Parameters
    ----------
    row : pandas.Series
        Must provide 'HOSP_ADMSN_TIME', 'SURGERY_DATE', 'IN_OR_DTTM', 'OUT_OR_DTTM',
        'AN_START_DATETIME' and 'AN_STOP_DATETIME'; 'HOSP_DISCH_TIME' is read only
        for cancelled surgeries.

    Returns
    -------
    tuple
        (surgery type, planning time, cancellation time) where the surgery type is
        'Surgery Performed', 'Surgery Date Passed' or 'Surgery Cancelled'. The
        cancellation time is pd.NaT for performed surgeries.
    """
    room_and_anesthesia = row[['IN_OR_DTTM', 'OUT_OR_DTTM', 'AN_START_DATETIME', 'AN_STOP_DATETIME']]
    admission = row['HOSP_ADMSN_TIME']
    surgery_date = row['SURGERY_DATE']
    one_minute = dt.timedelta(minutes=1)

    # Any operating-room or anesthesia timestamp means the surgery took place.
    if not pd.isnull(room_and_anesthesia).all():
        # A surgery dated on or before admission is planned one minute after admission.
        if surgery_date <= admission:
            return 'Surgery Performed', admission + one_minute, pd.NaT
        return 'Surgery Performed', surgery_date, pd.NaT

    # No surgery evidence and the surgery date lies more than one day before admission.
    if (admission - surgery_date) > dt.timedelta(days=1):
        return 'Surgery Date Passed', surgery_date, surgery_date

    # Otherwise the surgery was cancelled: planned one minute after whichever of the
    # admission and the surgery date is used as reference.
    reference = admission if surgery_date <= admission else surgery_date
    planned = reference + one_minute

    # Cancel at surgery date + 23:59, unless that reaches the discharge time; in that
    # case cancel one minute before discharge.
    end_of_surgery_day = surgery_date + dt.timedelta(hours=23, minutes=59)
    if end_of_surgery_day >= row['HOSP_DISCH_TIME']:
        cancelled = row['HOSP_DISCH_TIME'] - one_minute
    else:
        cancelled = end_of_surgery_day

    return 'Surgery Cancelled', planned, cancelled

# Run surgery_times on every row. result_type='expand' spreads each returned tuple over
# three columns; transposing yields one array per output field, in the order
# (type, planning time, cancellation time).
surgery_fields = info.apply(surgery_times, axis=1, result_type='expand').T.to_numpy()
info['SURGERY_TYPE'] = surgery_fields[0]
info['SRG_PLN_TIME'] = surgery_fields[1]
info['SRG_CNL_TIME'] = surgery_fields[2]

In [5]:
# Put the derived surgery columns next to SURGERY_DATE, then order the rows by patient,
# admission time, planned surgery time and surgery log id.
info = info[[
    'MRN', 'LOG_ID', 'HOSP_ADMSN_TIME', 'HOSP_DISCH_TIME',
    'SURGERY_DATE', 'SURGERY_TYPE', 'SRG_PLN_TIME', 'SRG_CNL_TIME',
    'IN_OR_DTTM', 'OUT_OR_DTTM', 'AN_START_DATETIME', 'AN_STOP_DATETIME',
    'DISCH_DISP_C', 'DISCH_DISP', 'ICU_ADMIN_FLAG',
    'BIRTH_DATE', 'HEIGHT', 'WEIGHT', 'SEX',
    'PRIMARY_ANES_TYPE_NM', 'ASA_RATING_C', 'ASA_RATING',
    'PATIENT_CLASS_GROUP', 'PATIENT_CLASS_NM', 'PRIMARY_PROCEDURE_NM',
]].sort_values(by=['MRN', 'HOSP_ADMSN_TIME', 'SRG_PLN_TIME', 'LOG_ID'])

In [6]:
# Load the procedure event log as text, keep the four columns needed here, and type
# them: identifiers and event names as strings, event times as datetimes.
surg = (
    pd.read_csv("data/patient_procedure events.csv", dtype=str)[
        ['MRN', 'LOG_ID', 'EVENT_DISPLAY_NAME', 'EVENT_TIME']
    ]
    .astype({'MRN': 'string', 'LOG_ID': 'string', 'EVENT_DISPLAY_NAME': 'string'})
    .assign(EVENT_TIME=lambda events: pd.to_datetime(events['EVENT_TIME'], format="%m/%d/%y %H:%M"))
)

# Keep only events whose MRN and whose LOG_ID both occur somewhere in `info`
# (the two memberships are tested independently, not as a pair).
surg = surg.loc[surg['MRN'].isin(info['MRN']) & surg['LOG_ID'].isin(info['LOG_ID'])]

# Report the shape of the duplicated rows before removing them.
print("number of duplicate rows: ", surg[surg.duplicated()].shape)

# Remove exact duplicates, then every row that has a missing value in any column.
surg = surg.drop_duplicates().dropna(how='any')

number of duplicate rows:  (491, 4)


In [7]:
# Continue under the name `df`. This binds a second name to the same DataFrame object
# as `info`; no copy is made.
df = info

In [8]:
def first_event_times(events, event_name, time_column):
    """Return the first logged time of `event_name` per (MRN, LOG_ID).

    Parameters
    ----------
    events : pandas.DataFrame
        Event log with columns 'MRN', 'LOG_ID', 'EVENT_DISPLAY_NAME' and 'EVENT_TIME'.
    event_name : str
        Value of 'EVENT_DISPLAY_NAME' to select.
    time_column : str
        Name given to the 'EVENT_TIME' column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'MRN', 'LOG_ID' and `time_column`, with at most one row per
        (MRN, LOG_ID): the first matching row in the order of `events`.
    """
    matching = events.loc[events['EVENT_DISPLAY_NAME'] == event_name,
                          ['MRN', 'LOG_ID', 'EVENT_TIME']]
    # One row per surgery keeps the left merge below from multiplying rows of `df`
    # when the same event was logged more than once for a surgery.
    return (
        matching
        .drop_duplicates(subset=['MRN', 'LOG_ID'], keep='first')
        .rename(columns={'EVENT_TIME': time_column})
    )


# Attach one timestamp column per event of interest; surgeries without the event get NaT.
for event_name, time_column in [
    ('Anesthesia Ready', 'AN_READY_DATETIME'),
    ('Transported to PACU/ICU with O2, vital signs stable', 'PACU_DATETIME'),
]:
    surg_filter = first_event_times(surg, event_name, time_column)
    df = pd.merge(df, surg_filter, how="left", on=["MRN", "LOG_ID"])

## Construct OCEL data file

In [9]:
# Empty OCEL log skeleton: global defaults, the declared attribute names and object
# types, and the (initially empty) event and object maps that later cells fill in.
ocel = {
    # Placeholder values declared for events and objects at the global level.
    "ocel:global-event": {"ocel:activity": "__INVALID__"},
    "ocel:global-object": {"ocel:type": "__INVALID__"},
    "ocel:global-log": {
        # Attribute names used in the object and event value maps of this log.
        "ocel:attribute-names": [
            "age", "gender", "height", "weight",
            "admitted ICU", "patient class", "discharge type",
        ],
        # Object types that events may reference.
        "ocel:object-types": ["patient", "surgery", "anesthesia", "rating"],
        "ocel:version": "1.0",
        "ocel:ordering": "timestamp",
    },
    "ocel:events": {},
    "ocel:objects": {},
}

In [10]:
# Per object type: the id prefix, the DataFrame column that supplies the id, and a map
# from OCEL attribute name to the DataFrame column holding its value.
# NOTE(review): the 'age' attribute is filled from the BIRTH_DATE column as-is --
# confirm that this column really holds an age.
object_configs = {
    object_type: {
        'prefix': prefix,
        'id_column': id_column,
        'type': object_type,
        'attributes': attributes,
    }
    for object_type, prefix, id_column, attributes in [
        ('patient', 'p_', 'MRN', {'gender': 'SEX'}),
        ('surgery', 's_', 'LOG_ID', {
            'age': 'BIRTH_DATE',
            'height': 'HEIGHT',
            'weight': 'WEIGHT',
            'admitted ICU': 'ICU_ADMIN_FLAG',
            'patient class': 'PATIENT_CLASS_NM',
        }),
        ('anesthesia', 'a_', 'PRIMARY_ANES_TYPE_NM', {}),
        ('rating', 'r_', 'ASA_RATING', {}),
    ]
}

In [11]:
# Make sure the object map exists before it is filled.
ocel.setdefault("ocel:objects", {})

# Register every object mentioned in the data; the first row that mentions an object
# defines its attributes, later rows with the same id are ignored.
for _, row in df.iterrows():
    for obj_name, config in object_configs.items():
        raw_id = row[config['id_column']]

        # Rows without a usable id (missing or empty string) contribute no object.
        if not (pd.notna(raw_id) and raw_id != ''):
            continue

        obj_id = f"{config['prefix']}{raw_id}"
        if obj_id in ocel["ocel:objects"]:
            continue

        # Copy over each configured attribute that is present and non-missing, as text.
        ovmap = {
            attr_key: str(row[attr_column])
            for attr_key, attr_column in config['attributes'].items()
            if attr_column in row and pd.notna(row[attr_column])
        }

        ocel["ocel:objects"][obj_id] = {
            "ocel:type": config['type'],
            "ocel:ovmap": ovmap,
        }

In [12]:
# Track created events to prevent duplicates
created_events = set()


def add_event_if_new(event_id, event_entry):
    """Store `event_entry` under `event_id` unless that id was already emitted."""
    if event_id not in created_events:
        ocel["ocel:events"][event_id] = event_entry
        created_events.add(event_id)


def _has_value(value):
    """Return True when `value` is neither missing (NaN/NaT/NA/None) nor an empty string."""
    return bool(pd.notna(value)) and value != ''


# (object id prefix, source column) pairs an event can reference. The prefixes are the
# same ones used when the objects were created.
_PATIENT = ('p_', 'MRN')
_SURGERY = ('s_', 'LOG_ID')
_RATING = ('r_', 'ASA_RATING')
_ANESTHESIA = ('a_', 'PRIMARY_ANES_TYPE_NM')

# One entry per activity:
# (event id prefix, columns forming the event id, activity name, timestamp column,
#  referenced objects in omap order, {event attribute name: source column}).
EVENT_SPECS = [
    ('e-ha', ('MRN', 'HOSP_ADMSN_TIME'), 'Hospital Admission', 'HOSP_ADMSN_TIME',
     (_PATIENT,), {}),
    ('e-bs', ('LOG_ID',), 'Book Surgery', 'SRG_PLN_TIME',
     (_PATIENT, _RATING, _SURGERY), {}),
    ('e-cs', ('LOG_ID',), 'Cancel Surgery', 'SRG_CNL_TIME',
     (_PATIENT, _SURGERY), {}),
    ('e-in', ('LOG_ID',), 'Room IN', 'IN_OR_DTTM',
     (_SURGERY, _PATIENT), {}),
    ('e-out', ('LOG_ID',), 'Room OUT', 'OUT_OR_DTTM',
     (_SURGERY, _PATIENT), {}),
    ('e-ab', ('LOG_ID',), 'Anesthesia Start', 'AN_START_DATETIME',
     (_SURGERY, _PATIENT, _ANESTHESIA), {}),
    ('e-ar', ('LOG_ID',), 'Anesthesia Ready', 'AN_READY_DATETIME',
     (_SURGERY, _PATIENT, _ANESTHESIA), {}),
    ('e-af', ('LOG_ID',), 'Anesthesia Stop', 'AN_STOP_DATETIME',
     (_SURGERY, _PATIENT, _ANESTHESIA), {}),
    ('e-pi', ('LOG_ID',), 'PACU_ICU', 'PACU_DATETIME',
     (_SURGERY, _PATIENT), {}),
    ('e-hd', ('MRN', 'HOSP_DISCH_TIME'), 'Discharge', 'HOSP_DISCH_TIME',
     (_PATIENT,), {'discharge type': 'DISCH_DISP'}),
]

# Single loop through DataFrame
for _, row in df.iterrows():
    try:
        for id_prefix, id_columns, activity, ts_column, object_refs, vmap_columns in EVENT_SPECS:
            # An event needs a timestamp and a complete id; skip it otherwise so the log
            # never contains "NaT" timestamps or ids built from missing values.
            if not all(_has_value(row[column]) for column in (*id_columns, ts_column)):
                continue

            event_id = f"{id_prefix}_" + "_".join(str(row[column]) for column in id_columns)
            event = {
                "ocel:activity": activity,
                # Kept as a datetime object here; it is turned into text when the log
                # is serialized.
                "ocel:timestamp": row[ts_column],
                # Only reference objects whose id value exists: no object is created
                # for a missing id, so such a reference would point at nothing.
                "ocel:omap": [
                    f"{prefix}{row[column]}"
                    for prefix, column in object_refs
                    if _has_value(row[column])
                ],
                # Missing attribute values are left out rather than written as "nan".
                "ocel:vmap": {
                    attribute: str(row[column])
                    for attribute, column in vmap_columns.items()
                    if _has_value(row[column])
                },
            }
            add_event_if_new(event_id, event)

    except KeyError as e:
        # A column used by an event is absent; the remaining events of this row are skipped.
        print(f"Warning: Missing column {e} in row {row.name}")
        continue
    except Exception as e:
        # Best effort: report the row and carry on with the next one.
        print(f"Error processing row {row.name}: {e}")
        continue

In [13]:
output_file = "results/mover_OCEL.jsonocel"

# open(..., 'w') does not create missing parent directories, so make sure the target
# folder exists (no-op when it is already there).
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Save OCEL to file. default=str serializes values json cannot encode natively
# (e.g. the datetime objects stored as event timestamps) via str().
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(ocel, f, indent=2, ensure_ascii=False, default=str)

print(f"Successfully converted to OCEL format. Saved as: {output_file}")
print(f"Processed {len(ocel['ocel:events'])} events and {len(ocel['ocel:objects'])} objects")

Successfully converted to OCEL format. Saved as: results/mover_OCEL.jsonocel
Processed 9350 events and 2310 objects


In [14]:
from pm4py.objects.ocel.validation import jsonocel
import importlib.util

# Validation is attempted only when the `jsonschema` package can be located;
# otherwise the cell just reports "NO SCHEMA".
if importlib.util.find_spec("jsonschema") is None:
    print("NO SCHEMA")
else:
    # Check the written JSON-OCEL file against the schema stored in templates/.
    validation_result = jsonocel.apply(output_file, "templates/schema.json")
    print("OCEL data validated" if validation_result else "OCEL data can not be validated")

NO SCHEMA
