<a href="https://colab.research.google.com/github/victormurcia/VCHAMPS/blob/main/Generate_Encounter_IDs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#General utilities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm  # Import tqdm for the progress bar
import glob,shutil,os,warnings,math,time,sys,re
from typing import List
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

#For converting states to their abbreviations
#!pip install us
#import us

#For performing UTC normalization on datetime columns based on the STATE column
import pytz

#For Slider viz
import ipywidgets as widgets
from IPython.display import display, clear_output,HTML

#For EDA
#!pip install dataprep
#from dataprep.eda import create_report

#Enable data to be extracted and downloaded from my Google Drive
# (google.colab is only importable inside a Colab runtime; the mount point below is the
#  prefix of the absolute '/content/drive/...' paths used in the following cells)
from google.colab import drive, files
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Specify the path to the desired directory
# (absolute path under the Google Drive mount point, so Drive must already be mounted)
directory_path = r'/content/drive/MyDrive/VCHAMPS - Train Cleaned'

# Change the current working directory to the desired directory
# (os.chdir raises if the folder does not exist)
os.chdir(directory_path)

# Verify the current working directory
# `cwd` is reused further down as the folder handed to load_parquet_files
cwd = os.getcwd()

print(f"Current working directory: {cwd}")

Current working directory: /content/drive/MyDrive/VCHAMPS - Train Cleaned


In [3]:
def load_parquet_files(folder_path):
    """Open every Parquet dataset located directly inside ``folder_path``.

    Two layouts are recognised, both identified by a path ending in
    ``.parquet``:

    * a regular file, which is passed to ``dd.read_parquet`` as is;
    * a sub-directory, which is read through a ``<dir>/*.parquet`` glob.

    Only the immediate children of ``folder_path`` are inspected.

    Parameters
    ----------
    folder_path
        Directory to scan.

    Returns
    -------
    list
        One Dask DataFrame per dataset: file datasets first, then directory
        datasets, each group in directory-scan order. Empty when nothing
        matches.
    """
    suffix = '.parquet'
    entries = list(os.scandir(folder_path))

    # Stand-alone Parquet files are read directly from their own path
    file_targets = [
        entry.path
        for entry in entries
        if entry.is_file() and entry.path.endswith(suffix)
    ]

    # Partitioned datasets are folders named *.parquet holding part files
    folder_targets = [
        entry.path + '/*.parquet'
        for entry in entries
        if entry.is_dir() and entry.path.endswith(suffix)
    ]

    return [
        dd.read_parquet(target, engine='pyarrow')
        for target in file_targets + folder_targets
    ]

# Build one Dask DataFrame per Parquet dataset found in the working directory.
# NOTE(review): `dataframes` is not referenced again in the visible cells - confirm it is still needed.
dataframes = load_parquet_files(cwd)

# Generate Encounter IDs
Do this based on the inpatient_admissions, ed_visits, and outpatient_visits files.

I'll either use a Hash or a UUID to define the Encounter ID

In [4]:
import uuid

# Define a dictionary to store generated UUIDs
# (maps the stringified key tuple -> UUID string, so rows repeating the same key reuse one value)
uuid_dict = {}

# Columns whose values identify one encounter, keyed by df_val:
# 1 = ED visits, 2 = inpatient admissions, 3 = outpatient visits
_UUID_KEY_COLUMNS = {
    1: ('Internalpatientid', 'Ed visit start date', 'Discharge date ed'),
    2: ('Internalpatientid', 'Admission date', 'Discharge date'),
    3: ('Internalpatientid', 'Visit start date'),
}


# Define a custom function to generate UUIDs and ensure uniqueness
def generate_uuid(row, df_val):
    """Return the Encounter ID (a UUID string) for one visit row.

    The ID is a name-based UUID (version 5, ``NAMESPACE_OID``) computed from
    the string form of ``(<key column values...>, df_val)``. It is therefore
    the same for every row that shares those values, and ``df_val`` keeps
    identical values coming from different source tables apart.

    Parameters
    ----------
    row
        Mapping-like row (e.g. a pandas Series) indexable by column name.
    df_val
        Source-table selector: 1 = ED visits, 2 = inpatient admissions,
        3 = outpatient visits.

    Returns
    -------
    str
        The UUID for the row's encounter key. Also stored in ``uuid_dict``.

    Raises
    ------
    ValueError
        If ``df_val`` is not 1, 2 or 3 (previously this surfaced as an
        ``UnboundLocalError`` on ``columns``).
    KeyError
        If ``row`` lacks one of the key columns.
    """
    try:
        columns = _UUID_KEY_COLUMNS[df_val]
    except (KeyError, TypeError):
        raise ValueError(
            "df_val must be 1 (ED visits), 2 (inpatient admissions) or "
            f"3 (outpatient visits); got {df_val!r}"
        ) from None

    # The key format must stay str(tuple): changing it would change every generated UUID
    unique_values = tuple(row[column] for column in columns) + (df_val,)
    key = str(unique_values)

    # Reuse the UUID if this key was already seen, otherwise create and remember it
    encounter_id = uuid_dict.get(key)
    if encounter_id is None:
        encounter_id = str(uuid.uuid5(uuid.NAMESPACE_OID, key))
        uuid_dict[key] = encounter_id
    return encounter_id

In [9]:
import hashlib

# Columns whose values identify one encounter, keyed by df_val:
# 1 = ED visits, 2 = inpatient admissions, 3 = outpatient visits
_HASH_KEY_COLUMNS = {
    1: ('Internalpatientid', 'Ed visit start date', 'Discharge date ed'),
    2: ('Internalpatientid', 'Admission date', 'Discharge date'),
    3: ('Internalpatientid', 'Visit start date'),
}


def generate_hash_id(row, df_val):
    """Return a SHA-256 hex digest identifying the encounter of one visit row.

    The digest is computed over the key column values converted with ``str``
    and concatenated without a separator, followed by ``str(df_val)``.

    Parameters
    ----------
    row
        Mapping-like row (e.g. a pandas Series) indexable by column name.
    df_val
        Source-table selector: 1 = ED visits, 2 = inpatient admissions,
        3 = outpatient visits.

    Returns
    -------
    str
        64-character hexadecimal SHA-256 digest.

    Raises
    ------
    ValueError
        If ``df_val`` is not 1, 2 or 3 (previously this surfaced as an
        ``UnboundLocalError`` on ``columns``).
    KeyError
        If ``row`` lacks one of the key columns.
    """
    try:
        columns = _HASH_KEY_COLUMNS[df_val]
    except (KeyError, TypeError):
        raise ValueError(
            "df_val must be 1 (ED visits), 2 (inpatient admissions) or "
            f"3 (outpatient visits); got {df_val!r}"
        ) from None

    # NOTE(review): the parts are joined with no delimiter, so distinct value
    # combinations could in principle produce the same string. Left unchanged
    # so previously generated digests stay valid.
    data = ''.join(str(row[column]) for column in columns) + str(df_val)

    return hashlib.sha256(data.encode()).hexdigest()

In [5]:
#Load the Dataframes
ed_visits_df            = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned/ed_visits.parquet/*.parquet')
inpatient_admissions_df = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned/inpatient_admissions.parquet')
outpatient_visits_df    = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned/outpatient_visits.parquet/*.parquet')

I decided to use UUIDs instead of hashes because the UUIDs used far less memory.

In [6]:
# Assign UUIDs to each row
ed_visits_df['Encounter ID'] = ed_visits_df.apply(generate_uuid, args=(1,), axis=1, meta=(None, 'object'))
inpatient_admissions_df['Encounter ID'] = inpatient_admissions_df.apply(generate_uuid, args=(2,), axis=1, meta=(None, 'object'))
outpatient_visits_df['Encounter ID'] = outpatient_visits_df.apply(generate_uuid, args=(3,), axis=1, meta=(None, 'object'))

In [15]:
# Preview the first rows to check the new 'Encounter ID' column on the ED visits
ed_visits_df.head()

Unnamed: 0,Internalpatientid,Age at ed visit,Ed visit start date,Discharge date ed,Died during ed visit,CV diagnosis,code,cc Status,Encounter ID
0,101689,64,2021-08-26 00:53:30,2021-08-26 04:24:27,0,1,I502,NCC,2afc637f-1e93-5ce7-b817-0784cebc77a1
1,101689,64,2021-08-26 00:53:30,2021-08-26 04:24:27,0,1,N189,NCC,2afc637f-1e93-5ce7-b817-0784cebc77a1
2,107210,71,2022-05-18 11:21:28,2022-05-18 18:30:21,0,0,A419,MCC,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141
3,107210,71,2022-05-18 11:21:28,2022-05-18 18:30:21,0,0,I959,NCC,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141
4,107866,61,2017-06-26 15:41:11,2017-06-26 23:33:50,0,0,R918,NCC,efb622f7-8c68-5b26-ae9f-09147ff0dce7


In [18]:
# Preview the first rows to check the new 'Encounter ID' column on the inpatient admissions
inpatient_admissions_df.head()

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,Died during admission,Outpatientreferralflag,...,cc Status_CC,cc Status_MCC,cc Status_NCC,Discharge disposition_Death with autopsy,Discharge disposition_Death without autopsy,Discharge disposition_Irregular,Discharge disposition_NBC or while ASIH,Discharge disposition_Regular,Discharge disposition_Transfer,Encounter ID
0,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,0,1,0,0,0,0,0,1,0,974dbd04-1e56-59c6-a720-b8784dc604b2
1,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,0,0,1,0,0,0,0,1,0,974dbd04-1e56-59c6-a720-b8784dc604b2
2,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,1,0,c86658fc-cb34-536f-bf7c-a6a05ad131cc
3,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,0,0,1,0,0,0,0,1,0,c86658fc-cb34-536f-bf7c-a6a05ad131cc
4,100001,85,2010-11-10 04:32:39,2010-11-19 08:49:45,SURGERY,SURGERY,SUBSTANCE ABUSE RES TRMT PROG,ORTHOPEDIC,0,0,...,0,0,1,0,0,0,0,1,0,9b0880c7-906a-5869-9bbb-698c6183bdec


In [19]:
# Preview the first rows to check the new 'Encounter ID' column on the outpatient visits
outpatient_visits_df.head()

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,Visit End Date,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,diagnosis,code,cc Status,Encounter ID
0,10037,57,2014-06-17 11:56:20,2014-06-17 13:56:20,EKG,,,,,,other hypertrophic cardiomyopathy,I422,CC,7ca25b58-b33a-57a1-83c1-7fab40faaaff
0,65015,67,2018-06-21 01:25:05,2018-06-21 03:25:05,OPTOMETRY,,,,No,,type 2 diabetes mellitus without complications,E119,NCC,2631ee2b-20a4-5b72-bc1f-ff7c8e4589dd
0,6139,74,2022-07-14 21:44:44,2022-07-14 23:44:44,HT NON-VIDEO MONITORING,,,,Yes,,type 2 diabetes mellitus with unspecified comp...,E118,NCC,2c2ca169-48b0-5fce-bfe5-d0786a534457
0,18667,89,2021-03-22 11:41:03,2021-03-22 13:41:03,HOSPITAL IN HOME,,,,,,chronic atrial fibrillation,I482,NCC,2fc52270-9dde-554c-a79d-81022ad98b67
0,77347,64,2019-07-03 13:45:28,2019-07-03 15:45:28,PRIMARY CARE/MEDICINE,,,,Yes,,kidney transplant status,Z940,CC,0afb8078-9f9b-50cf-b0d8-397ad8b01805


Now that each visit has a UUID associated with it, I can start mapping the other dataframes to them.

In [7]:
# Load the death records as a Dask DataFrame (materialised with .compute() in a later cell)
death_df = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned/death.parquet')

In [9]:
# Materialise the Dask frame (this is where the Encounter ID apply actually runs) and rebind
# the same name to the result.
# NOTE(review): not idempotent - presumably a second run fails because the computed result
# has no .compute(); confirm, or bind the result to a new name.
ed_visits_df = ed_visits_df.compute()

In [30]:
# Materialise the inpatient admissions (runs the pending Encounter ID apply) and rebind the name.
# NOTE(review): not idempotent - presumably a second run fails because the computed result
# has no .compute(); confirm, or bind the result to a new name.
inpatient_admissions_df = inpatient_admissions_df.compute()

In [37]:
# Materialise the outpatient visits (runs the pending Encounter ID apply) and rebind the name.
# NOTE(review): not idempotent - presumably a second run fails because the computed result
# has no .compute(); confirm, or bind the result to a new name.
outpatient_visits_df = outpatient_visits_df.compute()

In [49]:
# Display the materialised ED visits frame (the notebook repr is truncated for large frames)
ed_visits_df

Unnamed: 0,Internalpatientid,Age at ed visit,Ed visit start date,Discharge date ed,Died during ed visit,CV diagnosis,code,cc Status,Encounter ID
0,101689,64,2021-08-26 00:53:30,2021-08-26 04:24:27,0,1,I502,NCC,2afc637f-1e93-5ce7-b817-0784cebc77a1
1,101689,64,2021-08-26 00:53:30,2021-08-26 04:24:27,0,1,N189,NCC,2afc637f-1e93-5ce7-b817-0784cebc77a1
2,107210,71,2022-05-18 11:21:28,2022-05-18 18:30:21,0,0,A419,MCC,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141
3,107210,71,2022-05-18 11:21:28,2022-05-18 18:30:21,0,0,I959,NCC,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141
4,107866,61,2017-06-26 15:41:11,2017-06-26 23:33:50,0,0,R918,NCC,efb622f7-8c68-5b26-ae9f-09147ff0dce7
...,...,...,...,...,...,...,...,...,...
1099360,9883,90,2020-02-20 21:23:37,2020-02-20 22:56:30,0,0,W108,NCC,da0bbfb2-28fc-5b3a-afcb-2642eee54c6f
1099422,99235,87,2016-02-25 01:22:23,2016-02-25 06:33:45,0,0,R197,NCC,7c5682aa-be6e-5fca-8c73-ec208a0ac60f
1099538,99902,66,2018-09-28 23:21:41,2018-09-28 23:56:55,0,0,M796,NCC,15df3aab-aa5b-51fa-b993-2d1cad1fa444
1099548,99934,84,2022-07-22 09:19:20,2022-07-22 09:42:57,0,0,M796,NCC,b3f5f12b-57fd-5978-842e-8f8f8c293522


In [31]:
# Display the materialised inpatient admissions frame (the notebook repr is truncated for large frames)
inpatient_admissions_df

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,Died during admission,Outpatientreferralflag,...,cc Status_CC,cc Status_MCC,cc Status_NCC,Discharge disposition_Death with autopsy,Discharge disposition_Death without autopsy,Discharge disposition_Irregular,Discharge disposition_NBC or while ASIH,Discharge disposition_Regular,Discharge disposition_Transfer,Encounter ID
0,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,0,1,0,0,0,0,0,1,0,974dbd04-1e56-59c6-a720-b8784dc604b2
1,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,0,0,1,0,0,0,0,1,0,974dbd04-1e56-59c6-a720-b8784dc604b2
2,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,1,0,c86658fc-cb34-536f-bf7c-a6a05ad131cc
3,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,0,0,1,0,0,0,0,1,0,c86658fc-cb34-536f-bf7c-a6a05ad131cc
4,100001,85,2010-11-10 04:32:39,2010-11-19 08:49:45,SURGERY,SURGERY,SUBSTANCE ABUSE RES TRMT PROG,ORTHOPEDIC,0,0,...,0,0,1,0,0,0,0,1,0,9b0880c7-906a-5869-9bbb-698c6183bdec
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085813,99975,89,2018-01-05 00:59:05,2018-01-06 23:18:39,NON-COUNT,NON-COUNT,PODIATRY,MEDICAL OBSERVATION,0,0,...,0,0,1,0,0,0,0,1,0,95223157-498c-5de5-a061-13d1e9ae985a
1085814,99986,73,2019-06-26 19:47:10,2019-06-29 21:18:00,MEDICINE,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,0,0,1,0,0,0,0,1,0,bd343752-5059-5181-81ad-dd066c9179c3
1085815,99986,73,2019-06-26 19:47:10,2019-06-29 21:18:00,MEDICINE,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,0,0,1,0,0,0,0,1,0,bd343752-5059-5181-81ad-dd066c9179c3
1085816,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,1,0,0,0,0,0,0,1,0,a0b6452b-5012-5b06-81f3-150b3a92b656


In [40]:
# Display the materialised outpatient visits frame (the notebook repr is truncated for large frames)
outpatient_visits_df

Unnamed: 0,Internalpatientid,Age at visit,Visit start date,Visit End Date,Stop code,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,diagnosis,code,cc Status,Encounter ID
0,10037,57,2014-06-17 11:56:20,2014-06-17 13:56:20,EKG,,,,,,other hypertrophic cardiomyopathy,I422,CC,7ca25b58-b33a-57a1-83c1-7fab40faaaff
0,65015,67,2018-06-21 01:25:05,2018-06-21 03:25:05,OPTOMETRY,,,,No,,type 2 diabetes mellitus without complications,E119,NCC,2631ee2b-20a4-5b72-bc1f-ff7c8e4589dd
0,6139,74,2022-07-14 21:44:44,2022-07-14 23:44:44,HT NON-VIDEO MONITORING,,,,Yes,,type 2 diabetes mellitus with unspecified comp...,E118,NCC,2c2ca169-48b0-5fce-bfe5-d0786a534457
0,18667,89,2021-03-22 11:41:03,2021-03-22 13:41:03,HOSPITAL IN HOME,,,,,,chronic atrial fibrillation,I482,NCC,2fc52270-9dde-554c-a79d-81022ad98b67
0,77347,64,2019-07-03 13:45:28,2019-07-03 15:45:28,PRIMARY CARE/MEDICINE,,,,Yes,,kidney transplant status,Z940,CC,0afb8078-9f9b-50cf-b0d8-397ad8b01805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1054843,10037,57,2014-05-07 15:00:57,2014-05-07 17:00:57,TELEPHONE/MEDICINE,,,,,,type 2 diabetes mellitus with ophthalmic compl...,E113,NCC,0f2852c3-7975-5250-8a19-cc0fc47eba3a
1054845,10037,57,2014-05-16 06:07:53,2014-05-16 08:07:53,PRIMARY CARE/MEDICINE,,,,,,dizziness and giddiness,R42,NCC,5c42cde4-5596-5ec4-944e-0608829d5835
1054846,10037,57,2014-05-16 06:07:53,2014-05-16 08:07:53,PRIMARY CARE/MEDICINE,,,,,,type 2 diabetes mellitus with ophthalmic compl...,E113,NCC,5c42cde4-5596-5ec4-944e-0608829d5835
1054847,10037,57,2014-06-17 11:56:20,2014-06-17 13:56:20,PRIMARY CARE/MEDICINE,,,,,,fracture of other and unspecified finger(s),S626,NCC,7ca25b58-b33a-57a1-83c1-7fab40faaaff


In [10]:
# Materialise the death records and rebind the name to the computed result.
# NOTE(review): not idempotent - presumably a second run fails because the computed result
# has no .compute(); confirm, or bind the result to a new name.
death_df = death_df.compute()

In [56]:
#Make a list of all used encounter IDs. Will be used to ensure uniqueness later
# One entry is added per row, so an ID shared by several rows of the same visit appears
# several times in this list.
encounter_ids = []
encounter_ids.extend(ed_visits_df['Encounter ID'].tolist())
encounter_ids.extend(inpatient_admissions_df['Encounter ID'].tolist())
encounter_ids.extend(outpatient_visits_df['Encounter ID'].tolist())

In [57]:
# Number of collected IDs: one per row across the three visit frames, not the number of distinct IDs
len(encounter_ids)

17736562

In [61]:
def map_encounter_id(row, age_col, date_col, sources=None, verbose=True):
    """Find the Encounter ID of the visit that contains an event, or mint a new one.

    The visit tables are searched in order and the first table with a match
    wins. A visit matches when it belongs to the same patient, its recorded
    age is not greater than the event's age, and the event date lies within
    the visit's start and end dates (both inclusive). When several rows of a
    table match, the first one in that table's row order is used.

    Parameters
    ----------
    row
        Mapping-like event row (e.g. a pandas Series). Must provide
        'Internalpatientid', ``age_col`` and ``date_col``.
    age_col
        Name of the column in ``row`` holding the patient's age at the event.
    date_col
        Name of the column in ``row`` holding the event date.
    sources
        Optional iterable of ``(label, visits_df, age_column, start_column,
        end_column)`` tuples, searched in order. Each ``visits_df`` must hold
        'Internalpatientid' and 'Encounter ID' columns. Defaults to the
        module-level ``ed_visits_df``, ``inpatient_admissions_df`` and
        ``outpatient_visits_df`` (in that order).
    verbose
        When True (default) print which table matched, or that none did.
        Pass False to keep large ``apply`` runs from flooding the output.

    Returns
    -------
    str
        The matching visit's Encounter ID, or a fresh random UUID (version 4)
        when no table contains a matching visit. The fallback differs on
        every run.
    """
    patient_id = row['Internalpatientid']
    patient_age = row[age_col]
    date_to_match = row[date_col]

    if sources is None:
        # Search order: ED visits, then inpatient admissions, then outpatient visits
        sources = (
            ('ed_visits', ed_visits_df,
             'Age at ed visit', 'Ed visit start date', 'Discharge date ed'),
            ('inpatient_visits', inpatient_admissions_df,
             'Age at admission', 'Admission date', 'Discharge date'),
            ('outpatient_visits', outpatient_visits_df,
             'Age at visit', 'Visit start date', 'Visit End Date'),
        )

    for label, visits, visit_age_col, start_col, end_col in sources:
        # Narrow to this patient's visits first, so the date comparison
        # only runs on the few remaining rows
        candidates = visits[
            (visits['Internalpatientid'] == patient_id)
            & (visits[visit_age_col] <= patient_age)
        ]
        # Keep the visits whose [start, end] window contains the event date
        containing = candidates[
            (candidates[start_col] <= date_to_match)
            & (candidates[end_col] >= date_to_match)
        ]
        if len(containing) > 0:
            if verbose:
                print(f'Found match in {label} for ', patient_id)
            return containing['Encounter ID'].iloc[0]

    if verbose:
        print('No match found. Producing unique Encounter ID')
    return str(uuid.uuid4())

In [62]:
# Work on the first 100 death records while prototyping the mapping.
# .copy() makes this an independent frame, so adding a column to it afterwards
# neither triggers pandas' SettingWithCopyWarning nor depends on view-vs-copy behaviour.
small_death_df = death_df[:100].copy()
small_death_df

Unnamed: 0,Internalpatientid,Age at death,Death date
0,100036,72,2015-01-18 20:49:10
1,100037,75,2008-10-16 15:14:26
2,100045,85,2019-01-26 11:33:52
3,100095,84,2004-08-10 21:49:35
4,10013,68,2007-06-11 00:17:06
...,...,...,...
95,102820,82,2001-03-05 01:52:28
96,102875,91,2018-10-19 23:59:54
97,102878,68,2011-03-30 06:54:04
98,102924,100,2015-01-22 02:06:58


In [63]:
# Resolve an Encounter ID for every death record: map_encounter_id is called once per row
# (axis=1) with the age and date columns of the death table, and prints one line per row.
small_death_df['Encounter ID'] = small_death_df.apply(lambda row: map_encounter_id(row, 'Age at death', 'Death date'), axis=1)

No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique Encounter ID
No match found. Producing unique E

In [64]:
# Inspect the first 50 death records together with their assigned Encounter IDs
small_death_df[:50]

Unnamed: 0,Internalpatientid,Age at death,Death date,Encounter ID
0,100036,72,2015-01-18 20:49:10,90a55891-2482-4daf-b922-0aacffb7206f
1,100037,75,2008-10-16 15:14:26,1489d5d8-0602-4a85-a6a7-181a8dc409fb
2,100045,85,2019-01-26 11:33:52,9fec9d60-0482-40e1-b59e-ae09b937f7fd
3,100095,84,2004-08-10 21:49:35,3bc1d5ac-6de0-4385-8dcb-e498573b4605
4,10013,68,2007-06-11 00:17:06,07e1d659-c12a-4296-8c87-ec0eb8e15db8
5,10018,91,2011-06-24 04:28:36,3798d0e9-4db6-4789-9c09-8af8e42cebca
6,100217,97,2022-10-01 01:49:06,56447770-17ba-4c45-a7a8-8da7047fee0f
7,100247,78,2021-07-20 10:56:42,75dbce07-745c-4bcb-8f17-49492c1213fa
8,100373,95,2018-04-12 19:08:00,10a6801d-e261-46dd-af1b-31526768a18e
9,100401,81,2012-04-23 02:46:26,fc06ab7c-8975-4b18-a446-e4f5d09b3c2b
