In [1]:
import polars as pl
import os
import pandas as pd
import json

In [2]:
def encode_column_and_replace(df, col_name,encoding_dict_=None):

    if encoding_dict_ is None:
        out = df.select(pl.col(col_name).unique()).with_row_count()
        out = out.select([col_name,'row_nr'])
        encoding_dict = dict(out.iter_rows())
    else:
        encoding_dict = encoding_dict_
    df = df.with_columns(pl.col(col_name).map_dict(encoding_dict,return_dtype=pl.UInt64).alias(col_name+'_encoded'))
    return df,encoding_dict

def encode_two_columns(df, col_1,col_2):
    encoding_dict = dict(df.select([col_1,col_2]).unique().iter_rows())
    return encoding_dict



# Cleaning patient history sample

In [3]:
patient_history_encoding = {}
cols_to_encode = ['dx_name']
data = pl.read_csv('EPIC_EMR/patient_history.csv',dtypes={'diagnosis_code':pl.Utf8})
data = data.unique()

In [4]:
data.shape

(437721, 3)

In [5]:
data.head()

mrn,diagnosis_code,dx_name
str,str,str
"""cdce1f36836256…","""117.5""","""Cryptococcosis…"
"""309411c5b6b965…","""153.1""","""Malignant neop…"
"""13e76a6d564f80…","""238.2""","""Neoplasm of un…"
"""f42b6b0d38e95a…","""238.2""","""Neoplasm of un…"
"""ef7b3e727d8cc7…","""238.2""","""Neoplasm of un…"


In [6]:
patient_history_encoding['dx_name___diagnosis_code'] = encode_two_columns(data,'dx_name','diagnosis_code')
for col in cols_to_encode:
    data, patient_history_encoding[col] = encode_column_and_replace(data,col)


In [7]:
data.head()

mrn,diagnosis_code,dx_name,dx_name_encoded
str,str,str,u64
"""cdce1f36836256…","""117.5""","""Cryptococcosis…",32990
"""309411c5b6b965…","""153.1""","""Malignant neop…",45319
"""13e76a6d564f80…","""238.2""","""Neoplasm of un…",37245
"""f42b6b0d38e95a…","""238.2""","""Neoplasm of un…",37245
"""ef7b3e727d8cc7…","""238.2""","""Neoplasm of un…",37245


In [8]:
data.null_count()

mrn,diagnosis_code,dx_name,dx_name_encoded
u32,u32,u32,u32
0,107424,0,0


In [9]:
patient_history_encoding.keys()

dict_keys(['dx_name___diagnosis_code', 'dx_name'])

In [10]:
data = data.select([
    'mrn','dx_name_encoded'])
data.shape

(437721, 2)

In [11]:
data.head()

mrn,dx_name_encoded
str,u64
"""cdce1f36836256…",32990
"""309411c5b6b965…",45319
"""13e76a6d564f80…",37245
"""f42b6b0d38e95a…",37245
"""ef7b3e727d8cc7…",37245


In [12]:
data.write_csv('EPIC_EMR_cleaned/patient_history_cleaned.csv')

In [13]:
len(patient_history_encoding['dx_name'])

59056

# Cleaning patient information
Here The columns ICU_ADMIN_FLAG, SEX, PRIMARY_ANES_TYPE_NM, PATIENT_CLASS_GROUP, DISCH_DISP_C, ASA_RATING_C are encoded

In [14]:
patient_information_encoding = {}
cols_to_encode = ['ICU_ADMIN_FLAG','SEX','PRIMARY_ANES_TYPE_NM','PATIENT_CLASS_GROUP'] # DISCH_DISP
data = pl.read_csv('EPIC_EMR/patient_information.csv',dtypes={})
data = data.unique()
data = data.with_columns(pl.col('DISCH_DISP_C').cast(pl.UInt64),pl.col('ASA_RATING_C').cast(pl.UInt64))

two_columns_encoding = [('DISCH_DISP','DISCH_DISP_C'),('ASA_RATING','ASA_RATING_C')]


In [15]:
for col_1,col_2 in two_columns_encoding:
    patient_information_encoding[col_1+'___'+col_2] = encode_two_columns(data,col_1,col_2)

for col in cols_to_encode:
    data, patient_information_encoding[col] = encode_column_and_replace(data,col)


In [16]:
data.null_count()

LOG_ID,MRN,DISCH_DISP_C,DISCH_DISP,HOSP_ADMSN_TIME,HOSP_DISCH_TIME,LOS,ICU_ADMIN_FLAG,SURGERY_DATE,BIRTH_DATE,HEIGHT,WEIGHT,SEX,PRIMARY_ANES_TYPE_NM,ASA_RATING_C,ASA_RATING,PATIENT_CLASS_GROUP,PATIENT_CLASS_NM,PRIMARY_PROCEDURE_NM,IN_OR_DTTM,OUT_OR_DTTM,AN_START_DATETIME,AN_STOP_DATETIME,ICU_ADMIN_FLAG_encoded,SEX_encoded,PRIMARY_ANES_TYPE_NM_encoded,PATIENT_CLASS_GROUP_encoded
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,7,7,0,14,14,0,0,0,12726,2363,0,36,6811,6811,0,0,6,6403,6469,7313,7327,0,0,0,0


In [17]:

data, patient_information_encoding['DISCH_DISP'] = encode_column_and_replace(data,'DISCH_DISP',patient_information_encoding['DISCH_DISP___DISCH_DISP_C'])
data, patient_information_encoding['ASA_RATING'] = encode_column_and_replace(data,'ASA_RATING',patient_information_encoding['ASA_RATING___ASA_RATING_C'])


In [18]:
data.null_count()

LOG_ID,MRN,DISCH_DISP_C,DISCH_DISP,HOSP_ADMSN_TIME,HOSP_DISCH_TIME,LOS,ICU_ADMIN_FLAG,SURGERY_DATE,BIRTH_DATE,HEIGHT,WEIGHT,SEX,PRIMARY_ANES_TYPE_NM,ASA_RATING_C,ASA_RATING,PATIENT_CLASS_GROUP,PATIENT_CLASS_NM,PRIMARY_PROCEDURE_NM,IN_OR_DTTM,OUT_OR_DTTM,AN_START_DATETIME,AN_STOP_DATETIME,ICU_ADMIN_FLAG_encoded,SEX_encoded,PRIMARY_ANES_TYPE_NM_encoded,PATIENT_CLASS_GROUP_encoded,DISCH_DISP_encoded,ASA_RATING_encoded
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,7,7,0,14,14,0,0,0,12726,2363,0,36,6811,6811,0,0,6,6403,6469,7313,7327,0,0,0,0,7,6811


In [19]:
data.head()

LOG_ID,MRN,DISCH_DISP_C,DISCH_DISP,HOSP_ADMSN_TIME,HOSP_DISCH_TIME,LOS,ICU_ADMIN_FLAG,SURGERY_DATE,BIRTH_DATE,HEIGHT,WEIGHT,SEX,PRIMARY_ANES_TYPE_NM,ASA_RATING_C,ASA_RATING,PATIENT_CLASS_GROUP,PATIENT_CLASS_NM,PRIMARY_PROCEDURE_NM,IN_OR_DTTM,OUT_OR_DTTM,AN_START_DATETIME,AN_STOP_DATETIME,ICU_ADMIN_FLAG_encoded,SEX_encoded,PRIMARY_ANES_TYPE_NM_encoded,PATIENT_CLASS_GROUP_encoded,DISCH_DISP_encoded,ASA_RATING_encoded
str,str,u64,str,str,str,f64,str,str,i64,str,f64,str,str,u64,str,str,str,str,str,str,str,str,u64,u64,u64,u64,u64,u64
"""6af225abbb809c…","""7c32ed97b90276…",20,"""Home Healthcar…","""10/28/20 9:28""","""11/5/20 18:52""",8.0,"""Yes""","""10/28/20 0:00""",55,"""5' 0""",2620.83,"""Female""","""Regional""",3,"""Severe Systemi…","""Inpatient""","""Hospital Inpat…","""TRANSPLANT REC…","""10/28/20 12:38…","""10/28/20 16:02…","""10/28/20 12:38…","""10/28/20 16:08…",0,2,10,0,20,3
"""3119d72120dacb…","""cdc958fff5570a…",15,"""Home Routine""","""4/7/19 5:18""","""4/8/19 9:50""",1.0,"""Yes""","""4/7/19 0:00""",54,"""5' 2""",2125.23,"""Female""","""General""",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""SACROCOLPOPEXY…","""4/7/19 7:12""","""4/7/19 13:38""","""4/7/19 7:12""","""4/7/19 13:43""",0,2,1,1,15,2
"""f0d16bb8e59321…","""95d54ee9f5abfa…",15,"""Home Routine""","""12/14/18 7:16""","""12/16/18 16:05…",2.0,"""No""","""12/14/18 0:00""",43,"""5' 11""",3710.78,"""Male""","""General""",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""URETHROPLASTY,…","""12/14/18 8:44""","""12/14/18 12:14…","""12/14/18 8:46""","""12/14/18 12:30…",1,1,1,1,15,2
"""32f1980e394417…","""0dcace1cd2a4d8…",15,"""Home Routine""","""6/22/19 8:10""","""6/23/19 11:15""",1.0,"""No""","""6/22/19 0:00""",79,,1968.27,"""Male""","""General""",3,"""Severe Systemi…","""Outpatient""","""Hospital Outpa…","""DECOMPRESSION,…","""6/22/19 10:36""","""6/22/19 13:05""","""6/22/19 10:36""","""6/22/19 13:12""",1,1,1,1,15,3
"""db83846371c937…","""e3025ead155cb9…",15,"""Home Routine""","""10/12/19 8:43""","""10/13/19 18:02…",1.0,"""No""","""10/12/19 0:00""",67,"""5' 4.5""",2567.92,"""Female""","""General""",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""HYSTERECTOMY, …","""10/12/19 10:38…","""10/12/19 14:31…","""10/12/19 10:38…","""10/12/19 14:37…",1,2,1,1,15,2


The Height is fiven in feet'inch metric. We convert this into centimeters.

In [20]:
data = data.with_columns(pl.col('HEIGHT').str.split(by="'").map_elements(lambda x: float(x[0])*30.48  + float(x[1])*2.54).alias('HEIGHT_IN_CM'))

In [21]:
data.head()

LOG_ID,MRN,DISCH_DISP_C,DISCH_DISP,HOSP_ADMSN_TIME,HOSP_DISCH_TIME,LOS,ICU_ADMIN_FLAG,SURGERY_DATE,BIRTH_DATE,HEIGHT,WEIGHT,SEX,PRIMARY_ANES_TYPE_NM,ASA_RATING_C,ASA_RATING,PATIENT_CLASS_GROUP,PATIENT_CLASS_NM,PRIMARY_PROCEDURE_NM,IN_OR_DTTM,OUT_OR_DTTM,AN_START_DATETIME,AN_STOP_DATETIME,ICU_ADMIN_FLAG_encoded,SEX_encoded,PRIMARY_ANES_TYPE_NM_encoded,PATIENT_CLASS_GROUP_encoded,DISCH_DISP_encoded,ASA_RATING_encoded,HEIGHT_IN_CM
str,str,u64,str,str,str,f64,str,str,i64,str,f64,str,str,u64,str,str,str,str,str,str,str,str,u64,u64,u64,u64,u64,u64,f64
"""6af225abbb809c…","""7c32ed97b90276…",20,"""Home Healthcar…","""10/28/20 9:28""","""11/5/20 18:52""",8.0,"""Yes""","""10/28/20 0:00""",55,"""5' 0""",2620.83,"""Female""","""Regional""",3,"""Severe Systemi…","""Inpatient""","""Hospital Inpat…","""TRANSPLANT REC…","""10/28/20 12:38…","""10/28/20 16:02…","""10/28/20 12:38…","""10/28/20 16:08…",0,2,10,0,20,3,152.4
"""3119d72120dacb…","""cdc958fff5570a…",15,"""Home Routine""","""4/7/19 5:18""","""4/8/19 9:50""",1.0,"""Yes""","""4/7/19 0:00""",54,"""5' 2""",2125.23,"""Female""","""General""",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""SACROCOLPOPEXY…","""4/7/19 7:12""","""4/7/19 13:38""","""4/7/19 7:12""","""4/7/19 13:43""",0,2,1,1,15,2,157.48
"""f0d16bb8e59321…","""95d54ee9f5abfa…",15,"""Home Routine""","""12/14/18 7:16""","""12/16/18 16:05…",2.0,"""No""","""12/14/18 0:00""",43,"""5' 11""",3710.78,"""Male""","""General""",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""URETHROPLASTY,…","""12/14/18 8:44""","""12/14/18 12:14…","""12/14/18 8:46""","""12/14/18 12:30…",1,1,1,1,15,2,180.34
"""32f1980e394417…","""0dcace1cd2a4d8…",15,"""Home Routine""","""6/22/19 8:10""","""6/23/19 11:15""",1.0,"""No""","""6/22/19 0:00""",79,,1968.27,"""Male""","""General""",3,"""Severe Systemi…","""Outpatient""","""Hospital Outpa…","""DECOMPRESSION,…","""6/22/19 10:36""","""6/22/19 13:05""","""6/22/19 10:36""","""6/22/19 13:12""",1,1,1,1,15,3,
"""db83846371c937…","""e3025ead155cb9…",15,"""Home Routine""","""10/12/19 8:43""","""10/13/19 18:02…",1.0,"""No""","""10/12/19 0:00""",67,"""5' 4.5""",2567.92,"""Female""","""General""",2,"""Mild Systemic …","""Outpatient""","""Hospital Outpa…","""HYSTERECTOMY, …","""10/12/19 10:38…","""10/12/19 14:31…","""10/12/19 10:38…","""10/12/19 14:37…",1,2,1,1,15,2,163.83


We filter the columns that provide useful information for our analysis and models.

In [23]:
columns_needed=[
    'LOG_ID',
    'MRN',
 'HOSP_ADMSN_TIME',
 'HOSP_DISCH_TIME',
 'LOS',
 'SURGERY_DATE',
 'BIRTH_DATE',
 'WEIGHT',
 'IN_OR_DTTM',
 'OUT_OR_DTTM',
 'AN_START_DATETIME',
 'AN_STOP_DATETIME',
 'ICU_ADMIN_FLAG_encoded',
 'SEX_encoded',
 'PRIMARY_ANES_TYPE_NM_encoded',
 'PATIENT_CLASS_GROUP_encoded',
 'DISCH_DISP_encoded',
 'ASA_RATING_encoded',
 'HEIGHT_IN_CM']

In [24]:
len(columns_needed)

19

In [25]:
data = data.select(columns_needed)
print(data.shape)
data.write_csv('EPIC_EMR_cleaned/patient_information_cleaned.csv')

(64364, 19)


# Cleaning patient lab data

This is a very large csv file with the size of 3GB. This file contains lab samples of every patient. The column 'Abnormal Flag' is created by lab faculty based on all the other attributes of the patient, which means it can summarise all the other attributes. Thus, we can omit rest of the columns.

In [26]:
patient_lab_encoding = {}
cols_to_encode = ['Abnormal Flag'] # DISCH_DISP
data = pl.read_csv('EPIC_EMR/patient_labs.csv',dtypes={})
data = data.unique()


In [27]:
data.head()

LOG_ID,MRN,ENC_TYPE_NM,Lab Code,Lab Name,Observation Value,Measurement Units,Reference Range,Abnormal Flag,Collection Datetime
str,str,str,str,str,f64,str,str,str,str
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""2028-9""","""Carbon dioxide…",24.0,"""mmol/L""","""21-31""","""N""","""2020-03-10 23:…"
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""2951-2""","""Sodium""",136.0,"""mmol/L""","""136-145""","""N""","""2020-03-08 04:…"
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""30428-7""","""Erythrocyte me…",92.1,"""FL""","""81.5-97.0""","""N""","""2020-03-09 05:…"
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""48642-3""","""Glomerular fil…",9999999.0,"""Unknown""","""Unknown""","""N""","""2020-03-12 21:…"
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""48643-1""","""Glomerular fil…",9999999.0,"""Unknown""","""Unknown""","""N""","""2020-03-12 21:…"


In [28]:
for col in cols_to_encode:
    data, patient_lab_encoding[col] = encode_column_and_replace(data,col)

In [29]:
data.null_count()

LOG_ID,MRN,ENC_TYPE_NM,Lab Code,Lab Name,Observation Value,Measurement Units,Reference Range,Abnormal Flag,Collection Datetime,Abnormal Flag_encoded
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,89706,0,0,201566,0,0


In [30]:
data.head()

LOG_ID,MRN,ENC_TYPE_NM,Lab Code,Lab Name,Observation Value,Measurement Units,Reference Range,Abnormal Flag,Collection Datetime,Abnormal Flag_encoded
str,str,str,str,str,f64,str,str,str,str,u64
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""2028-9""","""Carbon dioxide…",24.0,"""mmol/L""","""21-31""","""N""","""2020-03-10 23:…",1
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""2951-2""","""Sodium""",136.0,"""mmol/L""","""136-145""","""N""","""2020-03-08 04:…",1
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""30428-7""","""Erythrocyte me…",92.1,"""FL""","""81.5-97.0""","""N""","""2020-03-09 05:…",1
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""48642-3""","""Glomerular fil…",9999999.0,"""Unknown""","""Unknown""","""N""","""2020-03-12 21:…",1
"""3ab8e6344cc9f9…","""559c869f9d7db8…","""Hospital Encou…","""48643-1""","""Glomerular fil…",9999999.0,"""Unknown""","""Unknown""","""N""","""2020-03-12 21:…",1


In [31]:
columns_needed = ['Abnormal Flag_encoded','Collection Datetime','LOG_ID','MRN']
data = data.select(columns_needed)
print(data.shape)
data.write_csv('EPIC_EMR_cleaned/patient_labs_cleaned.csv')

(29071808, 4)


# Cleaning patient post op complications

This Dataset contains the post operative complications observed by the patients. All of the columns provide useful information for our analysis, which must be encoded.

In [56]:
patient_post_op_encoding = {}
cols_to_encode = ['CONTEXT_NAME','Element_abbr','SMRTDTA_ELEM_VALUE'] 
data = pl.read_csv('EPIC_EMR/patient_post_op_complications.csv',dtypes={})
data = data.unique()


In [57]:
data.head()

LOG_ID,MRN,Element_Name,CONTEXT_NAME,Element_abbr,SMRTDTA_ELEM_VALUE
str,str,str,str,str,str
"""b1df0a3b9037bd…","""1a479b8bad2165…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""
"""567f27d6b31690…","""f35086637e7870…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""
"""0059e815691350…","""73acc50cd25826…","""AN AQI POST-OP…","""NOTE""","""AN Post-op Com…","""None"""
"""a7a15da2257283…","""aee6ffad9c9ecc…","""AN AQI POST-OP…","""ENCOUNTER""","""AN Post-op Com…","""None"""
"""00072a39c8cfa0…","""c88dcd13cb0cf4…","""AN AQI POST-OP…","""NOTE""","""AN Post-op Com…","""None"""


In [58]:
data = data.with_columns(pl.col('Element_abbr').str.replace(r"AN Post-op Complications",''))
data.head()

LOG_ID,MRN,Element_Name,CONTEXT_NAME,Element_abbr,SMRTDTA_ELEM_VALUE
str,str,str,str,str,str
"""b1df0a3b9037bd…","""1a479b8bad2165…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""
"""567f27d6b31690…","""f35086637e7870…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""
"""0059e815691350…","""73acc50cd25826…","""AN AQI POST-OP…","""NOTE""","""""","""None"""
"""a7a15da2257283…","""aee6ffad9c9ecc…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None"""
"""00072a39c8cfa0…","""c88dcd13cb0cf4…","""AN AQI POST-OP…","""NOTE""","""""","""None"""


In [35]:
for col in cols_to_encode:
    data, patient_post_op_encoding[col] = encode_column_and_replace(data,col)


In [36]:
data.head()

LOG_ID,MRN,Element_Name,CONTEXT_NAME,Element_abbr,SMRTDTA_ELEM_VALUE,CONTEXT_NAME_encoded,Element_abbr_encoded,SMRTDTA_ELEM_VALUE_encoded
str,str,str,str,str,str,u64,u64,u64
"""a8b3f9afa63c96…","""aaebaad9587efc…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None""",2,2,63
"""5dc0d50fa1d11e…","""dc6e2920190557…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None""",2,2,63
"""c66c0c773c2280…","""66d9279ff5f4b9…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None""",2,2,63
"""bec65b5c7ae241…","""c80dbde76c38ec…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""Administrative…",2,2,233
"""104d64f326946e…","""bb42a6512d570b…","""AN AQI POST-OP…","""ENCOUNTER""","""""","""None""",2,2,63


In [37]:
data.null_count()

LOG_ID,MRN,Element_Name,CONTEXT_NAME,Element_abbr,SMRTDTA_ELEM_VALUE,CONTEXT_NAME_encoded,Element_abbr_encoded,SMRTDTA_ELEM_VALUE_encoded
u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,483,0,0,0


In [38]:
columns_needed = ['SMRTDTA_ELEM_VALUE_encoded','Element_abbr_encoded','CONTEXT_NAME_encoded','LOG_ID','MRN']
data = data.select(columns_needed)
print(data.shape)
data.write_csv('EPIC_EMR_cleaned/patient_post_op_complication_cleaned.csv')

(84950, 5)


# Cleaning Patient Vist
This dataset contains the diagnosis information of all visits of patients

In [39]:
patient_visit_encoding = {}
cols_to_encode = ['dx_name'] 
data = pl.read_csv('EPIC_EMR/patient_visit.csv',dtypes={'diagnosis_code':pl.Utf8})
data = data.unique()

In [40]:
data.head()

LOG_ID,mrn,diagnosis_code,dx_name
str,str,str,str
"""fa149b1b36855b…","""ad8291a53a44f0…","""153.3""","""Malignant neop…"
"""5e2775d183fc11…","""5f3940d268bee2…","""153.3""","""Malignant neop…"
"""1cb372141215f6…","""3668823554b94e…","""188.0""","""Malignant neop…"
"""f091dc2a1a7965…","""9736aa7fdd136c…","""188.0""","""Malignant neop…"
"""0c357a71be7d7d…","""c09ed71d87105c…","""188.0""","""Malignant neop…"


In [41]:

patient_visit_encoding['dx_name___diagnosis_code'] = encode_two_columns(data,'dx_name','diagnosis_code')
for col in cols_to_encode:
    data, patient_visit_encoding[col] = encode_column_and_replace(data,col)

In [42]:
data.head()

LOG_ID,mrn,diagnosis_code,dx_name,dx_name_encoded
str,str,str,str,u64
"""fa149b1b36855b…","""ad8291a53a44f0…","""153.3""","""Malignant neop…",4008
"""5e2775d183fc11…","""5f3940d268bee2…","""153.3""","""Malignant neop…",4008
"""1cb372141215f6…","""3668823554b94e…","""188.0""","""Malignant neop…",4583
"""f091dc2a1a7965…","""9736aa7fdd136c…","""188.0""","""Malignant neop…",4583
"""0c357a71be7d7d…","""c09ed71d87105c…","""188.0""","""Malignant neop…",4583


In [43]:
data.null_count()

LOG_ID,mrn,diagnosis_code,dx_name,dx_name_encoded
u32,u32,u32,u32,u32
0,0,38138,0,0


In [44]:
data = data.select(pl.col('LOG_ID'),pl.col('mrn').alias('MRN'),pl.col('dx_name_encoded'))

In [45]:
data.shape

(131455, 3)

In [46]:
data.write_csv('EPIC_EMR_cleaned/patient_visit_cleaned.csv')

# Cleaning patient procedure events

In [47]:
patient_procedure_event_encoding = {}
cols_to_encode = ['EVENT_DISPLAY_NAME'] 
data = pl.read_csv('EPIC_EMR/patient_procedure events.csv')
data = data.unique()

In [48]:
data.head()

LOG_ID,MRN,EVENT_DISPLAY_NAME,EVENT_TIME,NOTE_TEXT
str,str,str,str,str
"""e1a066d449ed18…","""499386c1a8693f…","""Quick Note""","""4/27/19 3:04""",
"""068b548cf0d919…","""68d58e4c2dcd06…","""Quick Note""","""5/6/19 11:46""",
"""05195f76452108…","""85a062b39530bf…","""Transported to…","""6/12/19 20:59""",
"""33a907e9affc52…","""f7bf5c74f687a5…","""Mark Now""","""9/23/19 7:57""",
"""59dc47ed0b52c5…","""424d7ceb87a7de…","""IV Antibiotics…","""9/3/19 7:24""",


In [49]:
for col in cols_to_encode:
    data, patient_procedure_event_encoding[col] = encode_column_and_replace(data,col)

In [50]:
data.head()

LOG_ID,MRN,EVENT_DISPLAY_NAME,EVENT_TIME,NOTE_TEXT,EVENT_DISPLAY_NAME_encoded
str,str,str,str,str,u64
"""e1a066d449ed18…","""499386c1a8693f…","""Quick Note""","""4/27/19 3:04""",,72
"""068b548cf0d919…","""68d58e4c2dcd06…","""Quick Note""","""5/6/19 11:46""",,72
"""05195f76452108…","""85a062b39530bf…","""Transported to…","""6/12/19 20:59""",,27
"""33a907e9affc52…","""f7bf5c74f687a5…","""Mark Now""","""9/23/19 7:57""",,55
"""59dc47ed0b52c5…","""424d7ceb87a7de…","""IV Antibiotics…","""9/3/19 7:24""",,29


In [51]:
data = data.select(['LOG_ID','MRN','EVENT_TIME','EVENT_DISPLAY_NAME_encoded'])

In [52]:
data.write_csv('EPIC_EMR_cleaned/patient_procedure_events_cleaned.csv')

Combine all the encoded data from each dataset and store it in json file, inorder to access the actual value of each encoded value

In [54]:
encoding_dict = {}
encoding_dict['patient_history'] = patient_history_encoding
encoding_dict['patient_information'] = patient_information_encoding
encoding_dict['patient_lab'] = patient_lab_encoding
encoding_dict['patient_post_op_complications'] = patient_post_op_encoding
encoding_dict['patient_visit'] = patient_visit_encoding
encoding_dict['patient_procedure_event'] = patient_procedure_event_encoding


In [55]:
with open ('EPIC_EMR_cleaned/encoding.json','w') as f:
    json.dump(encoding_dict,f)