# Objective for Part 3

To prepare our dataset for machine learning modeling, we will perform feature engineering as follows:
- One-hot encoding on categorical features
- Append one-hot encoded features into the dataset

In [1]:
# Step 1: import pandas
import pandas as pd

In [2]:
# Step 2: Load the Part I CSV into a DataFrame
# index_col=False tells pandas not to use the first column of the file as the index.
part1_csv = r"data/data_part1.csv"
df = pd.read_csv(part1_csv, index_col=False)
df.head()

Unnamed: 0,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,Floor,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,Floor,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,Emergency Department,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,Operating Room,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,74489,83,0,67.0,27.56,0,Caucasian,M,190.5,Direct Admit,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic


In [3]:
# Step 3: Declare a variable and store the dummified/one-hot encoded values from 'ethnicity'
# drop_first is a boolean flag: pass True, not a string, because any non-empty
# string (even "False") is truthy and would also enable dropping.
# With drop_first=True the first category level is omitted, so k categories
# produce k-1 indicator columns.
onehot_ethnicity = pd.get_dummies(df['ethnicity'], drop_first=True)
onehot_ethnicity.head()

Unnamed: 0,Asian,Caucasian,Hispanic,Native American,Other/Unknown
0,0,1,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0


In [4]:
# Step 4: Declare a variable and store the dummified/one-hot encoded values from 'gender'
# drop_first is a boolean flag: pass True, not a string, because any non-empty
# string (even "False") is truthy and would also enable dropping.
# Only the 'M' indicator column remains after the first level is dropped.
onehot_gender = pd.get_dummies(df['gender'], drop_first=True)
onehot_gender.head()

Unnamed: 0,M
0,1
1,0
2,0
3,0
4,1


In [5]:
# Step 5: Declare a variable and store the dummified/one-hot encoded values from 'icu_stay_type'
# drop_first is a boolean flag: pass True, not a string, because any non-empty
# string (even "False") is truthy and would also enable dropping.
# With drop_first=True the first category level is omitted, so k categories
# produce k-1 indicator columns.
onehot_icu_stay = pd.get_dummies(df['icu_stay_type'], drop_first=True)
onehot_icu_stay.head()

Unnamed: 0,readmit,transfer
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [6]:
# Step 6: Declare a variable and store the dummified/one-hot encoded values from 'icu_type'
# drop_first is a boolean flag: pass True, not a string, because any non-empty
# string (even "False") is truthy and would also enable dropping.
# With drop_first=True the first category level is omitted, so k categories
# produce k-1 indicator columns.
onehot_icu_type = pd.get_dummies(df['icu_type'], drop_first=True)
onehot_icu_type.head()

Unnamed: 0,CSICU,CTICU,Cardiac ICU,MICU,Med-Surg ICU,Neuro ICU,SICU
0,0,1,0,0,0,0,0
1,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0


In [7]:
# Step 7: Names of the columns to carry over unchanged from the source DataFrame
# (the target 'hospital_death' plus the already-numeric features).
col_list = [
    "hospital_death",
    "age",
    "bmi",
    "elective_surgery",
    "height",
    "pre_icu_los_days",
    "readmission_status",
    "weight",
]

In [8]:
# Take an independent copy of just the selected columns so later changes
# to df2 cannot affect (or warn about) the original df.
df2 = df.loc[:, col_list].copy()

In [9]:
# Inspect the selected columns: dtypes and non-null counts
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82965 entries, 0 to 82964
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   hospital_death      82965 non-null  int64  
 1   age                 82965 non-null  float64
 2   bmi                 82965 non-null  float64
 3   elective_surgery    82965 non-null  int64  
 4   height              82965 non-null  float64
 5   pre_icu_los_days    82965 non-null  float64
 6   readmission_status  82965 non-null  int64  
 7   weight              82965 non-null  float64
dtypes: float64(5), int64(3)
memory usage: 5.1 MB


In [10]:
# Step 8: Join the selected columns and every one-hot frame side by side
# (column-wise concatenation; rows are aligned on the shared index).
frames = [df2, onehot_ethnicity, onehot_gender, onehot_icu_stay, onehot_icu_type]
df_merged = pd.concat(frames, axis="columns")

In [11]:
# Inspect the merged frame: column list, dtypes and non-null counts
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82965 entries, 0 to 82964
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   hospital_death      82965 non-null  int64  
 1   age                 82965 non-null  float64
 2   bmi                 82965 non-null  float64
 3   elective_surgery    82965 non-null  int64  
 4   height              82965 non-null  float64
 5   pre_icu_los_days    82965 non-null  float64
 6   readmission_status  82965 non-null  int64  
 7   weight              82965 non-null  float64
 8   Asian               82965 non-null  uint8  
 9   Caucasian           82965 non-null  uint8  
 10  Hispanic            82965 non-null  uint8  
 11  Native American     82965 non-null  uint8  
 12  Other/Unknown       82965 non-null  uint8  
 13  M                   82965 non-null  uint8  
 14  readmit             82965 non-null  uint8  
 15  transfer            82965 non-null  uint8  
 16  CSIC

In [12]:
# Step 9: Write the engineered DataFrame to CSV
# index=False keeps the row index out of the file.
part3_csv = r"data/data_part3.csv"
df_merged.to_csv(part3_csv, index=False)