In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

%matplotlib inline

In [2]:
df = pd.read_csv('../data/raw/training_v2.csv')

In [3]:
df.drop('encounter_id', axis=1, inplace=True)
df.drop('patient_id', axis=1, inplace=True)
df.drop('hospital_id', axis=1, inplace=True)
df.drop('icu_id', axis=1, inplace=True)

In [4]:
col_list = []
with open("../data/raw/del_columns.txt", "r") as f:
    for line in f:
        col_list.append(line[:-1])
        
df.drop(columns=col_list, inplace=True, axis=1)

In [5]:
df.shape

(91713, 93)

In [6]:
df['gender'] = df['gender'].fillna(df['gender'].value_counts().index[0])
print(df['gender'].isnull().sum())
df['gender'] = df['gender'].map({'M': 0, 'F': 1})

0


In [7]:
df['ethnicity'].value_counts()

Caucasian           70684
African American     9547
Other/Unknown        4374
Hispanic             3796
Asian                1129
Native American       788
Name: ethnicity, dtype: int64

In [8]:
df['ethnicity'] = df['ethnicity'].fillna("Other/Unknown")
print(df['ethnicity'].isnull().sum())

0


In [9]:
df.age.fillna(int(df.age.mean()), inplace=True)

d = {range(1, 50): 1, range(50, 65): 2, range(65, 75): 3, range(75, 100): 4}

df.age = df.age.apply(lambda x: next((v for k, v in d.items() if x in k), 0))
df.age.value_counts()

2    29709
4    23347
3    20645
1    18012
Name: age, dtype: int64

In [10]:
df.dropna(subset=['heart_rate_apache', 'temp_apache'], inplace=True)

In [11]:
d = {range(25, 80): 1, range(80, 120): 2, range(120, 150): 3, range(150, 180): 4, range(180, 200): 5}

df.heart_rate_apache = df.heart_rate_apache.apply(lambda x: next((v for k, v in d.items() if x in k), 0))
df.heart_rate_apache.value_counts()

2    44776
1    20398
3    18899
4     3481
Name: heart_rate_apache, dtype: int64

In [12]:
d = {range(int(df.temp_apache.min()), 37): 0, range(37, int(df.temp_apache.max())+1): 1}

df.temp_apache = df.temp_apache.apply(lambda x: next((v for k, v in d.items() if x in k), 0))
df.temp_apache.value_counts()

0    85299
1     2255
Name: temp_apache, dtype: int64

In [13]:
df.urineoutput_apache.fillna(df.urineoutput_apache.mean(), inplace=True)
df.urineoutput_apache.isna().sum()

0

In [14]:
# normalize urineoutput_apache
x = df[['urineoutput_apache']].values.astype(float)
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df.urineoutput_apache = x_scaled

df.urineoutput_apache.describe()

count    87554.000000
mean         0.199557
std          0.114528
min          0.000000
25%          0.167427
50%          0.199557
75%          0.199557
max          1.000000
Name: urineoutput_apache, dtype: float64

In [15]:
df.shape

(87554, 93)

In [16]:
df.columns[df.dtypes == "object"]

Index(['ethnicity', 'apache_3j_bodysystem', 'apache_2_bodysystem'], dtype='object')

In [17]:
print("apache_3j_bodysystem groups:\n\n{}".format(df.apache_3j_bodysystem.value_counts()))

apache_3j_bodysystem groups:

Cardiovascular          28743
Neurological            11575
Sepsis                  11377
Respiratory             11141
Gastrointestinal         8712
Metabolic                7308
Trauma                   3781
Genitourinary            2066
Musculoskeletal/Skin     1105
Hematological             607
Gynecological             300
Name: apache_3j_bodysystem, dtype: int64


In [18]:
print("ethnicity groups:\n\n{}".format(df.ethnicity.value_counts()))

ethnicity groups:

Caucasian           67892
African American     8973
Other/Unknown        5306
Hispanic             3637
Asian                 976
Native American       770
Name: ethnicity, dtype: int64


In [19]:
und_diag = {"Undefined diagnoses": "Undefined Diagnoses"}
df["apache_2_bodysystem"].replace(und_diag, inplace = True)

In [20]:
print("apache_2_bodysystem groups:\n\n{}".format(df.apache_2_bodysystem.value_counts()))

apache_2_bodysystem groups:

Cardiovascular         37229
Neurologic             11575
Respiratory            11141
Gastrointestinal        8712
Metabolic               7308
Undefined Diagnoses     4017
Trauma                  3781
Renal/Genitourinary     2345
Haematologic             607
Name: apache_2_bodysystem, dtype: int64


In [21]:
print("apache_2_bodysystem NaN: {}".format(df.apache_2_bodysystem.isna().sum()))
print("apache_3j_bodysystem NaN: {}".format(df.apache_3j_bodysystem.isna().sum()))
print("ethnicity NaN: {}".format(df.ethnicity.isna().sum()))

apache_2_bodysystem NaN: 839
apache_3j_bodysystem NaN: 839
ethnicity NaN: 0


In [23]:
enc = OrdinalEncoder()
cat_columns = ['ethnicity', 'apache_3j_bodysystem', 'apache_2_bodysystem']
#reshape_df = np.array(df).reshape(-1, 1)
#for col in cat_columns:
#    df[col] = enc.fit_transform(reshape_df[col].astype('str'))