# Preparation

In [None]:
import os

import numpy as np
import pandas as pd
# Show up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500
# Explicitly select matplotlib as the backend for pandas plotting methods.
pd.options.plotting.backend = 'matplotlib'

# import plotly.io as pio
# import plotly.express as px
# pio.templates.default = "plotly_white" # "plotly", "plotly_white", "plotly_dark", "ggplot2", "seaborn", "simple_white", "none"

# import seaborn as sns
# sns.set_style("whitegrid", {"axes.facecolor": ".9"})

import matplotlib.pyplot as plt
# plt.rcParams['text.usetex'] = True

%matplotlib inline

# Data Schema

In [None]:
# Load the dataset using pandas' default missing-value parsing.
# A custom `na_values` list is kept here, disabled, for reference:
#   na_values=['', ' ', '?', '?|?', 'None', '-NaN', '-nan', '', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null']
# A trailing `.fillna(np.nan)` is likewise disabled.
df = pd.read_csv("data/dataset.csv")

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Feature groups, following the dataset documentation:
# https://www.hindawi.com/journals/bmri/2014/781670/tab1/
#
# The columns are split into identifiers, numerical features and
# categorical features.

# Encounter and patient identifiers (statistically not interesting).
ids = ["encounter_id", "patient_nbr"]

# Numerical features.
numerical_features = [
    "time_in_hospital",  # integer number of days between admission and discharge
    "number_outpatient",
    "number_inpatient",
    "number_emergency",
    "num_lab_procedures",
    "number_diagnoses",
    "num_medications",
    "num_procedures",
]

# Categorical features.
categorical_features = [
    "race",
    "gender",
    "age",
    "weight",  # the documentation says numeric, but in this data it is categorical
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id",
    "payer_code",
    "medical_specialty",
    "primary_diagnosis_code",
    "other_diagnosis_codes",
    "ndc_code",
    "max_glu_serum",
    "A1Cresult",
    "change",
    "readmitted",
]

# Data Distribution

### Categorical features

In [None]:
# `race` is categorical, so show the number of rows per category as a bar chart.
# A histogram bins values along a numeric axis, which does not suit category labels.
# As with the previous histogram, missing (NaN) values are left out.
df["race"].value_counts().plot.bar(title="race")

In [None]:
# `gender` is categorical, so show the number of rows per category as a bar chart.
# A histogram bins values along a numeric axis, which does not suit category labels.
# As with the previous histogram, missing (NaN) values are left out.
df["gender"].value_counts().plot.bar(title="gender")

In [None]:
# `age` is treated as categorical here, so show the number of rows per category
# as a bar chart instead of a histogram over a numeric axis.
# `sort_index()` orders the bars by label rather than by frequency.
# NOTE(review): label order is lexicographic; confirm it matches the natural age order.
# As with the previous histogram, missing (NaN) values are left out.
df["age"].value_counts().sort_index().plot.bar(title="age")

### Numerical features

In [None]:
# One histogram per numerical feature, laid out on a 2x4 grid.
df[numerical_features].hist(
    bins=100,
    layout=(2, 4),
    figsize=(20, 10),
)

# Data missingness

In [None]:
def _missingness_record(frame, col, feature_type):
    """Summarise null (and, for numerical features, zero) counts of one column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to inspect.
    col : str
        Name of the column in ``frame``.
    feature_type : str
        ``'Numerical'`` or ``'Categorical'``; zero counts are only computed
        for numerical features and reported as ``'-'`` otherwise.

    Returns
    -------
    dict
        One row of the missingness table.
    """
    n_rows = frame.shape[0]
    n_null = frame[col].isnull().sum()

    if feature_type == 'Numerical':
        n_zeros = (frame[col] == 0).sum()
        pct_zeros = np.around(n_zeros / n_rows * 100, 1)
    else:
        # Zero counts are not meaningful for categorical features.
        n_zeros = pct_zeros = '-'

    return {
        'Feature': col,
        # Same key for both feature types, so the table has a single
        # 'Frequency' column instead of two half-empty ones.
        'Frequency': n_rows,
        '# Freq_Null': n_null,
        '% Freq_Null': np.around(n_null / n_rows * 100, 1),
        '# Freq_Zeros': n_zeros,
        '% Freq_Zeros': pct_zeros,
        'Type': feature_type,
    }


# One record per feature: numerical features first, then categorical ones.
# NOTE(review): only values pandas parsed as NaN are counted as null; placeholder
# strings in the CSV (e.g. '?') would not be counted here. Confirm against the data.
df_columns_info = (
    [_missingness_record(df, col, 'Numerical') for col in numerical_features]
    + [_missingness_record(df, col, 'Categorical') for col in categorical_features]
)

pd.DataFrame(df_columns_info)

# Data Outliers

In [None]:
# Box plots of all numerical features on one set of axes, to spot outliers.
df[numerical_features].boxplot(
    figsize=(20, 10),
)