# Baseline Characteristics


In [9]:
from math import sqrt
import numpy as np
import pandas as pd
from sympy.stats import StudentT, P
from common_model_eicu import get_full_data
from common_eicu import CATEGORICAL_COLUMNS, KEY_IDENTITY, KEY_FLAG

In [10]:
# Load the dataset. np.nan is the only argument handed to get_full_data
# (presumably the placeholder used for missing values -- TODO confirm).
df_data = get_full_data(np.nan)

# Number of rows in the loaded frame.
df_data.shape[0]

100308

In [11]:
# Count the distinct values in the KEY_IDENTITY column
# (nunique ignores NaN by default).
df_data[KEY_IDENTITY].nunique()

17729

In [12]:
# Number of rows per value of the KEY_FLAG column, most frequent value first.
df_data[KEY_FLAG].value_counts()

0    96442
1     3866
Name: flag, dtype: int64

In [13]:
# Share of rows carrying each KEY_FLAG value: per-value counts divided by
# the total number of rows in the frame.
total_rows = len(df_data)
df_data[KEY_FLAG].value_counts() / total_rows

0    0.961459
1    0.038541
Name: flag, dtype: float64

In [14]:
# Bounds for the 'BMI' column: values outside [MIN_BMI, MAX_BMI] are pulled
# back to the nearest bound rather than being dropped.
MAX_BMI = 100
MIN_BMI = 10

# NaN compares False against both bounds, so missing BMI values stay missing.
bmi_above_max = df_data['BMI'] > MAX_BMI
df_data.loc[bmi_above_max, 'BMI'] = MAX_BMI

# The lower mask is computed after the upper clamp has been applied.
bmi_below_min = df_data['BMI'] < MIN_BMI
df_data.loc[bmi_below_min, 'BMI'] = MIN_BMI

In [29]:
# Retained only for backward compatibility: the p-value is now evaluated
# exactly, so no Monte Carlo sampling is performed any more.
T_TEST_SAMPLES = 1_000_000

def t_test(x: np.ndarray, y: np.ndarray) -> float:
    '''
    Two-sided, pooled-variance (Student) t-test of H_0: mean(x) = mean(y).

    The p-value is taken from the exact Student-t survival function
    instead of being estimated by random sampling, so the result is
    deterministic and not limited to a resolution of 1 / T_TEST_SAMPLES.

    Parameters
    ----------
    x, y -- one-dimensional samples without NaN; each needs at least two
            observations so that its sample variance is defined.

    Returns
    -------
    p -- two-sided p-value, i.e. the probability under H_0 of observing a
         statistic at least as extreme as the one computed from x and y.

    Raises
    ------
    ValueError -- if either sample has fewer than two observations, or if
                  the statistic is undefined (NaN in the input, or zero
                  pooled variance together with equal means).
    '''
    # Imported locally so this block does not depend on a file-level import.
    from scipy.stats import t as student_t

    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n_x = len(x)
    n_y = len(y)
    if n_x < 2 or n_y < 2:
        raise ValueError(
            'each sample needs at least two observations '
            f'(n_x = {n_x}, n_y = {n_y})'
        )

    difference = float(x.mean() - y.mean())
    S2_x = float(x.var(ddof=1))
    S2_y = float(y.var(ddof=1))
    dof = n_x + n_y - 2
    # Pooled standard deviation under the equal-variance assumption.
    S = sqrt(((n_x - 1) * S2_x + (n_y - 1) * S2_y) / dof)

    if S == 0:
        if difference == 0:
            raise ValueError(
                't statistic is undefined: both samples are constant and '
                'share the same mean'
            )
        # Constant samples with different means: |T| is infinite, so the
        # tail probability is exactly zero.
        return 0.0

    T = difference / S / sqrt(1 / n_x + 1 / n_y)
    if np.isnan(T):
        raise ValueError(
            't statistic is NaN '
            f'(n_x = {n_x}, n_y = {n_y}, S2_x = {S2_x}, S2_y = {S2_y}, '
            f'dof = {dof}, S = {S})'
        )

    # The t distribution is symmetric, so the two-sided p-value is twice
    # the upper-tail probability at |T|.
    return float(2 * student_t.sf(abs(T), dof))

In [30]:
# Column names of the baseline table.
KEY_BASELINE_INDEX = 'baseline_index'
KEY_POSITIVE = 'positive'
KEY_NEGATIVE = 'negative'
KEY_P = 'p'

# Split the rows by flag value: 1 -> positive group, 0 -> negative group.
df_positive = df_data.loc[df_data[KEY_FLAG] == 1]
df_negative = df_data.loc[df_data[KEY_FLAG] == 0]


def _baseline_value(frame, name):
    '''Summarise one column: first mode if categorical, mean otherwise.'''
    if name in CATEGORICAL_COLUMNS:
        return frame[name].mode()[0]
    return frame[name].mean()


# One record per column, skipping the flag and identity columns themselves.
baseline_records = [
    {
        KEY_BASELINE_INDEX: name,
        KEY_POSITIVE: _baseline_value(df_positive, name),
        KEY_NEGATIVE: _baseline_value(df_negative, name),
    }
    for name in df_data.columns
    if name not in (KEY_FLAG, KEY_IDENTITY)
]

df_baseline = pd.DataFrame(baseline_records).set_index(KEY_BASELINE_INDEX)

# The p column starts out as 0.0 everywhere and is then filled row by row.
df_baseline[KEY_P] = 0.0
for name in df_baseline.index:
    if name in CATEGORICAL_COLUMNS:
        # NOTE(review): for categorical columns this is a mode-equality
        # indicator (1.0 when both groups share the same mode, else 0.0),
        # not the p-value of a statistical test.
        same_mode = (
            df_baseline.at[name, KEY_POSITIVE]
            == df_baseline.at[name, KEY_NEGATIVE]
        )
        p_value = 1.0 if same_mode else 0.0
    else:
        # Compare the two groups on their non-missing observations only.
        p_value = t_test(
            df_positive[name].dropna().to_numpy(),
            df_negative[name].dropna().to_numpy(),
        )
    df_baseline.at[name, KEY_P] = p_value

In [33]:
# Write the baseline table to disk, rows ordered by ascending p.
df_baseline_sorted = df_baseline.sort_values(by=KEY_P, ascending=True)
df_baseline_sorted.to_csv('./data/baseline.csv')