# Data Collecting

In [3]:
import pandas as pd
import numpy as np
df = pd.read_csv('../data/stroke.csv')

In [4]:
df.shape

(43400, 12)

In [5]:
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [6]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [7]:
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

Column 'id' is patients' id. It is not a contributing factor to stroke. So I will drop the 'id' column.

In [8]:
df = df.drop(columns=['id'])

In [9]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


# Missing value

In [10]:
df.isnull().sum(axis=0)

gender                   0
age                      0
hypertension             0
heart_disease            0
ever_married             0
work_type                0
Residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
stroke                   0
dtype: int64

The dataset contains missing values in 'bmi' and 'smoking_status'. Need to perform Little's MCAR test. However, entries in 'smoking_status' is not numerical. So first need to transform the data.

In [11]:
df['smoking_status'].unique()

array([nan, 'never smoked', 'formerly smoked', 'smokes'], dtype=object)

In [12]:
def transform_never_smoked(x):
    if x == 'never smoked':
        return 1
    if x == 'formerly smoked' or x == 'smokes':
        return 0
    return np.nan

def transform_formerly_smoked(x):
    if x == 'formerly smoked':
        return 1
    if x == 'never smoked' or x == 'smokes':
        return 0
    return np.nan

def transform_smokes(x):
    if x == 'smokes':
        return 1
    if x == 'never smoked' or x == 'formerly smokes':
        return 0
    return np.nan

df['never_smoked'] = df['smoking_status'].apply(transform_never_smoked)
df['formerly_smoked'] = df['smoking_status'].apply(transform_formerly_smoked)
df['smokes'] = df['smoking_status'].apply(transform_smokes)
df.drop(columns=['smoking_status'])
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,never_smoked,formerly_smoked,smokes
0,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0,,,
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0,1.0,0.0,0.0
2,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0,,,
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0,0.0,1.0,
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0,,,


In [13]:
from sklearn.preprocessing import OneHotEncoder
onehot_enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
df_transform = df[['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type']]
onehot_encoded = onehot_enc.fit_transform(df_transform)
df_transform = pd.DataFrame(onehot_encoded, columns = onehot_enc.get_feature_names())
df = df.drop(columns=['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 
                      'Residence_type', 'smoking_status'])
df = pd.concat([df, df_transform], axis=1)
df.head()

Unnamed: 0,age,avg_glucose_level,bmi,stroke,never_smoked,formerly_smoked,smokes,x0_Female,x0_Male,x0_Other,...,x2_1,x3_No,x3_Yes,x4_Govt_job,x4_Never_worked,x4_Private,x4_Self-employed,x4_children,x5_Rural,x5_Urban
0,3.0,95.12,18.0,0,,,,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,58.0,87.96,39.2,0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,8.0,110.89,17.6,0,,,,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,70.0,69.04,35.9,0,0.0,1.0,,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,14.0,161.28,19.1,0,,,,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


The dataset contains missing values in bmi and smoking_status.

In [14]:
import math as ma
import scipy.stats as st

def checks_input_mcar_tests(data):
    """ Checks whether the input parameter of class McarTests is correct
            Parameters
            ----------
            data:
                The input of McarTests specified as 'data'
            Returns
            -------
            bool
                True if input is correct
            """

    if not isinstance(data, pd.DataFrame):
        print("Error: Data should be a Pandas DataFrame")
        return False

    if not any(data.dtypes.values == np.float):
        if not any(data.dtypes.values == np.int):
            print("Error: Dataset cannot contain other value types than floats and/or integers")
            return False

    if not data.isnull().values.any():
        print("Error: No NaN's in given data")
        return False

    return True


def mcar_test(data):
    """ Implementation of Little's MCAR test
    Parameters
    ----------
    data: Pandas DataFrame
        An incomplete dataset with samples as index and variables as columns
    Returns
    -------
    p_value: Float
        This value is the outcome of a chi-square statistical test, testing whether the null hypothesis
        'the missingness mechanism of the incomplete dataset is MCAR' can be rejected.
    """

    if not checks_input_mcar_tests(data):
        raise Exception("Input not correct")

    dataset = data.copy()
    vars = dataset.dtypes.index.values
    n_var = dataset.shape[1]

    # mean and covariance estimates
    # ideally, this is done with a maximum likelihood estimator
    gmean = dataset.mean()
    gcov = dataset.cov()

    # set up missing data patterns
    r = 1 * dataset.isnull()
    mdp = np.dot(r, list(map(lambda x: ma.pow(2, x), range(n_var))))
    sorted_mdp = sorted(np.unique(mdp))
    n_pat = len(sorted_mdp)
    correct_mdp = list(map(lambda x: sorted_mdp.index(x), mdp))
    dataset['mdp'] = pd.Series(correct_mdp, index=dataset.index)

    # calculate statistic and df
    pj = 0
    d2 = 0
    for i in range(n_pat):
        dataset_temp = dataset.loc[dataset['mdp'] == i, vars]
        select_vars = ~dataset_temp.isnull().any()
        pj += np.sum(select_vars)
        select_vars = vars[select_vars]
        means = dataset_temp[select_vars].mean() - gmean[select_vars]
        select_cov = gcov.loc[select_vars, select_vars]
        mj = len(dataset_temp)
        parta = np.dot(means.T, np.linalg.solve(select_cov, np.identity(select_cov.shape[1])))
        d2 += mj * (np.dot(parta, means))

    df = pj - n_var

    # perform test and save output
    p_value = st.chi2.cdf(d2, df)

    return p_value

In [15]:
mcar_test(df)

0.0

In [5]:
df = df.dropna()
df.shape

(29072, 12)

In [25]:
from sklearn.preprocessing import OneHotEncoder
onehot_enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
df_onehot = df[['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type']]
onehot_encoded = onehot_enc.fit_transform(df_onehot)
df_onehot = pd.DataFrame(onehot_encoded, columns = onehot_enc.get_feature_names())

In [24]:
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
df_minmax = df[['age', 'avg_glucose_level', 'bmi']]
minmax_encoded = minmax_scaler.fit_transform(df_minmax)
df_minmax = pd.DataFrame(minmax_encoded, columns = ['age', 'avg_glucose_level', 'bmi'])

In [26]:
from sklearn.preprocessing import LabelEncoder
label_enc = LabelEncoder()
label_encoded = label_enc.fit_transform(df['stroke'])
df_label = pd.DataFrame(label_encoded, columns = ['stroke'])

In [36]:
df_preprocessed = pd.concat([df_onehot, df_minmax, df_label], axis = 1)

(29072, 23)

In [37]:
df_preprocessed.shape

(29072, 23)

In [38]:
df_preprocessed.columns

Index(['x0_Female', 'x0_Male', 'x0_Other', 'x1_0', 'x1_1', 'x2_0', 'x2_1',
       'x3_No', 'x3_Yes', 'x4_Govt_job', 'x4_Never_worked', 'x4_Private',
       'x4_Self-employed', 'x4_children', 'x5_Rural', 'x5_Urban',
       'x6_formerly smoked', 'x6_never smoked', 'x6_smokes', 'age',
       'avg_glucose_level', 'bmi', 'stroke'],
      dtype='object')

In [39]:
df_preprocessed.to_csv('../data/stroke_preprocessed.csv')