In [157]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np


In [158]:
# Load the raw UCI heart-disease table and discard the "id" column in one
# non-mutating chain, so `data` is bound exactly once per run of this cell.
data = pd.read_csv("data/heart_disease_uci.csv").drop(columns=["id"])
data.head()

Unnamed: 0,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [159]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(920, 15)

In [160]:
# Per-column dtype and non-null count, to spot columns with missing values.
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       920 non-null    int64  
 1   sex       920 non-null    str    
 2   dataset   920 non-null    str    
 3   cp        920 non-null    str    
 4   trestbps  861 non-null    float64
 5   chol      890 non-null    float64
 6   fbs       830 non-null    object 
 7   restecg   918 non-null    str    
 8   thalch    865 non-null    float64
 9   exang     865 non-null    object 
 10  oldpeak   858 non-null    float64
 11  slope     611 non-null    str    
 12  ca        309 non-null    float64
 13  thal      434 non-null    str    
 14  num       920 non-null    int64  
dtypes: float64(5), int64(2), object(2), str(6)
memory usage: 107.9+ KB


In [161]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

Unnamed: 0,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,77.0,200.0,603.0,202.0,6.2,3.0,4.0


In [162]:
# Number of missing values in each column.
data.isna().sum()

age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

stats: 

distribution of `num`: 

In [163]:
# How many records fall into each value of the `num` column.
data["num"].value_counts()

num
0    411
1    265
2    109
3    107
4     28
Name: count, dtype: int64

binarization of target variable (`num`):

In [164]:
# Build a new frame rather than editing `data`: `assign` returns a copy, so the
# original table stays intact and this cell gives the same result when re-run alone.
# `num` becomes True for any non-zero value and False for zero.
binary_data = data.assign(num=data["num"] != 0)
binary_data["num"].value_counts()

num
True     509
False    411
Name: count, dtype: int64

after binarization the dataset is roughly balanced (509 positive vs 411 negative records).

#### handling missing values: 

In [165]:
# Flag every column holding at least one missing value, then keep those labels.
has_missing = binary_data.isna().any()
cols_with_missing_data = binary_data.columns[has_missing]
cols_with_missing_data.tolist()

['trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalch',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal']

first we will drop any duplicate records, as well as any records that have **all** of these columns missing:

In [166]:
# Remove exact duplicate rows, then rows where every column listed in
# `cols_with_missing_data` is missing; the index is renumbered after each step.
binary_data = (
    binary_data
    .drop_duplicates(ignore_index=True)
    .dropna(subset=cols_with_missing_data, how="all", ignore_index=True)
)

then we will drop the records with missing values in columns where less than 5% of the values are missing. 
dropping is not recommended unless the data is Missing Completely At Random (MCAR), but if the percentage is less than 5% then it's okay. 

In [167]:
# Missing-value count per affected column, expressed as a percentage of the
# current number of rows.
missing_counts = binary_data[cols_with_missing_data].isna().sum()
percentage_of_missing = missing_counts / len(binary_data) * 100
percentage_of_missing

trestbps     6.427015
chol         3.159041
fbs          9.803922
restecg      0.217865
thalch       5.991285
exang        5.991285
oldpeak      6.753813
slope       33.442266
ca          66.339869
thal        52.723312
dtype: float64

In [168]:
# Columns whose share of missing values is below 5%.
under_five_percent = percentage_of_missing < 5
cols_with_less_than_5_missing = cols_with_missing_data[under_five_percent]
cols_with_less_than_5_missing.tolist()

['chol', 'restecg']

these two columns have less than 5% of their data missing, so it's safe to just drop these records:

In [169]:
# Drop every row that is missing a value in one of the sparsely-missing columns,
# reporting the row count before and after.
rows_before = binary_data.shape[0]
print("data records before dropping less than 5%: ", rows_before)
binary_data = binary_data.dropna(subset=cols_with_less_than_5_missing, ignore_index=True)
rows_after = binary_data.shape[0]
print("data records after dropping less than 5%: ", rows_after)

data records before dropping less than 5%:  918
data records after dropping less than 5%:  887


now we should handle columns with more than 5% missing data. 
I don't really know much about these medical measures, but I'll try to decide which are MCAR, MAR, and MNAR, then decide which imputation technique to use accordingly. 

In [170]:
# Columns with 5% or more of their values missing; these are imputed, not dropped.
five_percent_or_more = percentage_of_missing >= 5
remaining_missing = cols_with_missing_data[five_percent_or_more]
remaining_missing.tolist()


['trestbps', 'fbs', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

In [171]:
# Show only the columns listed in `remaining_missing` to eyeball the pattern of gaps.
binary_data[remaining_missing]

Unnamed: 0,trestbps,fbs,thalch,exang,oldpeak,slope,ca,thal
0,145.0,True,150.0,False,2.3,downsloping,0.0,fixed defect
1,160.0,False,108.0,True,1.5,flat,3.0,normal
2,120.0,False,129.0,True,2.6,flat,2.0,reversable defect
3,130.0,False,187.0,False,3.5,downsloping,0.0,normal
4,130.0,False,172.0,False,1.4,upsloping,0.0,normal
...,...,...,...,...,...,...,...,...
882,127.0,True,154.0,False,0.0,,,
883,,False,,,,,,
884,122.0,True,100.0,False,0.0,,,fixed defect
885,,True,,,,,,


this in fact looks like Missing At Random (MAR) data (the distribution of the missing data is related to the other variables). 

we will use an iterative imputer over the numerical missing values, but before we should train test split to avoid any data leakage. (encoding before splitting also causes data leakage, so better split then encode)

In [172]:
from sklearn.model_selection import train_test_split
#necessary because IterativeImputer is still experimental
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


In [173]:
# Separate the target (`num`) from the features, then hold out 20% of the rows
# for testing; the fixed random_state keeps the split reproducible.
y = binary_data["num"]
X = binary_data.drop(columns=["num"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

we separate categorical variables from numerical ones

In [174]:
# Columns stored as "object" or "str" are treated as categorical; every other
# column is treated as numerical.
categorical_dtypes = ["object", "str"]
categorical_cols = X_train.select_dtypes(include=categorical_dtypes).columns
numerical_cols = X_train.select_dtypes(exclude=categorical_dtypes).columns

now we create the pipelines of encoding and imputation:   
- for categorical variables: we impute each variable using its own mode, then we encode.   
- for numerical variables: we scale then impute using IterativeImputer (ML-based imputation).   

**Note:** we should scale before using the imputer, because it's ML based.  

**Note2:** after imputing, the mean and std will shift (they will not be 0 and 1 respectively) because of the new data added using imputation, but we don't need to rescale again, because our features are already of the same scale as we scaled before imputation. (it's about having the same scale through features not having 0 mean and 1 std). (however I will scale again that won't hurt)

In [175]:
# Categorical features: fill gaps with each column's most frequent value, then
# one-hot encode.
categorical_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": a category that shows up only in the test split
        # is encoded as all zeros instead of raising when transform() is called.
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numerical features: scale, impute with the iterative (model-based) imputer,
# then scale once more because the imputed values shift the mean and std.
numerical_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "imputer",
            IterativeImputer(random_state=42, max_iter=40, initial_strategy="median"),
        ),
        ("post_impute_scaler", StandardScaler()),
    ]
)

wrap both of them in a ColumnTransformer:

In [176]:
# Send the numerical columns through the numerical pipeline and the categorical
# columns through the categorical pipeline.
column_routes = [
    ("num", numerical_pipe, numerical_cols),
    ("cat", categorical_pipe, categorical_cols),
]
preprocessor = ColumnTransformer(column_routes)

now we will transform our data and save it to be used to train and test the MLP: 

In [None]:
# Learn the preprocessing parameters from the training split only...
X_train_ready = preprocessor.fit_transform(X_train)
# ...and merely apply them to the test split (no re-fitting, to avoid leakage).
X_test_ready = preprocessor.transform(X_test)

# Encode the boolean target as integers held in plain NumPy arrays.
y_train, y_test = (
    np.array(labels.astype(int)) for labels in (y_train, y_test)
)


[ 1.56589989e-18  7.84515847e-17 -1.73814888e-17 -8.45585943e-18
  5.35537764e-17 -8.45585943e-18  2.07334274e-01  7.92665726e-01
  3.38504937e-01  3.03244006e-01  1.28349788e-01  2.29901269e-01
  5.47249647e-01  1.94640339e-01  2.12976023e-01  4.51339915e-02
  8.57545839e-01  1.42454161e-01  2.10155148e-01  5.90973202e-01
  1.98871650e-01  6.17771509e-01  3.82228491e-01  7.33427362e-02
  6.99576869e-01  2.27080395e-01  4.51339915e-02  7.43300423e-01
  2.11565585e-01]


save them directly as `.joblib`:

In [178]:
import joblib
import os

# Bundle the preprocessed features and encoded targets of each split into a
# single nested dict.
train_split = {"X": X_train_ready, "y": y_train}
test_split = {"X": X_test_ready, "y": y_test}
ready = {"train": train_split, "test": test_split}

# Create the output folder if needed, then persist the bundle.
os.makedirs("data/ready", exist_ok=True)
joblib.dump(ready, "data/ready/ready.joblib")

['data/ready/ready.joblib']