- Importing the data from a CSV file
- Dropping the column "time"

In [10]:
import pandas as pd

# Load the clinical records and discard the "time" column in a single chain,
# so the frame is never mutated in place.
df = (
    pd.read_csv('../_assets/heart_failure_clinical_records_dataset.csv')
    .drop(columns='time')
)

- The DataFrame looks fine now.

In [11]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,1


- Upsampling the minority group to mitigate imbalance in the dataset

In [12]:
from sklearn.utils import resample

# Separate the two classes. The code treats DEATH_EVENT == 0 as the majority
# class and DEATH_EVENT == 1 as the minority class.
df_majority = df[df.DEATH_EVENT == 0]
df_minority = df[df.DEATH_EVENT == 1]

# Upsample the minority class until it is as large as the majority class.
df_minority_upsampled = resample(
    df_minority,
    replace=True,                    # sample WITH replacement (rows get duplicated)
    n_samples=df_majority.shape[0],  # match the size of the majority class
    random_state=123,                # reproducible results
)

# Combine the untouched majority class with the upsampled minority class.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df = df_upsampled

# Display the new class counts. This has to be the last expression in the
# cell, otherwise the notebook does not render it.
df_upsampled.DEATH_EVENT.value_counts()

- Normalizing the data to the range [0, 1]

In [13]:
import pandas as pd
from sklearn import preprocessing

# Fit a min-max scaler on the whole frame (label column included) and use it
# to rescale every column.
# NOTE(review): the scaler is fitted before the train/test split, so test rows
# influence the scaling parameters - confirm this is acceptable.
min_max_scaler = preprocessing.MinMaxScaler().fit(df)
x_scaled = min_max_scaler.transform(df)

# Rebuild a DataFrame from the scaled array. Column names and the original
# index are not carried over; both become default integer labels.
df = pd.DataFrame(data=x_scaled)

- Having a look at the pre-processed data
- So, 11 features plus the `DEATH_EVENT` label (the last column) for each instance

In [14]:
# Preview the first five rows of the normalized frame.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.163636,1.0,0.007272,0.0,0.242424,1.0,0.487211,0.05618,0.714286,0.0,0.0,0.0
1,0.454545,1.0,0.0037,0.0,0.166667,1.0,0.304158,0.089888,0.685714,0.0,0.0,0.0
2,0.236364,0.0,0.005103,1.0,0.69697,0.0,0.415687,0.033708,0.628571,1.0,0.0,0.0
3,0.181818,1.0,0.017351,1.0,0.242424,0.0,0.335677,0.078652,0.714286,0.0,0.0,0.0
4,0.363636,0.0,0.335928,1.0,0.242424,0.0,0.339314,0.202247,0.685714,1.0,0.0,0.0


- Splitting the dataset into training and testing data
- Splitting the training and testing data into respective labels and features

In [15]:
# Rows are removed from the training set by index label below, so the labels
# must be unique for the split to be exact.
assert df.index.is_unique, "index must be unique to split rows by label"

# Hold out 10% of the rows for testing; the fixed seed makes the split
# reproducible across kernel restarts.
testing_data = df.sample(frac=0.1, random_state=123)

# Everything that was NOT sampled for testing is used for training, so no row
# is both a training row and a testing row.
training_data = df.drop(index=testing_data.index)
assert len(training_data) + len(testing_data) == len(df)

# NOTE(review): the minority class was upsampled with replacement before this
# split, so duplicated copies of one original record can still land on both
# sides. Consider splitting first and upsampling only the training part.

# The last column is the label; all preceding columns are features.
training_features = training_data.iloc[:, :-1]
training_labels = training_data.iloc[:, -1:]

testing_features = testing_data.iloc[:, :-1]
testing_labels = testing_data.iloc[:, -1:]

# Show the resulting shapes as the cell output.
training_features.shape, testing_features.shape

- Dumping the preprocessed features and labels as pickle files

In [16]:
import pickle
import os

# Map each output path to the object that is serialized into it.
dump_targets = {
    '../dump/training_features.pickle': training_features,
    '../dump/training_labels.pickle': training_labels,
    '../dump/testing_features.pickle': testing_features,
    '../dump/testing_labels.pickle': testing_labels,
}

# Create the output directory if needed so the cell also works on a fresh checkout.
os.makedirs('../dump', exist_ok=True)

# The context manager closes every file even if pickle.dump raises.
for dump_path, dump_object in dump_targets.items():
    with open(dump_path, 'wb') as picklefile:
        pickle.dump(dump_object, picklefile)