# Imports

In [None]:
import pandas as pd

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Preprocessing

In [11]:
# Read the cleaned CVD survey export, then discard the 'Checkup' column,
# which this analysis treats as an irrelevant feature.
data = pd.read_csv('../../CVD_cleaned.csv')
data = data.drop(columns=['Checkup'])

# Preview the first five rows.
data.head()

Unnamed: 0,General_Health,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [12]:
# --- Ordinal features --------------------------------------------------------
# Each list spells out the label order for one column; a label's position in
# its list becomes its encoded value (e.g. 'Poor' -> 0.0, 'Female' -> 0.0).
categorical_cols = ['General_Health', 'Age_Category', 'Sex']
health_order = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
age_order = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80+']
sex_order = ['Female', 'Male']

# `categories` is positional: entry i is the label order for categorical_cols[i].
ordinal_encoder = OrdinalEncoder(categories=[health_order, age_order, sex_order])
# No prior cast to 'category' is needed: the encoder reads the raw labels and
# this assignment replaces the columns with the encoder's float codes.
data[categorical_cols] = ordinal_encoder.fit_transform(data[categorical_cols])


# --- Yes/No features -> 1/0 ---------------------------------------------------
binary_cols = ['Exercise', 'Heart_Disease', 'Skin_Cancer', 'Other_Cancer',
               'Depression', 'Diabetes', 'Arthritis', 'Smoking_History']
for col in binary_cols:
  # Substring match on purpose: some answers are longer phrases that merely
  # contain 'Yes' or 'No' (pre-diabetes and gestational diabetes).
  as_text = data[col].astype(str)
  is_yes = as_text.str.contains('Yes', regex=False)
  is_no = as_text.str.contains('No', regex=False)

  # Fail loudly rather than leave unrecognised values sitting in a column that
  # is supposed to be strictly binary.
  unmapped = ~(is_yes | is_no)
  if unmapped.any():
    unexpected = data.loc[unmapped, col].unique().tolist()
    raise ValueError(f"column {col!r} has values that are neither Yes nor No: {unexpected}")

  # 'Yes' takes precedence when both substrings occur; every remaining row
  # matched 'No', so the boolean mask is already the 1/0 encoding.
  data[col] = is_yes.astype('int64').astype('category')


data.head(5)

Unnamed: 0,General_Health,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,0.0,0,0,0,0,0,0,1,0.0,10.0,150.0,32.66,14.54,1,0.0,30.0,16.0,12.0
1,3.0,0,1,0,0,0,1,0,0.0,10.0,165.0,77.11,28.29,0,0.0,30.0,0.0,4.0
2,3.0,1,0,0,0,0,1,0,0.0,8.0,163.0,88.45,33.47,0,4.0,12.0,3.0,16.0
3,0.0,1,1,0,0,0,1,0,1.0,11.0,180.0,93.44,28.73,0,0.0,30.0,30.0,8.0
4,2.0,0,0,0,0,0,0,0,1.0,12.0,191.0,88.45,24.37,1,0.0,8.0,4.0,0.0


In [13]:
# Separate the label from the features. Splitting after preprocessing is fine
# here because Heart_Disease was only converted to 1/0, nothing else.
y = data['Heart_Disease']
X = data.drop(columns=['Heart_Disease'])

# Hold out 20% for testing; stratify on the label so the imbalanced
# case/control ratio is the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# Report partition sizes and class proportions.
n_train = len(X_train)
n_test = len(X_test)
train_distribution = y_train.value_counts(normalize=True)
test_distribution = y_test.value_counts(normalize=True)

print('training:', n_train, 'individuals')
print('test:', n_test, 'individuals\n')
print('case/control distribution in training set:', train_distribution, '\n')
print('case/control distribution in test set:', test_distribution)

training: 247083 individuals
test: 61771 individuals

case/control distribution in training set: Heart_Disease
0    0.919149
1    0.080851
Name: proportion, dtype: float64 

case/control distribution in test set: Heart_Disease
0    0.919153
1    0.080847
Name: proportion, dtype: float64


In [14]:
# Persist each partition of the preprocessed data as a pickle one directory up.
pickle_targets = (
    (X_train, '../X_train.pkl'),
    (X_test, '../X_test.pkl'),
    (y_train, '../y_train.pkl'),
    (y_test, '../y_test.pkl'),
)
for partition, target_path in pickle_targets:
    partition.to_pickle(target_path)