In [1]:
#Import necessary modules
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import norm

In [2]:
#Bring in the data CSV
data = "cardio_train.csv"
cardio_df = pd.read_csv(data, delimiter = ';')

cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
#Clean up age & drop the unneeded ID column
cardio1_df= cardio_df.drop(['id'],axis=1)
# cardio1_df.active[cardio1_df.active == 1] = "True"
# cardio1_df.active[cardio1_df.active == 0] = "False"
age_upd = cardio1_df['age'].div(365)
rounded_age = round(age_upd)
cardio1_df.update(rounded_age)

# Convert to US understood numbers & add column for BMI
weight_upd = round(cardio1_df['weight']*2.2)
height_upd = round(cardio1_df['height'].div(2.54))

cardio1_df.update(weight_upd)
cardio1_df.update(height_upd)
cardio1_df.head()

cardio1_df["BMI"] = (cardio1_df['weight']*703)/(cardio1_df["height"]*cardio1_df["height"])
cardio1_df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,50.0,2,66.0,136.0,110,80,1,1,0,0,1,0,21.948577
1,55.0,1,61.0,187.0,140,90,3,1,0,0,1,1,35.329481
2,52.0,1,65.0,141.0,130,70,3,1,0,0,0,1,23.461065
3,48.0,2,67.0,180.0,150,100,1,1,0,0,1,1,28.188906
4,48.0,1,61.0,123.0,100,60,1,1,0,0,0,0,23.238108


In [4]:
#Clean Cardio0 to remove outliers - negative and impossibly high BPs
cardio1_df = cardio1_df[cardio1_df['ap_lo'] >= 50]
cardio1_df = cardio1_df[cardio1_df['ap_lo'] < 200]
cardio1_df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,50.0,2,66.0,136.0,110,80,1,1,0,0,1,0,21.948577
1,55.0,1,61.0,187.0,140,90,3,1,0,0,1,1,35.329481
2,52.0,1,65.0,141.0,130,70,3,1,0,0,0,1,23.461065
3,48.0,2,67.0,180.0,150,100,1,1,0,0,1,1,28.188906
4,48.0,1,61.0,123.0,100,60,1,1,0,0,0,0,23.238108


In [5]:
#Clean Cardio0 to remove outliers - negative and impossibly high BPs
cardio1_df = cardio1_df[cardio1_df['ap_hi'] >= 50]
cardio1_df = cardio1_df[cardio1_df['ap_hi'] < 200]
cardio1_df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,50.0,2,66.0,136.0,110,80,1,1,0,0,1,0,21.948577
1,55.0,1,61.0,187.0,140,90,3,1,0,0,1,1,35.329481
2,52.0,1,65.0,141.0,130,70,3,1,0,0,0,1,23.461065
3,48.0,2,67.0,180.0,150,100,1,1,0,0,1,1,28.188906
4,48.0,1,61.0,123.0,100,60,1,1,0,0,0,0,23.238108


In [10]:
# Assign X (data) and y (target)
X = cardio1_df
y = cardio1_df["cardio"]
print(X.shape, y.shape)

(68606, 13) (68606,)


In [12]:
data = X.copy()

data_binary_encoded = pd.get_dummies(data, columns=["gender", "cholesterol", "gluc", "smoke", "alco", "active"])
data_binary_encoded.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cardio,BMI,gender_1,gender_2,cholesterol_1,...,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1
0,50.0,66.0,136.0,110,80,0,21.948577,0,1,1,...,0,1,0,0,1,0,1,0,0,1
1,55.0,61.0,187.0,140,90,1,35.329481,1,0,0,...,1,1,0,0,1,0,1,0,0,1
2,52.0,65.0,141.0,130,70,1,23.461065,1,0,0,...,1,1,0,0,1,0,1,0,1,0
3,48.0,67.0,180.0,150,100,1,28.188906,0,1,1,...,0,1,0,0,1,0,1,0,0,1
4,48.0,61.0,123.0,100,60,0,23.238108,1,0,1,...,0,1,0,0,1,0,1,0,1,0


In [13]:
data_binary_encoded.to_csv("cleaned_cardio.csv", index = False)