In [1]:
# Add the Pandas dependency.
import pandas as pd

In [2]:
# Files to load
cardio_data_to_load = "cardio_data.csv"

In [3]:
# Read the cardio data file and store it in a Pandas DataFrame.
cardio_data_df = pd.read_csv(cardio_data_to_load)
cardio_data_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# Create new column with coverted 'age'.
cardio_data_df['cardio_age'] = cardio_data_df['age']/365
cardio_data_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,cardio_age
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50.391781
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55.419178
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51.663014
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48.282192
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47.873973


In [5]:
# Format the data (round age down so that year is a whole number)
cardio_data_df['cardio_age'] = cardio_data_df['cardio_age'].map("{:.0f}".format)
cardio_data_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,cardio_age
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,52
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,48


In [6]:
# Drop the original 'age' column.
cardio_data_df.drop('age', axis=1, inplace=True)
cardio_data_df.head()

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,cardio_age
0,0,2,168,62.0,110,80,1,1,0,0,1,0,50
1,1,1,156,85.0,140,90,3,1,0,0,1,1,55
2,2,1,165,64.0,130,70,3,1,0,0,0,1,52
3,3,2,169,82.0,150,100,1,1,0,0,1,1,48
4,4,1,156,56.0,100,60,1,1,0,0,0,0,48


In [7]:
# Rearrange the columns
cardio_data_df = cardio_data_df[['id', 'cardio_age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 
                                    'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']] 
cardio_data_df.head()

Unnamed: 0,id,cardio_age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,2,52,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,4,48,1,156,56.0,100,60,1,1,0,0,0,0


In [8]:
# Rename the new column 
cardio_data_df = cardio_data_df.rename(columns = {"cardio_age":"age"}) 
cardio_data_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,2,52,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,4,48,1,156,56.0,100,60,1,1,0,0,0,0


In [9]:
cardio_data_df.dtypes

id               int64
age             object
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [10]:
# Remove outliers in the 'ap_lo' coulmn that are below 60
cardio_data_df.drop(cardio_data_df[cardio_data_df['ap_lo'] < 60].index, inplace = True)

In [11]:
# Remove outliers in the 'ap_lo' coulmn that are above 140
cardio_data_df.drop(cardio_data_df[cardio_data_df['ap_lo'] > 140].index, inplace = True)

In [12]:
# Remove outliers in the 'ap_lo' coulmn that are above 200
cardio_data_df.drop(cardio_data_df[cardio_data_df['ap_hi'] > 200].index, inplace = True)

In [13]:
# Remove outliers in the 'ap_lo' coulmn that are below 100
cardio_data_df.drop(cardio_data_df[cardio_data_df['ap_hi'] <100].index, inplace = True)

In [14]:
cardio_data_df.head(30)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,2,52,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,4,48,1,156,56.0,100,60,1,1,0,0,0,0
5,8,60,1,151,67.0,120,80,2,2,0,0,0,0
6,9,61,1,157,93.0,130,80,3,1,0,0,1,0
7,12,62,2,178,95.0,130,90,3,3,0,0,1,1
8,13,48,1,158,71.0,110,70,1,1,0,0,1,0
9,14,54,1,164,68.0,110,60,1,1,0,0,0,0


In [15]:
cardio_data_df.shape

(67482, 13)

In [16]:
cardio_data_df.to_csv(r'cardio_training_clean.csv', index = False)