In [2]:
from pathlib import Path

import pandas as pd

In [3]:
# Read the raw stroke and heart CSV files into DataFrames.
stroke_df, heart_df = (
    pd.read_csv(raw_csv)
    for raw_csv in (
        'data/raw_data/healthcare-dataset-stroke-data.csv',
        'data/raw_data/heart.csv',
    )
)

In [4]:
# Preview the first five stroke records (head() defaults to n=5).
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
# Preview the first five heart records (head() defaults to n=5).
heart_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Data Encoding

In [3]:
# List the distinct raw values of every categorical stroke column before encoding.
stroke_categoricals = (
    ('Gender', 'gender'),
    ('Married', 'ever_married'),
    ('Work types', 'work_type'),
    ('Residence types', 'Residence_type'),
    ('Smoking status', 'smoking_status'),
)
for label, column in stroke_categoricals:
    print(f"{label}: {stroke_df[column].unique()}")

Gender: ['Male' 'Female' 'Other']
Married: ['Yes' 'No']
Work types: ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Residence types: ['Urban' 'Rural']
Smoking status: ['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [4]:
# Inspect the rows whose gender is recorded as 'Other'.
stroke_df.loc[stroke_df['gender'].eq('Other')]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
3116,56156,Other,26.0,0,0,No,Private,Rural,143.33,22.4,formerly smoked,0


In [5]:
gender_mapping = {'Male': 0, 'Female': 1}

# Build the encoded frame from a filtered copy instead of mutating stroke_df in
# place. Only the rows labelled 'Other' are dropped, because the target encoding
# is binary.
binary_gender_rows = stroke_df[stroke_df['gender'] != 'Other'].reset_index(drop=True)

# Series.map turns every value that is missing from the dict into NaN without
# complaint. That includes the 0/1 codes left behind by a previous run of this
# cell, so fail loudly rather than silently blanking the column.
unexpected_genders = set(binary_gender_rows['gender'].unique()) - set(gender_mapping)
if unexpected_genders:
    raise ValueError(
        f"Cannot encode 'gender', unexpected values: {unexpected_genders}. "
        "Reload the raw data if this cell has already been run."
    )

# NOTE: astype(int) truncates toward zero, so any fractional ages lose their
# decimal part here.
stroke_df = binary_gender_rows.assign(
    gender=binary_gender_rows['gender'].map(gender_mapping),
    age=binary_gender_rows['age'].astype(int),
)

In [6]:
# Integer codes for each categorical column of the stroke dataset.
# NOTE(review): the codes are labels, not a ranking - confirm downstream models
# treat these columns as categorical rather than ordinal.
married = {'Yes': 1, 'No': 0}
work_type = {'Private': 1, 'Self-employed': 2, 'Govt_job': 3, 'children': 4, 'Never_worked': 0}
residence_type = {'Urban': 0, 'Rural': 1}
smoking_stat = {'formerly smoked': 1, 'never smoked': 0, 'smokes': 2, 'Unknown': 3}

stroke_encodings = {
    'ever_married': married,
    'work_type': work_type,
    'Residence_type': residence_type,
    'smoking_status': smoking_stat,
}

# Series.map silently converts values that are missing from the dict into NaN
# (including the integer codes left by a previous run of this cell). Check every
# column up front so nothing is modified unless all of them can be encoded.
for column, mapping in stroke_encodings.items():
    unexpected = set(stroke_df[column].unique()) - set(mapping)
    if unexpected:
        raise ValueError(
            f"Cannot encode '{column}', unexpected values: {unexpected}. "
            "Reload the raw data if this cell has already been run."
        )

# Replace each categorical column with its integer codes.
stroke_df = stroke_df.assign(
    **{column: stroke_df[column].map(mapping) for column, mapping in stroke_encodings.items()}
)

In [7]:
# Preview the first five heart records before encoding.
heart_df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [8]:
# List the distinct raw values of every categorical heart column before encoding.
heart_categoricals = ('Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope')
for column in heart_categoricals:
    print(f"{column}: {heart_df[column].unique()}")

Sex: ['M' 'F']
ChestPainType: ['ATA' 'NAP' 'ASY' 'TA']
RestingECG: ['Normal' 'ST' 'LVH']
ExerciseAngina: ['N' 'Y']
ST_Slope: ['Up' 'Flat' 'Down']


In [9]:
# Integer codes for each categorical column of the heart dataset.
# NOTE(review): the codes are labels, not a ranking - confirm downstream models
# treat these columns as categorical rather than ordinal.
gender = {'M': 0, 'F': 1}
chest_pain = {'ATA': 0, 'NAP': 1, 'ASY': 2, 'TA': 3}
rest_ecg = {'Normal': 0, 'ST': 1, 'LVH': 2}
exercise = {'N': 0, 'Y': 1}
st_slope = {'Up': 0, 'Flat': 1, 'Down': 2}

heart_encodings = {
    'Sex': gender,
    'ChestPainType': chest_pain,
    'RestingECG': rest_ecg,
    'ExerciseAngina': exercise,
    'ST_Slope': st_slope,
}

# Series.map silently converts values that are missing from the dict into NaN
# (including the integer codes left by a previous run of this cell). Check every
# column up front so nothing is modified unless all of them can be encoded.
for column, mapping in heart_encodings.items():
    unexpected = set(heart_df[column].unique()) - set(mapping)
    if unexpected:
        raise ValueError(
            f"Cannot encode '{column}', unexpected values: {unexpected}. "
            "Reload the raw data if this cell has already been run."
        )

# Replace each categorical column with its integer codes and store Age as int.
heart_df = heart_df.assign(
    Age=heart_df['Age'].astype(int),
    **{column: heart_df[column].map(mapping) for column, mapping in heart_encodings.items()},
)

## Handling Missing Data

In [10]:
# Count the missing values in each stroke column.
stroke_df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [11]:
# Impute missing BMI values with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# stroke_df['bmi'] selection is a chained in-place update, which recent pandas 2.x
# releases warn about and which leaves stroke_df unchanged under Copy-on-Write.
stroke_df['bmi'] = stroke_df['bmi'].fillna(stroke_df['bmi'].mean())

In [12]:
# Count the missing values in each heart column.
heart_df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

## Save the preprocessed dataframes

In [13]:
# Write both preprocessed frames to CSV without the index column.
# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists first (a no-op when it is already there).
processed_dir = Path("data/processed_data")
processed_dir.mkdir(parents=True, exist_ok=True)

stroke_df.to_csv(processed_dir / "processed_stroke.csv", index=False)
heart_df.to_csv(processed_dir / "processed_heart.csv", index=False)