<a href="https://colab.research.google.com/github/thaonguyyen/project_chd/blob/main/cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

Read in the CSV files:

In [None]:
# Raw GitHub locations of the train and test splits.
train_url = 'https://raw.githubusercontent.com/thaonguyyen/project_chd/main/fhs_train.csv'
test_url = 'https://raw.githubusercontent.com/thaonguyyen/project_chd/main/fhs_test.csv'

# Load both splits with the same reader settings (train first, then test).
train_df, test_df = (pd.read_csv(url) for url in (train_url, test_url))

Clean the training and test datasets:

In [None]:
# Number of missing values in each column of the training data.
nan_columns = train_df.isna().sum()
nan_columns

Unnamed: 0           0
sex                  0
age                  0
education           85
currentSmoker        0
cigsPerDay          24
BPMeds              37
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             39
sysBP                0
diaBP                0
BMI                 15
heartRate            0
glucose            285
TenYearCHD           0
dtype: int64

In [None]:
# (rows, columns) of the training frame as loaded, before any cleaning step runs.
train_df.shape

(3180, 17)

In [None]:
class fillNA(BaseEstimator, TransformerMixin):
    """Replace missing values in a single column with a fixed value.

    Parameters
    ----------
    column : label
        Name of the DataFrame column whose NAs are filled.
    value : scalar
        Replacement handed to ``Series.fillna``.
    """

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NAs in ``self.column`` replaced by ``self.value``.

        The caller's frame is not modified: assigning into ``X`` directly would
        silently change the object that was passed in.
        """
        X = X.copy()
        X[self.column] = X[self.column].fillna(self.value)
        return X

class dropNA(BaseEstimator, TransformerMixin):
    """Drop every row that has a missing value in a single column.

    Parameters
    ----------
    column : label
        Name of the DataFrame column checked for NAs.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a new frame without the rows where ``self.column`` is NA.

        ``inplace=True`` is deliberately avoided so the caller's frame keeps
        all of its rows; only the returned frame is filtered.
        """
        return X.dropna(subset=[self.column])

# Per-column cleaning plan.
#   fill: education -> 0 keeps the column numerical,
#         glucose   -> 85 (the value used as the avg),
#         BPMeds    -> 0 means "not taking".
#   drop: rows with NAs in totChol, BMI, or cigsPerDay are removed.
fill_values = {'education': 0, 'glucose': 85, 'BPMeds': 0}
drop_columns = ['totChol', 'BMI', 'cigsPerDay']

# Fill steps come first, then the drop steps, each in the order listed above.
pipeline_steps = [
    (f'fill_{name}', fillNA(column=name, value=fill))
    for name, fill in fill_values.items()
]
pipeline_steps += [(f'drop_{name}', dropNA(column=name)) for name in drop_columns]

pipeline = Pipeline(steps=pipeline_steps)

# Both splits go through the same steps. The steps' fit() methods learn nothing,
# so fit_transform on the test split applies only the fixed rules above.
train_df = pipeline.fit_transform(train_df)
test_df = pipeline.fit_transform(test_df)

Save the cleaned datasets to CSV files:

In [None]:
# index=False: the frames already carry an 'Unnamed: 0' column from the source
# CSVs, and writing the DataFrame index as well would add a second unnamed
# index column to the saved files (and to the frames read back below).
train_df.to_csv("cleaned_train_data.csv", encoding='utf-8', index=False)
cleaned_train_df = pd.read_csv("cleaned_train_data.csv")

test_df.to_csv("cleaned_test_data.csv", encoding='utf-8', index=False)
cleaned_test_df = pd.read_csv("cleaned_test_data.csv")

In [None]:
# Colab-only: google.colab is not importable outside a Colab runtime.
from google.colab import files

# Download the two cleaned CSVs, train first and then test.
for csv_name in ('cleaned_train_data.csv', 'cleaned_test_data.csv'):
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>