<a href="https://colab.research.google.com/github/thaonguyyen/project_chd/blob/main/cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries:

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

Read in the CSV files:

In [14]:
# Locations of the raw train/test splits.
train_url = 'https://raw.githubusercontent.com/thaonguyyen/project_chd/main/fhs_train.csv'
test_url = 'https://raw.githubusercontent.com/thaonguyyen/project_chd/main/fhs_test.csv'

# Load both splits, train first and then test.
train_df, test_df = (pd.read_csv(csv_url) for csv_url in (train_url, test_url))

Clean the train and test datasets:

In [15]:
# Count the missing values in each column of the training data.
nan_columns = train_df.isna().sum()
nan_columns

Unnamed: 0           0
sex                  0
age                  0
education           85
currentSmoker        0
cigsPerDay          24
BPMeds              37
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             39
sysBP                0
diaBP                0
BMI                 15
heartRate            0
glucose            285
TenYearCHD           0
dtype: int64

In [16]:
# Dimensions of the training frame as (rows, columns).
train_df.shape

(3180, 17)

In [17]:
class fillNA(BaseEstimator, TransformerMixin):
    """Transformer that fills missing values in one column with a constant.

    Parameters
    ----------
    column : label
        Name of the column whose missing values are filled.
    value : scalar
        Replacement passed to ``Series.fillna``.
    """

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values in ``self.column`` filled.

        The input frame is left untouched so the caller's data is not
        silently modified.
        """
        filled = X.copy()
        filled[self.column] = filled[self.column].fillna(self.value)
        return filled

class dropNA(BaseEstimator, TransformerMixin):
    """Transformer that drops the rows where one column is missing.

    Parameters
    ----------
    column : label
        Name of the column checked for missing values.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame without the rows where ``self.column`` is missing.

        ``dropna`` is called without ``inplace=True`` so the caller's frame
        is not modified.
        """
        return X.dropna(subset=[self.column])

# Constant used to fill missing glucose readings. The original comment
# describes 85 as the average; NOTE(review): confirm against the training data.
GLUCOSE_FILL_VALUE = 85

# Define pipeline steps for each column
pipeline_steps = [
    # replace NAs with 0 in education to keep it numerical
    ('fill_education', fillNA(column='education', value=0)),
    # adjust NAs for glucose to the avg
    ('fill_glucose', fillNA(column='glucose', value=GLUCOSE_FILL_VALUE)),
    # replace NAs with 0 for BPMeds = not taking
    ('fill_BPMeds', fillNA(column='BPMeds', value=0)),
    # drop NAs for totChol, BMI, and cigsPerDay
    ('drop_totChol', dropNA(column='totChol')),
    ('drop_BMI', dropNA(column='BMI')),
    ('drop_cigsPerDay', dropNA(column='cigsPerDay'))
]

pipeline = Pipeline(steps=pipeline_steps)

# Fit on the training data only, then apply the fitted pipeline to the test
# data. The steps above learn nothing in fit(), so the results are the same as
# before; this just keeps the test set out of fitting if a learned step is
# ever added.
train_df = pipeline.fit_transform(train_df)
test_df = pipeline.transform(test_df)

The education variable is categorical but is stored as numeric codes. Convert the codes to text labels so that it is easier to create dummy variables for linear regression.

In [18]:
# Clean the education variable
# Text label for each numeric education code. Code 0 is the placeholder that
# the cleaning pipeline filled in for missing education values.
EDUCATION_LABELS = {
    0.0: 'Unknown education',
    1.0: 'Some high school',
    2.0: 'High school/GED',
    3.0: 'Some college',
    4.0: 'College',
}

# Apply the same mapping to both splits so their labels cannot drift apart.
train_df['education'] = train_df['education'].replace(EDUCATION_LABELS)
test_df['education'] = test_df['education'].replace(EDUCATION_LABELS)

train_df.head()

Unnamed: 0.1,Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1267,1,58,Some high school,0,0.0,0.0,0,0,0,220.0,143.0,104.0,29.85,75,87.0,1
1,1209,0,40,Some high school,1,15.0,0.0,0,0,0,199.0,122.0,82.0,22.16,85,77.0,0
2,2050,0,52,Some high school,0,0.0,0.0,0,0,0,275.0,112.0,71.0,25.68,80,85.0,0
3,1183,1,38,High school/GED,1,43.0,0.0,0,1,0,170.0,130.0,94.0,23.9,110,75.0,0
4,3225,0,43,Some high school,0,0.0,0.0,0,0,0,202.0,124.0,92.0,21.26,75,74.0,0


Save the cleaned datasets to CSV files:

In [19]:
# Write each cleaned split to disk and read it back.
# index=False keeps the DataFrame's row index out of the file; otherwise it is
# written as an extra unnamed first column that shows up as a spurious
# "Unnamed: ..." column when the CSV is read again.
train_df.to_csv("cleaned_train_data.csv", encoding = 'utf-8', index=False)
cleaned_train_df = pd.read_csv("cleaned_train_data.csv")

test_df.to_csv("cleaned_test_data.csv", encoding = 'utf-8', index=False)
cleaned_test_df = pd.read_csv("cleaned_test_data.csv")

In [22]:
# Download each cleaned CSV through google.colab's files helper.
# The import stays in this cell because the module is only importable on Colab.
from google.colab import files

for csv_name in ('cleaned_train_data.csv', 'cleaned_test_data.csv'):
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>