<a href="https://colab.research.google.com/github/tracyhua2/DS3001/blob/main/Final%20Project/DS3001_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer

In [17]:
url = "https://raw.githubusercontent.com/tracyhua2/DS3001/refs/heads/main/Data/NPHA-doctor-visits.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,Number of Doctors Visited,Age,Phyiscal Health,Mental Health,Dental Health,Employment,Stress Keeps Patient from Sleeping,Medication Keeps Patient from Sleeping,Pain Keeps Patient from Sleeping,Bathroom Needs Keeps Patient from Sleeping,Uknown Keeps Patient from Sleeping,Trouble Sleeping,Prescription Sleep Medication,Race,Gender
0,3,2,4,3,3,3,0,0,0,0,1,2,3,1,2
1,2,2,4,2,3,3,1,0,0,1,0,3,3,1,1
2,3,2,3,2,3,3,0,0,0,0,1,3,3,4,1
3,1,2,3,2,3,3,0,0,0,1,0,3,3,4,2
4,3,2,3,3,3,3,1,0,0,0,0,2,3,1,2


In [18]:
# imputing NAs where survey is "Refused" or "Not Asked"
df_imputed = df.copy()

df_imputed['Trouble Sleeping'] = df_imputed['Trouble Sleeping'].fillna(-1)

df_imputed['Prescription Sleep Medication'] = (
    df_imputed['Prescription Sleep Medication']
      .fillna(-1)
)

df_imputed['Race'] = df_imputed['Race'].fillna(-2)
df_imputed['Race'] = df_imputed['Race'].fillna(-1)

df_imputed['Gender'] = df_imputed['Gender'].fillna(-2)
df_imputed['Gender'] = df_imputed['Gender'].fillna(-1)

df_imputed.isna().sum()

# this confirms that there are no NAs even when imputing where patients refused to answer
# or were never asked

Unnamed: 0,0
Number of Doctors Visited,0
Age,0
Phyiscal Health,0
Mental Health,0
Dental Health,0
Employment,0
Stress Keeps Patient from Sleeping,0
Medication Keeps Patient from Sleeping,0
Pain Keeps Patient from Sleeping,0
Bathroom Needs Keeps Patient from Sleeping,0


In [19]:
# converting Trouble Sleeping, Prescription Sleep Medication, and Employment to binary
# 1 if yes, 0 if no

sleep_map = {1:1, 2:1, -1:0, 3:0}
emp_map = {1:1, 2:1, -1:0, 3:0}

df['Trouble Sleeping'] = df['Trouble Sleeping'].replace(sleep_map)
df['Prescription Sleep Medication'] = df['Prescription Sleep Medication'].replace(sleep_map)
df['Employment'] = df['Employment'].replace(emp_map)

df.head()

Unnamed: 0,Number of Doctors Visited,Age,Phyiscal Health,Mental Health,Dental Health,Employment,Stress Keeps Patient from Sleeping,Medication Keeps Patient from Sleeping,Pain Keeps Patient from Sleeping,Bathroom Needs Keeps Patient from Sleeping,Uknown Keeps Patient from Sleeping,Trouble Sleeping,Prescription Sleep Medication,Race,Gender
0,3,2,4,3,3,0,0,0,0,0,1,1,0,1,2
1,2,2,4,2,3,0,1,0,0,1,0,0,0,1,1
2,3,2,3,2,3,0,0,0,0,0,1,0,0,4,1
3,1,2,3,2,3,0,0,0,0,1,0,0,0,4,2
4,3,2,3,3,3,0,1,0,0,0,0,1,0,1,2


In [20]:
# response variable is 'Number of Doctors Visited'
X = df.drop('Number of Doctors Visited', axis=1)
y = df['Number of Doctors Visited']

In [21]:
df.columns

Index(['Number of Doctors Visited', 'Age', 'Phyiscal Health', 'Mental Health',
       'Dental Health', 'Employment', 'Stress Keeps Patient from Sleeping',
       'Medication Keeps Patient from Sleeping',
       'Pain Keeps Patient from Sleeping',
       'Bathroom Needs Keeps Patient from Sleeping',
       'Uknown Keeps Patient from Sleeping', 'Trouble Sleeping',
       'Prescription Sleep Medication', 'Race', 'Gender'],
      dtype='object')

In [22]:
# drop 'Age' column because all data is 65-80 years old
df.drop(columns=['Age'])

Unnamed: 0,Number of Doctors Visited,Phyiscal Health,Mental Health,Dental Health,Employment,Stress Keeps Patient from Sleeping,Medication Keeps Patient from Sleeping,Pain Keeps Patient from Sleeping,Bathroom Needs Keeps Patient from Sleeping,Uknown Keeps Patient from Sleeping,Trouble Sleeping,Prescription Sleep Medication,Race,Gender
0,3,4,3,3,0,0,0,0,0,1,1,0,1,2
1,2,4,2,3,0,1,0,0,1,0,0,0,1,1
2,3,3,2,3,0,0,0,0,0,1,0,0,4,1
3,1,3,2,3,0,0,0,0,1,0,0,0,4,2
4,3,3,3,3,0,1,0,0,0,0,1,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709,2,2,2,2,0,0,0,0,1,0,0,0,1,1
710,3,2,2,2,1,1,0,0,0,1,1,0,1,2
711,3,4,2,3,0,0,0,0,0,0,0,0,1,1
712,3,3,1,3,0,1,0,1,1,1,0,0,1,2


In [23]:
# encode ordinal features using OrdinalEncoder
ordinal_features = ['Phyiscal Health','Mental Health','Dental Health',
                'Trouble Sleeping', 'Prescription Sleep Medication']

ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(df[ordinal_features])

df[ordinal_features] = ordinal_encoder.transform(df[ordinal_features])

# this doesn't do anything but reconfirms the ordinal nature of these features

In [24]:
# encode nominal features using OneHotEncoder using pd.get_dummies
nominal_features = ['Employment', 'Race']

df = pd.get_dummies(df, columns = nominal_features, drop_first=True, dtype=int)

df.head()

Unnamed: 0,Number of Doctors Visited,Age,Phyiscal Health,Mental Health,Dental Health,Stress Keeps Patient from Sleeping,Medication Keeps Patient from Sleeping,Pain Keeps Patient from Sleeping,Bathroom Needs Keeps Patient from Sleeping,Uknown Keeps Patient from Sleeping,Trouble Sleeping,Prescription Sleep Medication,Gender,Employment_1,Employment_4,Race_2,Race_3,Race_4,Race_5
0,3,2,4.0,3.0,3.0,0,0,0,0,1,1.0,0.0,2,0,0,0,0,0,0
1,2,2,4.0,2.0,3.0,1,0,0,1,0,0.0,0.0,1,0,0,0,0,0,0
2,3,2,3.0,2.0,3.0,0,0,0,0,1,0.0,0.0,1,0,0,0,0,1,0
3,1,2,3.0,2.0,3.0,0,0,0,1,0,0.0,0.0,2,0,0,0,0,1,0
4,3,2,3.0,3.0,3.0,1,0,0,0,0,1.0,0.0,2,0,0,0,0,0,0


In [25]:
# export data to csv for modeling
df.to_csv('encoded_data.csv', index=False)