In [13]:
# Importing libraries
import pandas as pd
import numpy as np

In [88]:
# Load the raw WLS extract (low_memory=False reads the file without chunked
# type inference)
wls_raw = pd.read_csv('wls.csv', low_memory=False)
# Columns kept for the analysis: the target (z_livgrad) followed by the
# candidate predictors
select_col = [
    'z_livgrad',
    'z_ix011rec', 'z_ax341re', 'z_ax342re', 'z_ix001rer',
    'z_gx360re', 'z_ix013rec', 'z_gu025re', 'z_brdxdy',
    'z_gb103red', 'z_gp260hec', 'z_gc042re', 'z_sexrsp',
    'z_ie020re', 'z_il003rer',
]
wls_select = wls_raw[select_col]
# Preview the first five rows of the selection
wls_select.head(5)

Unnamed: 0,z_livgrad,z_ix011rec,z_ax341re,z_ax342re,z_ix001rer,z_gx360re,z_ix013rec,z_gu025re,z_brdxdy,z_gb103red,z_gp260hec,z_gc042re,z_sexrsp,z_ie020re,z_il003rer
0,1,32,1.0,2.0,4,1,2,1,38,19,81784,1,1,1,3
1,1,27,2.0,2.0,4,2,2,1,39,20,87000,3,1,1,4
2,1,29,1.0,1.0,5,2,2,1,39,12,85354,3,1,1,3
3,1,28,,,5,2,1,1,38,14,336604,3,1,1,4
4,1,26,,,4,1,2,1,41,12,82800,1,1,1,-3


### Preprocessing

In [89]:
# Copy the data for preprocessing so the selected raw columns stay untouched
wls = wls_select.copy()
# Check the missing values in the target
print("There is {} missing value in the target: disposition status"\
    .format(wls_select['z_livgrad'].isnull().sum()))
# Recode the target for easier interpretation: code 1 becomes "Alive" and
# every other code becomes "Deceased".
# NOTE(review): any non-1 code (including negative "missing" codes, if the
# target has any) ends up as "Deceased" -- confirm against the codebook.
wls['deceased'] = np.where(wls['z_livgrad'] == 1, "Alive", "Deceased")
wls = wls.drop('z_livgrad', axis=1)
# Show the balance of the target.
# The counts are looked up by label rather than by integer position:
# value_counts() is indexed by the strings "Alive"/"Deceased", so [0]/[1]
# relied on the deprecated positional fallback and on "Alive" being the
# majority class. .get(..., 0) also copes with a class that is absent.
# NOTE(review): as in the recorded output, the "positive" count is the
# "Alive" class even though the column is named 'deceased'.
target_counts = wls['deceased'].value_counts()
print("There are {} positive cases and {} negative cases in target"\
    .format(target_counts.get("Alive", 0), target_counts.get("Deceased", 0)))


There is 0 missing value in the target: disposition status
There are 7565 positive cases and 2384 negative cases in target


In [90]:
# Preprocess continuous/ordinal variables
# Map every continuous/ordinal source column to its readable name. Renaming
# through an explicit mapping (instead of assigning a positional list to
# .columns) keeps each code tied to its label even if the list is reordered.
# NOTE(review): the z_brdxdy values in the summary below sit around 38-41
# (min 18, max 60), which looks like a two-digit birth year rather than an
# age -- confirm against the codebook before interpreting 'Age'.
con_names = {
    'z_ix011rec': 'BMI',
    'z_ix001rer': 'Self-Rated Health',
    'z_brdxdy': 'Age',
    'z_gb103red': 'Years of Education',
    'z_gp260hec': 'Household Income',
    'z_il003rer': 'Religion Importance',
}
# Create a list of continuous/ordinal variables
con_col = list(con_names)
# Create a new dataframe with only continuous/ordinal variables and convert
# them to numeric first: the `< 0` comparison and the mean below both need
# numeric data, so converting afterwards would be too late for text columns.
wls_ord = wls[con_col].apply(pd.to_numeric)
# Set values of missing ordinal variables originally coded as negative to nan
wls_ord = wls_ord.mask(wls_ord < 0)
# Check the number of missing values in original ordinal variables
# (rows with at least one missing continuous/ordinal value)
wls_ord_na = wls_ord.dropna()
print("Fixed {} respondents with missing ordinal variables"\
    .format(len(wls_ord) - len(wls_ord_na)))
# Fill the missing values with the mean of the variable
# NOTE(review): for the ordinal scales (self-rated health, religion
# importance) the mean yields non-integer levels such as 3.558197; consider
# the median or mode if these must stay on the original scale.
wls_ord = wls_ord.fillna(wls_ord.mean())
# Name the columns
wls_ord = wls_ord.rename(columns=con_names)
# Get the summary statistics of the ordinal variables
wls_ord.describe()

Fixed 876 respondents with missing ordinal variables


Unnamed: 0,BMI,Self-Rated Health,Age,Years of Education,Household Income,Religion Importance
count,9949.0,9949.0,9949.0,9949.0,9949.0,9949.0
mean,27.874934,4.006127,39.381785,13.774586,64790.547838,3.558197
std,4.849958,0.675229,4.236784,2.403011,79392.196592,1.085866
min,19.0,1.0,18.0,0.0,0.0,1.0
25%,25.0,4.0,38.0,12.0,22272.0,3.0
50%,27.0,4.0,39.0,12.0,46000.0,4.0
75%,30.0,4.0,39.0,16.0,78400.0,4.0
max,45.0,5.0,60.0,21.0,710000.0,5.0


While preprocessing the data, I found that `z_ax341re` and `z_ax342re` have many missing values. This is because the questionnaire was only sent to a portion of the respondents. Based on the previous analysis of NSHAP, I believe these two features are important, so I decided to keep them.

In [91]:
# Preprocess categorical/binary variables
# Map every categorical/binary source column to its readable name. Renaming
# through an explicit mapping (instead of assigning a positional list to
# .columns) keeps each code tied to its label even if the list is reordered.
# NOTE(review): 'Martial Status' is a typo for 'Marital Status'; it is kept
# because the name is part of the exported table and consumers may rely on it.
cat_names = {
    'z_ax341re': 'Hypertension',
    'z_ax342re': 'Diabetes',
    'z_gx360re': 'Arthritis',
    'z_ix013rec': 'Smoking Status',
    'z_gu025re': 'Drunk Alcohol',
    'z_gc042re': 'Martial Status',
    'z_sexrsp': 'Sex',
    'z_ie020re': 'Race',
}
cat_col = list(cat_names)
# Binary items recoded to Yes/No below (named explicitly rather than sliced
# out of cat_col by position)
yes_no_col = ['z_ax341re', 'z_ax342re', 'z_gx360re', 'z_ix013rec', 'z_gu025re']
# Create a new dataframe with only categorical/binary variables
wls_cat = wls[cat_col].copy()
# Set values of missing categorical variables originally coded as negative to nan
wls_cat = wls_cat.mask(wls_cat < 0)
# Check the number of missing values in original categorical variables
print("Fixed missing on hypertension", wls_cat['z_ax341re'].isnull().sum())
print("Fixed missing on diabetes", wls_cat['z_ax342re'].isnull().sum())
# Fill the missing values with the mode of the variable
# (mode() can return several rows on ties; the first row is used)
wls_cat = wls_cat.fillna(wls_cat.mode().iloc[0])
# Recode the binary items: code 1 becomes "Yes", every other code "No"
for col in yes_no_col:
    wls_cat[col] = np.where(wls_cat[col] == 1, "Yes", "No")
    wls_cat[col] = wls_cat[col].astype('category')
# Recode sex and race: code 1 is the first label, every other code the second
wls_cat['z_sexrsp'] = np.where(wls_cat['z_sexrsp'] == 1, "Male", "Female")
wls_cat['z_ie020re'] = np.where(wls_cat['z_ie020re'] == 1, "White", "Non-White")
# Name the columns (z_gc042re keeps its numeric codes)
wls_cat = wls_cat.rename(columns=cat_names)

Fixed missing on hypertension 4466
Fixed missing on diabetes 4458


In [92]:
# Assemble the cleaned table column-wise: target first, then the
# continuous/ordinal features, then the categorical/binary features.
# The pieces are aligned on the row index they share with `wls`.
clean_parts = [wls['deceased'], wls_ord, wls_cat]
wls_clean = pd.concat(clean_parts, axis='columns')
# Preview the first five rows
wls_clean.head(5)

Unnamed: 0,deceased,BMI,Self-Rated Health,Age,Years of Education,Household Income,Religion Importance,Hypertension,Diabetes,Arthritis,Smoking Status,Drunk Alcohol,Martial Status,Sex,Race
0,Alive,32.0,4.0,38.0,19.0,81784.0,3.0,Yes,No,Yes,No,Yes,1.0,Male,White
1,Alive,27.0,4.0,39.0,20.0,87000.0,4.0,No,No,No,No,Yes,3.0,Male,White
2,Alive,29.0,5.0,39.0,12.0,85354.0,3.0,Yes,Yes,No,No,Yes,3.0,Male,White
3,Alive,28.0,5.0,38.0,14.0,336604.0,4.0,No,No,No,Yes,Yes,3.0,Male,White
4,Alive,26.0,4.0,41.0,12.0,82800.0,3.558197,No,No,Yes,No,Yes,1.0,Male,White


In [86]:
# Save the cleaned table without the row index.
# NOTE(review): this cell's execution count (86) is lower than that of the
# preprocessing cells above (88-92), so the file on disk may predate them --
# re-run the notebook top to bottom before relying on wls_clean.csv.
wls_clean.to_csv(path_or_buf='wls_clean.csv', index=False)