<a href= "https://hrs.isr.umich.edu/sites/default/files/meta/2002/core/codebook/h02_00.html?_ga=2.14020593.714056361.1676427746-1610833755.1676427746">codebook</a>

In [52]:
# Importing the libraries
import glob
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [35]:
# Importing all disposition files of every wave
all_dispositon_files = glob.glob('hrs_data/disposition/*.csv')
# Create an empty list to store all the dataframes
lst_disposition = []
# Loop through all the files and read them into a list of dataframes
for filename in all_dispositon_files:
    df_disposition = pd.read_csv(filename, index_col=None, header=0)
    lst_disposition.append(df_disposition)
# Concatenate all the dataframes into one dataframe
dispositon_data = pd.concat(lst_disposition, axis=0,
                            ignore_index=True).iloc[:, :2]
# As these are all the disposition files, we can set deceased to 1
dispositon_data['deceased'] = 1

In [36]:
# Again, import all the core files in the baseline wave 2002
all_core_files = glob.glob('hrs_data/*.csv')
# Create an empty dataframe to concatenate with all the dataframes
core_data_raw = pd.DataFrame()
# Loop through all the files and read them into a dataframe
for filename in all_core_files:
    dfcore = pd.read_csv(filename, index_col=None, header=0)
    col_to_merge = dfcore.columns.difference(core_data_raw.columns)
    core_data_raw = pd.concat([core_data_raw, dfcore[col_to_merge]], axis=1)
# Select the columns we need
core_data = core_data_raw[['HHID', 'PN',
                           'HC139', 'HC005', 'HC010', 'HC001', 'HC070', 
                           'HC117', 'HC128', 'HX067_R', 'HB014A', 'HC134', 'HQ331', 'HQ376', 'HMARITAL', 'HX060_R', 'HB031A',
                           'HB053'
                           ]]
# Show the first 5 rows of the dataframe
core_data.head()

Unnamed: 0,HHID,PN,HC139,HC005,HC010,HC001,HC070,HC117,HC128,HX067_R,HB014A,HC134,HQ331,HQ376,HMARITAL,HX060_R,HB031A,HB053
0,3,10,170,5,5,3,5,5.0,1,1936,12.0,,,,1.0,1,1.0,1
1,3,20,179,5,5,3,1,5.0,5,1938,16.0,,,,1.0,2,1.0,1
2,10001,10,180,5,5,2,5,5.0,5,1939,12.0,,,250000.0,6.0,1,1.0,3
3,10003,30,120,1,5,1,5,5.0,5,1956,16.0,,,,5.0,2,1.0,1
4,10004,10,230,1,5,2,1,5.0,1,1939,16.0,,4000.0,,1.0,1,1.0,1


In [37]:
# Merge the core data with the disposition data
df02_d = core_data.merge(dispositon_data, on=["HHID", "PN"], how="left")
# Fill the respondents not appearing in the disposition data as not deceased
df02_d['deceased'] = df02_d['deceased'].fillna(0)
# Show the breakdown of deceased and not deceased
df02_d['deceased'].value_counts()

0.0    17443
1.0      800
Name: deceased, dtype: int64

In [41]:
con_col = ['HC139', 'HC001', 'HB014A', 'HB053', 'HC134', 'HQ331', 'HQ376', ]

df02_d[con_col]

Unnamed: 0,HC139,HC001,HB014A,HB053,HC134,HQ331,HQ376
0,170,3,12.0,1,,,
1,179,3,16.0,1,,,
2,180,2,12.0,3,,,250000.0
3,120,1,16.0,1,,,
4,230,2,16.0,1,,4000.0,
...,...,...,...,...,...,...,...
18238,145,2,14.0,1,,,
18239,145,2,12.0,3,,,
18240,186,5,15.0,5,,,
18241,132,3,15.0,5,,,


In [60]:
# Preprocessing the continuous/ordinal variables
con_col = ['HC139', 'HC001', 'HB014A', 'HB053', 'HX067_R',
           'HC134', 'HQ331', 'HQ376', ]
# Create a new dataframe with only continuous/ordinal variables
df02_d_ord = df02_d[con_col].copy()
# Construct the household asset variable
wealth_col = ['HC134', 'HQ331', 'HQ376']
for col in wealth_col:
    # Set values of no wealth to 0
    df02_d_ord[col] = df02_d_ord[col].replace(np.nan, 0)
df02_d_ord['wealth_amt'] = df02_d_ord['HC134'] + df02_d_ord['HQ331'] + df02_d_ord['HQ376']
# Scale the wealth variable to 0-100
scaler = MinMaxScaler(feature_range=(0, 100))
df02_d_ord['wealth_amt'] = scaler.fit_transform(df02_d_ord['wealth_amt']\
    .values.reshape(-1, 1))
df02_d_ord.drop(['HC134', 'HQ331', 'HQ376'], axis=1, inplace=True)
# Construct the age variable
df02_d_ord['age'] = 2002 - df02_d_ord['HX067_R']
df02_d_ord.drop(['HX067_R'], axis=1, inplace=True)
# Set values of missing ordinal variables originally coded as negative to nan
df02_d_ord[df02_d_ord < 0] = np.nan
# Check the number of missing values in original ordinal variables
df02_d_ord_nona = df02_d_ord.dropna()
print("Filled {} respondents' missing ordinal variables by mean"\
    .format(len(df02_d_ord) - len(df02_d_ord_nona)))
# Fill the missing values with the mean of the variable
df02_d_ord.fillna(df02_d_ord.mean(), inplace=True)
# Convert the continuous variables from text to numeric variables
for col in con_col[:3]:
    df02_d_ord[col] = pd.to_numeric(df02_d_ord[col])
# Name the columns
df02_d_ord.columns = ['Weight (Pounds)', 'Self-Rated Health', 'Education',
                      'Religion Importance','Wealth', 'Age'
]
# Get the summary statistics of the ordinal variables
df02_d_ord.describe()

Filled 1 respondents' missing ordinal variables by mean


Unnamed: 0,Weight (Pounds),Self-Rated Health,Education,Religion Importance,Wealth,Age
count,18243.0,18243.0,18243.0,18243.0,18243.0,18243.0
mean,185.463082,2.883407,12.129591,1.891027,1.857271,68.815601
std,115.27371,1.14129,3.926426,1.365811,10.078518,10.504341
min,65.0,1.0,0.0,1.0,0.0,26.0
25%,143.0,2.0,11.0,1.0,0.0,61.0
50%,169.0,3.0,12.0,1.0,0.0,68.0
75%,195.5,4.0,14.0,3.0,0.0,76.0
max,999.0,9.0,99.0,9.0,100.0,110.0


In [87]:
# Preprocessing the binary/categorical variables
cat_col = ['HC005', 'HC010', 'HC070', 'HC117', 'HC128', 'HMARITAL', 'HX060_R', 'HB031A']
df02_d_cat = df02_d[cat_col].copy()
# Check the number of missing values in original categorical variables
print("Missing values in the columns", "\n", df02_d_cat.isnull().sum())
# Fill the missing values with the mode of the variable
df02_d_cat.fillna(df02_d_cat.mode().iloc[0], inplace=True)
for i in cat_col[:3]:
    df02_d_cat[i] = df02_d_cat[i].isin([1, 3])
for i in cat_col[3:6]:
    df02_d_cat[i] = df02_d_cat[i].isin([1])
df02_d_cat['HX060_R'] = np.where(df02_d_cat['HX060_R'] == 1, "Male", "Female")
df02_d_cat['HB031A'] = np.where(df02_d_cat['HB031A'] == 1, "White", "non-White")
df02_d_cat.columns = ['Hypertension', 'Diabetes', 'Arthritis', 'Smoking',
                      'Drunk Alcohol', 'Married','Sex', 'Race']
df02_d_cat

Missing values in the columns 
 HC005         0
HC010         0
HC070         0
HC117       114
HC128         0
HMARITAL      1
HX060_R       0
HB031A       29
dtype: int64


Unnamed: 0,Hypertension,Diabetes,Arthritis,Smoking,Drunk Alcohol,Married,Sex,Race
0,False,False,False,False,True,True,Male,White
1,False,False,True,False,False,True,Female,White
2,False,False,False,False,False,False,Male,White
3,True,False,False,False,False,False,Female,White
4,True,False,True,False,True,True,Male,White
...,...,...,...,...,...,...,...,...
18238,False,False,False,False,False,False,Female,White
18239,True,False,True,False,False,False,Female,non-White
18240,True,False,True,True,True,True,Male,White
18241,True,False,True,False,True,True,Female,White


In [88]:
df02_d_clean = pd.concat([df02_d_ord, df02_d_cat], axis=1)
df02_d_clean

Unnamed: 0,Weight (Pounds),Self-Rated Health,Education,Religion Importance,Wealth,Age,Hypertension,Diabetes,Arthritis,Smoking,Drunk Alcohol,Married,Sex,Race
0,170,3,12.000000,1,0.00,66,False,False,False,False,True,True,Male,White
1,179,3,16.000000,1,0.00,64,False,False,True,False,False,True,Female,White
2,180,2,12.000000,3,1.25,63,False,False,False,False,False,False,Male,White
3,120,1,16.000000,1,0.00,46,True,False,False,False,False,False,Female,White
4,230,2,16.000000,1,0.02,63,True,False,True,False,True,True,Male,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18238,145,2,14.000000,1,0.00,72,False,False,False,False,False,False,Female,White
18239,145,2,12.000000,3,0.00,77,True,False,True,False,False,False,Female,non-White
18240,186,5,15.000000,5,0.00,75,True,False,True,True,True,True,Male,White
18241,132,3,15.000000,5,0.00,76,True,False,True,False,True,True,Female,White
