# Feature Engineering

Start by importing the packages needed for feature engineering.

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, model_selection
import category_encoders as ce

In [2]:
# Load the cleaned data set; the first CSV column is used as the row index.
clean_data_path = "../data/clean_data.csv"
df = pd.read_csv(clean_data_path, index_col=0)

## Convert Categorical Variables

In [3]:
# Gather the object-dtype (categorical) columns and show their names
cat = df.select_dtypes(include=["object"])

print(cat.columns)

Index(['Housing', 'Monthly payment', 'Home value', 'Parents education',
       'Household education', 'Education', 'Military status', 'Age', 'Race',
       'Marital status', 'Percent of poverty line', 'Region'],
      dtype='object')


In [4]:
# For each categorical column, print its name, the number of distinct values,
# and how often each value occurs.
for col_name, col_values in cat.items():
    print(col_name, col_values.nunique(), "\n", col_values.value_counts(), "\n")

Housing 4 
 I own my home                     3986
I rent                            1554
I do not currently own or rent     515
Refused                             14
Name: Housing, dtype: int64 

Monthly payment 10 
 Less than $300       1060
$1,000-1,499         1031
$500-749              846
$750-999              786
Prefer not to say     560
$2,000 or more        544
$1,500-1,999          523
$300-499              442
I dont know           236
Refused                41
Name: Monthly payment, dtype: int64 

Home value 8 
 Doesn't apply         2083
$150,000-249,999      1006
Less than $150,000     991
$250,000-399,999       954
$400,000 or more       786
Prefer not to say      141
I dont know            103
Refused                  5
Name: Home value, dtype: int64 

Parents education 6 
 High school degree/GED          1958
Some college/Associate          1494
Bachelors degree                1032
Graduate/professional degree     954
Less than high school            605
Refused     

In [5]:
# One-hot encode the nominal columns: Housing, Race
nominal_cols = ["Housing", "Race"]
ohe = ce.one_hot.OneHotEncoder(cols=nominal_cols, use_cat_names=True)
df = ohe.fit(df).transform(df)

# Drop one indicator per encoded variable to avoid the dummy variable trap
reference_levels = ["Race_Other - Non-Hispanic", "Housing_Refused"]
df = df.drop(columns=reference_levels)

# Preview some of the resulting indicator columns
preview_cols = [
    'Race_White - Non-Hispanic',
    'Race_Black - Non-Hispanic',
    'Housing_I own my home',
    'Housing_I rent',
    'Housing_I do not currently own or rent',
]
df[preview_cols].head()

Unnamed: 0,Race_White - Non-Hispanic,Race_Black - Non-Hispanic,Housing_I own my home,Housing_I rent,Housing_I do not currently own or rent
0,1,0,1,0,0
1,1,0,1,0,0
2,0,1,1,0,0
3,0,0,0,1,0
4,1,0,1,0,0


In [6]:
# Ordinal encoding: Monthly payment, Home value, Parents education, Household education,
#                   Education, Age, Percent of poverty line
#
# Each dictionary maps a survey answer (exact string) to the integer code it is replaced with.

monthly_payment_codes = {
    "Refused": 0,
    "Less than $300": 2,
    "$300-499": 3,
    "$500-749": 4,
    "$750-999": 5,
    "$1,000-1,499": 6,
    "$1,500-1,999": 7,
    "$2,000 or more": 8,
    "I dont know": 0,
    "Prefer not to say": 1,
}

home_value_codes = {
    "Doesn't apply": 0,
    "Refused": 0,
    "Less than $150,000": 2,
    "$150,000-249,999": 3,
    "$250,000-399,999": 4,
    "$400,000 or more": 5,
    "I dont know": 0,
    "Prefer not to say": 1,
}

# The three education columns use the same answer scale, so the codes are defined once.
education_codes = {
    "Refused": 0,
    "Less than high school": 1,
    "High school degree/GED": 2,
    "Some college/Associate": 3,
    "Bachelors degree": 4,
    "Graduate/professional degree": 5,
}

age_codes = {
    "18-24": 1,
    "25-34": 2,
    "35-44": 3,
    "45-54": 4,
    "55-61": 5,
    "62-69": 6,
    "70-74": 7,
    "75+": 8,
}

poverty_line_codes = {
    "<100% FPL": 1,
    "100%-199% FPL": 2,
    "200%+ FPL": 3,
}

# One entry per column; each education column receives its own copy of the shared codes.
ordinal_mapping = [
    {"col": "Monthly payment", "mapping": monthly_payment_codes},
    {"col": "Home value", "mapping": home_value_codes},
    {"col": "Parents education", "mapping": dict(education_codes)},
    {"col": "Household education", "mapping": dict(education_codes)},
    {"col": "Education", "mapping": dict(education_codes)},
    {"col": "Age", "mapping": age_codes},
    {"col": "Percent of poverty line", "mapping": poverty_line_codes},
]
ordinal_cols = [entry["col"] for entry in ordinal_mapping]

oe = ce.ordinal.OrdinalEncoder(cols=ordinal_cols, mapping=ordinal_mapping)
df = oe.fit(df).transform(df)

df[ordinal_cols].head()

Unnamed: 0,Monthly payment,Home value,Parents education,Household education,Education,Age,Percent of poverty line
0,5,3,4,4,4,8,3
1,5,3,2,2,2,3,3
2,4,4,3,3,3,3,3
3,3,0,2,2,2,2,3
4,5,2,4,4,4,2,3


In [7]:
# Binary encoding: Military status, Region, Marital status
binary_cols = ["Military status", "Region", "Marital status"]
be = ce.binary.BinaryEncoder(cols=binary_cols)
df = be.fit(df).transform(df)

# Preview the encoded Region and Military status columns
preview_cols = [
    'Region_0', 'Region_1', 'Region_2', 'Region_3',
    'Military status_0', 'Military status_1', 'Military status_2',
]
df[preview_cols].head()

Unnamed: 0,Region_0,Region_1,Region_2,Region_3,Military status_0,Military status_1,Military status_2
0,0,0,0,1,0,0,1
1,0,0,1,0,0,0,1
2,0,0,1,1,0,0,1
3,0,1,0,0,0,0,1
4,0,0,1,0,0,1,0


In [8]:
# Binarize the target (dependent) variables for the model:
# a value of 6 or 7 becomes 1, every other value becomes 0.
# The boolean masks keep their original names in case other cells use them.
h67 = df.Happiness.isin((6, 7))
f67 = df.Future.isin((6, 7))
hw67 = df.Hardwork.isin((6, 7))

# Casting each mask to int yields the 1/0 column in one step, replacing the
# paired `.loc[mask] = 1` / `.loc[mask != True] = 0` assignments.
df["Happiness"] = h67.astype(int)
df["Future"] = f67.astype(int)
df["Hardwork"] = hw67.astype(int)

df[["Happiness", "Future", "Hardwork"]].head(20)

Unnamed: 0,Happiness,Future,Hardwork
0,0,0,1
1,1,1,1
2,0,0,0
3,0,0,0
4,0,1,0
5,0,0,1
6,0,0,0
7,0,0,0
8,1,1,1
9,0,1,1


Some respondents refused to answer certain questions; these refusals are coded as -1. Because there are so few of them in each variable, I think it is best to recode them to 0 before standardizing the data.

In [9]:
# For every column, report how many entries equal -1 (the value this notebook
# treats as a refused response). Columns without any -1 are skipped.
for col in df.columns:
    # Summing the boolean comparison counts the matches directly, so
    # value_counts() no longer has to be computed twice per column.
    n_refused = (df[col] == -1).sum()

    if n_refused == 0:
        continue

    print(f"{col}: {n_refused}")

Financial knowledge: 39
Reliable: 15
Self-commitment: 9
Financial Goals: 60
Goal confidence: 8
Expenses difficulty: 14
Savings habit: 4
Retirement savings: 61
Non-retirement savings: 68
Remaining mortgage: 43
Housing satisfaction: 14
Savings amount: 12
Number of earners: 37
Income volatility: 28
Health: 20
Stress: 7
Probability of 75 years: 49
Financial planning horizon: 33
Kids 0-7: 1
Kids 7-12: 2
Kids 13-17: 1
Kids 18+: 7


In [10]:
# Recode every -1 in the frame to 0
df = df.replace(to_replace=-1, value=0)

## Split Data

In [11]:
# Separate the three target columns from the features
target_cols = ["Happiness", "Future", "Hardwork"]
X = df.drop(columns=target_cols)
y = df[target_cols]

# Hold out 20% of the rows as a test set; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.20, random_state=1
)

# Summary statistics for each of the four splits
for split in (X_train, X_test, y_train, y_test):
    print(split.describe())

          FWBscore      FSscore      FKscore      KHscore  \
count  4855.000000  4855.000000  4855.000000  4855.000000   
mean     56.102369    50.832132     0.843941     0.710173   
std      14.116139    12.493824     0.243152     0.201846   
min      14.000000     5.000000     0.000000     0.111111   
25%      48.000000    42.000000     0.666667     0.555556   
50%      56.000000    50.000000     1.000000     0.777778   
75%      65.000000    57.000000     1.000000     0.888889   
max      95.000000    85.000000     1.000000     1.000000   

       Financial knowledge     Reliable  Self-commitment  Financial Goals  \
count          4855.000000  4855.000000      4855.000000      4855.000000   
mean              4.695160     4.219361         3.605767         0.635427   
std               1.243925     0.871584         0.908396         0.481360   
min               0.000000     0.000000         0.000000         0.000000   
25%               4.000000     4.000000         3.000000         

## Standardize Data

First fit the scaler on the training set only, then use the fitted scaler to transform both the training and the test sets, so that no information from the test set leaks into the scaling.

In [12]:
# Standardize the features: the scaler is fitted on the training split only and
# then applied to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

# scaler.transform returns a plain array, so the frames are rebuilt here.
# Passing index= keeps each split's original row labels; without it the frames
# get a fresh 0..n-1 index that no longer matches y_train / y_test, which still
# carry the labels produced by the split.
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

print(X_train[["FWBscore", "FSscore", "Material hardship", "Life shocks"]].describe())
print(X_test[["FWBscore", "FSscore", "Material hardship", "Life shocks"]].describe())

           FWBscore       FSscore  Material hardship   Life shocks
count  4.855000e+03  4.855000e+03       4.855000e+03  4.855000e+03
mean   5.190950e-17  1.186830e-16       3.201924e-16 -1.189574e-16
std    1.000103e+00  1.000103e+00       1.000103e+00  1.000103e+00
min   -2.982877e+00 -3.668761e+00      -5.122625e-01 -7.660544e-01
25%   -5.740382e-01 -7.069927e-01      -5.122625e-01 -7.660544e-01
50%   -7.252637e-03 -6.661032e-02      -5.122625e-01  2.050146e-01
75%    6.303812e-01  4.937242e-01       1.285606e-01  2.050146e-01
max    2.755827e+00  2.735062e+00       3.332676e+00  9.915704e+00
          FWBscore      FSscore  Material hardship  Life shocks
count  1214.000000  1214.000000        1214.000000  1214.000000
mean      0.001793    -0.018344          -0.033493     0.018640
std       0.975840     0.992943           0.954540     0.969178
min      -2.982877    -3.668761          -0.512262    -0.766054
25%      -0.574038    -0.706993          -0.512262    -0.766054
50%      -0.0

## Export Data

In [13]:
# Write each split to its own CSV file in the data directory
exports = {
    "../data/X_train.csv": X_train,
    "../data/y_train.csv": y_train,
    "../data/X_test.csv": X_test,
    "../data/y_test.csv": y_test,
}
for csv_path, frame in exports.items():
    frame.to_csv(csv_path)