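This notebook processes the raw income data in `adult.csv` and performs feature engineering (text normalization, target encoding, dummy variables), saving the model-ready result to `us_income_data.csv`.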

In [1]:
import pandas as pd

In [2]:
# Functions for data preprocessing
def format_str_data(pd_series):
    """
    Normalize a Series of text: trim surrounding whitespace and lower-case every string.

    A Series whose dtype is not ``object`` is handed back unchanged.
    """
    # Guard clause: only object-dtype columns are treated as text.
    if pd_series.dtype != object:
        return pd_series
    return pd_series.str.strip().str.lower()

def prepare_data(df, target_var, target_var_encoding, cols_to_drop=None):
    """
    Build a model-ready frame: drop unwanted columns, normalize text columns,
    encode the target variable, and turn categorical variables into dummies.

    The input frame is not modified; all processing happens on a new frame, so
    the calling cell can safely be re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to process.
    target_var : str
        Name of the target column.
    target_var_encoding : dict
        Maps each target label (after whitespace stripping and lower-casing)
        to its encoded value.
    cols_to_drop : list of column labels, optional
        Columns to remove before processing. Nothing is dropped when omitted.

    Returns
    -------
    pandas.DataFrame
        New frame with the target encoded and every object-dtype column
        replaced by dummy columns (first level dropped).

    Raises
    ------
    KeyError
        If a label in ``cols_to_drop`` is not a column of ``df``, or a target
        value has no entry in ``target_var_encoding``.
    """
    # A None default avoids sharing one mutable list across calls.
    if cols_to_drop is None:
        cols_to_drop = []

    # drop() without inplace returns a new frame, leaving the caller's data intact.
    prepared = df.drop(columns=cols_to_drop)
    prepared = prepared.apply(format_str_data)

    # Direct dict lookup (rather than Series.map) so an unexpected label raises
    # KeyError instead of silently turning into NaN.
    prepared[target_var] = prepared[target_var].apply(lambda key: target_var_encoding[key])

    categorical_cols = [col for col in prepared.columns if prepared[col].dtype == object]
    return pd.get_dummies(prepared, columns=categorical_cols, drop_first=True)

In [3]:
# Read the raw input file into a DataFrame
raw_data = pd.read_csv(filepath_or_buffer='adult.csv')

In [4]:
# Drop rows with '?', irrelevant columns, and run data preparation.
# The filtered data gets its own names instead of overwriting `raw_data`, so
# re-running this cell always starts from the loaded data and does not trim a
# further 1% of rows on every execution.
FNLWGT_QUANTILE_CAP = 0.99  # rows with fnlwgt above this quantile are discarded

known_job_data = raw_data[(raw_data['occupation'] != '?') & (raw_data['workclass'] != '?')]
# The quantile is computed on the rows that survived the '?' filter.
fnlwgt_cap = known_job_data['fnlwgt'].quantile(FNLWGT_QUANTILE_CAP)
# Explicit copy: the frame passed on is independent of `raw_data`.
filtered_data = known_job_data[known_job_data['fnlwgt'] <= fnlwgt_cap].copy()

cols_to_drop = ['relationship', 'capital.gain', 'capital.loss', 'native.country']
target_var = 'income'
# Keys are lower-case because prepare_data lower-cases text before encoding.
target_var_encoding = {'<=50k' : 0, '>50k': 1}

df = prepare_data(filtered_data, target_var, target_var_encoding, cols_to_drop)

In [5]:
# Preview the first five rows of the prepared data
df.head(n=5)

Unnamed: 0,age,fnlwgt,education.num,hours.per.week,income,workclass_local-gov,workclass_private,workclass_self-emp-inc,workclass_self-emp-not-inc,workclass_state-gov,...,occupation_prof-specialty,occupation_protective-serv,occupation_sales,occupation_tech-support,occupation_transport-moving,race_asian-pac-islander,race_black,race_other,race_white,sex_male
1,82,132870,9,18,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,54,140359,4,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,41,264663,10,40,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
5,34,216864,9,45,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,38,150601,6,40,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [6]:
# Persist the prepared data for the modeling step.
# The row index is written as the first (unnamed) column of the CSV.
df.to_csv(path_or_buf='us_income_data.csv')